dayannex committed · Commit a03f45e · 1 Parent(s): 9f03f6c

model change

Files changed (2)
  1. app.py +594 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,594 @@
+ import gradio as gr
+ import torch
+ import json
+ import pandas as pd
+ import numpy as np
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForTokenClassification,
+     AutoModelForSequenceClassification,
+     RobertaForTokenClassification,
+ )
+ from json import JSONEncoder
+ from faker import Faker
+ from keras.utils import pad_sequences
+ class out_json():
+     def __init__(self, w, l):
+         self.word = w
+         self.label = l
+
+ class MyEncoder(JSONEncoder):
+     def default(self, o):
+         return o.__dict__
+
+ class Model:
+     def __init__(self):
+         self.texto = ""
+         self.idioma = ""
+         self.modelo_ner = ""
+         self.categoria_texto = ""
+
+     ###
+     ### Applies the language-detection model and identifies the language of the text
+     ###
+     def identificacion_idioma(self, text):
+         self.texto = text
+         tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
+         model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
+
+         inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
+
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         preds = torch.softmax(logits, dim=-1)
+
+         id2lang = model.config.id2label
+         vals, idxs = torch.max(preds, dim=1)
+
+         # return the language with the highest probability
+         maximo = vals.max()
+         idioma = ''
+         porcentaje = 0
+         for k, v in zip(idxs, vals):
+             if v.item() == maximo:
+                 idioma, porcentaje = id2lang[k.item()], v.item()
+
+         if idioma == 'es':
+             self.idioma = "es"
+             self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
+             self.faker_ = Faker('es_MX')
+             self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
+         else:
+             self.idioma = "en"
+             self.faker_ = Faker('en_US')
+             self.modelo_ner = "dayannex/distilbert-tuned-4labels"
+             self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
+         self.categorizar_texto(self.texto)
+
+     # Merges subword pieces: any token that starts with `caracter` is folded into the
+     # previous token (with the marker stripped) and its index is recorded in ig_tokens.
+     def reordenacion_tokens(self, tokens, caracter):
+         i = 0
+         new_tokens = []
+         ig_tokens = []
+         for token in tokens:
+             print('token_texto:', token, caracter)
+             ind = len(new_tokens)
+             if i < len(tokens):
+                 if not token.startswith(caracter):
+                     new_tokens.append(token)
+                     i = i + 1
+                 else:
+                     new_tokens[ind - 1] = new_tokens[ind - 1] + token.replace(caracter, '')
+                     ig_tokens.append(i)
+                     i = i + 1
+         return (
+             new_tokens,
+             ig_tokens
+         )
+
+     # Keeps only the predicted classes whose token positions were not merged away.
+     def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
+         x = 0
+         new_identificadores = []
+         for token in predicted_tokens_classes:
+             if x not in ig_tokens:
+                 new_identificadores.append(token)
+                 x = x + 1
+             else:
+                 x = x + 1
+         return new_identificadores
+
+     # Builds a JSON list of (word, label) pairs for every non-'O' token.
+     def salida_json(self, tokens, pre_tokens):
+         list = []
+         i = 0
+         for t in tokens:
+             if pre_tokens[i] != 'O':
+                 a = out_json(t.replace('##', '').replace('Ġ', '').replace('Ċ', ''), pre_tokens[i].replace('▁', ''))
+                 list.append(a)
+             i = i + 1
+         return MyEncoder().encode(list)
+
+     # Rebuilds the text, replacing each labelled token with its label.
+     def salida_texto(self, tokens, pre_tokens):
+         new_labels = []
+         current_word = None
+         i = 0
+         for token in tokens:
+             if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
+                 new_labels.append(' ' + token.replace('##', '').replace('Ġ', ''))
+             else:
+                 new_labels.append(' ' + pre_tokens[i])
+             i = i + 1
+         a = ''
+         for i in new_labels:
+             a = a + i
+         return a
+
+     def salida_texto_anonimizado(self, ids, pre_tokens):
+         new_labels = []
+         current_word = None
+         i = 0
+         for identificador in pre_tokens:
+             if identificador == 'O' or 'OTH' in identificador:
+                 new_labels.append(self.tokenizer.decode(ids[i]))
+             else:
+                 new_labels.append(' ' + identificador)
+             i = i + 1
+         a = ''
+         for i in new_labels:
+             a = a + i
+         return a
+
+     def formato_salida(self, out):
+         a = ""
+         for i in out:
+             a = a + i.replace('▁', '').replace(' ', '') + ' '
+         return a
+
+     def fake_pers(self):
+         return self.faker_.name()
+     def fake_word(self):
+         return self.faker_.word()
+     def fake_first_name(self):
+         return self.faker_.first_name()
+     def fake_last_name(self):
+         return self.faker_.last_name()
+     def fake_address(self):
+         return self.faker_.address()
+     def fake_sentence(self, n):
+         return self.faker_.sentence(nb_words=n)
+     def fake_text(self):
+         return self.faker_.text()
+     def fake_company(self):
+         return self.faker_.company()
+     def fake_city(self):
+         return self.faker_.city()
+
+     # Replaces each recognised entity label with a fake value of the same kind.
+     def reemplazo_fake(self, identificadores):
+         new_iden = []
+         for id in identificadores:
+             if 'PER' in id:
+                 new_iden.append(self.fake_first_name())
+             elif 'ORG' in id:
+                 new_iden.append(self.fake_company())
+             elif 'LOC' in id:
+                 new_iden.append(self.fake_city())
+             else:
+                 new_iden.append(id)
+         return new_iden
+
+     ###
+     ### Applies the model that categorizes the text according to its context
+     ###
+     def categorizar_texto(self, texto):
+         name = "elozano/bert-base-cased-news-category"
+         tokenizer = AutoTokenizer.from_pretrained(name)
+         model_ = AutoModelForSequenceClassification.from_pretrained(name)
+
+         inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
+
+         with torch.no_grad():
+             logits = model_(**inputs_).logits
+
+         preds = torch.softmax(logits, dim=-1)
+
+         id2lang = model_.config.id2label
+         vals, idxs = torch.max(preds, dim=1)
+
+         # return the category with the highest probability
+         maximo = vals.max()
+         cat = ''
+         self.categoria_texto = ''
+         porcentaje = 0
+         for k, v in zip(idxs, vals):
+             if v.item() == maximo:
+                 cat, porcentaje = id2lang[k.item()], v.item()
+                 self.categoria_texto = cat
+
+         return cat, porcentaje
+
+     ###
+     ### Applies the models to a text
+     ###
+     def predict(self, etiquetas):
+         categoria, porcentaje = self.categorizar_texto(self.texto)
+         print(categoria, porcentaje)
+
+         self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
+
+         inputs = self.tokenizer(self.texto, return_tensors="pt")
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             logits = outputs.logits
+         predictions = torch.argmax(logits, dim=2)
+
+         predicted_token_class_ids = predictions[0].tolist()
+
+         predicted_tokens_classes = [self.model.config.id2label[label_id] for label_id in predicted_token_class_ids]
+         tokens = self.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
+
+         if self.idioma == 'es':
+             new_tokens, ig_tokens = self.reordenacion_tokens(tokens, 'Ġ')
+         else:
+             new_tokens, ig_tokens = self.reordenacion_tokens(tokens, '#')
+
+         new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
+         out1 = self.salida_json(new_tokens, new_identificadores)
+         if etiquetas:
+             out2 = self.salida_texto(new_tokens, new_identificadores)  # labels only
+         else:
+             out2 = self.salida_texto(new_tokens, self.reemplazo_fake(new_identificadores))
+
+         return (
+             out1,
+             str(out2)
+         )
+ class ModeloDataset:
+     def __init__(self):
+         self.texto = ""
+         self.idioma = ""
+         self.modelo_ner = ""
+         self.categoria_texto = ""
+         self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+
+     # Merges subword pieces: tokens that start with `caracter` begin a new word; any
+     # other token is appended to the previous one and its index recorded in ig_tokens.
+     def reordenacion_tokens(self, tokens, caracter):
+         i = 0
+         new_tokens = []
+         ig_tokens = []
+         for token in tokens:
+             print('tokensss:', tokens, caracter)
+             ind = len(new_tokens)
+             if i < len(tokens):
+                 if token.startswith(caracter):
+                     new_tokens.append(token)
+                     i = i + 1
+                 else:
+                     new_tokens[ind - 1] = new_tokens[ind - 1] + token
+                     ig_tokens.append(i)
+                     i = i + 1
+         return (
+             new_tokens,
+             ig_tokens
+         )
+
+     def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes, tamano):
+         x = 0
+         new_identificadores = []
+         for token in predicted_tokens_classes:
+             if x not in ig_tokens:
+                 if len(new_identificadores) < tamano:
+                     new_identificadores.append(token)
+                 x = x + 1
+             else:
+                 x = x + 1
+         return new_identificadores
+
+     ###
+     ### Helpers that generate different kinds of fake data depending on the category
+     ###
+     def fake_pers(self):
+         return self.faker_.name()
+     def fake_word(self):
+         return self.faker_.word()
+     def fake_first_name(self):
+         return self.faker_.first_name()
+     def fake_last_name(self):
+         return self.faker_.last_name()
+     def fake_address(self):
+         return self.faker_.address()
+     def fake_sentence(self, n):
+         return self.faker_.sentence(nb_words=n)
+     def fake_text(self):
+         return self.faker_.text()
+     def fake_company(self):
+         return self.faker_.company()
+     def fake_city(self):
+         return self.faker_.city()
+
+     def reemplazo_fake(self, identificadores):
+         if self.idioma == 'es':
+             self.faker_ = Faker('es_MX')
+         else:
+             self.faker_ = Faker('en_US')
+         new_iden = []
+         for id in identificadores:
+             if 'PER' in id:
+                 new_iden.append(self.fake_first_name())
+             elif 'ORG' in id:
+                 new_iden.append(self.fake_company())
+             elif 'LOC' in id:
+                 new_iden.append(self.fake_city())
+             else:
+                 new_iden.append(id)
+         return new_iden
+
+     ###
+     ### Applies the models according to the detected language
+     ###
+     def aplicar_modelo(self, _sentences, idioma, etiquetas):
+         if idioma == "es":
+             self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+             tokenized_text = [self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
+
+             ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
+             MAX_LEN = 128
+             ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
+             input_ids = torch.tensor(ids)
+
+             self.model = RobertaForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+             with torch.no_grad():
+                 logits = self.model(input_ids).logits
+             predicted_token_class_ids = logits.argmax(-1)
+             i = 0
+             _predicted_tokens_classes = []
+             for a in predicted_token_class_ids:
+                 _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
+                 i = i + 1
+             labels = predicted_token_class_ids
+             loss = self.model(input_ids, labels=labels).loss
+
+             new_tokens = []
+             ig_tok = []
+             i = 0
+             new_identificadores = []
+             for item in tokenized_text:
+                 aux1, aux2 = self.reordenacion_tokens(item, "Ġ")
+                 new_tokens.append(aux1)
+                 ig_tok.append(aux2)
+
+             for items in _predicted_tokens_classes:
+                 aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
+                 new_identificadores.append(aux)
+                 i = i + 1
+
+             return new_identificadores, new_tokens
+         else:
+             print('idioma:', idioma)
+             self.tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
+             tokenized_text = [self.tokenizer.tokenize(sentence[:500]) for sentence in _sentences]
+
+             ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
+
+             MAX_LEN = 128
+             ids = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
+             input_ids = torch.tensor(ids)
+
+             self.model = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
+             with torch.no_grad():
+                 logits = self.model(input_ids).logits
+             predicted_token_class_ids = logits.argmax(-1)
+             i = 0
+             _predicted_tokens_classes = []
+             for a in predicted_token_class_ids:
+                 _predicted_tokens_classes.append([self.model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
+                 i = i + 1
+             labels = predicted_token_class_ids
+             loss = self.model(input_ids, labels=labels).loss
+
+             new_tokens = []
+             ig_tok = []
+             i = 0
+             new_identificadores = []
+             for item in tokenized_text:
+                 aux1, aux2 = self.reordenacion_tokens(item, "▁")
+                 new_tokens.append(aux1)
+                 ig_tok.append(aux2)
+
+             for items in _predicted_tokens_classes:
+                 aux = self.reordenacion_identificadores(ig_tok[i], items, len(new_tokens[i]))
+                 new_identificadores.append(aux)
+                 i = i + 1
+
+             return new_identificadores, new_tokens
+
+     ###
+     ### Combines the tokens generated from the input text with the predicted tokens to produce per-word labels
+     ###
+     def salida_texto(self, tokens, pre_tokens):
+         new_labels = []
+         current_word = None
+         i = 0
+         for token in tokens:
+             if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
+                 new_labels.append(' ' + token.replace('▁', '').replace('Ġ', ''))
+             else:
+                 new_labels.append(' ' + pre_tokens[i])
+             i = i + 1
+         a = ''
+         for i in new_labels:
+             a = a + i
+         return a
+
+     def salida_texto2(self, tokens, labels, etiquetas):
+         i = 0
+         out = []
+         for iden in labels:
+             if etiquetas:
+                 out.append(self.salida_texto(iden, np.array(tokens[i])))
+             else:
+                 out.append(self.salida_texto(iden, self.reemplazo_fake(np.array(tokens[i]))))
+             i = i + 1
+
+         return out
+
+     def unir_array(self, _out):
+         i = 0
+         salida = []
+         for item in _out:
+             salida.append("".join(str(x) for x in _out[i]))
+             i = i + 1
+         return salida
+
+     def unir_columna_valores(self, df, columna):
+         out = ','.join(df[columna])
+         return out
+
+ ###
+ ### Class for processing JSON files; receives the file
+ ###
+ class utilJSON:
+     def __init__(self, archivo):
+         with open(archivo, encoding='utf-8') as f:
+             self.data = json.load(f)
+
+     def obtener_keys_json(self, data):
+         out = []
+         for key in data:
+             out.append(key)
+         return out
+
+     ###
+     ### "flatten_json" function taken from https://levelup.gitconnected.com/a-deep-dive-into-nested-json-to-data-frame-with-python-69bdabb41938
+     ### Renu Khandelwal, Jul 23, 2023
+     ###
+     def flatten_json(self, y):
+         try:
+             out = {}
+
+             def flatten(x, name=''):
+                 if type(x) is dict:
+                     for a in x:
+                         flatten(x[a], name + a + '_')
+                 elif type(x) is list:
+                     i = 0
+                     for a in x:
+                         flatten(a, name + str(i) + '_')
+                         i += 1
+                 else:
+                     out[name[:-1]] = x
+
+             flatten(y)
+             return out
+         except json.JSONDecodeError:
+             print("Error: The JSON document could not be decoded.")
+         except TypeError:
+             print("Error: Invalid operation or function argument type.")
+         except KeyError:
+             print("Error: One or more keys do not exist.")
+         except ValueError:
+             print("Error: Invalid value detected.")
+         except Exception as e:
+             print(f"An unexpected error occurred: {str(e)}")
+
+     def obtener_dataframe(self, data):
+         claves = self.obtener_keys_json(data)
+
+         if len(claves) == 1:
+             data_flattened = [self.flatten_json(class_info) for class_info in data[claves[0]]]
+             df = pd.DataFrame(data_flattened)
+         else:
+             data_flattened = [self.flatten_json(class_info) for class_info in data]
+             df = pd.DataFrame(data_flattened)
+
+         return df
+
+ modelo = ModeloDataset()
+ model = Model()
+
+ def get_model():
+     return model
+
+ ###
+ ### Function that interacts with the Gradio interface to process text, csv or json input
+ ###
+ def procesar(texto, archivo, etiquetas):
+     if len(texto) > 0:
+         print('text')
+         model.identificacion_idioma(texto[:1700])
+         return model.idioma + "/" + model.categoria_texto, model.predict(etiquetas), gr.Dataframe(), gr.File()
+     else:
+         if archivo.name.split(".")[1] == "csv":
+             print('csv')
+             df = pd.read_csv(archivo.name, delimiter=";", encoding='latin-1')
+
+             df_new = pd.DataFrame(columns=df.columns.values)
+             model.identificacion_idioma(df.iloc[0][0])
+             modelo.idioma = model.idioma
+             print(model.idioma)
+             for item in df.columns.values:
+                 sentences = df[item]
+
+                 ides, predicted = modelo.aplicar_modelo(sentences, model.idioma, etiquetas)
+                 out = modelo.salida_texto2(ides, predicted, etiquetas)
+                 print('out es:', out)
+                 df_new[item] = modelo.unir_array(out)
+
+             return modelo.idioma, "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False)
+         else:
+             print('json')
+             if archivo.name.split(".")[1] == "json":
+                 util = utilJSON(archivo.name)
+                 df = util.obtener_dataframe(util.data)
+                 df_new = pd.DataFrame(columns=df.columns.values)
+
+                 model.identificacion_idioma(df.iloc[0][0])
+                 modelo.idioma = model.idioma
+
+                 for item in df.columns.values:
+                     sentences = df[item]
+
+                     ides, predicted = modelo.aplicar_modelo(sentences, modelo.idioma, etiquetas)
+                     out = modelo.salida_texto2(ides, predicted, etiquetas)
+
+                     print('out:', out)
+                     df_new[item] = modelo.unir_array(out)
+
+                 return modelo.idioma, "", df_new, df_new.to_csv(sep='\t', encoding='utf-8', index=False)
+
+ demo = gr.Interface(
+     fn=procesar,
+     inputs=["text", gr.File(), "checkbox"],
+     outputs=[
+         gr.Label(label="idioma/categoría"),
+         gr.Textbox(label="texto procesado"),
+         gr.Dataframe(label="Datos procesados en dataframe", interactive=False),
+         gr.Textbox(label="datos csv"),
+     ],
+ )
+
+ demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ torch
+ Faker
+ keras
+ tensorflow