Spaces: app modified

app.py CHANGED
@@ -295,8 +295,41 @@ class ModeloDataset:
                 x=x+1
             else:
                 x=x+1
-        return new_identificadores
-    def aplicar_modelo(self,_sentences,idioma):
+        return new_identificadores
+    def fake_pers(self):
+        return self.faker_.name()
+    def fake_word(self):
+        return self.faker_.word()
+    def fake_first_name(self):
+        return self.faker_.first_name()
+    def fake_last_name(self):
+        return self.faker_.last_name()
+    def fake_address(self):
+        return self.faker_.address()
+    def fake_sentence(self,n):
+        return self.faker_.sentence(nb_words=n)
+    def fake_text(self):
+        return self.faker_.text()
+    def fake_company(self):
+        return self.faker_.company()
+    def fake_city(self):
+        return self.faker_.city()
+    def reemplazo_fake(self,identificadores):
+        new_iden=[]
+        for id in identificadores:
+
+            if 'PER' in id:
+                new_iden.append(self.fake_first_name())
+
+            elif 'ORG' in id:
+                new_iden.append(self.fake_company())
+
+            elif 'LOC' in id:
+                new_iden.append(self.fake_city())
+            else:
+                new_iden.append(id)
+        return new_iden
+    def aplicar_modelo(self,_sentences,idioma, etiquetas):
         if idioma=="es":
             self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
             tokenized_text=[self.tokenizer.tokenize(sentence) for sentence in _sentences]
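The new fake_* helpers wrap a Faker instance (self.faker_, created elsewhere in the class), and reemplazo_fake maps predicted entity tags to fake surrogates: person tags become fake first names, organizations fake companies, locations fake cities, and anything else passes through unchanged. A minimal standalone sketch of that mapping, assuming the faker package; the es_ES locale is an assumption, since the Space's Faker setup is not part of this diff:

```python
# Standalone sketch of the tag-to-fake-value mapping reemplazo_fake
# implements; Faker("es_ES") is an assumed locale, not the Space's code.
from faker import Faker

faker_ = Faker("es_ES")

def reemplazo_fake(identificadores):
    new_iden = []
    for iden in identificadores:
        if 'PER' in iden:        # person tags -> fake first name
            new_iden.append(faker_.first_name())
        elif 'ORG' in iden:      # organization tags -> fake company
            new_iden.append(faker_.company())
        elif 'LOC' in iden:      # location tags -> fake city
            new_iden.append(faker_.city())
        else:                    # non-entity tags pass through
            new_iden.append(iden)
    return new_iden

print(reemplazo_fake(['B-PER', 'O', 'B-LOC']))
# e.g. ['Lucía', 'O', 'Sevilla'] -- fake values vary per run
```

Substituting surrogates instead of leaving placeholder tags keeps the output text readable while still removing the real identifiers.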
@@ -345,10 +378,7 @@ class ModeloDataset:
             i=i+1
         labels = predicted_token_class_ids
         loss = self.model(input_ids, labels=labels).loss
-
-
-
-
+
         new_tokens=[]
         ig_tok=[]
         i=0
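For context on the surrounding lines: aplicar_modelo loads the tokenizer with AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization") and derives predicted_token_class_ids from self.model. A minimal sketch of that tokenize-then-classify path, assuming the same checkpoint loads as a token-classification model; the wiring below is an assumption, not the Space's exact code:

```python
# Sketch of the tokenize -> classify path; model name taken from the
# diff, the rest is an assumed minimal wiring.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

name = "BSC-LT/roberta_model_for_anonimization"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name)

inputs = tokenizer("María vive en Madrid", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# one predicted class id per sub-token, mapped back to label strings
predicted_ids = logits.argmax(-1)
print([model.config.id2label[t.item()] for t in predicted_ids[0]])
```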
@@ -358,17 +388,14 @@ class ModeloDataset:
             aux1, aux2= self.reordenacion_tokens(item)
             new_tokens.append(aux1)
             ig_tok.append(aux2)
-
-        print('ig_tok',ig_tok)
+

         for items in _predicted_tokens_classes:
-
-
             aux=self.reordenacion_identificadores(ig_tok[i],items,len(new_tokens[i]))
             new_identificadores.append(aux)
             i=i+1
         print('new_identificadores:',new_identificadores, ' ',len(new_identificadores) )
-
+
         return new_identificadores, new_tokens
         #return ids, _predicted_tokens_classes
     def salida_texto_es( self,ids,pre_tokens):
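The bodies of reordenacion_tokens and reordenacion_identificadores are outside this diff; judging by how they are called (tokens regrouped into new_tokens, with ig_tok recording which positions were merged, then labels realigned to the regrouped tokens), they realign RoBERTa BPE sub-tokens with word-level labels. A hypothetical helper showing the usual pattern, not the Space's implementation:

```python
# Hypothetical sub-token regrouping: RoBERTa marks word-initial pieces
# with "Ġ"; continuation pieces are glued onto the previous word and
# their labels dropped, keeping one label per word.
def regroup_subtokens(tokens, labels, word_start="Ġ"):
    words, word_labels = [], []
    for tok, lab in zip(tokens, labels):
        if tok.startswith(word_start) or not words:
            words.append(tok.lstrip(word_start))
            word_labels.append(lab)
        else:
            words[-1] += tok
    return words, word_labels

tokens = ["ĠMaría", "ĠGarc", "ía", "Ġvive", "Ġen", "ĠMadrid"]
labels = ["B-PER", "B-PER", "I-PER", "O", "O", "B-LOC"]
print(regroup_subtokens(tokens, labels))
# (['María', 'García', 'vive', 'en', 'Madrid'],
#  ['B-PER', 'B-PER', 'O', 'O', 'B-LOC'])
```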
@@ -400,13 +427,15 @@ class ModeloDataset:
         for i in new_labels:
             a = a+i
         return a
-    def salida_texto2(self, tokens,labels):
+    def salida_texto2(self, tokens,labels,etiquetas):
         i=0
         out=[]
         for iden in labels:
             #if i<len(ids):
-
-
+            if etiquetas:
+                out.append(self.salida_texto( iden,np.array(tokens[i])))
+            else:
+                out.append(self.salida_texto(iden,self.reemplazo_fake(np.array(tokens[i]))))
             i=i+1

         return out
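The new etiquetas parameter selects between two output modes: True renders the text with the raw NER tags left visible, False first runs the values through reemplazo_fake so fake surrogates appear instead. A self-contained toy illustrating the switch; the rendering step is simplified to a join and the fake lookup is hard-coded, standing in for salida_texto and reemplazo_fake:

```python
# Toy illustration of the etiquetas switch, not the Space's code.
def render(tokens, tags, etiquetas):
    if etiquetas:
        # keep the tags visible in place of tagged tokens
        return " ".join(t if g == "O" else g for t, g in zip(tokens, tags))
    # otherwise substitute fake values for tagged tokens
    fakes = {"B-PER": "Lucía", "B-LOC": "Sevilla"}
    return " ".join(fakes.get(g, t) for t, g in zip(tokens, tags))

tokens = ["María", "vive", "en", "Madrid"]
tags   = ["B-PER", "O", "O", "B-LOC"]
print(render(tokens, tags, True))   # B-PER vive en B-LOC
print(render(tokens, tags, False))  # Lucía vive en Sevilla
```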
@@ -513,14 +542,14 @@ def procesar(texto,archivo, etiquetas):
             sentences=df[item]
             model.identificacion_idioma(sentences[0])

-            ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
+            ides, predicted = modelo.aplicar_modelo(sentences,model.idioma,etiquetas)

             if model.idioma=="es":
                 out=modelo.salida_texto2_es( ides,predicted)
                 print('out:',out)
                 df_new[item] = modelo.unir_array(out)
             else:
-                out=modelo.salida_texto2( ides,predicted)#tokens,labels
+                out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
                 print('out:',out)
                 df_new[item] = modelo.unir_array(out)
         return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)
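procesar now threads its etiquetas argument through to the model calls. The per-column flow is: detect the language of the first sentence, run aplicar_modelo over the column, render with salida_texto2/salida_texto2_es, and rebuild the DataFrame for TSV export. A self-contained sketch of that loop with a stub in place of the model pipeline; anonimiza is hypothetical and stands in for the aplicar_modelo -> salida_texto2 -> unir_array chain:

```python
# Stubbed sketch of procesar's per-column loop.
import pandas as pd

def anonimiza(sentence):
    # stand-in: the real pipeline replaces detected entities
    return sentence.replace("María", "Lucía").replace("Madrid", "Sevilla")

df = pd.DataFrame({"texto": ["María vive en Madrid"]})
df_new = pd.DataFrame()
for item in df.columns.values:      # same iteration as procesar
    sentences = df[item]
    df_new[item] = sentences.apply(anonimiza)

# procesar returns the TSV alongside the DataFrame
print(df_new.to_csv(sep='\t', encoding='utf-8', index=False))
```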
@@ -535,7 +564,7 @@ def procesar(texto,archivo, etiquetas):
         for item in df.columns.values:
             sentences=df[item]

-            ides, predicted = modelo.aplicar_modelo(sentences,"en")
-            out=modelo.salida_texto2( ides,predicted)#tokens,labels
+            ides, predicted = modelo.aplicar_modelo(sentences,"en",etiquetas)
+            out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
             print('out:',out)
             df_new[item] = modelo.unir_array(out)