dayannex committed
Commit 6f1e876 · 1 Parent(s): f97dfd6

app modified

Files changed (1): app.py (+46, -17)
app.py CHANGED
@@ -295,8 +295,41 @@ class ModeloDataset:
                x=x+1
            else:
                x=x+1
-        return new_identificadores
-    def aplicar_modelo(self,_sentences,idioma):
+        return new_identificadores
+    def fake_pers(self):
+        return self.faker_.name(self)
+    def fake_word(self):
+        return self.faker_.word()
+    def fake_first_name(self):
+        return self.faker_.first_name()
+    def fake_last_name(self):
+        return self.faker_.last_name()
+    def fake_address(self):
+        return self.faker_.address()
+    def fake_sentence(self,n):
+        return self.faker_.sentence(nb_words=n)
+    def fake_text(self):
+        return self.faker_.text()
+    def fake_company(self):
+        return self.faker_.company()
+    def fake_city(self):
+        return self.faker_.city()
+    def reemplazo_fake(self,identificadores):
+        new_iden=[]
+        for id in identificadores:
+
+            if 'PER' in id:
+                new_iden.append(self.fake_first_name())
+
+            elif 'ORG' in id:
+                new_iden.append(self.fake_company())
+
+            elif 'LOC' in id:
+                new_iden.append(self.fake_city())
+            else:
+                new_iden.append(id)
+        return new_iden
+    def aplicar_modelo(self,_sentences,idioma, etiquetas):
        if idioma=="es":
            self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
            tokenized_text=[self.tokenizer.tokenize(sentence) for sentence in _sentences]
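
The new fake_* helpers wrap a Faker generator (self.faker_ is assumed to be a Faker() instance created elsewhere in app.py; its initialization is not part of this diff), and reemplazo_fake swaps every label containing PER, ORG or LOC for a synthetic first name, company or city. A minimal standalone sketch of that replacement step; note that fake_pers passes self to faker_.name(), which Faker's name() does not accept, so the sketch calls the providers without arguments:

from faker import Faker

faker_ = Faker()  # stand-in for self.faker_, whose creation is not shown in this diff

def reemplazo_fake(identificadores):
    # Replace recognized entity labels with synthetic values; keep everything else.
    new_iden = []
    for iden in identificadores:
        if 'PER' in iden:
            new_iden.append(faker_.first_name())
        elif 'ORG' in iden:
            new_iden.append(faker_.company())
        elif 'LOC' in iden:
            new_iden.append(faker_.city())
        else:
            new_iden.append(iden)
    return new_iden

print(reemplazo_fake(['B-PER', 'O', 'B-LOC']))
# e.g. ['Laura', 'O', 'Port Davidmouth'] -- the values are random on every run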
@@ -345,10 +378,7 @@ class ModeloDataset:
            i=i+1
        labels = predicted_token_class_ids
        loss = self.model(input_ids, labels=labels).loss
-
-
-
-
+
        new_tokens=[]
        ig_tok=[]
        i=0
@@ -358,17 +388,14 @@
            aux1, aux2= self.reordenacion_tokens(item)
            new_tokens.append(aux1)
            ig_tok.append(aux2)
-            print('new_tokens:',new_tokens,' ' ,len(new_tokens))
-            print('ig_tok',ig_tok)
+

        for items in _predicted_tokens_classes:
-
-
            aux=self.reordenacion_identificadores(ig_tok[i],items,len(new_tokens[i]))
            new_identificadores.append(aux)
            i=i+1
        print('new_identificadores:',new_identificadores, ' ',len(new_identificadores) )
-
+
        return new_identificadores, new_tokens
        #return ids, _predicted_tokens_classes
    def salida_texto_es( self,ids,pre_tokens):
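
reordenacion_tokens and reordenacion_identificadores are defined outside this hunk; from the calls above they appear to regroup sub-word pieces into whole words and drop the predicted classes that belonged to the merged pieces. A generic, hypothetical sketch of that kind of realignment (using BERT-style '##' continuation markers purely for illustration; the BSC-LT RoBERTa tokenizer marks word boundaries differently):

def regroup(pieces, labels):
    # Merge '##' continuation pieces into the previous word and keep only the
    # label of each word's first piece.
    words, word_labels = [], []
    for piece, label in zip(pieces, labels):
        if piece.startswith('##') and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
            word_labels.append(label)
    return words, word_labels

print(regroup(['Mar', '##ía', 'vive', 'en', 'Bogotá'],
              ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']))
# (['María', 'vive', 'en', 'Bogotá'], ['B-PER', 'O', 'O', 'B-LOC'])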
@@ -400,13 +427,15 @@ class ModeloDataset:
        for i in new_labels:
            a = a+i
        return a
-    def salida_texto2(self, tokens,labels):
+    def salida_texto2(self, tokens,labels,etiquetas):
        i=0
        out=[]
        for iden in labels:
            #if i<len(ids):
-
-            out.append(self.salida_texto( iden,np.array(tokens[i])) )
+            if etiquetas:
+                out.append(self.salida_texto( iden,np.array(tokens[i])))
+            else:
+                out.append(self.salida_texto(iden,self.reemplazo_fake(np.array(tokens[i]))))
            i=i+1

        return out
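
salida_texto2 now receives the etiquetas flag: when it is truthy, salida_texto gets the raw per-sentence label arrays, and when it is falsy they are first run through reemplazo_fake so every PER/ORG/LOC tag is already replaced by a synthetic value before the text is rebuilt. (Per the call in procesar below, the tokens parameter actually receives the label arrays returned as new_identificadores and labels receives the regrouped tokens, which is why reemplazo_fake is applied to tokens[i].) A hypothetical usage sketch of the two modes:

# ides = per-sentence entity labels, predicted = regrouped tokens,
# as returned by aplicar_modelo in the hunks above.
out_labeled = modelo.salida_texto2(ides, predicted, True)   # keep B-PER / B-ORG / B-LOC tags in the output
out_anonym  = modelo.salida_texto2(ides, predicted, False)  # tags replaced with Faker values via reemplazo_fake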
@@ -513,14 +542,14 @@ def procesar(texto,archivo, etiquetas):
        sentences=df[item]
        model.identificacion_idioma(sentences[0])

-        ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
+        ides, predicted = modelo.aplicar_modelo(sentences,model.idioma,etiquetas)

        if model.idioma=="es":
            out=modelo.salida_texto2_es( ides,predicted)
            print('out:',out)
            df_new[item] = modelo.unir_array(out)
        else:
-            out=modelo.salida_texto2( ides,predicted)#tokens,labels
+            out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
            print('out:',out)
            df_new[item] = modelo.unir_array(out)
    return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)
@@ -535,7 +564,7 @@ def procesar(texto,archivo, etiquetas):
    for item in df.columns.values:
        sentences=df[item]

-        ides, predicted = modelo.aplicar_modelo(sentences,"en")
+        ides, predicted = modelo.aplicar_modelo(sentences,"en",etiquetas)
        out=modelo.salida_texto2( ides,predicted)#tokens,labels
        print('out:',out)
        df_new[item] = modelo.unir_array(out)
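
This English-file branch forwards etiquetas to aplicar_modelo, but the salida_texto2 call that follows it still passes only two arguments; with the new three-parameter signature that call would raise a TypeError. A hypothetical fix, not part of this commit, would forward the same flag:

out = modelo.salida_texto2(ides, predicted, etiquetas)  # hypothetical: pass the flag through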
 