dayannex committed on
Commit
a408f8a
·
1 Parent(s): ac86bd6

app modified

Browse files
Files changed (1) hide show
  1. app.py +31 -2
app.py CHANGED
@@ -261,6 +261,29 @@ class ModeloDataset:
261
  self.modelo_ner=""
262
  self.categoria_texto=""
263
  self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  def aplicar_modelo(self,_sentences,idioma):
265
  if idioma=="es":
266
  self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
@@ -311,8 +334,12 @@ class ModeloDataset:
311
  i=i+1
312
  labels = predicted_token_class_ids
313
  loss = self.model(input_ids, labels=labels).loss
314
- #print(round(loss.item(), 2))
315
- return ids, _predicted_tokens_classes
 
 
 
 
316
  def salida_texto( self,ids,pre_tokens):
317
  new_labels = []
318
  current_word = None
@@ -430,6 +457,8 @@ def procesar(texto,archivo, etiquetas):
430
  model.identificacion_idioma(sentences[0])
431
  print('sentences',sentences)
432
  ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
 
 
433
  out=modelo.salida_texto2( ides,predicted)
434
  print('out:',out)
435
  df_new[item] = modelo.unir_array(out)
 
261
  self.modelo_ner=""
262
  self.categoria_texto=""
263
  self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
264
def reordenacion_tokens(self, tokens):
    """Merge SentencePiece sub-word tokens back into whole-word tokens.

    A token that starts with the SentencePiece word-boundary marker "▁"
    opens a new word; any other token is a continuation piece and is
    glued onto the word started most recently.

    Args:
        tokens: list of sub-word token strings (SentencePiece style,
            e.g. as produced by the RoBERTa tokenizer used above).

    Returns:
        A tuple ``(new_tokens, ig_tokens)`` where
        ``new_tokens`` is the list of merged word-level tokens and
        ``ig_tokens`` holds the indices (into *tokens*) of the
        continuation pieces that were merged away — callers use them to
        drop the matching entries from the parallel identifier array.
    """
    new_tokens = []
    ig_tokens = []  # indices to skip in the parallel identifier array
    # enumerate replaces the original hand-maintained counter; the old
    # inner check `if i < len(tokens)` was always true and is dropped.
    for i, token in enumerate(tokens):
        if token.startswith("▁") or not new_tokens:
            # New word — also covers a leading continuation piece with
            # no "▁" marker, which previously crashed on an empty list.
            new_tokens.append(token)
        else:
            new_tokens[-1] += token
            ig_tokens.append(i)
    return (
        new_tokens,
        ig_tokens,
    )
286
+
287
  def aplicar_modelo(self,_sentences,idioma):
288
  if idioma=="es":
289
  self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
 
334
  i=i+1
335
  labels = predicted_token_class_ids
336
  loss = self.model(input_ids, labels=labels).loss
337
+
338
+ new_tokens,ig_tokens=self.reordenacion_tokens(tokenized_text) #
339
+ new_identificadores = self.reordenacion_identificadores(ig_tokens,_predicted_tokens_classes)#
340
+
341
+ return new_identificadores,new_identificadores
342
+ #return ids, _predicted_tokens_classes
343
  def salida_texto( self,ids,pre_tokens):
344
  new_labels = []
345
  current_word = None
 
457
  model.identificacion_idioma(sentences[0])
458
  print('sentences',sentences)
459
  ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
460
+
461
+
462
  out=modelo.salida_texto2( ides,predicted)
463
  print('out:',out)
464
  df_new[item] = modelo.unir_array(out)