dayannex committed
Commit 6633793 · 1 Parent(s): 0895315

app.py: modified token ordering

Files changed (1)
  1. app.py +15 -11
app.py CHANGED
@@ -61,7 +61,7 @@ class Model:
         self.modelo_ner="FacebookAI/xlm-roberta-large-finetuned-conll03-english"
         self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
         self.categorizar_texto(self.texto)
-    def reordenacion_tokens(self,tokens):
+    def reordenacion_tokens(self,tokens,caracter):
 
         i=0
         new_tokens=[]
@@ -69,7 +69,7 @@ class Model:
         for token in tokens:
             ind=len(new_tokens)
             if i<len(tokens):
-                if token.startswith("▁"):
+                if token.startswith(caracter):
 
                     new_tokens.append(token)
 
@@ -111,7 +111,7 @@ class Model:
         for token in tokens:
 
             if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i]:
-                new_labels.append(' ' +token.replace('▁',''))
+                new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
             else:
                 new_labels.append(' ' + pre_tokens[i])
             i=i+1
@@ -227,17 +227,21 @@ class Model:
 
         if (self.idioma=='es'):
 
-
+            new_tokens,ig_tokens=self.reordenacion_tokens(tokens,'Ġ')
+            new_identificadores = self.reordenacion_identificadores(ig_tokens,predicted_tokens_classes)
+
             out1 = self.salida_json(tokens,predicted_tokens_classes) #Spanish: sensitive words only
 
             if etiquetas:
-                out2 = self.salida_texto_anonimizado(ids,predicted_tokens_classes) #identifiers only
-            else:
-                out2 = self.salida_texto_anonimizado(ids,self.reemplazo_fake(predicted_tokens_classes)) #Spanish: full text
+                out2 = self.salida_texto(new_tokens,new_identificadores) #identifiers only
+                #out2 = self.salida_texto_anonimizado(ids,predicted_tokens_classes) #identifiers only
+            else:
+                out2 = self.salida_texto(new_tokens,self.reemplazo_fake(new_identificadores))
+                #out2 = self.salida_texto_anonimizado(ids,self.reemplazo_fake(predicted_tokens_classes)) #Spanish: full text
 
         else:
 
-            new_tokens,ig_tokens=self.reordenacion_tokens(tokens)
+            new_tokens,ig_tokens=self.reordenacion_tokens(tokens,'_')
             new_identificadores = self.reordenacion_identificadores(ig_tokens,predicted_tokens_classes)
 
 
@@ -264,7 +268,7 @@ class ModeloDataset:
         self.modelo_ner=""
         self.categoria_texto=""
         self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
-    def reordenacion_tokens(self,tokens):
+    def reordenacion_tokens(self,tokens,caracter):
 
         i=0
         new_tokens=[]
@@ -272,7 +276,7 @@ class ModeloDataset:
         for token in tokens:
             ind=len(new_tokens)
             if i<len(tokens):
-                if token.startswith("▁"):
+                if token.startswith(caracter):
 
                     new_tokens.append(token)
 
@@ -394,7 +398,7 @@ class ModeloDataset:
         new_identificadores=[]
         for item in tokenized_text:
 
-            aux1, aux2= self.reordenacion_tokens(item)
+            aux1, aux2= self.reordenacion_tokens(item,"_")
            new_tokens.append(aux1)
            ig_tok.append(aux2)
 
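For context: reordenacion_tokens regroups subword pieces into whole words, and the new caracter argument exists because the two tokenizers involved mark word starts differently. XLM-RoBERTa's SentencePiece vocabulary prefixes word-initial pieces with '▁', while byte-level BPE vocabularies such as the one behind BSC-LT/roberta_model_for_anonimization use 'Ġ'. A minimal sketch of the idea, with illustrative names rather than the app's actual code:

# Minimal sketch (illustrative, not the app's exact code): regroup subword
# tokens into whole words using a configurable word-start marker, and keep
# the index of each word's first piece so one label can stand in for the
# whole word.
def regroup_tokens(tokens, marker):
    words = []      # reassembled words
    first_idx = []  # index of the first subword of each word
    for i, token in enumerate(tokens):
        if token.startswith(marker) or not words:
            words.append(token.replace(marker, ''))
            first_idx.append(i)
        else:
            words[-1] += token  # continuation piece: glue onto current word
    return words, first_idx

# SentencePiece (XLM-RoBERTa) vs. byte-level BPE (RoBERTa) word markers:
print(regroup_tokens(['▁Hola', '▁mun', 'do'], '▁'))  # (['Hola', 'mundo'], [0, 1])
print(regroup_tokens(['ĠHola', 'Ġmun', 'do'], 'Ġ'))  # (['Hola', 'mundo'], [0, 1])

Keeping the index of each word's first subword is presumably what lets reordenacion_identificadores pick one predicted class per word before out2 is rendered with salida_texto.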