app modified orden tokens
app.py CHANGED
@@ -61,7 +61,7 @@ class Model:
         self.modelo_ner="FacebookAI/xlm-roberta-large-finetuned-conll03-english"
         self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
         self.categorizar_texto(self.texto)
-    def reordenacion_tokens(self,tokens):
+    def reordenacion_tokens(self,tokens,caracter):
 
         i=0
         new_tokens=[]
@@ -69,7 +69,7 @@ class Model:
         for token in tokens:
             ind=len(new_tokens)
             if i<len(tokens):
-                if token.startswith(
+                if token.startswith(caracter):
 
                     new_tokens.append(token)
 
@@ -111,7 +111,7 @@ class Model:
         for token in tokens:
 
             if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i]:
-                new_labels.append(' ' +token.replace('▁',''))
+                new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
             else:
                 new_labels.append(' ' + pre_tokens[i])
             i=i+1
@@ -227,17 +227,21 @@ class Model:
 
         if (self.idioma=='es'):
 
-
+            new_tokens,ig_tokens=self.reordenacion_tokens(tokens,'Ġ')
+            new_identificadores = self.reordenacion_identificadores(ig_tokens,predicted_tokens_classes)
+
             out1 = self.salida_json(tokens,predicted_tokens_classes) #spanish solo palabras sensibles
 
             if etiquetas:
-                out2 = self.
-
-
+                out2 = self.salida_texto(new_tokens,new_identificadores)#solo identificadores
+                #out2 = self.salida_texto_anonimizado(ids,predicted_tokens_classes) #solo identificadores
+            else:
+                out2 = self.salida_texto(new_tokens,self.reemplazo_fake(new_identificadores))
+                #out2 = self.salida_texto_anonimizado(ids,self.reemplazo_fake(predicted_tokens_classes)) #español texto completo
 
         else:
 
-            new_tokens,ig_tokens=self.reordenacion_tokens(tokens)
+            new_tokens,ig_tokens=self.reordenacion_tokens(tokens,'_')
             new_identificadores = self.reordenacion_identificadores(ig_tokens,predicted_tokens_classes)
 
 
@@ -264,7 +268,7 @@ class ModeloDataset:
         self.modelo_ner=""
         self.categoria_texto=""
         self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
-    def reordenacion_tokens(self,tokens):
+    def reordenacion_tokens(self,tokens,caracter):
 
         i=0
         new_tokens=[]
@@ -272,7 +276,7 @@ class ModeloDataset:
         for token in tokens:
             ind=len(new_tokens)
             if i<len(tokens):
-                if token.startswith(
+                if token.startswith(caracter):
 
                     new_tokens.append(token)
 
@@ -394,7 +398,7 @@ class ModeloDataset:
         new_identificadores=[]
        for item in tokenized_text:
 
-            aux1, aux2= self.reordenacion_tokens(item)
+            aux1, aux2= self.reordenacion_tokens(item,"_")
            new_tokens.append(aux1)
            ig_tok.append(aux2)
 
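The recurring change in this commit is that `reordenacion_tokens` now takes the sub-word prefix as a parameter (`caracter`), so the same regrouping routine can serve tokenizers that mark word starts differently: SentencePiece-style pieces begin with '▁', while byte-level BPE tokenizers such as RoBERTa's use 'Ġ'; the label cleanup likewise strips both markers. Below is a minimal sketch of that idea under those assumptions, not the repository's exact code; the function name `regroup_subword_tokens` and the example tokens are hypothetical.

# Sketch only: regroup subword tokens into words using a configurable
# word-start prefix, which is what the new `caracter` argument enables.
def regroup_subword_tokens(tokens, prefix):
    """Merge subword pieces into words.

    A token that starts with `prefix` opens a new word; any other token is
    appended to the previous word. Also returns, for each original token,
    the index of the word it was folded into, so per-token labels can be
    realigned with the merged words afterwards.
    """
    words = []
    owner = []  # owner[i] = index of the word that token i belongs to
    for token in tokens:
        if token.startswith(prefix) or not words:
            words.append(token.replace(prefix, ''))
        else:
            words[-1] += token
        owner.append(len(words) - 1)
    return words, owner

# Hypothetical RoBERTa-style ('Ġ') input:
tokens = ['ĠMaria', 'ĠLo', 'pez', 'Ġvive', 'Ġen', 'ĠMadrid']
words, owner = regroup_subword_tokens(tokens, 'Ġ')
print(words)  # ['Maria', 'Lopez', 'vive', 'en', 'Madrid']
print(owner)  # [0, 1, 1, 2, 3, 4]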