app modified
app.py CHANGED
@@ -261,6 +261,29 @@ class ModeloDataset:
         self.modelo_ner=""
         self.categoria_texto=""
         self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+    def reordenacion_tokens(self,tokens):
+
+        i=0
+        new_tokens=[]
+        ig_tokens=[] # ignore these indices in the identifier array
+        for token in tokens:
+            ind=len(new_tokens)
+            if i<len(tokens):
+                if token.startswith("▁"):
+
+                    new_tokens.append(token)
+
+                    i=i+1
+                else:
+                    new_tokens[ind-1] = (new_tokens[ind-1] + token)
+                    ig_tokens.append(i)
+
+                    i=i+1
+        return (
+            new_tokens,
+            ig_tokens
+        )
+
     def aplicar_modelo(self,_sentences,idioma):
         if idioma=="es":
             self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
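The new reordenacion_tokens method rebuilds whole-word tokens from the SentencePiece pieces produced by the RoBERTa tokenizer: a piece starting with "▁" opens a new word, any other piece is glued onto the previous one, and the position of each glued piece is recorded in ig_tokens so those indices can later be ignored in the label array. Below is a minimal standalone sketch of that behavior (a simplified module-level equivalent written only for illustration, not the committed method; the function name and the sample token list are invented).

# Standalone sketch of the merging logic added above (illustrative only).
def merge_sentencepiece_tokens(tokens):
    i = 0
    new_tokens = []   # whole-word tokens after merging
    ig_tokens = []    # indices of pieces that were folded into the previous token
    for token in tokens:
        if token.startswith("▁"):   # "▁" marks the start of a new word in SentencePiece
            new_tokens.append(token)
        else:                       # continuation piece: glue it onto the last word
            new_tokens[-1] += token
            ig_tokens.append(i)
        i += 1
    return new_tokens, ig_tokens

# Hypothetical tokenizer output for "Juan vive en Madrid":
print(merge_sentencepiece_tokens(["▁Juan", "▁vi", "ve", "▁en", "▁Madrid"]))
# (['▁Juan', '▁vive', '▁en', '▁Madrid'], [2])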
@@ -311,8 +334,12 @@ class ModeloDataset:
             i=i+1
         labels = predicted_token_class_ids
         loss = self.model(input_ids, labels=labels).loss
-
-
+
+        new_tokens,ig_tokens=self.reordenacion_tokens(tokenized_text) #
+        new_identificadores = self.reordenacion_identificadores(ig_tokens,_predicted_tokens_classes)#
+
+        return new_identificadores,new_identificadores
+        #return ids, _predicted_tokens_classes
     def salida_texto( self,ids,pre_tokens):
         new_labels = []
         current_word = None
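With this change, aplicar_modelo returns the reordered identifiers twice, and reordenacion_identificadores itself is not part of this diff. Judging from the comment on ig_tokens, it presumably drops the labels at the merged-piece positions so that one label remains per rebuilt word. The sketch below is purely an assumed illustration of that idea, not the repository's implementation; the function name and behavior are guesses.

# Hypothetical illustration only: how per-piece labels could be filtered with ig_tokens.
# The real reordenacion_identificadores in app.py is not shown in this diff.
def filtrar_identificadores(ig_tokens, labels):
    ignorar = set(ig_tokens)
    # keep one label per merged word by skipping the continuation-piece positions
    return [label for i, label in enumerate(labels) if i not in ignorar]

# Continuing the example above: labels for ["▁Juan", "▁vi", "ve", "▁en", "▁Madrid"]
print(filtrar_identificadores([2], ["B-PER", "O", "O", "O", "B-LOC"]))
# ['B-PER', 'O', 'O', 'B-LOC']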
@@ -430,6 +457,8 @@ def procesar(texto,archivo, etiquetas):
         model.identificacion_idioma(sentences[0])
         print('sentences',sentences)
         ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
+
+
         out=modelo.salida_texto2( ides,predicted)
         print('out:',out)
         df_new[item] = modelo.unir_array(out)