Spaces:
Sleeping
Sleeping
app modified
Browse files
app.py
CHANGED
@@ -318,7 +318,7 @@ class ModeloDataset:
|
|
318 |
i=i+1
|
319 |
labels = predicted_token_class_ids
|
320 |
loss = self.model(input_ids, labels=labels).loss
|
321 |
-
|
322 |
else:
|
323 |
|
324 |
print('idioma:',idioma)
|
@@ -369,22 +369,22 @@ class ModeloDataset:
|
|
369 |
i=i+1
|
370 |
print('new_identificadores:',new_identificadores, ' ',len(new_identificadores) )
|
371 |
|
372 |
-
|
373 |
#return ids, _predicted_tokens_classes
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
def salida_texto( self,tokens,pre_tokens):
|
389 |
new_labels = []
|
390 |
current_word = None
|
@@ -410,16 +410,16 @@ class ModeloDataset:
|
|
410 |
i=i+1
|
411 |
|
412 |
return out
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
def unir_array(self,_out):
|
424 |
i=0
|
425 |
salida=[]
|
@@ -515,10 +515,14 @@ def procesar(texto,archivo, etiquetas):
|
|
515 |
#print('sentences',sentences)
|
516 |
ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
|
517 |
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
|
|
|
|
|
|
|
|
522 |
return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)
|
523 |
#return "", df_new, df_new.to_excel( index=False)
|
524 |
else:
|
|
|
318 |
i=i+1
|
319 |
labels = predicted_token_class_ids
|
320 |
loss = self.model(input_ids, labels=labels).loss
|
321 |
+
return ids,_predicted_tokens_classes
|
322 |
else:
|
323 |
|
324 |
print('idioma:',idioma)
|
|
|
369 |
i=i+1
|
370 |
print('new_identificadores:',new_identificadores, ' ',len(new_identificadores) )
|
371 |
|
372 |
+
return new_identificadores, new_tokens
|
373 |
#return ids, _predicted_tokens_classes
|
374 |
+
def salida_texto_es(self, ids, pre_tokens):
    """Build the output pieces for one sequence of token ids and labels.

    Each token id is paired with the label at the same position. The
    token is decoded with ``self.tokenizer.decode`` and then:

    * if the decoded text is ``"<s>"`` the position is skipped;
    * if the label is ``'O'`` the decoded text is kept as-is;
    * otherwise the label itself, prefixed with a space, is emitted.

    Args:
        ids: Sequence of token ids for a single input.
        pre_tokens: Sequence of predicted labels, position-aligned
            with ``ids``.

    Returns:
        list: The kept decoded tokens and substituted labels, in order.
    """
    new_labels = []
    # zip stops at the shorter input, so a label sequence longer than
    # ``ids`` no longer raises IndexError.
    for token_id, identificador in zip(ids, pre_tokens):
        # Decode once per position; the result is needed for both the
        # "<s>" check and the output.
        texto = self.tokenizer.decode(token_id)
        if texto == "<s>":
            continue
        if identificador == 'O':
            new_labels.append(texto)
        else:
            new_labels.append(' ' + identificador)
    return new_labels
|
388 |
def salida_texto( self,tokens,pre_tokens):
|
389 |
new_labels = []
|
390 |
current_word = None
|
|
|
410 |
i=i+1
|
411 |
|
412 |
return out
|
413 |
+
def salida_texto2_es(self, ids, pre_tokens):
    """Apply ``salida_texto_es`` to every (ids, labels) pair of a batch.

    The i-th entry of ``ids`` is paired with the i-th entry of
    ``pre_tokens``; the labels are converted to a NumPy array before
    being passed on. Pairing stops at the shorter of the two inputs,
    so surplus entries on either side are ignored.

    Args:
        ids: Sequence whose items are the token ids of one input each.
        pre_tokens: Sequence whose items are the predicted labels of
            one input each.

    Returns:
        list: One ``salida_texto_es`` result per paired input, in order.
    """
    return [
        self.salida_texto_es(fila_ids, np.array(fila_etiquetas))
        for fila_ids, fila_etiquetas in zip(ids, pre_tokens)
    ]
|
423 |
def unir_array(self,_out):
|
424 |
i=0
|
425 |
salida=[]
|
|
|
515 |
#print('sentences',sentences)
|
516 |
ides, predicted = modelo.aplicar_modelo(sentences,model.idioma)
|
517 |
|
518 |
+
if model.idioma=="es":
|
519 |
+
out=modelo.salida_texto2_es( ides,predicted)
|
520 |
+
print('out:',out)
|
521 |
+
df_new[item] = modelo.unir_array(out)
|
522 |
+
else:
|
523 |
+
out=modelo.salida_texto2( ides,predicted)
|
524 |
+
print('out:',out)
|
525 |
+
df_new[item] = modelo.unir_array(out)
|
526 |
return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)
|
527 |
#return "", df_new, df_new.to_excel( index=False)
|
528 |
else:
|