# NOTE(review): this file reached review with its line breaks collapsed and with
# a span of text missing inside Model.reordenacion_tokens (see the note there).
# The layout below only reflows the surviving tokens onto separate lines; no
# code token has been added, removed or changed.
import gradio as gr
import torch
import json
import pandas as pd
import numpy as np
# NOTE(review): AutoTokenizer is imported three times by the next three lines.
from transformers import AutoTokenizer, RobertaForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from json import JSONEncoder
from faker import Faker
from keras.utils import pad_sequences


class out_json():
    """Plain record that stores a word together with its label."""

    def __init__(self, w,l):
        self.word = w
        self.label = l


class MyEncoder(JSONEncoder):
    """JSON encoder whose fallback serializes an object as its ``__dict__``."""

    def default(self, o):
        return o.__dict__


class Model:
    """Holds the input text, its detected language and the NER model chosen for it."""

    def __init__(self):
        self.texto=""
        self.idioma=""
        self.modelo_ner=""
        self.categoria_texto=""

    def identificacion_idioma(self,text):
        """Detect the language of ``text`` and load the matching NER model.

        Stores ``text`` in ``self.texto``, classifies it with
        ``papluca/xlm-roberta-base-language-detection`` and then sets
        ``self.idioma``, ``self.modelo_ner``, ``self.faker_`` and ``self.model``:
        the Spanish configuration when the top label is ``'es'``, the English
        configuration for every other label. Ends by calling
        ``self.categorizar_texto``, which is not defined in the surviving text.
        """
        self.texto=text
        # The tokenizer and the classifier are loaded again on every call.
        tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
        model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
        inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        preds = torch.softmax(logits, dim=-1)
        id2lang = model.config.id2label
        vals, idxs = torch.max(preds, dim=1)
        # Returns the language with the highest percentage.
        maximo=vals.max()
        idioma=''
        porcentaje=0
        # Keep the label whose score equals the overall maximum; `porcentaje`
        # is assigned here but not read again in this method.
        for k, v in zip(idxs, vals):
            if v.item()==maximo:
                idioma,porcentaje=id2lang[k.item()],v.item()
        if idioma=='es':
            self.idioma="es"
            self.modelo_ner='BSC-LT/roberta_model_for_anonimization'
            self.faker_ = Faker('es_MX')
            self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
        else:
            # Any label other than 'es' falls back to the English setup.
            self.idioma="en"
            self.faker_ = Faker('en_US')
            self.modelo_ner="FacebookAI/xlm-roberta-large-finetuned-conll03-english"
            self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
        # NOTE(review): placed after the if/else on the assumption that it runs
        # for both languages -- confirm against the original layout.
        self.categorizar_texto(self.texto)

    def reordenacion_tokens(self,tokens):
        i=0
        new_tokens=[]
        ig_tokens=[] # ignore these indices of the identifier array
        for token in tokens:
            ind=len(new_tokens)
            # NOTE(review): SOURCE TEXT IS MISSING HERE. The surviving text reads
            # `if i0:`, which looks like the beginning of one condition (`if i`)
            # fused with the end of a different one (`0:`); possibly everything
            # from a '<' up to a later '>' was stripped -- confirm against the
            # original file. The remainder of this method, whatever followed it
            # in the file, and the header of the function that the statements
            # below belong to are all absent. These statements use `texto`,
            # `archivo`, `etiquetas`, `model`, `modelo` and `utilJSON`, none of
            # which is defined in the surviving text. They are nested here only
            # because that is where the surviving tokens place them; restore the
            # missing span before relying on any of this.
            if i0:
                print('text')
                model.identificacion_idioma(texto)
                return model.predict(etiquetas),gr.Dataframe(),gr.File()
            else:
                # NOTE(review): split(".")[1] takes the second dot-separated
                # piece of the whole path, not the final extension; a path with
                # any other dot in it will not compare equal to "csv"/"json".
                if archivo.name.split(".")[1]=="csv":
                    print('csv')
                    df=pd.read_csv(archivo.name,delimiter=",")
                    df_new = pd.DataFrame( columns=df.columns.values)
                    for item in df.columns.values:
                        sentences=df[item]
                        # The language is detected from the first cell of the column only.
                        model.identificacion_idioma(sentences[0])
                        ides, predicted = modelo.aplicar_modelo(sentences,model.idioma,etiquetas)
                        if model.idioma=="es":
                            out=modelo.salida_texto2_es( ides,predicted)
                            print('out:',out)
                            df_new[item] = modelo.unir_array(out)
                        else:
                            out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
                            print('out:',out)
                            df_new[item] = modelo.unir_array(out)
                    return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)
                    #return "", df_new, df_new.to_excel( index=False)
                else:
                    print('json')
                    if archivo.name.split(".")[1]=="json":
                        util = utilJSON(archivo.name)
                        df=util.obtener_dataframe(util.data)
                        df_new = pd.DataFrame( columns=df.columns.values)
                        for item in df.columns.values:
                            sentences=df[item]
                            # NOTE(review): this branch always passes "en" and
                            # calls salida_texto2 with two arguments, while the
                            # CSV branch detects the language and passes
                            # `etiquetas` as a third -- confirm against the
                            # (not visible) signature whether that is intended.
                            ides, predicted = modelo.aplicar_modelo(sentences,"en",etiquetas)
                            out=modelo.salida_texto2( ides,predicted)#tokens,labels
                            print('out:',out)
                            df_new[item] = modelo.unir_array(out)
                        #out=modelo.salida_texto2( ides,predicted)
                        #df_new[item] = modelo.unir_array(out)
                        #return "", df, df.to_csv(sep='\t', encoding='utf-8',index=False)
                        return "", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False)


# Gradio UI: a text box, a file upload and a checkbox as inputs; a text box, a
# read-only dataframe and a second text box as outputs.
# NOTE(review): `procesar` has no `def` in the surviving text (see the note in
# Model.reordenacion_tokens). The third output is declared as "text", while the
# statements above return `gr.File()` in one branch and a CSV string in the
# others -- confirm the intended output component.
demo = gr.Interface(fn=procesar,inputs=["text",gr.File(), "checkbox"] , outputs=["text",gr.Dataframe(interactive=False),"text"])
# demo.launch(share=True)