update app
app.py
CHANGED
@@ -249,18 +249,101 @@ class Model:
 
 
 )
+class ModeloDataset:
+    def __init__(self):
+        self.texto=""
+        self.idioma=""
+        self.modelo_ner=""
+        self.categoria_texto=""
+
+    def aplicar_modelo(self,_sentences):
+
+        tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+        tokenized_text=[tokenizer.tokenize(sentence) for sentence in _sentences]
+
+        ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
+        MAX_LEN=128
+        ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
+        input_ids = torch.tensor(ids)
+        model = RobertaForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+
+
+        model = AutoModelForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+        with torch.no_grad():
+            logits = model(input_ids).logits
+        predicted_token_class_ids = logits.argmax(-1)
+        i=0
+        _predicted_tokens_classes=[]
+        for a in predicted_token_class_ids:
+            #_predicted_tokens_classes[i]=[model.config.id2label[t.item()] for t in predicted_token_class_ids[i]]
+            _predicted_tokens_classes.append([model.config.id2label[t.item()] for t in predicted_token_class_ids[i]])
+            i=i+1
+        labels = predicted_token_class_ids
+        loss = model(input_ids, labels=labels).loss
+        #print(round(loss.item(), 2))
+
+        return ids, _predicted_tokens_classes
+    def salida_texto( self,ids,pre_tokens):
+        new_labels = []
+        current_word = None
+        i=0
+        for identificador in pre_tokens:
+            if (tokenizer.decode(ids[i])!="<s>"):
+                if identificador=='O':
+
+                    new_labels.append(tokenizer.decode(ids[i]))
+                else:
+                    new_labels.append(' ' + identificador)
+            i=i+1
+
+        return new_labels
+
+    def salida_texto2(self, ids,pre_tokens):
+        i=0
+        out=[]
+        for iden in pre_tokens:
+            if i<len(ids):
+
+                out.append(salida_texto( ids[i],np.array(_predicted_tokens_classes[i])) )
+            i=i+1
+
+        return out
+    def unir_array(self,_out):
+        i=0
+        salida=[]
+        for item in _out:
+            salida.append("".join(str(x) for x in _out[i]))
+            i=i+1
+        return salida
+    def unir_columna_valores(self,df,columna):
+        out = ','.join(df[columna])
+        return out
+modelo = ModeloDataset()
 model = Model()
 def get_model():
     return model
 
 def procesar(texto,archivo):
-
-
-
-
-
-
-
+    if len(texto)>0:
+        model.identificacion_idioma(texto)
+        return model.predict()
+    else:
+        df=pd.read_csv(archivo.name,delimiter=",")
+        print(archivo.name)
+        df_new = pd.DataFrame(data, columns=df.columns.values)
+        print(df_new)
+        for item in df.columns.values:
+            sentences=df[item]
+            ides, predicted = modelo.aplicar_modelo(sentences)
+            out=modelo.salida_texto2( ides,predicted)
+
+            df_new[item] = modelo.unir(out)
+        plotting_df=gr.Dataframe(value=df_new,headers=["nombre","trabajo"],label="label:",type="pandas", visible=True, interactive=False)
+        print(df_new)
+        return plotting_df
 
 demo = gr.Interface(fn=procesar,inputs=["text",gr.File()] , outputs="text")
-demo.launch(share=True)
+demo.launch(share=True)
+
+#plotting_df=gr.Dataframe(value=df2,headers="class","type","group","ε54Cr","ε50Ti","ε94Mo"],
+# label="CosmoPlot Compilation:",type="pandas", visible=True, interactive=False)
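
For reference, the inference that the new `ModeloDataset.aplicar_modelo` method performs can be reduced to a short, self-contained sketch. This is not the committed code: the helper name `predecir_etiquetas` is illustrative, the tokenizer's own padding/truncation stands in for the Keras `pad_sequences` call, and the checkpoint is loaded once rather than twice. It assumes the `BSC-LT/roberta_model_for_anonimization` checkpoint referenced in the diff is reachable.

```python
# Minimal sketch of the token-classification step behind aplicar_modelo.
# Assumptions: torch + transformers installed; predecir_etiquetas is an
# illustrative name, not part of the commit.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

MODEL_ID = "BSC-LT/roberta_model_for_anonimization"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

def predecir_etiquetas(sentences, max_len=128):
    # Tokenize the whole batch; padding/truncation here replaces pad_sequences.
    enc = tokenizer(list(sentences), padding="max_length", truncation=True,
                    max_length=max_len, return_tensors="pt")
    with torch.no_grad():
        logits = model(**enc).logits          # (batch, seq_len, num_labels)
    pred_ids = logits.argmax(-1)              # most likely label id per token
    # Map label ids to label strings ("O", "B-PER", ...) per sentence.
    return [[model.config.id2label[t.item()] for t in row] for row in pred_ids]
```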
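
`salida_texto` rebuilds each sentence from its tokens, keeping tokens tagged `O` and replacing anything else with its label. Below is a hedged sketch of that substitution as a standalone function; the name `anonimizar`, the special-token filtering, and the handling of RoBERTa's byte-level BPE word-boundary marker `Ġ` are assumptions, not part of the commit.

```python
def anonimizar(tokens, etiquetas):
    # Keep tokens labelled "O", replace entity tokens with their label,
    # and drop RoBERTa special tokens. "Ġ" marks a word boundary in the
    # byte-level BPE vocabulary, so it is turned back into a space.
    piezas = []
    for tok, etiqueta in zip(tokens, etiquetas):
        if tok in ("<s>", "</s>", "<pad>"):
            continue
        if etiqueta == "O":
            piezas.append(tok.replace("Ġ", " "))
        else:
            piezas.append(" " + etiqueta)
    return "".join(piezas).strip()

# e.g. anonimizar(["<s>", "ĠJuan", "Ġvive", "Ġaquí", "</s>"],
#                 ["O", "B-PER", "O", "O", "O"])  ->  "B-PER vive aquí"
```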
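
The new `procesar` returns a `gr.Dataframe` component in the CSV branch while the interface still declares `outputs="text"`, so the table would be coerced to text. A possible wiring is sketched below, assuming a pandas DataFrame fed to a Dataframe output is the intent; `procesar_csv` is an illustrative name, and the upload is read via `archivo.name` as in the commit, which matches Gradio 3.x `gr.File` behaviour.

```python
import gradio as gr
import pandas as pd

def procesar_csv(texto, archivo):
    # Text branch: wrap the single result in a one-row table.
    if texto:
        return pd.DataFrame({"texto": [texto]})
    # CSV branch: read the upload and return a DataFrame; in the real app the
    # per-column anonymisation output would replace this plain copy.
    df = pd.read_csv(archivo.name, delimiter=",")
    return df.copy()

demo = gr.Interface(fn=procesar_csv,
                    inputs=["text", gr.File()],
                    outputs=gr.Dataframe(type="pandas"))
# demo.launch(share=True)
```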