dayannex committed
Commit beb281e · 1 Parent(s): 59c1049

update app

Files changed (1)
  1. app.py +91 -8
app.py CHANGED
@@ -249,18 +249,101 @@ class Model:
 
 
  )
model = Model()
def get_model():
    return model
 
def procesar(texto,archivo):
-    print(texto)
-    print(archivo.name)
-    df=pd.read_csv(archivo.name,delimiter=",")
-    print(df["nombre"])
-    model.identificacion_idioma(texto)
-
-    return model.predict()
 
demo = gr.Interface(fn=procesar,inputs=["text",gr.File()] , outputs="text")
-demo.launch(share=True)
 
 
  )
+class ModeloDataset:
+    def __init__(self):
+        self.texto=""
+        self.idioma=""
+        self.modelo_ner=""
+        self.categoria_texto=""
+        self.tokenizer=None
+
+    def aplicar_modelo(self,_sentences):
+        # Tokenize the sentences and pad them to a fixed length
+        self.tokenizer = AutoTokenizer.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+        tokenized_text=[self.tokenizer.tokenize(sentence) for sentence in _sentences]
+
+        ids = [self.tokenizer.convert_tokens_to_ids(x) for x in tokenized_text]
+        MAX_LEN=128
+        ids=pad_sequences(ids,maxlen=MAX_LEN,dtype="long",truncating="post", padding="post")
+        input_ids = torch.tensor(ids)
+
+        # Run the anonymization model and map every predicted class id to its label
+        model = AutoModelForTokenClassification.from_pretrained("BSC-LT/roberta_model_for_anonimization")
+        with torch.no_grad():
+            logits = model(input_ids).logits
+        predicted_token_class_ids = logits.argmax(-1)
+        _predicted_tokens_classes=[]
+        for fila in predicted_token_class_ids:
+            _predicted_tokens_classes.append([model.config.id2label[t.item()] for t in fila])
+        labels = predicted_token_class_ids
+        loss = model(input_ids, labels=labels).loss
+        #print(round(loss.item(), 2))
+
+        return ids, _predicted_tokens_classes
+
+    def salida_texto(self,ids,pre_tokens):
+        # Rebuild the sentence, replacing every token tagged with an entity label
+        new_labels = []
+        i=0
+        for identificador in pre_tokens:
+            if self.tokenizer.decode(ids[i])!="<s>":
+                if identificador=='O':
+                    new_labels.append(self.tokenizer.decode(ids[i]))
+                else:
+                    new_labels.append(' ' + identificador)
+            i=i+1
+        return new_labels
+
+    def salida_texto2(self,ids,pre_tokens):
+        # Apply salida_texto to every sentence of the batch
+        i=0
+        out=[]
+        for iden in pre_tokens:
+            if i<len(ids):
+                out.append(self.salida_texto(ids[i],np.array(pre_tokens[i])))
+            i=i+1
+        return out
+
+    def unir_array(self,_out):
+        # Join the token lists back into one string per sentence
+        salida=[]
+        for item in _out:
+            salida.append("".join(str(x) for x in item))
+        return salida
+
+    def unir_columna_valores(self,df,columna):
+        out = ','.join(df[columna])
+        return out
+modelo = ModeloDataset()
model = Model()
def get_model():
    return model
 
def procesar(texto,archivo):
+    if len(texto)>0:
+        # Plain text: detect the language and run the single-text model
+        model.identificacion_idioma(texto)
+        return model.predict()
+    else:
+        # CSV upload: anonymize every column with ModeloDataset
+        df=pd.read_csv(archivo.name,delimiter=",")
+        print(archivo.name)
+        df_new = pd.DataFrame(columns=df.columns.values)
+        print(df_new)
+        for item in df.columns.values:
+            sentences=df[item]
+            ides, predicted = modelo.aplicar_modelo(sentences)
+            out=modelo.salida_texto2(ides,predicted)
+
+            df_new[item] = modelo.unir_array(out)
+            plotting_df=gr.Dataframe(value=df_new,headers=["nombre","trabajo"],label="label:",type="pandas", visible=True, interactive=False)
+            print(df_new)
+        return plotting_df
 
demo = gr.Interface(fn=procesar,inputs=["text",gr.File()] , outputs="text")
+demo.launch(share=True)
+
+#plotting_df=gr.Dataframe(value=df2,headers=["class","type","group","ε54Cr","ε50Ti","ε94Mo"],
+#    label="CosmoPlot Compilation:",type="pandas", visible=True, interactive=False)
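For reference, a minimal sketch (not part of this commit) of querying the same BSC-LT/roberta_model_for_anonimization checkpoint that aplicar_modelo loads, but through the transformers token-classification pipeline, assuming the checkpoint ships a compatible tokenizer config; the sample sentence is made up.

# Sketch (assumption, not from the commit): inspect the labels the anonymization
# model predicts, which are the same labels salida_texto substitutes into the text.
from transformers import pipeline

ner = pipeline(
    "token-classification",
    model="BSC-LT/roberta_model_for_anonimization",
    aggregation_strategy="simple",  # merge sub-tokens into whole entities
)

# Made-up sample sentence; each result carries the predicted entity label,
# the matched text span and a confidence score.
for entity in ner("Juan Garcia trabaja en Barcelona."):
    print(entity["entity_group"], entity["word"], round(float(entity["score"]), 3))

This mirrors what aplicar_modelo does by hand with pad_sequences and argmax, so it can help sanity-check the per-token labels before they are written back into the uploaded CSV.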