dayannex committed
Commit 0715d07
1 Parent(s): 6874949

Add app and requirements

Files changed (2)
  1. app.py +266 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,266 @@
+ import gradio as gr
+ import torch
+ import pandas as pd
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForTokenClassification,
+     AutoModelForSequenceClassification,
+     RobertaForTokenClassification,
+ )
+ from json import JSONEncoder
+ from faker import Faker
+
+ class out_json:
+     # Simple container for a detected word and its entity label.
+     def __init__(self, w, l):
+         self.word = w
+         self.label = l
+
+ class MyEncoder(JSONEncoder):
+     # Serializes out_json objects through their __dict__.
+     def default(self, o):
+         return o.__dict__
+
+ class Model:
+     def __init__(self):
+         self.texto = ""
+         self.idioma = ""
+         self.modelo_ner = ""
+         self.categoria_texto = ""
+
+     def identificacion_idioma(self, text):
+         # Detect the language of the text and select the matching NER model.
+         self.texto = text
+         tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
+         model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
+
+         inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             logits = model(**inputs).logits
+         preds = torch.softmax(logits, dim=-1)
+
+         id2lang = model.config.id2label
+         vals, idxs = torch.max(preds, dim=1)
+
+         # Keep the language with the highest probability.
+         maximo = vals.max()
+         idioma = ''
+         porcentaje = 0
+         for k, v in zip(idxs, vals):
+             if v.item() == maximo:
+                 idioma, porcentaje = id2lang[k.item()], v.item()
+
+         if idioma == 'es':
+             self.idioma = "es"
+             self.modelo_ner = 'BSC-LT/roberta_model_for_anonimization'
+             self.faker_ = Faker('es_MX')
+             self.model = RobertaForTokenClassification.from_pretrained(self.modelo_ner)
+         else:
+             self.idioma = "en"
+             self.faker_ = Faker('en_US')
+             self.modelo_ner = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
+             self.model = AutoModelForTokenClassification.from_pretrained(self.modelo_ner)
+         self.categorizar_texto(self.texto)
+
+     def reordenacion_tokens(self, tokens):
+         # Merge subword pieces back into whole words; '▁' marks a word start.
+         new_tokens = []
+         ig_tokens = []  # indices to skip in the predicted-label array
+         for i, token in enumerate(tokens):
+             if token.startswith("▁"):
+                 new_tokens.append(token)
+             else:
+                 new_tokens[-1] = new_tokens[-1] + token
+                 ig_tokens.append(i)
+         return new_tokens, ig_tokens
+
+     def reordenacion_identificadores(self, ig_tokens, predicted_tokens_classes):
+         # Keep only the labels whose token was not merged into the previous word.
+         new_identificadores = []
+         for x, token in enumerate(predicted_tokens_classes):
+             if x not in ig_tokens:
+                 new_identificadores.append(token)
+         return new_identificadores
+
+     def salida_json(self, tokens, pre_tokens):
+         # JSON output with only the sensitive (non-'O') words and their labels.
+         salida = []
+         for i, t in enumerate(tokens):
+             if pre_tokens[i] != 'O':
+                 salida.append(out_json(t.replace('▁', '').replace('Ġ', ''), pre_tokens[i].replace('▁', '')))
+         return MyEncoder().encode(salida)
+
+     def salida_texto(self, tokens, pre_tokens):
+         # Rebuild the text, keeping 'O'/'MISC' tokens and substituting the label elsewhere.
+         new_labels = []
+         for i, token in enumerate(tokens):
+             if pre_tokens[i] == 'O' or 'MISC' in pre_tokens[i]:
+                 new_labels.append(' ' + token.replace('▁', ''))
+             else:
+                 new_labels.append(' ' + pre_tokens[i])
+         return ''.join(new_labels)
+
+     def salida_texto_anonimizado(self, ids, pre_tokens):
+         # Rebuild the full text, decoding 'O'/'OTH' tokens and substituting the label elsewhere.
+         new_labels = []
+         for i, identificador in enumerate(pre_tokens):
+             if identificador == 'O' or 'OTH' in identificador:
+                 new_labels.append(self.tokenizer.decode(ids[i]))
+             else:
+                 new_labels.append(' ' + identificador)
+         return ''.join(new_labels)
+
+     def formato_salida(self, out):
+         a = ""
+         for i in out:
+             a = a + i.replace('▁', '').replace(' ', '') + ' '
+         return a
+
+     def fake_pers(self):
+         return self.faker_.name()
+     def fake_word(self):
+         return self.faker_.word()
+     def fake_first_name(self):
+         return self.faker_.first_name()
+     def fake_last_name(self):
+         return self.faker_.last_name()
+     def fake_address(self):
+         return self.faker_.address()
+     def fake_sentence(self, n):
+         return self.faker_.sentence(nb_words=n)
+     def fake_text(self):
+         return self.faker_.text()
+     def fake_company(self):
+         return self.faker_.company()
+     def fake_city(self):
+         return self.faker_.city()
+
+     def reemplazo_fake(self, identificadores):
+         # Swap each entity label for a plausible fake value of the same type.
+         new_iden = []
+         for iden in identificadores:
+             if 'PER' in iden:
+                 new_iden.append(self.fake_first_name())
+             elif 'ORG' in iden:
+                 new_iden.append(self.fake_company())
+             elif 'LOC' in iden:
+                 new_iden.append(self.fake_city())
+             else:
+                 new_iden.append(iden)
+         return new_iden
+
+     def categorizar_texto(self, texto):
+         # Classify the text into a news category.
+         name = "elozano/bert-base-cased-news-category"
+         tokenizer = AutoTokenizer.from_pretrained(name)
+         model_ = AutoModelForSequenceClassification.from_pretrained(name)
+
+         inputs_ = tokenizer(texto, padding=True, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             logits = model_(**inputs_).logits
+         preds = torch.softmax(logits, dim=-1)
+
+         id2label = model_.config.id2label
+         vals, idxs = torch.max(preds, dim=1)
+
+         # Keep the category with the highest probability.
+         maximo = vals.max()
+         cat = ''
+         self.categoria_texto = ''
+         porcentaje = 0
+         for k, v in zip(idxs, vals):
+             if v.item() == maximo:
+                 cat, porcentaje = id2label[k.item()], v.item()
+                 self.categoria_texto = cat
+         return cat, porcentaje
+
+     def predict(self):
+         categoria, porcentaje = self.categorizar_texto(self.texto)
+         print(categoria, porcentaje)
+
+         self.tokenizer = AutoTokenizer.from_pretrained(self.modelo_ner)
+         tokens = self.tokenizer.tokenize(self.texto)
+         ids = self.tokenizer.convert_tokens_to_ids(tokens)
+         input_ids = torch.tensor([ids])
+
+         with torch.no_grad():
+             logits = self.model(input_ids).logits
+         predicted_token_class_ids = logits.argmax(-1)
+         predicted_tokens_classes = [self.model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
+
+         if self.idioma == 'es':
+             # Spanish: JSON with the sensitive words only, plus the fully anonymized text.
+             out1 = self.salida_json(tokens, predicted_tokens_classes)
+             out2 = self.salida_texto_anonimizado(ids, self.reemplazo_fake(predicted_tokens_classes))
+         else:
+             # English: merge subwords first, then realign the labels with the merged tokens.
+             new_tokens, ig_tokens = self.reordenacion_tokens(tokens)
+             new_identificadores = self.reordenacion_identificadores(ig_tokens, predicted_tokens_classes)
+             out1 = self.salida_json(new_tokens, new_identificadores)
+             out2 = self.salida_texto(new_tokens, self.reemplazo_fake(new_identificadores))
+
+         return self.texto, out1, str(out2)
+
+ model = Model()
+
+ def get_model():
+     return model
+
+ def procesar(texto, archivo):
+     print(texto)
+     if archivo is not None:
+         # Debug: inspect the uploaded CSV (expects a 'nombre' column).
+         print(archivo.name)
+         df = pd.read_csv(archivo.name, delimiter=",")
+         print(df["nombre"])
+     model.identificacion_idioma(texto)
+     return model.predict()
+
+ # predict() returns three values, so expose three text outputs.
+ demo = gr.Interface(fn=procesar, inputs=[gr.Textbox(), gr.File()], outputs=["text", "text", "text"])
+ demo.launch(share=True)
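
For reference, a minimal sketch of exercising the pipeline without the Gradio UI, in a session where the Model class above is defined (the sample sentence and variable names are illustrative; the Hugging Face models are downloaded on first use):

    model = Model()
    model.identificacion_idioma("My name is Clara and I live in Berkeley.")  # picks the English NER model
    texto, sensibles, anonimizado = model.predict()
    print(sensibles)     # JSON list of sensitive words and their labels
    print(anonimizado)   # text with fake replacements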
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ torch
+ Faker
+ gradio
+ pandas