Commit: Manejo de datos tipo Date ("Handling of Date-type data")

app.py CHANGED
@@ -9,6 +9,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from json import JSONEncoder
 from faker import Faker
 from keras.utils import pad_sequences
+import calendar
 class out_json():
     def __init__(self, w,l):
         self.word = w
@@ -183,7 +184,13 @@ class Model:
         a=''
         for i in new_labels:
             a = a+i
-        return a
+        return a
+    def is_integer_string(self,value):
+        try:
+            int(value)
+            return True
+        except ValueError:
+            return False
     def formato_salida(self,out):
         a=""
         for i in out:
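Note: the new `is_integer_string` helper is what the DATE branch further down keys on: tokens that parse as integers (years, day numbers) take the numeric path, everything else (month names, separators) falls through to string matching. A minimal standalone sketch of the same check, with illustrative inputs:

```python
def is_integer_string(value):
    # int() raises ValueError for anything that is not a plain integer literal
    try:
        int(value)
        return True
    except ValueError:
        return False

# Years and day numbers pass; month names and date separators do not.
assert is_integer_string("2024") and is_integer_string("29")
assert not is_integer_string("Febrero") and not is_integer_string("/")
```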
@@ -207,10 +214,23 @@ class Model:
         return self.faker_.company()
     def fake_city(self):
         return self.faker_.city()
-    def reemplazo_fake(self,identificadores):
+    def get_day_of(self, month_name, year=2024):
+
+        months = {
+            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
+            'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
+            'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
+            'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
+        }
+        month = months[month_name]
+        _, num_days = calendar.monthrange(year, month)
+        return str(num_days)
+    def reemplazo_fake(self,identificadores, new_tokens):


-
+        a=['Enero','January', 'February','Febrero','Marzo','March','Abril','April','Mayo','May','Junio','June','Julio','July','Agosto','August','Septiembre','September','Octubre','October','Noviembre','November','Diciembre','December']
+        b=['Ene','Jan', 'Feb','Mar','Mar','Abr','Apr','May','May','Jun','Jun','Jul','Jul','Ago','Aug','Sep','Oct','Nov','Dic','Dec']
+        i=0
         new_iden=[]
         for id in identificadores:

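Note: `get_day_of` leans on `calendar.monthrange(year, month)`, which returns a `(weekday_of_first_day, days_in_month)` tuple; only the second element is used. With the default `year=2024` (a leap year), `get_day_of('february')` returns `'29'`. A quick illustration:

```python
import calendar

# monthrange(year, month) -> (weekday of the 1st, number of days in the month)
print(calendar.monthrange(2024, 2))  # (3, 29): 2024 is a leap year
print(calendar.monthrange(2023, 2))  # (2, 28)
```

Also note that `month_name` must already be lowercase ('february', not 'February'), and an unknown name raises `KeyError`, since the dictionary lookup is unguarded.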
@@ -222,8 +242,40 @@ class Model:

             elif 'LOC' in id:
                 new_iden.append(self.fake_city())
+            elif 'DATE' in id:
+
+                if self.is_integer_string(new_tokens[i]):
+
+                    match len(new_tokens[i]):
+                        case 4:
+                            new_iden.append(self.faker_.date()[:4])
+                        case 10:
+                            new_iden.append(self.faker_.date())
+                        case 1:
+                            new_iden.append(self.get_day_of('february'))
+                        case 2:
+                            new_iden.append(self.get_day_of('february'))
+                        case _:
+                            new_iden.append(id)
+                else:
+                    match new_tokens[i]:
+                        case w if w in a:
+                            new_iden.append(self.faker_.month_name())
+                        case w if w in b:
+                            new_iden.append(self.faker_.month_name()[:3])
+                        case "-":
+                            new_iden.append("-")
+                        case ".":
+                            new_iden.append(".")
+                        case ",":
+                            new_iden.append(",")
+                        case "/":
+                            new_iden.append("/")
+                        case _:
+                            new_iden.append(id)
             else:
                 new_iden.append(id)
+            i=i+1
         return new_iden
 ###
 ### Función que aplica los modelo para categorizar el texto segun su contexto
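Note: passing `new_tokens` lets the DATE branch pick a replacement by the surface form of each token: 4-digit tokens get a fake year (`faker_.date()[:4]`), 10-character tokens a full fake date (Faker's `date()` defaults to `YYYY-MM-DD`), 1- or 2-digit tokens a valid day count, and month names or separators are handled by the second `match`. A walk-through with made-up tagger output (labels and tokens are illustrative, not from the Space):

```python
# Hypothetical aligned tagger output for "12 de Enero de 2024"
new_tokens = ["12", "de", "Enero", "de", "2024"]
identificadores = ["DATE", "O", "DATE", "O", "DATE"]

# How reemplazo_fake would walk it, token by token:
#   "12"    -> integer, len 2 -> self.get_day_of('february') -> "29"
#   "de"    -> no DATE label  -> kept via the outer else
#   "Enero" -> not an integer, found in list a -> faker_.month_name()
#   "2024"  -> integer, len 4 -> faker_.date()[:4], e.g. "1987"
```

One caveat worth keeping in mind: `i` advances once per identifier, so `identificadores` and `new_tokens` must be the same length and aligned one-to-one.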
@@ -335,7 +387,7 @@ class Model:
             coincidencia=""
         else:
             #out2 = self.salida_texto(new_tokens,self.reemplazo_fake(new_identificadores))
-            _fake=self.reemplazo_fake(new_identificadores)
+            _fake=self.reemplazo_fake(new_identificadores,new_tokens)
             coincidencia=self.metricas_anonimizacion(_fake,new_tokens,new_identificadores)
             out2 = self.salida_texto(new_tokens,_fake)
             out3 = self.salida_json(_fake,new_identificadores)
@@ -745,7 +797,7 @@ def procesar(texto,archivo, etiquetas):

     return modelo.idioma,"","", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False),"",""

-demo = gr.Interface(fn=procesar,inputs=["text",gr.File(), "checkbox"] , outputs=[gr.Label(label="idioma/categoría"),gr.Textbox(label="etiquetas"),gr.Textbox(label="texto procesado"),gr.Dataframe(label="Datos procesados en dataframe",interactive=False),gr.Textbox(label="datos csv"),gr.Textbox(label="
+demo = gr.Interface(fn=procesar,inputs=["text",gr.File(), "checkbox"] , outputs=[gr.Label(label="idioma/categoría"),gr.Textbox(label="etiquetas"),gr.Textbox(label="texto procesado"),gr.Dataframe(label="Datos procesados en dataframe",interactive=False),gr.Textbox(label="datos csv"),gr.Textbox(label="etiquetas anonimizadas"),gr.Label(label="coincidencia tokens originales vs anonimizados")])
 #
 demo.launch(share=True)
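Note: the rebuilt `gr.Interface` now declares seven output components, one per element of the 7-tuple that `procesar` returns on both code paths. A minimal sketch of the same wiring with a stub in place of the real `procesar` (the stub and its return values are placeholders; component labels are copied from the diff):

```python
import gradio as gr

def procesar_stub(texto, archivo, etiquetas):
    # Must return exactly one value per output component, in order.
    return "es", "", "", None, "", "", ""

demo = gr.Interface(
    fn=procesar_stub,
    inputs=["text", gr.File(), "checkbox"],
    outputs=[
        gr.Label(label="idioma/categoría"),
        gr.Textbox(label="etiquetas"),
        gr.Textbox(label="texto procesado"),
        gr.Dataframe(label="Datos procesados en dataframe", interactive=False),
        gr.Textbox(label="datos csv"),
        gr.Textbox(label="etiquetas anonimizadas"),
        gr.Label(label="coincidencia tokens originales vs anonimizados"),
    ],
)
demo.launch(share=True)
```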