Spaces:

dayannex
/

anonimizador_V2

Sleeping

App Files Files Community

dayannex committed on Sep 16, 2024

Commit

c748665

1 Parent(s): 750a2ed

Manejo de datos tipo Date

Browse files

Files changed (1) hide show

app.py +57 -5

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from json import JSONEncoder
 from faker import Faker
 from keras.utils import pad_sequences
 class out_json():
     def __init__(self, w,l):
         self.word = w
@@ -183,7 +184,13 @@ class Model:
     a=''
     for i in new_labels:
         a = a+i
-    return a
    def formato_salida(self,out):
        a=""
        for i in out:
@@ -207,10 +214,23 @@ class Model:
        return self.faker_.company()
    def fake_city(self):
        """Return the value produced by ``self.faker_.city()``."""
        return self.faker_.city()
-   def reemplazo_fake(self,identificadores):
        new_iden=[]
        for id in identificadores:
@@ -222,8 +242,40 @@ class Model:
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
            else:
                new_iden.append(id)
        return new_iden
   ###
   ### Function that applies the models to categorize the text according to its context
@@ -335,7 +387,7 @@ class Model:
                 coincidencia=""
         else:
                 #out2 = self.salida_texto(new_tokens,self.reemplazo_fake(new_identificadores))
-                _fake=self.reemplazo_fake(new_identificadores)
                 coincidencia=self.metricas_anonimizacion(_fake,new_tokens,new_identificadores)
                 out2 = self.salida_texto(new_tokens,_fake)
                 out3 = self.salida_json(_fake,new_identificadores)
@@ -745,7 +797,7 @@ def procesar(texto,archivo, etiquetas):
                 return modelo.idioma,"","", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False),"",""
-demo = gr.Interface(fn=procesar,inputs=["text",gr.File(), "checkbox"] , outputs=[gr.Label(label="idioma/categoría"),gr.Textbox(label="etiquetas"),gr.Textbox(label="texto procesado"),gr.Dataframe(label="Datos procesados en dataframe",interactive=False),gr.Textbox(label="datos csv"),gr.Textbox(label="labels anonimizados"),gr.Label(label="coincidencia tokens originales")])
        #
 demo.launch(share=True)

 from json import JSONEncoder
 from faker import Faker
 from keras.utils import pad_sequences
+import calendar
 class out_json():
     def __init__(self, w,l):
         self.word = w
     a=''
     for i in new_labels:
         a = a+i
+    return a
+   def is_integer_string(self,value):
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
    def formato_salida(self,out):
        a=""
        for i in out:
        return self.faker_.company()
    def fake_city(self):
        """Return the value produced by ``self.faker_.city()``."""
        return self.faker_.city()
+   def get_day_of(self, month_name, year=2024):
+        months = {
+        'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
+        'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
+        'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
+        'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12
+        }
+        month = months[month_name]
+        _, num_days = calendar.monthrange(year, month)
+        return str(num_days)
+   def reemplazo_fake(self,identificadores, new_tokens):
+       a=['Enero','January', 'February','Febrero','Marzo','March','Abril','April','Mayo','May','Junio','June','Julio','July','Agosto','August','Septiembre','September','Octubre','October','Noviembre','November','Diciembre','December']
+       b=['Ene','Jan', 'Feb','Mar','Mar','Abr','Apr','May','May','Jun','Jun','Jul','Jul','Ago','Aug','Sep','Oct','Nov','Dic','Dec']
+       i=0
        new_iden=[]
        for id in identificadores:
            elif 'LOC' in id:
                new_iden.append(self.fake_city())
+           elif 'DATE' in id:
+                 if self.is_integer_string(new_tokens[i]):
+                    match len(new_tokens[i]):
+                        case 4:
+                            new_iden.append(self.faker_.date()[:4])
+                        case 10:
+                            new_iden.append(self.faker_.date())
+                        case 1:
+                            new_iden.append(self.get_day_of('february'))
+                        case 2:
+                            new_iden.append(self.get_day_of('february'))
+                        case _:
+                             new_iden.append(id)
+                 else:
+                    match new_tokens[i]:
+                        case w if w in a:
+                             new_iden.append(self.faker_.month_name())
+                        case w if w in b:
+                             new_iden.append(self.faker_.month_name()[:3])
+                        case "-":
+                             new_iden.append("-")
+                        case ".":
+                             new_iden.append(".")
+                        case ",":
+                             new_iden.append(",")
+                        case "/":
+                             new_iden.append("/")
+                        case _:
+                             new_iden.append(id)
            else:
                new_iden.append(id)
+           i=i+1
        return new_iden
   ###
   ### Function that applies the models to categorize the text according to its context
                 coincidencia=""
         else:
                 #out2 = self.salida_texto(new_tokens,self.reemplazo_fake(new_identificadores))
+                _fake=self.reemplazo_fake(new_identificadores,new_tokens)
                 coincidencia=self.metricas_anonimizacion(_fake,new_tokens,new_identificadores)
                 out2 = self.salida_texto(new_tokens,_fake)
                 out3 = self.salida_json(_fake,new_identificadores)
                 return modelo.idioma,"","", df_new, df_new.to_csv(sep='\t', encoding='utf-8',index=False),"",""
+demo = gr.Interface(fn=procesar,inputs=["text",gr.File(), "checkbox"] , outputs=[gr.Label(label="idioma/categoría"),gr.Textbox(label="etiquetas"),gr.Textbox(label="texto procesado"),gr.Dataframe(label="Datos procesados en dataframe",interactive=False),gr.Textbox(label="datos csv"),gr.Textbox(label="etiquetas anonimizadas"),gr.Label(label="coincidencia tokens originales vs anonimizados")])
        #
 demo.launch(share=True)