Spaces:
Sleeping
Sleeping
app modified dataset reorden token
Browse files
app.py
CHANGED
@@ -110,7 +110,7 @@ class Model:
|
|
110 |
i=0
|
111 |
for token in tokens:
|
112 |
|
113 |
-
if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i]
|
114 |
new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
|
115 |
else:
|
116 |
new_labels.append(' ' + pre_tokens[i])
|
@@ -120,21 +120,21 @@ class Model:
|
|
120 |
a = a+i
|
121 |
return a
|
122 |
#return new_labels
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
def formato_salida(self,out):
|
139 |
a=""
|
140 |
for i in out:
|
@@ -428,20 +428,20 @@ class ModeloDataset:
|
|
428 |
|
429 |
return new_identificadores, new_tokens
|
430 |
#return ids, _predicted_tokens_classes
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
def salida_texto( self,tokens,pre_tokens):
|
446 |
new_labels = []
|
447 |
current_word = None
|
@@ -607,7 +607,7 @@ def procesar(texto,archivo, etiquetas):
|
|
607 |
ides, predicted = modelo.aplicar_modelo(sentences,modelo.idioma,etiquetas)
|
608 |
|
609 |
if model.idioma=="es":
|
610 |
-
out=modelo.
|
611 |
else:
|
612 |
out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
|
613 |
|
|
|
110 |
i=0
|
111 |
for token in tokens:
|
112 |
|
113 |
+
if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i]:
|
114 |
new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
|
115 |
else:
|
116 |
new_labels.append(' ' + pre_tokens[i])
|
|
|
120 |
a = a+i
|
121 |
return a
|
122 |
#return new_labels
|
123 |
+
def salida_texto_anonimizado(self, ids, pre_tokens):
    """Build the anonymized text from token ids and their predicted labels.

    Each label in ``pre_tokens`` is paired with the id at the same position
    in ``ids``. When the label is exactly ``'O'`` or contains ``'OTH'``, the
    id is decoded with ``self.tokenizer.decode`` and the decoded text is
    kept. Otherwise the token is replaced by a single space followed by the
    label itself.

    Args:
        ids: Indexable collection of token ids, position-aligned with
            ``pre_tokens``.
        pre_tokens: Iterable of predicted label strings.

    Returns:
        One string made by concatenating every produced piece in order.

    Raises:
        IndexError: If ``ids`` is shorter than ``pre_tokens``.
    """
    pieces = []
    # enumerate() replaces the hand-maintained counter; ids is still indexed
    # so a length mismatch fails loudly instead of being silently truncated.
    for position, identificador in enumerate(pre_tokens):
        if identificador == 'O' or 'OTH' in identificador:
            pieces.append(self.tokenizer.decode(ids[position]))
        else:
            pieces.append(' ' + identificador)
    # Join once instead of growing the string piece by piece.
    return ''.join(pieces)
|
138 |
def formato_salida(self,out):
|
139 |
a=""
|
140 |
for i in out:
|
|
|
428 |
|
429 |
return new_identificadores, new_tokens
|
430 |
#return ids, _predicted_tokens_classes
|
431 |
+
def salida_texto_es(self, ids, pre_tokens):
    """Turn token ids and predicted labels into a list of output pieces.

    Each label in ``pre_tokens`` is paired with the id at the same position
    in ``ids``. The id is decoded with ``self.tokenizer.decode``; positions
    whose decoded text is exactly ``"<s>"`` are dropped. For the remaining
    positions the decoded text is kept when the label is ``'O'``, otherwise
    the piece is a single space followed by the label.

    Args:
        ids: Indexable collection of token ids, position-aligned with
            ``pre_tokens``.
        pre_tokens: Iterable of predicted label strings.

    Returns:
        A list of strings, one per kept position, in input order.

    Raises:
        IndexError: If ``ids`` is shorter than ``pre_tokens``.
    """
    new_labels = []
    # NOTE(review): the position advances for every label, including the
    # skipped "<s>" ones — confirm this matches the intended alignment.
    for position, identificador in enumerate(pre_tokens):
        # Decode once per token and reuse the result for both checks.
        texto = self.tokenizer.decode(ids[position])
        if texto == "<s>":
            continue
        if identificador == 'O':
            new_labels.append(texto)
        else:
            new_labels.append(' ' + identificador)
    return new_labels
|
445 |
def salida_texto( self,tokens,pre_tokens):
|
446 |
new_labels = []
|
447 |
current_word = None
|
|
|
607 |
ides, predicted = modelo.aplicar_modelo(sentences,modelo.idioma,etiquetas)
|
608 |
|
609 |
if model.idioma=="es":
|
610 |
+
out=modelo.salida_texto2_es( ides,predicted,etiquetas)#tokens,labels
|
611 |
else:
|
612 |
out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
|
613 |
|