dayannex committed on
Commit
bb4144d
·
1 Parent(s): 4d5132e

app modified dataset reorden token

Browse files
Files changed (1) hide show
  1. app.py +31 -31
app.py CHANGED
@@ -110,7 +110,7 @@ class Model:
110
  i=0
111
  for token in tokens:
112
 
113
- if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i] or 'OTH' in pre_tokens[i]:
114
  new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
115
  else:
116
  new_labels.append(' ' + pre_tokens[i])
@@ -120,21 +120,21 @@ class Model:
120
  a = a+i
121
  return a
122
  #return new_labels
123
- #def salida_texto_anonimizado(self, ids,pre_tokens):
124
- # new_labels = []
125
- # current_word = None
126
- # i=0
127
- # for identificador in pre_tokens:
128
- #
129
- # if identificador=='O' or 'OTH' in identificador:
130
- # new_labels.append(self.tokenizer.decode(ids[i]))
131
- # else:
132
- # new_labels.append(' ' + identificador)
133
- # i=i+1
134
- # a=''
135
- # for i in new_labels:
136
- # a = a+i
137
- # return a
138
  def formato_salida(self,out):
139
  a=""
140
  for i in out:
@@ -428,20 +428,20 @@ class ModeloDataset:
428
 
429
  return new_identificadores, new_tokens
430
  #return ids, _predicted_tokens_classes
431
- #def salida_texto_es( self,ids,pre_tokens):
432
- # new_labels = []
433
- # current_word = None
434
- # i=0
435
- # for identificador in pre_tokens:
436
- # if (self.tokenizer.decode(ids[i])!="<s>"):
437
- # if identificador=='O':
438
- #
439
- # new_labels.append(self.tokenizer.decode(ids[i]))
440
- # else:
441
- # new_labels.append(' ' + identificador)
442
- # i=i+1
443
- #
444
- # return new_labels
445
  def salida_texto( self,tokens,pre_tokens):
446
  new_labels = []
447
  current_word = None
@@ -607,7 +607,7 @@ def procesar(texto,archivo, etiquetas):
607
  ides, predicted = modelo.aplicar_modelo(sentences,modelo.idioma,etiquetas)
608
 
609
  if model.idioma=="es":
610
- out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
611
  else:
612
  out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
613
 
 
110
  i=0
111
  for token in tokens:
112
 
113
+ if pre_tokens[i]=='O' or 'MISC' in pre_tokens[i]:
114
  new_labels.append(' ' +token.replace('▁','').replace('Ġ',''))
115
  else:
116
  new_labels.append(' ' + pre_tokens[i])
 
120
  a = a+i
121
  return a
122
  #return new_labels
123
+ def salida_texto_anonimizado(self, ids,pre_tokens):
124
+ new_labels = []
125
+ current_word = None
126
+ i=0
127
+ for identificador in pre_tokens:
128
+
129
+ if identificador=='O' or 'OTH' in identificador:
130
+ new_labels.append(self.tokenizer.decode(ids[i]))
131
+ else:
132
+ new_labels.append(' ' + identificador)
133
+ i=i+1
134
+ a=''
135
+ for i in new_labels:
136
+ a = a+i
137
+ return a
138
  def formato_salida(self,out):
139
  a=""
140
  for i in out:
 
428
 
429
  return new_identificadores, new_tokens
430
  #return ids, _predicted_tokens_classes
431
+ def salida_texto_es( self,ids,pre_tokens):
432
+ new_labels = []
433
+ current_word = None
434
+ i=0
435
+ for identificador in pre_tokens:
436
+ if (self.tokenizer.decode(ids[i])!="<s>"):
437
+ if identificador=='O':
438
+
439
+ new_labels.append(self.tokenizer.decode(ids[i]))
440
+ else:
441
+ new_labels.append(' ' + identificador)
442
+ i=i+1
443
+
444
+ return new_labels
445
  def salida_texto( self,tokens,pre_tokens):
446
  new_labels = []
447
  current_word = None
 
607
  ides, predicted = modelo.aplicar_modelo(sentences,modelo.idioma,etiquetas)
608
 
609
  if model.idioma=="es":
610
+ out=modelo.salida_texto2_es( ides,predicted,etiquetas)#tokens,labels
611
  else:
612
  out=modelo.salida_texto2( ides,predicted,etiquetas)#tokens,labels
613