alirezamsh committed on
Commit
8ab680e
·
verified ·
1 Parent(s): 29eb74f

Update tokenization_small100.py

Browse files
Files changed (1) hide show
  1. tokenization_small100.py +16 -15
tokenization_small100.py CHANGED
@@ -145,19 +145,6 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
145
  if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
146
  ]
147
 
148
- super().__init__(
149
- tgt_lang=tgt_lang,
150
- bos_token=bos_token,
151
- eos_token=eos_token,
152
- sep_token=sep_token,
153
- unk_token=unk_token,
154
- pad_token=pad_token,
155
- language_codes=language_codes,
156
- sp_model_kwargs=self.sp_model_kwargs,
157
- num_madeup_words=num_madeup_words,
158
- **kwargs,
159
- )
160
-
161
  self.vocab_file = vocab_file
162
  self.encoder = load_json(vocab_file)
163
  self.decoder = {v: k for k, v in self.encoder.items()}
@@ -174,9 +161,23 @@ class SMALL100Tokenizer(PreTrainedTokenizer):
174
 
175
  self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
176
  self.cur_lang_id = self.get_lang_id(self._tgt_lang)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  self.set_lang_special_tokens(self._tgt_lang)
178
 
179
- self.num_madeup_words = num_madeup_words
180
 
181
  @property
182
  def vocab_size(self) -> int:
@@ -361,4 +362,4 @@ def load_json(path: str) -> Union[Dict, List]:
361
 
362
  def save_json(data, path: str) -> None:
363
  with open(path, "w") as f:
364
- json.dump(data, f, indent=2)
 
145
  if self.get_lang_token(lang_code) not in kwargs["additional_special_tokens"]
146
  ]
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  self.vocab_file = vocab_file
149
  self.encoder = load_json(vocab_file)
150
  self.decoder = {v: k for k, v in self.encoder.items()}
 
161
 
162
  self._tgt_lang = tgt_lang if tgt_lang is not None else "en"
163
  self.cur_lang_id = self.get_lang_id(self._tgt_lang)
164
+ self.num_madeup_words = num_madeup_words
165
+
166
+ super().__init__(
167
+ tgt_lang=tgt_lang,
168
+ bos_token=bos_token,
169
+ eos_token=eos_token,
170
+ sep_token=sep_token,
171
+ unk_token=unk_token,
172
+ pad_token=pad_token,
173
+ language_codes=language_codes,
174
+ sp_model_kwargs=self.sp_model_kwargs,
175
+ num_madeup_words=num_madeup_words,
176
+ **kwargs,
177
+ )
178
+
179
  self.set_lang_special_tokens(self._tgt_lang)
180
 
 
181
 
182
  @property
183
  def vocab_size(self) -> int:
 
362
 
363
  def save_json(data, path: str) -> None:
364
  with open(path, "w") as f:
365
+ json.dump(data, f, indent=2)