fix tokenizer save_pretrained method

#32
by katuni4ka - opened
Files changed (1) hide show
  1. tokenization_xgen.py +175 -1
tokenization_xgen.py CHANGED
@@ -4,10 +4,16 @@
4
  # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/Apache-2.0
5
  """Tokenization classes for xgen."""
6
 
7
- from typing import List, Optional
 
 
 
 
8
 
9
  from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
10
  from transformers.utils import logging
 
 
11
 
12
  try:
13
  import tiktoken
@@ -246,3 +252,171 @@ class XgenTokenizer(PreTrainedTokenizer):
246
  # has no vocab file
247
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
248
  return ()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/Apache-2.0
5
  """Tokenization classes for xgen."""
6
 
7
+ import os
8
+ import json
9
+ from typing import List, Optional, Tuple, Union
10
+ import warnings
11
+ import copy
12
 
13
  from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
14
  from transformers.utils import logging
15
+ from transformers.dynamic_module_utils import custom_object_save
16
+ from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE, SPECIAL_TOKENS_MAP_FILE
17
 
18
  try:
19
  import tiktoken
 
252
  # has no vocab file
253
  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
254
  return ()
255
+
256
+
257
+ def save_pretrained(
258
+ self,
259
+ save_directory: Union[str, os.PathLike],
260
+ legacy_format: Optional[bool] = None,
261
+ filename_prefix: Optional[str] = None,
262
+ push_to_hub: bool = False,
263
+ **kwargs,
264
+ ) -> Tuple[str]:
265
+ """
266
+ Save the full tokenizer state.
267
+
268
+
269
+ This method makes sure the full tokenizer can then be re-loaded using the
270
+ [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.
271
+
272
+ Warning: This won't save modifications you may have applied to the tokenizer after the instantiation (for
273
+ instance, modifying `tokenizer.do_lower_case` after creation).
274
+
275
+ Args:
276
+ save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
277
+ legacy_format (`bool`, *optional*):
278
+ Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
279
+ format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
280
+ added_tokens files.
281
+
282
+ If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
283
+ "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
284
+ loaded in the corresponding "slow" tokenizer.
285
+
286
+ If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
287
+ error is raised.
288
+ filename_prefix (`str`, *optional*):
289
+ A prefix to add to the names of the files saved by the tokenizer.
290
+ push_to_hub (`bool`, *optional*, defaults to `False`):
291
+ Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
292
+ repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
293
+ namespace).
294
+ kwargs (`Dict[str, Any]`, *optional*):
295
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
296
+
297
+ Returns:
298
+ A tuple of `str`: The files saved.
299
+ """
300
+ use_auth_token = kwargs.pop("use_auth_token", None)
301
+
302
+ if use_auth_token is not None:
303
+ warnings.warn(
304
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
305
+ FutureWarning,
306
+ )
307
+ if kwargs.get("token", None) is not None:
308
+ raise ValueError(
309
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
310
+ )
311
+ kwargs["token"] = use_auth_token
312
+
313
+ if os.path.isfile(save_directory):
314
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
315
+ return
316
+
317
+ os.makedirs(save_directory, exist_ok=True)
318
+
319
+ if push_to_hub:
320
+ commit_message = kwargs.pop("commit_message", None)
321
+ repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
322
+ repo_id = self._create_repo(repo_id, **kwargs)
323
+ files_timestamps = self._get_files_timestamps(save_directory)
324
+
325
+ special_tokens_map_file = os.path.join(
326
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
327
+ )
328
+ tokenizer_config_file = os.path.join(
329
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
330
+ )
331
+
332
+ tokenizer_config = copy.deepcopy(self.init_kwargs)
333
+
334
+ # Let's save the init kwargs
335
+ target_keys = set(self.init_kwargs.keys())
336
+ # Also persist these two attributes, even when they were not passed as init kwargs
337
+ target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])
338
+
339
+ for k in target_keys:
340
+ if hasattr(self, k) and k != "add_special_tokens":
341
+ tokenizer_config[k] = getattr(self, k)
342
+
343
+ # Let's make sure we properly save the special tokens.
344
+ tokenizer_config.update(self.special_tokens_map)
345
+
346
+ if self.chat_template is not None:
347
+ if isinstance(self.chat_template, dict):
348
+ # Chat template dicts are saved to the config as lists of dicts with fixed key names.
349
+ # They will be reconstructed as a single dict during loading.
350
+ tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
351
+ else:
352
+ tokenizer_config["chat_template"] = self.chat_template
353
+
354
+ if len(self.init_inputs) > 0:
355
+ tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
356
+ for file_id in self.vocab_files_names.keys():
357
+ tokenizer_config.pop(file_id, None)
358
+
359
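+ # Convert added tokens in the config for saving. NOTE(review): the call below passes add_type_field=True, so type fields ARE written here - confirm that old fast and slow tokenizers can still load it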
360
+ tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)
361
+
362
+ # Process added tokens separately: allows previous versions to ignore it!
363
+ added_tokens = {}
364
+ for key, value in self.added_tokens_decoder.items():
365
+ added_tokens[key] = value.__getstate__()
366
+ tokenizer_config["added_tokens_decoder"] = added_tokens
367
+
368
+ # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
369
+ tokenizer_class = self.__class__.__name__
370
+ # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
371
+ if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
372
+ tokenizer_class = tokenizer_class[:-4]
373
+ tokenizer_config["tokenizer_class"] = tokenizer_class
374
+ if getattr(self, "_auto_map", None) is not None:
375
+ tokenizer_config["auto_map"] = self._auto_map
376
+ if getattr(self, "_processor_class", None) is not None:
377
+ tokenizer_config["processor_class"] = self._processor_class
378
+
379
+ # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
380
+ # loaded from the Hub.
381
+ if self._auto_class is not None:
382
+ custom_object_save(self, save_directory, config=tokenizer_config)
383
+
384
+ # remove private information
385
+ if "name_or_path" in tokenizer_config:
386
+ tokenizer_config.pop("name_or_path")
387
+ tokenizer_config.pop("special_tokens_map_file", None)
388
+ tokenizer_config.pop("tokenizer_file", None)
389
+
390
+ with open(tokenizer_config_file, "w", encoding="utf-8") as f:
391
+ out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
392
+ f.write(out_str)
393
+ logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
394
+
395
+ # Sanitize AddedTokens in special_tokens_map
396
+
397
+ # kept for forward compatibility, will be removed in transformers 5. Typefields are not saved for FC, special should not be saved either
398
+ write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
399
+ with open(special_tokens_map_file, "w", encoding="utf-8") as f:
400
+ out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
401
+ f.write(out_str)
402
+ logger.info(f"Special tokens file saved in {special_tokens_map_file}")
403
+
404
+ file_names = (tokenizer_config_file, special_tokens_map_file)
405
+
406
+ save_files = self._save_pretrained(
407
+ save_directory=save_directory,
408
+ file_names=file_names,
409
+ legacy_format=legacy_format,
410
+ filename_prefix=filename_prefix,
411
+ )
412
+
413
+ if push_to_hub:
414
+ self._upload_modified_files(
415
+ save_directory,
416
+ repo_id,
417
+ files_timestamps,
418
+ commit_message=commit_message,
419
+ token=kwargs.get("token"),
420
+ )
421
+
422
+ return save_files