""" This module supplies `transformers`-compatible wrappers for `GPTXTokenizer`s. The tokenizers in this do not conform to the `PreTrainedTokenizer` API, but allow for better practical usage. """ from typing import List try: from gptxdata.tokenization.hf_wrappers import ( HFTokenizer as _HFTokenizer, SPTokenizer as _SPTokenizer, ) except ImportError: from gptx_tokenizer.hf_wrappers import ( HFTokenizer as _HFTokenizer, SPTokenizer as _SPTokenizer, ) class HFTokenizer(_HFTokenizer): # The tokenizer is ridiculously slow without this; however, this # doesn't implement all APIs of `PreTrainedTokenizer`. def encode(self, text: str, **kwargs) -> List[int]: return_tokens = kwargs.pop('return_tokens', False) return self._tok.encode(text, return_tokens=return_tokens) class SPTokenizer(_SPTokenizer): # `is_continuation` does not work without this, but it doesn't # implement all APIs of `PreTrainedTokenizer`. def encode(self, text: str, **kwargs) -> List[int]: return_tokens = kwargs.pop('return_tokens', False) is_continuation = kwargs.pop('is_continuation', False) return self._tok.encode( text, return_tokens=return_tokens, is_continuation=is_continuation, )