"""
This module supplies `transformers`-compatible wrappers for
`GPTXTokenizer`s.

The tokenizers in this module do not conform to the `PreTrainedTokenizer`
API, but they are more convenient to use in practice.
"""

from typing import List
try:
    from gptxdata.tokenization.hf_wrappers import (
        HFTokenizer as _HFTokenizer,
        SPTokenizer as _SPTokenizer,
    )
except ImportError:
    # Fall back to the standalone `gptx_tokenizer` package when
    # `gptxdata` is not installed.
    from gptx_tokenizer.hf_wrappers import (
        HFTokenizer as _HFTokenizer,
        SPTokenizer as _SPTokenizer,
    )


class HFTokenizer(_HFTokenizer):
    """Wrapper around the Hugging Face-backed `GPTXTokenizer`."""

    def encode(self, text: str, **kwargs) -> List[int]:
        # With `return_tokens=True`, the wrapped tokenizer returns the
        # token strings instead of token ids.
        return_tokens = kwargs.pop('return_tokens', False)
        return self._tok.encode(text, return_tokens=return_tokens)


class SPTokenizer(_SPTokenizer):
    """Wrapper around the SentencePiece-backed `GPTXTokenizer`."""

    def encode(self, text: str, **kwargs) -> List[int]:
        # With `return_tokens=True`, the wrapped tokenizer returns the
        # token strings instead of token ids; `is_continuation=True`
        # encodes the text as a continuation of preceding text.
        return_tokens = kwargs.pop('return_tokens', False)
        is_continuation = kwargs.pop('is_continuation', False)
        return self._tok.encode(
            text,
            return_tokens=return_tokens,
            is_continuation=is_continuation,
        )
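

# Minimal usage sketch for the `encode` overrides above. How a tokenizer
# instance is actually constructed depends on the underlying `gptxdata` /
# `gptx_tokenizer` API; the `model_file` argument below is an assumption
# for illustration, not a documented constructor signature.
if __name__ == '__main__':
    tok = SPTokenizer(model_file='tokenizer.model')  # hypothetical constructor
    ids = tok.encode('Hello world')                  # -> List[int]
    tokens = tok.encode('Hello world', return_tokens=True)  # token strings
    # Encode a span as a continuation of preceding text; the exact semantics
    # (e.g. whitespace/BOS handling) are defined by the wrapped tokenizer.
    cont = tok.encode(' world', is_continuation=True)
    print(ids, tokens, cont)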