Error loading the tokenizer
#2 by hyesunyun
Hello!
I am trying to load and use this model for inference. However, when I try to load the tokenizer with `tokenizer = AutoTokenizer.from_pretrained("xz97/AlpaCare-llama2-13b")`, I get the following error:
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.
Traceback (most recent call last):
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py", line 1636, in convert_slow_tokenizer
).converted()
^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py", line 1533, in converted
tokenizer = self.tokenizer()
^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py", line 1526, in tokenizer
vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py", line 1502, in extract_vocab_merges_from_model
bpe_ranks = load_tiktoken_bpe(tiktoken_url)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/tiktoken/load.py", line 145, in load_tiktoken_bpe
return {
^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/tiktoken/load.py", line 147, in <dictcomp>
for token, rank in (line.split() for line in contents.splitlines() if line)
^^^^^^^^^^^
ValueError: not enough values to unpack (expected 2, got 1)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/scratch/yun.hy/MedLitSpin/code/run_spin_detection_evaluation.py", line 263, in <module>
evaluator = Evaluator(model_name, output_path, is_debug)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/scratch/yun.hy/MedLitSpin/code/run_spin_detection_evaluation.py", line 53, in __init__
self.__load_model()
File "/scratch/yun.hy/MedLitSpin/code/run_spin_detection_evaluation.py", line 109, in __load_model
self.model = model_class(model_type=type)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/scratch/yun.hy/MedLitSpin/code/models/alpacare.py", line 30, in __init__
self.tokenizer = self.__load_tokenizer()
^^^^^^^^^^^^^^^^^^^^^^^
File "/scratch/yun.hy/MedLitSpin/code/models/alpacare.py", line 49, in __load_tokenizer
tokenizer = AutoTokenizer.from_pretrained(self.model_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 921, in from_pretrained
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2032, in from_pretrained
return cls._from_pretrained(
^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 2272, in _from_pretrained
tokenizer = cls(*init_inputs, **init_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/models/llama/tokenization_llama_fast.py", line 157, in __init__
super().__init__(
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 138, in __init__
fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/yun.hy/.conda/envs/MedLitSpin/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.py", line 1638, in convert_slow_tokenizer
raise ValueError(
ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']
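Is forcing the slow SentencePiece tokenizer the right way around this? A rough sketch of what I mean (the `use_fast=False` argument and the extra `sentencepiece` dependency are my guesses, not something from the model card, so I am not sure this is the intended way to load the model):

```python
# Sketch of a possible workaround: request the slow (SentencePiece) tokenizer so
# transformers does not attempt the slow->fast / tiktoken conversion that fails above.
# Assumes `pip install sentencepiece` -- this dependency is my assumption, not from the model card.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "xz97/AlpaCare-llama2-13b",
    use_fast=False,  # load LlamaTokenizer (SentencePiece) instead of LlamaTokenizerFast
)
print(type(tokenizer), tokenizer("Hello!"))
```

If there is instead a recommended transformers version (or another loading path) for this checkpoint, that would also be good to know.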