Issue with paulhindemith/fasttext-classification: config.json not found
Hello guys,
I'm new here. I used the paulhindemith/fasttext-classification model, but executing it raises the following error in my Colab notebook:
HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/paulhindemith/fasttext-classification/resolve/2022.11.7/config.json
Hello
@devanghingu
Thank you for reporting this issue. I'm new here too.
This error was caused by my forgetting to create the '2022.11.7' branch.
I have pushed a commit that fixes it, so could you try again in your Colab notebook?
Sure, @paulhindemith, let me check.
I got another error, this time related to encoding:
2 classifier = pipeline("zero-shot-classification", "paulhindemith/fasttext-classification", revision="2022.11.7", trust_remote_code=True)
----> 3 output = classifier(data, candidate_labels=topics, hypothesis_template="{}", multi_label=True)
4 return output
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/zero_shot_classification.py in __call__(self, sequences, *args, **kwargs)
180 raise ValueError(f"Unable to understand extra arguments {args}")
181
--> 182 return super().__call__(sequences, **kwargs)
183
184 def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/base.py in __call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1072 return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
1073 else:
-> 1074 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
1075
1076 def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/base.py in run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1093 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1094 all_outputs = []
-> 1095 for model_inputs in self.preprocess(inputs, **preprocess_params):
1096 model_outputs = self.forward(model_inputs, **forward_params)
1097 all_outputs.append(model_outputs)
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/zero_shot_classification.py in preprocess(self, inputs, candidate_labels, hypothesis_template)
186
187 for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)):
--> 188 model_input = self._parse_and_tokenize([sequence_pair])
189
190 yield {
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/zero_shot_classification.py in _parse_and_tokenize(self, sequence_pairs, padding, add_special_tokens, truncation, **kwargs)
115 )
116 else:
--> 117 raise e
118
119 return inputs
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/zero_shot_classification.py in _parse_and_tokenize(self, sequence_pairs, padding, add_special_tokens, truncation, **kwargs)
97 return_tensors=return_tensors,
98 padding=padding,
---> 99 truncation=truncation,
100 )
101 except Exception as e:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in __call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2486 if not self._in_target_context_manager:
2487 self._switch_to_input_mode()
-> 2488 encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
2489 if text_target is not None:
2490 self._switch_to_target_mode()
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in _call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2589 return_length=return_length,
2590 verbose=verbose,
-> 2591 **kwargs,
2592 )
2593 else:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2780 return_length=return_length,
2781 verbose=verbose,
-> 2782 **kwargs,
2783 )
2784
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
731 ids, pair_ids = ids_or_pair_ids
732
--> 733 first_ids = get_input_ids(ids)
734 second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
735 input_ids.append((first_ids, second_ids))
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils.py in get_input_ids(text)
699 if isinstance(text, str):
700 tokens = self.tokenize(text, **kwargs)
--> 701 return self.convert_tokens_to_ids(tokens)
702 elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
703 if is_split_into_words:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils.py in convert_tokens_to_ids(self, tokens)
577 ids = []
578 for token in tokens:
--> 579 ids.append(self._convert_token_to_id_with_added_voc(token))
580 return ids
581
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils.py in _convert_token_to_id_with_added_voc(self, token)
586 if token in self.added_tokens_encoder:
587 return self.added_tokens_encoder[token]
--> 588 return self._convert_token_to_id(token)
589
590 def _convert_token_to_id(self, token):
~/.cache/huggingface/modules/transformers_modules/paulhindemith/fasttext-classification/31d9e69fb14966aa8a72bd91a9623407f573ed28/fasttext_jp_tokenizer.py in _convert_token_to_id(self, token)
101 int: ID
102 """
--> 103 return self.stoi[token]
104
105 def _convert_id_to_token(self, index: int) -> str:
KeyError: '\u3000'
umm...
I think the KeyError is raised because my model's tokenizer cannot handle unknown words, i.e. tokens that are not in its vocabulary (here '\u3000', the full-width space).
I will fix it this weekend.
If you need to use this model before then, it may be better to clone this repository and fix it yourself.
Please note that this model was created for my own experiments with the zero-shot-classification pipeline, so it is experimental.
@devanghingu
I fixed this issue.
Now you can use the model even for unknown words 😄
The branch "2022.11.13" is now available, including this fix.
Thank you @paulhindemith