mfromm committed (verified) · Commit 074b5e7 · 1 Parent(s): 4159c04

Update gptx_tokenizer.py

Files changed (1): gptx_tokenizer.py (+9 -1)
gptx_tokenizer.py CHANGED
@@ -233,6 +233,7 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         token_ids: Union[List[int], List[List[int]]],
         num_threads: Optional[int] = None,
         skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = False,
     ) -> str:
         """
         Decode a list of token IDs into a string.
@@ -244,7 +245,14 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         """
         output = self.tok.decode(input=token_ids, num_threads=num_threads)
         if skip_special_tokens:
-            token_ids = [token for token in output if token not in self.additional_special_tokens]
+            output = [token for token in output if token not in self.additional_special_tokens]
+        if clean_up_tokenization_spaces:
+            warnings.warn(
+                "when cleaning up tokenization spaces, this will not behave "
+                "like the original `GPTXTokenizer`. Please supply "
+                "`clean_up_tokenization_spaces=False` for decoding."
+            )
+            output = self.clean_up_tokenization(output)
         return output

     def _convert_id_to_token(self, index: int) -> str:
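
For orientation, below is a minimal usage sketch of the decode signature touched by this commit. Only the skip_special_tokens and clean_up_tokenization_spaces parameters come from the diff above; the repository id, the input text, and loading via AutoTokenizer with trust_remote_code=True are illustrative assumptions, not part of the commit.

# Minimal sketch. Assumptions: placeholder repo id, and that the model repo
# ships gptx_tokenizer.py as custom tokenizer code loaded via trust_remote_code.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "namespace/model-with-gptx-tokenizer",  # placeholder, replace with the real repo id
    trust_remote_code=True,                 # required so gptx_tokenizer.py is used
)

token_ids = tokenizer("Hello world")["input_ids"]

# As of this commit, decode accepts clean_up_tokenization_spaces; passing False
# keeps the behaviour of the original GPTXTokenizer and avoids the new warning.
text = tokenizer.decode(
    token_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)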