Stanislas Muennighoff committed on
Commit
ee1e7db
·
1 Parent(s): 13193de

Update tokenization_chatglm.py (#3)

Browse files

- Update tokenization_chatglm.py (5c8ffabbfc8ba950a8f450939138bc5254b6ee1f)
- Update config.json (73a1403930e64d11e421f8eb7b1d19eb49cfeeeb)


Co-authored-by: Niklas Muennighoff <[email protected]>

Files changed (2) hide show
  1. config.json +1 -0
  2. tokenization_chatglm.py +13 -6
config.json CHANGED
@@ -7,6 +7,7 @@
7
  "auto_map": {
8
  "AutoConfig": "configuration_chatglm.ChatGLMConfig",
9
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
 
10
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
11
  },
12
  "add_bias_linear": false,
 
7
  "auto_map": {
8
  "AutoConfig": "configuration_chatglm.ChatGLMConfig",
9
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
10
+ "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
11
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
12
  },
13
  "add_bias_linear": false,
tokenization_chatglm.py CHANGED
@@ -225,7 +225,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
225
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
226
  """
227
  # Load from model defaults
228
- assert self.padding_side == "left"
229
 
230
  required_input = encoded_inputs[self.model_input_names[0]]
231
  seq_length = len(required_input)
@@ -248,10 +248,17 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
248
  if needs_to_be_padded:
249
  difference = max_length - len(required_input)
250
 
251
- if "attention_mask" in encoded_inputs:
252
- encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
253
- if "position_ids" in encoded_inputs:
254
- encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
255
- encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
 
 
 
 
 
 
 
256
 
257
  return encoded_inputs
 
225
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
226
  """
227
  # Load from model defaults
228
+ # assert self.padding_side == "left"
229
 
230
  required_input = encoded_inputs[self.model_input_names[0]]
231
  seq_length = len(required_input)
 
248
  if needs_to_be_padded:
249
  difference = max_length - len(required_input)
250
 
251
+ if self.padding_side == "left":
252
+ if "attention_mask" in encoded_inputs:
253
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
254
+ if "position_ids" in encoded_inputs:
255
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
256
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
257
+ else:
258
+ if "attention_mask" in encoded_inputs:
259
+ encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
260
+ if "position_ids" in encoded_inputs:
261
+ encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference
262
+ encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
263
 
264
  return encoded_inputs