zxdu20 committed on
Commit
63ce1ba
·
1 Parent(s): 72985e8

Update code for slim

Browse files
config.json CHANGED
@@ -8,9 +8,11 @@
8
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
9
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
10
  },
11
- "bos_token_id": 150004,
12
- "eos_token_id": 150005,
13
- "pad_token_id": 20003,
 
 
14
  "hidden_size": 4096,
15
  "inner_hidden_size": 16384,
16
  "layernorm_epsilon": 1e-05,
@@ -22,5 +24,5 @@
22
  "torch_dtype": "float16",
23
  "transformers_version": "4.23.1",
24
  "use_cache": true,
25
- "vocab_size": 150528
26
  }
 
8
  "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
9
  "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
10
  },
11
+ "bos_token_id": 130004,
12
+ "eos_token_id": 130005,
13
+ "mask_token_id": 130000,
14
+ "gmask_token_id": 130001,
15
+ "pad_token_id": 3,
16
  "hidden_size": 4096,
17
  "inner_hidden_size": 16384,
18
  "layernorm_epsilon": 1e-05,
 
24
  "torch_dtype": "float16",
25
  "transformers_version": "4.23.1",
26
  "use_cache": true,
27
+ "vocab_size": 130528
28
  }
configuration_chatglm.py CHANGED
@@ -66,6 +66,8 @@ class ChatGLMConfig(PretrainedConfig):
66
  use_cache=False,
67
  bos_token_id=150004,
68
  eos_token_id=150005,
 
 
69
  pad_token_id=0,
70
  max_sequence_length=2048,
71
  inner_hidden_size=16384,
@@ -86,6 +88,8 @@ class ChatGLMConfig(PretrainedConfig):
86
  self.bos_token_id = bos_token_id
87
  self.eos_token_id = eos_token_id
88
  self.pad_token_id = pad_token_id
 
 
89
  self.position_encoding_2d = position_encoding_2d
90
  self.quantization_bit = quantization_bit
91
  self.pre_seq_len = pre_seq_len
 
66
  use_cache=False,
67
  bos_token_id=150004,
68
  eos_token_id=150005,
69
+ mask_token_id=150000,
70
+ gmask_token_id=150001,
71
  pad_token_id=0,
72
  max_sequence_length=2048,
73
  inner_hidden_size=16384,
 
88
  self.bos_token_id = bos_token_id
89
  self.eos_token_id = eos_token_id
90
  self.pad_token_id = pad_token_id
91
+ self.mask_token_id = mask_token_id
92
+ self.gmask_token_id = gmask_token_id
93
  self.position_encoding_2d = position_encoding_2d
94
  self.quantization_bit = quantization_bit
95
  self.pre_seq_len = pre_seq_len
modeling_chatglm.py CHANGED
@@ -921,7 +921,7 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
921
 
922
 
923
  if position_ids is None:
924
- MASK, gMASK = 150000, 150001
925
  mask_token = MASK if MASK in input_ids else gMASK
926
  use_gmask = False if MASK in input_ids else True
927
 
@@ -1084,7 +1084,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
1084
  **kwargs
1085
  ) -> dict:
1086
  batch_size, seq_length = input_ids.shape
1087
- MASK, gMASK = 150000, 150001
1088
  mask_token = MASK if MASK in input_ids else gMASK
1089
  use_gmask = False if MASK in input_ids else True
1090
  seqs = input_ids.tolist()
 
921
 
922
 
923
  if position_ids is None:
924
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
925
  mask_token = MASK if MASK in input_ids else gMASK
926
  use_gmask = False if MASK in input_ids else True
927
 
 
1084
  **kwargs
1085
  ) -> dict:
1086
  batch_size, seq_length = input_ids.shape
1087
+ MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id
1088
  mask_token = MASK if MASK in input_ids else gMASK
1089
  use_gmask = False if MASK in input_ids else True
1090
  seqs = input_ids.tolist()
tokenization_chatglm.py CHANGED
@@ -48,11 +48,13 @@ class SPTokenizer:
48
  def __init__(
49
  self,
50
  vocab_file,
 
51
  max_blank_length=80,
52
  byte_fallback=True,
53
  ):
54
  assert vocab_file is not None
55
  self.vocab_file = vocab_file
 
56
  self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
57
  self.max_blank_length = max_blank_length
58
  self.byte_fallback = byte_fallback
@@ -70,10 +72,6 @@ class SPTokenizer:
70
  def get_tab_token():
71
  return f"<|tab|>"
72
 
73
- @property
74
- def num_image_tokens(self):
75
- return 20000
76
-
77
  @property
78
  def num_text_tokens(self):
79
  return self.text_tokenizer.num_tokens
@@ -178,6 +176,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
178
  mask_token='[MASK]',
179
  gmask_token='[gMASK]',
180
  padding_side="left",
 
181
  **kwargs
182
  ) -> None:
183
  super().__init__(
@@ -197,7 +196,7 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
197
  self.mask_token = mask_token
198
  self.gmask_token = gmask_token
199
 
200
- self.sp_tokenizer = SPTokenizer(vocab_file)
201
 
202
  """ Initialisation """
203
 
 
48
  def __init__(
49
  self,
50
  vocab_file,
51
+ num_image_tokens=20000,
52
  max_blank_length=80,
53
  byte_fallback=True,
54
  ):
55
  assert vocab_file is not None
56
  self.vocab_file = vocab_file
57
+ self.num_image_tokens = num_image_tokens
58
  self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
59
  self.max_blank_length = max_blank_length
60
  self.byte_fallback = byte_fallback
 
72
  def get_tab_token():
73
  return f"<|tab|>"
74
 
 
 
 
 
75
  @property
76
  def num_text_tokens(self):
77
  return self.text_tokenizer.num_tokens
 
176
  mask_token='[MASK]',
177
  gmask_token='[gMASK]',
178
  padding_side="left",
179
+ num_image_tokens=20000,
180
  **kwargs
181
  ) -> None:
182
  super().__init__(
 
196
  self.mask_token = mask_token
197
  self.gmask_token = gmask_token
198
 
199
+ self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
200
 
201
  """ Initialisation """
202
 
tokenizer_config.json CHANGED
@@ -10,6 +10,7 @@
10
  "remove_space": false,
11
  "do_lower_case": false,
12
  "tokenizer_class": "ChatGLMTokenizer",
 
13
  "auto_map": {
14
  "AutoTokenizer": [
15
  "tokenization_chatglm.ChatGLMTokenizer",
 
10
  "remove_space": false,
11
  "do_lower_case": false,
12
  "tokenizer_class": "ChatGLMTokenizer",
13
+ "num_image_tokens": 0,
14
  "auto_map": {
15
  "AutoTokenizer": [
16
  "tokenization_chatglm.ChatGLMTokenizer",