richardllin cArlIcon commited on
Commit
8782f92
·
1 Parent(s): aa0636a

compatible with Llama (#16)

Browse files

- compatible with Llama (81b8830e79d18b85fc9e3ac40df9b144255afbb3)


Co-authored-by: cArlIcon <[email protected]>

config.json CHANGED
@@ -1,12 +1,7 @@
1
  {
2
  "architectures": [
3
- "YiForCausalLM"
4
  ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_yi.YiConfig",
7
- "AutoModel": "modeling_yi.YiModel",
8
- "AutoModelForCausalLM":"modeling_yi.YiForCausalLM"
9
- },
10
  "bos_token_id": 1,
11
  "eos_token_id": 2,
12
  "hidden_act": "silu",
@@ -14,12 +9,14 @@
14
  "initializer_range": 0.02,
15
  "intermediate_size": 11008,
16
  "max_position_embeddings": 4096,
17
- "model_type": "Yi",
18
  "num_attention_heads": 32,
19
  "num_hidden_layers": 32,
20
  "num_key_value_heads": 4,
21
  "pad_token_id": 0,
 
22
  "rms_norm_eps": 1e-05,
 
23
  "rope_theta": 5000000.0,
24
  "tie_word_embeddings": false,
25
  "torch_dtype": "bfloat16",
 
1
  {
2
  "architectures": [
3
+ "LlamaForCausalLM"
4
  ],
 
 
 
 
 
5
  "bos_token_id": 1,
6
  "eos_token_id": 2,
7
  "hidden_act": "silu",
 
9
  "initializer_range": 0.02,
10
  "intermediate_size": 11008,
11
  "max_position_embeddings": 4096,
12
+ "model_type": "llama",
13
  "num_attention_heads": 32,
14
  "num_hidden_layers": 32,
15
  "num_key_value_heads": 4,
16
  "pad_token_id": 0,
17
+ "pretraining_tp": 1,
18
  "rms_norm_eps": 1e-05,
19
+ "rope_scaling": null,
20
  "rope_theta": 5000000.0,
21
  "tie_word_embeddings": false,
22
  "torch_dtype": "bfloat16",
configuration_yi.py DELETED
@@ -1,121 +0,0 @@
1
- """ Yi model configuration"""
2
- from transformers.configuration_utils import PretrainedConfig
3
- from transformers.utils import logging
4
-
5
- logger = logging.get_logger(__name__)
6
-
7
- Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
8
-
9
-
10
- class YiConfig(PretrainedConfig):
11
- r"""
12
- This is the configuration class to store the configuration of a [`YiModel`]. It is used to instantiate an Yi
13
- model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
14
- defaults will yield a similar configuration to that of the Yi model.
15
-
16
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
17
- documentation from [`PretrainedConfig`] for more information.
18
-
19
-
20
- Args:
21
- vocab_size (`int`, *optional*, defaults to 64000):
22
- Vocabulary size of the Yi model. Defines the number of different tokens that can be represented by the
23
- `inputs_ids` passed when calling [`YiModel`]
24
- hidden_size (`int`, *optional*, defaults to 4096):
25
- Dimension of the hidden representations.
26
- intermediate_size (`int`, *optional*, defaults to 11008):
27
- Dimension of the MLP representations.
28
- num_hidden_layers (`int`, *optional*, defaults to 32):
29
- Number of hidden layers in the Transformer encoder.
30
- num_attention_heads (`int`, *optional*, defaults to 32):
31
- Number of attention heads for each attention layer in the Transformer encoder.
32
- num_key_value_heads (`int`, *optional*):
33
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
34
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
35
- `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
36
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
37
- by meanpooling all the original heads within that group. For more details checkout [this
38
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
39
- `num_attention_heads`.
40
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
41
- The non-linear activation function (function or string) in the decoder.
42
- max_position_embeddings (`int`, *optional*, defaults to 4096):
43
- The maximum sequence length that this model might ever be used with. Typically set this to something large
44
- just in case (e.g., 512 or 1024 or 2048 or 4096).
45
- initializer_range (`float`, *optional*, defaults to 0.02):
46
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
47
- rms_norm_eps (`float`, *optional*, defaults to 1e-5):
48
- The epsilon used by the rms normalization layers.
49
- use_cache (`bool`, *optional*, defaults to `True`):
50
- Whether or not the model should return the last key/values attentions (not used by all models). Only
51
- relevant if `config.is_decoder=True`.
52
- tie_word_embeddings(`bool`, *optional*, defaults to `False`):
53
- Whether to tie weight embeddings
54
- output_attentions (`bool`, *optional*, defaults to `False`):
55
- Whether or not to output attentions.
56
- rope_theta (`float`, *optional*, defaults to 5000000.0):
57
- The base period of the RoPE embeddings.
58
- Example:
59
-
60
- ```python
61
- >>> from transformers import YiModel, YiConfig
62
-
63
- >>> # Initializing a Yi style configuration
64
- >>> configuration = YiConfig()
65
-
66
- >>> # Initializing a model from the Yi style configuration
67
- >>> model = YiModel(configuration)
68
-
69
- >>> # Accessing the model configuration
70
- >>> configuration = model.config
71
- ```"""
72
- model_type = "Yi"
73
- keys_to_ignore_at_inference = ["past_key_values"]
74
-
75
- def __init__(
76
- self,
77
- vocab_size=64000,
78
- hidden_size=4096,
79
- intermediate_size=11008,
80
- num_hidden_layers=32,
81
- num_attention_heads=32,
82
- num_key_value_heads=4,
83
- hidden_act="silu",
84
- max_position_embeddings=4096,
85
- initializer_range=0.02,
86
- rms_norm_eps=1e-5,
87
- use_cache=True,
88
- pad_token_id=0,
89
- bos_token_id=1,
90
- eos_token_id=2,
91
- tie_word_embeddings=False,
92
- output_attentions=False,
93
- rope_theta=5000000.0,
94
- **kwargs,
95
- ):
96
- self.vocab_size = vocab_size
97
- self.max_position_embeddings = max_position_embeddings
98
- self.hidden_size = hidden_size
99
- self.intermediate_size = intermediate_size
100
- self.num_hidden_layers = num_hidden_layers
101
- self.num_attention_heads = num_attention_heads
102
-
103
- # for backward compatibility
104
- if num_key_value_heads is None:
105
- num_key_value_heads = num_attention_heads
106
-
107
- self.num_key_value_heads = num_key_value_heads
108
- self.hidden_act = hidden_act
109
- self.initializer_range = initializer_range
110
- self.rms_norm_eps = rms_norm_eps
111
- self.use_cache = use_cache
112
- self.output_attentions = output_attentions
113
- self.rope_theta = rope_theta
114
-
115
- super().__init__(
116
- pad_token_id=pad_token_id,
117
- bos_token_id=bos_token_id,
118
- eos_token_id=eos_token_id,
119
- tie_word_embeddings=tie_word_embeddings,
120
- **kwargs,
121
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
md5 CHANGED
@@ -1,3 +1,3 @@
1
- e11decc690391b47f62e217a3faca830 pytorch_model-00001-of-00002.bin
2
- ea6e2b5eddc1416a101361efb286d79c pytorch_model-00002-of-00002.bin
3
  291724ef50f729e45d68f474a7755bbc tokenizer.model
 
1
+ 9ed9eddef19f15eeac05e1536c2c134c pytorch_model-00001-of-00002.bin
2
+ 6163ccbe85e077b49a5777f048692eeb pytorch_model-00002-of-00002.bin
3
  291724ef50f729e45d68f474a7755bbc tokenizer.model
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9874235f174cc47cb05ddc2a6806b3e81c5b45b403a85b8339f564673047e82d
3
- size 9943067928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef83351b38f5b5377283c507015f80c17a79d7e7a4d0b90f5b8f64dd1bb84f5a
3
+ size 9943068816
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c4472218560a9a56162a4a65fdb1f0bcbab3f2bd747291e61914e990eec4a3f
3
- size 2179035848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dd44ffe41b07e5c3ab04fc6a41d8461d45487586a17388e6290aac54568a23c
3
+ size 2179036016
model.safetensors.index.json CHANGED
@@ -5,290 +5,290 @@
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
7
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
- "model.layers.0.ln1.weight": "model-00001-of-00002.safetensors",
9
- "model.layers.0.ln2.weight": "model-00001-of-00002.safetensors",
10
  "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
11
  "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
  "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
13
  "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
  "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
  "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
16
  "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
17
- "model.layers.1.ln1.weight": "model-00001-of-00002.safetensors",
18
- "model.layers.1.ln2.weight": "model-00001-of-00002.safetensors",
19
  "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
20
  "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
21
  "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
22
  "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
23
  "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
24
  "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
25
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
26
- "model.layers.10.ln1.weight": "model-00001-of-00002.safetensors",
27
- "model.layers.10.ln2.weight": "model-00001-of-00002.safetensors",
28
  "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
29
  "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
30
  "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
31
  "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
32
  "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
33
  "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
  "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
35
- "model.layers.11.ln1.weight": "model-00001-of-00002.safetensors",
36
- "model.layers.11.ln2.weight": "model-00001-of-00002.safetensors",
37
  "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
38
  "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
39
  "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
40
  "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
41
  "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
42
  "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
43
  "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
44
- "model.layers.12.ln1.weight": "model-00001-of-00002.safetensors",
45
- "model.layers.12.ln2.weight": "model-00001-of-00002.safetensors",
46
  "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
47
  "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
48
  "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
49
  "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
  "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
51
  "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
  "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
- "model.layers.13.ln1.weight": "model-00001-of-00002.safetensors",
54
- "model.layers.13.ln2.weight": "model-00001-of-00002.safetensors",
55
  "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
56
  "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
57
  "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
58
  "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
  "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
  "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
61
  "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
62
- "model.layers.14.ln1.weight": "model-00001-of-00002.safetensors",
63
- "model.layers.14.ln2.weight": "model-00001-of-00002.safetensors",
64
  "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
  "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
  "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
67
  "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
68
  "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
69
  "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
  "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
71
- "model.layers.15.ln1.weight": "model-00001-of-00002.safetensors",
72
- "model.layers.15.ln2.weight": "model-00001-of-00002.safetensors",
73
  "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
74
  "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
75
  "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
76
  "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
77
  "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
78
  "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
79
  "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
80
- "model.layers.16.ln1.weight": "model-00001-of-00002.safetensors",
81
- "model.layers.16.ln2.weight": "model-00001-of-00002.safetensors",
82
  "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
83
  "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
84
  "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
85
  "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
  "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
87
  "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
88
  "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
89
- "model.layers.17.ln1.weight": "model-00001-of-00002.safetensors",
90
- "model.layers.17.ln2.weight": "model-00001-of-00002.safetensors",
91
  "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
92
  "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
93
  "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
94
  "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
95
  "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
96
  "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
97
  "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
98
- "model.layers.18.ln1.weight": "model-00001-of-00002.safetensors",
99
- "model.layers.18.ln2.weight": "model-00001-of-00002.safetensors",
100
  "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
101
  "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
102
  "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
103
  "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
104
  "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
  "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
  "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
- "model.layers.19.ln1.weight": "model-00001-of-00002.safetensors",
108
- "model.layers.19.ln2.weight": "model-00001-of-00002.safetensors",
109
  "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
110
  "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
111
  "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
112
  "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
113
  "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
114
  "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
115
  "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
116
- "model.layers.2.ln1.weight": "model-00001-of-00002.safetensors",
117
- "model.layers.2.ln2.weight": "model-00001-of-00002.safetensors",
118
  "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
119
  "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
120
  "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
121
  "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
122
  "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
123
  "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
124
  "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
125
- "model.layers.20.ln1.weight": "model-00001-of-00002.safetensors",
126
- "model.layers.20.ln2.weight": "model-00001-of-00002.safetensors",
127
  "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
128
  "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
129
  "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
130
  "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
131
  "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
132
  "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
133
  "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
134
- "model.layers.21.ln1.weight": "model-00001-of-00002.safetensors",
135
- "model.layers.21.ln2.weight": "model-00001-of-00002.safetensors",
136
  "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
137
  "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
138
  "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
139
  "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
140
  "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
141
  "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
  "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
143
- "model.layers.22.ln1.weight": "model-00001-of-00002.safetensors",
144
- "model.layers.22.ln2.weight": "model-00001-of-00002.safetensors",
145
  "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
146
  "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
147
  "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
148
  "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
149
  "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
150
  "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
  "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
- "model.layers.23.ln1.weight": "model-00001-of-00002.safetensors",
153
- "model.layers.23.ln2.weight": "model-00001-of-00002.safetensors",
154
  "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
155
  "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
156
  "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
157
  "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
  "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
  "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
  "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
161
- "model.layers.24.ln1.weight": "model-00001-of-00002.safetensors",
162
- "model.layers.24.ln2.weight": "model-00001-of-00002.safetensors",
163
  "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
164
  "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
165
  "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
166
  "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
167
  "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
168
  "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
169
  "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
170
- "model.layers.25.ln1.weight": "model-00001-of-00002.safetensors",
171
- "model.layers.25.ln2.weight": "model-00001-of-00002.safetensors",
172
  "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
173
  "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
174
  "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
175
  "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
176
  "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
177
  "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
  "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
179
- "model.layers.26.ln1.weight": "model-00001-of-00002.safetensors",
180
- "model.layers.26.ln2.weight": "model-00001-of-00002.safetensors",
181
  "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
182
  "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
183
  "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
184
  "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
185
  "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
186
  "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
187
  "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
188
- "model.layers.27.ln1.weight": "model-00002-of-00002.safetensors",
189
- "model.layers.27.ln2.weight": "model-00002-of-00002.safetensors",
190
  "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
191
  "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
192
  "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
 
193
  "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
194
  "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
195
  "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
196
  "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
197
- "model.layers.28.ln1.weight": "model-00002-of-00002.safetensors",
198
- "model.layers.28.ln2.weight": "model-00002-of-00002.safetensors",
199
  "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
200
  "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
201
  "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
 
202
  "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
203
  "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
204
  "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
  "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
- "model.layers.29.ln1.weight": "model-00002-of-00002.safetensors",
207
- "model.layers.29.ln2.weight": "model-00002-of-00002.safetensors",
208
  "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
209
  "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
210
  "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
 
211
  "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
212
  "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
213
  "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
214
  "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
215
- "model.layers.3.ln1.weight": "model-00001-of-00002.safetensors",
216
- "model.layers.3.ln2.weight": "model-00001-of-00002.safetensors",
217
  "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
218
  "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
219
  "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
220
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
221
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
222
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
223
  "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
224
- "model.layers.30.ln1.weight": "model-00002-of-00002.safetensors",
225
- "model.layers.30.ln2.weight": "model-00002-of-00002.safetensors",
226
  "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
227
  "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
228
  "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
 
229
  "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
230
  "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
231
  "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
232
  "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
233
- "model.layers.31.ln1.weight": "model-00002-of-00002.safetensors",
234
- "model.layers.31.ln2.weight": "model-00002-of-00002.safetensors",
235
  "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
236
  "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
237
  "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
 
238
  "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
239
  "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
240
  "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
241
  "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
242
- "model.layers.4.ln1.weight": "model-00001-of-00002.safetensors",
243
- "model.layers.4.ln2.weight": "model-00001-of-00002.safetensors",
244
  "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
245
  "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
246
  "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
247
  "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
248
  "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
249
  "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
250
  "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
251
- "model.layers.5.ln1.weight": "model-00001-of-00002.safetensors",
252
- "model.layers.5.ln2.weight": "model-00001-of-00002.safetensors",
253
  "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
254
  "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
255
  "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
256
  "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
257
  "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
258
  "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
259
  "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
260
- "model.layers.6.ln1.weight": "model-00001-of-00002.safetensors",
261
- "model.layers.6.ln2.weight": "model-00001-of-00002.safetensors",
262
  "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
  "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
  "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
265
  "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
266
  "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
267
  "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
268
  "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
269
- "model.layers.7.ln1.weight": "model-00001-of-00002.safetensors",
270
- "model.layers.7.ln2.weight": "model-00001-of-00002.safetensors",
271
  "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
272
  "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
273
  "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
274
  "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
275
  "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
276
  "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
277
  "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
278
- "model.layers.8.ln1.weight": "model-00001-of-00002.safetensors",
279
- "model.layers.8.ln2.weight": "model-00001-of-00002.safetensors",
280
  "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
281
  "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
282
  "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
283
  "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
284
  "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
285
  "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
286
  "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
287
- "model.layers.9.ln1.weight": "model-00001-of-00002.safetensors",
288
- "model.layers.9.ln2.weight": "model-00001-of-00002.safetensors",
289
  "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
290
  "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
291
  "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
 
292
  "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
293
  "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
294
  "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
 
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
7
  "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
9
  "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
  "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
  "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
  "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
14
  "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
15
  "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
16
  "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
18
  "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
19
  "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
20
  "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
22
  "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
23
  "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
24
  "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
25
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
27
  "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
28
  "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
29
  "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
31
  "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
32
  "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
33
  "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
  "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
36
  "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
37
  "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
38
  "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
40
  "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
41
  "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
42
  "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
43
  "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
45
  "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
46
  "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
47
  "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
49
  "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
50
  "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
51
  "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
  "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
54
  "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
55
  "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
56
  "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
58
  "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
  "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
  "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
61
  "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
63
  "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
  "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
65
  "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
67
  "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
68
  "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
69
  "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
  "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
72
  "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
73
  "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
74
  "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
76
  "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
77
  "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
78
  "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
79
  "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
81
  "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
82
  "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
83
  "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
85
  "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
86
  "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
87
  "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
88
  "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
90
  "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
91
  "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
92
  "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
94
  "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
95
  "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
96
  "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
97
  "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
99
  "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
100
  "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
101
  "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
103
  "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
104
  "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
105
  "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
  "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
108
  "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
  "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
  "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
  "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
113
  "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
114
  "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
115
  "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
117
  "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
118
  "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
119
  "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
121
  "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
122
  "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
123
  "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
124
  "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
126
  "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
127
  "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
128
  "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
130
  "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
131
  "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
132
  "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
133
  "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
135
  "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
136
  "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
137
  "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
139
  "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
140
  "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
141
  "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
  "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
144
  "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
145
  "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
146
  "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
148
  "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
149
  "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
150
  "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
  "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
153
  "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
154
  "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
155
  "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
157
  "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
  "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
  "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
  "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
162
  "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
163
  "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
164
  "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
166
  "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
167
  "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
168
  "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
169
  "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
171
  "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
172
  "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
173
  "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
175
  "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
176
  "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
177
  "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
  "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
180
  "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
181
  "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
182
  "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
184
  "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
185
  "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
186
  "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
187
  "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
 
189
  "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
190
  "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
191
  "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
193
  "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
194
  "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
195
  "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
196
  "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
 
198
  "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
199
  "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
200
  "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
202
  "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
203
  "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
204
  "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
  "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
 
207
  "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
  "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
  "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
  "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
212
  "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
213
  "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
214
  "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
216
  "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
217
  "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
218
  "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
220
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
221
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
222
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
223
  "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
 
225
  "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
226
  "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
227
  "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
229
  "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
230
  "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
231
  "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
232
  "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
 
234
  "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
235
  "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
236
  "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
238
  "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
239
  "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
240
  "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
241
  "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
243
  "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
244
  "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
245
  "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
247
  "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
248
  "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
249
  "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
250
  "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
252
  "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
253
  "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
254
  "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
256
  "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
257
  "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
258
  "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
259
  "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
261
  "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
262
  "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
263
  "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
265
  "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
266
  "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
267
  "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
268
  "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
270
  "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
271
  "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
272
  "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
274
  "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
275
  "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
276
  "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
277
  "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
279
  "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
280
  "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
281
  "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
283
  "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
284
  "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
285
  "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
286
  "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
 
288
  "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
289
  "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
290
  "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
292
  "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
293
  "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
294
  "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
modeling_yi.py DELETED
@@ -1,1028 +0,0 @@
1
- """ PyTorch Yi model."""
2
- import math
3
- from typing import List, Optional, Tuple, Union
4
-
5
- import torch.utils.checkpoint
6
- from einops import repeat
7
- from torch import nn
8
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
9
- from transformers.activations import ACT2FN
10
- from transformers.modeling_outputs import (
11
- BaseModelOutputWithPast,
12
- CausalLMOutputWithPast,
13
- SequenceClassifierOutputWithPast,
14
- )
15
- from transformers.modeling_utils import PreTrainedModel
16
- from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
17
- from transformers.utils import (
18
- add_start_docstrings,
19
- add_start_docstrings_to_model_forward,
20
- logging,
21
- replace_return_docstrings,
22
- )
23
-
24
- from .configuration_yi import YiConfig
25
-
26
- is_flash_attn_available = True
27
- try:
28
- from flash_attn import flash_attn_func
29
- except Exception:
30
- is_flash_attn_available = False
31
-
32
- logger = logging.get_logger(__name__)
33
-
34
- _CONFIG_FOR_DOC = "YiConfig"
35
-
36
-
37
- # Copied from transformers.models.bart.modeling_bart._make_causal_mask
38
- def _make_causal_mask(
39
- input_ids_shape: torch.Size,
40
- dtype: torch.dtype,
41
- device: torch.device,
42
- past_key_values_length: int = 0,
43
- ):
44
- """
45
- Make causal mask used for bi-directional self-attention.
46
- """
47
- bsz, tgt_len = input_ids_shape
48
- mask = torch.full(
49
- (tgt_len, tgt_len),
50
- torch.tensor(torch.finfo(dtype).min, device=device),
51
- device=device,
52
- )
53
- mask_cond = torch.arange(mask.size(-1), device=device)
54
- mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
55
- mask = mask.to(dtype)
56
-
57
- if past_key_values_length > 0:
58
- mask = torch.cat(
59
- [
60
- torch.zeros(
61
- tgt_len, past_key_values_length, dtype=dtype, device=device
62
- ),
63
- mask,
64
- ],
65
- dim=-1,
66
- )
67
- return mask[None, None, :, :].expand(
68
- bsz, 1, tgt_len, tgt_len + past_key_values_length
69
- )
70
-
71
-
72
- # Copied from transformers.models.bart.modeling_bart._expand_mask
73
- def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
74
- """
75
- Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
76
- """
77
- bsz, src_len = mask.size()
78
- tgt_len = tgt_len if tgt_len is not None else src_len
79
-
80
- expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
81
-
82
- inverted_mask = 1.0 - expanded_mask
83
-
84
- return inverted_mask.masked_fill(
85
- inverted_mask.to(torch.bool), torch.finfo(dtype).min
86
- )
87
-
88
-
89
- class YiRMSNorm(nn.Module):
90
- def __init__(self, hidden_size, eps=1e-5):
91
- """
92
- YiRMSNorm is equivalent to T5LayerNorm
93
- """
94
- super().__init__()
95
- self.weight = nn.Parameter(torch.ones(hidden_size))
96
- self.variance_epsilon = eps
97
-
98
- def forward(self, hidden_states):
99
- input_dtype = hidden_states.dtype
100
- hidden_states = hidden_states.to(torch.float32)
101
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
102
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
103
-
104
- return self.weight * hidden_states.to(input_dtype)
105
-
106
-
107
- ALL_LAYERNORM_LAYERS.append(YiRMSNorm)
108
-
109
-
110
- class YiRotaryEmbedding(torch.nn.Module):
111
- def __init__(self, dim, max_position_embeddings=4096, base=5000000, device=None):
112
- super().__init__()
113
-
114
- self.dim = dim
115
- self.max_position_embeddings = max_position_embeddings
116
- self.base = base
117
-
118
- # Build here to make `torch.jit.trace` work.
119
- self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device)
120
-
121
- def _set_cos_sin_cache(self, seq_len, device):
122
- self.max_seq_len_cached = seq_len
123
- inv_freq = 1.0 / (
124
- self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
125
- )
126
- t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
127
- freqs = torch.einsum("i,j->ij", t, inv_freq)
128
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
129
- emb = torch.cat((freqs, freqs), dim=-1)
130
- self.register_buffer(
131
- "cos_cached", emb.cos()[None, None, :, :], persistent=False
132
- )
133
- self.register_buffer(
134
- "sin_cached", emb.sin()[None, None, :, :], persistent=False
135
- )
136
-
137
- def forward(self, x, seq_len=None):
138
- # x: [bs, num_attention_heads, seq_len, head_size]
139
- if seq_len > self.max_seq_len_cached:
140
- self._set_cos_sin_cache(seq_len=seq_len, device=x.device)
141
-
142
- return (
143
- self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
144
- self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
145
- )
146
-
147
-
148
- def rotate_half(x):
149
- """Rotates half the hidden dims of the input."""
150
- x1 = x[..., : x.shape[-1] // 2]
151
- x2 = x[..., x.shape[-1] // 2 :]
152
- return torch.cat((-x2, x1), dim=-1)
153
-
154
-
155
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids, flash_attn_available):
156
- # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
157
- cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
158
- sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
159
- expand_dim = 2 if flash_attn_available else 1
160
- cos = cos[position_ids].unsqueeze(expand_dim) # [bs, seq_len, dim]
161
- sin = sin[position_ids].unsqueeze(expand_dim) # [bs, seq_len, dim]
162
- q_embed = (q * cos) + (rotate_half(q) * sin)
163
- k_embed = (k * cos) + (rotate_half(k) * sin)
164
- return q_embed, k_embed
165
-
166
-
167
- class YiMLP(nn.Module):
168
- def __init__(self, hidden_size: int, intermediate_size: int, hidden_act: str):
169
- super().__init__()
170
- self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
171
- self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
172
- self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
173
- self.act_fn = ACT2FN[hidden_act]
174
-
175
- def forward(self, x):
176
- return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
177
-
178
-
179
- class YiAttention(nn.Module):
180
- """Multi-headed attention from 'Attention Is All You Need' paper"""
181
-
182
- def __init__(self, config: YiConfig):
183
- super().__init__()
184
- self.config = config
185
- self.hidden_size = config.hidden_size
186
- self.num_heads = config.num_attention_heads
187
- self.head_dim = self.hidden_size // self.num_heads
188
- self.num_key_value_heads = config.num_key_value_heads
189
- self.num_key_value_groups = self.num_heads // self.num_key_value_heads
190
- self.max_position_embeddings = config.max_position_embeddings
191
-
192
- if (self.head_dim * self.num_heads) != self.hidden_size:
193
- raise ValueError(
194
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
195
- f" and `num_heads`: {self.num_heads})."
196
- )
197
- self.q_proj = nn.Linear(
198
- self.hidden_size, self.num_heads * self.head_dim, bias=False
199
- )
200
- self.k_proj = nn.Linear(
201
- self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
202
- )
203
- self.v_proj = nn.Linear(
204
- self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
205
- )
206
- self.o_proj = nn.Linear(
207
- self.num_heads * self.head_dim, self.hidden_size, bias=False
208
- )
209
-
210
- self.rotary_emb = YiRotaryEmbedding(
211
- self.head_dim,
212
- max_position_embeddings=self.max_position_embeddings,
213
- base=self.config.rope_theta,
214
- )
215
-
216
- def forward(
217
- self,
218
- hidden_states: torch.Tensor,
219
- attention_mask: Optional[torch.Tensor] = None,
220
- position_ids: Optional[torch.LongTensor] = None,
221
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
222
- output_attentions: bool = False,
223
- use_cache: bool = False,
224
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
225
- bsz, q_len, _ = hidden_states.size()
226
-
227
- query_states = self.q_proj(hidden_states).view(
228
- bsz, q_len, self.num_heads, self.head_dim
229
- )
230
-
231
- key_states = self.k_proj(hidden_states).view(
232
- bsz, q_len, self.num_key_value_heads, self.head_dim
233
- )
234
- value_states = self.v_proj(hidden_states).view(
235
- bsz, q_len, self.num_key_value_heads, self.head_dim
236
- )
237
-
238
- if not is_flash_attn_available:
239
- if self.num_key_value_groups > 1:
240
- key_states = repeat(
241
- key_states, f"b n h d -> b n (h {self.num_key_value_groups}) d"
242
- )
243
- value_states = repeat(
244
- value_states, f"b n h d -> b n (h {self.num_key_value_groups}) d"
245
- )
246
-
247
- # b n h d -> b h n d
248
- query_states = query_states.transpose(1, 2)
249
- key_states = key_states.transpose(1, 2)
250
- value_states = value_states.transpose(1, 2)
251
-
252
- seq_dim = 1 if is_flash_attn_available else 2
253
- kv_seq_len = key_states.shape[seq_dim]
254
- if past_key_value is not None:
255
- kv_seq_len += past_key_value[0].shape[seq_dim]
256
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
257
- query_states, key_states = apply_rotary_pos_emb(
258
- query_states, key_states, cos, sin, position_ids, is_flash_attn_available
259
- )
260
-
261
- if past_key_value is not None:
262
- # reuse k, v, self_attention
263
- key_states = torch.cat([past_key_value[0], key_states], dim=seq_dim)
264
- value_states = torch.cat([past_key_value[1], value_states], dim=seq_dim)
265
-
266
- past_key_value = (key_states, value_states) if use_cache else None
267
-
268
- if is_flash_attn_available:
269
- attn_output = flash_attn_func(
270
- query_states, key_states, value_states, dropout_p=0.0, causal=True
271
- )
272
- else:
273
- attn_weights = torch.matmul(
274
- query_states, key_states.transpose(2, 3)
275
- ) / math.sqrt(self.head_dim)
276
-
277
- if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
278
- raise ValueError(
279
- f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
280
- f" {attn_weights.size()}"
281
- )
282
-
283
- if attention_mask is not None:
284
- if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
285
- raise ValueError(
286
- f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is"
287
- f"{attention_mask.size()}"
288
- )
289
- attn_weights = attn_weights + attention_mask
290
- dtype_min = torch.tensor(
291
- torch.finfo(attn_weights.dtype).min,
292
- device=attn_weights.device,
293
- dtype=attn_weights.dtype,
294
- )
295
- attn_weights = torch.max(attn_weights, dtype_min)
296
-
297
- # upcast attention to fp32
298
- attn_weights = nn.functional.softmax(
299
- attn_weights, dim=-1, dtype=torch.float32
300
- ).to(query_states.dtype)
301
- attn_output = torch.matmul(attn_weights, value_states)
302
-
303
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
304
- raise ValueError(
305
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
306
- f" {attn_output.size()}"
307
- )
308
-
309
- if not is_flash_attn_available:
310
- attn_output = attn_output.transpose(1, 2)
311
-
312
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
313
-
314
- attn_output = self.o_proj(attn_output)
315
-
316
- if not output_attentions:
317
- attn_weights = None
318
-
319
- return attn_output, attn_weights, past_key_value
320
-
321
-
322
- class YiDecoderLayer(nn.Module):
323
- def __init__(self, config: YiConfig):
324
- super().__init__()
325
-
326
- self.hidden_size = config.hidden_size
327
- self.self_attn = YiAttention(config=config)
328
- self.mlp = YiMLP(
329
- hidden_size=self.hidden_size,
330
- intermediate_size=config.intermediate_size,
331
- hidden_act=config.hidden_act,
332
- )
333
-
334
- self.ln1 = YiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
335
- self.ln2 = YiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
336
-
337
- def forward(
338
- self,
339
- hidden_states: torch.Tensor,
340
- attention_mask: Optional[torch.Tensor] = None,
341
- position_ids: Optional[torch.LongTensor] = None,
342
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
343
- output_attentions: Optional[bool] = False,
344
- use_cache: Optional[bool] = False,
345
- ) -> Tuple[
346
- torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
347
- ]:
348
- """
349
- Args:
350
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
351
- attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
352
- `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
353
- output_attentions (`bool`, *optional*):
354
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
355
- returned tensors for more detail.
356
- use_cache (`bool`, *optional*):
357
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
358
- (see `past_key_values`).
359
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
360
- """
361
-
362
- residual = hidden_states
363
-
364
- hidden_states = self.ln1(hidden_states)
365
-
366
- # Self Attention
367
- hidden_states, self_attn_weights, present_key_value = self.self_attn(
368
- hidden_states=hidden_states,
369
- attention_mask=attention_mask,
370
- position_ids=position_ids,
371
- past_key_value=past_key_value,
372
- output_attentions=output_attentions,
373
- use_cache=use_cache,
374
- )
375
- hidden_states = residual + hidden_states
376
-
377
- # Fully Connected
378
- residual = hidden_states
379
- hidden_states = self.ln2(hidden_states)
380
- hidden_states = self.mlp(hidden_states)
381
- hidden_states = residual + hidden_states
382
-
383
- outputs = (hidden_states,)
384
-
385
- if output_attentions:
386
- outputs += (self_attn_weights,)
387
-
388
- if use_cache:
389
- outputs += (present_key_value,)
390
-
391
- return outputs
392
-
393
-
394
- Yi_START_DOCSTRING = r"""
395
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
396
- library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
397
- etc.)
398
-
399
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
400
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
401
- and behavior.
402
-
403
- Parameters:
404
- config ([`YiConfig`]):
405
- Model configuration class with all the parameters of the model. Initializing with a config file does not
406
- load the weights associated with the model, only the configuration. Check out the
407
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
408
- """
409
-
410
-
411
- @add_start_docstrings(
412
- "The bare Yi Model outputting raw hidden-states without any specific head on top.",
413
- Yi_START_DOCSTRING,
414
- )
415
- class YiPreTrainedModel(PreTrainedModel):
416
- config_class = YiConfig
417
- base_model_prefix = "model"
418
- supports_gradient_checkpointing = True
419
- _no_split_modules = ["YiDecoderLayer"]
420
- _skip_keys_device_placement = "past_key_values"
421
-
422
- def _init_weights(self, module):
423
- std = self.config.initializer_range
424
- if isinstance(module, nn.Linear):
425
- module.weight.data.normal_(mean=0.0, std=std)
426
- if module.bias is not None:
427
- module.bias.data.zero_()
428
- elif isinstance(module, nn.Embedding):
429
- module.weight.data.normal_(mean=0.0, std=std)
430
- if module.padding_idx is not None:
431
- module.weight.data[module.padding_idx].zero_()
432
-
433
- def _set_gradient_checkpointing(self, module, value=False):
434
- if isinstance(module, YiModel):
435
- module.gradient_checkpointing = value
436
-
437
-
438
- Yi_INPUTS_DOCSTRING = r"""
439
- Args:
440
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
441
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
442
- it.
443
-
444
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
445
- [`PreTrainedTokenizer.__call__`] for details.
446
-
447
- [What are input IDs?](../glossary#input-ids)
448
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
449
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
450
-
451
- - 1 for tokens that are **not masked**,
452
- - 0 for tokens that are **masked**.
453
-
454
- [What are attention masks?](../glossary#attention-mask)
455
-
456
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
457
- [`PreTrainedTokenizer.__call__`] for details.
458
-
459
- If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
460
- `past_key_values`).
461
-
462
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
463
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
464
- information on the default strategy.
465
-
466
- - 1 indicates the head is **not masked**,
467
- - 0 indicates the head is **masked**.
468
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
469
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
470
- config.n_positions - 1]`.
471
-
472
- [What are position IDs?](../glossary#position-ids)
473
- past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
474
- Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
475
- `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
476
- `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
477
-
478
- Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
479
- blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
480
-
481
- If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
482
- don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
483
- `decoder_input_ids` of shape `(batch_size, sequence_length)`.
484
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
485
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
486
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
487
- model's internal embedding lookup matrix.
488
- use_cache (`bool`, *optional*):
489
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
490
- `past_key_values`).
491
- output_attentions (`bool`, *optional*):
492
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
493
- tensors for more detail.
494
- output_hidden_states (`bool`, *optional*):
495
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
496
- more detail.
497
- return_dict (`bool`, *optional*):
498
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
499
- """
500
-
501
-
502
- @add_start_docstrings(
503
- "The bare Yi Model outputting raw hidden-states without any specific head on top.",
504
- Yi_START_DOCSTRING,
505
- )
506
- class YiModel(YiPreTrainedModel):
507
- """
508
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`YiDecoderLayer`]
509
-
510
- Args:
511
- config: YiConfig
512
- """
513
-
514
- def __init__(self, config: YiConfig):
515
- super().__init__(config)
516
- self.padding_idx = config.pad_token_id
517
- self.vocab_size = config.vocab_size
518
-
519
- self.embed_tokens = nn.Embedding(
520
- config.vocab_size, config.hidden_size, self.padding_idx
521
- )
522
- self.layers = nn.ModuleList(
523
- [YiDecoderLayer(config) for _ in range(config.num_hidden_layers)]
524
- )
525
-
526
- self.norm = YiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
527
-
528
- self.gradient_checkpointing = False
529
- # Initialize weights and apply final processing
530
- self.post_init()
531
-
532
- def get_input_embeddings(self):
533
- return self.embed_tokens
534
-
535
- def set_input_embeddings(self, value):
536
- self.embed_tokens = value
537
-
538
- # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
539
- def _prepare_decoder_attention_mask(
540
- self, attention_mask, input_ids, inputs_embeds, past_key_values_length
541
- ):
542
- input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1]
543
- # create causal mask
544
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
545
- combined_attention_mask = None
546
- if input_shape[-1] > 1:
547
- combined_attention_mask = _make_causal_mask(
548
- input_shape,
549
- inputs_embeds.dtype,
550
- device=inputs_embeds.device,
551
- past_key_values_length=past_key_values_length,
552
- )
553
-
554
- if attention_mask is not None:
555
- # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
556
- expanded_attn_mask = _expand_mask(
557
- attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
558
- ).to(inputs_embeds.device)
559
- combined_attention_mask = (
560
- expanded_attn_mask
561
- if combined_attention_mask is None
562
- else expanded_attn_mask + combined_attention_mask
563
- )
564
-
565
- return combined_attention_mask
566
-
567
- @add_start_docstrings_to_model_forward(Yi_INPUTS_DOCSTRING)
568
- def forward(
569
- self,
570
- input_ids: torch.LongTensor = None,
571
- attention_mask: Optional[torch.Tensor] = None,
572
- position_ids: Optional[torch.LongTensor] = None,
573
- past_key_values: Optional[List[torch.FloatTensor]] = None,
574
- inputs_embeds: Optional[torch.FloatTensor] = None,
575
- use_cache: Optional[bool] = None,
576
- output_attentions: Optional[bool] = None,
577
- output_hidden_states: Optional[bool] = None,
578
- return_dict: Optional[bool] = None,
579
- ) -> Union[Tuple, BaseModelOutputWithPast]:
580
- output_attentions = (
581
- output_attentions
582
- if output_attentions is not None
583
- else self.config.output_attentions
584
- )
585
- output_hidden_states = (
586
- output_hidden_states
587
- if output_hidden_states is not None
588
- else self.config.output_hidden_states
589
- )
590
- use_cache = use_cache if use_cache is not None else self.config.use_cache
591
-
592
- return_dict = (
593
- return_dict if return_dict is not None else self.config.use_return_dict
594
- )
595
-
596
- # retrieve input_ids and inputs_embeds
597
- if input_ids is not None and inputs_embeds is not None:
598
- raise ValueError(
599
- "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
600
- )
601
- elif input_ids is not None:
602
- batch_size, seq_length = input_ids.shape
603
- elif inputs_embeds is not None:
604
- batch_size, seq_length, _ = inputs_embeds.shape
605
- else:
606
- raise ValueError(
607
- "You have to specify either decoder_input_ids or decoder_inputs_embeds"
608
- )
609
-
610
- seq_length_with_past = seq_length
611
- past_key_values_length = 0
612
-
613
- if past_key_values is not None:
614
- past_key_values_length = past_key_values[0][0].shape[2]
615
- seq_length_with_past = seq_length_with_past + past_key_values_length
616
-
617
- if position_ids is None:
618
- device = input_ids.device if input_ids is not None else inputs_embeds.device
619
- position_ids = torch.arange(
620
- past_key_values_length,
621
- seq_length + past_key_values_length,
622
- dtype=torch.long,
623
- device=device,
624
- )
625
- position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
626
- else:
627
- position_ids = position_ids.view(-1, seq_length).long()
628
-
629
- if inputs_embeds is None:
630
- inputs_embeds = self.embed_tokens(input_ids)
631
-
632
- if not is_flash_attn_available:
633
- # embed positions
634
- if attention_mask is None:
635
- attention_mask = torch.ones(
636
- (batch_size, seq_length_with_past),
637
- dtype=torch.bool,
638
- device=inputs_embeds.device,
639
- )
640
- attention_mask = self._prepare_decoder_attention_mask(
641
- attention_mask,
642
- input_ids,
643
- inputs_embeds,
644
- past_key_values_length,
645
- )
646
- else:
647
- attention_mask = None
648
-
649
- hidden_states = inputs_embeds
650
- if self.gradient_checkpointing and self.training:
651
- if use_cache:
652
- logger.warning_once(
653
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
654
- )
655
- use_cache = False
656
-
657
- # decoder layers
658
- all_hidden_states = () if output_hidden_states else None
659
- all_self_attns = () if output_attentions else None
660
- next_decoder_cache = () if use_cache else None
661
-
662
- for idx, decoder_layer in enumerate(self.layers):
663
- if output_hidden_states:
664
- all_hidden_states += (hidden_states,)
665
-
666
- past_key_value = (
667
- past_key_values[idx] if past_key_values is not None else None
668
- )
669
-
670
- if self.gradient_checkpointing and self.training:
671
-
672
- def create_custom_forward(module):
673
- def custom_forward(*inputs):
674
- # None for past_key_value
675
- return module(*inputs, past_key_value, output_attentions)
676
-
677
- return custom_forward
678
-
679
- layer_outputs = torch.utils.checkpoint.checkpoint(
680
- create_custom_forward(decoder_layer),
681
- hidden_states,
682
- attention_mask,
683
- position_ids,
684
- )
685
- else:
686
- layer_outputs = decoder_layer(
687
- hidden_states,
688
- attention_mask=attention_mask,
689
- position_ids=position_ids,
690
- past_key_value=past_key_value,
691
- output_attentions=output_attentions,
692
- use_cache=use_cache,
693
- )
694
-
695
- hidden_states = layer_outputs[0]
696
-
697
- if use_cache:
698
- next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
699
-
700
- if output_attentions:
701
- all_self_attns += (layer_outputs[1],)
702
-
703
- hidden_states = self.norm(hidden_states)
704
- # add hidden states from the last decoder layer
705
- if output_hidden_states:
706
- all_hidden_states += (hidden_states,)
707
-
708
- next_cache = next_decoder_cache if use_cache else None
709
- if not return_dict:
710
- return tuple(
711
- v
712
- for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
713
- if v is not None
714
- )
715
- return BaseModelOutputWithPast(
716
- last_hidden_state=hidden_states,
717
- past_key_values=next_cache,
718
- hidden_states=all_hidden_states,
719
- attentions=all_self_attns,
720
- )
721
-
722
-
723
- class YiForCausalLM(YiPreTrainedModel):
724
- _tied_weights_keys = ["lm_head.weight"]
725
-
726
- def __init__(self, config):
727
- super().__init__(config)
728
- self.model = YiModel(config)
729
-
730
- self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
731
-
732
- # Initialize weights and apply final processing
733
- self.post_init()
734
-
735
- def get_input_embeddings(self):
736
- return self.model.embed_tokens
737
-
738
- def set_input_embeddings(self, value):
739
- self.model.embed_tokens = value
740
-
741
- def get_output_embeddings(self):
742
- return self.lm_head
743
-
744
- def set_output_embeddings(self, new_embeddings):
745
- self.lm_head = new_embeddings
746
-
747
- def set_decoder(self, decoder):
748
- self.model = decoder
749
-
750
- def get_decoder(self):
751
- return self.model
752
-
753
- @add_start_docstrings_to_model_forward(Yi_INPUTS_DOCSTRING)
754
- @replace_return_docstrings(
755
- output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
756
- )
757
- def forward(
758
- self,
759
- input_ids: torch.LongTensor = None,
760
- attention_mask: Optional[torch.Tensor] = None,
761
- position_ids: Optional[torch.LongTensor] = None,
762
- past_key_values: Optional[List[torch.FloatTensor]] = None,
763
- inputs_embeds: Optional[torch.FloatTensor] = None,
764
- labels: Optional[torch.LongTensor] = None,
765
- use_cache: Optional[bool] = None,
766
- output_attentions: Optional[bool] = None,
767
- output_hidden_states: Optional[bool] = None,
768
- return_dict: Optional[bool] = None,
769
- ) -> Union[Tuple, CausalLMOutputWithPast]:
770
- r"""
771
- Args:
772
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
773
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
774
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
775
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
776
-
777
- Returns:
778
-
779
- Example:
780
-
781
- ```python
782
- >>> from transformers import AutoTokenizer, YiForCausalLM
783
-
784
- >>> model = YiForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
785
- >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
786
-
787
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
788
- >>> inputs = tokenizer(prompt, return_tensors="pt")
789
-
790
- >>> # Generate
791
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
792
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
793
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
794
- ```"""
795
-
796
- output_attentions = (
797
- output_attentions
798
- if output_attentions is not None
799
- else self.config.output_attentions
800
- )
801
- output_hidden_states = (
802
- output_hidden_states
803
- if output_hidden_states is not None
804
- else self.config.output_hidden_states
805
- )
806
- return_dict = (
807
- return_dict if return_dict is not None else self.config.use_return_dict
808
- )
809
-
810
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
811
- outputs = self.model(
812
- input_ids=input_ids,
813
- attention_mask=attention_mask,
814
- position_ids=position_ids,
815
- past_key_values=past_key_values,
816
- inputs_embeds=inputs_embeds,
817
- use_cache=use_cache,
818
- output_attentions=output_attentions,
819
- output_hidden_states=output_hidden_states,
820
- return_dict=return_dict,
821
- )
822
-
823
- hidden_states = outputs[0]
824
- logits = self.lm_head(hidden_states)
825
-
826
- loss = None
827
- if labels is not None:
828
- # Shift so that tokens < n predict n
829
- shift_logits = logits[..., :-1, :].contiguous()
830
- shift_labels = labels[..., 1:].contiguous()
831
- # Flatten the tokens
832
- loss_fct = CrossEntropyLoss()
833
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
834
- shift_labels = shift_labels.view(-1)
835
- # Enable model parallelism
836
- shift_labels = shift_labels.to(shift_logits.device)
837
- loss = loss_fct(shift_logits, shift_labels)
838
-
839
- if not return_dict:
840
- output = (logits,) + outputs[1:]
841
- return (loss,) + output if loss is not None else output
842
-
843
- return CausalLMOutputWithPast(
844
- loss=loss,
845
- logits=logits,
846
- past_key_values=outputs.past_key_values,
847
- hidden_states=outputs.hidden_states,
848
- attentions=outputs.attentions,
849
- )
850
-
851
- def prepare_inputs_for_generation(
852
- self,
853
- input_ids,
854
- past_key_values=None,
855
- attention_mask=None,
856
- inputs_embeds=None,
857
- **kwargs,
858
- ):
859
- if past_key_values:
860
- input_ids = input_ids[:, -1:]
861
-
862
- position_ids = kwargs.get("position_ids", None)
863
- if attention_mask is not None and position_ids is None:
864
- # create position_ids on the fly for batch generation
865
- position_ids = attention_mask.long().cumsum(-1) - 1
866
- position_ids.masked_fill_(attention_mask == 0, 1)
867
- if past_key_values:
868
- position_ids = position_ids[:, -1].unsqueeze(-1)
869
-
870
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
871
- if inputs_embeds is not None and past_key_values is None:
872
- model_inputs = {"inputs_embeds": inputs_embeds}
873
- else:
874
- model_inputs = {"input_ids": input_ids}
875
-
876
- model_inputs.update(
877
- {
878
- "position_ids": position_ids,
879
- "past_key_values": past_key_values,
880
- "use_cache": kwargs.get("use_cache"),
881
- "attention_mask": attention_mask,
882
- }
883
- )
884
- return model_inputs
885
-
886
- @staticmethod
887
- def _reorder_cache(past_key_values, beam_idx):
888
- reordered_past = ()
889
- for layer_past in past_key_values:
890
- reordered_past += (
891
- tuple(
892
- past_state.index_select(0, beam_idx.to(past_state.device))
893
- for past_state in layer_past
894
- ),
895
- )
896
- return reordered_past
897
-
898
-
899
- @add_start_docstrings(
900
- """
901
- The Yi Model transformer with a sequence classification head on top (linear layer).
902
-
903
- [`YiForSequenceClassification`] uses the last token in order to do the classification, as other causal models
904
- (e.g. GPT-2) do.
905
-
906
- Since it does classification on the last token, it requires to know the position of the last token. If a
907
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
908
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
909
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
910
- each row of the batch).
911
- """,
912
- Yi_START_DOCSTRING,
913
- )
914
- class YiForSequenceClassification(YiPreTrainedModel):
915
- def __init__(self, config):
916
- super().__init__(config)
917
- self.num_labels = config.num_labels
918
- self.model = YiModel(config)
919
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
920
-
921
- # Initialize weights and apply final processing
922
- self.post_init()
923
-
924
- def get_input_embeddings(self):
925
- return self.model.embed_tokens
926
-
927
- def set_input_embeddings(self, value):
928
- self.model.embed_tokens = value
929
-
930
- @add_start_docstrings_to_model_forward(Yi_INPUTS_DOCSTRING)
931
- def forward(
932
- self,
933
- input_ids: torch.LongTensor = None,
934
- attention_mask: Optional[torch.Tensor] = None,
935
- position_ids: Optional[torch.LongTensor] = None,
936
- past_key_values: Optional[List[torch.FloatTensor]] = None,
937
- inputs_embeds: Optional[torch.FloatTensor] = None,
938
- labels: Optional[torch.LongTensor] = None,
939
- use_cache: Optional[bool] = None,
940
- output_attentions: Optional[bool] = None,
941
- output_hidden_states: Optional[bool] = None,
942
- return_dict: Optional[bool] = None,
943
- ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
944
- r"""
945
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
946
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
947
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
948
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
949
- """
950
- return_dict = (
951
- return_dict if return_dict is not None else self.config.use_return_dict
952
- )
953
-
954
- transformer_outputs = self.model(
955
- input_ids,
956
- attention_mask=attention_mask,
957
- position_ids=position_ids,
958
- past_key_values=past_key_values,
959
- inputs_embeds=inputs_embeds,
960
- use_cache=use_cache,
961
- output_attentions=output_attentions,
962
- output_hidden_states=output_hidden_states,
963
- return_dict=return_dict,
964
- )
965
- hidden_states = transformer_outputs[0]
966
- logits = self.score(hidden_states)
967
-
968
- if input_ids is not None:
969
- batch_size = input_ids.shape[0]
970
- else:
971
- batch_size = inputs_embeds.shape[0]
972
-
973
- if self.config.pad_token_id is None and batch_size != 1:
974
- raise ValueError(
975
- "Cannot handle batch sizes > 1 if no padding token is defined."
976
- )
977
- if self.config.pad_token_id is None:
978
- sequence_lengths = -1
979
- else:
980
- if input_ids is not None:
981
- sequence_lengths = (
982
- torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1
983
- ).to(logits.device)
984
- else:
985
- sequence_lengths = -1
986
-
987
- pooled_logits = logits[
988
- torch.arange(batch_size, device=logits.device), sequence_lengths
989
- ]
990
-
991
- loss = None
992
- if labels is not None:
993
- labels = labels.to(logits.device)
994
- if self.config.problem_type is None:
995
- if self.num_labels == 1:
996
- self.config.problem_type = "regression"
997
- elif self.num_labels > 1 and (
998
- labels.dtype == torch.long or labels.dtype == torch.int
999
- ):
1000
- self.config.problem_type = "single_label_classification"
1001
- else:
1002
- self.config.problem_type = "multi_label_classification"
1003
-
1004
- if self.config.problem_type == "regression":
1005
- loss_fct = MSELoss()
1006
- if self.num_labels == 1:
1007
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1008
- else:
1009
- loss = loss_fct(pooled_logits, labels)
1010
- elif self.config.problem_type == "single_label_classification":
1011
- loss_fct = CrossEntropyLoss()
1012
- loss = loss_fct(
1013
- pooled_logits.view(-1, self.num_labels), labels.view(-1)
1014
- )
1015
- elif self.config.problem_type == "multi_label_classification":
1016
- loss_fct = BCEWithLogitsLoss()
1017
- loss = loss_fct(pooled_logits, labels)
1018
- if not return_dict:
1019
- output = (pooled_logits,) + transformer_outputs[1:]
1020
- return ((loss,) + output) if loss is not None else output
1021
-
1022
- return SequenceClassifierOutputWithPast(
1023
- loss=loss,
1024
- logits=pooled_logits,
1025
- past_key_values=transformer_outputs.past_key_values,
1026
- hidden_states=transformer_outputs.hidden_states,
1027
- attentions=transformer_outputs.attentions,
1028
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7335422e13c29396b729da0337edd6280f0ccee4b714a59051547b0f5d802d13
3
- size 9943124771
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2901d81dd964014fe39d58f9c399df187acc53387a8e99f216706609d933b45a
3
+ size 9943125667
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:145fb5246919be8da75f5ed01b02fc45ce8dea0cba43d1e095892c98be81e1ff
3
- size 2179045205
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b49e2ff5733dd267212044d77bfed497f5fff5b1963909dcb53f1bb6963a10a8
3
+ size 2179045397
pytorch_model.bin.index.json CHANGED
@@ -5,8 +5,8 @@
5
  "weight_map": {
6
  "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
  "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
- "model.layers.0.ln1.weight": "pytorch_model-00001-of-00002.bin",
9
- "model.layers.0.ln2.weight": "pytorch_model-00001-of-00002.bin",
10
  "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
11
  "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
12
  "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -14,8 +14,8 @@
14
  "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
  "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
  "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
17
- "model.layers.1.ln1.weight": "pytorch_model-00001-of-00002.bin",
18
- "model.layers.1.ln2.weight": "pytorch_model-00001-of-00002.bin",
19
  "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
20
  "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
21
  "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -23,8 +23,8 @@
23
  "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
24
  "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
25
  "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
26
- "model.layers.10.ln1.weight": "pytorch_model-00001-of-00002.bin",
27
- "model.layers.10.ln2.weight": "pytorch_model-00001-of-00002.bin",
28
  "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
29
  "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
30
  "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -32,8 +32,8 @@
32
  "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
33
  "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
34
  "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
35
- "model.layers.11.ln1.weight": "pytorch_model-00001-of-00002.bin",
36
- "model.layers.11.ln2.weight": "pytorch_model-00001-of-00002.bin",
37
  "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
38
  "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
39
  "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -41,8 +41,8 @@
41
  "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
42
  "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
43
  "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
44
- "model.layers.12.ln1.weight": "pytorch_model-00001-of-00002.bin",
45
- "model.layers.12.ln2.weight": "pytorch_model-00001-of-00002.bin",
46
  "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
47
  "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
48
  "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -50,8 +50,8 @@
50
  "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
51
  "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
52
  "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
53
- "model.layers.13.ln1.weight": "pytorch_model-00001-of-00002.bin",
54
- "model.layers.13.ln2.weight": "pytorch_model-00001-of-00002.bin",
55
  "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
56
  "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
57
  "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -59,8 +59,8 @@
59
  "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
60
  "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
61
  "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
62
- "model.layers.14.ln1.weight": "pytorch_model-00001-of-00002.bin",
63
- "model.layers.14.ln2.weight": "pytorch_model-00001-of-00002.bin",
64
  "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
65
  "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
66
  "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -68,8 +68,8 @@
68
  "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
69
  "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
70
  "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
71
- "model.layers.15.ln1.weight": "pytorch_model-00001-of-00002.bin",
72
- "model.layers.15.ln2.weight": "pytorch_model-00001-of-00002.bin",
73
  "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
74
  "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
75
  "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -77,8 +77,8 @@
77
  "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
78
  "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
79
  "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
80
- "model.layers.16.ln1.weight": "pytorch_model-00001-of-00002.bin",
81
- "model.layers.16.ln2.weight": "pytorch_model-00001-of-00002.bin",
82
  "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
83
  "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
84
  "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -86,8 +86,8 @@
86
  "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
87
  "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
88
  "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
89
- "model.layers.17.ln1.weight": "pytorch_model-00001-of-00002.bin",
90
- "model.layers.17.ln2.weight": "pytorch_model-00001-of-00002.bin",
91
  "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
92
  "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
93
  "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -95,8 +95,8 @@
95
  "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
96
  "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
97
  "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
- "model.layers.18.ln1.weight": "pytorch_model-00001-of-00002.bin",
99
- "model.layers.18.ln2.weight": "pytorch_model-00001-of-00002.bin",
100
  "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
101
  "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
102
  "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -104,8 +104,8 @@
104
  "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
  "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
  "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
107
- "model.layers.19.ln1.weight": "pytorch_model-00001-of-00002.bin",
108
- "model.layers.19.ln2.weight": "pytorch_model-00001-of-00002.bin",
109
  "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
110
  "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
111
  "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -113,8 +113,8 @@
113
  "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
114
  "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
115
  "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
116
- "model.layers.2.ln1.weight": "pytorch_model-00001-of-00002.bin",
117
- "model.layers.2.ln2.weight": "pytorch_model-00001-of-00002.bin",
118
  "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
119
  "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
120
  "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -122,8 +122,8 @@
122
  "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
123
  "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
124
  "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
125
- "model.layers.20.ln1.weight": "pytorch_model-00001-of-00002.bin",
126
- "model.layers.20.ln2.weight": "pytorch_model-00001-of-00002.bin",
127
  "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
128
  "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
129
  "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -131,8 +131,8 @@
131
  "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
132
  "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
133
  "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
134
- "model.layers.21.ln1.weight": "pytorch_model-00001-of-00002.bin",
135
- "model.layers.21.ln2.weight": "pytorch_model-00001-of-00002.bin",
136
  "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
137
  "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
138
  "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -140,8 +140,8 @@
140
  "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
141
  "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
142
  "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
143
- "model.layers.22.ln1.weight": "pytorch_model-00001-of-00002.bin",
144
- "model.layers.22.ln2.weight": "pytorch_model-00001-of-00002.bin",
145
  "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
146
  "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
147
  "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -149,8 +149,8 @@
149
  "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
150
  "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
151
  "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
152
- "model.layers.23.ln1.weight": "pytorch_model-00001-of-00002.bin",
153
- "model.layers.23.ln2.weight": "pytorch_model-00001-of-00002.bin",
154
  "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
155
  "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
156
  "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -158,8 +158,8 @@
158
  "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
159
  "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
160
  "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
161
- "model.layers.24.ln1.weight": "pytorch_model-00001-of-00002.bin",
162
- "model.layers.24.ln2.weight": "pytorch_model-00001-of-00002.bin",
163
  "model.layers.24.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
164
  "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
165
  "model.layers.24.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -167,8 +167,8 @@
167
  "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
168
  "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
169
  "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
170
- "model.layers.25.ln1.weight": "pytorch_model-00001-of-00002.bin",
171
- "model.layers.25.ln2.weight": "pytorch_model-00001-of-00002.bin",
172
  "model.layers.25.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
173
  "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
174
  "model.layers.25.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -176,8 +176,8 @@
176
  "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
177
  "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
178
  "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
179
- "model.layers.26.ln1.weight": "pytorch_model-00001-of-00002.bin",
180
- "model.layers.26.ln2.weight": "pytorch_model-00001-of-00002.bin",
181
  "model.layers.26.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
182
  "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
183
  "model.layers.26.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -185,8 +185,8 @@
185
  "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
186
  "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
187
  "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
188
- "model.layers.27.ln1.weight": "pytorch_model-00002-of-00002.bin",
189
- "model.layers.27.ln2.weight": "pytorch_model-00002-of-00002.bin",
190
  "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
191
  "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
192
  "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
@@ -194,8 +194,8 @@
194
  "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
195
  "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
196
  "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
197
- "model.layers.28.ln1.weight": "pytorch_model-00002-of-00002.bin",
198
- "model.layers.28.ln2.weight": "pytorch_model-00002-of-00002.bin",
199
  "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
200
  "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
201
  "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
@@ -203,8 +203,8 @@
203
  "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
204
  "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
205
  "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
206
- "model.layers.29.ln1.weight": "pytorch_model-00002-of-00002.bin",
207
- "model.layers.29.ln2.weight": "pytorch_model-00002-of-00002.bin",
208
  "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
209
  "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
210
  "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
@@ -212,8 +212,8 @@
212
  "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
213
  "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
214
  "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
215
- "model.layers.3.ln1.weight": "pytorch_model-00001-of-00002.bin",
216
- "model.layers.3.ln2.weight": "pytorch_model-00001-of-00002.bin",
217
  "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
218
  "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
219
  "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -221,8 +221,8 @@
221
  "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
222
  "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
223
  "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
224
- "model.layers.30.ln1.weight": "pytorch_model-00002-of-00002.bin",
225
- "model.layers.30.ln2.weight": "pytorch_model-00002-of-00002.bin",
226
  "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
227
  "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
228
  "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
@@ -230,8 +230,8 @@
230
  "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
231
  "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
232
  "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
233
- "model.layers.31.ln1.weight": "pytorch_model-00002-of-00002.bin",
234
- "model.layers.31.ln2.weight": "pytorch_model-00002-of-00002.bin",
235
  "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
236
  "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
237
  "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
@@ -239,8 +239,8 @@
239
  "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
240
  "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
241
  "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
242
- "model.layers.4.ln1.weight": "pytorch_model-00001-of-00002.bin",
243
- "model.layers.4.ln2.weight": "pytorch_model-00001-of-00002.bin",
244
  "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
245
  "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
246
  "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -248,8 +248,8 @@
248
  "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
249
  "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
250
  "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
251
- "model.layers.5.ln1.weight": "pytorch_model-00001-of-00002.bin",
252
- "model.layers.5.ln2.weight": "pytorch_model-00001-of-00002.bin",
253
  "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
254
  "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
255
  "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -257,8 +257,8 @@
257
  "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
258
  "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
259
  "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
260
- "model.layers.6.ln1.weight": "pytorch_model-00001-of-00002.bin",
261
- "model.layers.6.ln2.weight": "pytorch_model-00001-of-00002.bin",
262
  "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
263
  "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
264
  "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -266,8 +266,8 @@
266
  "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
267
  "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
268
  "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
269
- "model.layers.7.ln1.weight": "pytorch_model-00001-of-00002.bin",
270
- "model.layers.7.ln2.weight": "pytorch_model-00001-of-00002.bin",
271
  "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
272
  "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
273
  "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -275,8 +275,8 @@
275
  "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
276
  "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
277
  "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
- "model.layers.8.ln1.weight": "pytorch_model-00001-of-00002.bin",
279
- "model.layers.8.ln2.weight": "pytorch_model-00001-of-00002.bin",
280
  "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
281
  "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
282
  "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
@@ -284,8 +284,8 @@
284
  "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
285
  "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
286
  "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
287
- "model.layers.9.ln1.weight": "pytorch_model-00001-of-00002.bin",
288
- "model.layers.9.ln2.weight": "pytorch_model-00001-of-00002.bin",
289
  "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
290
  "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
291
  "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
5
  "weight_map": {
6
  "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
  "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
10
  "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
11
  "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
12
  "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
14
  "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
15
  "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
16
  "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
19
  "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
20
  "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
21
  "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
23
  "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
24
  "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
25
  "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
28
  "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
29
  "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
30
  "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
32
  "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
33
  "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
34
  "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
37
  "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
38
  "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
39
  "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
41
  "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
42
  "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
43
  "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
46
  "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
47
  "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
48
  "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
50
  "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
51
  "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
52
  "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
55
  "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
56
  "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
57
  "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
59
  "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
60
  "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
61
  "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
64
  "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
65
  "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
66
  "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
68
  "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
69
  "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
70
  "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
73
  "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
74
  "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
75
  "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
77
  "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
78
  "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
79
  "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
82
  "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
83
  "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
84
  "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
86
  "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
87
  "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
88
  "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
91
  "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
92
  "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
93
  "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
95
  "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
96
  "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
97
  "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
100
  "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
101
  "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
102
  "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
104
  "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
105
  "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
106
  "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
109
  "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
110
  "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
111
  "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
113
  "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
114
  "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
115
  "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
118
  "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
119
  "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
120
  "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
122
  "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
123
  "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
124
  "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
127
  "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
128
  "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
129
  "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
131
  "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
132
  "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
133
  "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
136
  "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
137
  "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
138
  "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
140
  "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
141
  "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
142
  "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
143
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
144
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
145
  "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
146
  "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
147
  "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
149
  "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
150
  "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
151
  "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
153
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
154
  "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
155
  "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
156
  "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
158
  "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
159
  "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
160
  "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
161
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
162
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
163
  "model.layers.24.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
164
  "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
165
  "model.layers.24.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
167
  "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
168
  "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
169
  "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
170
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
171
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
172
  "model.layers.25.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
173
  "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
174
  "model.layers.25.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
176
  "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
177
  "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
178
  "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
179
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
180
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
181
  "model.layers.26.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
182
  "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
183
  "model.layers.26.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
185
  "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
186
  "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
187
  "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
188
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
189
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
190
  "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
191
  "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
192
  "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
 
194
  "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
195
  "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
196
  "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
197
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
198
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
199
  "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
200
  "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
201
  "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
 
203
  "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
204
  "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
205
  "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
206
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
207
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
208
  "model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
209
  "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
210
  "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
 
212
  "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
213
  "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
214
  "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
215
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
216
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
217
  "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
218
  "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
219
  "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
221
  "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
222
  "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
223
  "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
224
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
225
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
226
  "model.layers.30.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
227
  "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
228
  "model.layers.30.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
 
230
  "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
231
  "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
232
  "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
233
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
234
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
235
  "model.layers.31.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
236
  "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
237
  "model.layers.31.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
 
239
  "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
240
  "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
241
  "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
242
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
243
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
244
  "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
245
  "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
246
  "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
248
  "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
249
  "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
250
  "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
251
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
252
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
253
  "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
254
  "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
255
  "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
257
  "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
258
  "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
259
  "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
260
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
261
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
262
  "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
263
  "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
264
  "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
266
  "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
267
  "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
268
  "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
269
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
270
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
271
  "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
272
  "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
273
  "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
275
  "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
276
  "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
277
  "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
278
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
279
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
280
  "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
281
  "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
282
  "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
 
284
  "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
285
  "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
286
  "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
287
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
288
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
289
  "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
290
  "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
291
  "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
tokenization_yi.py DELETED
@@ -1,255 +0,0 @@
1
- import os
2
- from shutil import copyfile
3
- from typing import Any, Dict, List, Optional, Tuple
4
-
5
- import sentencepiece as spm
6
- from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
7
- from transformers.utils import logging
8
-
9
# Module-level logger, obtained through the transformers logging helpers.
logger = logging.get_logger(__name__)

# Maps the tokenizer's "vocab_file" argument to the file name used on disk.
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# No pretrained vocabulary or tokenizer files are registered here.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}

# No per-checkpoint maximum input sizes are registered here.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
18
-
19
-
20
class YiTokenizer(PreTrainedTokenizer):
    """
    Construct a Yi tokenizer. Tokenization is delegated to a SentencePiece
    model loaded from `vocab_file`.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file (a SentencePiece model).
        unk_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        bos_token (`str` or `AddedToken`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning-of-sequence token.
        eos_token (`str` or `AddedToken`, *optional*, defaults to `"<|endoftext|>"`):
            The end-of-sequence token.
        pad_token (`str` or `AddedToken`, *optional*, defaults to `"<unk>"`):
            The padding token.
        sp_model_kwargs (`dict`, *optional*):
            Keyword arguments forwarded to `spm.SentencePieceProcessor`.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether `build_inputs_with_special_tokens` prepends the BOS id.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether `build_inputs_with_special_tokens` appends the EOS id.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to the base class constructor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<unk>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = self._as_added_token(bos_token)
        eos_token = self._as_added_token(eos_token)
        unk_token = self._as_added_token(unk_token)
        pad_token = self._as_added_token(pad_token)
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        # The SentencePiece model is loaded before the base constructor runs,
        # so it is already available if the base class queries the vocabulary.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @staticmethod
    def _as_added_token(token):
        """Wrap a plain string in an `AddedToken` with `lstrip`/`rstrip` disabled; pass other values through."""
        if isinstance(token, str):
            return AddedToken(token, lstrip=False, rstrip=False)
        return token

    def __getstate__(self):
        # Drop the SentencePiece processor from the pickled state;
        # `__setstate__` reloads it from `vocab_file`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Look the special tokens up once instead of once per token.
        special_tokens = set(self.all_special_tokens)
        pieces = []
        pending_sub_tokens = []
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in special_tokens:
                if not prev_is_special and i != 0:
                    pieces.append(" ")
                pieces.append(self.sp_model.decode(pending_sub_tokens))
                pieces.append(token)
                prev_is_special = True
                pending_sub_tokens = []
            else:
                pending_sub_tokens.append(token)
                prev_is_special = False
        # Flush whatever ordinary tokens follow the last special token.
        pieces.append(self.sp_model.decode(pending_sub_tokens))
        return "".join(pieces)

    def save_vocabulary(
        self, save_directory, filename_prefix: Optional[str] = None
    ) -> Optional[Tuple[str]]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                Prefix joined to the vocabulary file name with a `-`.

        Returns:
            `Tuple(str)`: Paths to the files saved, or `None` (after logging an error) when
            `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return None
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        source_exists = os.path.isfile(self.vocab_file)
        same_path = os.path.abspath(self.vocab_file) == os.path.abspath(out_vocab_file)
        if source_exists and not same_path:
            copyfile(self.vocab_file, out_vocab_file)
        elif not source_exists:
            # The original model file is gone; write out the in-memory model instead.
            with open(out_vocab_file, "wb") as fi:
                fi.write(self.sp_model.serialized_model_proto())

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Surround each sequence with the BOS/EOS ids enabled by `add_bos_token` / `add_eos_token`.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: The ids of the first sequence and, when given, the second one, each
            with the enabled special ids added around it.
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # These are mask values (1 marks a special position), not token ids.
        bos_mask = [1] if self.add_bos_token else []
        eos_mask = [1] if self.add_eos_token else []

        mask = bos_mask + ([0] * len(token_ids_0)) + eos_mask
        if token_ids_1 is None:
            return mask
        return mask + bos_mask + ([0] * len(token_ids_1)) + eos_mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates the token type ids for a sequence or a pair of sequences.

        Every position of the first sequence, including the BOS/EOS positions enabled by
        `add_bos_token` / `add_eos_token`, gets type id 0. Every position of the second
        sequence, with the same special positions, gets type id 1.

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Number of special positions added around each sequence (0, 1 or 2).
        special_count = int(bool(self.add_bos_token)) + int(bool(self.add_eos_token))

        output = [0] * (len(token_ids_0) + special_count)

        if token_ids_1 is not None:
            output += [1] * (len(token_ids_1) + special_count)

        return output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -1,9 +1,13 @@
1
  {
2
- "auto_map": {
3
- "AutoTokenizer": ["tokenization_yi.YiTokenizer", null]
4
- },
5
  "add_bos_token": false,
6
  "add_eos_token": false,
7
  "model_max_length": 4096,
8
- "tokenizer_class": "YiTokenizer"
 
 
 
 
 
 
 
9
  }
 
1
  {
 
 
 
2
  "add_bos_token": false,
3
  "add_eos_token": false,
4
  "model_max_length": 4096,
5
+ "unk_token": "<unk>",
6
+ "bos_token": "<|startoftext|>",
7
+ "eos_token": "<|endoftext|>",
8
+ "pad_token": "<unk>",
9
+ "sp_model_kwargs": {},
10
+ "clean_up_tokenization_spaces": false,
11
+ "legacy": true,
12
+ "tokenizer_class": "LlamaTokenizer"
13
  }