Commit 5dbd012 (verified) by yujiepan · 1 parent: da4621b

Upload folder using huggingface_hub
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[PAD]": 32000
+ }
config.json ADDED
@@ -0,0 +1,134 @@
+ {
+   "_name_or_path": "nvidia/Hymba-1.5B-Instruct",
+   "architectures": [
+     "HymbaForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "attn_hidden_size": -1,
+   "attn_implementation": "flex",
+   "attn_implementation_new": "flex",
+   "auto_map": {
+     "AutoConfig": "nvidia/Hymba-1.5B-Instruct--configuration_hymba.HymbaConfig",
+     "AutoModelForCausalLM": "nvidia/Hymba-1.5B-Instruct--modeling_hymba.HymbaForCausalLM"
+   },
+   "bos_token_id": 1,
+   "calc_logits_for_entire_prompt": false,
+   "conv_dim": {
+     "0": 32,
+     "1": 32,
+     "2": 32
+   },
+   "eos_token_id": 2,
+   "global_attn_idx": [
+     0,
+     15,
+     31
+   ],
+   "hidden_act": "silu",
+   "hidden_size": 16,
+   "initializer_range": 0.02,
+   "intermediate_size": 32,
+   "kq_head_dim": -1,
+   "kq_norm": "none",
+   "kv_reuse_every_i_layer": -1,
+   "kv_reuse_group": [
+     [
+       1,
+       2
+     ],
+     [
+       3,
+       4
+     ],
+     [
+       5,
+       6
+     ],
+     [
+       7,
+       8
+     ],
+     [
+       9,
+       10
+     ],
+     [
+       11,
+       12
+     ],
+     [
+       13,
+       14
+     ],
+     [
+       16,
+       17,
+       18
+     ],
+     [
+       19,
+       20
+     ],
+     [
+       21,
+       22
+     ],
+     [
+       23,
+       24
+     ],
+     [
+       25,
+       26
+     ],
+     [
+       27,
+       28
+     ],
+     [
+       29,
+       30
+     ]
+   ],
+   "kv_weight_reuse": false,
+   "layer_type": [
+     "h",
+     "h",
+     "h"
+   ],
+   "mamba_conv_bias": true,
+   "mamba_d_conv": 4,
+   "mamba_d_state": 16,
+   "mamba_dt_rank": 100,
+   "mamba_expand": 2,
+   "mamba_inner_layernorms": true,
+   "mamba_proj_bias": false,
+   "max_position_embeddings": 8192,
+   "memory_tokens_interspersed_every": 0,
+   "mlp_hidden_act": "silu",
+   "model_type": "hymba",
+   "num_attention_heads": 2,
+   "num_experts": 1,
+   "num_experts_per_tok": 1,
+   "num_hidden_layers": 3,
+   "num_key_value_heads": 1,
+   "num_mamba": 1,
+   "num_memory_tokens": 128,
+   "orig_max_position_embeddings": 2048,
+   "output_router_logits": false,
+   "pad_token_id": 0,
+   "rms_norm_eps": 1e-06,
+   "rope": true,
+   "rope_theta": 10000.0,
+   "rope_type": "ntk",
+   "router_aux_loss_coef": 0.001,
+   "seq_length": 8192,
+   "sliding_window": 1024,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.3",
+   "use_cache": false,
+   "use_mamba_kernels": true,
+   "v_head_dim": 16,
+   "vocab_size": 32001
+ }
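
This config shrinks the upstream nvidia/Hymba-1.5B-Instruct architecture down to a tiny 3-layer, hidden_size-16 test variant and routes AutoConfig/AutoModelForCausalLM through the upstream repo's remote code via auto_map. A minimal loading sketch, assuming the checkpoint is published on the Hub (the repo id below is a placeholder, trust_remote_code=True is required for the custom Hymba classes, and the upstream modeling code may pull in its own dependencies such as Mamba kernels):

import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "<this-repo-id>"  # placeholder; substitute the actual Hub repo name

# The custom HymbaConfig / HymbaForCausalLM classes are fetched via auto_map.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 16 3 2

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
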
configuration_hymba.py ADDED
@@ -0,0 +1,116 @@
+ import math
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class HymbaConfig(PretrainedConfig):
+
+     model_type = "hymba"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=65536,
+         tie_word_embeddings=False,
+         hidden_size=4096,
+         intermediate_size=14336,
+         num_hidden_layers=32,
+         num_attention_heads=32,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         calc_logits_for_entire_prompt=False,
+         output_router_logits=False,
+         router_aux_loss_coef=0.001,
+         pad_token_id=0,
+         bos_token_id=1,
+         eos_token_id=2,
+         sliding_window=None,
+         max_position_embeddings=262144,
+         orig_max_position_embeddings=None,
+         attention_dropout=0.0,
+         num_experts_per_tok=2,
+         num_experts=16,
+         use_mamba_kernels=True,
+         mamba_d_state=16,
+         mamba_d_conv=4,
+         mamba_expand=2,
+         mamba_dt_rank="auto",
+         mamba_conv_bias=True,
+         mamba_proj_bias=False,
+         mamba_inner_layernorms=True,
+         kv_reuse_every_i_layer=-1,
+         kv_reuse_group=None,
+         kv_weight_reuse=False,
+         global_attn_idx=None,
+         num_mamba=1,
+         attn_implementation_new='sdpa',
+         rope_type=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.tie_word_embeddings = tie_word_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.sliding_window = sliding_window
+         self.max_position_embeddings = max_position_embeddings
+         self.orig_max_position_embeddings = orig_max_position_embeddings
+         self.attention_dropout = attention_dropout
+
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+
+         self.use_cache = use_cache
+         self.calc_logits_for_entire_prompt = calc_logits_for_entire_prompt
+         self.output_router_logits = output_router_logits
+         self.router_aux_loss_coef = router_aux_loss_coef
+
+         self.num_experts_per_tok = num_experts_per_tok
+         self.num_experts = num_experts
+
+         self.use_mamba_kernels = use_mamba_kernels
+         self.mamba_d_state = mamba_d_state
+         self.mamba_d_conv = mamba_d_conv
+         self.mamba_expand = mamba_expand
+         self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+         self.mamba_conv_bias = mamba_conv_bias
+         self.mamba_proj_bias = mamba_proj_bias
+         self.mamba_inner_layernorms = mamba_inner_layernorms
+
+         self.attn_hidden_size = kwargs.pop("attn_hidden_size", -1)
+         self.kq_head_dim = kwargs.pop("kq_head_dim", -1)
+         self.v_head_dim = kwargs.pop("v_head_dim", -1)
+         self.kq_norm = kwargs.pop("kq_norm", None)
+         self.rope = kwargs.pop("rope", False)
+         self.rope_theta = kwargs.pop("rope_theta", 10000.0)
+         self.num_memory_tokens = kwargs.pop("num_memory_tokens", 0)
+         self.memory_tokens_interspersed_every = kwargs.pop("memory_tokens_interspersed_every", 0)
+
+         self.kv_reuse_every_i_layer = kv_reuse_every_i_layer
+         self.kv_reuse_group = kv_reuse_group
+         self.kv_weight_reuse = kv_weight_reuse
+
+         self.global_attn_idx = global_attn_idx
+
+         self.num_mamba = num_mamba
+
+         self.attn_implementation_new = attn_implementation_new
+
+         self.rope_type = rope_type
+
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
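
Note that mamba_dt_rank="auto" is resolved to ceil(hidden_size / 16) at construction time, and several attention-related fields (attn_hidden_size, kq_head_dim, v_head_dim, kq_norm, rope, rope_theta, memory-token settings) are consumed from **kwargs rather than named parameters. A standalone sketch, assuming configuration_hymba.py is importable from the working directory:

from configuration_hymba import HymbaConfig

# Tiny dimensions modeled on config.json above.
cfg = HymbaConfig(
    vocab_size=32001,
    hidden_size=16,
    intermediate_size=32,
    num_hidden_layers=3,
    num_attention_heads=2,
    num_key_value_heads=1,
    mamba_dt_rank="auto",  # resolved to ceil(16 / 16) = 1 in __init__
    v_head_dim=16,         # picked up via kwargs.pop in __init__
    rope=True,             # likewise taken from **kwargs
)
print(cfg.mamba_dt_rank)   # 1
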
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 0,
+   "transformers_version": "4.46.3",
+   "use_cache": false
+ }
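
These generation defaults were exported from the model config (_from_model_config). A sketch of reading them back, again with a placeholder repo id:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("<this-repo-id>")  # placeholder repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 1 2 0
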
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0790172bf25b656f6f7638ef68d946d7da103930c637604908da09830de8e112
+ size 1109368
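
The entry above is a Git LFS pointer, not the weights themselves; the actual safetensors payload is about 1.1 MB. Once the file has been fetched (e.g. via git lfs pull or huggingface_hub), the stored tensors can be listed with the safetensors library; a small sketch:

from safetensors import safe_open

# Iterate over the tensor names and shapes stored in the checkpoint.
with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, tuple(f.get_tensor(name).shape))
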
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32000": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "chat_template": "{{'<extra_id_0>System'}}{% for message in messages %}{% if message['role'] == 'system' %}{{'\n' + message['content'].strip()}}{% if tools or contexts %}{{'\n'}}{% endif %}{% endif %}{% endfor %}{% if tools %}{% for tool in tools %}{{ '\n<tool> ' + tool|tojson + ' </tool>' }}{% endfor %}{% endif %}{% if contexts %}{% if tools %}{{'\n'}}{% endif %}{% for context in contexts %}{{ '\n<context> ' + context.strip() + ' </context>' }}{% endfor %}{% endif %}{{'\n\n'}}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<extra_id_1>User\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<extra_id_1>Assistant\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'tool' %}{{ '<extra_id_1>Tool\n' + message['content'].strip() + '\n' }}{% endif %}{% endfor %}{%- if add_generation_prompt %}{{'<extra_id_1>Assistant\n'}}{%- endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "padding_side": "left",
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
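
The chat_template above wraps conversations in NeMo-style <extra_id_0>System / <extra_id_1>User / <extra_id_1>Assistant markers, with optional <tool> and <context> blocks. A rendering sketch, again with a placeholder repo id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("<this-repo-id>")  # placeholder repo id
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Expected shape of the rendered prompt:
# <extra_id_0>System
# You are a helpful assistant.
#
# <extra_id_1>User
# Hello!
# <extra_id_1>Assistant
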