Upload folder using huggingface_hub
- added_tokens.json +3 -0
- config.json +134 -0
- configuration_hymba.py +116 -0
- generation_config.json +8 -0
- model.safetensors +3 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +52 -0
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "[PAD]": 32000
+}
config.json
ADDED
@@ -0,0 +1,134 @@
+{
+  "_name_or_path": "nvidia/Hymba-1.5B-Instruct",
+  "architectures": [
+    "HymbaForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_hidden_size": -1,
+  "attn_implementation": "flex",
+  "attn_implementation_new": "flex",
+  "auto_map": {
+    "AutoConfig": "nvidia/Hymba-1.5B-Instruct--configuration_hymba.HymbaConfig",
+    "AutoModelForCausalLM": "nvidia/Hymba-1.5B-Instruct--modeling_hymba.HymbaForCausalLM"
+  },
+  "bos_token_id": 1,
+  "calc_logits_for_entire_prompt": false,
+  "conv_dim": {
+    "0": 32,
+    "1": 32,
+    "2": 32
+  },
+  "eos_token_id": 2,
+  "global_attn_idx": [
+    0,
+    15,
+    31
+  ],
+  "hidden_act": "silu",
+  "hidden_size": 16,
+  "initializer_range": 0.02,
+  "intermediate_size": 32,
+  "kq_head_dim": -1,
+  "kq_norm": "none",
+  "kv_reuse_every_i_layer": -1,
+  "kv_reuse_group": [
+    [
+      1,
+      2
+    ],
+    [
+      3,
+      4
+    ],
+    [
+      5,
+      6
+    ],
+    [
+      7,
+      8
+    ],
+    [
+      9,
+      10
+    ],
+    [
+      11,
+      12
+    ],
+    [
+      13,
+      14
+    ],
+    [
+      16,
+      17,
+      18
+    ],
+    [
+      19,
+      20
+    ],
+    [
+      21,
+      22
+    ],
+    [
+      23,
+      24
+    ],
+    [
+      25,
+      26
+    ],
+    [
+      27,
+      28
+    ],
+    [
+      29,
+      30
+    ]
+  ],
+  "kv_weight_reuse": false,
+  "layer_type": [
+    "h",
+    "h",
+    "h"
+  ],
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_state": 16,
+  "mamba_dt_rank": 100,
+  "mamba_expand": 2,
+  "mamba_inner_layernorms": true,
+  "mamba_proj_bias": false,
+  "max_position_embeddings": 8192,
+  "memory_tokens_interspersed_every": 0,
+  "mlp_hidden_act": "silu",
+  "model_type": "hymba",
+  "num_attention_heads": 2,
+  "num_experts": 1,
+  "num_experts_per_tok": 1,
+  "num_hidden_layers": 3,
+  "num_key_value_heads": 1,
+  "num_mamba": 1,
+  "num_memory_tokens": 128,
+  "orig_max_position_embeddings": 2048,
+  "output_router_logits": false,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-06,
+  "rope": true,
+  "rope_theta": 10000.0,
+  "rope_type": "ntk",
+  "router_aux_loss_coef": 0.001,
+  "seq_length": 8192,
+  "sliding_window": 1024,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": false,
+  "use_mamba_kernels": true,
+  "v_head_dim": 16,
+  "vocab_size": 32001
+}
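
The `auto_map` entries above resolve to the custom `configuration_hymba` / `modeling_hymba` classes hosted with the original nvidia/Hymba-1.5B-Instruct repo, so loading this checkpoint needs `trust_remote_code=True`. A minimal loading sketch, with `<repo_id>` as a hypothetical placeholder for this upload's path or Hub id (the small dimensions here, e.g. `hidden_size: 16` and `num_hidden_layers: 3`, suggest a reduced test-size checkpoint rather than the full 1.5B model):

```python
# Minimal sketch: load this checkpoint via the Transformers auto classes.
# <repo_id> is a hypothetical placeholder for this repo's path or Hub id.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "<repo_id>"

# trust_remote_code is required because auto_map points at the custom
# HymbaConfig / HymbaForCausalLM classes rather than built-in ones.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # hymba 16 3

model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
```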
configuration_hymba.py
ADDED
@@ -0,0 +1,116 @@
+import math
+from transformers.configuration_utils import PretrainedConfig
+
+
+class HymbaConfig(PretrainedConfig):
+
+    model_type = "hymba"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=65536,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        calc_logits_for_entire_prompt=False,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=262144,
+        orig_max_position_embeddings=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_experts=16,
+        use_mamba_kernels=True,
+        mamba_d_state=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_dt_rank="auto",
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_inner_layernorms=True,
+        kv_reuse_every_i_layer=-1,
+        kv_reuse_group=None,
+        kv_weight_reuse=False,
+        global_attn_idx=None,
+        num_mamba=1,
+        attn_implementation_new='sdpa',
+        rope_type=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.orig_max_position_embeddings = orig_max_position_embeddings
+        self.attention_dropout = attention_dropout
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.calc_logits_for_entire_prompt = calc_logits_for_entire_prompt
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.mamba_inner_layernorms = mamba_inner_layernorms
+
+        self.attn_hidden_size = kwargs.pop("attn_hidden_size", -1)
+        self.kq_head_dim = kwargs.pop("kq_head_dim", -1)
+        self.v_head_dim = kwargs.pop("v_head_dim", -1)
+        self.kq_norm = kwargs.pop("kq_norm", None)
+        self.rope = kwargs.pop("rope", False)
+        self.rope_theta = kwargs.pop("rope_theta", 10000.0)
+        self.num_memory_tokens = kwargs.pop("num_memory_tokens", 0)
+        self.memory_tokens_interspersed_every = kwargs.pop("memory_tokens_interspersed_every", 0)
+
+        self.kv_reuse_every_i_layer = kv_reuse_every_i_layer
+        self.kv_reuse_group = kv_reuse_group
+        self.kv_weight_reuse = kv_weight_reuse
+
+        self.global_attn_idx = global_attn_idx
+
+        self.num_mamba = num_mamba
+
+        self.attn_implementation_new = attn_implementation_new
+
+        self.rope_type = rope_type
+
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
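
A small usage sketch for the class above, assuming `configuration_hymba.py` is importable from the working directory; it only exercises behavior visible in the code, such as `mamba_dt_rank="auto"` resolving to `ceil(hidden_size / 16)`:

```python
# Minimal sketch: instantiate HymbaConfig directly and inspect derived fields.
from configuration_hymba import HymbaConfig

cfg = HymbaConfig(hidden_size=16, num_hidden_layers=3, vocab_size=32001)

# mamba_dt_rank defaults to "auto", which the constructor resolves to
# math.ceil(hidden_size / 16) -> 1 for hidden_size=16.
print(cfg.mamba_dt_rank)

# PretrainedConfig provides JSON round-tripping, producing a file shaped
# like the config.json above.
print(cfg.to_json_string())
```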
generation_config.json
ADDED
@@ -0,0 +1,8 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.46.3",
+  "use_cache": false
+}
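
These defaults surface through `GenerationConfig`; a minimal sketch, again using a hypothetical `<repo_id>` placeholder:

```python
# Minimal sketch: read the generation defaults shipped with the checkpoint.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("<repo_id>")  # hypothetical repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id)  # 1 2 0
```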
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0790172bf25b656f6f7638ef68d946d7da103930c637604908da09830de8e112
+size 1109368
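
The three lines above are only a git-lfs pointer; the weights themselves live in the LFS blob (about 1.1 MB here). A minimal sketch for inspecting the stored tensors once the blob has been pulled, using the `safetensors` package:

```python
# Minimal sketch: list tensor names and shapes stored in model.safetensors.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        print(name, f.get_slice(name).get_shape())
```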
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
tokenizer_config.json
ADDED
@@ -0,0 +1,52 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{{'<extra_id_0>System'}}{% for message in messages %}{% if message['role'] == 'system' %}{{'\n' + message['content'].strip()}}{% if tools or contexts %}{{'\n'}}{% endif %}{% endif %}{% endfor %}{% if tools %}{% for tool in tools %}{{ '\n<tool> ' + tool|tojson + ' </tool>' }}{% endfor %}{% endif %}{% if contexts %}{% if tools %}{{'\n'}}{% endif %}{% for context in contexts %}{{ '\n<context> ' + context.strip() + ' </context>' }}{% endfor %}{% endif %}{{'\n\n'}}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<extra_id_1>User\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<extra_id_1>Assistant\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'tool' %}{{ '<extra_id_1>Tool\n' + message['content'].strip() + '\n' }}{% endif %}{% endfor %}{%- if add_generation_prompt %}{{'<extra_id_1>Assistant\n'}}{%- endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
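
A minimal sketch of the tokenizer and its chat template in action, with `<repo_id>` again a hypothetical placeholder; the template renders `<extra_id_0>System` / `<extra_id_1>User` / `<extra_id_1>Assistant` turns, and `[PAD]` resolves to id 32000 as declared in added_tokens.json:

```python
# Minimal sketch: load the tokenizer and render the chat template.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("<repo_id>")  # hypothetical repo id

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# add_generation_prompt=True appends the trailing "<extra_id_1>Assistant\n" turn.
text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)

# The appended pad token from added_tokens.json.
print(tok.convert_tokens_to_ids("[PAD]"))  # 32000
```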