sam-mosaic
committed on
Commit
·
a59f066
1
Parent(s):
7756256
Upload folder using huggingface_hub
Browse files
- attention.py +1 -1
- config.json +3 -3
- modeling_mpt.py +6 -1
- pytorch_model-00001-of-00007.bin +1 -1
- pytorch_model-00002-of-00007.bin +1 -1
- pytorch_model-00003-of-00007.bin +1 -1
- pytorch_model-00004-of-00007.bin +1 -1
- pytorch_model-00005-of-00007.bin +1 -1
- pytorch_model-00006-of-00007.bin +1 -1
- pytorch_model-00007-of-00007.bin +1 -1
- tokenizer.json +8 -1
- tokenizer_config.json +1 -1
attention.py
CHANGED
@@ -55,7 +55,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
|
|
55 |
attn_weight = torch.softmax(attn_weight, dim=-1)
|
56 |
if dropout_p:
|
57 |
attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
|
58 |
-
out = attn_weight.matmul(v)
|
59 |
out = rearrange(out, 'b h s d -> b s (h d)')
|
60 |
if needs_weights:
|
61 |
return (out, attn_weight, past_key_value)
|
|
|
55 |
attn_weight = torch.softmax(attn_weight, dim=-1)
|
56 |
if dropout_p:
|
57 |
attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
|
58 |
+
out = attn_weight.to(v.dtype).matmul(v)
|
59 |
out = rearrange(out, 'b h s d -> b s (h d)')
|
60 |
if needs_weights:
|
61 |
return (out, attn_weight, past_key_value)
|
config.json
CHANGED
@@ -8,8 +8,8 @@
|
|
8 |
"attn_impl": "torch",
|
9 |
"attn_pdrop": 0,
|
10 |
"attn_type": "multihead_attention",
|
11 |
-
"attn_uses_sequence_id":
|
12 |
-
"clip_qkv":
|
13 |
"prefix_lm": false,
|
14 |
"qk_ln": false,
|
15 |
"softmax_scale": null
|
@@ -36,7 +36,7 @@
|
|
36 |
"init_device": "cpu",
|
37 |
"learned_pos_emb": true,
|
38 |
"logit_scale": null,
|
39 |
-
"max_seq_len":
|
40 |
"model_type": "mpt",
|
41 |
"n_heads": 64,
|
42 |
"n_layers": 48,
|
|
|
8 |
"attn_impl": "torch",
|
9 |
"attn_pdrop": 0,
|
10 |
"attn_type": "multihead_attention",
|
11 |
+
"attn_uses_sequence_id": false,
|
12 |
+
"clip_qkv": null,
|
13 |
"prefix_lm": false,
|
14 |
"qk_ln": false,
|
15 |
"softmax_scale": null
|
|
|
36 |
"init_device": "cpu",
|
37 |
"learned_pos_emb": true,
|
38 |
"logit_scale": null,
|
39 |
+
"max_seq_len": 8192,
|
40 |
"model_type": "mpt",
|
41 |
"n_heads": 64,
|
42 |
"n_layers": 48,
|
modeling_mpt.py
CHANGED
@@ -40,6 +40,11 @@ class MPTModel(MPTPreTrainedModel):
|
|
40 |
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
|
41 |
self.alibi = config.attn_config['alibi']
|
42 |
self.alibi_bias_max = config.attn_config['alibi_bias_max']
|
|
|
|
|
|
|
|
|
|
|
43 |
if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
|
44 |
norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
|
45 |
raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
|
@@ -182,7 +187,7 @@ class MPTModel(MPTPreTrainedModel):
|
|
182 |
x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
|
183 |
assert isinstance(self.emb_drop, nn.Module)
|
184 |
x = self.emb_drop(x_shrunk)
|
185 |
-
(attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=
|
186 |
if use_cache and past_key_values is None:
|
187 |
past_key_values = [() for _ in range(self.config.n_layers)]
|
188 |
all_hidden_states = () if output_hidden_states else None
|
|
|
40 |
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
|
41 |
self.alibi = config.attn_config['alibi']
|
42 |
self.alibi_bias_max = config.attn_config['alibi_bias_max']
|
43 |
+
if config.init_device == 'mixed':
|
44 |
+
if dist.get_local_rank() == 0:
|
45 |
+
config.init_device = 'cpu'
|
46 |
+
else:
|
47 |
+
config.init_device = 'meta'
|
48 |
if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
|
49 |
norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
|
50 |
raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
|
|
|
187 |
x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
|
188 |
assert isinstance(self.emb_drop, nn.Module)
|
189 |
x = self.emb_drop(x_shrunk)
|
190 |
+
(attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
|
191 |
if use_cache and past_key_values is None:
|
192 |
past_key_values = [() for _ in range(self.config.n_layers)]
|
193 |
all_hidden_states = () if output_hidden_states else None
|
pytorch_model-00001-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9766157965
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb8229f99d31f14643324ae69c945bfc5c2548dee813ef23b7260b8d22ff3d82
|
3 |
size 9766157965
|
pytorch_model-00002-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9865248775
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4612b6e2b09839df9f3babfb1d00a49ff6eef3a8e6126a1e37732f59b9539c7
|
3 |
size 9865248775
|
pytorch_model-00003-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9865248775
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abf88070285848375c85b1fc5328b8b10485286b7c453a7a3e66e36f1be19aa8
|
3 |
size 9865248775
|
pytorch_model-00004-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9865248775
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:028048cf61661c4c9173918053dad5aab752767f63d3e11133aee4f2c0431fbd
|
3 |
size 9865248775
|
pytorch_model-00005-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9865248775
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5990a41d6a5a2418977a75622b00106043b68786cf85f221361b3711522290e8
|
3 |
size 9865248775
|
pytorch_model-00006-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 9865248775
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c7f090b3e9c31fef6b85569d537c86f1648bc61631e7d1fa83d498124bb7e752
|
3 |
size 9865248775
|
pytorch_model-00007-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 822099468
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1ab59fada51f526990510c5601e82d4f606214cfad59ebf5a333f6d12d63f1f
|
3 |
size 822099468
|
tokenizer.json
CHANGED
@@ -1,7 +1,14 @@
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
-
"padding":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
"added_tokens": [
|
6 |
{
|
7 |
"id": 0,
|
|
|
1 |
{
|
2 |
"version": "1.0",
|
3 |
"truncation": null,
|
4 |
+
"padding": {
|
5 |
+
"strategy": "BatchLongest",
|
6 |
+
"direction": "Left",
|
7 |
+
"pad_to_multiple_of": null,
|
8 |
+
"pad_id": 0,
|
9 |
+
"pad_type_id": 0,
|
10 |
+
"pad_token": "<|endoftext|>"
|
11 |
+
},
|
12 |
"added_tokens": [
|
13 |
{
|
14 |
"id": 0,
|
tokenizer_config.json
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
"bos_token": "<|endoftext|>",
|
4 |
"clean_up_tokenization_spaces": true,
|
5 |
"eos_token": "<|endoftext|>",
|
6 |
-
"model_max_length":
|
7 |
"tokenizer_class": "GPTNeoXTokenizer",
|
8 |
"unk_token": "<|endoftext|>"
|
9 |
}
|
|
|
3 |
"bos_token": "<|endoftext|>",
|
4 |
"clean_up_tokenization_spaces": true,
|
5 |
"eos_token": "<|endoftext|>",
|
6 |
+
"model_max_length": 8192,
|
7 |
"tokenizer_class": "GPTNeoXTokenizer",
|
8 |
"unk_token": "<|endoftext|>"
|
9 |
}
|