Inc44 committed on
Commit
5660c1c
·
verified ·
1 Parent(s): abb173f

Training in progress, step 200

Browse files
README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ model-index:
5
+ - name: myBit-Llama2-jp-127M-3
6
+ results: []
7
+ ---
8
+
9
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
10
+ should probably proofread and complete it, then remove this comment. -->
11
+
12
+ # myBit-Llama2-jp-127M-3
13
+
14
+ This model is a fine-tuned version of an unspecified base model on an unknown dataset.
15
+ It achieves the following results on the evaluation set:
16
+ - Loss: 3.8396
17
+
18
+ ## Model description
19
+
20
+ More information needed
21
+
22
+ ## Intended uses & limitations
23
+
24
+ More information needed
25
+
26
+ ## Training and evaluation data
27
+
28
+ More information needed
29
+
30
+ ## Training procedure
31
+
32
+ ### Training hyperparameters
33
+
34
+ The following hyperparameters were used during training:
35
+ - learning_rate: 0.00024
36
+ - train_batch_size: 96
37
+ - eval_batch_size: 96
38
+ - seed: 42
39
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
40
+ - lr_scheduler_type: polynomial
41
+ - lr_scheduler_warmup_steps: 5000
42
+ - num_epochs: 1
43
+
44
+ ### Training results
45
+
46
+ | Training Loss | Epoch | Step | Validation Loss |
47
+ |:-------------:|:-----:|:-----:|:---------------:|
48
+ | 6.9048 | 0.05 | 2000 | 4.7602 |
49
+ | 4.4421 | 0.1 | 4000 | 4.2117 |
50
+ | 4.0625 | 0.15 | 6000 | 3.9227 |
51
+ | 3.807 | 0.2 | 8000 | 3.7181 |
52
+ | 3.6547 | 0.25 | 10000 | 3.5929 |
53
+ | 3.5296 | 0.29 | 12000 | 3.4812 |
54
+ | 3.4492 | 0.34 | 14000 | 3.4236 |
55
+ | 3.4065 | 0.39 | 16000 | 3.3923 |
56
+ | 3.3816 | 0.44 | 18000 | 3.3778 |
57
+ | 3.3815 | 0.49 | 20000 | 3.3907 |
58
+ | 3.431 | 0.54 | 22000 | 3.4870 |
59
+ | 3.5507 | 0.59 | 24000 | 3.5969 |
60
+ | 3.6557 | 0.64 | 26000 | 3.6918 |
61
+ | 3.715 | 0.69 | 28000 | 3.7377 |
62
+ | 3.7646 | 0.74 | 30000 | 3.7620 |
63
+ | 3.8005 | 0.79 | 32000 | 3.8221 |
64
+ | 3.8288 | 0.83 | 34000 | 3.8550 |
65
+ | 3.8552 | 0.88 | 36000 | 3.8449 |
66
+ | 3.8591 | 0.93 | 38000 | 3.8483 |
67
+ | 3.8452 | 0.98 | 40000 | 3.8396 |
68
+
69
+
70
+ ### Framework versions
71
+
72
+ - Transformers 4.38.2
73
+ - Pytorch 2.2.1+cu121
74
+ - Datasets 2.18.0
75
+ - Tokenizers 0.15.2
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BitLlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "modeling_bit_llama.BitLlamaConfig",
9
+ "AutoModelForCausalLM": "modeling_bit_llama.BitLlamaForCausalLM"
10
+ },
11
+ "bits": 8,
12
+ "bos_token_id": 1,
13
+ "eos_token_id": 2,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 768,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 1536,
18
+ "max_position_embeddings": 1024,
19
+ "model_type": "bit_llama",
20
+ "n_ctx": 128,
21
+ "num_attention_heads": 12,
22
+ "num_hidden_layers": 12,
23
+ "num_key_value_heads": 4,
24
+ "pretraining_tp": 1,
25
+ "rms_norm_eps": 1e-05,
26
+ "rope_scaling": null,
27
+ "rope_theta": 10000.0,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "float32",
30
+ "transformers_version": "4.39.1",
31
+ "use_cache": true,
32
+ "vocab_size": 43176
33
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.38.2"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8751cbbbd70b51c14908672987f94e710682d7fdb3fbb4e6a083ce7e0f6b0989
3
+ size 510960712
modeling_bit_llama.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from transformers.models.llama.modeling_llama import (
3
+ LlamaConfig,
4
+ LlamaModel,
5
+ LlamaForCausalLM,
6
+ LlamaAttention,
7
+ LlamaFlashAttention2,
8
+ LlamaSdpaAttention,
9
+ LlamaMLP,
10
+ LlamaDecoderLayer,
11
+ )
12
+ from mybitnet.bitnet import BitLinear
13
+ from torch import nn
14
+
15
class BitLlamaConfig(LlamaConfig):
    """Llama configuration extended with a ``bits`` setting.

    ``bits`` is stored on the config and handed to every ``BitLinear`` layer
    built by the BitLlama modules in this file.

    Args:
        bits: Value forwarded as ``bits=`` to ``BitLinear``. Defaults to 8.
        **kwargs: Passed through unchanged to ``LlamaConfig``.
    """

    model_type = "bit_llama"

    def __init__(self, bits=8, **kwargs):
        # Let the parent consume every standard Llama option first.
        super().__init__(**kwargs)
        self.bits = bits
21
+
22
class BitLlamaMLP(LlamaMLP):
    """Llama MLP whose gate/up/down projections are ``BitLinear`` layers.

    The parent constructor runs first; the three projections it created are
    then replaced. Only ``down_proj`` is built with
    ``flg_before_linear=False``.
    """

    def __init__(self, config):
        super().__init__(config)
        # Keyword arguments common to all three projections.
        shared = {"bias": False, "bits": config.bits}
        model_dim = self.hidden_size
        ffn_dim = self.intermediate_size
        self.gate_proj = BitLinear(model_dim, ffn_dim, flg_before_linear=True, **shared)
        self.up_proj = BitLinear(model_dim, ffn_dim, flg_before_linear=True, **shared)
        self.down_proj = BitLinear(ffn_dim, model_dim, flg_before_linear=False, **shared)
28
+
29
class BitLlamaAttention(LlamaAttention):
    """Eager Llama attention whose q/k/v/o projections are ``BitLinear`` layers.

    Args:
        config: Model configuration; ``config.bits`` is forwarded to every
            ``BitLinear`` projection.
        layer_idx: Index of the decoder layer owning this module. It is passed
            on to ``LlamaAttention``, matching the flash-attention and SDPA
            variants in this file.
    """

    def __init__(self, config: BitLlamaConfig, layer_idx: Optional[int] = None):
        # Forward layer_idx to the parent. The previous call passed only
        # `config`, so the parent always fell back to its default layer_idx
        # regardless of what the decoder layer supplied.
        super().__init__(config, layer_idx)
        self.q_proj = BitLinear(self.hidden_size, self.num_heads * self.head_dim, bias=False, bits=config.bits, flg_before_linear=True)
        self.k_proj = BitLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False, bits=config.bits, flg_before_linear=True)
        self.v_proj = BitLinear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False, bits=config.bits, flg_before_linear=True)
        self.o_proj = BitLinear(self.hidden_size, self.hidden_size, bias=False, bits=config.bits, flg_before_linear=True)
36
+
37
class BitLlamaFlashAttention2(LlamaFlashAttention2):
    """Flash-attention-2 Llama attention with ``BitLinear`` q/k/v/o projections.

    Args:
        config: Model configuration; ``config.bits`` is forwarded to each
            ``BitLinear`` projection.
        layer_idx: Decoder layer index, passed through to the parent class.
    """

    def __init__(self, config: BitLlamaConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        # Settings identical for all four projections.
        bit_kwargs = {"bias": False, "bits": config.bits, "flg_before_linear": True}
        model_dim = self.hidden_size
        query_dim = self.num_heads * self.head_dim
        kv_dim = self.num_key_value_heads * self.head_dim
        self.q_proj = BitLinear(model_dim, query_dim, **bit_kwargs)
        self.k_proj = BitLinear(model_dim, kv_dim, **bit_kwargs)
        self.v_proj = BitLinear(model_dim, kv_dim, **bit_kwargs)
        self.o_proj = BitLinear(model_dim, model_dim, **bit_kwargs)
44
+
45
class BitLlamaSdpaAttention(LlamaSdpaAttention):
    """SDPA Llama attention with ``BitLinear`` q/k/v/o projections.

    Args:
        config: Model configuration; ``config.bits`` is forwarded to each
            ``BitLinear`` projection.
        layer_idx: Decoder layer index, passed through to the parent class.
    """

    def __init__(self, config: BitLlamaConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        # Settings identical for all four projections.
        bit_kwargs = {"bias": False, "bits": config.bits, "flg_before_linear": True}
        model_dim = self.hidden_size
        query_dim = self.num_heads * self.head_dim
        kv_dim = self.num_key_value_heads * self.head_dim
        self.q_proj = BitLinear(model_dim, query_dim, **bit_kwargs)
        self.k_proj = BitLinear(model_dim, kv_dim, **bit_kwargs)
        self.v_proj = BitLinear(model_dim, kv_dim, **bit_kwargs)
        self.o_proj = BitLinear(model_dim, model_dim, **bit_kwargs)
52
+
53
# Maps an attention implementation name (looked up via
# `config._attn_implementation` in BitLlamaDecoderLayer) to the BitLlama
# attention class that implements it.
BITLLAMA_ATTENTION_CLASSES = dict(
    eager=BitLlamaAttention,
    flash_attention_2=BitLlamaFlashAttention2,
    sdpa=BitLlamaSdpaAttention,
)
58
+
59
class BitLlamaDecoderLayer(LlamaDecoderLayer):
    """Llama decoder layer using BitLlama attention and MLP sub-modules.

    Args:
        config: Model configuration; ``config._attn_implementation`` selects
            the attention class from ``BITLLAMA_ATTENTION_CLASSES``.
        layer_idx: Position of this layer in the decoder stack.
    """

    def __init__(self, config: BitLlamaConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # Replace the sub-modules the parent constructor created.
        attention_cls = BITLLAMA_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config=config, layer_idx=layer_idx)
        self.mlp = BitLlamaMLP(config)
64
+
65
class BitLlamaModel(LlamaModel):
    """Llama backbone whose decoder stack is made of ``BitLlamaDecoderLayer``.

    Args:
        config: Model configuration; ``config.num_hidden_layers`` sets the
            number of decoder layers built.
    """

    def __init__(self, config: BitLlamaConfig):
        super().__init__(config)
        # Rebuild the layer stack, one BitLlama layer per index.
        bit_layers = []
        for layer_idx in range(config.num_hidden_layers):
            bit_layers.append(BitLlamaDecoderLayer(config, layer_idx))
        self.layers = nn.ModuleList(bit_layers)
71
+
72
class BitLlamaForCausalLM(LlamaForCausalLM):
    """Causal-LM wrapper pairing ``BitLlamaModel`` with a ``BitLinear`` head.

    Args:
        config: Model configuration; supplies ``hidden_size``, ``vocab_size``
            and ``bits`` for the output projection.
    """

    config_class = BitLlamaConfig

    def __init__(self, config: BitLlamaConfig):
        super().__init__(config)
        # Replace the backbone and output head created by the parent.
        self.model = BitLlamaModel(config)
        head_kwargs = {"bias": False, "bits": config.bits, "flg_before_linear": True}
        self.lm_head = BitLinear(config.hidden_size, config.vocab_size, **head_kwargs)
79
+
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c877c5ca885bad5c19d1b1706a2703f8b30de90f03c1f834f8bdb9faf79821e8
3
+ size 914000
tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": true,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "legacy": false,
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "pad_token": "</s>",
37
+ "padding_side": "right",
38
+ "sp_model_kwargs": {},
39
+ "spaces_between_special_tokens": false,
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2edef78613ec11331f1e86b427554b65d0fab164a2cbea1240516eba1e252b2b
3
+ size 4920