Ubuntu committed
Commit 0092716 · 1 Parent(s): 28e1fec

push first version model
README.md ADDED
@@ -0,0 +1,71 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: alignment-handbook/zephyr-7b-sft-full
+ tags:
+ - alignment-handbook
+ - generated_from_trainer
+ datasets:
+ - generator
+ model-index:
+ - name: IRL_iter0_best_of_16_spin_iter0_epoch_5_saving
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # IRL_iter0_best_of_16_spin_iter0_epoch_5_saving
+
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the data_generated/spin_iter0_best_of_16/top1_selected_IRL_reward_selected dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0396
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 2e-05
+ - train_batch_size: 4
+ - eval_batch_size: 8
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 64
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 5.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 1.0895 | 1.0 | 79 | 0.7223 |
+ | 0.6454 | 2.0 | 158 | 0.3317 |
+ | 0.2926 | 3.0 | 237 | 0.1293 |
+ | 0.1048 | 4.0 | 316 | 0.0542 |
+ | 0.0465 | 5.0 | 395 | 0.0396 |
+
+
+ ### Framework versions
+
+ - Transformers 4.44.2
+ - Pytorch 2.1.2+cu121
+ - Datasets 2.21.0
+ - Tokenizers 0.19.1
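
For quick verification of the pushed checkpoint, here is a minimal inference sketch. It assumes the files from this commit are available locally (the path below is hypothetical; substitute the actual repo id or directory) and uses only standard `transformers` APIs together with the chat template shipped in `tokenizer_config.json` later in this commit.

```python
# Minimal sketch: load this checkpoint and run one chat turn.
# "./IRL_iter0_best_of_16_spin_iter0_epoch_5_saving" is a hypothetical local
# path; substitute the real repo id or directory holding this commit's files.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "./IRL_iter0_best_of_16_spin_iter0_epoch_5_saving"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# tokenizer_config.json defines a Zephyr-style chat template, so
# apply_chat_template reproduces the prompt format used in training.
messages = [{"role": "user", "content": "What does best-of-16 sampling mean?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=256, do_sample=False)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```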
all_results.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.039588429033756256,
+ "eval_runtime": 214.1263,
+ "eval_samples": 49792,
+ "eval_samples_per_second": 47.192,
+ "eval_steps_per_second": 0.738,
+ "total_flos": 165305238159360.0,
+ "train_loss": 0.43836132314386245,
+ "train_runtime": 6288.8643,
+ "train_samples": 49792,
+ "train_samples_per_second": 8.034,
+ "train_steps_per_second": 0.063
+ }
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "_name_or_path": "alignment-handbook/zephyr-7b-sft-full",
+ "architectures": [
+ "MistralForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "mistral",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 10000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.2",
+ "use_cache": true,
+ "vocab_size": 32000
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.039588429033756256,
+ "eval_runtime": 214.1263,
+ "eval_samples": 49792,
+ "eval_samples_per_second": 47.192,
+ "eval_steps_per_second": 0.738
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "transformers_version": "4.44.2"
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8537b0cbce772547e5b7f8070cbf23b30637521bf08cafeb231b0c36cfa829c
+ size 4943162336
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6f891bb1b6838b429ffc2b9f8bdc7a2e010268eaa9737c26dd89d8c8fa00bc2
+ size 4999819336
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67b7097f0b7e0b93c2d3eb91d778a7bf91c645a0b0168a4bec9e5aa09f2da375
+ size 4540516344
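
The three pointer files above follow the Git LFS pointer format: each records only the SHA-256 (`oid`) and byte size of the real shard, whose bytes live in LFS storage. A short sketch for confirming that a downloaded shard matches its pointer (the hash below is the `oid` of the first shard, copied from the pointer above):

```python
# Verify a downloaded shard against its Git LFS pointer (sketch; assumes the
# shard file sits in the working directory).
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so multi-GB shards don't need to fit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

assert sha256_of("model-00001-of-00003.safetensors") == \
    "b8537b0cbce772547e5b7f8070cbf23b30637521bf08cafeb231b0c36cfa829c"
```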
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
+ {
+ "metadata": {
+ "total_size": 14483464192
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00003-of-00003.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.norm.weight": "model-00003-of-00003.safetensors"
+ }
+ }
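
The `total_size` recorded here can be cross-checked against `config.json` earlier in this commit: with `torch_dtype: bfloat16` (2 bytes per parameter), 14,483,464,192 bytes corresponds to 7,241,732,096 parameters, which is exactly what the Mistral-7B shapes imply. A sketch of that arithmetic:

```python
# Cross-check: parameter count implied by config.json vs. total_size in this index.
vocab, hidden, inter = 32000, 4096, 14336
layers, kv_heads, head_dim = 32, 8, 128

attn = 2 * hidden * hidden + 2 * hidden * kv_heads * head_dim  # q/o_proj + k/v_proj
mlp = 3 * hidden * inter                                       # gate, up, down projections
norms = 2 * hidden                                             # input + post-attention RMSNorm
per_layer = attn + mlp + norms

# embed_tokens and lm_head are separate (tie_word_embeddings: false), plus final norm.
total = layers * per_layer + 2 * vocab * hidden + hidden
assert total * 2 == 14483464192  # bfloat16 = 2 bytes per parameter
print(f"{total:,} parameters")   # 7,241,732,096
```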
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [],
+ "bos_token": "<s>",
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "legacy": true,
+ "model_max_length": 2048,
+ "pad_token": "</s>",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
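
To see the concrete prompt layout encoded by the `chat_template` string above, it can be rendered without tokenization (a sketch; assumes the tokenizer files from this commit sit in the current directory):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # directory containing these tokenizer files
text = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False,
    add_generation_prompt=True,
)
# Expected Zephyr-style layout: '<|user|>\nHello</s>\n<|assistant|>\n'
print(repr(text))
```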
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 5.0,
+ "total_flos": 165305238159360.0,
+ "train_loss": 0.43836132314386245,
+ "train_runtime": 6288.8643,
+ "train_samples": 49792,
+ "train_samples_per_second": 8.034,
+ "train_steps_per_second": 0.063
+ }
trainer_state.json ADDED
@@ -0,0 +1,642 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 5.0,
+ "eval_steps": 500,
+ "global_step": 395,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.012658227848101266,
+ "grad_norm": 6.695639272540547,
+ "learning_rate": 5.000000000000001e-07,
+ "loss": 1.1195,
+ "step": 1
+ },
+ {
+ "epoch": 0.06329113924050633,
+ "grad_norm": 13.683866082279918,
+ "learning_rate": 2.5e-06,
+ "loss": 1.0968,
+ "step": 5
+ },
+ {
+ "epoch": 0.12658227848101267,
+ "grad_norm": 2.7914110872997275,
+ "learning_rate": 5e-06,
+ "loss": 1.0868,
+ "step": 10
+ },
+ {
+ "epoch": 0.189873417721519,
+ "grad_norm": 2.2869962494889995,
+ "learning_rate": 7.500000000000001e-06,
+ "loss": 1.0771,
+ "step": 15
+ },
+ {
+ "epoch": 0.25316455696202533,
+ "grad_norm": 2.315595057148159,
+ "learning_rate": 1e-05,
+ "loss": 1.0546,
+ "step": 20
+ },
+ {
+ "epoch": 0.31645569620253167,
+ "grad_norm": 1.9266353743480364,
+ "learning_rate": 1.25e-05,
+ "loss": 1.084,
+ "step": 25
+ },
+ {
+ "epoch": 0.379746835443038,
+ "grad_norm": 2.150532379708014,
+ "learning_rate": 1.5000000000000002e-05,
+ "loss": 1.0654,
+ "step": 30
+ },
+ {
+ "epoch": 0.4430379746835443,
+ "grad_norm": 2.1018953026935905,
+ "learning_rate": 1.7500000000000002e-05,
+ "loss": 1.0817,
+ "step": 35
+ },
+ {
+ "epoch": 0.5063291139240507,
+ "grad_norm": 2.3849545878679272,
+ "learning_rate": 2e-05,
+ "loss": 1.0971,
+ "step": 40
+ },
+ {
+ "epoch": 0.569620253164557,
+ "grad_norm": 2.345688862339495,
+ "learning_rate": 1.9990212265199738e-05,
+ "loss": 1.086,
+ "step": 45
+ },
+ {
+ "epoch": 0.6329113924050633,
+ "grad_norm": 2.5866807131537906,
+ "learning_rate": 1.996086822074945e-05,
+ "loss": 1.0656,
+ "step": 50
+ },
+ {
+ "epoch": 0.6962025316455697,
+ "grad_norm": 2.4392627537159655,
+ "learning_rate": 1.9912025308994146e-05,
+ "loss": 1.1059,
+ "step": 55
+ },
+ {
+ "epoch": 0.759493670886076,
+ "grad_norm": 2.2706485443813276,
+ "learning_rate": 1.9843779142227258e-05,
+ "loss": 1.0973,
+ "step": 60
+ },
+ {
+ "epoch": 0.8227848101265823,
+ "grad_norm": 2.3020097653483496,
+ "learning_rate": 1.975626331552507e-05,
+ "loss": 1.1083,
+ "step": 65
+ },
+ {
+ "epoch": 0.8860759493670886,
+ "grad_norm": 2.0670888705685955,
+ "learning_rate": 1.96496491452281e-05,
+ "loss": 1.1039,
+ "step": 70
+ },
+ {
+ "epoch": 0.9493670886075949,
+ "grad_norm": 2.371742516361207,
+ "learning_rate": 1.9524145333581315e-05,
+ "loss": 1.0895,
+ "step": 75
+ },
+ {
+ "epoch": 1.0,
+ "eval_loss": 0.7223408222198486,
+ "eval_runtime": 213.2984,
+ "eval_samples_per_second": 47.375,
+ "eval_steps_per_second": 0.741,
+ "step": 79
+ },
+ {
+ "epoch": 1.0126582278481013,
+ "grad_norm": 8.106924593883994,
+ "learning_rate": 1.9379997560189677e-05,
+ "loss": 1.0198,
+ "step": 80
+ },
+ {
+ "epoch": 1.0759493670886076,
+ "grad_norm": 3.486817193319529,
+ "learning_rate": 1.9217488001088784e-05,
+ "loss": 0.6812,
+ "step": 85
+ },
+ {
+ "epoch": 1.139240506329114,
+ "grad_norm": 2.921219837737029,
+ "learning_rate": 1.903693477637204e-05,
+ "loss": 0.6683,
+ "step": 90
+ },
+ {
+ "epoch": 1.2025316455696202,
+ "grad_norm": 3.1078224489011683,
+ "learning_rate": 1.883869132745561e-05,
+ "loss": 0.653,
+ "step": 95
+ },
+ {
+ "epoch": 1.2658227848101267,
+ "grad_norm": 2.8280645620621874,
+ "learning_rate": 1.862314572520028e-05,
+ "loss": 0.6501,
+ "step": 100
+ },
+ {
+ "epoch": 1.3291139240506329,
+ "grad_norm": 2.920193489057834,
+ "learning_rate": 1.8390719910244487e-05,
+ "loss": 0.647,
+ "step": 105
+ },
+ {
+ "epoch": 1.3924050632911391,
+ "grad_norm": 2.647561794905359,
+ "learning_rate": 1.8141868867035745e-05,
+ "loss": 0.6531,
+ "step": 110
+ },
+ {
+ "epoch": 1.4556962025316456,
+ "grad_norm": 2.7660605803019447,
+ "learning_rate": 1.7877079733177185e-05,
+ "loss": 0.6232,
+ "step": 115
+ },
+ {
+ "epoch": 1.518987341772152,
+ "grad_norm": 2.497219995540892,
+ "learning_rate": 1.759687084583285e-05,
+ "loss": 0.6401,
+ "step": 120
+ },
+ {
+ "epoch": 1.5822784810126582,
+ "grad_norm": 2.447623483254921,
+ "learning_rate": 1.7301790727058344e-05,
+ "loss": 0.6592,
+ "step": 125
+ },
+ {
+ "epoch": 1.6455696202531644,
+ "grad_norm": 2.4783693483832563,
+ "learning_rate": 1.6992417010043144e-05,
+ "loss": 0.6441,
+ "step": 130
+ },
+ {
+ "epoch": 1.7088607594936709,
+ "grad_norm": 2.348228472338975,
+ "learning_rate": 1.666935530836651e-05,
+ "loss": 0.6439,
+ "step": 135
+ },
+ {
+ "epoch": 1.7721518987341773,
+ "grad_norm": 2.316917620320839,
+ "learning_rate": 1.6333238030480473e-05,
+ "loss": 0.6572,
+ "step": 140
+ },
+ {
+ "epoch": 1.8354430379746836,
+ "grad_norm": 2.614503810192861,
+ "learning_rate": 1.5984723141740578e-05,
+ "loss": 0.6764,
+ "step": 145
+ },
+ {
+ "epoch": 1.8987341772151898,
+ "grad_norm": 2.452736422029602,
+ "learning_rate": 1.562449287640781e-05,
+ "loss": 0.6555,
+ "step": 150
+ },
+ {
+ "epoch": 1.9620253164556962,
+ "grad_norm": 2.737510300725156,
+ "learning_rate": 1.5253252402142989e-05,
+ "loss": 0.6454,
+ "step": 155
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 0.33173683285713196,
+ "eval_runtime": 213.2106,
+ "eval_samples_per_second": 47.394,
+ "eval_steps_per_second": 0.741,
+ "step": 158
+ },
+ {
+ "epoch": 2.0253164556962027,
+ "grad_norm": 3.9057198522311167,
+ "learning_rate": 1.4871728439607967e-05,
+ "loss": 0.5134,
+ "step": 160
+ },
+ {
+ "epoch": 2.088607594936709,
+ "grad_norm": 3.4868328352161817,
+ "learning_rate": 1.4480667839875786e-05,
+ "loss": 0.3096,
+ "step": 165
+ },
+ {
+ "epoch": 2.151898734177215,
+ "grad_norm": 2.9717493172755014,
+ "learning_rate": 1.408083612243465e-05,
+ "loss": 0.2909,
+ "step": 170
+ },
+ {
+ "epoch": 2.2151898734177213,
+ "grad_norm": 2.642444961402327,
+ "learning_rate": 1.367301597664757e-05,
+ "loss": 0.2837,
+ "step": 175
+ },
+ {
+ "epoch": 2.278481012658228,
+ "grad_norm": 2.3997899647768346,
+ "learning_rate": 1.3258005729601178e-05,
+ "loss": 0.2818,
+ "step": 180
+ },
+ {
+ "epoch": 2.3417721518987342,
+ "grad_norm": 2.7423823494323774,
+ "learning_rate": 1.2836617783342968e-05,
+ "loss": 0.2907,
+ "step": 185
+ },
+ {
+ "epoch": 2.4050632911392404,
+ "grad_norm": 2.3807259553082276,
+ "learning_rate": 1.2409677024566145e-05,
+ "loss": 0.2872,
+ "step": 190
+ },
+ {
+ "epoch": 2.4683544303797467,
+ "grad_norm": 2.336572941961749,
+ "learning_rate": 1.1978019209855174e-05,
+ "loss": 0.2866,
+ "step": 195
+ },
+ {
+ "epoch": 2.5316455696202533,
+ "grad_norm": 2.2601309613900757,
+ "learning_rate": 1.1542489329653024e-05,
+ "loss": 0.2945,
+ "step": 200
+ },
+ {
+ "epoch": 2.5949367088607596,
+ "grad_norm": 2.3050926972713968,
+ "learning_rate": 1.11039399541527e-05,
+ "loss": 0.2937,
+ "step": 205
+ },
+ {
+ "epoch": 2.6582278481012658,
+ "grad_norm": 2.221130883158219,
+ "learning_rate": 1.066322956435104e-05,
+ "loss": 0.2961,
+ "step": 210
+ },
+ {
+ "epoch": 2.721518987341772,
+ "grad_norm": 2.270387701106379,
+ "learning_rate": 1.022122087153187e-05,
+ "loss": 0.2965,
+ "step": 215
+ },
+ {
+ "epoch": 2.7848101265822782,
+ "grad_norm": 2.279056997659604,
+ "learning_rate": 9.778779128468133e-06,
+ "loss": 0.2964,
+ "step": 220
+ },
+ {
+ "epoch": 2.848101265822785,
+ "grad_norm": 2.1417041666216696,
+ "learning_rate": 9.336770435648963e-06,
+ "loss": 0.2931,
+ "step": 225
+ },
+ {
+ "epoch": 2.911392405063291,
+ "grad_norm": 2.439196726609028,
+ "learning_rate": 8.896060045847305e-06,
+ "loss": 0.2977,
+ "step": 230
+ },
+ {
+ "epoch": 2.9746835443037973,
+ "grad_norm": 2.2397521488796417,
+ "learning_rate": 8.457510670346976e-06,
+ "loss": 0.2926,
+ "step": 235
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 0.12934371829032898,
+ "eval_runtime": 213.237,
+ "eval_samples_per_second": 47.389,
+ "eval_steps_per_second": 0.741,
+ "step": 237
+ },
+ {
+ "epoch": 3.037974683544304,
+ "grad_norm": 2.2704813832171142,
+ "learning_rate": 8.021980790144828e-06,
+ "loss": 0.199,
+ "step": 240
+ },
+ {
+ "epoch": 3.1012658227848102,
+ "grad_norm": 2.132226843273487,
+ "learning_rate": 7.590322975433857e-06,
+ "loss": 0.1128,
+ "step": 245
+ },
+ {
+ "epoch": 3.1645569620253164,
+ "grad_norm": 2.0782142465832236,
+ "learning_rate": 7.163382216657033e-06,
+ "loss": 0.1129,
+ "step": 250
+ },
+ {
+ "epoch": 3.2278481012658227,
+ "grad_norm": 1.761669140987703,
+ "learning_rate": 6.741994270398826e-06,
+ "loss": 0.1099,
+ "step": 255
+ },
+ {
+ "epoch": 3.291139240506329,
+ "grad_norm": 1.717634933034142,
+ "learning_rate": 6.326984023352435e-06,
+ "loss": 0.1086,
+ "step": 260
+ },
+ {
+ "epoch": 3.3544303797468356,
+ "grad_norm": 1.6608157157785954,
+ "learning_rate": 5.919163877565351e-06,
+ "loss": 0.1068,
+ "step": 265
+ },
+ {
+ "epoch": 3.4177215189873418,
+ "grad_norm": 1.6064024695135044,
+ "learning_rate": 5.519332160124215e-06,
+ "loss": 0.1101,
+ "step": 270
+ },
+ {
+ "epoch": 3.481012658227848,
+ "grad_norm": 1.6349645931027217,
+ "learning_rate": 5.128271560392037e-06,
+ "loss": 0.1074,
+ "step": 275
+ },
+ {
+ "epoch": 3.5443037974683547,
+ "grad_norm": 1.7679069294086043,
+ "learning_rate": 4.746747597857014e-06,
+ "loss": 0.1118,
+ "step": 280
+ },
+ {
+ "epoch": 3.607594936708861,
+ "grad_norm": 1.5530405205049536,
+ "learning_rate": 4.375507123592194e-06,
+ "loss": 0.1077,
+ "step": 285
+ },
+ {
+ "epoch": 3.670886075949367,
+ "grad_norm": 1.5871889705183277,
+ "learning_rate": 4.015276858259427e-06,
+ "loss": 0.1082,
+ "step": 290
+ },
+ {
+ "epoch": 3.7341772151898733,
+ "grad_norm": 1.5985959451025067,
+ "learning_rate": 3.6667619695195287e-06,
+ "loss": 0.1063,
+ "step": 295
+ },
+ {
+ "epoch": 3.7974683544303796,
+ "grad_norm": 1.567040797684349,
+ "learning_rate": 3.330644691633492e-06,
+ "loss": 0.1067,
+ "step": 300
+ },
+ {
+ "epoch": 3.8607594936708862,
+ "grad_norm": 1.6098560275600409,
+ "learning_rate": 3.00758298995686e-06,
+ "loss": 0.1045,
+ "step": 305
+ },
+ {
+ "epoch": 3.9240506329113924,
+ "grad_norm": 1.495424841878041,
+ "learning_rate": 2.698209272941659e-06,
+ "loss": 0.1076,
+ "step": 310
+ },
+ {
+ "epoch": 3.9873417721518987,
+ "grad_norm": 1.4612235257248143,
+ "learning_rate": 2.403129154167153e-06,
+ "loss": 0.1048,
+ "step": 315
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 0.054249346256256104,
+ "eval_runtime": 213.4038,
+ "eval_samples_per_second": 47.352,
+ "eval_steps_per_second": 0.74,
+ "step": 316
+ },
+ {
+ "epoch": 4.050632911392405,
+ "grad_norm": 0.8970304833218358,
+ "learning_rate": 2.1229202668228197e-06,
+ "loss": 0.0643,
+ "step": 320
+ },
+ {
+ "epoch": 4.113924050632911,
+ "grad_norm": 1.3031778574429476,
+ "learning_rate": 1.8581311329642592e-06,
+ "loss": 0.0496,
+ "step": 325
+ },
+ {
+ "epoch": 4.177215189873418,
+ "grad_norm": 1.0494986965992847,
+ "learning_rate": 1.609280089755515e-06,
+ "loss": 0.0498,
+ "step": 330
+ },
+ {
+ "epoch": 4.2405063291139244,
+ "grad_norm": 0.8871889072165642,
+ "learning_rate": 1.3768542747997215e-06,
+ "loss": 0.048,
+ "step": 335
+ },
+ {
+ "epoch": 4.30379746835443,
+ "grad_norm": 0.8947823783464494,
+ "learning_rate": 1.161308672544389e-06,
+ "loss": 0.0491,
+ "step": 340
+ },
+ {
+ "epoch": 4.367088607594937,
+ "grad_norm": 0.8681171283598373,
+ "learning_rate": 9.630652236279626e-07,
+ "loss": 0.0467,
+ "step": 345
+ },
+ {
+ "epoch": 4.430379746835443,
+ "grad_norm": 0.8483515397273593,
+ "learning_rate": 7.825119989112173e-07,
+ "loss": 0.0474,
+ "step": 350
+ },
+ {
+ "epoch": 4.493670886075949,
+ "grad_norm": 0.883073997111795,
+ "learning_rate": 6.200024398103255e-07,
+ "loss": 0.0461,
+ "step": 355
+ },
+ {
+ "epoch": 4.556962025316456,
+ "grad_norm": 0.8423665781515933,
+ "learning_rate": 4.7585466641868696e-07,
+ "loss": 0.0474,
+ "step": 360
+ },
+ {
+ "epoch": 4.620253164556962,
+ "grad_norm": 1.0038851569615406,
+ "learning_rate": 3.5035085477190143e-07,
+ "loss": 0.0476,
+ "step": 365
+ },
+ {
+ "epoch": 4.6835443037974684,
+ "grad_norm": 0.9089402076181121,
+ "learning_rate": 2.4373668447493225e-07,
+ "loss": 0.0467,
+ "step": 370
+ },
+ {
+ "epoch": 4.746835443037975,
+ "grad_norm": 0.8530919153878059,
+ "learning_rate": 1.562208577727442e-07,
+ "loss": 0.0469,
+ "step": 375
+ },
+ {
+ "epoch": 4.810126582278481,
+ "grad_norm": 0.8298393668849166,
+ "learning_rate": 8.797469100585432e-08,
+ "loss": 0.0481,
+ "step": 380
+ },
+ {
+ "epoch": 4.8734177215189876,
+ "grad_norm": 0.986632852960438,
+ "learning_rate": 3.913177925055189e-08,
+ "loss": 0.0471,
+ "step": 385
+ },
+ {
+ "epoch": 4.936708860759493,
+ "grad_norm": 0.9075462948662969,
+ "learning_rate": 9.78773480026396e-09,
+ "loss": 0.0474,
+ "step": 390
+ },
+ {
+ "epoch": 5.0,
+ "grad_norm": 0.7480159011876264,
+ "learning_rate": 0.0,
+ "loss": 0.0465,
+ "step": 395
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 0.039588429033756256,
+ "eval_runtime": 213.3054,
+ "eval_samples_per_second": 47.373,
+ "eval_steps_per_second": 0.741,
+ "step": 395
+ },
+ {
+ "epoch": 5.0,
+ "step": 395,
+ "total_flos": 165305238159360.0,
+ "train_loss": 0.43836132314386245,
+ "train_runtime": 6288.8643,
+ "train_samples_per_second": 8.034,
+ "train_steps_per_second": 0.063
+ }
+ ],
+ "logging_steps": 5,
+ "max_steps": 395,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 5,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 165305238159360.0,
+ "train_batch_size": 4,
+ "trial_name": null,
+ "trial_params": null
+ }
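
The per-epoch evaluation losses are interleaved with the step logs in `log_history` above. A short sketch for extracting them from `trainer_state.json`; the printed values should reproduce the Training results table in the README:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation entries are the log_history items carrying an "eval_loss" key.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.1f}  step {entry['step']:3d}  "
              f"eval_loss {entry['eval_loss']:.4f}")
```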
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64ffc04f64ae9bc197484fd7b6a1c0cb35fb961b932b98746fd5162e9a75d6ab
+ size 6968