YuanZ77 committed on
Commit
7ea4821
·
verified ·
1 Parent(s): f7aa499

Model save

Browse files
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: gemma
4
+ base_model: google/gemma-2-2b-it
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: gemma2b-b
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # gemma2b-b
20
+
21
+ This model is a fine-tuned version of [google/gemma-2-2b-it](https://huggingface.co/google/gemma-2-2b-it) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 0.9842
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 2e-05
43
+ - train_batch_size: 4
44
+ - eval_batch_size: 4
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 2
48
+ - gradient_accumulation_steps: 8
49
+ - total_train_batch_size: 64
50
+ - total_eval_batch_size: 8
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 3
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.3036 | 0.9180 | 7 | 1.2136 |
61
+ | 0.9604 | 1.9672 | 15 | 0.9997 |
62
+ | 0.8292 | 2.7541 | 21 | 0.9842 |
63
+
64
+
65
+ ### Framework versions
66
+
67
+ - Transformers 4.44.2
68
+ - Pytorch 2.4.0+cu121
69
+ - Datasets 3.0.0
70
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.7540983606557377,
3
+ "total_flos": 3971544514560.0,
4
+ "train_loss": 1.268857215132032,
5
+ "train_runtime": 396.9611,
6
+ "train_samples": 726,
7
+ "train_samples_per_second": 3.673,
8
+ "train_steps_per_second": 0.053
9
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": [
6
+ 1,
7
+ 107
8
+ ],
9
+ "pad_token_id": 0,
10
+ "transformers_version": "4.44.2"
11
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.7540983606557377,
3
+ "total_flos": 3971544514560.0,
4
+ "train_loss": 1.268857215132032,
5
+ "train_runtime": 396.9611,
6
+ "train_samples": 726,
7
+ "train_samples_per_second": 3.673,
8
+ "train_steps_per_second": 0.053
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.7540983606557377,
5
+ "eval_steps": 500,
6
+ "global_step": 21,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.13114754098360656,
13
+ "grad_norm": 39.05130772271081,
14
+ "learning_rate": 6.666666666666667e-06,
15
+ "loss": 2.4376,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.26229508196721313,
20
+ "grad_norm": 39.015973031999486,
21
+ "learning_rate": 1.3333333333333333e-05,
22
+ "loss": 2.4339,
23
+ "step": 2
24
+ },
25
+ {
26
+ "epoch": 0.39344262295081966,
27
+ "grad_norm": 15.152312064538828,
28
+ "learning_rate": 2e-05,
29
+ "loss": 1.8888,
30
+ "step": 3
31
+ },
32
+ {
33
+ "epoch": 0.5245901639344263,
34
+ "grad_norm": 29.273723674546083,
35
+ "learning_rate": 1.9848077530122083e-05,
36
+ "loss": 1.8725,
37
+ "step": 4
38
+ },
39
+ {
40
+ "epoch": 0.6557377049180327,
41
+ "grad_norm": 11.667536919946288,
42
+ "learning_rate": 1.9396926207859085e-05,
43
+ "loss": 1.7797,
44
+ "step": 5
45
+ },
46
+ {
47
+ "epoch": 0.7868852459016393,
48
+ "grad_norm": 5.696635279505377,
49
+ "learning_rate": 1.866025403784439e-05,
50
+ "loss": 1.4526,
51
+ "step": 6
52
+ },
53
+ {
54
+ "epoch": 0.9180327868852459,
55
+ "grad_norm": 6.1474254223603655,
56
+ "learning_rate": 1.766044443118978e-05,
57
+ "loss": 1.3036,
58
+ "step": 7
59
+ },
60
+ {
61
+ "epoch": 0.9180327868852459,
62
+ "eval_loss": 1.2136186361312866,
63
+ "eval_runtime": 9.285,
64
+ "eval_samples_per_second": 20.678,
65
+ "eval_steps_per_second": 2.585,
66
+ "step": 7
67
+ },
68
+ {
69
+ "epoch": 1.0491803278688525,
70
+ "grad_norm": 3.8441152970886963,
71
+ "learning_rate": 1.6427876096865394e-05,
72
+ "loss": 1.193,
73
+ "step": 8
74
+ },
75
+ {
76
+ "epoch": 1.180327868852459,
77
+ "grad_norm": 3.216661811188678,
78
+ "learning_rate": 1.5000000000000002e-05,
79
+ "loss": 1.1161,
80
+ "step": 9
81
+ },
82
+ {
83
+ "epoch": 1.3114754098360657,
84
+ "grad_norm": 2.783161962423741,
85
+ "learning_rate": 1.342020143325669e-05,
86
+ "loss": 1.0656,
87
+ "step": 10
88
+ },
89
+ {
90
+ "epoch": 1.4426229508196722,
91
+ "grad_norm": 2.6684579204452517,
92
+ "learning_rate": 1.1736481776669307e-05,
93
+ "loss": 1.0333,
94
+ "step": 11
95
+ },
96
+ {
97
+ "epoch": 1.5737704918032787,
98
+ "grad_norm": 2.1293181551342286,
99
+ "learning_rate": 1e-05,
100
+ "loss": 1.0045,
101
+ "step": 12
102
+ },
103
+ {
104
+ "epoch": 1.7049180327868854,
105
+ "grad_norm": 1.7349053966220533,
106
+ "learning_rate": 8.263518223330698e-06,
107
+ "loss": 0.971,
108
+ "step": 13
109
+ },
110
+ {
111
+ "epoch": 1.8360655737704918,
112
+ "grad_norm": 1.708605250889302,
113
+ "learning_rate": 6.579798566743314e-06,
114
+ "loss": 0.9746,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 1.9672131147540983,
119
+ "grad_norm": 1.565922544143404,
120
+ "learning_rate": 5.000000000000003e-06,
121
+ "loss": 0.9604,
122
+ "step": 15
123
+ },
124
+ {
125
+ "epoch": 1.9672131147540983,
126
+ "eval_loss": 0.9996973872184753,
127
+ "eval_runtime": 9.0907,
128
+ "eval_samples_per_second": 21.121,
129
+ "eval_steps_per_second": 2.64,
130
+ "step": 15
131
+ },
132
+ {
133
+ "epoch": 2.098360655737705,
134
+ "grad_norm": 1.4703758003535075,
135
+ "learning_rate": 3.5721239031346067e-06,
136
+ "loss": 0.9006,
137
+ "step": 16
138
+ },
139
+ {
140
+ "epoch": 2.2295081967213113,
141
+ "grad_norm": 1.4546503498400736,
142
+ "learning_rate": 2.339555568810221e-06,
143
+ "loss": 0.8672,
144
+ "step": 17
145
+ },
146
+ {
147
+ "epoch": 2.360655737704918,
148
+ "grad_norm": 1.333908386274604,
149
+ "learning_rate": 1.339745962155613e-06,
150
+ "loss": 0.8603,
151
+ "step": 18
152
+ },
153
+ {
154
+ "epoch": 2.4918032786885247,
155
+ "grad_norm": 1.2309918853415422,
156
+ "learning_rate": 6.030737921409169e-07,
157
+ "loss": 0.8533,
158
+ "step": 19
159
+ },
160
+ {
161
+ "epoch": 2.6229508196721314,
162
+ "grad_norm": 1.187570177680188,
163
+ "learning_rate": 1.519224698779198e-07,
164
+ "loss": 0.8482,
165
+ "step": 20
166
+ },
167
+ {
168
+ "epoch": 2.7540983606557377,
169
+ "grad_norm": 1.1796688649183953,
170
+ "learning_rate": 0.0,
171
+ "loss": 0.8292,
172
+ "step": 21
173
+ },
174
+ {
175
+ "epoch": 2.7540983606557377,
176
+ "eval_loss": 0.9841778874397278,
177
+ "eval_runtime": 9.0575,
178
+ "eval_samples_per_second": 21.198,
179
+ "eval_steps_per_second": 2.65,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 2.7540983606557377,
184
+ "step": 21,
185
+ "total_flos": 3971544514560.0,
186
+ "train_loss": 1.268857215132032,
187
+ "train_runtime": 396.9611,
188
+ "train_samples_per_second": 3.673,
189
+ "train_steps_per_second": 0.053
190
+ }
191
+ ],
192
+ "logging_steps": 1,
193
+ "max_steps": 21,
194
+ "num_input_tokens_seen": 0,
195
+ "num_train_epochs": 3,
196
+ "save_steps": 500,
197
+ "stateful_callbacks": {
198
+ "TrainerControl": {
199
+ "args": {
200
+ "should_epoch_stop": false,
201
+ "should_evaluate": false,
202
+ "should_log": false,
203
+ "should_save": true,
204
+ "should_training_stop": true
205
+ },
206
+ "attributes": {}
207
+ }
208
+ },
209
+ "total_flos": 3971544514560.0,
210
+ "train_batch_size": 4,
211
+ "trial_name": null,
212
+ "trial_params": null
213
+ }