YuanZ77 committed on
Commit
d952e72
·
verified ·
1 Parent(s): e054f65

Model save

Browse files
Files changed (4) hide show
  1. README.md +71 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +376 -0
README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LLM-PBE/Llama3.1-8b-instruct-LLMPC-Red-Team
3
+ library_name: peft
4
+ license: llama3.1
5
+ tags:
6
+ - trl
7
+ - sft
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: zephyr-7b-sft-qlora
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # zephyr-7b-sft-qlora
18
+
19
+ This model is a fine-tuned version of [LLM-PBE/Llama3.1-8b-instruct-LLMPC-Red-Team](https://huggingface.co/LLM-PBE/Llama3.1-8b-instruct-LLMPC-Red-Team) on an unspecified dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.2536
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 5e-05
41
+ - train_batch_size: 4
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - distributed_type: multi-GPU
45
+ - num_devices: 4
46
+ - gradient_accumulation_steps: 2
47
+ - total_train_batch_size: 32
48
+ - total_eval_batch_size: 32
49
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
+ - lr_scheduler_type: cosine
51
+ - lr_scheduler_warmup_ratio: 0.1
52
+ - num_epochs: 5.0
53
+
54
+ ### Training results
55
+
56
+ | Training Loss | Epoch | Step | Validation Loss |
57
+ |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.3835 | 1.0 | 41 | 0.3792 |
59
+ | 0.3431 | 2.0 | 82 | 0.3488 |
60
+ | 0.2489 | 3.0 | 123 | 0.2568 |
61
+ | 0.2468 | 4.0 | 164 | 0.2538 |
62
+ | 0.2454 | 5.0 | 205 | 0.2536 |
63
+
64
+
65
+ ### Framework versions
66
+
67
+ - PEFT 0.10.0
68
+ - Transformers 4.45.2
69
+ - Pytorch 2.4.0+cu121
70
+ - Datasets 3.0.0
71
+ - Tokenizers 0.20.0
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 7.472149011177144e+17,
4
+ "train_loss": 0.12734018942204917,
5
+ "train_runtime": 865.1722,
6
+ "train_samples": 1305,
7
+ "train_samples_per_second": 7.542,
8
+ "train_steps_per_second": 0.237
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 7.472149011177144e+17,
4
+ "train_loss": 0.12734018942204917,
5
+ "train_runtime": 865.1722,
6
+ "train_samples": 1305,
7
+ "train_samples_per_second": 7.542,
8
+ "train_steps_per_second": 0.237
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 205,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.024390243902439025,
13
+ "grad_norm": 0.2731004059314728,
14
+ "learning_rate": 2.3809523809523808e-06,
15
+ "loss": 0.6255,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.12195121951219512,
20
+ "grad_norm": 0.27167728543281555,
21
+ "learning_rate": 1.1904761904761905e-05,
22
+ "loss": 0.6283,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.24390243902439024,
27
+ "grad_norm": 0.24203768372535706,
28
+ "learning_rate": 2.380952380952381e-05,
29
+ "loss": 0.6021,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.36585365853658536,
34
+ "grad_norm": 0.1512613147497177,
35
+ "learning_rate": 3.571428571428572e-05,
36
+ "loss": 0.5736,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.4878048780487805,
41
+ "grad_norm": 0.13210485875606537,
42
+ "learning_rate": 4.761904761904762e-05,
43
+ "loss": 0.5307,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.6097560975609756,
48
+ "grad_norm": 0.11809483170509338,
49
+ "learning_rate": 4.994171922976348e-05,
50
+ "loss": 0.4787,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.7317073170731707,
55
+ "grad_norm": 0.1285855919122696,
56
+ "learning_rate": 4.9705419236058825e-05,
57
+ "loss": 0.4354,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.8536585365853658,
62
+ "grad_norm": 0.10194303095340729,
63
+ "learning_rate": 4.9289177234948535e-05,
64
+ "loss": 0.4031,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.975609756097561,
69
+ "grad_norm": 0.0670081079006195,
70
+ "learning_rate": 4.8696024926503396e-05,
71
+ "loss": 0.3835,
72
+ "step": 40
73
+ },
74
+ {
75
+ "epoch": 1.0,
76
+ "eval_loss": 0.37915876507759094,
77
+ "eval_runtime": 7.8362,
78
+ "eval_samples_per_second": 18.631,
79
+ "eval_steps_per_second": 0.638,
80
+ "step": 41
81
+ },
82
+ {
83
+ "epoch": 1.0975609756097562,
84
+ "grad_norm": 0.07868604362010956,
85
+ "learning_rate": 4.793028253763633e-05,
86
+ "loss": 0.367,
87
+ "step": 45
88
+ },
89
+ {
90
+ "epoch": 1.2195121951219512,
91
+ "grad_norm": 0.08379828184843063,
92
+ "learning_rate": 4.69975273557146e-05,
93
+ "loss": 0.3689,
94
+ "step": 50
95
+ },
96
+ {
97
+ "epoch": 1.3414634146341464,
98
+ "grad_norm": 0.089637391269207,
99
+ "learning_rate": 4.5904553106367774e-05,
100
+ "loss": 0.3695,
101
+ "step": 55
102
+ },
103
+ {
104
+ "epoch": 1.4634146341463414,
105
+ "grad_norm": 0.08855098485946655,
106
+ "learning_rate": 4.4659320471363314e-05,
107
+ "loss": 0.3615,
108
+ "step": 60
109
+ },
110
+ {
111
+ "epoch": 1.5853658536585367,
112
+ "grad_norm": 0.09250541776418686,
113
+ "learning_rate": 4.3270899106953105e-05,
114
+ "loss": 0.3584,
115
+ "step": 65
116
+ },
117
+ {
118
+ "epoch": 1.7073170731707317,
119
+ "grad_norm": 0.08777391910552979,
120
+ "learning_rate": 4.174940158500041e-05,
121
+ "loss": 0.3443,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 1.8292682926829267,
126
+ "grad_norm": 0.08619946986436844,
127
+ "learning_rate": 4.0105909738027365e-05,
128
+ "loss": 0.3491,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 1.951219512195122,
133
+ "grad_norm": 0.09870754927396774,
134
+ "learning_rate": 3.835239394464901e-05,
135
+ "loss": 0.3431,
136
+ "step": 80
137
+ },
138
+ {
139
+ "epoch": 2.0,
140
+ "eval_loss": 0.3487952947616577,
141
+ "eval_runtime": 7.7885,
142
+ "eval_samples_per_second": 18.746,
143
+ "eval_steps_per_second": 0.642,
144
+ "step": 82
145
+ },
146
+ {
147
+ "epoch": 2.073170731707317,
148
+ "grad_norm": 0.08834455162286758,
149
+ "learning_rate": 3.6501625943278805e-05,
150
+ "loss": 0.3355,
151
+ "step": 85
152
+ },
153
+ {
154
+ "epoch": 2.1951219512195124,
155
+ "grad_norm": 0.07206975668668747,
156
+ "learning_rate": 3.456708580912725e-05,
157
+ "loss": 0.3394,
158
+ "step": 90
159
+ },
160
+ {
161
+ "epoch": 2.317073170731707,
162
+ "grad_norm": 0.05374612286686897,
163
+ "learning_rate": 3.25628637720269e-05,
164
+ "loss": 0.3462,
165
+ "step": 95
166
+ },
167
+ {
168
+ "epoch": 2.4390243902439024,
169
+ "grad_norm": 0.050639085471630096,
170
+ "learning_rate": 3.0503557590194143e-05,
171
+ "loss": 0.3246,
172
+ "step": 100
173
+ },
174
+ {
175
+ "epoch": 2.5609756097560976,
176
+ "grad_norm": 0.06731884926557541,
177
+ "learning_rate": 2.840416622740617e-05,
178
+ "loss": 0.2627,
179
+ "step": 105
180
+ },
181
+ {
182
+ "epoch": 2.682926829268293,
183
+ "grad_norm": 0.047202687710523605,
184
+ "learning_rate": 2.6279980607995836e-05,
185
+ "loss": 0.268,
186
+ "step": 110
187
+ },
188
+ {
189
+ "epoch": 2.8048780487804876,
190
+ "grad_norm": 0.045556213706731796,
191
+ "learning_rate": 2.4146472245350805e-05,
192
+ "loss": 0.2513,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 2.926829268292683,
197
+ "grad_norm": 0.043327976018190384,
198
+ "learning_rate": 2.201918055509173e-05,
199
+ "loss": 0.2489,
200
+ "step": 120
201
+ },
202
+ {
203
+ "epoch": 3.0,
204
+ "eval_loss": 0.2567618787288666,
205
+ "eval_runtime": 13.2824,
206
+ "eval_samples_per_second": 10.992,
207
+ "eval_steps_per_second": 0.376,
208
+ "step": 123
209
+ },
210
+ {
211
+ "epoch": 3.048780487804878,
212
+ "grad_norm": 0.045429013669490814,
213
+ "learning_rate": 1.991359967368416e-05,
214
+ "loss": 0.2556,
215
+ "step": 125
216
+ },
217
+ {
218
+ "epoch": 3.1707317073170733,
219
+ "grad_norm": 0.03991298750042915,
220
+ "learning_rate": 1.7845065606841472e-05,
221
+ "loss": 0.2418,
222
+ "step": 130
223
+ },
224
+ {
225
+ "epoch": 3.292682926829268,
226
+ "grad_norm": 0.04288507252931595,
227
+ "learning_rate": 1.582864452967359e-05,
228
+ "loss": 0.2455,
229
+ "step": 135
230
+ },
231
+ {
232
+ "epoch": 3.4146341463414633,
233
+ "grad_norm": 0.04243966192007065,
234
+ "learning_rate": 1.3879023052147899e-05,
235
+ "loss": 0.2494,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 3.5365853658536586,
240
+ "grad_norm": 0.041205402463674545,
241
+ "learning_rate": 1.2010401249114167e-05,
242
+ "loss": 0.241,
243
+ "step": 145
244
+ },
245
+ {
246
+ "epoch": 3.658536585365854,
247
+ "grad_norm": 0.04464095085859299,
248
+ "learning_rate": 1.0236389234009727e-05,
249
+ "loss": 0.2512,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 3.7804878048780486,
254
+ "grad_norm": 0.04085472226142883,
255
+ "learning_rate": 8.569908029550685e-06,
256
+ "loss": 0.2461,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 3.902439024390244,
261
+ "grad_norm": 0.041141241788864136,
262
+ "learning_rate": 7.02309545741773e-06,
263
+ "loss": 0.2468,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 4.0,
268
+ "eval_loss": 0.2538050413131714,
269
+ "eval_runtime": 11.622,
270
+ "eval_samples_per_second": 12.562,
271
+ "eval_steps_per_second": 0.43,
272
+ "step": 164
273
+ },
274
+ {
275
+ "epoch": 4.024390243902439,
276
+ "grad_norm": 0.042266517877578735,
277
+ "learning_rate": 5.607217732389503e-06,
278
+ "loss": 0.2496,
279
+ "step": 165
280
+ },
281
+ {
282
+ "epoch": 4.146341463414634,
283
+ "grad_norm": 0.039677053689956665,
284
+ "learning_rate": 4.332587404827854e-06,
285
+ "loss": 0.2429,
286
+ "step": 170
287
+ },
288
+ {
289
+ "epoch": 4.2682926829268295,
290
+ "grad_norm": 0.0383627712726593,
291
+ "learning_rate": 3.208488249181216e-06,
292
+ "loss": 0.2452,
293
+ "step": 175
294
+ },
295
+ {
296
+ "epoch": 4.390243902439025,
297
+ "grad_norm": 0.039468664675951004,
298
+ "learning_rate": 2.2431076455809467e-06,
299
+ "loss": 0.2438,
300
+ "step": 180
301
+ },
302
+ {
303
+ "epoch": 4.512195121951219,
304
+ "grad_norm": 0.0408608578145504,
305
+ "learning_rate": 1.44347694702949e-06,
306
+ "loss": 0.2451,
307
+ "step": 185
308
+ },
309
+ {
310
+ "epoch": 4.634146341463414,
311
+ "grad_norm": 0.03901852294802666,
312
+ "learning_rate": 8.154202665162147e-07,
313
+ "loss": 0.2531,
314
+ "step": 190
315
+ },
316
+ {
317
+ "epoch": 4.7560975609756095,
318
+ "grad_norm": 0.039755210280418396,
319
+ "learning_rate": 3.635120570700784e-07,
320
+ "loss": 0.243,
321
+ "step": 195
322
+ },
323
+ {
324
+ "epoch": 4.878048780487805,
325
+ "grad_norm": 0.04330005869269371,
326
+ "learning_rate": 9.104379371500105e-08,
327
+ "loss": 0.2447,
328
+ "step": 200
329
+ },
330
+ {
331
+ "epoch": 5.0,
332
+ "grad_norm": 0.04034363478422165,
333
+ "learning_rate": 0.0,
334
+ "loss": 0.2454,
335
+ "step": 205
336
+ },
337
+ {
338
+ "epoch": 5.0,
339
+ "eval_loss": 0.2536194324493408,
340
+ "eval_runtime": 11.5668,
341
+ "eval_samples_per_second": 12.622,
342
+ "eval_steps_per_second": 0.432,
343
+ "step": 205
344
+ },
345
+ {
346
+ "epoch": 5.0,
347
+ "step": 205,
348
+ "total_flos": 7.472149011177144e+17,
349
+ "train_loss": 0.12734018942204917,
350
+ "train_runtime": 865.1722,
351
+ "train_samples_per_second": 7.542,
352
+ "train_steps_per_second": 0.237
353
+ }
354
+ ],
355
+ "logging_steps": 5,
356
+ "max_steps": 205,
357
+ "num_input_tokens_seen": 0,
358
+ "num_train_epochs": 5,
359
+ "save_steps": 100,
360
+ "stateful_callbacks": {
361
+ "TrainerControl": {
362
+ "args": {
363
+ "should_epoch_stop": false,
364
+ "should_evaluate": false,
365
+ "should_log": false,
366
+ "should_save": true,
367
+ "should_training_stop": true
368
+ },
369
+ "attributes": {}
370
+ }
371
+ },
372
+ "total_flos": 7.472149011177144e+17,
373
+ "train_batch_size": 4,
374
+ "trial_name": null,
375
+ "trial_params": null
376
+ }