youdiniplays committed on
Commit
12e6c3d
·
verified ·
1 Parent(s): 022b6d6

End of training

Browse files
README.md CHANGED
@@ -17,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [youdiniplays/tl-ceb-model-v2](https://huggingface.co/youdiniplays/tl-ceb-model-v2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.3933
21
- - Bleu: 4.5385
22
- - Gen Len: 18.024
23
 
24
  ## Model description
25
 
@@ -51,11 +51,11 @@ The following hyperparameters were used during training:
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len |
53
  |:-------------:|:-----:|:-----:|:---------------:|:------:|:-------:|
54
- | 0.5165 | 1.0 | 6516 | 0.4074 | 4.3267 | 18.008 |
55
- | 0.4819 | 2.0 | 13032 | 0.4139 | 4.3977 | 18.051 |
56
- | 0.4448 | 3.0 | 19548 | 0.4067 | 4.4144 | 18.041 |
57
- | 0.3992 | 4.0 | 26064 | 0.3954 | 4.4996 | 18.016 |
58
- | 0.3604 | 5.0 | 32580 | 0.3933 | 4.5385 | 18.024 |
59
 
60
 
61
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [youdiniplays/tl-ceb-model-v2](https://huggingface.co/youdiniplays/tl-ceb-model-v2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.3606
21
+ - Bleu: 3.942
22
+ - Gen Len: 18.31
23
 
24
  ## Model description
25
 
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len |
53
  |:-------------:|:-----:|:-----:|:---------------:|:------:|:-------:|
54
+ | 0.4837 | 1.0 | 6516 | 0.3805 | 3.8228 | 18.313 |
55
+ | 0.4479 | 2.0 | 13032 | 0.3810 | 3.7662 | 18.331 |
56
+ | 0.4036 | 3.0 | 19548 | 0.3755 | 3.8306 | 18.343 |
57
+ | 0.3572 | 4.0 | 26064 | 0.3673 | 3.8996 | 18.321 |
58
+ | 0.3183 | 5.0 | 32580 | 0.3606 | 3.942 | 18.31 |
59
 
60
 
61
  ### Framework versions
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12a56e4f696daf653f7e92b1bf3b257dfe9d43458e87219ae7e2d4a450ef9f0a
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f41bb533aa86e301a1be3b65f96c1170612ae823deb447e90a35a957837ed0d
3
  size 242041896
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58cc9abef2cbdea5218d8fa701a60440ccf98ca546e5f5c51a8311045ac25026
3
  size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5199d65e10b84399e00ecb642c52de2797b550f6e71e4e1f4b276272553ca552
3
  size 484163514
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dab94ed4b71ebfd79df176ffa20e8a3d588127f93d1c43937f8527fd3a9c67d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1cd63170bb513ed1093722cd85dcb9976e6bd105066dd8d5d629488d4e94f5b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36e6f55bcc217a43fef0a794bb548420d34ff2f529f3e814b7aa83539a026216
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53e446aca6ce45dd19f0dd3bb36af3837e8ac7287ccd924c8a3f9b7a954a486f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.987722529158994,
5
  "eval_steps": 500,
6
- "global_step": 32500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,439 +11,285 @@
11
  {
12
  "epoch": 0.08,
13
  "learning_rate": 0.0009846531614487416,
14
- "loss": 0.4248,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.15,
19
  "learning_rate": 0.0009693063228974831,
20
- "loss": 0.4521,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.23,
25
  "learning_rate": 0.0009539594843462247,
26
- "loss": 0.4654,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.31,
31
  "learning_rate": 0.0009386126457949663,
32
- "loss": 0.4686,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.38,
37
  "learning_rate": 0.0009232658072437078,
38
- "loss": 0.4705,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.46,
43
  "learning_rate": 0.0009079189686924494,
44
- "loss": 0.4757,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.54,
49
  "learning_rate": 0.000892572130141191,
50
- "loss": 0.478,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.61,
55
  "learning_rate": 0.0008772252915899325,
56
- "loss": 0.4802,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.69,
61
  "learning_rate": 0.0008618784530386741,
62
- "loss": 0.4848,
63
  "step": 4500
64
  },
65
  {
66
  "epoch": 0.77,
67
  "learning_rate": 0.0008465316144874156,
68
- "loss": 0.4835,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.84,
73
  "learning_rate": 0.0008311847759361571,
74
- "loss": 0.4845,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.92,
79
  "learning_rate": 0.0008158379373848988,
80
- "loss": 0.485,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 1.0,
85
- "learning_rate": 0.0008005524861878454,
86
- "loss": 0.4837,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 1.0,
91
- "eval_bleu": 3.8228,
92
- "eval_gen_len": 18.313,
93
- "eval_loss": 0.3804944157600403,
94
- "eval_runtime": 25.2075,
95
- "eval_samples_per_second": 39.671,
96
- "eval_steps_per_second": 2.499,
97
  "step": 6516
98
  },
99
  {
100
  "epoch": 1.07,
101
- "learning_rate": 0.0007852056476365869,
102
- "loss": 0.4371,
103
  "step": 7000
104
  },
105
  {
106
  "epoch": 1.15,
107
- "learning_rate": 0.0007698588090853285,
108
- "loss": 0.4371,
109
  "step": 7500
110
  },
111
  {
112
  "epoch": 1.23,
113
- "learning_rate": 0.0007545119705340699,
114
- "loss": 0.4439,
115
  "step": 8000
116
  },
117
  {
118
  "epoch": 1.3,
119
  "learning_rate": 0.0007391651319828116,
120
- "loss": 0.4449,
121
  "step": 8500
122
  },
123
  {
124
  "epoch": 1.38,
125
  "learning_rate": 0.0007238182934315531,
126
- "loss": 0.4484,
127
  "step": 9000
128
  },
129
  {
130
  "epoch": 1.46,
131
  "learning_rate": 0.0007084714548802946,
132
- "loss": 0.4471,
133
  "step": 9500
134
  },
135
  {
136
  "epoch": 1.53,
137
  "learning_rate": 0.0006931246163290362,
138
- "loss": 0.4562,
139
  "step": 10000
140
  },
141
  {
142
  "epoch": 1.61,
143
  "learning_rate": 0.0006778084714548804,
144
- "loss": 0.4496,
145
  "step": 10500
146
  },
147
  {
148
  "epoch": 1.69,
149
  "learning_rate": 0.0006624616329036218,
150
- "loss": 0.4485,
151
  "step": 11000
152
  },
153
  {
154
  "epoch": 1.76,
155
  "learning_rate": 0.0006471147943523635,
156
- "loss": 0.4481,
157
  "step": 11500
158
  },
159
  {
160
  "epoch": 1.84,
161
  "learning_rate": 0.000631767955801105,
162
- "loss": 0.4479,
163
  "step": 12000
164
  },
165
  {
166
  "epoch": 1.92,
167
- "learning_rate": 0.000616451810926949,
168
- "loss": 0.4472,
169
  "step": 12500
170
  },
171
  {
172
  "epoch": 2.0,
173
- "learning_rate": 0.0006011049723756906,
174
- "loss": 0.4479,
175
  "step": 13000
176
  },
177
  {
178
  "epoch": 2.0,
179
- "eval_bleu": 3.7662,
180
- "eval_gen_len": 18.331,
181
- "eval_loss": 0.3810326159000397,
182
- "eval_runtime": 21.6354,
183
- "eval_samples_per_second": 46.22,
184
- "eval_steps_per_second": 2.912,
185
  "step": 13032
186
  },
187
  {
188
  "epoch": 2.07,
189
- "learning_rate": 0.0005857581338244322,
190
- "loss": 0.4063,
191
  "step": 13500
192
  },
193
  {
194
  "epoch": 2.15,
195
- "learning_rate": 0.0005704112952731737,
196
- "loss": 0.4007,
197
  "step": 14000
198
  },
199
  {
200
  "epoch": 2.23,
201
  "learning_rate": 0.0005550951503990178,
202
- "loss": 0.4059,
203
  "step": 14500
204
  },
205
  {
206
  "epoch": 2.3,
207
  "learning_rate": 0.0005397483118477594,
208
- "loss": 0.4023,
209
  "step": 15000
210
  },
211
  {
212
  "epoch": 2.38,
213
  "learning_rate": 0.0005244014732965009,
214
- "loss": 0.4061,
215
  "step": 15500
216
  },
217
  {
218
  "epoch": 2.46,
219
  "learning_rate": 0.0005090546347452425,
220
- "loss": 0.4046,
221
  "step": 16000
222
  },
223
  {
224
  "epoch": 2.53,
225
  "learning_rate": 0.0004937384898710866,
226
- "loss": 0.4082,
227
  "step": 16500
228
  },
229
  {
230
  "epoch": 2.61,
231
  "learning_rate": 0.00047839165131982814,
232
- "loss": 0.4122,
233
  "step": 17000
234
  },
235
  {
236
  "epoch": 2.69,
237
  "learning_rate": 0.0004630448127685697,
238
- "loss": 0.4076,
239
  "step": 17500
240
  },
241
  {
242
  "epoch": 2.76,
243
  "learning_rate": 0.00044769797421731125,
244
- "loss": 0.4055,
245
  "step": 18000
246
  },
247
  {
248
  "epoch": 2.84,
249
  "learning_rate": 0.00043238182934315537,
250
- "loss": 0.4003,
251
  "step": 18500
252
  },
253
  {
254
  "epoch": 2.92,
255
  "learning_rate": 0.00041703499079189687,
256
- "loss": 0.4045,
257
  "step": 19000
258
  },
259
  {
260
  "epoch": 2.99,
261
  "learning_rate": 0.0004016881522406384,
262
- "loss": 0.4036,
263
  "step": 19500
264
  },
265
  {
266
  "epoch": 3.0,
267
- "eval_bleu": 3.8306,
268
- "eval_gen_len": 18.343,
269
- "eval_loss": 0.3754543364048004,
270
- "eval_runtime": 21.8401,
271
- "eval_samples_per_second": 45.787,
272
- "eval_steps_per_second": 2.885,
273
  "step": 19548
274
  },
275
  {
276
  "epoch": 3.07,
277
  "learning_rate": 0.00038634131368938,
278
- "loss": 0.3611,
279
  "step": 20000
280
  },
281
  {
282
  "epoch": 3.15,
283
- "learning_rate": 0.0003710251688152241,
284
- "loss": 0.3608,
285
  "step": 20500
286
- },
287
- {
288
- "epoch": 3.22,
289
- "learning_rate": 0.0003556783302639656,
290
- "loss": 0.3676,
291
- "step": 21000
292
- },
293
- {
294
- "epoch": 3.3,
295
- "learning_rate": 0.00034033149171270715,
296
- "loss": 0.3643,
297
- "step": 21500
298
- },
299
- {
300
- "epoch": 3.38,
301
- "learning_rate": 0.00032498465316144876,
302
- "loss": 0.3591,
303
- "step": 22000
304
- },
305
- {
306
- "epoch": 3.45,
307
- "learning_rate": 0.0003096378146101903,
308
- "loss": 0.3651,
309
- "step": 22500
310
- },
311
- {
312
- "epoch": 3.53,
313
- "learning_rate": 0.0002943216697360344,
314
- "loss": 0.3669,
315
- "step": 23000
316
- },
317
- {
318
- "epoch": 3.61,
319
- "learning_rate": 0.00027897483118477593,
320
- "loss": 0.3609,
321
- "step": 23500
322
- },
323
- {
324
- "epoch": 3.68,
325
- "learning_rate": 0.0002636279926335175,
326
- "loss": 0.3626,
327
- "step": 24000
328
- },
329
- {
330
- "epoch": 3.76,
331
- "learning_rate": 0.0002482811540822591,
332
- "loss": 0.3625,
333
- "step": 24500
334
- },
335
- {
336
- "epoch": 3.84,
337
- "learning_rate": 0.00023296500920810313,
338
- "loss": 0.362,
339
- "step": 25000
340
- },
341
- {
342
- "epoch": 3.91,
343
- "learning_rate": 0.00021761817065684468,
344
- "loss": 0.3554,
345
- "step": 25500
346
- },
347
- {
348
- "epoch": 3.99,
349
- "learning_rate": 0.00020227133210558627,
350
- "loss": 0.3572,
351
- "step": 26000
352
- },
353
- {
354
- "epoch": 4.0,
355
- "eval_bleu": 3.8996,
356
- "eval_gen_len": 18.321,
357
- "eval_loss": 0.3672715425491333,
358
- "eval_runtime": 22.3643,
359
- "eval_samples_per_second": 44.714,
360
- "eval_steps_per_second": 2.817,
361
- "step": 26064
362
- },
363
- {
364
- "epoch": 4.07,
365
- "learning_rate": 0.00018692449355432782,
366
- "loss": 0.3273,
367
- "step": 26500
368
- },
369
- {
370
- "epoch": 4.14,
371
- "learning_rate": 0.00017160834868017188,
372
- "loss": 0.3241,
373
- "step": 27000
374
- },
375
- {
376
- "epoch": 4.22,
377
- "learning_rate": 0.00015626151012891346,
378
- "loss": 0.3266,
379
- "step": 27500
380
- },
381
- {
382
- "epoch": 4.3,
383
- "learning_rate": 0.000140914671577655,
384
- "loss": 0.3238,
385
- "step": 28000
386
- },
387
- {
388
- "epoch": 4.37,
389
- "learning_rate": 0.00012556783302639657,
390
- "loss": 0.3256,
391
- "step": 28500
392
- },
393
- {
394
- "epoch": 4.45,
395
- "learning_rate": 0.00011025168815224063,
396
- "loss": 0.3258,
397
- "step": 29000
398
- },
399
- {
400
- "epoch": 4.53,
401
- "learning_rate": 9.49048496009822e-05,
402
- "loss": 0.3269,
403
- "step": 29500
404
- },
405
- {
406
- "epoch": 4.6,
407
- "learning_rate": 7.955801104972376e-05,
408
- "loss": 0.3228,
409
- "step": 30000
410
- },
411
- {
412
- "epoch": 4.68,
413
- "learning_rate": 6.421117249846533e-05,
414
- "loss": 0.3179,
415
- "step": 30500
416
- },
417
- {
418
- "epoch": 4.76,
419
- "learning_rate": 4.8895027624309394e-05,
420
- "loss": 0.3187,
421
- "step": 31000
422
- },
423
- {
424
- "epoch": 4.83,
425
- "learning_rate": 3.3548189073050956e-05,
426
- "loss": 0.3178,
427
- "step": 31500
428
- },
429
- {
430
- "epoch": 4.91,
431
- "learning_rate": 1.820135052179251e-05,
432
- "loss": 0.3183,
433
- "step": 32000
434
- },
435
- {
436
- "epoch": 4.99,
437
- "learning_rate": 2.85451197053407e-06,
438
- "loss": 0.3183,
439
- "step": 32500
440
  }
441
  ],
442
  "logging_steps": 500,
443
  "max_steps": 32580,
444
  "num_train_epochs": 5,
445
  "save_steps": 500,
446
- "total_flos": 1.7229657373016064e+16,
447
  "trial_name": null,
448
  "trial_params": null
449
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.14610190300798,
5
  "eval_steps": 500,
6
+ "global_step": 20500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.08,
13
  "learning_rate": 0.0009846531614487416,
14
+ "loss": 0.3358,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.15,
19
  "learning_rate": 0.0009693063228974831,
20
+ "loss": 0.3723,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.23,
25
  "learning_rate": 0.0009539594843462247,
26
+ "loss": 0.3901,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.31,
31
  "learning_rate": 0.0009386126457949663,
32
+ "loss": 0.4022,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.38,
37
  "learning_rate": 0.0009232658072437078,
38
+ "loss": 0.4109,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.46,
43
  "learning_rate": 0.0009079189686924494,
44
+ "loss": 0.4132,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.54,
49
  "learning_rate": 0.000892572130141191,
50
+ "loss": 0.4197,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.61,
55
  "learning_rate": 0.0008772252915899325,
56
+ "loss": 0.4227,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.69,
61
  "learning_rate": 0.0008618784530386741,
62
+ "loss": 0.424,
63
  "step": 4500
64
  },
65
  {
66
  "epoch": 0.77,
67
  "learning_rate": 0.0008465316144874156,
68
+ "loss": 0.4228,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.84,
73
  "learning_rate": 0.0008311847759361571,
74
+ "loss": 0.4246,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.92,
79
  "learning_rate": 0.0008158379373848988,
80
+ "loss": 0.4268,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 1.0,
85
+ "learning_rate": 0.0008005217925107428,
86
+ "loss": 0.4316,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 1.0,
91
+ "eval_bleu": 4.2677,
92
+ "eval_gen_len": 18.024,
93
+ "eval_loss": 0.45959702134132385,
94
+ "eval_runtime": 23.3818,
95
+ "eval_samples_per_second": 42.768,
96
+ "eval_steps_per_second": 2.694,
97
  "step": 6516
98
  },
99
  {
100
  "epoch": 1.07,
101
+ "learning_rate": 0.0007851749539594844,
102
+ "loss": 0.3897,
103
  "step": 7000
104
  },
105
  {
106
  "epoch": 1.15,
107
+ "learning_rate": 0.0007698281154082259,
108
+ "loss": 0.3901,
109
  "step": 7500
110
  },
111
  {
112
  "epoch": 1.23,
113
+ "learning_rate": 0.0007544812768569675,
114
+ "loss": 0.3965,
115
  "step": 8000
116
  },
117
  {
118
  "epoch": 1.3,
119
  "learning_rate": 0.0007391651319828116,
120
+ "loss": 0.394,
121
  "step": 8500
122
  },
123
  {
124
  "epoch": 1.38,
125
  "learning_rate": 0.0007238182934315531,
126
+ "loss": 0.4003,
127
  "step": 9000
128
  },
129
  {
130
  "epoch": 1.46,
131
  "learning_rate": 0.0007084714548802946,
132
+ "loss": 0.403,
133
  "step": 9500
134
  },
135
  {
136
  "epoch": 1.53,
137
  "learning_rate": 0.0006931246163290362,
138
+ "loss": 0.4067,
139
  "step": 10000
140
  },
141
  {
142
  "epoch": 1.61,
143
  "learning_rate": 0.0006778084714548804,
144
+ "loss": 0.4067,
145
  "step": 10500
146
  },
147
  {
148
  "epoch": 1.69,
149
  "learning_rate": 0.0006624616329036218,
150
+ "loss": 0.4047,
151
  "step": 11000
152
  },
153
  {
154
  "epoch": 1.76,
155
  "learning_rate": 0.0006471147943523635,
156
+ "loss": 0.4032,
157
  "step": 11500
158
  },
159
  {
160
  "epoch": 1.84,
161
  "learning_rate": 0.000631767955801105,
162
+ "loss": 0.4077,
163
  "step": 12000
164
  },
165
  {
166
  "epoch": 1.92,
167
+ "learning_rate": 0.0006164825046040515,
168
+ "loss": 0.4062,
169
  "step": 12500
170
  },
171
  {
172
  "epoch": 2.0,
173
+ "learning_rate": 0.0006011356660527931,
174
+ "loss": 0.4074,
175
  "step": 13000
176
  },
177
  {
178
  "epoch": 2.0,
179
+ "eval_bleu": 4.2659,
180
+ "eval_gen_len": 18.019,
181
+ "eval_loss": 0.45938295125961304,
182
+ "eval_runtime": 24.1279,
183
+ "eval_samples_per_second": 41.446,
184
+ "eval_steps_per_second": 2.611,
185
  "step": 13032
186
  },
187
  {
188
  "epoch": 2.07,
189
+ "learning_rate": 0.0005857888275015347,
190
+ "loss": 0.3667,
191
  "step": 13500
192
  },
193
  {
194
  "epoch": 2.15,
195
+ "learning_rate": 0.0005704419889502763,
196
+ "loss": 0.3674,
197
  "step": 14000
198
  },
199
  {
200
  "epoch": 2.23,
201
  "learning_rate": 0.0005550951503990178,
202
+ "loss": 0.3749,
203
  "step": 14500
204
  },
205
  {
206
  "epoch": 2.3,
207
  "learning_rate": 0.0005397483118477594,
208
+ "loss": 0.3707,
209
  "step": 15000
210
  },
211
  {
212
  "epoch": 2.38,
213
  "learning_rate": 0.0005244014732965009,
214
+ "loss": 0.3676,
215
  "step": 15500
216
  },
217
  {
218
  "epoch": 2.46,
219
  "learning_rate": 0.0005090546347452425,
220
+ "loss": 0.3761,
221
  "step": 16000
222
  },
223
  {
224
  "epoch": 2.53,
225
  "learning_rate": 0.0004937384898710866,
226
+ "loss": 0.3795,
227
  "step": 16500
228
  },
229
  {
230
  "epoch": 2.61,
231
  "learning_rate": 0.00047839165131982814,
232
+ "loss": 0.377,
233
  "step": 17000
234
  },
235
  {
236
  "epoch": 2.69,
237
  "learning_rate": 0.0004630448127685697,
238
+ "loss": 0.3804,
239
  "step": 17500
240
  },
241
  {
242
  "epoch": 2.76,
243
  "learning_rate": 0.00044769797421731125,
244
+ "loss": 0.3824,
245
  "step": 18000
246
  },
247
  {
248
  "epoch": 2.84,
249
  "learning_rate": 0.00043238182934315537,
250
+ "loss": 0.3724,
251
  "step": 18500
252
  },
253
  {
254
  "epoch": 2.92,
255
  "learning_rate": 0.00041703499079189687,
256
+ "loss": 0.3795,
257
  "step": 19000
258
  },
259
  {
260
  "epoch": 2.99,
261
  "learning_rate": 0.0004016881522406384,
262
+ "loss": 0.3833,
263
  "step": 19500
264
  },
265
  {
266
  "epoch": 3.0,
267
+ "eval_bleu": 4.3571,
268
+ "eval_gen_len": 18.015,
269
+ "eval_loss": 0.4461449086666107,
270
+ "eval_runtime": 23.8381,
271
+ "eval_samples_per_second": 41.95,
272
+ "eval_steps_per_second": 2.643,
273
  "step": 19548
274
  },
275
  {
276
  "epoch": 3.07,
277
  "learning_rate": 0.00038634131368938,
278
+ "loss": 0.3423,
279
  "step": 20000
280
  },
281
  {
282
  "epoch": 3.15,
283
+ "learning_rate": 0.0003710558624923266,
284
+ "loss": 0.3375,
285
  "step": 20500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  }
287
  ],
288
  "logging_steps": 500,
289
  "max_steps": 32580,
290
  "num_train_epochs": 5,
291
  "save_steps": 500,
292
+ "total_flos": 1.0876549321457664e+16,
293
  "trial_name": null,
294
  "trial_params": null
295
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b58968f338c312448777ee1ccf6dbc18e49e360004c218ec824e458e768bc68
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9e5fd144c9e568216db4505e71aa6bc722764749fdf639a5e84c071e512165
3
  size 4792
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12a56e4f696daf653f7e92b1bf3b257dfe9d43458e87219ae7e2d4a450ef9f0a
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb2b0a9b9f4a155763c87b51105045e65887afa5bfbc46b443dd01f5651826fc
3
  size 242041896
runs/Jan21_14-30-28_54c898f0920f/events.out.tfevents.1705847429.54c898f0920f.211.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8421fec6ebe0b1140a36339f157ceabd74d54d6603509b8b874f4a0075277118
3
- size 17066
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8fb12633f17104e51c7e80afdea5d9c62195fbc60ccb09102029559af398b02
3
+ size 17803