youdiniplays committed on
Commit
df22fde
·
verified ·
1 Parent(s): a2febff

End of training

Browse files
README.md CHANGED
@@ -17,9 +17,9 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  This model is a fine-tuned version of [youdiniplays/tl-ceb-model-v2](https://huggingface.co/youdiniplays/tl-ceb-model-v2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.4749
21
- - Bleu: 3.8223
22
- - Gen Len: 18.208
23
 
24
  ## Model description
25
 
@@ -51,11 +51,11 @@ The following hyperparameters were used during training:
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len |
53
  |:-------------:|:-----:|:-----:|:---------------:|:------:|:-------:|
54
- | 0.6089 | 1.0 | 6516 | 0.5061 | 3.6822 | 18.235 |
55
- | 0.5687 | 2.0 | 13032 | 0.4956 | 3.7119 | 18.222 |
56
- | 0.525 | 3.0 | 19548 | 0.4869 | 3.8722 | 18.219 |
57
- | 0.488 | 4.0 | 26064 | 0.4795 | 3.7886 | 18.23 |
58
- | 0.4527 | 5.0 | 32580 | 0.4749 | 3.8223 | 18.208 |
59
 
60
 
61
  ### Framework versions
 
17
 
18
  This model is a fine-tuned version of [youdiniplays/tl-ceb-model-v2](https://huggingface.co/youdiniplays/tl-ceb-model-v2) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.4204
21
+ - Bleu: 3.642
22
+ - Gen Len: 18.299
23
 
24
  ## Model description
25
 
 
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len |
53
  |:-------------:|:-----:|:-----:|:---------------:|:------:|:-------:|
54
+ | 0.552 | 1.0 | 6516 | 0.4449 | 3.5422 | 18.301 |
55
+ | 0.5154 | 2.0 | 13032 | 0.4440 | 3.4964 | 18.336 |
56
+ | 0.4821 | 3.0 | 19548 | 0.4356 | 3.5832 | 18.309 |
57
+ | 0.4398 | 4.0 | 26064 | 0.4224 | 3.6303 | 18.305 |
58
+ | 0.4031 | 5.0 | 32580 | 0.4204 | 3.642 | 18.299 |
59
 
60
 
61
  ### Framework versions
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03ad4fae3cb368e1108a698448c2180a61f7ce3dbdd3d4a5029c8f332c57c8a6
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c43564e930260fa3abbd24ef8bc8e2019043f8ab0ea3f357c88632def606095f
3
  size 242041896
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bc180befcc96c4d6cf5aa5864822fa50dd5216ecd55530428f649f0b5f9c56d
3
  size 484163514
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5157a79613ec248ac8f7d1c8901dc35a0467ddf892a8d4eace27612fd7b9473d
3
  size 484163514
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64996838d00245e925fc4810b966d5b49a79df56c1de39902bc8d8f7a6f38f1c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af796267785b8380aa702da74322c3a84f42d9cab004eb98c1af133c0218d64a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36e6f55bcc217a43fef0a794bb548420d34ff2f529f3e814b7aa83539a026216
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b2103b3ba42c5a974b79f170c6d33390323651972016338595643ea512a6c2b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -11,431 +11,431 @@
11
  {
12
  "epoch": 0.08,
13
  "learning_rate": 0.0009846531614487416,
14
- "loss": 0.4942,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.15,
19
  "learning_rate": 0.0009693063228974831,
20
- "loss": 0.5285,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.23,
25
  "learning_rate": 0.0009539594843462247,
26
- "loss": 0.5364,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.31,
31
  "learning_rate": 0.0009386126457949663,
32
- "loss": 0.5424,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.38,
37
  "learning_rate": 0.0009232658072437078,
38
- "loss": 0.5394,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.46,
43
  "learning_rate": 0.0009079189686924494,
44
- "loss": 0.5466,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.54,
49
  "learning_rate": 0.000892572130141191,
50
- "loss": 0.5491,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.61,
55
  "learning_rate": 0.0008772252915899325,
56
- "loss": 0.5548,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.69,
61
  "learning_rate": 0.0008619091467157766,
62
- "loss": 0.5513,
63
  "step": 4500
64
  },
65
  {
66
  "epoch": 0.77,
67
  "learning_rate": 0.0008465623081645181,
68
- "loss": 0.5505,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.84,
73
  "learning_rate": 0.0008312154696132597,
74
- "loss": 0.5562,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.92,
79
  "learning_rate": 0.0008158686310620012,
80
- "loss": 0.5522,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 1.0,
85
  "learning_rate": 0.0008005524861878454,
86
- "loss": 0.552,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 1.0,
91
- "eval_bleu": 3.5422,
92
- "eval_gen_len": 18.301,
93
- "eval_loss": 0.44493627548217773,
94
- "eval_runtime": 26.1469,
95
- "eval_samples_per_second": 38.245,
96
- "eval_steps_per_second": 2.409,
97
  "step": 6516
98
  },
99
  {
100
  "epoch": 1.07,
101
  "learning_rate": 0.0007852056476365869,
102
- "loss": 0.5122,
103
  "step": 7000
104
  },
105
  {
106
  "epoch": 1.15,
107
  "learning_rate": 0.0007698588090853285,
108
- "loss": 0.511,
109
  "step": 7500
110
  },
111
  {
112
  "epoch": 1.23,
113
  "learning_rate": 0.0007545119705340699,
114
- "loss": 0.5156,
115
  "step": 8000
116
  },
117
  {
118
  "epoch": 1.3,
119
  "learning_rate": 0.000739195825659914,
120
- "loss": 0.5192,
121
  "step": 8500
122
  },
123
  {
124
  "epoch": 1.38,
125
  "learning_rate": 0.0007238489871086557,
126
- "loss": 0.5172,
127
  "step": 9000
128
  },
129
  {
130
  "epoch": 1.46,
131
- "learning_rate": 0.0007085021485573971,
132
- "loss": 0.5211,
133
  "step": 9500
134
  },
135
  {
136
  "epoch": 1.53,
137
- "learning_rate": 0.0006931553100061387,
138
- "loss": 0.5175,
139
  "step": 10000
140
  },
141
  {
142
  "epoch": 1.61,
143
  "learning_rate": 0.0006778391651319828,
144
- "loss": 0.5182,
145
  "step": 10500
146
  },
147
  {
148
  "epoch": 1.69,
149
  "learning_rate": 0.0006624923265807244,
150
- "loss": 0.5177,
151
  "step": 11000
152
  },
153
  {
154
  "epoch": 1.76,
155
  "learning_rate": 0.0006471454880294659,
156
- "loss": 0.5141,
157
  "step": 11500
158
  },
159
  {
160
  "epoch": 1.84,
161
  "learning_rate": 0.0006317986494782076,
162
- "loss": 0.5207,
163
  "step": 12000
164
  },
165
  {
166
  "epoch": 1.92,
167
  "learning_rate": 0.000616451810926949,
168
- "loss": 0.5154,
169
  "step": 12500
170
  },
171
  {
172
  "epoch": 2.0,
173
- "learning_rate": 0.0006011356660527931,
174
- "loss": 0.5154,
175
  "step": 13000
176
  },
177
  {
178
  "epoch": 2.0,
179
- "eval_bleu": 3.4964,
180
- "eval_gen_len": 18.336,
181
- "eval_loss": 0.44400554895401,
182
- "eval_runtime": 24.0346,
183
- "eval_samples_per_second": 41.607,
184
- "eval_steps_per_second": 2.621,
185
  "step": 13032
186
  },
187
  {
188
  "epoch": 2.07,
189
  "learning_rate": 0.0005857888275015347,
190
- "loss": 0.4746,
191
  "step": 13500
192
  },
193
  {
194
  "epoch": 2.15,
195
  "learning_rate": 0.0005704419889502763,
196
- "loss": 0.4745,
197
  "step": 14000
198
  },
199
  {
200
  "epoch": 2.23,
201
  "learning_rate": 0.0005550951503990178,
202
- "loss": 0.4775,
203
  "step": 14500
204
  },
205
  {
206
  "epoch": 2.3,
207
- "learning_rate": 0.0005397790055248619,
208
- "loss": 0.4819,
209
  "step": 15000
210
  },
211
  {
212
  "epoch": 2.38,
213
- "learning_rate": 0.0005244321669736034,
214
- "loss": 0.4803,
215
  "step": 15500
216
  },
217
  {
218
  "epoch": 2.46,
219
  "learning_rate": 0.000509085328422345,
220
- "loss": 0.4779,
221
  "step": 16000
222
  },
223
  {
224
  "epoch": 2.53,
225
  "learning_rate": 0.0004937384898710866,
226
- "loss": 0.4778,
227
  "step": 16500
228
  },
229
  {
230
  "epoch": 2.61,
231
  "learning_rate": 0.00047839165131982814,
232
- "loss": 0.4773,
233
  "step": 17000
234
  },
235
  {
236
  "epoch": 2.69,
237
- "learning_rate": 0.0004630755064456722,
238
- "loss": 0.4721,
239
  "step": 17500
240
  },
241
  {
242
  "epoch": 2.76,
243
  "learning_rate": 0.00044772866789441376,
244
- "loss": 0.4744,
245
  "step": 18000
246
  },
247
  {
248
  "epoch": 2.84,
249
  "learning_rate": 0.00043238182934315537,
250
- "loss": 0.4812,
251
  "step": 18500
252
  },
253
  {
254
  "epoch": 2.92,
255
  "learning_rate": 0.00041703499079189687,
256
- "loss": 0.4757,
257
  "step": 19000
258
  },
259
  {
260
  "epoch": 2.99,
261
- "learning_rate": 0.00040171884591774093,
262
- "loss": 0.4821,
263
  "step": 19500
264
  },
265
  {
266
  "epoch": 3.0,
267
- "eval_bleu": 3.5832,
268
- "eval_gen_len": 18.309,
269
- "eval_loss": 0.4355914294719696,
270
- "eval_runtime": 25.3154,
271
- "eval_samples_per_second": 39.502,
272
- "eval_steps_per_second": 2.489,
273
  "step": 19548
274
  },
275
  {
276
  "epoch": 3.07,
277
  "learning_rate": 0.00038637200736648254,
278
- "loss": 0.4381,
279
  "step": 20000
280
  },
281
  {
282
  "epoch": 3.15,
283
  "learning_rate": 0.0003710251688152241,
284
- "loss": 0.4407,
285
  "step": 20500
286
  },
287
  {
288
  "epoch": 3.22,
289
  "learning_rate": 0.0003556783302639656,
290
- "loss": 0.4427,
291
  "step": 21000
292
  },
293
  {
294
  "epoch": 3.3,
295
  "learning_rate": 0.00034033149171270715,
296
- "loss": 0.4375,
297
  "step": 21500
298
  },
299
  {
300
  "epoch": 3.38,
301
- "learning_rate": 0.00032498465316144876,
302
- "loss": 0.4418,
303
  "step": 22000
304
  },
305
  {
306
  "epoch": 3.45,
307
- "learning_rate": 0.0003096378146101903,
308
- "loss": 0.4379,
309
  "step": 22500
310
  },
311
  {
312
  "epoch": 3.53,
313
- "learning_rate": 0.00029429097605893187,
314
- "loss": 0.4412,
315
  "step": 23000
316
  },
317
  {
318
  "epoch": 3.61,
319
  "learning_rate": 0.00027897483118477593,
320
- "loss": 0.4382,
321
  "step": 23500
322
  },
323
  {
324
  "epoch": 3.68,
325
  "learning_rate": 0.00026365868631062,
326
- "loss": 0.4397,
327
  "step": 24000
328
  },
329
  {
330
  "epoch": 3.76,
331
  "learning_rate": 0.0002483118477593616,
332
- "loss": 0.4368,
333
  "step": 24500
334
  },
335
  {
336
  "epoch": 3.84,
337
  "learning_rate": 0.00023296500920810313,
338
- "loss": 0.4377,
339
  "step": 25000
340
  },
341
  {
342
  "epoch": 3.91,
343
  "learning_rate": 0.00021761817065684468,
344
- "loss": 0.4387,
345
  "step": 25500
346
  },
347
  {
348
  "epoch": 3.99,
349
  "learning_rate": 0.00020230202578268877,
350
- "loss": 0.4398,
351
  "step": 26000
352
  },
353
  {
354
  "epoch": 4.0,
355
- "eval_bleu": 3.6303,
356
- "eval_gen_len": 18.305,
357
- "eval_loss": 0.4224185645580292,
358
- "eval_runtime": 23.5328,
359
- "eval_samples_per_second": 42.494,
360
- "eval_steps_per_second": 2.677,
361
  "step": 26064
362
  },
363
  {
364
  "epoch": 4.07,
365
  "learning_rate": 0.00018695518723143033,
366
- "loss": 0.4097,
367
  "step": 26500
368
  },
369
  {
370
  "epoch": 4.14,
371
  "learning_rate": 0.00017160834868017188,
372
- "loss": 0.4057,
373
  "step": 27000
374
  },
375
  {
376
  "epoch": 4.22,
377
  "learning_rate": 0.00015626151012891346,
378
- "loss": 0.4058,
379
  "step": 27500
380
  },
381
  {
382
  "epoch": 4.3,
383
  "learning_rate": 0.00014094536525475752,
384
- "loss": 0.4049,
385
  "step": 28000
386
  },
387
  {
388
  "epoch": 4.37,
389
  "learning_rate": 0.00012559852670349908,
390
- "loss": 0.4058,
391
  "step": 28500
392
  },
393
  {
394
  "epoch": 4.45,
395
  "learning_rate": 0.00011025168815224063,
396
- "loss": 0.4055,
397
  "step": 29000
398
  },
399
  {
400
  "epoch": 4.53,
401
  "learning_rate": 9.49048496009822e-05,
402
- "loss": 0.405,
403
  "step": 29500
404
  },
405
  {
406
  "epoch": 4.6,
407
  "learning_rate": 7.955801104972376e-05,
408
- "loss": 0.4032,
409
  "step": 30000
410
  },
411
  {
412
  "epoch": 4.68,
413
  "learning_rate": 6.424186617556783e-05,
414
- "loss": 0.4089,
415
  "step": 30500
416
  },
417
  {
418
  "epoch": 4.76,
419
  "learning_rate": 4.8895027624309394e-05,
420
- "loss": 0.4057,
421
  "step": 31000
422
  },
423
  {
424
  "epoch": 4.83,
425
  "learning_rate": 3.3548189073050956e-05,
426
- "loss": 0.3978,
427
  "step": 31500
428
  },
429
  {
430
  "epoch": 4.91,
431
  "learning_rate": 1.820135052179251e-05,
432
- "loss": 0.4011,
433
  "step": 32000
434
  },
435
  {
436
  "epoch": 4.99,
437
- "learning_rate": 2.85451197053407e-06,
438
- "loss": 0.4031,
439
  "step": 32500
440
  }
441
  ],
@@ -443,7 +443,7 @@
443
  "max_steps": 32580,
444
  "num_train_epochs": 5,
445
  "save_steps": 500,
446
- "total_flos": 1.7234743264149504e+16,
447
  "trial_name": null,
448
  "trial_params": null
449
  }
 
11
  {
12
  "epoch": 0.08,
13
  "learning_rate": 0.0009846531614487416,
14
+ "loss": 0.5762,
15
  "step": 500
16
  },
17
  {
18
  "epoch": 0.15,
19
  "learning_rate": 0.0009693063228974831,
20
+ "loss": 0.6042,
21
  "step": 1000
22
  },
23
  {
24
  "epoch": 0.23,
25
  "learning_rate": 0.0009539594843462247,
26
+ "loss": 0.6024,
27
  "step": 1500
28
  },
29
  {
30
  "epoch": 0.31,
31
  "learning_rate": 0.0009386126457949663,
32
+ "loss": 0.6057,
33
  "step": 2000
34
  },
35
  {
36
  "epoch": 0.38,
37
  "learning_rate": 0.0009232658072437078,
38
+ "loss": 0.6143,
39
  "step": 2500
40
  },
41
  {
42
  "epoch": 0.46,
43
  "learning_rate": 0.0009079189686924494,
44
+ "loss": 0.6081,
45
  "step": 3000
46
  },
47
  {
48
  "epoch": 0.54,
49
  "learning_rate": 0.000892572130141191,
50
+ "loss": 0.611,
51
  "step": 3500
52
  },
53
  {
54
  "epoch": 0.61,
55
  "learning_rate": 0.0008772252915899325,
56
+ "loss": 0.6104,
57
  "step": 4000
58
  },
59
  {
60
  "epoch": 0.69,
61
  "learning_rate": 0.0008619091467157766,
62
+ "loss": 0.6154,
63
  "step": 4500
64
  },
65
  {
66
  "epoch": 0.77,
67
  "learning_rate": 0.0008465623081645181,
68
+ "loss": 0.6151,
69
  "step": 5000
70
  },
71
  {
72
  "epoch": 0.84,
73
  "learning_rate": 0.0008312154696132597,
74
+ "loss": 0.611,
75
  "step": 5500
76
  },
77
  {
78
  "epoch": 0.92,
79
  "learning_rate": 0.0008158686310620012,
80
+ "loss": 0.617,
81
  "step": 6000
82
  },
83
  {
84
  "epoch": 1.0,
85
  "learning_rate": 0.0008005524861878454,
86
+ "loss": 0.6089,
87
  "step": 6500
88
  },
89
  {
90
  "epoch": 1.0,
91
+ "eval_bleu": 3.6822,
92
+ "eval_gen_len": 18.235,
93
+ "eval_loss": 0.506125271320343,
94
+ "eval_runtime": 27.1471,
95
+ "eval_samples_per_second": 36.836,
96
+ "eval_steps_per_second": 2.321,
97
  "step": 6516
98
  },
99
  {
100
  "epoch": 1.07,
101
  "learning_rate": 0.0007852056476365869,
102
+ "loss": 0.5657,
103
  "step": 7000
104
  },
105
  {
106
  "epoch": 1.15,
107
  "learning_rate": 0.0007698588090853285,
108
+ "loss": 0.5623,
109
  "step": 7500
110
  },
111
  {
112
  "epoch": 1.23,
113
  "learning_rate": 0.0007545119705340699,
114
+ "loss": 0.5669,
115
  "step": 8000
116
  },
117
  {
118
  "epoch": 1.3,
119
  "learning_rate": 0.000739195825659914,
120
+ "loss": 0.5793,
121
  "step": 8500
122
  },
123
  {
124
  "epoch": 1.38,
125
  "learning_rate": 0.0007238489871086557,
126
+ "loss": 0.5743,
127
  "step": 9000
128
  },
129
  {
130
  "epoch": 1.46,
131
+ "learning_rate": 0.0007085328422344997,
132
+ "loss": 0.5722,
133
  "step": 9500
134
  },
135
  {
136
  "epoch": 1.53,
137
+ "learning_rate": 0.0006931860036832413,
138
+ "loss": 0.5749,
139
  "step": 10000
140
  },
141
  {
142
  "epoch": 1.61,
143
  "learning_rate": 0.0006778391651319828,
144
+ "loss": 0.5776,
145
  "step": 10500
146
  },
147
  {
148
  "epoch": 1.69,
149
  "learning_rate": 0.0006624923265807244,
150
+ "loss": 0.5694,
151
  "step": 11000
152
  },
153
  {
154
  "epoch": 1.76,
155
  "learning_rate": 0.0006471454880294659,
156
+ "loss": 0.5716,
157
  "step": 11500
158
  },
159
  {
160
  "epoch": 1.84,
161
  "learning_rate": 0.0006317986494782076,
162
+ "loss": 0.5665,
163
  "step": 12000
164
  },
165
  {
166
  "epoch": 1.92,
167
  "learning_rate": 0.000616451810926949,
168
+ "loss": 0.5671,
169
  "step": 12500
170
  },
171
  {
172
  "epoch": 2.0,
173
+ "learning_rate": 0.0006011049723756906,
174
+ "loss": 0.5687,
175
  "step": 13000
176
  },
177
  {
178
  "epoch": 2.0,
179
+ "eval_bleu": 3.7119,
180
+ "eval_gen_len": 18.222,
181
+ "eval_loss": 0.495604932308197,
182
+ "eval_runtime": 23.8741,
183
+ "eval_samples_per_second": 41.886,
184
+ "eval_steps_per_second": 2.639,
185
  "step": 13032
186
  },
187
  {
188
  "epoch": 2.07,
189
  "learning_rate": 0.0005857888275015347,
190
+ "loss": 0.5307,
191
  "step": 13500
192
  },
193
  {
194
  "epoch": 2.15,
195
  "learning_rate": 0.0005704419889502763,
196
+ "loss": 0.5325,
197
  "step": 14000
198
  },
199
  {
200
  "epoch": 2.23,
201
  "learning_rate": 0.0005550951503990178,
202
+ "loss": 0.5353,
203
  "step": 14500
204
  },
205
  {
206
  "epoch": 2.3,
207
+ "learning_rate": 0.0005397483118477594,
208
+ "loss": 0.5289,
209
  "step": 15000
210
  },
211
  {
212
  "epoch": 2.38,
213
+ "learning_rate": 0.0005244014732965009,
214
+ "loss": 0.5321,
215
  "step": 15500
216
  },
217
  {
218
  "epoch": 2.46,
219
  "learning_rate": 0.000509085328422345,
220
+ "loss": 0.5271,
221
  "step": 16000
222
  },
223
  {
224
  "epoch": 2.53,
225
  "learning_rate": 0.0004937384898710866,
226
+ "loss": 0.5295,
227
  "step": 16500
228
  },
229
  {
230
  "epoch": 2.61,
231
  "learning_rate": 0.00047839165131982814,
232
+ "loss": 0.524,
233
  "step": 17000
234
  },
235
  {
236
  "epoch": 2.69,
237
+ "learning_rate": 0.0004630448127685697,
238
+ "loss": 0.526,
239
  "step": 17500
240
  },
241
  {
242
  "epoch": 2.76,
243
  "learning_rate": 0.00044772866789441376,
244
+ "loss": 0.5323,
245
  "step": 18000
246
  },
247
  {
248
  "epoch": 2.84,
249
  "learning_rate": 0.00043238182934315537,
250
+ "loss": 0.5236,
251
  "step": 18500
252
  },
253
  {
254
  "epoch": 2.92,
255
  "learning_rate": 0.00041703499079189687,
256
+ "loss": 0.5266,
257
  "step": 19000
258
  },
259
  {
260
  "epoch": 2.99,
261
+ "learning_rate": 0.0004016881522406384,
262
+ "loss": 0.525,
263
  "step": 19500
264
  },
265
  {
266
  "epoch": 3.0,
267
+ "eval_bleu": 3.8722,
268
+ "eval_gen_len": 18.219,
269
+ "eval_loss": 0.4868793785572052,
270
+ "eval_runtime": 23.2067,
271
+ "eval_samples_per_second": 43.091,
272
+ "eval_steps_per_second": 2.715,
273
  "step": 19548
274
  },
275
  {
276
  "epoch": 3.07,
277
  "learning_rate": 0.00038637200736648254,
278
+ "loss": 0.4934,
279
  "step": 20000
280
  },
281
  {
282
  "epoch": 3.15,
283
  "learning_rate": 0.0003710251688152241,
284
+ "loss": 0.4869,
285
  "step": 20500
286
  },
287
  {
288
  "epoch": 3.22,
289
  "learning_rate": 0.0003556783302639656,
290
+ "loss": 0.4917,
291
  "step": 21000
292
  },
293
  {
294
  "epoch": 3.3,
295
  "learning_rate": 0.00034033149171270715,
296
+ "loss": 0.4841,
297
  "step": 21500
298
  },
299
  {
300
  "epoch": 3.38,
301
+ "learning_rate": 0.00032501534683855126,
302
+ "loss": 0.4906,
303
  "step": 22000
304
  },
305
  {
306
  "epoch": 3.45,
307
+ "learning_rate": 0.0003096685082872928,
308
+ "loss": 0.4884,
309
  "step": 22500
310
  },
311
  {
312
  "epoch": 3.53,
313
+ "learning_rate": 0.0002943216697360344,
314
+ "loss": 0.486,
315
  "step": 23000
316
  },
317
  {
318
  "epoch": 3.61,
319
  "learning_rate": 0.00027897483118477593,
320
+ "loss": 0.4932,
321
  "step": 23500
322
  },
323
  {
324
  "epoch": 3.68,
325
  "learning_rate": 0.00026365868631062,
326
+ "loss": 0.4951,
327
  "step": 24000
328
  },
329
  {
330
  "epoch": 3.76,
331
  "learning_rate": 0.0002483118477593616,
332
+ "loss": 0.4896,
333
  "step": 24500
334
  },
335
  {
336
  "epoch": 3.84,
337
  "learning_rate": 0.00023296500920810313,
338
+ "loss": 0.4832,
339
  "step": 25000
340
  },
341
  {
342
  "epoch": 3.91,
343
  "learning_rate": 0.00021761817065684468,
344
+ "loss": 0.4874,
345
  "step": 25500
346
  },
347
  {
348
  "epoch": 3.99,
349
  "learning_rate": 0.00020230202578268877,
350
+ "loss": 0.488,
351
  "step": 26000
352
  },
353
  {
354
  "epoch": 4.0,
355
+ "eval_bleu": 3.7886,
356
+ "eval_gen_len": 18.23,
357
+ "eval_loss": 0.4795362949371338,
358
+ "eval_runtime": 23.2422,
359
+ "eval_samples_per_second": 43.025,
360
+ "eval_steps_per_second": 2.711,
361
  "step": 26064
362
  },
363
  {
364
  "epoch": 4.07,
365
  "learning_rate": 0.00018695518723143033,
366
+ "loss": 0.4583,
367
  "step": 26500
368
  },
369
  {
370
  "epoch": 4.14,
371
  "learning_rate": 0.00017160834868017188,
372
+ "loss": 0.4495,
373
  "step": 27000
374
  },
375
  {
376
  "epoch": 4.22,
377
  "learning_rate": 0.00015626151012891346,
378
+ "loss": 0.454,
379
  "step": 27500
380
  },
381
  {
382
  "epoch": 4.3,
383
  "learning_rate": 0.00014094536525475752,
384
+ "loss": 0.4547,
385
  "step": 28000
386
  },
387
  {
388
  "epoch": 4.37,
389
  "learning_rate": 0.00012559852670349908,
390
+ "loss": 0.4588,
391
  "step": 28500
392
  },
393
  {
394
  "epoch": 4.45,
395
  "learning_rate": 0.00011025168815224063,
396
+ "loss": 0.453,
397
  "step": 29000
398
  },
399
  {
400
  "epoch": 4.53,
401
  "learning_rate": 9.49048496009822e-05,
402
+ "loss": 0.4549,
403
  "step": 29500
404
  },
405
  {
406
  "epoch": 4.6,
407
  "learning_rate": 7.955801104972376e-05,
408
+ "loss": 0.4533,
409
  "step": 30000
410
  },
411
  {
412
  "epoch": 4.68,
413
  "learning_rate": 6.424186617556783e-05,
414
+ "loss": 0.4495,
415
  "step": 30500
416
  },
417
  {
418
  "epoch": 4.76,
419
  "learning_rate": 4.8895027624309394e-05,
420
+ "loss": 0.4555,
421
  "step": 31000
422
  },
423
  {
424
  "epoch": 4.83,
425
  "learning_rate": 3.3548189073050956e-05,
426
+ "loss": 0.4563,
427
  "step": 31500
428
  },
429
  {
430
  "epoch": 4.91,
431
  "learning_rate": 1.820135052179251e-05,
432
+ "loss": 0.4563,
433
  "step": 32000
434
  },
435
  {
436
  "epoch": 4.99,
437
+ "learning_rate": 2.885205647636587e-06,
438
+ "loss": 0.4527,
439
  "step": 32500
440
  }
441
  ],
 
443
  "max_steps": 32580,
444
  "num_train_epochs": 5,
445
  "save_steps": 500,
446
+ "total_flos": 1.7233985138589696e+16,
447
  "trial_name": null,
448
  "trial_params": null
449
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e769153e1530bd8e47a2f0b7a7df8c3974ef6c0591382fcb5397ece912b6194
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbbc9f6fd02ff4264ade394c37847b822a605f5a9ef106ad63ba6639ed6bf7c2
3
  size 4792
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03ad4fae3cb368e1108a698448c2180a61f7ce3dbdd3d4a5029c8f332c57c8a6
3
  size 242041896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7939d439dd6b3e66709084b0c9d94ee80069463f1d5f6e3750825ada2fa1b763
3
  size 242041896
runs/Jan20_14-02-20_b4d7770f8402/events.out.tfevents.1705759341.b4d7770f8402.183.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3edab546e5d925dcee711a42b20381a67d858e71be16204864be3da1412a165d
3
- size 17066
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74504e642541ceb221a3b10a3d5ce6d734f7ac4b7a30906a5ee0ec11deab1a47
3
+ size 17803