yuweiiizz commited on
Commit
169a268
·
verified ·
1 Parent(s): e35392a

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -45,7 +45,7 @@
45
  "scale_embedding": false,
46
  "suppress_tokens": [],
47
  "torch_dtype": "float32",
48
- "transformers_version": "4.40.0",
49
  "use_cache": false,
50
  "use_weighted_layer_sum": false,
51
  "vocab_size": 51865
 
45
  "scale_embedding": false,
46
  "suppress_tokens": [],
47
  "torch_dtype": "float32",
48
+ "transformers_version": "4.40.1",
49
  "use_cache": false,
50
  "use_weighted_layer_sum": false,
51
  "vocab_size": 51865
last-checkpoint/generation_config.json CHANGED
@@ -262,5 +262,5 @@
262
  "transcribe": 50359,
263
  "translate": 50358
264
  },
265
- "transformers_version": "4.40.0"
266
  }
 
262
  "transcribe": 50359,
263
  "translate": 50358
264
  },
265
+ "transformers_version": "4.40.1"
266
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b810b7f0fe83995300521a646137d370d36d549e50472fcf6c1da2a177919ec3
3
  size 966995080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf5c2c11f76e4faaf0e73a2e3d071ca3a84e4e462cec1f6ed89d2fd5ee8fcff9
3
  size 966995080
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a70bf563a60fc19871fd719056a40461333af72260151f6bade95b428cbc93c
3
  size 1925064044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5440cc8dfb8091af2b204e8cabf6857c82af16c9a6dd368570a677a0b7da3055
3
  size 1925064044
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7c93a397e9322e49f4ed50d18f810eaf2c39ecdb2985c95d248cd7a2fa2aa47
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51bea2a28f129bf069e5a02ae44edfec13f51109373355626e9228154b0d41f5
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd54311344b834087a4b1c20d06544579c7f43d33908960b6b3b61734dbde46d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2d378b4bd36bf44babbc26f567786bedc31fd4875330753b97c0f677a367397
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1168 +1,301 @@
1
  {
2
- "best_metric": 29.080310880829014,
3
- "best_model_checkpoint": "./whisper-small-taiwanese/checkpoint-4000",
4
- "epoch": 2.5806451612903225,
5
  "eval_steps": 1000,
6
- "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.016129032258064516,
13
- "grad_norm": 229.72373962402344,
14
  "learning_rate": 5.376344086021506e-07,
15
- "loss": 7.9674,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.03225806451612903,
20
- "grad_norm": 50.686302185058594,
21
  "learning_rate": 1.0752688172043011e-06,
22
- "loss": 5.7026,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.04838709677419355,
27
- "grad_norm": 32.474510192871094,
28
  "learning_rate": 1.6129032258064516e-06,
29
- "loss": 3.7065,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.06451612903225806,
34
- "grad_norm": 30.973085403442383,
35
  "learning_rate": 2.1505376344086023e-06,
36
- "loss": 2.6906,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.08064516129032258,
41
- "grad_norm": 28.370464324951172,
42
  "learning_rate": 2.688172043010753e-06,
43
- "loss": 2.3087,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.0967741935483871,
48
- "grad_norm": 29.259729385375977,
49
  "learning_rate": 3.225806451612903e-06,
50
- "loss": 2.0589,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.11290322580645161,
55
- "grad_norm": 29.08380699157715,
56
  "learning_rate": 3.763440860215054e-06,
57
- "loss": 1.8731,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.12903225806451613,
62
- "grad_norm": 22.745624542236328,
63
  "learning_rate": 4.3010752688172045e-06,
64
- "loss": 1.5257,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.14516129032258066,
69
- "grad_norm": 16.694580078125,
70
  "learning_rate": 4.838709677419355e-06,
71
- "loss": 1.4005,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.16129032258064516,
76
- "grad_norm": 18.02663803100586,
77
  "learning_rate": 5.376344086021506e-06,
78
- "loss": 1.3308,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.1774193548387097,
83
- "grad_norm": 14.609949111938477,
84
  "learning_rate": 5.9139784946236566e-06,
85
- "loss": 1.2143,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.1935483870967742,
90
- "grad_norm": 16.727527618408203,
91
  "learning_rate": 6.451612903225806e-06,
92
- "loss": 1.1925,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.20967741935483872,
97
- "grad_norm": 15.254867553710938,
98
  "learning_rate": 6.989247311827958e-06,
99
- "loss": 1.1482,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.22580645161290322,
104
- "grad_norm": 16.119234085083008,
105
  "learning_rate": 7.526881720430108e-06,
106
- "loss": 1.0825,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.24193548387096775,
111
- "grad_norm": 13.577301025390625,
112
  "learning_rate": 8.064516129032258e-06,
113
- "loss": 1.099,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.25806451612903225,
118
- "grad_norm": 15.483856201171875,
119
  "learning_rate": 8.602150537634409e-06,
120
- "loss": 1.0654,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.27419354838709675,
125
- "grad_norm": 15.842108726501465,
126
  "learning_rate": 9.13978494623656e-06,
127
- "loss": 0.9747,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.2903225806451613,
132
- "grad_norm": 13.010821342468262,
133
  "learning_rate": 9.67741935483871e-06,
134
- "loss": 0.9679,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.3064516129032258,
139
- "grad_norm": 15.315924644470215,
140
  "learning_rate": 9.97610513739546e-06,
141
- "loss": 0.9001,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.3225806451612903,
146
- "grad_norm": 15.252881050109863,
147
  "learning_rate": 9.916367980884111e-06,
148
- "loss": 0.9019,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.3387096774193548,
153
- "grad_norm": 15.013239860534668,
154
  "learning_rate": 9.856630824372761e-06,
155
- "loss": 0.9167,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 0.3548387096774194,
160
- "grad_norm": 12.44570255279541,
161
  "learning_rate": 9.79689366786141e-06,
162
- "loss": 0.8644,
163
  "step": 550
164
  },
165
  {
166
  "epoch": 0.3709677419354839,
167
- "grad_norm": 13.266128540039062,
168
  "learning_rate": 9.737156511350062e-06,
169
- "loss": 0.8954,
170
  "step": 575
171
  },
172
  {
173
  "epoch": 0.3870967741935484,
174
- "grad_norm": 13.153059005737305,
175
  "learning_rate": 9.67741935483871e-06,
176
- "loss": 0.8364,
177
  "step": 600
178
  },
179
  {
180
  "epoch": 0.4032258064516129,
181
- "grad_norm": 15.848042488098145,
182
  "learning_rate": 9.61768219832736e-06,
183
- "loss": 0.8667,
184
  "step": 625
185
  },
186
  {
187
  "epoch": 0.41935483870967744,
188
- "grad_norm": 13.445392608642578,
189
  "learning_rate": 9.557945041816011e-06,
190
- "loss": 0.8155,
191
  "step": 650
192
  },
193
  {
194
  "epoch": 0.43548387096774194,
195
- "grad_norm": 13.883005142211914,
196
  "learning_rate": 9.49820788530466e-06,
197
- "loss": 0.8446,
198
  "step": 675
199
  },
200
  {
201
  "epoch": 0.45161290322580644,
202
- "grad_norm": 13.22021198272705,
203
  "learning_rate": 9.43847072879331e-06,
204
- "loss": 0.8255,
205
  "step": 700
206
  },
207
  {
208
  "epoch": 0.46774193548387094,
209
- "grad_norm": 14.165966987609863,
210
  "learning_rate": 9.37873357228196e-06,
211
- "loss": 0.8034,
212
  "step": 725
213
  },
214
  {
215
  "epoch": 0.4838709677419355,
216
- "grad_norm": 12.320103645324707,
217
  "learning_rate": 9.31899641577061e-06,
218
- "loss": 0.7439,
219
  "step": 750
220
  },
221
  {
222
  "epoch": 0.5,
223
- "grad_norm": 13.079719543457031,
224
  "learning_rate": 9.25925925925926e-06,
225
- "loss": 0.7574,
226
  "step": 775
227
  },
228
  {
229
  "epoch": 0.5161290322580645,
230
- "grad_norm": 12.108668327331543,
231
  "learning_rate": 9.19952210274791e-06,
232
- "loss": 0.7844,
233
  "step": 800
234
  },
235
  {
236
  "epoch": 0.532258064516129,
237
- "grad_norm": 12.974024772644043,
238
  "learning_rate": 9.13978494623656e-06,
239
- "loss": 0.783,
240
  "step": 825
241
  },
242
  {
243
  "epoch": 0.5483870967741935,
244
- "grad_norm": 14.670340538024902,
245
  "learning_rate": 9.08004778972521e-06,
246
- "loss": 0.7084,
247
  "step": 850
248
  },
249
  {
250
  "epoch": 0.5645161290322581,
251
- "grad_norm": 15.380485534667969,
252
  "learning_rate": 9.02031063321386e-06,
253
- "loss": 0.7624,
254
  "step": 875
255
  },
256
  {
257
  "epoch": 0.5806451612903226,
258
- "grad_norm": 14.00020694732666,
259
  "learning_rate": 8.96057347670251e-06,
260
- "loss": 0.7031,
261
  "step": 900
262
  },
263
  {
264
  "epoch": 0.5967741935483871,
265
- "grad_norm": 11.307880401611328,
266
  "learning_rate": 8.90083632019116e-06,
267
- "loss": 0.6797,
268
  "step": 925
269
  },
270
  {
271
  "epoch": 0.6129032258064516,
272
- "grad_norm": 14.682994842529297,
273
  "learning_rate": 8.84109916367981e-06,
274
- "loss": 0.6679,
275
  "step": 950
276
  },
277
  {
278
  "epoch": 0.6290322580645161,
279
- "grad_norm": 14.844277381896973,
280
  "learning_rate": 8.78136200716846e-06,
281
- "loss": 0.7079,
282
  "step": 975
283
  },
284
  {
285
  "epoch": 0.6451612903225806,
286
- "grad_norm": 14.752099990844727,
287
  "learning_rate": 8.72162485065711e-06,
288
- "loss": 0.6757,
289
  "step": 1000
290
  },
291
  {
292
  "epoch": 0.6451612903225806,
293
- "eval_cer": 43.3160621761658,
294
- "eval_loss": 0.6164932250976562,
295
- "eval_runtime": 945.7136,
296
- "eval_samples_per_second": 2.412,
297
- "eval_steps_per_second": 0.302,
298
  "step": 1000
299
- },
300
- {
301
- "epoch": 0.6612903225806451,
302
- "grad_norm": 12.722296714782715,
303
- "learning_rate": 8.66188769414576e-06,
304
- "loss": 0.6687,
305
- "step": 1025
306
- },
307
- {
308
- "epoch": 0.6774193548387096,
309
- "grad_norm": 19.90687370300293,
310
- "learning_rate": 8.602150537634409e-06,
311
- "loss": 0.6705,
312
- "step": 1050
313
- },
314
- {
315
- "epoch": 0.6935483870967742,
316
- "grad_norm": 12.18825626373291,
317
- "learning_rate": 8.54241338112306e-06,
318
- "loss": 0.6708,
319
- "step": 1075
320
- },
321
- {
322
- "epoch": 0.7096774193548387,
323
- "grad_norm": 12.795165061950684,
324
- "learning_rate": 8.48267622461171e-06,
325
- "loss": 0.6697,
326
- "step": 1100
327
- },
328
- {
329
- "epoch": 0.7258064516129032,
330
- "grad_norm": 12.366995811462402,
331
- "learning_rate": 8.422939068100358e-06,
332
- "loss": 0.6053,
333
- "step": 1125
334
- },
335
- {
336
- "epoch": 0.7419354838709677,
337
- "grad_norm": 11.671553611755371,
338
- "learning_rate": 8.36320191158901e-06,
339
- "loss": 0.6733,
340
- "step": 1150
341
- },
342
- {
343
- "epoch": 0.7580645161290323,
344
- "grad_norm": 14.575772285461426,
345
- "learning_rate": 8.303464755077659e-06,
346
- "loss": 0.6173,
347
- "step": 1175
348
- },
349
- {
350
- "epoch": 0.7741935483870968,
351
- "grad_norm": 10.553525924682617,
352
- "learning_rate": 8.24372759856631e-06,
353
- "loss": 0.6529,
354
- "step": 1200
355
- },
356
- {
357
- "epoch": 0.7903225806451613,
358
- "grad_norm": 10.87187671661377,
359
- "learning_rate": 8.18399044205496e-06,
360
- "loss": 0.6273,
361
- "step": 1225
362
- },
363
- {
364
- "epoch": 0.8064516129032258,
365
- "grad_norm": 13.998808860778809,
366
- "learning_rate": 8.124253285543608e-06,
367
- "loss": 0.6209,
368
- "step": 1250
369
- },
370
- {
371
- "epoch": 0.8225806451612904,
372
- "grad_norm": 13.7244234085083,
373
- "learning_rate": 8.064516129032258e-06,
374
- "loss": 0.6448,
375
- "step": 1275
376
- },
377
- {
378
- "epoch": 0.8387096774193549,
379
- "grad_norm": 12.110133171081543,
380
- "learning_rate": 8.004778972520909e-06,
381
- "loss": 0.6347,
382
- "step": 1300
383
- },
384
- {
385
- "epoch": 0.8548387096774194,
386
- "grad_norm": 15.724874496459961,
387
- "learning_rate": 7.945041816009559e-06,
388
- "loss": 0.5888,
389
- "step": 1325
390
- },
391
- {
392
- "epoch": 0.8709677419354839,
393
- "grad_norm": 12.077081680297852,
394
- "learning_rate": 7.88530465949821e-06,
395
- "loss": 0.6066,
396
- "step": 1350
397
- },
398
- {
399
- "epoch": 0.8870967741935484,
400
- "grad_norm": 12.401660919189453,
401
- "learning_rate": 7.825567502986858e-06,
402
- "loss": 0.5831,
403
- "step": 1375
404
- },
405
- {
406
- "epoch": 0.9032258064516129,
407
- "grad_norm": 12.884041786193848,
408
- "learning_rate": 7.765830346475508e-06,
409
- "loss": 0.6,
410
- "step": 1400
411
- },
412
- {
413
- "epoch": 0.9193548387096774,
414
- "grad_norm": 12.485610008239746,
415
- "learning_rate": 7.706093189964159e-06,
416
- "loss": 0.542,
417
- "step": 1425
418
- },
419
- {
420
- "epoch": 0.9354838709677419,
421
- "grad_norm": 13.256093978881836,
422
- "learning_rate": 7.646356033452809e-06,
423
- "loss": 0.5802,
424
- "step": 1450
425
- },
426
- {
427
- "epoch": 0.9516129032258065,
428
- "grad_norm": 10.507469177246094,
429
- "learning_rate": 7.586618876941458e-06,
430
- "loss": 0.5468,
431
- "step": 1475
432
- },
433
- {
434
- "epoch": 0.967741935483871,
435
- "grad_norm": 9.836853981018066,
436
- "learning_rate": 7.526881720430108e-06,
437
- "loss": 0.5252,
438
- "step": 1500
439
- },
440
- {
441
- "epoch": 0.9838709677419355,
442
- "grad_norm": 12.627049446105957,
443
- "learning_rate": 7.467144563918758e-06,
444
- "loss": 0.5501,
445
- "step": 1525
446
- },
447
- {
448
- "epoch": 1.0,
449
- "grad_norm": 13.873695373535156,
450
- "learning_rate": 7.4074074074074075e-06,
451
- "loss": 0.5414,
452
- "step": 1550
453
- },
454
- {
455
- "epoch": 1.0161290322580645,
456
- "grad_norm": 8.701884269714355,
457
- "learning_rate": 7.347670250896059e-06,
458
- "loss": 0.3627,
459
- "step": 1575
460
- },
461
- {
462
- "epoch": 1.032258064516129,
463
- "grad_norm": 10.089194297790527,
464
- "learning_rate": 7.287933094384708e-06,
465
- "loss": 0.3602,
466
- "step": 1600
467
- },
468
- {
469
- "epoch": 1.0483870967741935,
470
- "grad_norm": 8.33105182647705,
471
- "learning_rate": 7.2281959378733575e-06,
472
- "loss": 0.3706,
473
- "step": 1625
474
- },
475
- {
476
- "epoch": 1.064516129032258,
477
- "grad_norm": 12.283960342407227,
478
- "learning_rate": 7.168458781362008e-06,
479
- "loss": 0.3886,
480
- "step": 1650
481
- },
482
- {
483
- "epoch": 1.0806451612903225,
484
- "grad_norm": 10.99679183959961,
485
- "learning_rate": 7.108721624850657e-06,
486
- "loss": 0.4035,
487
- "step": 1675
488
- },
489
- {
490
- "epoch": 1.096774193548387,
491
- "grad_norm": 8.689805030822754,
492
- "learning_rate": 7.048984468339307e-06,
493
- "loss": 0.3571,
494
- "step": 1700
495
- },
496
- {
497
- "epoch": 1.1129032258064515,
498
- "grad_norm": 7.141482353210449,
499
- "learning_rate": 6.989247311827958e-06,
500
- "loss": 0.3557,
501
- "step": 1725
502
- },
503
- {
504
- "epoch": 1.129032258064516,
505
- "grad_norm": 10.56028938293457,
506
- "learning_rate": 6.929510155316607e-06,
507
- "loss": 0.3614,
508
- "step": 1750
509
- },
510
- {
511
- "epoch": 1.1451612903225807,
512
- "grad_norm": 11.50129508972168,
513
- "learning_rate": 6.869772998805258e-06,
514
- "loss": 0.3763,
515
- "step": 1775
516
- },
517
- {
518
- "epoch": 1.1612903225806452,
519
- "grad_norm": 10.562152862548828,
520
- "learning_rate": 6.810035842293907e-06,
521
- "loss": 0.3497,
522
- "step": 1800
523
- },
524
- {
525
- "epoch": 1.1774193548387097,
526
- "grad_norm": 12.868457794189453,
527
- "learning_rate": 6.7502986857825566e-06,
528
- "loss": 0.3562,
529
- "step": 1825
530
- },
531
- {
532
- "epoch": 1.1935483870967742,
533
- "grad_norm": 11.193254470825195,
534
- "learning_rate": 6.690561529271207e-06,
535
- "loss": 0.3662,
536
- "step": 1850
537
- },
538
- {
539
- "epoch": 1.2096774193548387,
540
- "grad_norm": 9.065006256103516,
541
- "learning_rate": 6.630824372759857e-06,
542
- "loss": 0.3547,
543
- "step": 1875
544
- },
545
- {
546
- "epoch": 1.2258064516129032,
547
- "grad_norm": 10.713186264038086,
548
- "learning_rate": 6.5710872162485075e-06,
549
- "loss": 0.3703,
550
- "step": 1900
551
- },
552
- {
553
- "epoch": 1.2419354838709677,
554
- "grad_norm": 11.31541919708252,
555
- "learning_rate": 6.511350059737157e-06,
556
- "loss": 0.3814,
557
- "step": 1925
558
- },
559
- {
560
- "epoch": 1.2580645161290323,
561
- "grad_norm": 12.24356746673584,
562
- "learning_rate": 6.451612903225806e-06,
563
- "loss": 0.346,
564
- "step": 1950
565
- },
566
- {
567
- "epoch": 1.2741935483870968,
568
- "grad_norm": 10.248428344726562,
569
- "learning_rate": 6.391875746714457e-06,
570
- "loss": 0.3581,
571
- "step": 1975
572
- },
573
- {
574
- "epoch": 1.2903225806451613,
575
- "grad_norm": 9.441059112548828,
576
- "learning_rate": 6.332138590203107e-06,
577
- "loss": 0.3539,
578
- "step": 2000
579
- },
580
- {
581
- "epoch": 1.2903225806451613,
582
- "eval_cer": 34.83160621761658,
583
- "eval_loss": 0.4711998403072357,
584
- "eval_runtime": 956.2234,
585
- "eval_samples_per_second": 2.385,
586
- "eval_steps_per_second": 0.299,
587
- "step": 2000
588
- },
589
- {
590
- "epoch": 1.3064516129032258,
591
- "grad_norm": 11.437358856201172,
592
- "learning_rate": 6.272401433691757e-06,
593
- "loss": 0.377,
594
- "step": 2025
595
- },
596
- {
597
- "epoch": 1.3225806451612903,
598
- "grad_norm": 8.866403579711914,
599
- "learning_rate": 6.212664277180407e-06,
600
- "loss": 0.3137,
601
- "step": 2050
602
- },
603
- {
604
- "epoch": 1.3387096774193548,
605
- "grad_norm": 10.96147632598877,
606
- "learning_rate": 6.152927120669057e-06,
607
- "loss": 0.3657,
608
- "step": 2075
609
- },
610
- {
611
- "epoch": 1.3548387096774195,
612
- "grad_norm": 16.82151985168457,
613
- "learning_rate": 6.0931899641577065e-06,
614
- "loss": 0.3575,
615
- "step": 2100
616
- },
617
- {
618
- "epoch": 1.370967741935484,
619
- "grad_norm": 10.459049224853516,
620
- "learning_rate": 6.033452807646356e-06,
621
- "loss": 0.3414,
622
- "step": 2125
623
- },
624
- {
625
- "epoch": 1.3870967741935485,
626
- "grad_norm": 9.4818696975708,
627
- "learning_rate": 5.973715651135007e-06,
628
- "loss": 0.3497,
629
- "step": 2150
630
- },
631
- {
632
- "epoch": 1.403225806451613,
633
- "grad_norm": 8.424386978149414,
634
- "learning_rate": 5.9139784946236566e-06,
635
- "loss": 0.3414,
636
- "step": 2175
637
- },
638
- {
639
- "epoch": 1.4193548387096775,
640
- "grad_norm": 10.135176658630371,
641
- "learning_rate": 5.854241338112307e-06,
642
- "loss": 0.3569,
643
- "step": 2200
644
- },
645
- {
646
- "epoch": 1.435483870967742,
647
- "grad_norm": 9.196470260620117,
648
- "learning_rate": 5.794504181600956e-06,
649
- "loss": 0.3767,
650
- "step": 2225
651
- },
652
- {
653
- "epoch": 1.4516129032258065,
654
- "grad_norm": 9.483991622924805,
655
- "learning_rate": 5.734767025089606e-06,
656
- "loss": 0.3302,
657
- "step": 2250
658
- },
659
- {
660
- "epoch": 1.467741935483871,
661
- "grad_norm": 11.096484184265137,
662
- "learning_rate": 5.675029868578256e-06,
663
- "loss": 0.3648,
664
- "step": 2275
665
- },
666
- {
667
- "epoch": 1.4838709677419355,
668
- "grad_norm": 11.095719337463379,
669
- "learning_rate": 5.615292712066906e-06,
670
- "loss": 0.348,
671
- "step": 2300
672
- },
673
- {
674
- "epoch": 1.5,
675
- "grad_norm": 8.295551300048828,
676
- "learning_rate": 5.555555555555557e-06,
677
- "loss": 0.3429,
678
- "step": 2325
679
- },
680
- {
681
- "epoch": 1.5161290322580645,
682
- "grad_norm": 9.586627960205078,
683
- "learning_rate": 5.495818399044206e-06,
684
- "loss": 0.3101,
685
- "step": 2350
686
- },
687
- {
688
- "epoch": 1.532258064516129,
689
- "grad_norm": 9.516448974609375,
690
- "learning_rate": 5.436081242532856e-06,
691
- "loss": 0.3268,
692
- "step": 2375
693
- },
694
- {
695
- "epoch": 1.5483870967741935,
696
- "grad_norm": 11.90730094909668,
697
- "learning_rate": 5.376344086021506e-06,
698
- "loss": 0.3218,
699
- "step": 2400
700
- },
701
- {
702
- "epoch": 1.564516129032258,
703
- "grad_norm": 10.183462142944336,
704
- "learning_rate": 5.316606929510155e-06,
705
- "loss": 0.3228,
706
- "step": 2425
707
- },
708
- {
709
- "epoch": 1.5806451612903225,
710
- "grad_norm": 9.693060874938965,
711
- "learning_rate": 5.2568697729988065e-06,
712
- "loss": 0.3043,
713
- "step": 2450
714
- },
715
- {
716
- "epoch": 1.596774193548387,
717
- "grad_norm": 10.416152000427246,
718
- "learning_rate": 5.197132616487456e-06,
719
- "loss": 0.3859,
720
- "step": 2475
721
- },
722
- {
723
- "epoch": 1.6129032258064515,
724
- "grad_norm": 11.155983924865723,
725
- "learning_rate": 5.137395459976105e-06,
726
- "loss": 0.321,
727
- "step": 2500
728
- },
729
- {
730
- "epoch": 1.629032258064516,
731
- "grad_norm": 8.956007957458496,
732
- "learning_rate": 5.077658303464756e-06,
733
- "loss": 0.3101,
734
- "step": 2525
735
- },
736
- {
737
- "epoch": 1.6451612903225805,
738
- "grad_norm": 11.339639663696289,
739
- "learning_rate": 5.017921146953405e-06,
740
- "loss": 0.3276,
741
- "step": 2550
742
- },
743
- {
744
- "epoch": 1.661290322580645,
745
- "grad_norm": 6.775766372680664,
746
- "learning_rate": 4.9581839904420555e-06,
747
- "loss": 0.3226,
748
- "step": 2575
749
- },
750
- {
751
- "epoch": 1.6774193548387095,
752
- "grad_norm": 9.266929626464844,
753
- "learning_rate": 4.898446833930705e-06,
754
- "loss": 0.317,
755
- "step": 2600
756
- },
757
- {
758
- "epoch": 1.6935483870967742,
759
- "grad_norm": 7.660613059997559,
760
- "learning_rate": 4.838709677419355e-06,
761
- "loss": 0.3209,
762
- "step": 2625
763
- },
764
- {
765
- "epoch": 1.7096774193548387,
766
- "grad_norm": 8.688915252685547,
767
- "learning_rate": 4.7789725209080055e-06,
768
- "loss": 0.3352,
769
- "step": 2650
770
- },
771
- {
772
- "epoch": 1.7258064516129032,
773
- "grad_norm": 7.915940761566162,
774
- "learning_rate": 4.719235364396655e-06,
775
- "loss": 0.3693,
776
- "step": 2675
777
- },
778
- {
779
- "epoch": 1.7419354838709677,
780
- "grad_norm": 9.707711219787598,
781
- "learning_rate": 4.659498207885305e-06,
782
- "loss": 0.3232,
783
- "step": 2700
784
- },
785
- {
786
- "epoch": 1.7580645161290323,
787
- "grad_norm": 9.361932754516602,
788
- "learning_rate": 4.599761051373955e-06,
789
- "loss": 0.3674,
790
- "step": 2725
791
- },
792
- {
793
- "epoch": 1.7741935483870968,
794
- "grad_norm": 11.118302345275879,
795
- "learning_rate": 4.540023894862605e-06,
796
- "loss": 0.3367,
797
- "step": 2750
798
- },
799
- {
800
- "epoch": 1.7903225806451613,
801
- "grad_norm": 9.054045677185059,
802
- "learning_rate": 4.480286738351255e-06,
803
- "loss": 0.326,
804
- "step": 2775
805
- },
806
- {
807
- "epoch": 1.8064516129032258,
808
- "grad_norm": 10.975425720214844,
809
- "learning_rate": 4.420549581839905e-06,
810
- "loss": 0.3553,
811
- "step": 2800
812
- },
813
- {
814
- "epoch": 1.8225806451612905,
815
- "grad_norm": 10.127399444580078,
816
- "learning_rate": 4.360812425328555e-06,
817
- "loss": 0.3321,
818
- "step": 2825
819
- },
820
- {
821
- "epoch": 1.838709677419355,
822
- "grad_norm": 11.241859436035156,
823
- "learning_rate": 4.3010752688172045e-06,
824
- "loss": 0.3287,
825
- "step": 2850
826
- },
827
- {
828
- "epoch": 1.8548387096774195,
829
- "grad_norm": 8.5289888381958,
830
- "learning_rate": 4.241338112305855e-06,
831
- "loss": 0.2897,
832
- "step": 2875
833
- },
834
- {
835
- "epoch": 1.870967741935484,
836
- "grad_norm": 15.53427505493164,
837
- "learning_rate": 4.181600955794505e-06,
838
- "loss": 0.3113,
839
- "step": 2900
840
- },
841
- {
842
- "epoch": 1.8870967741935485,
843
- "grad_norm": 9.797106742858887,
844
- "learning_rate": 4.121863799283155e-06,
845
- "loss": 0.3404,
846
- "step": 2925
847
- },
848
- {
849
- "epoch": 1.903225806451613,
850
- "grad_norm": 11.643150329589844,
851
- "learning_rate": 4.062126642771804e-06,
852
- "loss": 0.329,
853
- "step": 2950
854
- },
855
- {
856
- "epoch": 1.9193548387096775,
857
- "grad_norm": 7.331206798553467,
858
- "learning_rate": 4.002389486260454e-06,
859
- "loss": 0.2925,
860
- "step": 2975
861
- },
862
- {
863
- "epoch": 1.935483870967742,
864
- "grad_norm": 10.384967803955078,
865
- "learning_rate": 3.942652329749105e-06,
866
- "loss": 0.2883,
867
- "step": 3000
868
- },
869
- {
870
- "epoch": 1.935483870967742,
871
- "eval_cer": 31.230569948186528,
872
- "eval_loss": 0.4035734534263611,
873
- "eval_runtime": 956.3419,
874
- "eval_samples_per_second": 2.385,
875
- "eval_steps_per_second": 0.299,
876
- "step": 3000
877
- },
878
- {
879
- "epoch": 1.9516129032258065,
880
- "grad_norm": 9.115569114685059,
881
- "learning_rate": 3.882915173237754e-06,
882
- "loss": 0.2929,
883
- "step": 3025
884
- },
885
- {
886
- "epoch": 1.967741935483871,
887
- "grad_norm": 10.029779434204102,
888
- "learning_rate": 3.823178016726404e-06,
889
- "loss": 0.2939,
890
- "step": 3050
891
- },
892
- {
893
- "epoch": 1.9838709677419355,
894
- "grad_norm": 10.219369888305664,
895
- "learning_rate": 3.763440860215054e-06,
896
- "loss": 0.3228,
897
- "step": 3075
898
- },
899
- {
900
- "epoch": 2.0,
901
- "grad_norm": 10.639910697937012,
902
- "learning_rate": 3.7037037037037037e-06,
903
- "loss": 0.3236,
904
- "step": 3100
905
- },
906
- {
907
- "epoch": 2.0161290322580645,
908
- "grad_norm": 4.525390148162842,
909
- "learning_rate": 3.643966547192354e-06,
910
- "loss": 0.1713,
911
- "step": 3125
912
- },
913
- {
914
- "epoch": 2.032258064516129,
915
- "grad_norm": 6.762115478515625,
916
- "learning_rate": 3.584229390681004e-06,
917
- "loss": 0.1892,
918
- "step": 3150
919
- },
920
- {
921
- "epoch": 2.0483870967741935,
922
- "grad_norm": 8.670353889465332,
923
- "learning_rate": 3.5244922341696534e-06,
924
- "loss": 0.1902,
925
- "step": 3175
926
- },
927
- {
928
- "epoch": 2.064516129032258,
929
- "grad_norm": 6.066471576690674,
930
- "learning_rate": 3.4647550776583037e-06,
931
- "loss": 0.157,
932
- "step": 3200
933
- },
934
- {
935
- "epoch": 2.0806451612903225,
936
- "grad_norm": 5.877708911895752,
937
- "learning_rate": 3.4050179211469536e-06,
938
- "loss": 0.18,
939
- "step": 3225
940
- },
941
- {
942
- "epoch": 2.096774193548387,
943
- "grad_norm": 5.906160354614258,
944
- "learning_rate": 3.3452807646356034e-06,
945
- "loss": 0.198,
946
- "step": 3250
947
- },
948
- {
949
- "epoch": 2.1129032258064515,
950
- "grad_norm": 7.664149761199951,
951
- "learning_rate": 3.2855436081242537e-06,
952
- "loss": 0.1889,
953
- "step": 3275
954
- },
955
- {
956
- "epoch": 2.129032258064516,
957
- "grad_norm": 6.261497497558594,
958
- "learning_rate": 3.225806451612903e-06,
959
- "loss": 0.157,
960
- "step": 3300
961
- },
962
- {
963
- "epoch": 2.1451612903225805,
964
- "grad_norm": 6.205556869506836,
965
- "learning_rate": 3.1660692951015535e-06,
966
- "loss": 0.1635,
967
- "step": 3325
968
- },
969
- {
970
- "epoch": 2.161290322580645,
971
- "grad_norm": 5.549154758453369,
972
- "learning_rate": 3.1063321385902034e-06,
973
- "loss": 0.1799,
974
- "step": 3350
975
- },
976
- {
977
- "epoch": 2.1774193548387095,
978
- "grad_norm": 7.939329624176025,
979
- "learning_rate": 3.0465949820788532e-06,
980
- "loss": 0.159,
981
- "step": 3375
982
- },
983
- {
984
- "epoch": 2.193548387096774,
985
- "grad_norm": 7.168279647827148,
986
- "learning_rate": 2.9868578255675035e-06,
987
- "loss": 0.1772,
988
- "step": 3400
989
- },
990
- {
991
- "epoch": 2.2096774193548385,
992
- "grad_norm": 7.339049816131592,
993
- "learning_rate": 2.9271206690561534e-06,
994
- "loss": 0.161,
995
- "step": 3425
996
- },
997
- {
998
- "epoch": 2.225806451612903,
999
- "grad_norm": 7.939656734466553,
1000
- "learning_rate": 2.867383512544803e-06,
1001
- "loss": 0.163,
1002
- "step": 3450
1003
- },
1004
- {
1005
- "epoch": 2.241935483870968,
1006
- "grad_norm": 6.338183403015137,
1007
- "learning_rate": 2.807646356033453e-06,
1008
- "loss": 0.1671,
1009
- "step": 3475
1010
- },
1011
- {
1012
- "epoch": 2.258064516129032,
1013
- "grad_norm": 4.27256441116333,
1014
- "learning_rate": 2.747909199522103e-06,
1015
- "loss": 0.1696,
1016
- "step": 3500
1017
- },
1018
- {
1019
- "epoch": 2.274193548387097,
1020
- "grad_norm": 7.593271255493164,
1021
- "learning_rate": 2.688172043010753e-06,
1022
- "loss": 0.1558,
1023
- "step": 3525
1024
- },
1025
- {
1026
- "epoch": 2.2903225806451615,
1027
- "grad_norm": 3.984323501586914,
1028
- "learning_rate": 2.6284348864994032e-06,
1029
- "loss": 0.1727,
1030
- "step": 3550
1031
- },
1032
- {
1033
- "epoch": 2.306451612903226,
1034
- "grad_norm": 6.01830530166626,
1035
- "learning_rate": 2.5686977299880527e-06,
1036
- "loss": 0.1681,
1037
- "step": 3575
1038
- },
1039
- {
1040
- "epoch": 2.3225806451612905,
1041
- "grad_norm": 8.704211235046387,
1042
- "learning_rate": 2.5089605734767026e-06,
1043
- "loss": 0.1703,
1044
- "step": 3600
1045
- },
1046
- {
1047
- "epoch": 2.338709677419355,
1048
- "grad_norm": 7.5924506187438965,
1049
- "learning_rate": 2.4492234169653525e-06,
1050
- "loss": 0.1723,
1051
- "step": 3625
1052
- },
1053
- {
1054
- "epoch": 2.3548387096774195,
1055
- "grad_norm": 5.732957363128662,
1056
- "learning_rate": 2.3894862604540028e-06,
1057
- "loss": 0.1586,
1058
- "step": 3650
1059
- },
1060
- {
1061
- "epoch": 2.370967741935484,
1062
- "grad_norm": 7.640561580657959,
1063
- "learning_rate": 2.3297491039426526e-06,
1064
- "loss": 0.1781,
1065
- "step": 3675
1066
- },
1067
- {
1068
- "epoch": 2.3870967741935485,
1069
- "grad_norm": 7.3015313148498535,
1070
- "learning_rate": 2.2700119474313025e-06,
1071
- "loss": 0.1571,
1072
- "step": 3700
1073
- },
1074
- {
1075
- "epoch": 2.403225806451613,
1076
- "grad_norm": 7.6669440269470215,
1077
- "learning_rate": 2.2102747909199524e-06,
1078
- "loss": 0.1532,
1079
- "step": 3725
1080
- },
1081
- {
1082
- "epoch": 2.4193548387096775,
1083
- "grad_norm": 4.207671165466309,
1084
- "learning_rate": 2.1505376344086023e-06,
1085
- "loss": 0.1658,
1086
- "step": 3750
1087
- },
1088
- {
1089
- "epoch": 2.435483870967742,
1090
- "grad_norm": 6.316219806671143,
1091
- "learning_rate": 2.0908004778972526e-06,
1092
- "loss": 0.1681,
1093
- "step": 3775
1094
- },
1095
- {
1096
- "epoch": 2.4516129032258065,
1097
- "grad_norm": 6.380753040313721,
1098
- "learning_rate": 2.031063321385902e-06,
1099
- "loss": 0.1595,
1100
- "step": 3800
1101
- },
1102
- {
1103
- "epoch": 2.467741935483871,
1104
- "grad_norm": 7.425994873046875,
1105
- "learning_rate": 1.9713261648745523e-06,
1106
- "loss": 0.1457,
1107
- "step": 3825
1108
- },
1109
- {
1110
- "epoch": 2.4838709677419355,
1111
- "grad_norm": 7.0207600593566895,
1112
- "learning_rate": 1.911589008363202e-06,
1113
- "loss": 0.1557,
1114
- "step": 3850
1115
- },
1116
- {
1117
- "epoch": 2.5,
1118
- "grad_norm": 7.421679496765137,
1119
- "learning_rate": 1.8518518518518519e-06,
1120
- "loss": 0.17,
1121
- "step": 3875
1122
- },
1123
- {
1124
- "epoch": 2.5161290322580645,
1125
- "grad_norm": 7.43884801864624,
1126
- "learning_rate": 1.792114695340502e-06,
1127
- "loss": 0.1605,
1128
- "step": 3900
1129
- },
1130
- {
1131
- "epoch": 2.532258064516129,
1132
- "grad_norm": 6.753660202026367,
1133
- "learning_rate": 1.7323775388291518e-06,
1134
- "loss": 0.1467,
1135
- "step": 3925
1136
- },
1137
- {
1138
- "epoch": 2.5483870967741935,
1139
- "grad_norm": 5.872158050537109,
1140
- "learning_rate": 1.6726403823178017e-06,
1141
- "loss": 0.1511,
1142
- "step": 3950
1143
- },
1144
- {
1145
- "epoch": 2.564516129032258,
1146
- "grad_norm": 5.571343421936035,
1147
- "learning_rate": 1.6129032258064516e-06,
1148
- "loss": 0.1476,
1149
- "step": 3975
1150
- },
1151
- {
1152
- "epoch": 2.5806451612903225,
1153
- "grad_norm": 9.308771133422852,
1154
- "learning_rate": 1.5531660692951017e-06,
1155
- "loss": 0.1566,
1156
- "step": 4000
1157
- },
1158
- {
1159
- "epoch": 2.5806451612903225,
1160
- "eval_cer": 29.080310880829014,
1161
- "eval_loss": 0.37441545724868774,
1162
- "eval_runtime": 959.8469,
1163
- "eval_samples_per_second": 2.376,
1164
- "eval_steps_per_second": 0.298,
1165
- "step": 4000
1166
  }
1167
  ],
1168
  "logging_steps": 25,
@@ -1170,7 +303,7 @@
1170
  "num_input_tokens_seen": 0,
1171
  "num_train_epochs": 3,
1172
  "save_steps": 1000,
1173
- "total_flos": 1.84665797664768e+19,
1174
  "train_batch_size": 16,
1175
  "trial_name": null,
1176
  "trial_params": null
 
1
  {
2
+ "best_metric": 60.21685813863431,
3
+ "best_model_checkpoint": "./whisper-small-taiwanese/checkpoint-1000",
4
+ "epoch": 0.6451612903225806,
5
  "eval_steps": 1000,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.016129032258064516,
13
+ "grad_norm": 241.39755249023438,
14
  "learning_rate": 5.376344086021506e-07,
15
+ "loss": 8.0646,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.03225806451612903,
20
+ "grad_norm": 52.91600799560547,
21
  "learning_rate": 1.0752688172043011e-06,
22
+ "loss": 5.6903,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.04838709677419355,
27
+ "grad_norm": 32.09747314453125,
28
  "learning_rate": 1.6129032258064516e-06,
29
+ "loss": 3.6353,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.06451612903225806,
34
+ "grad_norm": 31.451000213623047,
35
  "learning_rate": 2.1505376344086023e-06,
36
+ "loss": 2.6364,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.08064516129032258,
41
+ "grad_norm": 29.471986770629883,
42
  "learning_rate": 2.688172043010753e-06,
43
+ "loss": 2.3125,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.0967741935483871,
48
+ "grad_norm": 28.64345932006836,
49
  "learning_rate": 3.225806451612903e-06,
50
+ "loss": 2.1281,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.11290322580645161,
55
+ "grad_norm": 28.750173568725586,
56
  "learning_rate": 3.763440860215054e-06,
57
+ "loss": 1.9073,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.12903225806451613,
62
+ "grad_norm": 23.051420211791992,
63
  "learning_rate": 4.3010752688172045e-06,
64
+ "loss": 1.5977,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.14516129032258066,
69
+ "grad_norm": 18.67135238647461,
70
  "learning_rate": 4.838709677419355e-06,
71
+ "loss": 1.5081,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.16129032258064516,
76
+ "grad_norm": 15.335652351379395,
77
  "learning_rate": 5.376344086021506e-06,
78
+ "loss": 1.4169,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.1774193548387097,
83
+ "grad_norm": 16.2917537689209,
84
  "learning_rate": 5.9139784946236566e-06,
85
+ "loss": 1.3469,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 0.1935483870967742,
90
+ "grad_norm": 15.212031364440918,
91
  "learning_rate": 6.451612903225806e-06,
92
+ "loss": 1.4059,
93
  "step": 300
94
  },
95
  {
96
  "epoch": 0.20967741935483872,
97
+ "grad_norm": 15.661399841308594,
98
  "learning_rate": 6.989247311827958e-06,
99
+ "loss": 1.333,
100
  "step": 325
101
  },
102
  {
103
  "epoch": 0.22580645161290322,
104
+ "grad_norm": 16.841798782348633,
105
  "learning_rate": 7.526881720430108e-06,
106
+ "loss": 1.2252,
107
  "step": 350
108
  },
109
  {
110
  "epoch": 0.24193548387096775,
111
+ "grad_norm": 17.468032836914062,
112
  "learning_rate": 8.064516129032258e-06,
113
+ "loss": 1.2996,
114
  "step": 375
115
  },
116
  {
117
  "epoch": 0.25806451612903225,
118
+ "grad_norm": 16.684844970703125,
119
  "learning_rate": 8.602150537634409e-06,
120
+ "loss": 1.2653,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.27419354838709675,
125
+ "grad_norm": 14.749136924743652,
126
  "learning_rate": 9.13978494623656e-06,
127
+ "loss": 1.1967,
128
  "step": 425
129
  },
130
  {
131
  "epoch": 0.2903225806451613,
132
+ "grad_norm": 13.751141548156738,
133
  "learning_rate": 9.67741935483871e-06,
134
+ "loss": 1.1865,
135
  "step": 450
136
  },
137
  {
138
  "epoch": 0.3064516129032258,
139
+ "grad_norm": 16.48873519897461,
140
  "learning_rate": 9.97610513739546e-06,
141
+ "loss": 1.1636,
142
  "step": 475
143
  },
144
  {
145
  "epoch": 0.3225806451612903,
146
+ "grad_norm": 14.694608688354492,
147
  "learning_rate": 9.916367980884111e-06,
148
+ "loss": 1.1796,
149
  "step": 500
150
  },
151
  {
152
  "epoch": 0.3387096774193548,
153
+ "grad_norm": 15.619414329528809,
154
  "learning_rate": 9.856630824372761e-06,
155
+ "loss": 1.1655,
156
  "step": 525
157
  },
158
  {
159
  "epoch": 0.3548387096774194,
160
+ "grad_norm": 13.177242279052734,
161
  "learning_rate": 9.79689366786141e-06,
162
+ "loss": 1.143,
163
  "step": 550
164
  },
165
  {
166
  "epoch": 0.3709677419354839,
167
+ "grad_norm": 15.957605361938477,
168
  "learning_rate": 9.737156511350062e-06,
169
+ "loss": 1.1414,
170
  "step": 575
171
  },
172
  {
173
  "epoch": 0.3870967741935484,
174
+ "grad_norm": 12.467620849609375,
175
  "learning_rate": 9.67741935483871e-06,
176
+ "loss": 1.0964,
177
  "step": 600
178
  },
179
  {
180
  "epoch": 0.4032258064516129,
181
+ "grad_norm": 15.435978889465332,
182
  "learning_rate": 9.61768219832736e-06,
183
+ "loss": 1.1512,
184
  "step": 625
185
  },
186
  {
187
  "epoch": 0.41935483870967744,
188
+ "grad_norm": 13.087624549865723,
189
  "learning_rate": 9.557945041816011e-06,
190
+ "loss": 1.1338,
191
  "step": 650
192
  },
193
  {
194
  "epoch": 0.43548387096774194,
195
+ "grad_norm": 15.716456413269043,
196
  "learning_rate": 9.49820788530466e-06,
197
+ "loss": 1.0783,
198
  "step": 675
199
  },
200
  {
201
  "epoch": 0.45161290322580644,
202
+ "grad_norm": 14.517507553100586,
203
  "learning_rate": 9.43847072879331e-06,
204
+ "loss": 1.0728,
205
  "step": 700
206
  },
207
  {
208
  "epoch": 0.46774193548387094,
209
+ "grad_norm": 17.37009620666504,
210
  "learning_rate": 9.37873357228196e-06,
211
+ "loss": 1.0317,
212
  "step": 725
213
  },
214
  {
215
  "epoch": 0.4838709677419355,
216
+ "grad_norm": 14.03701400756836,
217
  "learning_rate": 9.31899641577061e-06,
218
+ "loss": 1.0347,
219
  "step": 750
220
  },
221
  {
222
  "epoch": 0.5,
223
+ "grad_norm": 12.431659698486328,
224
  "learning_rate": 9.25925925925926e-06,
225
+ "loss": 1.0524,
226
  "step": 775
227
  },
228
  {
229
  "epoch": 0.5161290322580645,
230
+ "grad_norm": 12.746413230895996,
231
  "learning_rate": 9.19952210274791e-06,
232
+ "loss": 1.0826,
233
  "step": 800
234
  },
235
  {
236
  "epoch": 0.532258064516129,
237
+ "grad_norm": 15.521408081054688,
238
  "learning_rate": 9.13978494623656e-06,
239
+ "loss": 1.0377,
240
  "step": 825
241
  },
242
  {
243
  "epoch": 0.5483870967741935,
244
+ "grad_norm": 15.342901229858398,
245
  "learning_rate": 9.08004778972521e-06,
246
+ "loss": 0.9762,
247
  "step": 850
248
  },
249
  {
250
  "epoch": 0.5645161290322581,
251
+ "grad_norm": 16.137371063232422,
252
  "learning_rate": 9.02031063321386e-06,
253
+ "loss": 1.0725,
254
  "step": 875
255
  },
256
  {
257
  "epoch": 0.5806451612903226,
258
+ "grad_norm": 14.61146068572998,
259
  "learning_rate": 8.96057347670251e-06,
260
+ "loss": 0.9554,
261
  "step": 900
262
  },
263
  {
264
  "epoch": 0.5967741935483871,
265
+ "grad_norm": 13.561723709106445,
266
  "learning_rate": 8.90083632019116e-06,
267
+ "loss": 1.0127,
268
  "step": 925
269
  },
270
  {
271
  "epoch": 0.6129032258064516,
272
+ "grad_norm": 16.037729263305664,
273
  "learning_rate": 8.84109916367981e-06,
274
+ "loss": 0.9621,
275
  "step": 950
276
  },
277
  {
278
  "epoch": 0.6290322580645161,
279
+ "grad_norm": 13.945268630981445,
280
  "learning_rate": 8.78136200716846e-06,
281
+ "loss": 0.9479,
282
  "step": 975
283
  },
284
  {
285
  "epoch": 0.6451612903225806,
286
+ "grad_norm": 15.826567649841309,
287
  "learning_rate": 8.72162485065711e-06,
288
+ "loss": 0.9789,
289
  "step": 1000
290
  },
291
  {
292
  "epoch": 0.6451612903225806,
293
+ "eval_cer": 60.21685813863431,
294
+ "eval_loss": 0.9020848870277405,
295
+ "eval_runtime": 953.7359,
296
+ "eval_samples_per_second": 2.392,
297
+ "eval_steps_per_second": 0.3,
298
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  }
300
  ],
301
  "logging_steps": 25,
 
303
  "num_input_tokens_seen": 0,
304
  "num_train_epochs": 3,
305
  "save_steps": 1000,
306
+ "total_flos": 4.61736640512e+18,
307
  "train_batch_size": 16,
308
  "trial_name": null,
309
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fdbd7ffd023398f8cec6e5726c887d0bce38c6797a0f638b634302be6e3c8ab1
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5780b3fe6cf6a2b7abc711d493a9d31fc1181c9fff73c0fc0a79ae423a23e2fb
3
  size 5176