Prikshit7766 committed
Commit e269401 · verified · 1 Parent(s): 70bc200

Upload log_history.json with huggingface_hub
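The commit message describes pushing the file with huggingface_hub. A minimal sketch of how such an upload can be done with `HfApi.upload_file` is shown below; the `repo_id` is a placeholder, not taken from this page:

```python
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` / HF_TOKEN

# Push the local JSON log into the model repo (repo_id is hypothetical).
api.upload_file(
    path_or_fileobj="log_history.json",
    path_in_repo="log_history.json",
    repo_id="Prikshit7766/your-model-repo",
    repo_type="model",
    commit_message="Upload log_history.json with huggingface_hub",
)
```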

Files changed (1)
  1. log_history.json +847 -0
log_history.json ADDED
@@ -0,0 +1,847 @@
+ [
+ {
+ "loss": 1.3916,
+ "grad_norm": 7.633892059326172,
+ "learning_rate": 1.9831190798376187e-05,
+ "epoch": 0.08457374830852503,
+ "step": 500
+ },
+ {
+ "loss": 1.2408,
+ "grad_norm": 4.5788655281066895,
+ "learning_rate": 1.9662043301759137e-05,
+ "epoch": 0.16914749661705006,
+ "step": 1000
+ },
+ {
+ "loss": 1.2028,
+ "grad_norm": 5.243614673614502,
+ "learning_rate": 1.9492895805142083e-05,
+ "epoch": 0.25372124492557513,
+ "step": 1500
+ },
+ {
+ "loss": 1.1534,
+ "grad_norm": 3.415882110595703,
+ "learning_rate": 1.9323748308525033e-05,
+ "epoch": 0.3382949932341001,
+ "step": 2000
+ },
+ {
+ "loss": 1.1093,
+ "grad_norm": 3.2337677478790283,
+ "learning_rate": 1.9154600811907986e-05,
+ "epoch": 0.42286874154262516,
+ "step": 2500
+ },
+ {
+ "loss": 1.0754,
+ "grad_norm": 3.4433956146240234,
+ "learning_rate": 1.8985453315290936e-05,
+ "epoch": 0.5074424898511503,
+ "step": 3000
+ },
+ {
+ "loss": 1.0467,
+ "grad_norm": 4.641908168792725,
+ "learning_rate": 1.8816305818673886e-05,
+ "epoch": 0.5920162381596752,
+ "step": 3500
+ },
+ {
+ "loss": 1.0355,
+ "grad_norm": 3.7774765491485596,
+ "learning_rate": 1.8647158322056836e-05,
+ "epoch": 0.6765899864682002,
+ "step": 4000
+ },
+ {
+ "loss": 1.0421,
+ "grad_norm": 3.130302906036377,
+ "learning_rate": 1.8478349120433018e-05,
+ "epoch": 0.7611637347767253,
+ "step": 4500
+ },
+ {
+ "loss": 1.0158,
+ "grad_norm": 2.876955509185791,
+ "learning_rate": 1.8309201623815968e-05,
+ "epoch": 0.8457374830852503,
+ "step": 5000
+ },
+ {
+ "loss": 1.0,
+ "grad_norm": 3.218794107437134,
+ "learning_rate": 1.8140054127198918e-05,
+ "epoch": 0.9303112313937754,
+ "step": 5500
+ },
+ {
+ "loss": 0.9665,
+ "grad_norm": 2.9098243713378906,
+ "learning_rate": 1.7971244925575103e-05,
+ "epoch": 1.0148849797023005,
+ "step": 6000
+ },
+ {
+ "loss": 0.8815,
+ "grad_norm": 2.203686475753784,
+ "learning_rate": 1.7802097428958052e-05,
+ "epoch": 1.0994587280108254,
+ "step": 6500
+ },
+ {
+ "loss": 0.9049,
+ "grad_norm": 3.389420986175537,
+ "learning_rate": 1.7632949932341002e-05,
+ "epoch": 1.1840324763193504,
+ "step": 7000
+ },
+ {
+ "loss": 0.9059,
+ "grad_norm": 2.778923749923706,
+ "learning_rate": 1.7463802435723952e-05,
+ "epoch": 1.2686062246278755,
+ "step": 7500
+ },
+ {
+ "loss": 0.8858,
+ "grad_norm": 5.216675758361816,
+ "learning_rate": 1.7294993234100137e-05,
+ "epoch": 1.3531799729364005,
+ "step": 8000
+ },
+ {
+ "loss": 0.8881,
+ "grad_norm": 3.9376771450042725,
+ "learning_rate": 1.7125845737483087e-05,
+ "epoch": 1.4377537212449256,
+ "step": 8500
+ },
+ {
+ "loss": 0.8885,
+ "grad_norm": 3.8835389614105225,
+ "learning_rate": 1.6956698240866037e-05,
+ "epoch": 1.5223274695534506,
+ "step": 9000
+ },
+ {
+ "loss": 0.8774,
+ "grad_norm": 3.470211982727051,
+ "learning_rate": 1.6787550744248987e-05,
+ "epoch": 1.6069012178619757,
+ "step": 9500
+ },
+ {
+ "loss": 0.872,
+ "grad_norm": 3.03437876701355,
+ "learning_rate": 1.6618403247631937e-05,
+ "epoch": 1.6914749661705006,
+ "step": 10000
+ },
+ {
+ "loss": 0.8883,
+ "grad_norm": 3.5217363834381104,
+ "learning_rate": 1.6449255751014887e-05,
+ "epoch": 1.7760487144790256,
+ "step": 10500
+ },
+ {
+ "loss": 0.8614,
+ "grad_norm": 3.965338706970215,
+ "learning_rate": 1.6280446549391072e-05,
+ "epoch": 1.8606224627875507,
+ "step": 11000
+ },
+ {
+ "loss": 0.8682,
+ "grad_norm": 2.316436767578125,
+ "learning_rate": 1.6111299052774022e-05,
+ "epoch": 1.9451962110960759,
+ "step": 11500
+ },
+ {
+ "loss": 0.8286,
+ "grad_norm": 3.9571282863616943,
+ "learning_rate": 1.5942151556156972e-05,
+ "epoch": 2.029769959404601,
+ "step": 12000
+ },
+ {
+ "loss": 0.7808,
+ "grad_norm": 3.676339864730835,
+ "learning_rate": 1.577300405953992e-05,
+ "epoch": 2.1143437077131257,
+ "step": 12500
+ },
+ {
+ "loss": 0.7811,
+ "grad_norm": 2.8704166412353516,
+ "learning_rate": 1.560385656292287e-05,
+ "epoch": 2.198917456021651,
+ "step": 13000
+ },
+ {
+ "loss": 0.8,
+ "grad_norm": 4.323342800140381,
+ "learning_rate": 1.5434709066305818e-05,
+ "epoch": 2.283491204330176,
+ "step": 13500
+ },
+ {
+ "loss": 0.79,
+ "grad_norm": 2.5053164958953857,
+ "learning_rate": 1.5265561569688768e-05,
+ "epoch": 2.3680649526387008,
+ "step": 14000
+ },
+ {
+ "loss": 0.7849,
+ "grad_norm": 3.5477893352508545,
+ "learning_rate": 1.509641407307172e-05,
+ "epoch": 2.452638700947226,
+ "step": 14500
+ },
+ {
+ "loss": 0.7839,
+ "grad_norm": 3.699144124984741,
+ "learning_rate": 1.4927604871447903e-05,
+ "epoch": 2.537212449255751,
+ "step": 15000
+ },
+ {
+ "loss": 0.799,
+ "grad_norm": 2.969682455062866,
+ "learning_rate": 1.4758795669824088e-05,
+ "epoch": 2.621786197564276,
+ "step": 15500
+ },
+ {
+ "loss": 0.7799,
+ "grad_norm": 4.236715316772461,
+ "learning_rate": 1.4589648173207038e-05,
+ "epoch": 2.706359945872801,
+ "step": 16000
+ },
+ {
+ "loss": 0.7756,
+ "grad_norm": 3.2528302669525146,
+ "learning_rate": 1.4420500676589988e-05,
+ "epoch": 2.790933694181326,
+ "step": 16500
+ },
+ {
+ "loss": 0.7837,
+ "grad_norm": 4.050159454345703,
+ "learning_rate": 1.4251353179972938e-05,
+ "epoch": 2.8755074424898512,
+ "step": 17000
+ },
+ {
+ "loss": 0.7907,
+ "grad_norm": 3.53711199760437,
+ "learning_rate": 1.4082543978349121e-05,
+ "epoch": 2.960081190798376,
+ "step": 17500
+ },
+ {
+ "loss": 0.7533,
+ "grad_norm": 4.171680927276611,
+ "learning_rate": 1.3913396481732071e-05,
+ "epoch": 3.044654939106901,
+ "step": 18000
+ },
+ {
+ "loss": 0.724,
+ "grad_norm": 3.026613712310791,
+ "learning_rate": 1.3744248985115021e-05,
+ "epoch": 3.1292286874154263,
+ "step": 18500
+ },
+ {
+ "loss": 0.7289,
+ "grad_norm": 3.7827978134155273,
+ "learning_rate": 1.357510148849797e-05,
+ "epoch": 3.2138024357239514,
+ "step": 19000
+ },
+ {
+ "loss": 0.7078,
+ "grad_norm": 2.7544713020324707,
+ "learning_rate": 1.3405953991880922e-05,
+ "epoch": 3.2983761840324766,
+ "step": 19500
+ },
+ {
+ "loss": 0.7154,
+ "grad_norm": 2.6823747158050537,
+ "learning_rate": 1.3236806495263872e-05,
+ "epoch": 3.3829499323410013,
+ "step": 20000
+ },
+ {
+ "loss": 0.7314,
+ "grad_norm": 3.3478825092315674,
+ "learning_rate": 1.3067658998646822e-05,
+ "epoch": 3.4675236806495264,
+ "step": 20500
+ },
+ {
+ "loss": 0.726,
+ "grad_norm": 3.5908212661743164,
+ "learning_rate": 1.289851150202977e-05,
+ "epoch": 3.5520974289580516,
+ "step": 21000
+ },
+ {
+ "loss": 0.7285,
+ "grad_norm": 3.1159133911132812,
+ "learning_rate": 1.2729702300405956e-05,
+ "epoch": 3.6366711772665763,
+ "step": 21500
+ },
+ {
+ "loss": 0.7162,
+ "grad_norm": 3.3147387504577637,
+ "learning_rate": 1.2560554803788905e-05,
+ "epoch": 3.7212449255751014,
+ "step": 22000
+ },
+ {
+ "loss": 0.7233,
+ "grad_norm": 3.7307050228118896,
+ "learning_rate": 1.2391407307171854e-05,
+ "epoch": 3.8058186738836266,
+ "step": 22500
+ },
+ {
+ "loss": 0.7159,
+ "grad_norm": 2.382382392883301,
+ "learning_rate": 1.2222259810554804e-05,
+ "epoch": 3.8903924221921518,
+ "step": 23000
+ },
+ {
+ "loss": 0.722,
+ "grad_norm": 4.039222717285156,
+ "learning_rate": 1.2053112313937754e-05,
+ "epoch": 3.9749661705006765,
+ "step": 23500
+ },
+ {
+ "loss": 0.6786,
+ "grad_norm": 3.7908201217651367,
+ "learning_rate": 1.188430311231394e-05,
+ "epoch": 4.059539918809202,
+ "step": 24000
+ },
+ {
+ "loss": 0.6735,
+ "grad_norm": 3.7970995903015137,
+ "learning_rate": 1.171515561569689e-05,
+ "epoch": 4.144113667117726,
+ "step": 24500
+ },
+ {
+ "loss": 0.6736,
+ "grad_norm": 4.007111549377441,
+ "learning_rate": 1.1546008119079838e-05,
+ "epoch": 4.2286874154262515,
+ "step": 25000
+ },
+ {
+ "loss": 0.664,
+ "grad_norm": 3.4510090351104736,
+ "learning_rate": 1.1376860622462788e-05,
+ "epoch": 4.313261163734777,
+ "step": 25500
+ },
+ {
+ "loss": 0.6724,
+ "grad_norm": 3.279106378555298,
+ "learning_rate": 1.1207713125845738e-05,
+ "epoch": 4.397834912043302,
+ "step": 26000
+ },
+ {
+ "loss": 0.6696,
+ "grad_norm": 2.7026331424713135,
+ "learning_rate": 1.1038565629228688e-05,
+ "epoch": 4.482408660351827,
+ "step": 26500
+ },
+ {
+ "loss": 0.6738,
+ "grad_norm": 3.247185230255127,
+ "learning_rate": 1.0869418132611638e-05,
+ "epoch": 4.566982408660352,
+ "step": 27000
+ },
+ {
+ "loss": 0.6908,
+ "grad_norm": 3.6047909259796143,
+ "learning_rate": 1.0700608930987821e-05,
+ "epoch": 4.651556156968876,
+ "step": 27500
+ },
+ {
+ "loss": 0.6687,
+ "grad_norm": 4.114670753479004,
+ "learning_rate": 1.0531461434370771e-05,
+ "epoch": 4.7361299052774015,
+ "step": 28000
+ },
+ {
+ "loss": 0.6632,
+ "grad_norm": 2.717122793197632,
+ "learning_rate": 1.0362313937753723e-05,
+ "epoch": 4.820703653585927,
+ "step": 28500
+ },
+ {
+ "loss": 0.6842,
+ "grad_norm": 2.6075901985168457,
+ "learning_rate": 1.0193166441136673e-05,
+ "epoch": 4.905277401894452,
+ "step": 29000
+ },
+ {
+ "loss": 0.6767,
+ "grad_norm": 3.6151123046875,
+ "learning_rate": 1.0024018944519623e-05,
+ "epoch": 4.989851150202977,
+ "step": 29500
+ },
+ {
+ "loss": 0.6441,
+ "grad_norm": 3.7179670333862305,
+ "learning_rate": 9.855209742895806e-06,
+ "epoch": 5.074424898511502,
+ "step": 30000
+ },
+ {
+ "loss": 0.6274,
+ "grad_norm": 3.9154396057128906,
+ "learning_rate": 9.686062246278756e-06,
+ "epoch": 5.158998646820027,
+ "step": 30500
+ },
+ {
+ "loss": 0.6214,
+ "grad_norm": 2.7849080562591553,
+ "learning_rate": 9.516914749661706e-06,
+ "epoch": 5.243572395128552,
+ "step": 31000
+ },
+ {
+ "loss": 0.6362,
+ "grad_norm": 3.1513593196868896,
+ "learning_rate": 9.347767253044656e-06,
+ "epoch": 5.328146143437078,
+ "step": 31500
+ },
+ {
+ "loss": 0.6366,
+ "grad_norm": 3.0636239051818848,
+ "learning_rate": 9.178619756427606e-06,
+ "epoch": 5.412719891745602,
+ "step": 32000
+ },
+ {
+ "loss": 0.635,
+ "grad_norm": 3.542881727218628,
+ "learning_rate": 9.009810554803789e-06,
+ "epoch": 5.497293640054127,
+ "step": 32500
+ },
+ {
+ "loss": 0.629,
+ "grad_norm": 2.9938108921051025,
+ "learning_rate": 8.840663058186739e-06,
+ "epoch": 5.581867388362652,
+ "step": 33000
+ },
+ {
+ "loss": 0.6424,
+ "grad_norm": 3.608818769454956,
+ "learning_rate": 8.67151556156969e-06,
+ "epoch": 5.666441136671177,
+ "step": 33500
+ },
+ {
+ "loss": 0.6354,
+ "grad_norm": 4.858671188354492,
+ "learning_rate": 8.50236806495264e-06,
+ "epoch": 5.7510148849797025,
+ "step": 34000
+ },
+ {
+ "loss": 0.6365,
+ "grad_norm": 3.254009246826172,
+ "learning_rate": 8.333220568335589e-06,
+ "epoch": 5.835588633288228,
+ "step": 34500
+ },
+ {
+ "loss": 0.6343,
+ "grad_norm": 2.389611005783081,
+ "learning_rate": 8.164073071718539e-06,
+ "epoch": 5.920162381596752,
+ "step": 35000
+ },
+ {
+ "loss": 0.6305,
+ "grad_norm": 3.2198381423950195,
+ "learning_rate": 7.994925575101489e-06,
+ "epoch": 6.004736129905277,
+ "step": 35500
+ },
+ {
+ "loss": 0.597,
+ "grad_norm": 2.834723711013794,
+ "learning_rate": 7.82577807848444e-06,
+ "epoch": 6.089309878213802,
+ "step": 36000
+ },
+ {
+ "loss": 0.5914,
+ "grad_norm": 2.7054672241210938,
+ "learning_rate": 7.656630581867388e-06,
+ "epoch": 6.173883626522327,
+ "step": 36500
+ },
+ {
+ "loss": 0.6084,
+ "grad_norm": 2.8164889812469482,
+ "learning_rate": 7.487483085250339e-06,
+ "epoch": 6.2584573748308525,
+ "step": 37000
+ },
+ {
+ "loss": 0.5947,
+ "grad_norm": 3.42501163482666,
+ "learning_rate": 7.3186738836265225e-06,
+ "epoch": 6.343031123139378,
+ "step": 37500
+ },
+ {
+ "loss": 0.604,
+ "grad_norm": 3.881469249725342,
+ "learning_rate": 7.149526387009473e-06,
+ "epoch": 6.427604871447903,
+ "step": 38000
+ },
+ {
+ "loss": 0.5896,
+ "grad_norm": 3.2387328147888184,
+ "learning_rate": 6.980378890392423e-06,
+ "epoch": 6.512178619756428,
+ "step": 38500
+ },
+ {
+ "loss": 0.5985,
+ "grad_norm": 3.245598316192627,
+ "learning_rate": 6.811231393775373e-06,
+ "epoch": 6.596752368064953,
+ "step": 39000
+ },
+ {
+ "loss": 0.6207,
+ "grad_norm": 4.7686448097229,
+ "learning_rate": 6.642422192151556e-06,
+ "epoch": 6.681326116373477,
+ "step": 39500
+ },
+ {
+ "loss": 0.6164,
+ "grad_norm": 3.3545920848846436,
+ "learning_rate": 6.473274695534507e-06,
+ "epoch": 6.7658998646820026,
+ "step": 40000
+ },
+ {
+ "loss": 0.5994,
+ "grad_norm": 3.037534713745117,
+ "learning_rate": 6.304127198917457e-06,
+ "epoch": 6.850473612990528,
+ "step": 40500
+ },
+ {
+ "loss": 0.6249,
+ "grad_norm": 4.4626784324646,
+ "learning_rate": 6.134979702300406e-06,
+ "epoch": 6.935047361299053,
+ "step": 41000
+ },
+ {
+ "loss": 0.5985,
+ "grad_norm": 2.8611857891082764,
+ "learning_rate": 5.96617050067659e-06,
+ "epoch": 7.019621109607578,
+ "step": 41500
+ },
+ {
+ "loss": 0.5841,
+ "grad_norm": 3.028613805770874,
+ "learning_rate": 5.79702300405954e-06,
+ "epoch": 7.104194857916103,
+ "step": 42000
+ },
+ {
+ "loss": 0.586,
+ "grad_norm": 2.902698040008545,
+ "learning_rate": 5.627875507442491e-06,
+ "epoch": 7.188768606224627,
+ "step": 42500
+ },
+ {
+ "loss": 0.5869,
+ "grad_norm": 2.2707433700561523,
+ "learning_rate": 5.45872801082544e-06,
+ "epoch": 7.273342354533153,
+ "step": 43000
+ },
+ {
+ "loss": 0.5888,
+ "grad_norm": 3.4720981121063232,
+ "learning_rate": 5.289918809201624e-06,
+ "epoch": 7.357916102841678,
+ "step": 43500
+ },
+ {
+ "loss": 0.576,
+ "grad_norm": 2.8364577293395996,
+ "learning_rate": 5.120771312584573e-06,
+ "epoch": 7.442489851150203,
+ "step": 44000
+ },
+ {
+ "loss": 0.5759,
+ "grad_norm": 3.178103446960449,
+ "learning_rate": 4.951623815967524e-06,
+ "epoch": 7.527063599458728,
+ "step": 44500
+ },
+ {
+ "loss": 0.5742,
+ "grad_norm": 3.5063467025756836,
+ "learning_rate": 4.782476319350474e-06,
+ "epoch": 7.611637347767253,
+ "step": 45000
+ },
+ {
+ "loss": 0.5912,
+ "grad_norm": 2.37205171585083,
+ "learning_rate": 4.613328822733424e-06,
+ "epoch": 7.696211096075778,
+ "step": 45500
+ },
+ {
+ "loss": 0.575,
+ "grad_norm": 3.2511661052703857,
+ "learning_rate": 4.444181326116374e-06,
+ "epoch": 7.7807848443843035,
+ "step": 46000
+ },
+ {
+ "loss": 0.5724,
+ "grad_norm": 3.2974693775177,
+ "learning_rate": 4.275033829499324e-06,
+ "epoch": 7.865358592692828,
+ "step": 46500
+ },
+ {
+ "loss": 0.5745,
+ "grad_norm": 3.180819511413574,
+ "learning_rate": 4.105886332882274e-06,
+ "epoch": 7.949932341001353,
+ "step": 47000
+ },
+ {
+ "loss": 0.5697,
+ "grad_norm": 2.4791033267974854,
+ "learning_rate": 3.937077131258458e-06,
+ "epoch": 8.034506089309879,
+ "step": 47500
+ },
+ {
+ "loss": 0.556,
+ "grad_norm": 3.5898892879486084,
+ "learning_rate": 3.7679296346414073e-06,
+ "epoch": 8.119079837618404,
+ "step": 48000
+ },
+ {
+ "loss": 0.5726,
+ "grad_norm": 2.7320892810821533,
+ "learning_rate": 3.5987821380243577e-06,
+ "epoch": 8.203653585926928,
+ "step": 48500
+ },
+ {
+ "loss": 0.5603,
+ "grad_norm": 3.3177103996276855,
+ "learning_rate": 3.429634641407307e-06,
+ "epoch": 8.288227334235453,
+ "step": 49000
+ },
+ {
+ "loss": 0.558,
+ "grad_norm": 2.1732170581817627,
+ "learning_rate": 3.2604871447902575e-06,
+ "epoch": 8.372801082543978,
+ "step": 49500
+ },
+ {
+ "loss": 0.5631,
+ "grad_norm": 3.1351735591888428,
+ "learning_rate": 3.091339648173207e-06,
+ "epoch": 8.457374830852503,
+ "step": 50000
+ },
+ {
+ "loss": 0.5486,
+ "grad_norm": 2.601547956466675,
+ "learning_rate": 2.9221921515561573e-06,
+ "epoch": 8.541948579161028,
+ "step": 50500
+ },
+ {
+ "loss": 0.5706,
+ "grad_norm": 4.160942554473877,
+ "learning_rate": 2.753382949932341e-06,
+ "epoch": 8.626522327469553,
+ "step": 51000
+ },
+ {
+ "loss": 0.5683,
+ "grad_norm": 3.348295211791992,
+ "learning_rate": 2.5842354533152914e-06,
+ "epoch": 8.711096075778078,
+ "step": 51500
+ },
+ {
+ "loss": 0.564,
+ "grad_norm": 3.148343563079834,
+ "learning_rate": 2.415087956698241e-06,
+ "epoch": 8.795669824086604,
+ "step": 52000
+ },
+ {
+ "loss": 0.5725,
+ "grad_norm": 3.285578489303589,
+ "learning_rate": 2.246278755074425e-06,
+ "epoch": 8.880243572395129,
+ "step": 52500
+ },
+ {
+ "loss": 0.567,
+ "grad_norm": 3.201730251312256,
+ "learning_rate": 2.077131258457375e-06,
+ "epoch": 8.964817320703654,
+ "step": 53000
+ },
+ {
+ "loss": 0.553,
+ "grad_norm": 3.166001796722412,
+ "learning_rate": 1.907983761840325e-06,
+ "epoch": 9.049391069012179,
+ "step": 53500
+ },
+ {
+ "loss": 0.5614,
+ "grad_norm": 3.105032444000244,
+ "learning_rate": 1.7388362652232748e-06,
+ "epoch": 9.133964817320704,
+ "step": 54000
+ },
+ {
+ "loss": 0.5553,
+ "grad_norm": 4.7028937339782715,
+ "learning_rate": 1.5696887686062248e-06,
+ "epoch": 9.21853856562923,
+ "step": 54500
+ },
+ {
+ "loss": 0.552,
+ "grad_norm": 3.5488646030426025,
+ "learning_rate": 1.4005412719891747e-06,
+ "epoch": 9.303112313937755,
+ "step": 55000
+ },
+ {
+ "loss": 0.5483,
+ "grad_norm": 2.8257858753204346,
+ "learning_rate": 1.2313937753721246e-06,
+ "epoch": 9.387686062246278,
+ "step": 55500
+ },
+ {
+ "loss": 0.5476,
+ "grad_norm": 4.221645355224609,
+ "learning_rate": 1.0622462787550745e-06,
+ "epoch": 9.472259810554803,
+ "step": 56000
+ },
+ {
+ "loss": 0.5494,
+ "grad_norm": 3.1773674488067627,
+ "learning_rate": 8.930987821380243e-07,
+ "epoch": 9.556833558863328,
+ "step": 56500
+ },
+ {
+ "loss": 0.5477,
+ "grad_norm": 2.9824230670928955,
+ "learning_rate": 7.242895805142085e-07,
+ "epoch": 9.641407307171853,
+ "step": 57000
+ },
+ {
+ "loss": 0.5579,
+ "grad_norm": 2.51481294631958,
+ "learning_rate": 5.551420838971583e-07,
+ "epoch": 9.725981055480379,
+ "step": 57500
+ },
+ {
+ "loss": 0.5392,
+ "grad_norm": 3.0290215015411377,
+ "learning_rate": 3.8599458728010834e-07,
+ "epoch": 9.810554803788904,
+ "step": 58000
+ },
+ {
+ "loss": 0.5458,
+ "grad_norm": 2.9967031478881836,
+ "learning_rate": 2.168470906630582e-07,
+ "epoch": 9.895128552097429,
+ "step": 58500
+ },
+ {
+ "loss": 0.5434,
+ "grad_norm": 2.6248016357421875,
+ "learning_rate": 4.769959404600812e-08,
+ "epoch": 9.979702300405954,
+ "step": 59000
+ },
+ {
+ "train_runtime": 8128.9827,
+ "train_samples_per_second": 232.692,
+ "train_steps_per_second": 7.273,
+ "total_flos": 4.123593985189478e+16,
+ "train_loss": 0.7104815463735867,
+ "epoch": 10.0,
+ "step": 59120
+ },
+ {
+ "eval_loss": 0.7965446710586548,
+ "eval_model_preparation_time": 0.002,
+ "eval_bleu": 54.95663610516514,
+ "eval_runtime": 6012.2223,
+ "eval_samples_per_second": 3.496,
+ "eval_steps_per_second": 0.055,
+ "epoch": 10.0,
+ "step": 59120
+ }
+ ]
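Each record in log_history.json is a flat object keyed by training step (loss/grad_norm/learning_rate entries every 500 steps, plus final train and eval summaries). A minimal sketch of reading the file back, assuming it has been downloaded locally:

```python
import json

# Load the uploaded log (download it from the repo first, or point at a local copy).
with open("log_history.json") as f:
    log_history = json.load(f)

# Per-step training entries carry "loss"; the last two records are the
# train-runtime summary and the evaluation summary.
loss_curve = [(entry["step"], entry["loss"]) for entry in log_history if "loss" in entry]
eval_bleu = next(entry["eval_bleu"] for entry in log_history if "eval_bleu" in entry)

print(f"logged points: {len(loss_curve)}, final eval BLEU: {eval_bleu:.2f}")
```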