{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.991755976916735,
  "eval_steps": 500,
  "global_step": 6060,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0016488046166529267, "grad_norm": 380.0, "learning_rate": 3.300330033003301e-07, "loss": 35.4867, "step": 1},
    {"epoch": 0.008244023083264633, "grad_norm": 308.0, "learning_rate": 1.65016501650165e-06, "loss": 34.8873, "step": 5},
    {"epoch": 0.016488046166529265, "grad_norm": 306.0, "learning_rate": 3.3003300330033e-06, "loss": 34.9252, "step": 10},
    {"epoch": 0.0247320692497939, "grad_norm": 163.0, "learning_rate": 4.950495049504951e-06, "loss": 31.7188, "step": 15},
    {"epoch": 0.03297609233305853, "grad_norm": 110.5, "learning_rate": 6.6006600660066e-06, "loss": 28.5443, "step": 20},
    {"epoch": 0.041220115416323165, "grad_norm": 72.5, "learning_rate": 8.250825082508252e-06, "loss": 24.1835, "step": 25},
    {"epoch": 0.0494641384995878, "grad_norm": 39.5, "learning_rate": 9.900990099009901e-06, "loss": 21.6514, "step": 30},
    {"epoch": 0.057708161582852434, "grad_norm": 19.0, "learning_rate": 1.155115511551155e-05, "loss": 19.5766, "step": 35},
    {"epoch": 0.06595218466611706, "grad_norm": 16.25, "learning_rate": 1.32013201320132e-05, "loss": 18.5587, "step": 40},
    {"epoch": 0.0741962077493817, "grad_norm": 13.125, "learning_rate": 1.4851485148514851e-05, "loss": 17.2984, "step": 45},
    {"epoch": 0.08244023083264633, "grad_norm": 9.375, "learning_rate": 1.6501650165016504e-05, "loss": 16.2291, "step": 50},
    {"epoch": 0.09068425391591096, "grad_norm": 7.5, "learning_rate": 1.8151815181518153e-05, "loss": 15.5459, "step": 55},
    {"epoch": 0.0989282769991756, "grad_norm": 5.1875, "learning_rate": 1.9801980198019803e-05, "loss": 15.0494, "step": 60},
    {"epoch": 0.10717230008244023, "grad_norm": 3.546875, "learning_rate": 2.1452145214521452e-05, "loss": 14.5263, "step": 65},
    {"epoch": 0.11541632316570487, "grad_norm": 3.21875, "learning_rate": 2.31023102310231e-05, "loss": 14.0492, "step": 70},
    {"epoch": 0.1236603462489695, "grad_norm": 3.34375, "learning_rate": 2.4752475247524754e-05, "loss": 14.0437, "step": 75},
    {"epoch": 0.13190436933223412, "grad_norm": 4.03125, "learning_rate": 2.64026402640264e-05, "loss": 13.4269, "step": 80},
    {"epoch": 0.14014839241549876, "grad_norm": 4.46875, "learning_rate": 2.8052805280528056e-05, "loss": 13.1438, "step": 85},
    {"epoch": 0.1483924154987634, "grad_norm": 5.4375, "learning_rate": 2.9702970297029702e-05, "loss": 12.8682, "step": 90},
    {"epoch": 0.15663643858202803, "grad_norm": 7.5625, "learning_rate": 3.135313531353136e-05, "loss": 12.1682, "step": 95},
    {"epoch": 0.16488046166529266, "grad_norm": 9.6875, "learning_rate": 3.300330033003301e-05, "loss": 11.6708, "step": 100},
    {"epoch": 0.1731244847485573, "grad_norm": 13.875, "learning_rate": 3.465346534653465e-05, "loss": 10.6173, "step": 105},
    {"epoch": 0.18136850783182193, "grad_norm": 20.375, "learning_rate": 3.6303630363036307e-05, "loss": 9.3535, "step": 110},
    {"epoch": 0.18961253091508656, "grad_norm": 23.0, "learning_rate": 3.7953795379537956e-05, "loss": 7.0394, "step": 115},
    {"epoch": 0.1978565539983512, "grad_norm": 19.0, "learning_rate": 3.9603960396039605e-05, "loss": 4.5084, "step": 120},
    {"epoch": 0.20610057708161583, "grad_norm": 5.34375, "learning_rate": 4.1254125412541255e-05, "loss": 2.7198, "step": 125},
    {"epoch": 0.21434460016488047, "grad_norm": 1.8671875, "learning_rate": 4.2904290429042904e-05, "loss": 1.9952, "step": 130},
    {"epoch": 0.2225886232481451, "grad_norm": 1.0078125, "learning_rate": 4.455445544554456e-05, "loss": 1.7411, "step": 135},
    {"epoch": 0.23083264633140974, "grad_norm": 0.95703125, "learning_rate": 4.62046204620462e-05, "loss": 1.5998, "step": 140},
    {"epoch": 0.23907666941467437, "grad_norm": 1.0859375, "learning_rate": 4.785478547854786e-05, "loss": 1.5183, "step": 145},
    {"epoch": 0.247320692497939, "grad_norm": 1.28125, "learning_rate": 4.950495049504951e-05, "loss": 1.4489, "step": 150},
    {"epoch": 0.25556471558120364, "grad_norm": 1.09375, "learning_rate": 5.115511551155116e-05, "loss": 1.3924, "step": 155},
    {"epoch": 0.26380873866446825, "grad_norm": 0.703125, "learning_rate": 5.28052805280528e-05, "loss": 1.3648, "step": 160},
    {"epoch": 0.2720527617477329, "grad_norm": 0.71484375, "learning_rate": 5.445544554455446e-05, "loss": 1.3461, "step": 165},
    {"epoch": 0.2802967848309975, "grad_norm": 0.671875, "learning_rate": 5.610561056105611e-05, "loss": 1.3065, "step": 170},
    {"epoch": 0.2885408079142622, "grad_norm": 0.98828125, "learning_rate": 5.7755775577557755e-05, "loss": 1.2809, "step": 175},
    {"epoch": 0.2967848309975268, "grad_norm": 0.640625, "learning_rate": 5.9405940594059404e-05, "loss": 1.2647, "step": 180},
    {"epoch": 0.30502885408079145, "grad_norm": 1.296875, "learning_rate": 6.105610561056106e-05, "loss": 1.2387, "step": 185},
    {"epoch": 0.31327287716405605, "grad_norm": 1.1171875, "learning_rate": 6.270627062706272e-05, "loss": 1.24, "step": 190},
    {"epoch": 0.3215169002473207, "grad_norm": 1.4765625, "learning_rate": 6.435643564356436e-05, "loss": 1.2108, "step": 195},
    {"epoch": 0.3297609233305853, "grad_norm": 1.7578125, "learning_rate": 6.600660066006602e-05, "loss": 1.2026, "step": 200},
    {"epoch": 0.33800494641385, "grad_norm": 1.78125, "learning_rate": 6.765676567656766e-05, "loss": 1.1894, "step": 205},
    {"epoch": 0.3462489694971146, "grad_norm": 1.5234375, "learning_rate": 6.93069306930693e-05, "loss": 1.2093, "step": 210},
    {"epoch": 0.35449299258037925, "grad_norm": 1.0703125, "learning_rate": 7.095709570957097e-05, "loss": 1.1768, "step": 215},
    {"epoch": 0.36273701566364386, "grad_norm": 1.6171875, "learning_rate": 7.260726072607261e-05, "loss": 1.1946, "step": 220},
    {"epoch": 0.37098103874690846, "grad_norm": 2.828125, "learning_rate": 7.425742574257426e-05, "loss": 1.1678, "step": 225},
    {"epoch": 0.3792250618301731, "grad_norm": 1.5, "learning_rate": 7.590759075907591e-05, "loss": 1.1618, "step": 230},
    {"epoch": 0.38746908491343773, "grad_norm": 0.84375, "learning_rate": 7.755775577557755e-05, "loss": 1.1585, "step": 235},
    {"epoch": 0.3957131079967024, "grad_norm": 1.5546875, "learning_rate": 7.920792079207921e-05, "loss": 1.1519, "step": 240},
    {"epoch": 0.403957131079967, "grad_norm": 1.84375, "learning_rate": 8.085808580858087e-05, "loss": 1.1408, "step": 245},
    {"epoch": 0.41220115416323166, "grad_norm": 1.40625, "learning_rate": 8.250825082508251e-05, "loss": 1.138, "step": 250},
    {"epoch": 0.42044517724649627, "grad_norm": 0.80859375, "learning_rate": 8.415841584158417e-05, "loss": 1.1375, "step": 255},
    {"epoch": 0.42868920032976093, "grad_norm": 1.4296875, "learning_rate": 8.580858085808581e-05, "loss": 1.1193, "step": 260},
    {"epoch": 0.43693322341302554, "grad_norm": 2.15625, "learning_rate": 8.745874587458746e-05, "loss": 1.1178, "step": 265},
    {"epoch": 0.4451772464962902, "grad_norm": 0.984375, "learning_rate": 8.910891089108912e-05, "loss": 1.1038, "step": 270},
    {"epoch": 0.4534212695795548, "grad_norm": 1.5546875, "learning_rate": 9.075907590759076e-05, "loss": 1.1148, "step": 275},
    {"epoch": 0.46166529266281947, "grad_norm": 0.84765625, "learning_rate": 9.24092409240924e-05, "loss": 1.112, "step": 280},
    {"epoch": 0.4699093157460841, "grad_norm": 1.1640625, "learning_rate": 9.405940594059406e-05, "loss": 1.0882, "step": 285},
    {"epoch": 0.47815333882934874, "grad_norm": 3.5625, "learning_rate": 9.570957095709572e-05, "loss": 1.0873, "step": 290},
    {"epoch": 0.48639736191261335, "grad_norm": 0.80078125, "learning_rate": 9.735973597359736e-05, "loss": 1.0982, "step": 295},
    {"epoch": 0.494641384995878, "grad_norm": 1.171875, "learning_rate": 9.900990099009902e-05, "loss": 1.074, "step": 300},
    {"epoch": 0.5028854080791426, "grad_norm": 1.0078125, "learning_rate": 0.00010066006600660067, "loss": 1.0719, "step": 305},
    {"epoch": 0.5111294311624073, "grad_norm": 4.96875, "learning_rate": 0.00010231023102310232, "loss": 1.0816, "step": 310},
    {"epoch": 0.5193734542456719, "grad_norm": 0.95703125, "learning_rate": 0.00010396039603960397, "loss": 1.0681, "step": 315},
    {"epoch": 0.5276174773289365, "grad_norm": 5.6875, "learning_rate": 0.0001056105610561056, "loss": 1.0689, "step": 320},
    {"epoch": 0.5358615004122012, "grad_norm": 1.1328125, "learning_rate": 0.00010726072607260727, "loss": 1.0712, "step": 325},
    {"epoch": 0.5441055234954658, "grad_norm": 0.9375, "learning_rate": 0.00010891089108910893, "loss": 1.063, "step": 330},
    {"epoch": 0.5523495465787304, "grad_norm": 0.8125, "learning_rate": 0.00011056105610561056, "loss": 1.0622, "step": 335},
    {"epoch": 0.560593569661995, "grad_norm": 5.0, "learning_rate": 0.00011221122112211223, "loss": 1.0614, "step": 340},
    {"epoch": 0.5688375927452597, "grad_norm": 2.0, "learning_rate": 0.00011386138613861385, "loss": 1.0611, "step": 345},
    {"epoch": 0.5770816158285244, "grad_norm": 1.75, "learning_rate": 0.00011551155115511551, "loss": 1.0451, "step": 350},
    {"epoch": 0.5853256389117889, "grad_norm": 2.359375, "learning_rate": 0.00011716171617161718, "loss": 1.0506, "step": 355},
    {"epoch": 0.5935696619950536, "grad_norm": 1.1796875, "learning_rate": 0.00011881188118811881, "loss": 1.0414, "step": 360},
    {"epoch": 0.6018136850783182, "grad_norm": 2.703125, "learning_rate": 0.00012046204620462047, "loss": 1.0334, "step": 365},
    {"epoch": 0.6100577081615829, "grad_norm": 1.3828125, "learning_rate": 0.00012211221122112212, "loss": 1.0388, "step": 370},
    {"epoch": 0.6183017312448474, "grad_norm": 0.86328125, "learning_rate": 0.00012376237623762376, "loss": 1.0251, "step": 375},
    {"epoch": 0.6265457543281121, "grad_norm": 2.234375, "learning_rate": 0.00012541254125412543, "loss": 1.0315, "step": 380},
    {"epoch": 0.6347897774113768, "grad_norm": 1.5078125, "learning_rate": 0.00012706270627062708, "loss": 1.0342, "step": 385},
    {"epoch": 0.6430338004946414, "grad_norm": 1.609375, "learning_rate": 0.00012871287128712872, "loss": 1.0258, "step": 390},
    {"epoch": 0.651277823577906, "grad_norm": 1.53125, "learning_rate": 0.00013036303630363036, "loss": 1.02, "step": 395},
    {"epoch": 0.6595218466611706, "grad_norm": 1.3515625, "learning_rate": 0.00013201320132013203, "loss": 1.0053, "step": 400},
    {"epoch": 0.6677658697444353, "grad_norm": 2.09375, "learning_rate": 0.00013366336633663367, "loss": 1.0217, "step": 405},
    {"epoch": 0.6760098928277, "grad_norm": 1.796875, "learning_rate": 0.00013531353135313532, "loss": 1.0066, "step": 410},
    {"epoch": 0.6842539159109645, "grad_norm": 1.7421875, "learning_rate": 0.00013696369636963699, "loss": 1.0141, "step": 415},
    {"epoch": 0.6924979389942292, "grad_norm": 1.46875, "learning_rate": 0.0001386138613861386, "loss": 1.0028, "step": 420},
    {"epoch": 0.7007419620774938, "grad_norm": 3.875, "learning_rate": 0.00014026402640264027, "loss": 1.0207, "step": 425},
    {"epoch": 0.7089859851607585, "grad_norm": 1.0, "learning_rate": 0.00014191419141914194, "loss": 1.0122, "step": 430},
    {"epoch": 0.717230008244023, "grad_norm": 2.359375, "learning_rate": 0.00014356435643564356, "loss": 1.0145, "step": 435},
    {"epoch": 0.7254740313272877, "grad_norm": 1.9609375, "learning_rate": 0.00014521452145214523, "loss": 1.0031, "step": 440},
    {"epoch": 0.7337180544105524, "grad_norm": 1.2578125, "learning_rate": 0.00014686468646864687, "loss": 0.9987, "step": 445},
    {"epoch": 0.7419620774938169, "grad_norm": 4.59375, "learning_rate": 0.0001485148514851485, "loss": 1.0024, "step": 450},
    {"epoch": 0.7502061005770816, "grad_norm": 1.1875, "learning_rate": 0.00015016501650165018, "loss": 1.0048, "step": 455},
    {"epoch": 0.7584501236603463, "grad_norm": 3.5, "learning_rate": 0.00015181518151815182, "loss": 1.0039, "step": 460},
    {"epoch": 0.7666941467436109, "grad_norm": 5.59375, "learning_rate": 0.00015346534653465347, "loss": 1.0092, "step": 465},
    {"epoch": 0.7749381698268755, "grad_norm": 3.578125, "learning_rate": 0.0001551155115511551, "loss": 1.0078, "step": 470},
    {"epoch": 0.7831821929101401, "grad_norm": 2.1875, "learning_rate": 0.00015676567656765678, "loss": 1.0005, "step": 475},
    {"epoch": 0.7914262159934048, "grad_norm": 3.765625, "learning_rate": 0.00015841584158415842, "loss": 0.9895, "step": 480},
    {"epoch": 0.7996702390766695, "grad_norm": 1.6484375, "learning_rate": 0.00016006600660066006, "loss": 0.9923, "step": 485},
    {"epoch": 0.807914262159934, "grad_norm": 2.703125, "learning_rate": 0.00016171617161716173, "loss": 0.9996, "step": 490},
    {"epoch": 0.8161582852431987, "grad_norm": 1.4765625, "learning_rate": 0.00016336633663366338, "loss": 0.9955, "step": 495},
    {"epoch": 0.8244023083264633, "grad_norm": 4.03125, "learning_rate": 0.00016501650165016502, "loss": 0.9931, "step": 500},
    {"epoch": 0.832646331409728, "grad_norm": 4.15625, "learning_rate": 0.0001666666666666667, "loss": 1.0061, "step": 505},
    {"epoch": 0.8408903544929925, "grad_norm": 2.640625, "learning_rate": 0.00016831683168316833, "loss": 1.0086, "step": 510},
    {"epoch": 0.8491343775762572, "grad_norm": 1.296875, "learning_rate": 0.00016996699669966997, "loss": 0.9886, "step": 515},
    {"epoch": 0.8573784006595219, "grad_norm": 14.4375, "learning_rate": 0.00017161716171617162, "loss": 0.9933, "step": 520},
    {"epoch": 0.8656224237427865, "grad_norm": 1.96875, "learning_rate": 0.00017326732673267329, "loss": 1.0033, "step": 525},
    {"epoch": 0.8738664468260511, "grad_norm": 0.68359375, "learning_rate": 0.00017491749174917493, "loss": 0.9905, "step": 530},
    {"epoch": 0.8821104699093157, "grad_norm": 1.84375, "learning_rate": 0.00017656765676567657, "loss": 0.9717, "step": 535},
    {"epoch": 0.8903544929925804, "grad_norm": 1.390625, "learning_rate": 0.00017821782178217824, "loss": 0.9656, "step": 540},
    {"epoch": 0.8985985160758451, "grad_norm": 3.625, "learning_rate": 0.00017986798679867986, "loss": 0.9827, "step": 545},
    {"epoch": 0.9068425391591096, "grad_norm": 3.453125, "learning_rate": 0.00018151815181518153, "loss": 0.9865, "step": 550},
    {"epoch": 0.9150865622423743, "grad_norm": 1.0078125, "learning_rate": 0.0001831683168316832, "loss": 0.9815, "step": 555},
    {"epoch": 0.9233305853256389, "grad_norm": 1.578125, "learning_rate": 0.0001848184818481848, "loss": 0.9799, "step": 560},
    {"epoch": 0.9315746084089035, "grad_norm": 3.4375, "learning_rate": 0.00018646864686468648, "loss": 0.9611, "step": 565},
    {"epoch": 0.9398186314921682, "grad_norm": 1.046875, "learning_rate": 0.00018811881188118812, "loss": 0.9652, "step": 570},
    {"epoch": 0.9480626545754328, "grad_norm": 9.3125, "learning_rate": 0.00018976897689768977, "loss": 0.9676, "step": 575},
    {"epoch": 0.9563066776586975, "grad_norm": 1.3125, "learning_rate": 0.00019141914191419144, "loss": 0.9692, "step": 580},
    {"epoch": 0.964550700741962, "grad_norm": 6.4375, "learning_rate": 0.00019306930693069308, "loss": 0.9694, "step": 585},
    {"epoch": 0.9727947238252267, "grad_norm": 1.25, "learning_rate": 0.00019471947194719472, "loss": 0.9823, "step": 590},
    {"epoch": 0.9810387469084914, "grad_norm": 2.28125, "learning_rate": 0.00019636963696369636, "loss": 0.97, "step": 595},
    {"epoch": 0.989282769991756, "grad_norm": 1.34375, "learning_rate": 0.00019801980198019803, "loss": 0.9746, "step": 600},
    {"epoch": 0.9975267930750206, "grad_norm": 1.96875, "learning_rate": 0.00019966996699669968, "loss": 0.964, "step": 605},
    {"epoch": 0.9991755976916735, "eval_loss": 2.485042095184326, "eval_runtime": 0.2808, "eval_samples_per_second": 35.608, "eval_steps_per_second": 3.561, "step": 606},
    {"epoch": 1.0057708161582852, "grad_norm": 1.3984375, "learning_rate": 0.00019999973456433681, "loss": 0.9535, "step": 610},
    {"epoch": 1.0140148392415498, "grad_norm": 2.6875, "learning_rate": 0.00019999865623437013, "loss": 0.9553, "step": 615},
    {"epoch": 1.0222588623248146, "grad_norm": 4.96875, "learning_rate": 0.00019999674842930876, "loss": 0.9556, "step": 620},
    {"epoch": 1.030502885408079, "grad_norm": 1.9453125, "learning_rate": 0.00019999401116497763, "loss": 0.9746, "step": 625},
    {"epoch": 1.0387469084913439, "grad_norm": 1.1953125, "learning_rate": 0.000199990444464082, "loss": 0.9639, "step": 630},
    {"epoch": 1.0469909315746084, "grad_norm": 1.65625, "learning_rate": 0.00019998604835620717, "loss": 0.9585, "step": 635},
    {"epoch": 1.055234954657873, "grad_norm": 1.6953125, "learning_rate": 0.00019998082287781826, "loss": 0.9563, "step": 640},
    {"epoch": 1.0634789777411378, "grad_norm": 1.6171875, "learning_rate": 0.00019997476807225985, "loss": 0.9489, "step": 645},
    {"epoch": 1.0717230008244023, "grad_norm": 4.15625, "learning_rate": 0.00019996788398975578, "loss": 0.9474, "step": 650},
    {"epoch": 1.0799670239076669, "grad_norm": 5.53125, "learning_rate": 0.0001999601706874085, "loss": 0.9407, "step": 655},
    {"epoch": 1.0882110469909316, "grad_norm": 2.875, "learning_rate": 0.00019995162822919883, "loss": 0.9514, "step": 660},
    {"epoch": 1.0964550700741962, "grad_norm": 1.4140625, "learning_rate": 0.00019994225668598526, "loss": 0.9502, "step": 665},
    {"epoch": 1.104699093157461, "grad_norm": 2.796875, "learning_rate": 0.0001999320561355035, "loss": 0.9502, "step": 670},
    {"epoch": 1.1129431162407255, "grad_norm": 2.0, "learning_rate": 0.00019992102666236566, "loss": 0.9455, "step": 675},
    {"epoch": 1.12118713932399, "grad_norm": 0.90625, "learning_rate": 0.00019990916835805974, "loss": 0.9429, "step": 680},
    {"epoch": 1.1294311624072548, "grad_norm": 0.74609375, "learning_rate": 0.00019989648132094873, "loss": 0.9348, "step": 685},
    {"epoch": 1.1376751854905194, "grad_norm": 0.76171875, "learning_rate": 0.00019988296565626987, "loss": 0.939, "step": 690},
    {"epoch": 1.145919208573784, "grad_norm": 0.93359375, "learning_rate": 0.0001998686214761337, "loss": 0.9374, "step": 695},
    {"epoch": 1.1541632316570487, "grad_norm": 1.375, "learning_rate": 0.00019985344889952327, "loss": 0.9326, "step": 700},
    {"epoch": 1.1624072547403133, "grad_norm": 0.91015625, "learning_rate": 0.00019983744805229296, "loss": 0.9308, "step": 705},
    {"epoch": 1.1706512778235778, "grad_norm": 1.75, "learning_rate": 0.00019982061906716764, "loss": 0.9436, "step": 710},
    {"epoch": 1.1788953009068426, "grad_norm": 1.2734375, "learning_rate": 0.00019980296208374143, "loss": 0.9369, "step": 715},
    {"epoch": 1.1871393239901071, "grad_norm": 20.0, "learning_rate": 0.00019978447724847652, "loss": 0.9334, "step": 720},
    {"epoch": 1.195383347073372, "grad_norm": 1.46875, "learning_rate": 0.00019976516471470216, "loss": 0.9416, "step": 725},
    {"epoch": 1.2036273701566365, "grad_norm": 8.75, "learning_rate": 0.0001997450246426131, "loss": 0.9382, "step": 730},
    {"epoch": 1.211871393239901, "grad_norm": 0.86328125, "learning_rate": 0.0001997240571992685, "loss": 0.9315, "step": 735},
    {"epoch": 1.2201154163231658, "grad_norm": 0.98046875, "learning_rate": 0.00019970226255859038, "loss": 0.9266, "step": 740},
    {"epoch": 1.2283594394064303, "grad_norm": 1.5234375, "learning_rate": 0.0001996796409013623, "loss": 0.9299, "step": 745},
    {"epoch": 1.2366034624896949, "grad_norm": 1.0, "learning_rate": 0.0001996561924152278, "loss": 0.9202, "step": 750},
    {"epoch": 1.2448474855729597, "grad_norm": 1.078125, "learning_rate": 0.00019963191729468888, "loss": 0.9149, "step": 755},
    {"epoch": 1.2530915086562242, "grad_norm": 1.0703125, "learning_rate": 0.00019960681574110426, "loss": 0.9165, "step": 760},
    {"epoch": 1.2613355317394888, "grad_norm": 2.484375, "learning_rate": 0.00019958088796268793, "loss": 0.9188, "step": 765},
    {"epoch": 1.2695795548227535, "grad_norm": 1.5546875, "learning_rate": 0.0001995541341745072, "loss": 0.9274, "step": 770},
    {"epoch": 1.277823577906018, "grad_norm": 3.21875, "learning_rate": 0.0001995265545984811, "loss": 0.9136, "step": 775},
    {"epoch": 1.2860676009892829, "grad_norm": 1.8203125, "learning_rate": 0.00019949814946337838, "loss": 0.9251, "step": 780},
    {"epoch": 1.2943116240725474, "grad_norm": 3.078125, "learning_rate": 0.00019946891900481578, "loss": 0.9176, "step": 785},
    {"epoch": 1.302555647155812, "grad_norm": 0.66796875, "learning_rate": 0.0001994388634652559, "loss": 0.9283, "step": 790},
    {"epoch": 1.3107996702390767, "grad_norm": 1.953125, "learning_rate": 0.00019940798309400526, "loss": 0.9221, "step": 795},
    {"epoch": 1.3190436933223413, "grad_norm": 1.421875, "learning_rate": 0.00019937627814721237, "loss": 0.9199, "step": 800},
    {"epoch": 1.327287716405606, "grad_norm": 1.2890625, "learning_rate": 0.00019934374888786537, "loss": 0.9163, "step": 805},
    {"epoch": 1.3355317394888706, "grad_norm": 1.5546875, "learning_rate": 0.00019931039558578997, "loss": 0.9181, "step": 810},
    {"epoch": 1.3437757625721352, "grad_norm": 1.9765625, "learning_rate": 0.00019927621851764725, "loss": 0.9276, "step": 815},
    {"epoch": 1.3520197856553997, "grad_norm": 1.4921875, "learning_rate": 0.00019924121796693127, "loss": 0.9199, "step": 820},
    {"epoch": 1.3602638087386645, "grad_norm": 1.078125, "learning_rate": 0.0001992053942239668, "loss": 0.9213, "step": 825},
    {"epoch": 1.368507831821929, "grad_norm": 1.296875, "learning_rate": 0.00019916874758590684, "loss": 0.9228, "step": 830},
    {"epoch": 1.3767518549051938, "grad_norm": 1.3125, "learning_rate": 0.00019913127835673023, "loss": 0.9149, "step": 835},
    {"epoch": 1.3849958779884584, "grad_norm": 0.73828125, "learning_rate": 0.00019909298684723904, "loss": 0.9086, "step": 840},
    {"epoch": 1.393239901071723, "grad_norm": 1.125, "learning_rate": 0.00019905387337505612, "loss": 0.9092, "step": 845},
    {"epoch": 1.4014839241549877, "grad_norm": 2.828125, "learning_rate": 0.0001990139382646223, "loss": 0.9041, "step": 850},
    {"epoch": 1.4097279472382522, "grad_norm": 1.3203125, "learning_rate": 0.00019897318184719385, "loss": 0.9093, "step": 855},
    {"epoch": 1.417971970321517, "grad_norm": 1.109375, "learning_rate": 0.00019893160446083963, "loss": 0.909, "step": 860},
    {"epoch": 1.4262159934047816, "grad_norm": 1.0390625, "learning_rate": 0.00019888920645043831, "loss": 0.9014, "step": 865},
    {"epoch": 1.434460016488046, "grad_norm": 1.8203125, "learning_rate": 0.00019884598816767563, "loss": 0.9036, "step": 870},
    {"epoch": 1.4427040395713109, "grad_norm": 2.234375, "learning_rate": 0.00019880194997104123, "loss": 0.8999, "step": 875},
    {"epoch": 1.4509480626545754, "grad_norm": 2.3125, "learning_rate": 0.00019875709222582594, "loss": 0.9, "step": 880},
    {"epoch": 1.45919208573784, "grad_norm": 1.5390625, "learning_rate": 0.00019871141530411853, "loss": 0.8955, "step": 885},
    {"epoch": 1.4674361088211048, "grad_norm": 1.65625, "learning_rate": 0.00019866491958480284, "loss": 0.9042, "step": 890},
    {"epoch": 1.4756801319043693, "grad_norm": 0.96875, "learning_rate": 0.00019861760545355442, "loss": 0.9177, "step": 895},
    {"epoch": 1.4839241549876339, "grad_norm": 4.5625, "learning_rate": 0.00019856947330283752, "loss": 0.8974, "step": 900},
    {"epoch": 1.4921681780708986, "grad_norm": 1.3671875, "learning_rate": 0.00019852052353190166, "loss": 0.9064, "step": 905},
    {"epoch": 1.5004122011541632, "grad_norm": 3.796875, "learning_rate": 0.0001984707565467785, "loss": 0.9086, "step": 910},
    {"epoch": 1.508656224237428, "grad_norm": 6.6875, "learning_rate": 0.00019842017276027832, "loss": 0.9069, "step": 915},
    {"epoch": 1.5169002473206925, "grad_norm": 1.3203125, "learning_rate": 0.00019836877259198662, "loss": 0.898, "step": 920},
    {"epoch": 1.525144270403957, "grad_norm": 2.484375, "learning_rate": 0.0001983165564682608, "loss": 0.8999, "step": 925},
    {"epoch": 1.5333882934872216, "grad_norm": 1.34375, "learning_rate": 0.00019826352482222638, "loss": 0.8987, "step": 930},
    {"epoch": 1.5416323165704864, "grad_norm": 1.421875, "learning_rate": 0.00019820967809377357, "loss": 0.8791, "step": 935},
    {"epoch": 1.5498763396537512, "grad_norm": 0.80859375, "learning_rate": 0.00019815501672955358, "loss": 0.8887, "step": 940},
    {"epoch": 1.5581203627370157, "grad_norm": 6.0, "learning_rate": 0.0001980995411829749, "loss": 0.8955, "step": 945},
    {"epoch": 1.5663643858202803, "grad_norm": 0.8984375, "learning_rate": 0.00019804325191419956, "loss": 0.8991, "step": 950},
    {"epoch": 1.5746084089035448, "grad_norm": 1.4921875, "learning_rate": 0.00019798614939013932, "loss": 0.8916, "step": 955},
    {"epoch": 1.5828524319868096, "grad_norm": 1.3984375, "learning_rate": 0.00019792823408445174, "loss": 0.9048, "step": 960},
    {"epoch": 1.5910964550700744, "grad_norm": 1.1015625, "learning_rate": 0.0001978695064775363, "loss": 0.8828, "step": 965},
    {"epoch": 1.599340478153339, "grad_norm": 0.96875, "learning_rate": 0.00019780996705653044, "loss": 0.8864, "step": 970},
    {"epoch": 1.6075845012366035, "grad_norm": 0.99609375, "learning_rate": 0.00019774961631530545, "loss": 0.8908, "step": 975},
    {"epoch": 1.615828524319868, "grad_norm": 1.0390625, "learning_rate": 0.0001976884547544624, "loss": 0.8853, "step": 980},
    {"epoch": 1.6240725474031328, "grad_norm": 2.84375, "learning_rate": 0.0001976264828813281, "loss": 0.8835, "step": 985},
    {"epoch": 1.6323165704863973, "grad_norm": 2.296875, "learning_rate": 0.00019756370120995066, "loss": 0.8817, "step": 990},
    {"epoch": 1.640560593569662, "grad_norm": 27.25, "learning_rate": 0.0001975001102610954, "loss": 0.8972, "step": 995},
    {"epoch": 1.6488046166529267, "grad_norm": 9.75, "learning_rate": 0.0001974357105622405, "loss": 0.9076, "step": 1000},
    {"epoch": 1.6570486397361912, "grad_norm": 0.71484375, "learning_rate": 0.0001973705026475726, "loss": 0.9001, "step": 1005},
    {"epoch": 1.6652926628194558, "grad_norm": 1.984375, "learning_rate": 0.00019730448705798239, "loss": 0.9172, "step": 1010},
    {"epoch": 1.6735366859027205, "grad_norm": 1.375, "learning_rate": 0.0001972376643410601, "loss": 0.8945, "step": 1015},
    {"epoch": 1.6817807089859853, "grad_norm": 2.71875, "learning_rate": 0.00019717003505109095, "loss": 0.8857, "step": 1020},
    {"epoch": 1.6900247320692499, "grad_norm": 1.4375, "learning_rate": 0.00019710159974905064, "loss": 0.8852, "step": 1025},
    {"epoch": 1.6982687551525144, "grad_norm": 2.984375, "learning_rate": 0.00019703235900260055, "loss": 0.8795, "step": 1030},
    {"epoch": 1.706512778235779, "grad_norm": 1.2578125, "learning_rate": 0.00019696231338608316, "loss": 0.8926, "step": 1035},
    {"epoch": 1.7147568013190437, "grad_norm": 4.90625, "learning_rate": 0.00019689146348051719, "loss": 0.8927, "step": 1040},
    {"epoch": 1.7230008244023083, "grad_norm": 1.765625, "learning_rate": 0.0001968198098735929, "loss": 0.8762, "step": 1045},
    {"epoch": 1.731244847485573, "grad_norm": 6.75, "learning_rate": 0.0001967473531596671, "loss": 0.8886, "step": 1050},
    {"epoch": 1.7394888705688376, "grad_norm": 12.125, "learning_rate": 0.00019667409393975822, "loss": 0.8865, "step": 1055},
    {"epoch": 1.7477328936521022, "grad_norm": 1.171875, "learning_rate": 0.00019660003282154147, "loss": 0.887, "step": 1060},
    {"epoch": 1.7559769167353667, "grad_norm": 0.84765625, "learning_rate": 0.00019652517041934356, "loss": 0.8669, "step": 1065},
    {"epoch": 1.7642209398186315, "grad_norm": 0.7890625, "learning_rate": 0.00019644950735413788, "loss": 0.8774, "step": 1070},
    {"epoch": 1.7724649629018963, "grad_norm": 0.98828125, "learning_rate": 0.00019637304425353916, "loss": 0.8717, "step": 1075},
    {"epoch": 1.7807089859851608, "grad_norm": 0.7578125, "learning_rate": 0.0001962957817517982, "loss": 0.8769, "step": 1080},
    {"epoch": 1.7889530090684254, "grad_norm": 4.59375, "learning_rate": 0.0001962177204897969, "loss": 0.872, "step": 1085},
    {"epoch": 1.79719703215169, "grad_norm": 0.69140625, "learning_rate": 0.0001961388611150427, "loss": 0.8727, "step": 1090},
    {"epoch": 1.8054410552349547, "grad_norm": 2.171875, "learning_rate": 0.00019605920428166323, "loss": 0.8671, "step": 1095},
    {"epoch": 1.8136850783182195, "grad_norm": 7.78125, "learning_rate": 0.00019597875065040094, "loss": 0.8927, "step": 1100},
    {"epoch": 1.821929101401484, "grad_norm": 10.9375, "learning_rate": 0.00019589750088860766, "loss": 0.881, "step": 1105},
    {"epoch": 1.8301731244847486, "grad_norm": 1.1328125, "learning_rate": 0.000195815455670239, "loss": 0.8793, "step": 1110},
    {"epoch": 1.838417147568013, "grad_norm": 3.890625, "learning_rate": 0.00019573261567584874, "loss": 0.8795, "step": 1115},
    {"epoch": 1.8466611706512777, "grad_norm": 1.1171875, "learning_rate": 0.00019564898159258324, "loss": 0.8933, "step": 1120},
    {"epoch": 1.8549051937345424, "grad_norm": 0.921875, "learning_rate": 0.00019556455411417573, "loss": 0.8626, "step": 1125},
    {"epoch": 1.8631492168178072, "grad_norm": 1.5625, "learning_rate": 0.0001954793339409405, "loss": 0.8616, "step": 1130},
    {"epoch": 1.8713932399010718, "grad_norm": 2.625, "learning_rate": 0.00019539332177976714, "loss": 0.8693, "step": 1135},
    {"epoch": 1.8796372629843363, "grad_norm": 0.875, "learning_rate": 0.00019530651834411474, "loss": 0.8659, "step": 1140},
    {"epoch": 1.8878812860676009, "grad_norm": 6.0, "learning_rate": 0.00019521892435400587, "loss": 0.8666, "step": 1145},
    {"epoch": 1.8961253091508656, "grad_norm": 1.1484375, "learning_rate": 0.00019513054053602055, "loss": 0.8601, "step": 1150},
    {"epoch": 1.9043693322341304, "grad_norm": 2.125, "learning_rate": 0.00019504136762329047, "loss": 0.8631, "step": 1155},
    {"epoch": 1.912613355317395, "grad_norm": 3.296875, "learning_rate": 0.00019495140635549261, "loss": 0.8833, "step": 1160},
    {"epoch": 1.9208573784006595, "grad_norm": 2.4375, "learning_rate": 0.00019486065747884333, "loss": 0.8555, "step": 1165},
    {"epoch": 1.929101401483924, "grad_norm": 1.2734375, "learning_rate": 0.0001947691217460921, "loss": 0.8602, "step": 1170},
    {"epoch": 1.9373454245671888, "grad_norm": 1.546875, "learning_rate": 0.0001946767999165152, "loss": 0.8553, "step": 1175},
    {"epoch": 1.9455894476504534, "grad_norm": 0.94921875, "learning_rate": 0.00019458369275590954, "loss": 0.8588, "step": 1180},
    {"epoch": 1.9538334707337182, "grad_norm": 2.21875, "learning_rate": 0.00019448980103658613, "loss": 0.8529, "step": 1185},
    {"epoch": 1.9620774938169827, "grad_norm": 8.6875, "learning_rate": 0.00019439512553736394, "loss": 0.8441, "step": 1190},
    {"epoch": 1.9703215169002473, "grad_norm": 0.83984375, "learning_rate": 0.0001942996670435632, "loss": 0.8526, "step": 1195},
    {"epoch": 1.9785655399835118, "grad_norm": 6.0625, "learning_rate": 0.0001942034263469989, "loss": 0.8547, "step": 1200},
    {"epoch": 1.9868095630667766, "grad_norm": 13.0625, "learning_rate": 0.0001941064042459745, "loss": 0.8686, "step": 1205},
    {"epoch": 1.9950535861500414, "grad_norm": 0.7734375, "learning_rate": 0.00019400860154527493, "loss": 0.8499, "step": 1210},
    {"epoch": 2.0, "eval_loss": 2.4393434524536133, "eval_runtime": 0.2359, "eval_samples_per_second": 42.391, "eval_steps_per_second": 4.239, "step": 1213},
    {"epoch": 2.003297609233306, "grad_norm": 1.1328125, "learning_rate": 0.0001939100190561601, "loss": 0.8486, "step": 1215},
    {"epoch": 2.0115416323165705, "grad_norm": 2.515625, "learning_rate": 0.00019381065759635822, "loss": 0.8375, "step": 1220},
    {"epoch": 2.019785655399835, "grad_norm": 1.046875, "learning_rate": 0.0001937105179900589, "loss": 0.8531, "step": 1225},
    {"epoch": 2.0280296784830996, "grad_norm": 1.75, "learning_rate": 0.00019360960106790643, "loss": 0.8369, "step": 1230},
    {"epoch": 2.0362737015663646, "grad_norm": 0.58203125, "learning_rate": 0.00019350790766699282, "loss": 0.8276, "step": 1235},
    {"epoch": 2.044517724649629, "grad_norm": 1.0390625, "learning_rate": 0.0001934054386308508, "loss": 0.8289, "step": 1240},
    {"epoch": 2.0527617477328937, "grad_norm": 0.57421875, "learning_rate": 0.00019330219480944694, "loss": 0.8292, "step": 1245},
    {"epoch": 2.061005770816158, "grad_norm": 0.828125, "learning_rate": 0.0001931981770591745, "loss": 0.8305, "step": 1250},
    {"epoch": 2.0692497938994228, "grad_norm": 0.77734375, "learning_rate": 0.00019309338624284644, "loss": 0.8243, "step": 1255},
    {"epoch": 2.0774938169826878, "grad_norm": 1.265625, "learning_rate": 0.00019298782322968815, "loss": 0.8225, "step": 1260},
    {"epoch": 2.0857378400659523, "grad_norm": 4.03125, "learning_rate": 0.0001928814888953303, "loss": 0.8212, "step": 1265},
    {"epoch": 2.093981863149217, "grad_norm": 2.015625, "learning_rate": 0.0001927743841218016, "loss": 0.8188, "step": 1270},
    {"epoch": 2.1022258862324814, "grad_norm": 1.015625, "learning_rate": 0.00019266650979752136, "loss": 0.8209, "step": 1275},
    {"epoch": 2.110469909315746, "grad_norm": 8.25, "learning_rate": 0.00019255786681729225, "loss": 0.8242, "step": 1280},
    {"epoch": 2.1187139323990105, "grad_norm": 5.53125, "learning_rate": 0.00019244845608229293, "loss": 0.828, "step": 1285},
    {"epoch": 2.1269579554822755, "grad_norm": 0.6953125, "learning_rate": 0.00019233827850007027, "loss": 0.8159, "step": 1290},
    {"epoch": 2.13520197856554, "grad_norm": 7.03125, "learning_rate": 0.00019222733498453222, "loss": 0.8196, "step": 1295},
    {"epoch": 2.1434460016488046, "grad_norm": 0.84765625, "learning_rate": 0.00019211562645594002, "loss": 0.8231, "step": 1300},
    {"epoch": 2.151690024732069, "grad_norm": 0.474609375, "learning_rate": 0.00019200315384090044, "loss": 0.8073, "step": 1305},
    {"epoch": 2.1599340478153337, "grad_norm": 1.484375, "learning_rate": 0.00019188991807235844, "loss": 0.8255, "step": 1310},
    {"epoch": 2.1681780708985987, "grad_norm": 0.5859375, "learning_rate": 0.0001917759200895891, "loss": 0.8185, "step": 1315},
    {"epoch": 2.1764220939818633, "grad_norm": 8.0, "learning_rate": 0.00019166116083819002, "loss": 0.8174, "step": 1320},
    {"epoch": 2.184666117065128, "grad_norm": 0.96875, "learning_rate": 0.00019154564127007336, "loss": 0.8263, "step": 1325},
    {"epoch": 2.1929101401483924, "grad_norm": 1.171875, "learning_rate": 0.0001914293623434581, "loss": 0.8333, "step": 1330},
    {"epoch": 2.201154163231657, "grad_norm": 2.546875, "learning_rate": 0.00019131232502286188, "loss": 0.8227, "step": 1335},
    {"epoch": 2.209398186314922, "grad_norm": 1.1171875, "learning_rate": 0.00019119453027909323, "loss": 0.8123, "step": 1340},
    {"epoch": 2.2176422093981865, "grad_norm": 0.96484375, "learning_rate": 0.0001910759790892433, "loss": 0.8129, "step": 1345},
    {"epoch": 2.225886232481451, "grad_norm": 0.90625, "learning_rate": 0.0001909566724366779, "loss": 0.8101, "step": 1350},
    {"epoch": 2.2341302555647156, "grad_norm": 2.203125, "learning_rate": 0.00019083661131102933, "loss": 0.8205, "step": 1355},
    {"epoch": 2.24237427864798, "grad_norm": 0.9921875, "learning_rate": 0.00019071579670818808, "loss": 0.8228, "step": 1360},
    {"epoch": 2.2506183017312447, "grad_norm": 0.546875, "learning_rate": 0.00019059422963029464, "loss": 0.8123, "step": 1365},
    {"epoch": 2.2588623248145097, "grad_norm": 0.7421875, "learning_rate": 0.00019047191108573125, "loss": 0.8227, "step": 1370},
    {"epoch": 2.267106347897774, "grad_norm": 1.4609375, "learning_rate": 0.00019034884208911335, "loss": 0.814, "step": 1375},
    {"epoch": 2.2753503709810388, "grad_norm": 0.78515625, "learning_rate": 0.00019022502366128135, "loss": 0.819, "step": 1380},
    {"epoch": 2.2835943940643033, "grad_norm": 0.6484375, "learning_rate": 0.00019010045682929213, "loss": 0.8074, "step": 1385},
    {"epoch": 2.291838417147568, "grad_norm": 0.71484375, "learning_rate": 0.00018997514262641035, "loss": 0.8224, "step": 1390},
    {"epoch": 2.300082440230833, "grad_norm": 0.61328125, "learning_rate": 0.0001898490820921001, "loss": 0.8096, "step": 1395},
    {"epoch": 2.3083264633140974, "grad_norm": 0.51953125, "learning_rate": 0.00018972227627201617, "loss": 0.8102, "step": 1400},
    {"epoch": 2.316570486397362, "grad_norm": 0.482421875, "learning_rate": 0.0001895947262179954, "loss": 0.8113, "step": 1405},
    {"epoch": 2.3248145094806265, "grad_norm": 0.52734375, "learning_rate": 0.00018946643298804793, "loss": 0.8109, "step": 1410},
    {"epoch": 2.333058532563891, "grad_norm": 0.474609375, "learning_rate": 0.00018933739764634847, "loss": 0.809, "step": 1415},
    {"epoch": 2.3413025556471556, "grad_norm": 0.54296875, "learning_rate": 0.0001892076212632274, "loss": 0.8153, "step": 1420},
    {"epoch": 2.3495465787304206, "grad_norm": 0.578125, "learning_rate": 0.00018907710491516199, "loss": 0.8161, "step": 1425},
    {"epoch": 2.357790601813685, "grad_norm": 0.60546875, "learning_rate": 0.00018894584968476733, "loss": 0.8141, "step": 1430},
    {"epoch": 2.3660346248969497, "grad_norm": 0.6328125, "learning_rate": 0.00018881385666078755, "loss": 0.8102, "step": 1435},
    {"epoch": 2.3742786479802143, "grad_norm": 0.4921875, "learning_rate": 0.00018868112693808665, "loss": 0.8124, "step": 1440},
    {"epoch": 2.382522671063479, "grad_norm": 0.609375, "learning_rate": 0.00018854766161763932, "loss": 0.8033, "step": 1445},
    {"epoch": 2.390766694146744, "grad_norm": 0.59765625, "learning_rate": 0.00018841346180652213, "loss": 0.812, "step": 1450},
    {"epoch": 2.3990107172300084, "grad_norm": 0.46875, "learning_rate": 0.00018827852861790398, "loss": 0.8059, "step": 1455},
    {"epoch": 2.407254740313273, "grad_norm": 0.70703125, "learning_rate": 0.00018814286317103714, "loss": 0.8021, "step": 1460},
    {"epoch": 2.4154987633965375, "grad_norm": 1.4921875, "learning_rate": 0.00018800646659124782, "loss": 0.8036, "step": 1465},
    {"epoch": 2.423742786479802, "grad_norm": 0.6484375, "learning_rate": 0.00018786934000992688, "loss": 0.8045, "step": 1470},
    {"epoch": 2.4319868095630666, "grad_norm": 0.58984375, "learning_rate": 0.00018773148456452046, "loss": 0.8108, "step": 1475},
    {"epoch": 2.4402308326463316, "grad_norm": 0.73828125, "learning_rate": 0.00018759290139852048, "loss": 0.8097, "step": 1480},
    {"epoch": 2.448474855729596, "grad_norm": 0.79296875, "learning_rate": 0.00018745359166145523, "loss": 0.8052, "step": 1485},
    {"epoch": 2.4567188788128607, "grad_norm": 1.203125, "learning_rate": 0.00018731355650887985, "loss": 0.8016, "step": 1490},
    {"epoch": 2.464962901896125, "grad_norm": 0.4453125, "learning_rate": 0.00018717279710236666, "loss": 0.8077, "step": 1495},
    {"epoch": 2.4732069249793898, "grad_norm": 0.9921875, "learning_rate": 0.00018703131460949554, "loss": 0.8031, "step": 1500},
    {"epoch": 2.4814509480626548, "grad_norm": 5.46875, "learning_rate": 0.00018688911020384432, "loss": 0.8062, "step": 1505},
    {"epoch": 2.4896949711459193, "grad_norm": 0.7421875, "learning_rate": 0.000186746185064979, "loss": 0.8156, "step": 1510},
    {"epoch": 2.497938994229184, "grad_norm": 0.77734375, "learning_rate": 0.00018660254037844388, "loss": 0.8083, "step": 1515},
    {"epoch": 2.5061830173124484, "grad_norm": 0.70703125, "learning_rate": 0.00018645817733575193, "loss": 0.812, "step": 1520},
    {"epoch": 2.514427040395713, "grad_norm": 3.671875, "learning_rate": 0.00018631309713437467, "loss": 0.796, "step": 1525},
    {"epoch": 2.5226710634789775, "grad_norm": 0.6484375, "learning_rate": 0.0001861673009777325, "loss": 0.7988, "step": 1530},
    {"epoch": 2.5309150865622425, "grad_norm": 1.546875, "learning_rate": 0.00018602079007518438, "loss": 0.7988, "step": 1535},
    {"epoch": 2.539159109645507, "grad_norm": 0.4375, "learning_rate": 0.00018587356564201817, "loss": 0.8045, "step": 1540},
    {"epoch": 2.5474031327287716, "grad_norm": 0.44140625, "learning_rate": 0.0001857256288994402, "loss": 0.8112, "step": 1545},
    {"epoch": 2.555647155812036, "grad_norm": 0.56640625, "learning_rate": 0.00018557698107456549, "loss": 0.808, "step": 1550},
    {"epoch": 2.563891178895301, "grad_norm": 0.453125, "learning_rate": 0.00018542762340040722, "loss": 0.7958, "step": 1555},
    {"epoch": 2.5721352019785657, "grad_norm": 0.859375, "learning_rate": 0.00018527755711586678, "loss": 0.8008, "step": 1560},
    {"epoch": 2.5803792250618303, "grad_norm": 0.462890625, "learning_rate": 0.00018512678346572337, "loss": 0.7995, "step": 1565},
    {"epoch": 2.588623248145095, "grad_norm": 0.734375, "learning_rate": 0.00018497530370062363, "loss": 0.7974, "step": 1570},
    {"epoch": 2.5968672712283594, "grad_norm": 0.51171875, "learning_rate": 0.0001848231190770714, "loss": 0.7929, "step": 1575},
    {"epoch": 2.605111294311624, "grad_norm": 0.78125, "learning_rate": 0.00018467023085741717, "loss": 0.8014, "step": 1580},
    {"epoch": 2.6133553173948885, "grad_norm": 0.9140625, "learning_rate": 0.00018451664030984773, "loss": 0.7944, "step": 1585},
    {"epoch": 2.6215993404781535, "grad_norm": 0.4453125, "learning_rate": 0.00018436234870837547, "loss": 0.7937, "step": 1590},
    {"epoch": 2.629843363561418, "grad_norm": 1.0703125, "learning_rate": 0.00018420735733282807, "loss": 0.7983, "step": 1595},
    {"epoch": 2.6380873866446826, "grad_norm": 0.455078125, "learning_rate": 0.00018405166746883762, "loss": 0.7924, "step": 1600},
    {"epoch": 2.646331409727947, "grad_norm": 0.474609375, "learning_rate": 0.00018389528040783012, "loss": 0.7953, "step": 1605},
    {"epoch": 2.654575432811212, "grad_norm": 0.578125, "learning_rate": 0.00018373819744701476, "loss": 0.7893, "step": 1610},
    {"epoch": 2.6628194558944767, "grad_norm": 0.412109375, "learning_rate": 0.00018358041988937305, "loss": 0.7945, "step": 1615},
    {"epoch": 2.671063478977741, "grad_norm": 0.8125, "learning_rate": 0.00018342194904364813, "loss": 0.7894, "step": 1620},
    {"epoch": 2.6793075020610058, "grad_norm": 0.64453125, "learning_rate": 0.00018326278622433386, "loss": 0.7925, "step": 1625},
    {"epoch": 2.6875515251442703, "grad_norm": 0.5390625, "learning_rate": 0.00018310293275166392, "loss": 0.7978, "step": 1630},
    {"epoch": 2.695795548227535, "grad_norm": 0.63671875, "learning_rate": 0.00018294238995160094, "loss": 0.792, "step": 1635},
    {"epoch": 2.7040395713107994, "grad_norm": 0.671875, "learning_rate": 0.00018278115915582526, "loss": 0.8069, "step": 1640},
    {"epoch": 2.7122835943940644, "grad_norm": 1.515625, "learning_rate": 0.0001826192417017242, "loss": 0.8048, "step": 1645},
    {"epoch": 2.720527617477329, "grad_norm": 0.54296875, "learning_rate": 0.00018245663893238075, "loss": 0.8009, "step": 1650},
    {"epoch": 2.7287716405605935, "grad_norm": 0.6640625, "learning_rate": 0.0001822933521965625, "loss": 0.7903, "step": 1655},
    {"epoch": 2.737015663643858, "grad_norm": 0.48046875, "learning_rate": 0.00018212938284871047, "loss": 0.7917, "step": 1660},
    {"epoch": 2.745259686727123, "grad_norm": 0.58203125, "learning_rate": 0.00018196473224892784, "loss": 0.7886, "step": 1665},
    {"epoch": 2.7535037098103876, "grad_norm": 0.62890625, "learning_rate": 0.0001817994017629687, "loss": 0.7933, "step": 1670},
    {"epoch": 2.761747732893652, "grad_norm": 0.78515625, "learning_rate": 0.00018163339276222666, "loss": 0.792, "step": 1675},
    {"epoch": 2.7699917559769167, "grad_norm": 0.65625, "learning_rate": 0.00018146670662372354, "loss": 0.7825, "step": 1680},
    {"epoch": 2.7782357790601813, "grad_norm": 1.0234375, "learning_rate": 0.0001812993447300979, "loss": 0.7929, "step": 1685},
    {"epoch": 2.786479802143446, "grad_norm": 0.6171875, "learning_rate": 0.00018113130846959368, "loss": 0.7925, "step": 1690},
    {"epoch": 2.7947238252267104, "grad_norm": 0.48828125, "learning_rate": 0.0001809625992360485, "loss": 0.7888, "step": 1695},
    {"epoch": 2.8029678483099754, "grad_norm": 0.400390625, "learning_rate": 0.00018079321842888227, "loss": 0.7995, "step": 1700},
    {"epoch": 2.81121187139324, "grad_norm": 0.48828125, "learning_rate": 0.00018062316745308542, "loss": 0.7939, "step": 1705},
    {"epoch": 2.8194558944765045, "grad_norm": 0.45703125, "learning_rate": 0.0001804524477192075, "loss": 0.79, "step": 1710},
    {"epoch": 2.827699917559769, "grad_norm": 0.462890625, "learning_rate": 0.0001802810606433451, "loss": 0.7927, "step": 1715},
    {"epoch": 2.835943940643034, "grad_norm": 0.4609375, "learning_rate": 0.00018010900764713048, "loss": 0.796, "step": 1720},
    {"epoch": 2.8441879637262986, "grad_norm": 0.75, "learning_rate": 0.0001799362901577196, "loss": 0.7921, "step": 1725},
    {"epoch": 2.852431986809563, "grad_norm": 0.482421875, "learning_rate": 0.00017976290960778024, "loss": 0.79, "step": 1730},
    {"epoch": 2.8606760098928277, "grad_norm": 0.71484375, "learning_rate": 0.0001795888674354802, "loss": 0.7927, "step": 1735},
    {"epoch": 2.868920032976092, "grad_norm": 0.458984375, "learning_rate": 0.00017941416508447536, "loss": 0.7917, "step": 1740},
    {"epoch": 2.8771640560593568, "grad_norm": 1.2265625, "learning_rate": 0.0001792388040038977, "loss": 0.7905, "step": 1745},
    {"epoch": 2.8854080791426218, "grad_norm": 0.7578125, "learning_rate": 0.00017906278564834324, "loss": 0.7934, "step": 1750},
    {"epoch": 2.8936521022258863, "grad_norm": 0.4296875, "learning_rate": 0.00017888611147786002, "loss": 0.7957, "step": 1755},
    {"epoch": 2.901896125309151, "grad_norm": 0.55078125, "learning_rate": 0.00017870878295793598, "loss": 0.7793, "step": 1760},
    {"epoch": 2.9101401483924154, "grad_norm": 0.7421875, "learning_rate": 0.0001785308015594868, "loss": 0.7912, "step": 1765},
    {"epoch": 2.91838417147568, "grad_norm": 0.447265625, "learning_rate": 0.00017835216875884368, "loss": 0.7842, "step": 1770},
    {"epoch": 2.926628194558945, "grad_norm": 0.6640625, "learning_rate": 0.00017817288603774116, "loss": 0.784, "step": 1775},
    {"epoch": 2.9348722176422095, "grad_norm": 0.828125, "learning_rate": 0.00017799295488330467, "loss": 0.7934, "step": 1780},
    {"epoch": 2.943116240725474, "grad_norm": 0.53515625, "learning_rate": 0.00017781237678803847, "loss": 0.7867, "step": 1785},
    {"epoch": 2.9513602638087386, "grad_norm": 0.470703125, "learning_rate": 0.00017763115324981294, "loss": 0.7911, "step": 1790},
    {"epoch": 2.959604286892003, "grad_norm": 0.703125, "learning_rate": 0.00017744928577185243, "loss": 0.7914, "step": 1795},
    {"epoch": 2.9678483099752677, "grad_norm": 0.62109375, "learning_rate": 0.00017726677586272263, "loss": 0.7917, "step": 1800},
    {"epoch": 2.9760923330585327, "grad_norm": 0.455078125, "learning_rate": 0.00017708362503631814, "loss": 0.7819, "step": 1805},
    {"epoch": 2.9843363561417973, "grad_norm": 0.419921875, "learning_rate": 0.00017689983481184989, "loss": 0.7842, "step": 1810},
    {"epoch": 2.992580379225062, "grad_norm": 0.5078125, "learning_rate": 0.00017671540671383243, "loss": 0.7939, "step": 1815},
    {"epoch": 2.9991755976916736, "eval_loss": 2.4241690635681152, "eval_runtime": 0.2578, "eval_samples_per_second": 38.793, "eval_steps_per_second": 3.879, "step": 1819},
    {"epoch": 3.0008244023083264, "grad_norm": 0.416015625, "learning_rate": 0.00017653034227207152, "loss": 0.7885, "step": 1820},
    {"epoch": 3.009068425391591, "grad_norm": 1.0390625, "learning_rate": 0.00017634464302165124, "loss": 0.772, "step": 1825},
    {"epoch": 3.017312448474856, "grad_norm": 0.4765625, "learning_rate": 0.0001761583105029213, "loss": 0.7668, "step": 1830},
    {"epoch": 3.0255564715581205, "grad_norm": 1.1484375, "learning_rate": 0.00017597134626148427, "loss": 0.77, "step": 1835},
    {"epoch": 3.033800494641385, "grad_norm": 0.88671875, "learning_rate": 0.0001757837518481829, "loss": 0.7713, "step": 1840},
    {"epoch": 3.0420445177246496, "grad_norm": 0.74609375, "learning_rate": 0.00017559552881908695, "loss": 0.7748, "step": 1845},
    {"epoch": 3.050288540807914, "grad_norm": 0.6015625, "learning_rate": 0.00017540667873548063, "loss": 0.7653, "step": 1850},
    {"epoch": 3.058532563891179, "grad_norm": 0.50390625, "learning_rate": 0.00017521720316384935, "loss": 0.7706, "step": 1855},
    {"epoch": 3.0667765869744437, "grad_norm": 0.9140625, "learning_rate": 0.00017502710367586687, "loss": 0.7633, "step": 1860},
    {"epoch": 3.075020610057708, "grad_norm": 0.384765625, "learning_rate": 0.00017483638184838239, "loss": 0.7568, "step": 1865},
    {"epoch": 3.0832646331409728, "grad_norm": 0.5390625, "learning_rate": 0.0001746450392634071, "loss": 0.757, "step": 1870},
    {"epoch": 3.0915086562242373, "grad_norm": 0.44140625, "learning_rate": 0.0001744530775081015, "loss": 0.7701, "step": 1875},
    {"epoch": 3.099752679307502, "grad_norm": 0.44140625, "learning_rate": 0.00017426049817476197, "loss": 0.7717, "step": 1880},
    {"epoch": 3.107996702390767, "grad_norm": 0.52734375, "learning_rate": 0.00017406730286080753, "loss": 0.7647, "step": 1885},
    {"epoch": 3.1162407254740314, "grad_norm": 0.5, "learning_rate": 0.00017387349316876666, "loss": 0.7618, "step": 1890},
    {"epoch": 3.124484748557296, "grad_norm": 0.443359375, "learning_rate": 0.00017367907070626424, "loss": 0.7712, "step": 1895},
    {"epoch": 3.1327287716405605, "grad_norm": 0.51953125, "learning_rate": 0.00017348403708600772, "loss": 0.7635, "step": 1900},
    {"epoch": 3.140972794723825, "grad_norm": 0.58203125, "learning_rate": 0.0001732883939257742, "loss": 0.7591, "step": 1905},
    {"epoch": 3.14921681780709, "grad_norm": 0.48046875, "learning_rate": 0.00017309214284839678, "loss": 0.7664, "step": 1910},
    {"epoch": 3.1574608408903546, "grad_norm": 0.486328125, "learning_rate": 0.00017289528548175114, "loss": 0.7633, "step": 1915},
    {"epoch": 3.165704863973619, "grad_norm": 0.482421875, "learning_rate": 0.00017269782345874203, "loss": 0.7676, "step": 1920},
    {"epoch": 3.1739488870568837, "grad_norm": 0.45703125, "learning_rate": 0.0001724997584172898, "loss": 0.7712, "step": 1925},
    {"epoch": 3.1821929101401483, "grad_norm": 0.48046875, "learning_rate": 0.00017230109200031668, "loss": 0.7631, "step": 1930},
    {"epoch": 3.190436933223413, "grad_norm": 0.412109375,
|
"learning_rate": 0.00017210182585573327, |
|
"loss": 0.7664, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 3.198680956306678, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00017190196163642483, |
|
"loss": 0.7653, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.2069249793899424, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.0001717015010002376, |
|
"loss": 0.7677, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 3.215169002473207, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017150044560996488, |
|
"loss": 0.7628, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.2234130255564715, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00017129879713333356, |
|
"loss": 0.7604, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 3.231657048639736, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00017109655724298995, |
|
"loss": 0.7664, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.239901071723001, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00017089372761648616, |
|
"loss": 0.7679, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 3.2481450948062656, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017069030993626603, |
|
"loss": 0.7621, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.25638911788953, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00017048630588965117, |
|
"loss": 0.7747, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 3.2646331409727947, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00017028171716882714, |
|
"loss": 0.7655, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.272877164056059, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00017007654547082922, |
|
"loss": 0.768, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 3.281121187139324, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00016987079249752843, |
|
"loss": 0.7631, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.2893652102225888, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00016966445995561727, |
|
"loss": 0.7686, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 3.2976092333058533, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00016945754955659595, |
|
"loss": 0.7695, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.305853256389118, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00016925006301675763, |
|
"loss": 0.7548, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 3.3140972794723824, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0001690420020571747, |
|
"loss": 0.7642, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 3.322341302555647, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00016883336840368412, |
|
"loss": 0.7706, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 3.330585325638912, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.7693, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 3.3388293487221765, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016841438994206595, |
|
"loss": 0.7616, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 3.347073371805441, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.0001682040486093071, |
|
"loss": 0.7661, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 3.3553173948887056, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00016799314153334916, |
|
"loss": 0.7543, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 3.36356141797197, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00016778167046363734, |
|
"loss": 0.757, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 3.371805441055235, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00016756963715429502, |
|
"loss": 0.7647, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 3.3800494641384997, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00016735704336410943, |
|
"loss": 0.7562, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.3882934872217643, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001671438908565167, |
|
"loss": 0.7573, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 3.396537510305029, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016693018139958763, |
|
"loss": 0.7585, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 3.4047815333882934, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00016671591676601272, |
|
"loss": 0.7538, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 3.413025556471558, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00016650109873308765, |
|
"loss": 0.7635, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 3.421269579554823, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00016628572908269841, |
|
"loss": 0.7605, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 3.4295136026380875, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00016606980960130665, |
|
"loss": 0.7511, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 3.437757625721352, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016585334207993476, |
|
"loss": 0.757, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 3.4460016488046166, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00016563632831415102, |
|
"loss": 0.7616, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 3.454245671887881, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00016541877010405477, |
|
"loss": 0.7605, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 3.462489694971146, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00016520066925426144, |
|
"loss": 0.7564, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.4707337180544107, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00016498202757388758, |
|
"loss": 0.7627, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 3.478977741137675, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001647628468765358, |
|
"loss": 0.7514, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 3.4872217642209398, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0001645431289802799, |
|
"loss": 0.7616, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 3.4954657873042043, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00016432287570764952, |
|
"loss": 0.7639, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 3.503709810387469, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001641020888856153, |
|
"loss": 0.7642, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 3.511953833470734, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00016388077034557355, |
|
"loss": 0.7511, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 3.5201978565539984, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0001636589219233311, |
|
"loss": 0.7513, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 3.528441879637263, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00016343654545909007, |
|
"loss": 0.7568, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 3.5366859027205275, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00016321364279743266, |
|
"loss": 0.7562, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 3.5449299258037925, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00016299021578730579, |
|
"loss": 0.7591, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.553173948887057, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00016276626628200568, |
|
"loss": 0.7665, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 3.5614179719703216, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00016254179613916278, |
|
"loss": 0.7604, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 3.569661995053586, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.000162316807220726, |
|
"loss": 0.7504, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 3.5779060181368507, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00016209130139294744, |
|
"loss": 0.7646, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 3.5861500412201153, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00016186528052636692, |
|
"loss": 0.7562, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 3.59439406430338, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00016163874649579647, |
|
"loss": 0.7501, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 3.602638087386645, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00016141170118030463, |
|
"loss": 0.7548, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 3.6108821104699094, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001611841464632011, |
|
"loss": 0.7582, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 3.619126133553174, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00016095608423202098, |
|
"loss": 0.7517, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 3.6273701566364385, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00016072751637850904, |
|
"loss": 0.7563, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.6356141797197035, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00016049844479860422, |
|
"loss": 0.7566, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 3.643858202802968, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00016026887139242372, |
|
"loss": 0.7515, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.6521022258862326, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001600387980642474, |
|
"loss": 0.754, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 3.660346248969497, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001598082267225018, |
|
"loss": 0.7608, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.6685902720527617, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001595771592797445, |
|
"loss": 0.7574, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 3.676834295136026, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.0001593455976526482, |
|
"loss": 0.7526, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.6850783182192908, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0001591135437619847, |
|
"loss": 0.7546, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 3.6933223413025558, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00015888099953260905, |
|
"loss": 0.7574, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.7015663643858203, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.0001586479668934437, |
|
"loss": 0.7548, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 3.709810387469085, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0001584144477774623, |
|
"loss": 0.7519, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.7180544105523494, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0001581804441216738, |
|
"loss": 0.761, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 3.7262984336356144, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00015794595786710632, |
|
"loss": 0.7552, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.734542456718879, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00015771099095879108, |
|
"loss": 0.7573, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 3.7427864798021435, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.00015747554534574626, |
|
"loss": 0.753, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.751030502885408, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001572396229809608, |
|
"loss": 0.7587, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 3.7592745259686726, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00015700322582137827, |
|
"loss": 0.7505, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.767518549051937, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001567663558278806, |
|
"loss": 0.747, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 3.7757625721352017, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001565290149652718, |
|
"loss": 0.763, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.7840065952184667, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00015629120520226165, |
|
"loss": 0.7547, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 3.7922506183017313, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00015605292851144942, |
|
"loss": 0.7537, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.800494641384996, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00015581418686930743, |
|
"loss": 0.754, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 3.8087386644682604, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00015557498225616487, |
|
"loss": 0.7407, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.8169826875515254, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00015533531665619098, |
|
"loss": 0.7556, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 3.82522671063479, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00015509519205737896, |
|
"loss": 0.7516, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.8334707337180545, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.0001548546104515294, |
|
"loss": 0.7506, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 3.841714756801319, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.0001546135738342335, |
|
"loss": 0.7524, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.8499587798845836, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.0001543720842048569, |
|
"loss": 0.748, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 3.858202802967848, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00015413014356652286, |
|
"loss": 0.7503, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.8664468260511127, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00015388775392609564, |
|
"loss": 0.754, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 3.8746908491343777, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.000153644917294164, |
|
"loss": 0.7511, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.882934872217642, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0001534016356850244, |
|
"loss": 0.7492, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 3.8911788953009068, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00015315791111666425, |
|
"loss": 0.7529, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.8994229183841713, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00015291374561074536, |
|
"loss": 0.7481, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 3.9076669414674363, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.000152669141192587, |
|
"loss": 0.752, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.915910964550701, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00015242409989114916, |
|
"loss": 0.7389, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 3.9241549876339654, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00015217862373901575, |
|
"loss": 0.7521, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.93239901071723, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001519327147723776, |
|
"loss": 0.742, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 3.9406430338004945, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015168637503101584, |
|
"loss": 0.7499, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.948887056883759, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00015143960655828468, |
|
"loss": 0.7516, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 3.957131079967024, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00015119241140109467, |
|
"loss": 0.7493, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.9653751030502886, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.0001509447916098956, |
|
"loss": 0.7445, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 3.973619126133553, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0001506967492386596, |
|
"loss": 0.7535, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.9818631492168177, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.000150448286344864, |
|
"loss": 0.7411, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 3.9901071723000827, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00015019940498947428, |
|
"loss": 0.7484, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.9983511953833473, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00014995010723692714, |
|
"loss": 0.7465, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.436275005340576, |
|
"eval_runtime": 0.2365, |
|
"eval_samples_per_second": 42.283, |
|
"eval_steps_per_second": 4.228, |
|
"step": 2426 |
|
}, |
|
{ |
|
"epoch": 4.006595218466612, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00014970039515511304, |
|
"loss": 0.7483, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.014839241549876, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00014945027081535937, |
|
"loss": 0.7256, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 4.023083264633141, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00014919973629241314, |
|
"loss": 0.7386, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.0313272877164055, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001489487936644237, |
|
"loss": 0.7329, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 4.03957131079967, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00014869744501292561, |
|
"loss": 0.7317, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.047815333882935, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00014844569242282148, |
|
"loss": 0.7278, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 4.056059356966199, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00014819353798236427, |
|
"loss": 0.73, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.064303380049465, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001479409837831404, |
|
"loss": 0.7357, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 4.072547403132729, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00014768803192005223, |
|
"loss": 0.7341, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.080791426215994, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00014743468449130063, |
|
"loss": 0.7367, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 4.089035449299258, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00014718094359836772, |
|
"loss": 0.7322, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.097279472382523, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00014692681134599925, |
|
"loss": 0.73, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 4.105523495465787, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001466722898421873, |
|
"loss": 0.7364, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.113767518549052, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00014641738119815266, |
|
"loss": 0.7267, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 4.122011541632316, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00014616208752832758, |
|
"loss": 0.7282, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.130255564715581, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00014590641095033787, |
|
"loss": 0.7251, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 4.1384995877988455, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001456503535849855, |
|
"loss": 0.7391, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 4.14674361088211, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.0001453939175562312, |
|
"loss": 0.7346, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 4.1549876339653755, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00014513710499117647, |
|
"loss": 0.7362, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 4.16323165704864, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.731, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 4.171475680131905, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00014462235877617098, |
|
"loss": 0.7285, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 4.179719703215169, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.0001443644293959693, |
|
"loss": 0.7386, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 4.187963726298434, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.00014410613201892985, |
|
"loss": 0.7376, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 4.196207749381698, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001438474687875938, |
|
"loss": 0.731, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 4.204451772464963, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00014358844184753712, |
|
"loss": 0.7238, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.212695795548227, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00014332905334735261, |
|
"loss": 0.7246, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 4.220939818631492, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00014306930543863219, |
|
"loss": 0.7394, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 4.2291838417147565, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00014280920027594907, |
|
"loss": 0.7306, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 4.237427864798021, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00014254874001683976, |
|
"loss": 0.7418, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 4.2456718878812865, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00014228792682178623, |
|
"loss": 0.7291, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 4.253915910964551, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00014202676285419812, |
|
"loss": 0.7273, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 4.262159934047816, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00014176525028039452, |
|
"loss": 0.7311, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 4.27040395713108, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00014150339126958633, |
|
"loss": 0.7214, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 4.278647980214345, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00014124118799385796, |
|
"loss": 0.7324, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 4.286892003297609, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00014097864262814955, |
|
"loss": 0.7397, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.295136026380874, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00014071575735023875, |
|
"loss": 0.7382, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 4.303380049464138, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001404525343407228, |
|
"loss": 0.7324, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 4.311624072547403, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00014018897578300035, |
|
"loss": 0.7327, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 4.319868095630667, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.0001399250838632533, |
|
"loss": 0.7419, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 4.328112118713932, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001396608607704289, |
|
"loss": 0.738, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 4.336356141797197, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00013939630869622133, |
|
"loss": 0.7412, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 4.344600164880462, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00013913142983505364, |
|
"loss": 0.7336, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 4.3528441879637265, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00013886622638405952, |
|
"loss": 0.7282, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 4.361088211046991, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00013860070054306516, |
|
"loss": 0.7306, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 4.369332234130256, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0001383348545145708, |
|
"loss": 0.7279, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.37757625721352, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001380686905037327, |
|
"loss": 0.7355, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 4.385820280296785, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00013780221071834476, |
|
"loss": 0.7336, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 4.394064303380049, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001375354173688201, |
|
"loss": 0.7314, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 4.402308326463314, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00013726831266817278, |
|
"loss": 0.7344, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 4.410552349546578, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00013700089883199966, |
|
"loss": 0.7361, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 4.418796372629844, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001367331780784616, |
|
"loss": 0.7322, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 4.427040395713108, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00013646515262826552, |
|
"loss": 0.7332, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 4.435284418796373, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00013619682470464558, |
|
"loss": 0.7321, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 4.4435284418796375, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00013592819653334505, |
|
"loss": 0.7262, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 4.451772464962902, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0001356592703425976, |
|
"loss": 0.7273, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.460016488046167, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00013539004836310894, |
|
"loss": 0.7378, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 4.468260511129431, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001351205328280385, |
|
"loss": 0.7254, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 4.476504534212696, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00013485072597298038, |
|
"loss": 0.729, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 4.48474855729596, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00013458063003594543, |
|
"loss": 0.7375, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 4.492992580379225, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001343102472573423, |
|
"loss": 0.7278, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 4.501236603462489, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00013403957987995882, |
|
"loss": 0.7363, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 4.509480626545754, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00013376863014894375, |
|
"loss": 0.7341, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 4.517724649629019, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00013349740031178784, |
|
"loss": 0.7325, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 4.525968672712284, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00013322589261830517, |
|
"loss": 0.7376, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 4.534212695795548, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00013295410932061478, |
|
"loss": 0.727, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.542456718878813, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00013268205267312174, |
|
"loss": 0.729, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 4.5507007419620775, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00013240972493249847, |
|
"loss": 0.7355, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 4.558944765045342, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00013213712835766607, |
|
"loss": 0.7362, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 4.567188788128607, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0001318642652097757, |
|
"loss": 0.7319, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 4.575432811211871, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00013159113775218964, |
|
"loss": 0.7265, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 4.583676834295136, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00013131774825046245, |
|
"loss": 0.7343, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 4.5919208573784, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00013104409897232258, |
|
"loss": 0.7231, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 4.600164880461666, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00013077019218765305, |
|
"loss": 0.7305, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 4.60840890354493, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00013049603016847296, |
|
"loss": 0.7311, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 4.616652926628195, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00013022161518891855, |
|
"loss": 0.7347, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.624896949711459, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00012994694952522435, |
|
"loss": 0.7395, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 4.633140972794724, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00012967203545570418, |
|
"loss": 0.7332, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 4.6413849958779885, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0001293968752607325, |
|
"loss": 0.7326, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 4.649629018961253, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00012912147122272523, |
|
"loss": 0.7317, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 4.657873042044518, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00012884582562612095, |
|
"loss": 0.7336, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 4.666117065127782, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00012856994075736197, |
|
"loss": 0.7283, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 4.674361088211047, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00012829381890487536, |
|
"loss": 0.7366, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 4.682605111294311, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00012801746235905384, |
|
"loss": 0.7377, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 4.690849134377576, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00012774087341223695, |
|
"loss": 0.7357, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 4.699093157460841, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00012746405435869198, |
|
"loss": 0.7307, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.707337180544106, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00012718700749459486, |
|
"loss": 0.7307, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 4.71558120362737, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.0001269097351180112, |
|
"loss": 0.7244, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 4.723825226710635, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00012663223952887723, |
|
"loss": 0.7321, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 4.732069249793899, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0001263545230289807, |
|
"loss": 0.7243, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 4.740313272877164, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00012607658792194174, |
|
"loss": 0.7282, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 4.7485572959604285, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0001257984365131938, |
|
"loss": 0.7239, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 4.756801319043693, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00012552007110996463, |
|
"loss": 0.7273, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 4.765045342126958, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00012524149402125685, |
|
"loss": 0.7251, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 4.773289365210223, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00012496270755782914, |
|
"loss": 0.739, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 4.781533388293488, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00012468371403217684, |
|
"loss": 0.7344, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.789777411376752, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00012440451575851285, |
|
"loss": 0.7314, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 4.798021434460017, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00012412511505274844, |
|
"loss": 0.7269, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 4.806265457543281, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00012384551423247407, |
|
"loss": 0.7292, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 4.814509480626546, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00012356571561693996, |
|
"loss": 0.7227, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 4.82275350370981, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00012328572152703725, |
|
"loss": 0.7311, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 4.830997526793075, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00012300553428527832, |
|
"loss": 0.7315, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 4.8392415498763395, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00012272515621577782, |
|
"loss": 0.7376, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 4.847485572959604, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00012244458964423327, |
|
"loss": 0.7305, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 4.855729596042869, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00012216383689790574, |
|
"loss": 0.7279, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 4.863973619126133, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00012188290030560063, |
|
"loss": 0.7299, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 4.872217642209399, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00012160178219764837, |
|
"loss": 0.7253, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 4.880461665292663, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00012132048490588492, |
|
"loss": 0.7291, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 4.888705688375928, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00012103901076363269, |
|
"loss": 0.7244, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 4.896949711459192, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0001207573621056809, |
|
"loss": 0.7279, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 4.905193734542457, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00012047554126826643, |
|
"loss": 0.7297, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 4.913437757625721, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00012019355058905435, |
|
"loss": 0.7285, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 4.921681780708986, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00011991139240711857, |
|
"loss": 0.7312, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 4.92992580379225, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00011962906906292238, |
|
"loss": 0.7284, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 4.938169826875515, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00011934658289829902, |
|
"loss": 0.7336, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 4.9464138499587795, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00011906393625643244, |
|
"loss": 0.7281, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.954657873042045, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00011878113148183758, |
|
"loss": 0.7271, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 4.9629018961253095, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00011849817092034118, |
|
"loss": 0.7229, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 4.971145919208574, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00011821505691906216, |
|
"loss": 0.7318, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 4.979389942291839, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00011793179182639218, |
|
"loss": 0.7366, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 4.987633965375103, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00011764837799197622, |
|
"loss": 0.7337, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 4.995877988458368, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.7312, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 4.999175597691673, |
|
"eval_loss": 2.439051389694214, |
|
"eval_runtime": 0.2596, |
|
"eval_samples_per_second": 38.523, |
|
"eval_steps_per_second": 3.852, |
|
"step": 3032 |
|
}, |
|
{ |
|
"epoch": 5.004122011541632, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0001170811135026357, |
|
"loss": 0.7263, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 5.012366034624897, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00011679726755309205, |
|
"loss": 0.7183, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 5.020610057708161, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00011651328227252517, |
|
"loss": 0.723, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 5.028854080791426, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00011622916001655388, |
|
"loss": 0.7185, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.0370981038746905, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00011594490314193323, |
|
"loss": 0.7132, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 5.045342126957956, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00011566051400653486, |
|
"loss": 0.7054, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 5.0535861500412205, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00011537599496932752, |
|
"loss": 0.7197, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 5.061830173124485, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00011509134839035748, |
|
"loss": 0.7157, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 5.07007419620775, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00011480657663072896, |
|
"loss": 0.7093, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 5.078318219291014, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0001145216820525845, |
|
"loss": 0.7286, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 5.086562242374279, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00011423666701908547, |
|
"loss": 0.7105, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 5.094806265457543, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00011395153389439233, |
|
"loss": 0.7072, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 5.103050288540808, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00011366628504364509, |
|
"loss": 0.7156, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 5.111294311624072, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00011338092283294377, |
|
"loss": 0.7052, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.119538334707337, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00011309544962932862, |
|
"loss": 0.7197, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 5.127782357790601, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00011280986780076057, |
|
"loss": 0.7195, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 5.136026380873867, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00011252417971610163, |
|
"loss": 0.7062, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 5.144270403957131, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00011223838774509514, |
|
"loss": 0.7225, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 5.152514427040396, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00011195249425834615, |
|
"loss": 0.7106, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 5.1607584501236605, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00011166650162730188, |
|
"loss": 0.7174, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 5.169002473206925, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00011138041222423177, |
|
"loss": 0.7208, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 5.17724649629019, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00011109422842220805, |
|
"loss": 0.716, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 5.185490519373454, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00011080795259508608, |
|
"loss": 0.717, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 5.193734542456719, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00011052158711748434, |
|
"loss": 0.7093, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.201978565539983, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00011023513436476511, |
|
"loss": 0.7129, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 5.210222588623248, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00010994859671301462, |
|
"loss": 0.7168, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 5.218466611706512, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0001096619765390232, |
|
"loss": 0.7158, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 5.226710634789778, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00010937527622026575, |
|
"loss": 0.7229, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 5.234954657873042, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00010908849813488203, |
|
"loss": 0.7151, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 5.243198680956307, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00010880164466165674, |
|
"loss": 0.7185, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 5.2514427040395715, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00010851471817999997, |
|
"loss": 0.7113, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 5.259686727122836, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00010822772106992747, |
|
"loss": 0.7178, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 5.267930750206101, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00010794065571204072, |
|
"loss": 0.7106, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 5.276174773289365, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001076535244875074, |
|
"loss": 0.7136, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.28441879637263, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00010736632977804149, |
|
"loss": 0.7138, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 5.292662819455894, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00010707907396588361, |
|
"loss": 0.7192, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 5.300906842539159, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00010679175943378119, |
|
"loss": 0.7068, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 5.309150865622423, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00010650438856496872, |
|
"loss": 0.7095, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 5.317394888705689, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00010621696374314807, |
|
"loss": 0.7118, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 5.325638911788953, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00010592948735246854, |
|
"loss": 0.711, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 5.333882934872218, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00010564196177750725, |
|
"loss": 0.7172, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 5.342126957955482, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0001053543894032493, |
|
"loss": 0.7171, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 5.350370981038747, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 0.00010506677261506797, |
|
"loss": 0.7153, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 5.3586150041220115, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00010477911379870488, |
|
"loss": 0.7162, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.366859027205276, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00010449141534025045, |
|
"loss": 0.7067, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 5.375103050288541, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00010420367962612372, |
|
"loss": 0.7117, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 5.383347073371805, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00010391590904305284, |
|
"loss": 0.7175, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 5.39159109645507, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00010362810597805526, |
|
"loss": 0.7109, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 5.399835119538334, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00010334027281841781, |
|
"loss": 0.7136, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 5.4080791426216, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00010305241195167687, |
|
"loss": 0.7123, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 5.416323165704864, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00010276452576559879, |
|
"loss": 0.7132, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 5.424567188788129, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00010247661664815986, |
|
"loss": 0.7161, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 5.432811211871393, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00010218868698752658, |
|
"loss": 0.7122, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 5.441055234954658, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00010190073917203589, |
|
"loss": 0.7167, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.4492992580379225, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00010161277559017528, |
|
"loss": 0.7143, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 5.457543281121187, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00010132479863056303, |
|
"loss": 0.7163, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 5.465787304204452, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00010103681068192845, |
|
"loss": 0.7173, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 5.474031327287716, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00010074881413309193, |
|
"loss": 0.714, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 5.482275350370981, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00010046081137294516, |
|
"loss": 0.7128, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 5.490519373454246, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00010017280479043147, |
|
"loss": 0.7242, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 5.498763396537511, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 9.988479677452584e-05, |
|
"loss": 0.7196, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 5.507007419620775, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 9.959678971421508e-05, |
|
"loss": 0.714, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 5.51525144270404, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 9.930878599847821e-05, |
|
"loss": 0.7173, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 5.523495465787304, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 9.902078801626636e-05, |
|
"loss": 0.7137, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 5.531739488870569, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 9.873279815648318e-05, |
|
"loss": 0.7125, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 5.539983511953833, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.844481880796491e-05, |
|
"loss": 0.7173, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 5.548227535037098, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 9.815685235946068e-05, |
|
"loss": 0.7134, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 5.5564715581203625, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 9.786890119961253e-05, |
|
"loss": 0.7199, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 5.564715581203627, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 9.758096771693573e-05, |
|
"loss": 0.7116, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 5.572959604286892, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 9.729305429979887e-05, |
|
"loss": 0.7131, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 5.581203627370156, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 9.700516333640415e-05, |
|
"loss": 0.7172, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 5.589447650453422, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 9.671729721476746e-05, |
|
"loss": 0.7121, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 5.597691673536686, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 9.642945832269874e-05, |
|
"loss": 0.7187, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 5.605935696619951, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 9.614164904778196e-05, |
|
"loss": 0.7108, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.614179719703215, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 9.585387177735547e-05, |
|
"loss": 0.7099, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 5.62242374278648, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 9.556612889849214e-05, |
|
"loss": 0.7169, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 5.630667765869744, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 9.527842279797953e-05, |
|
"loss": 0.7118, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 5.638911788953009, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.499075586230013e-05, |
|
"loss": 0.7148, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 5.6471558120362735, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.470313047761167e-05, |
|
"loss": 0.7166, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 5.655399835119538, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 9.44155490297271e-05, |
|
"loss": 0.7156, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 5.663643858202803, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 9.412801390409497e-05, |
|
"loss": 0.707, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 5.671887881286068, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 9.38405274857796e-05, |
|
"loss": 0.7125, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 5.680131904369333, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 9.355309215944124e-05, |
|
"loss": 0.7153, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 5.688375927452597, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 9.326571030931637e-05, |
|
"loss": 0.7143, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 5.696619950535862, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 9.297838431919794e-05, |
|
"loss": 0.7192, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 5.704863973619126, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 9.269111657241548e-05, |
|
"loss": 0.7151, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 5.713107996702391, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 9.240390945181543e-05, |
|
"loss": 0.7171, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 5.721352019785655, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 9.211676533974131e-05, |
|
"loss": 0.7111, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 5.72959604286892, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 9.182968661801412e-05, |
|
"loss": 0.7111, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 5.737840065952184, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 9.154267566791223e-05, |
|
"loss": 0.7211, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 5.746084089035449, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 9.125573487015203e-05, |
|
"loss": 0.7165, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 5.7543281121187135, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 9.096886660486797e-05, |
|
"loss": 0.7082, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 5.762572135201978, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 9.068207325159284e-05, |
|
"loss": 0.7136, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 5.7708161582852435, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 9.039535718923804e-05, |
|
"loss": 0.714, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.779060181368508, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 9.01087207960739e-05, |
|
"loss": 0.7174, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 5.787304204451773, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.982216644970979e-05, |
|
"loss": 0.7071, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 5.795548227535037, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 8.953569652707459e-05, |
|
"loss": 0.7081, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 5.803792250618302, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 8.924931340439694e-05, |
|
"loss": 0.7124, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 5.812036273701566, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 8.896301945718541e-05, |
|
"loss": 0.7115, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 5.820280296784831, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 8.867681706020894e-05, |
|
"loss": 0.7134, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 5.828524319868095, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.7169, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 5.83676834295136, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.810469641222001e-05, |
|
"loss": 0.7154, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 5.845012366034625, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 8.781878290686959e-05, |
|
"loss": 0.7182, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 5.85325638911789, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 8.753297044303896e-05, |
|
"loss": 0.7128, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 5.8615004122011545, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 8.724726139150318e-05, |
|
"loss": 0.7083, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 5.869744435284419, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 8.696165812217953e-05, |
|
"loss": 0.7175, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 5.877988458367684, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 8.667616300410778e-05, |
|
"loss": 0.7174, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 5.886232481450948, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 8.639077840543077e-05, |
|
"loss": 0.7173, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 5.894476504534213, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.610550669337433e-05, |
|
"loss": 0.7147, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 5.902720527617477, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 8.582035023422815e-05, |
|
"loss": 0.7169, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 5.910964550700742, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 8.553531139332582e-05, |
|
"loss": 0.7237, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 5.919208573784006, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 8.525039253502529e-05, |
|
"loss": 0.7134, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 5.927452596867271, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 8.496559602268928e-05, |
|
"loss": 0.7189, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 5.935696619950535, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 8.468092421866573e-05, |
|
"loss": 0.717, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 5.943940643033801, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 8.439637948426801e-05, |
|
"loss": 0.7094, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 5.952184666117065, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 8.411196417975558e-05, |
|
"loss": 0.7019, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 5.96042868920033, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 8.382768066431425e-05, |
|
"loss": 0.7127, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 5.9686727122835945, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 8.354353129603668e-05, |
|
"loss": 0.7133, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 5.976916735366859, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 8.325951843190274e-05, |
|
"loss": 0.7182, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 5.985160758450124, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 8.297564442776014e-05, |
|
"loss": 0.7053, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 5.993404781533388, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 8.269191163830467e-05, |
|
"loss": 0.7253, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.459299325942993, |
|
"eval_runtime": 0.2463, |
|
"eval_samples_per_second": 40.595, |
|
"eval_steps_per_second": 4.059, |
|
"step": 3639 |
|
}, |
|
{ |
|
"epoch": 6.001648804616653, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 8.240832241706068e-05, |
|
"loss": 0.7144, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 6.009892827699917, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 8.212487911636184e-05, |
|
"loss": 0.7102, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 6.018136850783182, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 8.184158408733131e-05, |
|
"loss": 0.7073, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 6.026380873866446, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 8.155843967986236e-05, |
|
"loss": 0.6914, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 6.034624896949712, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 8.127544824259889e-05, |
|
"loss": 0.7095, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 6.042868920032976, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 8.099261212291601e-05, |
|
"loss": 0.7006, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 6.051112943116241, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 8.070993366690029e-05, |
|
"loss": 0.6983, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 6.0593569661995055, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 8.042741521933071e-05, |
|
"loss": 0.7086, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 6.06760098928277, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 8.014505912365893e-05, |
|
"loss": 0.7039, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 6.075845012366035, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 7.986286772198986e-05, |
|
"loss": 0.7056, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 6.084089035449299, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 7.958084335506239e-05, |
|
"loss": 0.6957, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 6.092333058532564, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 7.929898836222983e-05, |
|
"loss": 0.7052, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 6.100577081615828, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 7.90173050814406e-05, |
|
"loss": 0.6982, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 6.108821104699093, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 7.873579584921869e-05, |
|
"loss": 0.7029, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 6.117065127782358, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 7.84544630006445e-05, |
|
"loss": 0.7015, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 6.125309150865623, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 7.817330886933527e-05, |
|
"loss": 0.7073, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 6.133553173948887, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 7.789233578742582e-05, |
|
"loss": 0.7092, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 6.141797197032152, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 7.761154608554927e-05, |
|
"loss": 0.7025, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 6.150041220115416, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 7.733094209281756e-05, |
|
"loss": 0.7048, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 6.158285243198681, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 7.705052613680211e-05, |
|
"loss": 0.7029, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 6.1665292662819455, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 7.677030054351477e-05, |
|
"loss": 0.701, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 6.17477328936521, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 7.649026763738827e-05, |
|
"loss": 0.7067, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 6.183017312448475, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 7.6210429741257e-05, |
|
"loss": 0.7055, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 6.191261335531739, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 7.593078917633787e-05, |
|
"loss": 0.7104, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 6.199505358615004, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 7.565134826221083e-05, |
|
"loss": 0.703, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 6.207749381698269, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 7.537210931679987e-05, |
|
"loss": 0.6998, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 6.215993404781534, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 7.509307465635358e-05, |
|
"loss": 0.6976, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 6.224237427864798, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 7.481424659542609e-05, |
|
"loss": 0.7025, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 6.232481450948063, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 7.453562744685778e-05, |
|
"loss": 0.6971, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 6.240725474031327, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 7.425721952175618e-05, |
|
"loss": 0.6984, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 6.248969497114592, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 7.39790251294767e-05, |
|
"loss": 0.7012, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 6.2572135201978565, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 7.370104657760361e-05, |
|
"loss": 0.7012, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 6.265457543281121, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 7.342328617193067e-05, |
|
"loss": 0.7069, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 6.273701566364386, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 7.314574621644225e-05, |
|
"loss": 0.6998, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 6.28194558944765, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 7.286842901329412e-05, |
|
"loss": 0.695, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 6.290189612530915, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 7.259133686279429e-05, |
|
"loss": 0.7045, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 6.29843363561418, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 7.231447206338407e-05, |
|
"loss": 0.7062, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 6.306677658697445, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 7.203783691161883e-05, |
|
"loss": 0.6975, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 6.314921681780709, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 7.176143370214914e-05, |
|
"loss": 0.7035, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 6.323165704863974, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 7.148526472770154e-05, |
|
"loss": 0.7071, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 6.331409727947238, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 7.12093322790597e-05, |
|
"loss": 0.7022, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 6.339653751030503, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 7.09336386450453e-05, |
|
"loss": 0.7104, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 6.347897774113767, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 7.065818611249915e-05, |
|
"loss": 0.7028, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 6.356141797197032, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 7.038297696626206e-05, |
|
"loss": 0.7049, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 6.3643858202802965, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 7.010801348915608e-05, |
|
"loss": 0.7074, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 6.372629843363561, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 6.983329796196534e-05, |
|
"loss": 0.7001, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 6.380873866446826, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 6.955883266341741e-05, |
|
"loss": 0.7006, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 6.389117889530091, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 6.928461987016413e-05, |
|
"loss": 0.7113, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 6.397361912613356, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 6.901066185676295e-05, |
|
"loss": 0.6964, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 6.40560593569662, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 6.873696089565786e-05, |
|
"loss": 0.7086, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 6.413849958779885, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 6.846351925716068e-05, |
|
"loss": 0.698, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 6.422093981863149, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 6.819033920943219e-05, |
|
"loss": 0.6997, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 6.430338004946414, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 6.791742301846326e-05, |
|
"loss": 0.7031, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 6.438582028029678, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 6.764477294805615e-05, |
|
"loss": 0.7026, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 6.446826051112943, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 6.737239125980573e-05, |
|
"loss": 0.7006, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 6.4550700741962075, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 6.710028021308061e-05, |
|
"loss": 0.6971, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 6.463314097279472, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 6.682844206500445e-05, |
|
"loss": 0.7028, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 6.471558120362737, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 6.655687907043734e-05, |
|
"loss": 0.7053, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 6.479802143446002, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 6.62855934819569e-05, |
|
"loss": 0.6995, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 6.488046166529267, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.601458754983978e-05, |
|
"loss": 0.6971, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 6.496290189612531, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 6.574386352204289e-05, |
|
"loss": 0.7029, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 6.504534212695796, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 6.547342364418481e-05, |
|
"loss": 0.7011, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 6.51277823577906, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 6.520327015952713e-05, |
|
"loss": 0.699, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 6.521022258862325, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 6.493340530895583e-05, |
|
"loss": 0.6987, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 6.529266281945589, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 6.466383133096267e-05, |
|
"loss": 0.7095, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 6.537510305028854, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 6.439455046162677e-05, |
|
"loss": 0.704, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 6.545754328112118, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 6.412556493459581e-05, |
|
"loss": 0.7127, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 6.553998351195383, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 6.385687698106781e-05, |
|
"loss": 0.7019, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 6.562242374278648, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 6.358848882977233e-05, |
|
"loss": 0.702, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 6.570486397361913, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 6.332040270695219e-05, |
|
"loss": 0.7086, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 6.5787304204451775, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 6.305262083634488e-05, |
|
"loss": 0.7086, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 6.586974443528442, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 6.278514543916415e-05, |
|
"loss": 0.7087, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 6.595218466611707, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 6.251797873408161e-05, |
|
"loss": 0.6976, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 6.603462489694971, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 6.225112293720836e-05, |
|
"loss": 0.6968, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 6.611706512778236, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 6.198458026207652e-05, |
|
"loss": 0.7039, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 6.6199505358615, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.171835291962088e-05, |
|
"loss": 0.702, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 6.628194558944765, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 6.145244311816063e-05, |
|
"loss": 0.7004, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 6.636438582028029, |
|
"grad_norm": 0.5, |
|
"learning_rate": 6.1186853063381e-05, |
|
"loss": 0.6988, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 6.644682605111294, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 6.092158495831486e-05, |
|
"loss": 0.7019, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 6.6529266281945585, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 6.065664100332478e-05, |
|
"loss": 0.7082, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 6.661170651277824, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.7008, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 6.6694146743610885, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 6.012773433156017e-05, |
|
"loss": 0.7022, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 6.677658697444353, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 5.986377600199371e-05, |
|
"loss": 0.6986, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 6.685902720527618, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 5.9600150596883066e-05, |
|
"loss": 0.6989, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 6.694146743610882, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 5.933686030296459e-05, |
|
"loss": 0.6993, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 6.702390766694147, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 5.907390730419507e-05, |
|
"loss": 0.6977, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 6.710634789777411, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 5.881129378173347e-05, |
|
"loss": 0.7019, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 6.718878812860676, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 5.854902191392284e-05, |
|
"loss": 0.6936, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 6.72712283594394, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 5.828709387627218e-05, |
|
"loss": 0.7002, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 6.735366859027205, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 5.802551184143865e-05, |
|
"loss": 0.7026, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 6.74361088211047, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 5.7764277979209094e-05, |
|
"loss": 0.7151, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 6.751854905193735, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 5.750339445648252e-05, |
|
"loss": 0.7055, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 6.760098928276999, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 5.724286343725185e-05, |
|
"loss": 0.7032, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 6.768342951360264, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 5.6982687082585994e-05, |
|
"loss": 0.7008, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 6.7765869744435285, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 5.6722867550612116e-05, |
|
"loss": 0.6998, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 6.784830997526793, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 5.6463406996497456e-05, |
|
"loss": 0.6961, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 6.793075020610058, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 5.620430757243156e-05, |
|
"loss": 0.6963, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 6.801319043693322, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 5.5945571427608526e-05, |
|
"loss": 0.7083, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 6.809563066776587, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 5.5687200708209076e-05, |
|
"loss": 0.704, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 6.817807089859851, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 5.542919755738275e-05, |
|
"loss": 0.7061, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 6.826051112943116, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 5.5171564115230254e-05, |
|
"loss": 0.7037, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 6.83429513602638, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 5.491430251878551e-05, |
|
"loss": 0.715, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 6.842539159109646, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 5.4657414901998095e-05, |
|
"loss": 0.7023, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 6.85078318219291, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 5.4400903395715366e-05, |
|
"loss": 0.6967, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 6.859027205276175, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 5.4144770127665024e-05, |
|
"loss": 0.7073, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 6.8672712283594395, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 5.388901722243724e-05, |
|
"loss": 0.6954, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 6.875515251442704, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 5.363364680146725e-05, |
|
"loss": 0.7044, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 6.883759274525969, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 5.3378660983017536e-05, |
|
"loss": 0.7045, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 6.892003297609233, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 5.31240618821604e-05, |
|
"loss": 0.7029, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 6.900247320692498, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.286985161076029e-05, |
|
"loss": 0.7018, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 6.908491343775762, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 5.2616032277456463e-05, |
|
"loss": 0.7102, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 6.916735366859028, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 5.236260598764535e-05, |
|
"loss": 0.7078, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 6.924979389942292, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 5.210957484346314e-05, |
|
"loss": 0.7055, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.933223413025557, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 5.185694094376843e-05, |
|
"loss": 0.7068, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 6.941467436108821, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 5.160470638412461e-05, |
|
"loss": 0.6911, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 6.949711459192086, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 5.135287325678271e-05, |
|
"loss": 0.7047, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 6.95795548227535, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 5.1101443650663764e-05, |
|
"loss": 0.6989, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 6.966199505358615, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 5.085041965134183e-05, |
|
"loss": 0.6975, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 6.9744435284418795, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 5.059980334102637e-05, |
|
"loss": 0.7055, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 6.982687551525144, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 5.034959679854532e-05, |
|
"loss": 0.6983, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 6.990931574608409, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 5.009980209932743e-05, |
|
"loss": 0.7046, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 6.999175597691673, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 4.985042131538545e-05, |
|
"loss": 0.7042, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 6.999175597691673, |
|
"eval_loss": 2.4711008071899414, |
|
"eval_runtime": 0.2631, |
|
"eval_samples_per_second": 38.011, |
|
"eval_steps_per_second": 3.801, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 7.007419620774938, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 4.960145651529856e-05, |
|
"loss": 0.6792, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 7.015663643858203, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 4.9352909764195576e-05, |
|
"loss": 0.6999, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 7.023907666941468, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 4.9104783123737566e-05, |
|
"loss": 0.6999, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 7.032151690024732, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.885707865210093e-05, |
|
"loss": 0.7018, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 7.040395713107997, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 4.860979840396016e-05, |
|
"loss": 0.6912, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 7.048639736191261, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 4.836294443047088e-05, |
|
"loss": 0.6945, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 7.056883759274526, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 4.8116518779252885e-05, |
|
"loss": 0.6905, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 7.0651277823577905, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.787052349437295e-05, |
|
"loss": 0.691, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 7.073371805441055, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.762496061632814e-05, |
|
"loss": 0.6843, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 7.08161582852432, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 4.7379832182028814e-05, |
|
"loss": 0.6951, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 7.089859851607584, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.713514022478155e-05, |
|
"loss": 0.6893, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 7.09810387469085, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 4.689088677427249e-05, |
|
"loss": 0.6952, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 7.106347897774114, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 4.6647073856550415e-05, |
|
"loss": 0.6958, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 7.114591920857379, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 4.6403703494009875e-05, |
|
"loss": 0.6946, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 7.122835943940643, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 4.6160777705374524e-05, |
|
"loss": 0.6996, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 7.131079967023908, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.591829850568046e-05, |
|
"loss": 0.6969, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 7.139323990107172, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 4.567626790625921e-05, |
|
"loss": 0.6868, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 7.147568013190437, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 4.543468791472131e-05, |
|
"loss": 0.69, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 7.155812036273701, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 4.519356053493958e-05, |
|
"loss": 0.6979, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 7.164056059356966, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.495288776703241e-05, |
|
"loss": 0.7022, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 7.1723000824402305, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 4.471267160734731e-05, |
|
"loss": 0.6874, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 7.180544105523495, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 4.447291404844424e-05, |
|
"loss": 0.6982, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 7.18878812860676, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 4.4233617079079236e-05, |
|
"loss": 0.7015, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 7.197032151690025, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.399478268418771e-05, |
|
"loss": 0.6919, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 7.20527617477329, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 4.375641284486808e-05, |
|
"loss": 0.6867, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 7.213520197856554, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.3518509538365425e-05, |
|
"loss": 0.6929, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 7.221764220939819, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 4.328107473805487e-05, |
|
"loss": 0.7013, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 7.230008244023083, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 4.3044110413425395e-05, |
|
"loss": 0.6879, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 7.238252267106348, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 4.2807618530063565e-05, |
|
"loss": 0.6918, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 7.246496290189612, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.257160104963696e-05, |
|
"loss": 0.6965, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 7.254740313272877, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 4.23360599298781e-05, |
|
"loss": 0.6963, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 7.2629843363561415, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.210099712456822e-05, |
|
"loss": 0.69, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 7.271228359439406, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 4.1866414583520877e-05, |
|
"loss": 0.6955, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 7.2794723825226715, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.163231425256595e-05, |
|
"loss": 0.6888, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 7.287716405605936, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.139869807353357e-05, |
|
"loss": 0.6998, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 7.295960428689201, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 4.1165567984237764e-05, |
|
"loss": 0.6963, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 7.304204451772465, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 4.0932925918460516e-05, |
|
"loss": 0.6922, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 7.31244847485573, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 4.070077380593579e-05, |
|
"loss": 0.6969, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 7.320692497938994, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.046911357233343e-05, |
|
"loss": 0.6893, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 7.328936521022259, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 4.02379471392431e-05, |
|
"loss": 0.6902, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 7.337180544105523, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 4.000727642415867e-05, |
|
"loss": 0.7053, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 7.345424567188788, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.977710334046193e-05, |
|
"loss": 0.6942, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 7.353668590272052, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.954742979740695e-05, |
|
"loss": 0.7078, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 7.361912613355317, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 3.9318257700104174e-05, |
|
"loss": 0.6932, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 7.370156636438582, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 3.9089588949504655e-05, |
|
"loss": 0.6955, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 7.378400659521847, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 3.8861425442384135e-05, |
|
"loss": 0.6969, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 7.3866446826051115, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.863376907132752e-05, |
|
"loss": 0.6949, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 7.394888705688376, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 3.840662172471315e-05, |
|
"loss": 0.7005, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 7.403132728771641, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 3.8179985286696986e-05, |
|
"loss": 0.6935, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 7.411376751854905, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 3.7953861637197085e-05, |
|
"loss": 0.6923, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 7.41962077493817, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 3.772825265187802e-05, |
|
"loss": 0.6923, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 7.427864798021434, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 3.75031602021353e-05, |
|
"loss": 0.6979, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 7.436108821104699, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 3.727858615507974e-05, |
|
"loss": 0.6977, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 7.444352844187963, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 3.705453237352227e-05, |
|
"loss": 0.7043, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 7.452596867271229, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.683100071595813e-05, |
|
"loss": 0.6956, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 7.460840890354493, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 3.660799303655166e-05, |
|
"loss": 0.6974, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 7.469084913437758, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 3.638551118512089e-05, |
|
"loss": 0.7013, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 7.4773289365210225, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 3.616355700712221e-05, |
|
"loss": 0.6966, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 7.485572959604287, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.594213234363486e-05, |
|
"loss": 0.6964, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 7.493816982687552, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.6922, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 7.502061005770816, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 3.550087890253544e-05, |
|
"loss": 0.6948, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 7.510305028854081, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 3.5281053785059925e-05, |
|
"loss": 0.695, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 7.518549051937345, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.506176550233863e-05, |
|
"loss": 0.6949, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 7.52679307502061, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 3.484301587333772e-05, |
|
"loss": 0.6903, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 7.535037098103874, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.462480671255515e-05, |
|
"loss": 0.6983, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 7.543281121187139, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 3.440713983000601e-05, |
|
"loss": 0.6964, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 7.551525144270404, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 3.419001703120709e-05, |
|
"loss": 0.6934, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 7.559769167353669, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 3.397344011716216e-05, |
|
"loss": 0.7035, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 7.568013190436933, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 3.3757410884346894e-05, |
|
"loss": 0.6827, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 7.576257213520198, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.354193112469407e-05, |
|
"loss": 0.6979, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 7.5845012366034625, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 3.332700262557864e-05, |
|
"loss": 0.7002, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 7.592745259686727, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 3.3112627169802946e-05, |
|
"loss": 0.6996, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 7.600989282769992, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.289880653558188e-05, |
|
"loss": 0.6942, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 7.609233305853256, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 3.2685542496528185e-05, |
|
"loss": 0.7002, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 7.617477328936521, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 3.2472836821637744e-05, |
|
"loss": 0.6953, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 7.625721352019785, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 3.2260691275274835e-05, |
|
"loss": 0.7001, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 7.633965375103051, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 3.204910761715763e-05, |
|
"loss": 0.6935, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 7.642209398186315, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 3.1838087602343344e-05, |
|
"loss": 0.6973, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 7.65045342126958, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 3.162763298121408e-05, |
|
"loss": 0.6962, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 7.658697444352844, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 3.1417745499461934e-05, |
|
"loss": 0.6986, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 7.666941467436109, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 3.120842689807468e-05, |
|
"loss": 0.7008, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 7.6751854905193735, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 3.099967891332132e-05, |
|
"loss": 0.698, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 7.683429513602638, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.079150327673766e-05, |
|
"loss": 0.6996, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 7.691673536685903, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 3.058390171511196e-05, |
|
"loss": 0.6973, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 7.699917559769167, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 3.0376875950470617e-05, |
|
"loss": 0.6972, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 7.708161582852432, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 3.0170427700063873e-05, |
|
"loss": 0.6962, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 7.716405605935696, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 2.996455867635155e-05, |
|
"loss": 0.7006, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 7.724649629018961, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 2.9759270586988865e-05, |
|
"loss": 0.7017, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 7.732893652102226, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 2.9554565134812294e-05, |
|
"loss": 0.7051, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 7.741137675185491, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 2.9350444017825385e-05, |
|
"loss": 0.6909, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 7.749381698268755, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 2.9146908929184713e-05, |
|
"loss": 0.6939, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 7.75762572135202, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.894396155718585e-05, |
|
"loss": 0.6956, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 7.765869744435284, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 2.874160358524931e-05, |
|
"loss": 0.6962, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 7.774113767518549, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 2.853983669190664e-05, |
|
"loss": 0.6911, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 7.7823577906018135, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 2.8338662550786443e-05, |
|
"loss": 0.6954, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 7.790601813685078, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 2.8138082830600554e-05, |
|
"loss": 0.694, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 7.798845836768343, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.7938099195130153e-05, |
|
"loss": 0.6935, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 7.807089859851608, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 2.7738713303211982e-05, |
|
"loss": 0.6885, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 7.815333882934873, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 2.753992680872457e-05, |
|
"loss": 0.7002, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 7.823577906018137, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.7341741360574548e-05, |
|
"loss": 0.6928, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 7.831821929101402, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 2.7144158602682924e-05, |
|
"loss": 0.6959, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 7.840065952184666, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.6947180173971508e-05, |
|
"loss": 0.6907, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 7.848309975267931, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.6750807708349267e-05, |
|
"loss": 0.6982, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 7.856553998351195, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.6555042834698773e-05, |
|
"loss": 0.6945, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 7.86479802143446, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 2.6359887176862718e-05, |
|
"loss": 0.695, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 7.8730420445177245, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.6165342353630428e-05, |
|
"loss": 0.694, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 7.881286067600989, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 2.5971409978724458e-05, |
|
"loss": 0.6986, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 7.889530090684254, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 2.577809166078716e-05, |
|
"loss": 0.6935, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 7.897774113767518, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 2.558538900336741e-05, |
|
"loss": 0.6991, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 7.906018136850783, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 2.5393303604907205e-05, |
|
"loss": 0.6974, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 7.914262159934048, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.5201837058728505e-05, |
|
"loss": 0.6956, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 7.922506183017313, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.5010990953019975e-05, |
|
"loss": 0.6927, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 7.930750206100577, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.4820766870823807e-05, |
|
"loss": 0.688, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 7.938994229183842, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.4631166390022574e-05, |
|
"loss": 0.695, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 7.947238252267106, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 2.4442191083326195e-05, |
|
"loss": 0.7014, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 7.955482275350371, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.425384251825882e-05, |
|
"loss": 0.6955, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 7.963726298433635, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 2.4066122257145894e-05, |
|
"loss": 0.6934, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 7.9719703215169, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 2.387903185710115e-05, |
|
"loss": 0.6909, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 7.9802143446001645, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 2.3692572870013718e-05, |
|
"loss": 0.691, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 7.98845836768343, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 2.3506746842535242e-05, |
|
"loss": 0.6929, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 7.9967023907666945, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.3321555316067045e-05, |
|
"loss": 0.6928, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.471337080001831, |
|
"eval_runtime": 0.2361, |
|
"eval_samples_per_second": 42.357, |
|
"eval_steps_per_second": 4.236, |
|
"step": 4852 |
|
}, |
|
{ |
|
"epoch": 8.004946413849959, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 2.313699982674736e-05, |
|
"loss": 0.6913, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 8.013190436933224, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 2.295308190543859e-05, |
|
"loss": 0.6943, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 8.021434460016488, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.276980307771458e-05, |
|
"loss": 0.6958, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 8.029678483099753, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.2587164863847975e-05, |
|
"loss": 0.6957, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 8.037922506183017, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 2.2405168778797646e-05, |
|
"loss": 0.6914, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 8.046166529266282, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 2.222381633219608e-05, |
|
"loss": 0.6904, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 8.054410552349546, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.204310902833685e-05, |
|
"loss": 0.6921, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 8.062654575432811, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 2.1863048366162208e-05, |
|
"loss": 0.6926, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 8.070898598516075, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 2.1683635839250537e-05, |
|
"loss": 0.6938, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 8.07914262159934, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 2.15048729358041e-05, |
|
"loss": 0.6936, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 8.087386644682605, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 2.1326761138636553e-05, |
|
"loss": 0.6959, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 8.09563066776587, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 2.114930192516076e-05, |
|
"loss": 0.6883, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 8.103874690849134, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 2.097249676737648e-05, |
|
"loss": 0.6989, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 8.112118713932398, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 2.0796347131858186e-05, |
|
"loss": 0.6915, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 8.120362737015663, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 2.0620854479742834e-05, |
|
"loss": 0.6893, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 8.12860676009893, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.044602026671786e-05, |
|
"loss": 0.699, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 8.136850783182194, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.027184594300898e-05, |
|
"loss": 0.6962, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 8.145094806265458, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 2.0098332953368272e-05, |
|
"loss": 0.6869, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 8.153338829348723, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 1.9925482737062085e-05, |
|
"loss": 0.6957, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 8.161582852431987, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.9753296727859195e-05, |
|
"loss": 0.692, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 8.169826875515252, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.9581776354018854e-05, |
|
"loss": 0.6985, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 8.178070898598516, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 1.941092303827896e-05, |
|
"loss": 0.6876, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 8.186314921681781, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.9240738197844278e-05, |
|
"loss": 0.6863, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 8.194558944765046, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.9071223244374614e-05, |
|
"loss": 0.694, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 8.20280296784831, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.8902379583973208e-05, |
|
"loss": 0.6936, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 8.211046990931575, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.8734208617174988e-05, |
|
"loss": 0.6926, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 8.21929101401484, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.856671173893497e-05, |
|
"loss": 0.6921, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 8.227535037098104, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.839989033861673e-05, |
|
"loss": 0.6893, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 8.235779060181368, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.8233745799980817e-05, |
|
"loss": 0.6931, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 8.244023083264633, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.8068279501173335e-05, |
|
"loss": 0.6842, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 8.252267106347897, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.790349281471445e-05, |
|
"loss": 0.6998, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 8.260511129431162, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.773938710748706e-05, |
|
"loss": 0.6946, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 8.268755152514426, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.757596374072543e-05, |
|
"loss": 0.6901, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 8.276999175597691, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 1.741322407000391e-05, |
|
"loss": 0.6938, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 8.285243198680956, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 1.7251169445225657e-05, |
|
"loss": 0.6922, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 8.29348722176422, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.70898012106115e-05, |
|
"loss": 0.6844, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 8.301731244847485, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.692912070468874e-05, |
|
"loss": 0.6968, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 8.309975267930751, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.676912926028007e-05, |
|
"loss": 0.6977, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 8.318219291014016, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 1.660982820449247e-05, |
|
"loss": 0.6995, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 8.32646331409728, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.6934, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 8.334707337180545, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.6293302538564382e-05, |
|
"loss": 0.6954, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 8.34295136026381, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 1.6136080553960687e-05, |
|
"loss": 0.6942, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 8.351195383347074, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.5979554209030024e-05, |
|
"loss": 0.6887, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 8.359439406430338, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 1.5823724802136865e-05, |
|
"loss": 0.6948, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 8.367683429513603, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.5668593625864715e-05, |
|
"loss": 0.695, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 8.375927452596867, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.5514161967005337e-05, |
|
"loss": 0.7057, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 8.384171475680132, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.536043110654809e-05, |
|
"loss": 0.6906, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 8.392415498763397, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.5207402319669306e-05, |
|
"loss": 0.6909, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 8.400659521846661, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 1.505507687572173e-05, |
|
"loss": 0.6841, |
|
"step": 5095 |
|
}, |
|
{ |
|
"epoch": 8.408903544929926, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.4903456038223939e-05, |
|
"loss": 0.6889, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 8.41714756801319, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 1.4752541064849946e-05, |
|
"loss": 0.6908, |
|
"step": 5105 |
|
}, |
|
{ |
|
"epoch": 8.425391591096455, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.4602333207418651e-05, |
|
"loss": 0.6949, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 8.43363561417972, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.4452833711883628e-05, |
|
"loss": 0.691, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 8.441879637262984, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 1.4304043818322565e-05, |
|
"loss": 0.6855, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 8.450123660346248, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.4155964760927176e-05, |
|
"loss": 0.6937, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 8.458367683429513, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.4008597767992871e-05, |
|
"loss": 0.6922, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 8.466611706512778, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 1.3861944061908583e-05, |
|
"loss": 0.6929, |
|
"step": 5135 |
|
}, |
|
{ |
|
"epoch": 8.474855729596042, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.3716004859146592e-05, |
|
"loss": 0.6898, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 8.483099752679308, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.3570781370252582e-05, |
|
"loss": 0.6851, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 8.491343775762573, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 1.3426274799835337e-05, |
|
"loss": 0.6846, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 8.499587798845837, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 1.328248634655701e-05, |
|
"loss": 0.7024, |
|
"step": 5155 |
|
}, |
|
{ |
|
"epoch": 8.507831821929102, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.3139417203123027e-05, |
|
"loss": 0.6881, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 8.516075845012367, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.2997068556272263e-05, |
|
"loss": 0.7002, |
|
"step": 5165 |
|
}, |
|
{ |
|
"epoch": 8.524319868095631, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.2855441586767113e-05, |
|
"loss": 0.6909, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 8.532563891178896, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.2714537469383858e-05, |
|
"loss": 0.6878, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 8.54080791426216, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.2574357372902767e-05, |
|
"loss": 0.6917, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 8.549051937345425, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.243490246009842e-05, |
|
"loss": 0.689, |
|
"step": 5185 |
|
}, |
|
{ |
|
"epoch": 8.55729596042869, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 1.2296173887730123e-05, |
|
"loss": 0.6859, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 8.565539983511954, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.215817280653232e-05, |
|
"loss": 0.6858, |
|
"step": 5195 |
|
}, |
|
{ |
|
"epoch": 8.573784006595218, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 1.2020900361204968e-05, |
|
"loss": 0.6894, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 8.582028029678483, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.1884357690404158e-05, |
|
"loss": 0.6886, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 8.590272052761748, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.1748545926732535e-05, |
|
"loss": 0.6903, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 8.598516075845012, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.1613466196729984e-05, |
|
"loss": 0.7021, |
|
"step": 5215 |
|
}, |
|
{ |
|
"epoch": 8.606760098928277, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.1479119620864276e-05, |
|
"loss": 0.6826, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 8.615004122011541, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.1345507313521786e-05, |
|
"loss": 0.6954, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 8.623248145094806, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 1.1212630382998213e-05, |
|
"loss": 0.6877, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 8.63149216817807, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 1.1080489931489391e-05, |
|
"loss": 0.696, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 8.639736191261335, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.0949087055082252e-05, |
|
"loss": 0.6977, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 8.6479802143446, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 1.0818422843745512e-05, |
|
"loss": 0.6924, |
|
"step": 5245 |
|
}, |
|
{ |
|
"epoch": 8.656224237427864, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.0688498381320855e-05, |
|
"loss": 0.6941, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 8.664468260511129, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.0559314745513805e-05, |
|
"loss": 0.6878, |
|
"step": 5255 |
|
}, |
|
{ |
|
"epoch": 8.672712283594395, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 1.0430873007884857e-05, |
|
"loss": 0.6975, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 8.68095630667766, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 1.0303174233840528e-05, |
|
"loss": 0.6863, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 8.689200329760924, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 1.0176219482624616e-05, |
|
"loss": 0.7022, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 8.697444352844188, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.0050009807309325e-05, |
|
"loss": 0.6892, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 8.705688375927453, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 9.924546254786493e-06, |
|
"loss": 0.6839, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 8.713932399010718, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 9.799829865759069e-06, |
|
"loss": 0.6821, |
|
"step": 5285 |
|
}, |
|
{ |
|
"epoch": 8.722176422093982, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 9.675861674732312e-06, |
|
"loss": 0.6885, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 8.730420445177247, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 9.552642710005299e-06, |
|
"loss": 0.6965, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 8.738664468260511, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 9.430173993662451e-06, |
|
"loss": 0.6971, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 8.746908491343776, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 9.308456541564881e-06, |
|
"loss": 0.6847, |
|
"step": 5305 |
|
}, |
|
{ |
|
"epoch": 8.75515251442704, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 9.187491363342093e-06, |
|
"loss": 0.6982, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 8.763396537510305, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 9.067279462383615e-06, |
|
"loss": 0.6906, |
|
"step": 5315 |
|
}, |
|
{ |
|
"epoch": 8.77164056059357, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 8.947821835830616e-06, |
|
"loss": 0.6981, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 8.779884583676834, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 8.829119474567671e-06, |
|
"loss": 0.6972, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 8.788128606760099, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 8.711173363214553e-06, |
|
"loss": 0.6875, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 8.796372629843363, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 8.593984480118011e-06, |
|
"loss": 0.6904, |
|
"step": 5335 |
|
}, |
|
{ |
|
"epoch": 8.804616652926628, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 8.47755379734373e-06, |
|
"loss": 0.6886, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 8.812860676009892, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 8.361882280668165e-06, |
|
"loss": 0.6919, |
|
"step": 5345 |
|
}, |
|
{ |
|
"epoch": 8.821104699093157, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.24697088957066e-06, |
|
"loss": 0.6934, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 8.829348722176421, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 8.132820577225387e-06, |
|
"loss": 0.6882, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 8.837592745259688, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 8.019432290493457e-06, |
|
"loss": 0.7015, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 8.845836768342952, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.906806969915148e-06, |
|
"loss": 0.689, |
|
"step": 5365 |
|
}, |
|
{ |
|
"epoch": 8.854080791426217, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 7.794945549701993e-06, |
|
"loss": 0.6866, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 8.862324814509481, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 7.683848957729056e-06, |
|
"loss": 0.696, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 8.870568837592746, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 7.573518115527289e-06, |
|
"loss": 0.6824, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 8.87881286067601, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.463953938275858e-06, |
|
"loss": 0.6941, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 8.887056883759275, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 7.355157334794516e-06, |
|
"loss": 0.6901, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 8.89530090684254, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 7.247129207536152e-06, |
|
"loss": 0.688, |
|
"step": 5395 |
|
}, |
|
{ |
|
"epoch": 8.903544929925804, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.1398704525792e-06, |
|
"loss": 0.6906, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 8.911788953009069, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 7.0333819596203e-06, |
|
"loss": 0.6878, |
|
"step": 5405 |
|
}, |
|
{ |
|
"epoch": 8.920032976092333, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 6.927664611966811e-06, |
|
"loss": 0.6965, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 8.928276999175598, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.8227192865295995e-06, |
|
"loss": 0.69, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 8.936521022258862, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 6.718546853815688e-06, |
|
"loss": 0.6857, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 8.944765045342127, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 6.6151481779211155e-06, |
|
"loss": 0.6922, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 8.953009068425391, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 6.512524116523633e-06, |
|
"loss": 0.6885, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 8.961253091508656, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 6.410675520875742e-06, |
|
"loss": 0.6854, |
|
"step": 5435 |
|
}, |
|
{ |
|
"epoch": 8.96949711459192, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 6.30960323579749e-06, |
|
"loss": 0.6966, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 8.977741137675185, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 6.209308099669597e-06, |
|
"loss": 0.6962, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 8.98598516075845, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 6.109790944426397e-06, |
|
"loss": 0.707, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 8.994229183841714, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 6.011052595549038e-06, |
|
"loss": 0.6924, |
|
"step": 5455 |
|
}, |
|
{ |
|
"epoch": 8.999175597691673, |
|
"eval_loss": 2.4814510345458984, |
|
"eval_runtime": 0.2587, |
|
"eval_samples_per_second": 38.654, |
|
"eval_steps_per_second": 3.865, |
|
"step": 5458 |
|
}, |
|
{ |
|
"epoch": 9.002473206924979, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 5.913093872058528e-06, |
|
"loss": 0.6875, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 9.010717230008243, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 5.81591558650898e-06, |
|
"loss": 0.6871, |
|
"step": 5465 |
|
}, |
|
{ |
|
"epoch": 9.01896125309151, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.719518544980929e-06, |
|
"loss": 0.6887, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 9.027205276174774, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 5.623903547074549e-06, |
|
"loss": 0.7051, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 9.035449299258039, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 5.529071385903084e-06, |
|
"loss": 0.694, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 9.043693322341303, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 5.43502284808628e-06, |
|
"loss": 0.6839, |
|
"step": 5485 |
|
}, |
|
{ |
|
"epoch": 9.051937345424568, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.341758713743828e-06, |
|
"loss": 0.6906, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 9.060181368507832, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 5.249279756488878e-06, |
|
"loss": 0.6895, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 9.068425391591097, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.157586743421672e-06, |
|
"loss": 0.6937, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 9.076669414674361, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 5.066680435123106e-06, |
|
"loss": 0.7007, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 9.084913437757626, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 4.976561585648509e-06, |
|
"loss": 0.6864, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 9.09315746084089, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.887230942521337e-06, |
|
"loss": 0.6886, |
|
"step": 5515 |
|
}, |
|
{ |
|
"epoch": 9.101401483924155, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 4.798689246727006e-06, |
|
"loss": 0.6965, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 9.10964550700742, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.710937232706691e-06, |
|
"loss": 0.6888, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 9.117889530090684, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 4.623975628351273e-06, |
|
"loss": 0.6937, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 9.126133553173949, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 4.537805154995278e-06, |
|
"loss": 0.6989, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 9.134377576257213, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 4.452426527410947e-06, |
|
"loss": 0.69, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 9.142621599340478, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 4.36784045380223e-06, |
|
"loss": 0.6952, |
|
"step": 5545 |
|
}, |
|
{ |
|
"epoch": 9.150865622423742, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.2840476357989825e-06, |
|
"loss": 0.6909, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 9.159109645507007, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.6835, |
|
"step": 5555 |
|
}, |
|
{ |
|
"epoch": 9.167353668590271, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 4.118844540222788e-06, |
|
"loss": 0.7042, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 9.175597691673536, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 4.037435632986786e-06, |
|
"loss": 0.693, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 9.1838417147568, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.95682272201876e-06, |
|
"loss": 0.6854, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 9.192085737840065, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 3.877006475991729e-06, |
|
"loss": 0.6937, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 9.200329760923331, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 3.797987556970495e-06, |
|
"loss": 0.6968, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 9.208573784006596, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.7197666204060955e-06, |
|
"loss": 0.6902, |
|
"step": 5585 |
|
}, |
|
{ |
|
"epoch": 9.21681780708986, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.6423443151304526e-06, |
|
"loss": 0.6896, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 9.225061830173125, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 3.565721283350931e-06, |
|
"loss": 0.696, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 9.23330585325639, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 3.4898981606450333e-06, |
|
"loss": 0.6895, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 9.241549876339654, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.414875575955101e-06, |
|
"loss": 0.6845, |
|
"step": 5605 |
|
}, |
|
{ |
|
"epoch": 9.249793899422919, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 3.3406541515832003e-06, |
|
"loss": 0.6908, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 9.258037922506183, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 3.267234503185823e-06, |
|
"loss": 0.6885, |
|
"step": 5615 |
|
}, |
|
{ |
|
"epoch": 9.266281945589448, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 3.1946172397688267e-06, |
|
"loss": 0.6921, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 9.274525968672712, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.1228029636824475e-06, |
|
"loss": 0.6927, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 9.282769991755977, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 3.051792270616216e-06, |
|
"loss": 0.689, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 9.291014014839241, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 2.981585749594051e-06, |
|
"loss": 0.6962, |
|
"step": 5635 |
|
}, |
|
{ |
|
"epoch": 9.299258037922506, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.912183982969385e-06, |
|
"loss": 0.6873, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 9.30750206100577, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.8435875464203343e-06, |
|
"loss": 0.6839, |
|
"step": 5645 |
|
}, |
|
{ |
|
"epoch": 9.315746084089035, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 2.7757970089449024e-06, |
|
"loss": 0.6884, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 9.3239901071723, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.708812932856253e-06, |
|
"loss": 0.6865, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 9.332234130255564, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.6426358737781098e-06, |
|
"loss": 0.6944, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 9.340478153338829, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.577266380640053e-06, |
|
"loss": 0.6866, |
|
"step": 5665 |
|
}, |
|
{ |
|
"epoch": 9.348722176422093, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.5127049956730207e-06, |
|
"loss": 0.6917, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 9.356966199505358, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 2.448952254404846e-06, |
|
"loss": 0.6984, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 9.365210222588622, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.3860086856557383e-06, |
|
"loss": 0.6881, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 9.373454245671887, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 2.3238748115339324e-06, |
|
"loss": 0.689, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 9.381698268755153, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 2.2625511474313685e-06, |
|
"loss": 0.6968, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 9.389942291838418, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 2.2020382020194074e-06, |
|
"loss": 0.6923, |
|
"step": 5695 |
|
}, |
|
{ |
|
"epoch": 9.398186314921682, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 2.1423364772445887e-06, |
|
"loss": 0.6929, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 9.406430338004947, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 2.0834464683245346e-06, |
|
"loss": 0.6948, |
|
"step": 5705 |
|
}, |
|
{ |
|
"epoch": 9.414674361088212, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.025368663743743e-06, |
|
"loss": 0.6956, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 9.422918384171476, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 1.968103545249611e-06, |
|
"loss": 0.6857, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 9.43116240725474, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 1.91165158784844e-06, |
|
"loss": 0.6871, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 9.439406430338005, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.8560132598014368e-06, |
|
"loss": 0.6864, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 9.44765045342127, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.8011890226208527e-06, |
|
"loss": 0.6922, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 9.455894476504534, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.7471793310662287e-06, |
|
"loss": 0.6973, |
|
"step": 5735 |
|
}, |
|
{ |
|
"epoch": 9.464138499587799, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.6939846331405108e-06, |
|
"loss": 0.6954, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 9.472382522671063, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.6416053700863964e-06, |
|
"loss": 0.6983, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 9.480626545754328, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 1.5900419763826614e-06, |
|
"loss": 0.6904, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 9.488870568837593, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 1.5392948797405827e-06, |
|
"loss": 0.7001, |
|
"step": 5755 |
|
}, |
|
{ |
|
"epoch": 9.497114591920857, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 1.489364501100332e-06, |
|
"loss": 0.6978, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 9.505358615004122, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.4402512546275114e-06, |
|
"loss": 0.6974, |
|
"step": 5765 |
|
}, |
|
{ |
|
"epoch": 9.513602638087386, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 1.3919555477097668e-06, |
|
"loss": 0.6885, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 9.52184666117065, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 1.344477780953346e-06, |
|
"loss": 0.6884, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 9.530090684253915, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.2978183481797801e-06, |
|
"loss": 0.6899, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 9.53833470733718, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.251977636422641e-06, |
|
"loss": 0.6897, |
|
"step": 5785 |
|
}, |
|
{ |
|
"epoch": 9.546578730420444, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 1.2069560259243328e-06, |
|
"loss": 0.6933, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 9.55482275350371, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.1627538901329172e-06, |
|
"loss": 0.6868, |
|
"step": 5795 |
|
}, |
|
{ |
|
"epoch": 9.563066776586975, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.1193715956990258e-06, |
|
"loss": 0.6855, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 9.57131079967024, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.076809502472831e-06, |
|
"loss": 0.6977, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 9.579554822753504, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 1.035067963501024e-06, |
|
"loss": 0.6969, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 9.587798845836769, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 9.94147325023953e-07, |
|
"loss": 0.6982, |
|
"step": 5815 |
|
}, |
|
{ |
|
"epoch": 9.596042868920033, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 9.540479264726676e-07, |
|
"loss": 0.6865, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 9.604286892003298, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 9.147701004661446e-07, |
|
"loss": 0.6897, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 9.612530915086563, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 8.763141728085789e-07, |
|
"loss": 0.6837, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 9.620774938169827, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 8.386804624865851e-07, |
|
"loss": 0.6865, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 9.629018961253092, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 8.018692816666118e-07, |
|
"loss": 0.6907, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 9.637262984336356, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.658809356923424e-07, |
|
"loss": 0.6902, |
|
"step": 5845 |
|
}, |
|
{ |
|
"epoch": 9.64550700741962, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 7.307157230821426e-07, |
|
"loss": 0.6925, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 9.653751030502885, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 6.963739355266286e-07, |
|
"loss": 0.6911, |
|
"step": 5855 |
|
}, |
|
{ |
|
"epoch": 9.66199505358615, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 6.628558578862021e-07, |
|
"loss": 0.6838, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 9.670239076669414, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 6.301617681886863e-07, |
|
"loss": 0.6883, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 9.678483099752679, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 5.982919376270823e-07, |
|
"loss": 0.6908, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 9.686727122835944, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 5.672466305572388e-07, |
|
"loss": 0.6908, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 9.694971145919208, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 5.370261044956971e-07, |
|
"loss": 0.6962, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 9.703215169002473, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.07630610117582e-07, |
|
"loss": 0.6932, |
|
"step": 5885 |
|
}, |
|
{ |
|
"epoch": 9.711459192085737, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 4.790603912544489e-07, |
|
"loss": 0.6878, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 9.719703215169002, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 4.5131568489236166e-07, |
|
"loss": 0.6946, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 9.727947238252266, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 4.2439672116982855e-07, |
|
"loss": 0.6853, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 9.73619126133553, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 3.983037233759368e-07, |
|
"loss": 0.6914, |
|
"step": 5905 |
|
}, |
|
{ |
|
"epoch": 9.744435284418797, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.73036907948543e-07, |
|
"loss": 0.6898, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 9.752679307502062, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 3.485964844723744e-07, |
|
"loss": 0.6888, |
|
"step": 5915 |
|
}, |
|
{ |
|
"epoch": 9.760923330585326, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 3.2498265567739717e-07, |
|
"loss": 0.6824, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 9.76916735366859, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 3.0219561743707326e-07, |
|
"loss": 0.691, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 9.777411376751855, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 2.8023555876673937e-07, |
|
"loss": 0.6862, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 9.78565539983512, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 2.5910266182207486e-07, |
|
"loss": 0.6933, |
|
"step": 5935 |
|
}, |
|
{ |
|
"epoch": 9.793899422918384, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 2.3879710189753656e-07, |
|
"loss": 0.6926, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 9.802143446001649, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 2.1931904742495957e-07, |
|
"loss": 0.6807, |
|
"step": 5945 |
|
}, |
|
{ |
|
"epoch": 9.810387469084914, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 2.0066865997212525e-07, |
|
"loss": 0.6923, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 9.818631492168178, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.8284609424142895e-07, |
|
"loss": 0.6885, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 9.826875515251443, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.6585149806860324e-07, |
|
"loss": 0.6862, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 9.835119538334707, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 1.4968501242148547e-07, |
|
"loss": 0.6955, |
|
"step": 5965 |
|
}, |
|
{ |
|
"epoch": 9.843363561417972, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.3434677139885222e-07, |
|
"loss": 0.6957, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 9.851607584501236, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.1983690222929778e-07, |
|
"loss": 0.6915, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 9.8598516075845, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 1.0615552527017958e-07, |
|
"loss": 0.701, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 9.868095630667765, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 9.330275400666332e-08, |
|
"loss": 0.6959, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 9.87633965375103, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 8.127869505069053e-08, |
|
"loss": 0.6885, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 9.884583676834295, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 7.00834481402013e-08, |
|
"loss": 0.6842, |
|
"step": 5995 |
|
}, |
|
{ |
|
"epoch": 9.892827699917559, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 5.971710613821291e-08, |
|
"loss": 0.6956, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 9.901071723000824, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 5.0179755032109253e-08, |
|
"loss": 0.6898, |
|
"step": 6005 |
|
}, |
|
{ |
|
"epoch": 9.90931574608409, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 4.147147393290807e-08, |
|
"loss": 0.6899, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 9.917559769167354, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 3.359233507459481e-08, |
|
"loss": 0.697, |
|
"step": 6015 |
|
}, |
|
{ |
|
"epoch": 9.925803792250619, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 2.6542403813545334e-08, |
|
"loss": 0.6938, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 9.934047815333884, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 2.0321738627981923e-08, |
|
"loss": 0.686, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 9.942291838417148, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.4930391117451426e-08, |
|
"loss": 0.6874, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 9.950535861500413, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 1.0368406002436715e-08, |
|
"loss": 0.6934, |
|
"step": 6035 |
|
}, |
|
{ |
|
"epoch": 9.958779884583677, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 6.635821124001406e-09, |
|
"loss": 0.6913, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 9.967023907666942, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 3.732667443390181e-09, |
|
"loss": 0.6895, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 9.975267930750206, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.6589690418955528e-09, |
|
"loss": 0.6968, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 9.983511953833471, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 4.147431205359098e-10, |
|
"loss": 0.6946, |
|
"step": 6055 |
|
}, |
|
{ |
|
"epoch": 9.991755976916735, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0, |
|
"loss": 0.6936, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 9.991755976916735, |
|
"eval_loss": 2.4860482215881348, |
|
"eval_runtime": 0.2343, |
|
"eval_samples_per_second": 42.675, |
|
"eval_steps_per_second": 4.267, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 9.991755976916735, |
|
"step": 6060, |
|
"total_flos": 1.8500974249565487e+19, |
|
"train_loss": 1.1020318522705104, |
|
"train_runtime": 14653.0399, |
|
"train_samples_per_second": 26.478, |
|
"train_steps_per_second": 0.414 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 6060, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 1.8500974249565487e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|