diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,7120 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.14742739200943536, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.948547840188707e-05, + "grad_norm": 4288.0, + "learning_rate": 5.89622641509434e-10, + "loss": 5.8964, + "step": 1 + }, + { + "epoch": 0.00014742739200943535, + "grad_norm": 5152.0, + "learning_rate": 2.9481132075471697e-09, + "loss": 7.2418, + "step": 5 + }, + { + "epoch": 0.0002948547840188707, + "grad_norm": 3872.0, + "learning_rate": 5.896226415094339e-09, + "loss": 6.7926, + "step": 10 + }, + { + "epoch": 0.0004422821760283061, + "grad_norm": 2704.0, + "learning_rate": 8.844339622641509e-09, + "loss": 7.0144, + "step": 15 + }, + { + "epoch": 0.0005897095680377414, + "grad_norm": 5824.0, + "learning_rate": 1.1792452830188679e-08, + "loss": 7.431, + "step": 20 + }, + { + "epoch": 0.0007371369600471767, + "grad_norm": 3488.0, + "learning_rate": 1.4740566037735849e-08, + "loss": 7.3504, + "step": 25 + }, + { + "epoch": 0.0008845643520566122, + "grad_norm": 4448.0, + "learning_rate": 1.7688679245283017e-08, + "loss": 6.8362, + "step": 30 + }, + { + "epoch": 0.0010319917440660474, + "grad_norm": 3984.0, + "learning_rate": 2.0636792452830187e-08, + "loss": 7.2579, + "step": 35 + }, + { + "epoch": 0.0011794191360754828, + "grad_norm": 6112.0, + "learning_rate": 2.3584905660377358e-08, + "loss": 7.0144, + "step": 40 + }, + { + "epoch": 0.0013268465280849183, + "grad_norm": 3536.0, + "learning_rate": 2.6533018867924528e-08, + "loss": 6.6585, + "step": 45 + }, + { + "epoch": 0.0014742739200943535, + "grad_norm": 5184.0, + "learning_rate": 2.9481132075471698e-08, + "loss": 7.1249, + "step": 50 + }, + { + "epoch": 0.001621701312103789, + "grad_norm": 4736.0, + "learning_rate": 3.242924528301887e-08, + "loss": 7.0672, + "step": 55 + }, + { + "epoch": 0.0017691287041132243, + "grad_norm": 3888.0, + "learning_rate": 3.5377358490566035e-08, + "loss": 6.9418, + "step": 60 + }, + { + "epoch": 0.0019165560961226596, + "grad_norm": 4512.0, + "learning_rate": 3.83254716981132e-08, + "loss": 6.7915, + "step": 65 + }, + { + "epoch": 0.0020639834881320948, + "grad_norm": 4544.0, + "learning_rate": 4.1273584905660375e-08, + "loss": 6.9867, + "step": 70 + }, + { + "epoch": 0.0022114108801415304, + "grad_norm": 4160.0, + "learning_rate": 4.422169811320754e-08, + "loss": 6.9295, + "step": 75 + }, + { + "epoch": 0.0023588382721509656, + "grad_norm": 5024.0, + "learning_rate": 4.7169811320754715e-08, + "loss": 7.1398, + "step": 80 + }, + { + "epoch": 0.002506265664160401, + "grad_norm": 6688.0, + "learning_rate": 5.011792452830189e-08, + "loss": 6.8533, + "step": 85 + }, + { + "epoch": 0.0026536930561698365, + "grad_norm": 4128.0, + "learning_rate": 5.3066037735849055e-08, + "loss": 6.8373, + "step": 90 + }, + { + "epoch": 0.0028011204481792717, + "grad_norm": 4512.0, + "learning_rate": 5.601415094339622e-08, + "loss": 6.9857, + "step": 95 + }, + { + "epoch": 0.002948547840188707, + "grad_norm": 4704.0, + "learning_rate": 5.8962264150943396e-08, + "loss": 7.3043, + "step": 100 + }, + { + "epoch": 0.0030959752321981426, + "grad_norm": 5824.0, + "learning_rate": 6.191037735849057e-08, + "loss": 7.4446, + "step": 105 + }, + { + "epoch": 0.003243402624207578, + "grad_norm": 5600.0, + "learning_rate": 6.485849056603774e-08, + "loss": 6.4237, + "step": 110 + }, + { + "epoch": 0.003390830016217013, + "grad_norm": 3360.0, + "learning_rate": 6.78066037735849e-08, + "loss": 7.1134, + "step": 115 + }, + { + "epoch": 0.0035382574082264487, + "grad_norm": 5120.0, + "learning_rate": 7.075471698113207e-08, + "loss": 6.541, + "step": 120 + }, + { + "epoch": 0.003685684800235884, + "grad_norm": 4016.0, + "learning_rate": 7.370283018867925e-08, + "loss": 7.1745, + "step": 125 + }, + { + "epoch": 0.003833112192245319, + "grad_norm": 4096.0, + "learning_rate": 7.66509433962264e-08, + "loss": 6.6107, + "step": 130 + }, + { + "epoch": 0.003980539584254755, + "grad_norm": 6784.0, + "learning_rate": 7.959905660377358e-08, + "loss": 7.1709, + "step": 135 + }, + { + "epoch": 0.0041279669762641896, + "grad_norm": 4800.0, + "learning_rate": 8.254716981132075e-08, + "loss": 6.7303, + "step": 140 + }, + { + "epoch": 0.004275394368273625, + "grad_norm": 5472.0, + "learning_rate": 8.549528301886792e-08, + "loss": 6.6023, + "step": 145 + }, + { + "epoch": 0.004422821760283061, + "grad_norm": 5952.0, + "learning_rate": 8.844339622641508e-08, + "loss": 7.3574, + "step": 150 + }, + { + "epoch": 0.004570249152292496, + "grad_norm": 4480.0, + "learning_rate": 9.139150943396226e-08, + "loss": 6.4739, + "step": 155 + }, + { + "epoch": 0.004717676544301931, + "grad_norm": 2928.0, + "learning_rate": 9.433962264150943e-08, + "loss": 6.5303, + "step": 160 + }, + { + "epoch": 0.004865103936311367, + "grad_norm": 4064.0, + "learning_rate": 9.72877358490566e-08, + "loss": 6.7455, + "step": 165 + }, + { + "epoch": 0.005012531328320802, + "grad_norm": 2928.0, + "learning_rate": 1.0023584905660378e-07, + "loss": 6.2466, + "step": 170 + }, + { + "epoch": 0.005159958720330237, + "grad_norm": 3536.0, + "learning_rate": 1.0318396226415093e-07, + "loss": 6.493, + "step": 175 + }, + { + "epoch": 0.005307386112339673, + "grad_norm": 3792.0, + "learning_rate": 1.0613207547169811e-07, + "loss": 6.1627, + "step": 180 + }, + { + "epoch": 0.005454813504349108, + "grad_norm": 5568.0, + "learning_rate": 1.0908018867924528e-07, + "loss": 6.7054, + "step": 185 + }, + { + "epoch": 0.0056022408963585435, + "grad_norm": 5312.0, + "learning_rate": 1.1202830188679244e-07, + "loss": 6.3761, + "step": 190 + }, + { + "epoch": 0.005749668288367979, + "grad_norm": 3888.0, + "learning_rate": 1.1497641509433961e-07, + "loss": 6.4957, + "step": 195 + }, + { + "epoch": 0.005897095680377414, + "grad_norm": 4016.0, + "learning_rate": 1.1792452830188679e-07, + "loss": 6.3387, + "step": 200 + }, + { + "epoch": 0.0060445230723868495, + "grad_norm": 3712.0, + "learning_rate": 1.2087264150943396e-07, + "loss": 6.8239, + "step": 205 + }, + { + "epoch": 0.006191950464396285, + "grad_norm": 2224.0, + "learning_rate": 1.2382075471698114e-07, + "loss": 6.4956, + "step": 210 + }, + { + "epoch": 0.00633937785640572, + "grad_norm": 4736.0, + "learning_rate": 1.267688679245283e-07, + "loss": 6.2678, + "step": 215 + }, + { + "epoch": 0.006486805248415156, + "grad_norm": 4320.0, + "learning_rate": 1.2971698113207547e-07, + "loss": 6.4021, + "step": 220 + }, + { + "epoch": 0.006634232640424591, + "grad_norm": 4032.0, + "learning_rate": 1.3266509433962265e-07, + "loss": 5.8306, + "step": 225 + }, + { + "epoch": 0.006781660032434026, + "grad_norm": 3664.0, + "learning_rate": 1.356132075471698e-07, + "loss": 6.7351, + "step": 230 + }, + { + "epoch": 0.006929087424443462, + "grad_norm": 2032.0, + "learning_rate": 1.3856132075471696e-07, + "loss": 6.3457, + "step": 235 + }, + { + "epoch": 0.007076514816452897, + "grad_norm": 2784.0, + "learning_rate": 1.4150943396226414e-07, + "loss": 6.1769, + "step": 240 + }, + { + "epoch": 0.007223942208462332, + "grad_norm": 1464.0, + "learning_rate": 1.4445754716981132e-07, + "loss": 5.8652, + "step": 245 + }, + { + "epoch": 0.007371369600471768, + "grad_norm": 2008.0, + "learning_rate": 1.474056603773585e-07, + "loss": 6.0493, + "step": 250 + }, + { + "epoch": 0.007518796992481203, + "grad_norm": 1040.0, + "learning_rate": 1.5035377358490565e-07, + "loss": 5.9201, + "step": 255 + }, + { + "epoch": 0.007666224384490638, + "grad_norm": 1816.0, + "learning_rate": 1.533018867924528e-07, + "loss": 6.2471, + "step": 260 + }, + { + "epoch": 0.007813651776500074, + "grad_norm": 1432.0, + "learning_rate": 1.5624999999999999e-07, + "loss": 5.8522, + "step": 265 + }, + { + "epoch": 0.00796107916850951, + "grad_norm": 1816.0, + "learning_rate": 1.5919811320754717e-07, + "loss": 5.9513, + "step": 270 + }, + { + "epoch": 0.008108506560518945, + "grad_norm": 520.0, + "learning_rate": 1.6214622641509435e-07, + "loss": 5.784, + "step": 275 + }, + { + "epoch": 0.008255933952528379, + "grad_norm": 1272.0, + "learning_rate": 1.650943396226415e-07, + "loss": 5.6907, + "step": 280 + }, + { + "epoch": 0.008403361344537815, + "grad_norm": 852.0, + "learning_rate": 1.6804245283018868e-07, + "loss": 5.6086, + "step": 285 + }, + { + "epoch": 0.00855078873654725, + "grad_norm": 266.0, + "learning_rate": 1.7099056603773583e-07, + "loss": 5.3121, + "step": 290 + }, + { + "epoch": 0.008698216128556686, + "grad_norm": 756.0, + "learning_rate": 1.7393867924528301e-07, + "loss": 5.6198, + "step": 295 + }, + { + "epoch": 0.008845643520566122, + "grad_norm": 568.0, + "learning_rate": 1.7688679245283017e-07, + "loss": 5.5308, + "step": 300 + }, + { + "epoch": 0.008993070912575557, + "grad_norm": 644.0, + "learning_rate": 1.7983490566037735e-07, + "loss": 5.522, + "step": 305 + }, + { + "epoch": 0.009140498304584991, + "grad_norm": 596.0, + "learning_rate": 1.8278301886792453e-07, + "loss": 5.6054, + "step": 310 + }, + { + "epoch": 0.009287925696594427, + "grad_norm": 384.0, + "learning_rate": 1.857311320754717e-07, + "loss": 5.5085, + "step": 315 + }, + { + "epoch": 0.009435353088603863, + "grad_norm": 504.0, + "learning_rate": 1.8867924528301886e-07, + "loss": 5.6657, + "step": 320 + }, + { + "epoch": 0.009582780480613298, + "grad_norm": 460.0, + "learning_rate": 1.9162735849056601e-07, + "loss": 5.7185, + "step": 325 + }, + { + "epoch": 0.009730207872622734, + "grad_norm": 141.0, + "learning_rate": 1.945754716981132e-07, + "loss": 5.5482, + "step": 330 + }, + { + "epoch": 0.009877635264632168, + "grad_norm": 560.0, + "learning_rate": 1.9752358490566037e-07, + "loss": 5.378, + "step": 335 + }, + { + "epoch": 0.010025062656641603, + "grad_norm": 512.0, + "learning_rate": 2.0047169811320755e-07, + "loss": 5.6247, + "step": 340 + }, + { + "epoch": 0.010172490048651039, + "grad_norm": 528.0, + "learning_rate": 2.034198113207547e-07, + "loss": 5.1692, + "step": 345 + }, + { + "epoch": 0.010319917440660475, + "grad_norm": 624.0, + "learning_rate": 2.0636792452830186e-07, + "loss": 5.43, + "step": 350 + }, + { + "epoch": 0.01046734483266991, + "grad_norm": 592.0, + "learning_rate": 2.0931603773584904e-07, + "loss": 5.2031, + "step": 355 + }, + { + "epoch": 0.010614772224679346, + "grad_norm": 396.0, + "learning_rate": 2.1226415094339622e-07, + "loss": 5.1502, + "step": 360 + }, + { + "epoch": 0.01076219961668878, + "grad_norm": 516.0, + "learning_rate": 2.1521226415094338e-07, + "loss": 5.826, + "step": 365 + }, + { + "epoch": 0.010909627008698216, + "grad_norm": 760.0, + "learning_rate": 2.1816037735849056e-07, + "loss": 5.5087, + "step": 370 + }, + { + "epoch": 0.011057054400707651, + "grad_norm": 420.0, + "learning_rate": 2.2110849056603774e-07, + "loss": 5.146, + "step": 375 + }, + { + "epoch": 0.011204481792717087, + "grad_norm": 192.0, + "learning_rate": 2.240566037735849e-07, + "loss": 5.3402, + "step": 380 + }, + { + "epoch": 0.011351909184726523, + "grad_norm": 370.0, + "learning_rate": 2.2700471698113207e-07, + "loss": 5.3036, + "step": 385 + }, + { + "epoch": 0.011499336576735958, + "grad_norm": 352.0, + "learning_rate": 2.2995283018867922e-07, + "loss": 5.1917, + "step": 390 + }, + { + "epoch": 0.011646763968745392, + "grad_norm": 414.0, + "learning_rate": 2.329009433962264e-07, + "loss": 5.0797, + "step": 395 + }, + { + "epoch": 0.011794191360754828, + "grad_norm": 484.0, + "learning_rate": 2.3584905660377358e-07, + "loss": 5.296, + "step": 400 + }, + { + "epoch": 0.011941618752764263, + "grad_norm": 508.0, + "learning_rate": 2.3879716981132076e-07, + "loss": 5.026, + "step": 405 + }, + { + "epoch": 0.012089046144773699, + "grad_norm": 450.0, + "learning_rate": 2.417452830188679e-07, + "loss": 4.9075, + "step": 410 + }, + { + "epoch": 0.012236473536783135, + "grad_norm": 382.0, + "learning_rate": 2.4469339622641507e-07, + "loss": 5.0301, + "step": 415 + }, + { + "epoch": 0.01238390092879257, + "grad_norm": 384.0, + "learning_rate": 2.476415094339623e-07, + "loss": 4.9027, + "step": 420 + }, + { + "epoch": 0.012531328320802004, + "grad_norm": 188.0, + "learning_rate": 2.505896226415094e-07, + "loss": 5.0141, + "step": 425 + }, + { + "epoch": 0.01267875571281144, + "grad_norm": 157.0, + "learning_rate": 2.535377358490566e-07, + "loss": 4.5994, + "step": 430 + }, + { + "epoch": 0.012826183104820876, + "grad_norm": 386.0, + "learning_rate": 2.5648584905660374e-07, + "loss": 4.8006, + "step": 435 + }, + { + "epoch": 0.012973610496830311, + "grad_norm": 270.0, + "learning_rate": 2.5943396226415094e-07, + "loss": 4.4348, + "step": 440 + }, + { + "epoch": 0.013121037888839747, + "grad_norm": 161.0, + "learning_rate": 2.623820754716981e-07, + "loss": 4.7565, + "step": 445 + }, + { + "epoch": 0.013268465280849183, + "grad_norm": 186.0, + "learning_rate": 2.653301886792453e-07, + "loss": 4.7097, + "step": 450 + }, + { + "epoch": 0.013415892672858616, + "grad_norm": 115.0, + "learning_rate": 2.6827830188679246e-07, + "loss": 4.3228, + "step": 455 + }, + { + "epoch": 0.013563320064868052, + "grad_norm": 83.0, + "learning_rate": 2.712264150943396e-07, + "loss": 4.52, + "step": 460 + }, + { + "epoch": 0.013710747456877488, + "grad_norm": 142.0, + "learning_rate": 2.7417452830188676e-07, + "loss": 4.4627, + "step": 465 + }, + { + "epoch": 0.013858174848886923, + "grad_norm": 247.0, + "learning_rate": 2.771226415094339e-07, + "loss": 4.2847, + "step": 470 + }, + { + "epoch": 0.014005602240896359, + "grad_norm": 59.0, + "learning_rate": 2.800707547169811e-07, + "loss": 4.5636, + "step": 475 + }, + { + "epoch": 0.014153029632905795, + "grad_norm": 168.0, + "learning_rate": 2.830188679245283e-07, + "loss": 4.3123, + "step": 480 + }, + { + "epoch": 0.014300457024915229, + "grad_norm": 106.0, + "learning_rate": 2.8596698113207543e-07, + "loss": 4.3122, + "step": 485 + }, + { + "epoch": 0.014447884416924664, + "grad_norm": 97.0, + "learning_rate": 2.8891509433962264e-07, + "loss": 4.2267, + "step": 490 + }, + { + "epoch": 0.0145953118089341, + "grad_norm": 73.0, + "learning_rate": 2.918632075471698e-07, + "loss": 4.1379, + "step": 495 + }, + { + "epoch": 0.014742739200943536, + "grad_norm": 60.25, + "learning_rate": 2.94811320754717e-07, + "loss": 4.0958, + "step": 500 + }, + { + "epoch": 0.014742739200943536, + "eval_loss": 5.273393630981445, + "eval_runtime": 4.7076, + "eval_samples_per_second": 84.119, + "eval_steps_per_second": 2.761, + "step": 500 + }, + { + "epoch": 0.014890166592952971, + "grad_norm": 77.0, + "learning_rate": 2.9775943396226415e-07, + "loss": 3.849, + "step": 505 + }, + { + "epoch": 0.015037593984962405, + "grad_norm": 83.0, + "learning_rate": 3.007075471698113e-07, + "loss": 4.2059, + "step": 510 + }, + { + "epoch": 0.01518502137697184, + "grad_norm": 120.0, + "learning_rate": 3.0365566037735846e-07, + "loss": 4.1134, + "step": 515 + }, + { + "epoch": 0.015332448768981276, + "grad_norm": 58.25, + "learning_rate": 3.066037735849056e-07, + "loss": 3.9189, + "step": 520 + }, + { + "epoch": 0.015479876160990712, + "grad_norm": 77.0, + "learning_rate": 3.095518867924528e-07, + "loss": 4.1662, + "step": 525 + }, + { + "epoch": 0.015627303553000148, + "grad_norm": 120.5, + "learning_rate": 3.1249999999999997e-07, + "loss": 4.0004, + "step": 530 + }, + { + "epoch": 0.01577473094500958, + "grad_norm": 51.75, + "learning_rate": 3.154481132075472e-07, + "loss": 4.1284, + "step": 535 + }, + { + "epoch": 0.01592215833701902, + "grad_norm": 56.0, + "learning_rate": 3.1839622641509433e-07, + "loss": 4.1131, + "step": 540 + }, + { + "epoch": 0.016069585729028453, + "grad_norm": 87.0, + "learning_rate": 3.2134433962264154e-07, + "loss": 4.0497, + "step": 545 + }, + { + "epoch": 0.01621701312103789, + "grad_norm": 53.25, + "learning_rate": 3.242924528301887e-07, + "loss": 3.8763, + "step": 550 + }, + { + "epoch": 0.016364440513047324, + "grad_norm": 80.5, + "learning_rate": 3.272405660377358e-07, + "loss": 4.1176, + "step": 555 + }, + { + "epoch": 0.016511867905056758, + "grad_norm": 72.0, + "learning_rate": 3.30188679245283e-07, + "loss": 4.0426, + "step": 560 + }, + { + "epoch": 0.016659295297066196, + "grad_norm": 52.25, + "learning_rate": 3.3313679245283015e-07, + "loss": 3.7246, + "step": 565 + }, + { + "epoch": 0.01680672268907563, + "grad_norm": 50.0, + "learning_rate": 3.3608490566037736e-07, + "loss": 3.914, + "step": 570 + }, + { + "epoch": 0.016954150081085067, + "grad_norm": 48.25, + "learning_rate": 3.390330188679245e-07, + "loss": 4.1672, + "step": 575 + }, + { + "epoch": 0.0171015774730945, + "grad_norm": 56.5, + "learning_rate": 3.4198113207547167e-07, + "loss": 3.7885, + "step": 580 + }, + { + "epoch": 0.017249004865103935, + "grad_norm": 130.0, + "learning_rate": 3.449292452830189e-07, + "loss": 3.9511, + "step": 585 + }, + { + "epoch": 0.017396432257113372, + "grad_norm": 69.5, + "learning_rate": 3.4787735849056603e-07, + "loss": 4.0633, + "step": 590 + }, + { + "epoch": 0.017543859649122806, + "grad_norm": 49.5, + "learning_rate": 3.5082547169811323e-07, + "loss": 3.801, + "step": 595 + }, + { + "epoch": 0.017691287041132243, + "grad_norm": 38.0, + "learning_rate": 3.5377358490566033e-07, + "loss": 3.6217, + "step": 600 + }, + { + "epoch": 0.017838714433141677, + "grad_norm": 35.75, + "learning_rate": 3.567216981132075e-07, + "loss": 3.9652, + "step": 605 + }, + { + "epoch": 0.017986141825151115, + "grad_norm": 52.0, + "learning_rate": 3.596698113207547e-07, + "loss": 3.7182, + "step": 610 + }, + { + "epoch": 0.01813356921716055, + "grad_norm": 58.5, + "learning_rate": 3.6261792452830185e-07, + "loss": 3.6423, + "step": 615 + }, + { + "epoch": 0.018280996609169983, + "grad_norm": 46.25, + "learning_rate": 3.6556603773584905e-07, + "loss": 3.7992, + "step": 620 + }, + { + "epoch": 0.01842842400117942, + "grad_norm": 42.25, + "learning_rate": 3.685141509433962e-07, + "loss": 3.8397, + "step": 625 + }, + { + "epoch": 0.018575851393188854, + "grad_norm": 36.0, + "learning_rate": 3.714622641509434e-07, + "loss": 3.6173, + "step": 630 + }, + { + "epoch": 0.01872327878519829, + "grad_norm": 49.75, + "learning_rate": 3.7441037735849057e-07, + "loss": 4.0317, + "step": 635 + }, + { + "epoch": 0.018870706177207725, + "grad_norm": 38.5, + "learning_rate": 3.773584905660377e-07, + "loss": 3.7633, + "step": 640 + }, + { + "epoch": 0.01901813356921716, + "grad_norm": 41.75, + "learning_rate": 3.803066037735849e-07, + "loss": 3.8188, + "step": 645 + }, + { + "epoch": 0.019165560961226596, + "grad_norm": 63.5, + "learning_rate": 3.8325471698113203e-07, + "loss": 3.7537, + "step": 650 + }, + { + "epoch": 0.01931298835323603, + "grad_norm": 49.25, + "learning_rate": 3.8620283018867924e-07, + "loss": 3.6774, + "step": 655 + }, + { + "epoch": 0.019460415745245468, + "grad_norm": 39.0, + "learning_rate": 3.891509433962264e-07, + "loss": 3.7567, + "step": 660 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 39.0, + "learning_rate": 3.920990566037736e-07, + "loss": 3.756, + "step": 665 + }, + { + "epoch": 0.019755270529264336, + "grad_norm": 69.0, + "learning_rate": 3.9504716981132075e-07, + "loss": 3.6765, + "step": 670 + }, + { + "epoch": 0.019902697921273773, + "grad_norm": 60.75, + "learning_rate": 3.979952830188679e-07, + "loss": 3.949, + "step": 675 + }, + { + "epoch": 0.020050125313283207, + "grad_norm": 47.25, + "learning_rate": 4.009433962264151e-07, + "loss": 3.7218, + "step": 680 + }, + { + "epoch": 0.020197552705292644, + "grad_norm": 93.5, + "learning_rate": 4.038915094339622e-07, + "loss": 3.6829, + "step": 685 + }, + { + "epoch": 0.020344980097302078, + "grad_norm": 39.0, + "learning_rate": 4.068396226415094e-07, + "loss": 3.5724, + "step": 690 + }, + { + "epoch": 0.020492407489311516, + "grad_norm": 62.0, + "learning_rate": 4.0978773584905657e-07, + "loss": 3.6344, + "step": 695 + }, + { + "epoch": 0.02063983488132095, + "grad_norm": 196.0, + "learning_rate": 4.127358490566037e-07, + "loss": 3.5317, + "step": 700 + }, + { + "epoch": 0.020787262273330383, + "grad_norm": 41.0, + "learning_rate": 4.1568396226415093e-07, + "loss": 3.9269, + "step": 705 + }, + { + "epoch": 0.02093468966533982, + "grad_norm": 49.0, + "learning_rate": 4.186320754716981e-07, + "loss": 3.7463, + "step": 710 + }, + { + "epoch": 0.021082117057349255, + "grad_norm": 61.75, + "learning_rate": 4.215801886792453e-07, + "loss": 3.426, + "step": 715 + }, + { + "epoch": 0.021229544449358692, + "grad_norm": 40.25, + "learning_rate": 4.2452830188679244e-07, + "loss": 3.5511, + "step": 720 + }, + { + "epoch": 0.021376971841368126, + "grad_norm": 60.25, + "learning_rate": 4.2747641509433965e-07, + "loss": 3.8714, + "step": 725 + }, + { + "epoch": 0.02152439923337756, + "grad_norm": 67.0, + "learning_rate": 4.3042452830188675e-07, + "loss": 3.756, + "step": 730 + }, + { + "epoch": 0.021671826625386997, + "grad_norm": 47.0, + "learning_rate": 4.333726415094339e-07, + "loss": 3.548, + "step": 735 + }, + { + "epoch": 0.02181925401739643, + "grad_norm": 32.25, + "learning_rate": 4.363207547169811e-07, + "loss": 3.6436, + "step": 740 + }, + { + "epoch": 0.02196668140940587, + "grad_norm": 54.75, + "learning_rate": 4.3926886792452826e-07, + "loss": 3.4437, + "step": 745 + }, + { + "epoch": 0.022114108801415303, + "grad_norm": 55.75, + "learning_rate": 4.4221698113207547e-07, + "loss": 3.6474, + "step": 750 + }, + { + "epoch": 0.02226153619342474, + "grad_norm": 49.0, + "learning_rate": 4.451650943396226e-07, + "loss": 3.7664, + "step": 755 + }, + { + "epoch": 0.022408963585434174, + "grad_norm": 37.0, + "learning_rate": 4.481132075471698e-07, + "loss": 3.4262, + "step": 760 + }, + { + "epoch": 0.022556390977443608, + "grad_norm": 72.0, + "learning_rate": 4.51061320754717e-07, + "loss": 3.5507, + "step": 765 + }, + { + "epoch": 0.022703818369453045, + "grad_norm": 34.75, + "learning_rate": 4.5400943396226414e-07, + "loss": 3.6294, + "step": 770 + }, + { + "epoch": 0.02285124576146248, + "grad_norm": 71.0, + "learning_rate": 4.569575471698113e-07, + "loss": 3.3822, + "step": 775 + }, + { + "epoch": 0.022998673153471916, + "grad_norm": 57.5, + "learning_rate": 4.5990566037735845e-07, + "loss": 3.6301, + "step": 780 + }, + { + "epoch": 0.02314610054548135, + "grad_norm": 44.5, + "learning_rate": 4.6285377358490565e-07, + "loss": 3.5542, + "step": 785 + }, + { + "epoch": 0.023293527937490784, + "grad_norm": 30.25, + "learning_rate": 4.658018867924528e-07, + "loss": 3.4738, + "step": 790 + }, + { + "epoch": 0.02344095532950022, + "grad_norm": 36.25, + "learning_rate": 4.6874999999999996e-07, + "loss": 3.6581, + "step": 795 + }, + { + "epoch": 0.023588382721509656, + "grad_norm": 27.5, + "learning_rate": 4.7169811320754717e-07, + "loss": 3.4342, + "step": 800 + }, + { + "epoch": 0.023735810113519093, + "grad_norm": 36.5, + "learning_rate": 4.746462264150943e-07, + "loss": 3.6826, + "step": 805 + }, + { + "epoch": 0.023883237505528527, + "grad_norm": 41.75, + "learning_rate": 4.775943396226415e-07, + "loss": 3.485, + "step": 810 + }, + { + "epoch": 0.024030664897537964, + "grad_norm": 171.0, + "learning_rate": 4.805424528301887e-07, + "loss": 3.5775, + "step": 815 + }, + { + "epoch": 0.024178092289547398, + "grad_norm": 50.25, + "learning_rate": 4.834905660377358e-07, + "loss": 3.6154, + "step": 820 + }, + { + "epoch": 0.024325519681556832, + "grad_norm": 28.875, + "learning_rate": 4.86438679245283e-07, + "loss": 3.6943, + "step": 825 + }, + { + "epoch": 0.02447294707356627, + "grad_norm": 35.75, + "learning_rate": 4.893867924528301e-07, + "loss": 3.7264, + "step": 830 + }, + { + "epoch": 0.024620374465575703, + "grad_norm": 46.5, + "learning_rate": 4.923349056603773e-07, + "loss": 3.4934, + "step": 835 + }, + { + "epoch": 0.02476780185758514, + "grad_norm": 42.0, + "learning_rate": 4.952830188679246e-07, + "loss": 3.5727, + "step": 840 + }, + { + "epoch": 0.024915229249594575, + "grad_norm": 27.5, + "learning_rate": 4.982311320754717e-07, + "loss": 3.3648, + "step": 845 + }, + { + "epoch": 0.02506265664160401, + "grad_norm": 22.75, + "learning_rate": 5.011792452830188e-07, + "loss": 3.5408, + "step": 850 + }, + { + "epoch": 0.025210084033613446, + "grad_norm": 44.75, + "learning_rate": 5.04127358490566e-07, + "loss": 3.7565, + "step": 855 + }, + { + "epoch": 0.02535751142562288, + "grad_norm": 33.75, + "learning_rate": 5.070754716981132e-07, + "loss": 3.5236, + "step": 860 + }, + { + "epoch": 0.025504938817632317, + "grad_norm": 79.0, + "learning_rate": 5.100235849056603e-07, + "loss": 3.4544, + "step": 865 + }, + { + "epoch": 0.02565236620964175, + "grad_norm": 54.75, + "learning_rate": 5.129716981132075e-07, + "loss": 3.3027, + "step": 870 + }, + { + "epoch": 0.025799793601651185, + "grad_norm": 37.0, + "learning_rate": 5.159198113207547e-07, + "loss": 3.2541, + "step": 875 + }, + { + "epoch": 0.025947220993660623, + "grad_norm": 33.0, + "learning_rate": 5.188679245283019e-07, + "loss": 3.5423, + "step": 880 + }, + { + "epoch": 0.026094648385670056, + "grad_norm": 31.125, + "learning_rate": 5.21816037735849e-07, + "loss": 3.5534, + "step": 885 + }, + { + "epoch": 0.026242075777679494, + "grad_norm": 30.25, + "learning_rate": 5.247641509433962e-07, + "loss": 3.6136, + "step": 890 + }, + { + "epoch": 0.026389503169688928, + "grad_norm": 29.125, + "learning_rate": 5.277122641509433e-07, + "loss": 3.3698, + "step": 895 + }, + { + "epoch": 0.026536930561698365, + "grad_norm": 30.5, + "learning_rate": 5.306603773584906e-07, + "loss": 3.4885, + "step": 900 + }, + { + "epoch": 0.0266843579537078, + "grad_norm": 24.375, + "learning_rate": 5.336084905660378e-07, + "loss": 3.3895, + "step": 905 + }, + { + "epoch": 0.026831785345717233, + "grad_norm": 34.5, + "learning_rate": 5.365566037735849e-07, + "loss": 3.5425, + "step": 910 + }, + { + "epoch": 0.02697921273772667, + "grad_norm": 53.5, + "learning_rate": 5.395047169811321e-07, + "loss": 3.5077, + "step": 915 + }, + { + "epoch": 0.027126640129736104, + "grad_norm": 25.375, + "learning_rate": 5.424528301886792e-07, + "loss": 3.3913, + "step": 920 + }, + { + "epoch": 0.02727406752174554, + "grad_norm": 27.75, + "learning_rate": 5.454009433962265e-07, + "loss": 3.467, + "step": 925 + }, + { + "epoch": 0.027421494913754976, + "grad_norm": 53.0, + "learning_rate": 5.483490566037735e-07, + "loss": 3.7339, + "step": 930 + }, + { + "epoch": 0.02756892230576441, + "grad_norm": 34.0, + "learning_rate": 5.512971698113207e-07, + "loss": 3.4162, + "step": 935 + }, + { + "epoch": 0.027716349697773847, + "grad_norm": 26.125, + "learning_rate": 5.542452830188678e-07, + "loss": 3.4891, + "step": 940 + }, + { + "epoch": 0.02786377708978328, + "grad_norm": 30.625, + "learning_rate": 5.57193396226415e-07, + "loss": 3.6314, + "step": 945 + }, + { + "epoch": 0.028011204481792718, + "grad_norm": 26.5, + "learning_rate": 5.601415094339622e-07, + "loss": 3.5523, + "step": 950 + }, + { + "epoch": 0.028158631873802152, + "grad_norm": 33.75, + "learning_rate": 5.630896226415094e-07, + "loss": 3.3893, + "step": 955 + }, + { + "epoch": 0.02830605926581159, + "grad_norm": 32.0, + "learning_rate": 5.660377358490566e-07, + "loss": 3.3843, + "step": 960 + }, + { + "epoch": 0.028453486657821023, + "grad_norm": 29.5, + "learning_rate": 5.689858490566037e-07, + "loss": 3.5737, + "step": 965 + }, + { + "epoch": 0.028600914049830457, + "grad_norm": 71.0, + "learning_rate": 5.719339622641509e-07, + "loss": 3.2828, + "step": 970 + }, + { + "epoch": 0.028748341441839895, + "grad_norm": 42.25, + "learning_rate": 5.748820754716981e-07, + "loss": 3.414, + "step": 975 + }, + { + "epoch": 0.02889576883384933, + "grad_norm": 30.0, + "learning_rate": 5.778301886792453e-07, + "loss": 3.3214, + "step": 980 + }, + { + "epoch": 0.029043196225858766, + "grad_norm": 24.0, + "learning_rate": 5.807783018867924e-07, + "loss": 3.3956, + "step": 985 + }, + { + "epoch": 0.0291906236178682, + "grad_norm": 29.0, + "learning_rate": 5.837264150943396e-07, + "loss": 3.4536, + "step": 990 + }, + { + "epoch": 0.029338051009877634, + "grad_norm": 73.0, + "learning_rate": 5.866745283018868e-07, + "loss": 3.8505, + "step": 995 + }, + { + "epoch": 0.02948547840188707, + "grad_norm": 28.875, + "learning_rate": 5.89622641509434e-07, + "loss": 3.4702, + "step": 1000 + }, + { + "epoch": 0.02948547840188707, + "eval_loss": 4.123366832733154, + "eval_runtime": 4.7173, + "eval_samples_per_second": 83.946, + "eval_steps_per_second": 2.756, + "step": 1000 + }, + { + "epoch": 0.029632905793896505, + "grad_norm": 40.5, + "learning_rate": 5.925707547169812e-07, + "loss": 3.4618, + "step": 1005 + }, + { + "epoch": 0.029780333185905943, + "grad_norm": 31.0, + "learning_rate": 5.955188679245283e-07, + "loss": 3.3067, + "step": 1010 + }, + { + "epoch": 0.029927760577915376, + "grad_norm": 25.5, + "learning_rate": 5.984669811320755e-07, + "loss": 3.2988, + "step": 1015 + }, + { + "epoch": 0.03007518796992481, + "grad_norm": 24.875, + "learning_rate": 6.014150943396226e-07, + "loss": 3.4574, + "step": 1020 + }, + { + "epoch": 0.030222615361934248, + "grad_norm": 34.0, + "learning_rate": 6.043632075471698e-07, + "loss": 3.4777, + "step": 1025 + }, + { + "epoch": 0.03037004275394368, + "grad_norm": 21.375, + "learning_rate": 6.073113207547169e-07, + "loss": 3.2491, + "step": 1030 + }, + { + "epoch": 0.03051747014595312, + "grad_norm": 32.75, + "learning_rate": 6.102594339622641e-07, + "loss": 3.4012, + "step": 1035 + }, + { + "epoch": 0.030664897537962553, + "grad_norm": 20.625, + "learning_rate": 6.132075471698112e-07, + "loss": 3.2219, + "step": 1040 + }, + { + "epoch": 0.03081232492997199, + "grad_norm": 28.625, + "learning_rate": 6.161556603773585e-07, + "loss": 3.4702, + "step": 1045 + }, + { + "epoch": 0.030959752321981424, + "grad_norm": 32.5, + "learning_rate": 6.191037735849056e-07, + "loss": 3.2885, + "step": 1050 + }, + { + "epoch": 0.031107179713990858, + "grad_norm": 28.0, + "learning_rate": 6.220518867924528e-07, + "loss": 3.414, + "step": 1055 + }, + { + "epoch": 0.031254607106000296, + "grad_norm": 24.125, + "learning_rate": 6.249999999999999e-07, + "loss": 3.35, + "step": 1060 + }, + { + "epoch": 0.03140203449800973, + "grad_norm": 26.25, + "learning_rate": 6.279481132075471e-07, + "loss": 3.4552, + "step": 1065 + }, + { + "epoch": 0.03154946189001916, + "grad_norm": 78.0, + "learning_rate": 6.308962264150944e-07, + "loss": 3.2664, + "step": 1070 + }, + { + "epoch": 0.0316968892820286, + "grad_norm": 27.375, + "learning_rate": 6.338443396226415e-07, + "loss": 3.4228, + "step": 1075 + }, + { + "epoch": 0.03184431667403804, + "grad_norm": 32.25, + "learning_rate": 6.367924528301887e-07, + "loss": 3.2762, + "step": 1080 + }, + { + "epoch": 0.03199174406604747, + "grad_norm": 25.875, + "learning_rate": 6.397405660377358e-07, + "loss": 3.2858, + "step": 1085 + }, + { + "epoch": 0.032139171458056906, + "grad_norm": 27.875, + "learning_rate": 6.426886792452831e-07, + "loss": 3.3251, + "step": 1090 + }, + { + "epoch": 0.03228659885006634, + "grad_norm": 21.5, + "learning_rate": 6.456367924528302e-07, + "loss": 3.3962, + "step": 1095 + }, + { + "epoch": 0.03243402624207578, + "grad_norm": 20.5, + "learning_rate": 6.485849056603774e-07, + "loss": 3.4683, + "step": 1100 + }, + { + "epoch": 0.03258145363408521, + "grad_norm": 23.75, + "learning_rate": 6.515330188679244e-07, + "loss": 3.2019, + "step": 1105 + }, + { + "epoch": 0.03272888102609465, + "grad_norm": 55.5, + "learning_rate": 6.544811320754716e-07, + "loss": 3.4679, + "step": 1110 + }, + { + "epoch": 0.032876308418104086, + "grad_norm": 36.75, + "learning_rate": 6.574292452830188e-07, + "loss": 3.3551, + "step": 1115 + }, + { + "epoch": 0.033023735810113516, + "grad_norm": 23.5, + "learning_rate": 6.60377358490566e-07, + "loss": 3.3352, + "step": 1120 + }, + { + "epoch": 0.033171163202122954, + "grad_norm": 31.5, + "learning_rate": 6.633254716981132e-07, + "loss": 3.4598, + "step": 1125 + }, + { + "epoch": 0.03331859059413239, + "grad_norm": 36.0, + "learning_rate": 6.662735849056603e-07, + "loss": 3.5186, + "step": 1130 + }, + { + "epoch": 0.03346601798614183, + "grad_norm": 34.75, + "learning_rate": 6.692216981132075e-07, + "loss": 3.3776, + "step": 1135 + }, + { + "epoch": 0.03361344537815126, + "grad_norm": 45.0, + "learning_rate": 6.721698113207547e-07, + "loss": 3.4942, + "step": 1140 + }, + { + "epoch": 0.033760872770160696, + "grad_norm": 21.0, + "learning_rate": 6.751179245283019e-07, + "loss": 3.4764, + "step": 1145 + }, + { + "epoch": 0.033908300162170134, + "grad_norm": 21.875, + "learning_rate": 6.78066037735849e-07, + "loss": 3.3914, + "step": 1150 + }, + { + "epoch": 0.034055727554179564, + "grad_norm": 22.375, + "learning_rate": 6.810141509433962e-07, + "loss": 3.2916, + "step": 1155 + }, + { + "epoch": 0.034203154946189, + "grad_norm": 24.875, + "learning_rate": 6.839622641509433e-07, + "loss": 3.3966, + "step": 1160 + }, + { + "epoch": 0.03435058233819844, + "grad_norm": 32.25, + "learning_rate": 6.869103773584906e-07, + "loss": 3.2619, + "step": 1165 + }, + { + "epoch": 0.03449800973020787, + "grad_norm": 24.875, + "learning_rate": 6.898584905660377e-07, + "loss": 3.2777, + "step": 1170 + }, + { + "epoch": 0.03464543712221731, + "grad_norm": 20.875, + "learning_rate": 6.928066037735849e-07, + "loss": 3.228, + "step": 1175 + }, + { + "epoch": 0.034792864514226744, + "grad_norm": 24.0, + "learning_rate": 6.957547169811321e-07, + "loss": 3.2484, + "step": 1180 + }, + { + "epoch": 0.03494029190623618, + "grad_norm": 21.5, + "learning_rate": 6.987028301886792e-07, + "loss": 3.2644, + "step": 1185 + }, + { + "epoch": 0.03508771929824561, + "grad_norm": 21.0, + "learning_rate": 7.016509433962265e-07, + "loss": 3.0695, + "step": 1190 + }, + { + "epoch": 0.03523514669025505, + "grad_norm": 35.25, + "learning_rate": 7.045990566037735e-07, + "loss": 3.3176, + "step": 1195 + }, + { + "epoch": 0.03538257408226449, + "grad_norm": 19.25, + "learning_rate": 7.075471698113207e-07, + "loss": 3.2097, + "step": 1200 + }, + { + "epoch": 0.03553000147427392, + "grad_norm": 25.875, + "learning_rate": 7.104952830188678e-07, + "loss": 3.3202, + "step": 1205 + }, + { + "epoch": 0.035677428866283355, + "grad_norm": 27.25, + "learning_rate": 7.13443396226415e-07, + "loss": 3.1499, + "step": 1210 + }, + { + "epoch": 0.03582485625829279, + "grad_norm": 34.25, + "learning_rate": 7.163915094339622e-07, + "loss": 3.308, + "step": 1215 + }, + { + "epoch": 0.03597228365030223, + "grad_norm": 23.875, + "learning_rate": 7.193396226415094e-07, + "loss": 3.0438, + "step": 1220 + }, + { + "epoch": 0.03611971104231166, + "grad_norm": 26.5, + "learning_rate": 7.222877358490565e-07, + "loss": 3.2286, + "step": 1225 + }, + { + "epoch": 0.0362671384343211, + "grad_norm": 25.0, + "learning_rate": 7.252358490566037e-07, + "loss": 3.1494, + "step": 1230 + }, + { + "epoch": 0.036414565826330535, + "grad_norm": 33.5, + "learning_rate": 7.28183962264151e-07, + "loss": 3.3187, + "step": 1235 + }, + { + "epoch": 0.036561993218339965, + "grad_norm": 20.625, + "learning_rate": 7.311320754716981e-07, + "loss": 3.2469, + "step": 1240 + }, + { + "epoch": 0.0367094206103494, + "grad_norm": 20.75, + "learning_rate": 7.340801886792453e-07, + "loss": 3.2862, + "step": 1245 + }, + { + "epoch": 0.03685684800235884, + "grad_norm": 27.875, + "learning_rate": 7.370283018867924e-07, + "loss": 3.2839, + "step": 1250 + }, + { + "epoch": 0.03700427539436827, + "grad_norm": 20.75, + "learning_rate": 7.399764150943396e-07, + "loss": 3.3897, + "step": 1255 + }, + { + "epoch": 0.03715170278637771, + "grad_norm": 27.125, + "learning_rate": 7.429245283018868e-07, + "loss": 3.1699, + "step": 1260 + }, + { + "epoch": 0.037299130178387145, + "grad_norm": 21.75, + "learning_rate": 7.45872641509434e-07, + "loss": 3.163, + "step": 1265 + }, + { + "epoch": 0.03744655757039658, + "grad_norm": 21.625, + "learning_rate": 7.488207547169811e-07, + "loss": 3.244, + "step": 1270 + }, + { + "epoch": 0.03759398496240601, + "grad_norm": 27.75, + "learning_rate": 7.517688679245283e-07, + "loss": 3.2307, + "step": 1275 + }, + { + "epoch": 0.03774141235441545, + "grad_norm": 40.75, + "learning_rate": 7.547169811320754e-07, + "loss": 3.2905, + "step": 1280 + }, + { + "epoch": 0.03788883974642489, + "grad_norm": 25.25, + "learning_rate": 7.576650943396226e-07, + "loss": 3.3318, + "step": 1285 + }, + { + "epoch": 0.03803626713843432, + "grad_norm": 32.5, + "learning_rate": 7.606132075471698e-07, + "loss": 3.2618, + "step": 1290 + }, + { + "epoch": 0.038183694530443756, + "grad_norm": 33.5, + "learning_rate": 7.635613207547169e-07, + "loss": 3.3272, + "step": 1295 + }, + { + "epoch": 0.03833112192245319, + "grad_norm": 28.125, + "learning_rate": 7.665094339622641e-07, + "loss": 3.1578, + "step": 1300 + }, + { + "epoch": 0.03847854931446263, + "grad_norm": 21.375, + "learning_rate": 7.694575471698112e-07, + "loss": 3.0873, + "step": 1305 + }, + { + "epoch": 0.03862597670647206, + "grad_norm": 23.75, + "learning_rate": 7.724056603773585e-07, + "loss": 3.3165, + "step": 1310 + }, + { + "epoch": 0.0387734040984815, + "grad_norm": 21.5, + "learning_rate": 7.753537735849056e-07, + "loss": 3.1587, + "step": 1315 + }, + { + "epoch": 0.038920831490490936, + "grad_norm": 20.5, + "learning_rate": 7.783018867924528e-07, + "loss": 3.3898, + "step": 1320 + }, + { + "epoch": 0.039068258882500366, + "grad_norm": 34.0, + "learning_rate": 7.812499999999999e-07, + "loss": 3.1372, + "step": 1325 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 18.875, + "learning_rate": 7.841981132075472e-07, + "loss": 3.2379, + "step": 1330 + }, + { + "epoch": 0.03936311366651924, + "grad_norm": 21.125, + "learning_rate": 7.871462264150943e-07, + "loss": 3.3005, + "step": 1335 + }, + { + "epoch": 0.03951054105852867, + "grad_norm": 23.25, + "learning_rate": 7.900943396226415e-07, + "loss": 3.1946, + "step": 1340 + }, + { + "epoch": 0.03965796845053811, + "grad_norm": 23.375, + "learning_rate": 7.930424528301887e-07, + "loss": 3.0714, + "step": 1345 + }, + { + "epoch": 0.039805395842547546, + "grad_norm": 28.375, + "learning_rate": 7.959905660377358e-07, + "loss": 3.3614, + "step": 1350 + }, + { + "epoch": 0.03995282323455698, + "grad_norm": 24.25, + "learning_rate": 7.989386792452831e-07, + "loss": 3.1599, + "step": 1355 + }, + { + "epoch": 0.040100250626566414, + "grad_norm": 20.375, + "learning_rate": 8.018867924528302e-07, + "loss": 3.1418, + "step": 1360 + }, + { + "epoch": 0.04024767801857585, + "grad_norm": 23.5, + "learning_rate": 8.048349056603774e-07, + "loss": 2.9725, + "step": 1365 + }, + { + "epoch": 0.04039510541058529, + "grad_norm": 29.0, + "learning_rate": 8.077830188679244e-07, + "loss": 3.0553, + "step": 1370 + }, + { + "epoch": 0.04054253280259472, + "grad_norm": 29.0, + "learning_rate": 8.107311320754716e-07, + "loss": 3.1955, + "step": 1375 + }, + { + "epoch": 0.040689960194604156, + "grad_norm": 21.75, + "learning_rate": 8.136792452830188e-07, + "loss": 3.1639, + "step": 1380 + }, + { + "epoch": 0.040837387586613594, + "grad_norm": 33.0, + "learning_rate": 8.16627358490566e-07, + "loss": 3.1981, + "step": 1385 + }, + { + "epoch": 0.04098481497862303, + "grad_norm": 25.625, + "learning_rate": 8.195754716981131e-07, + "loss": 3.081, + "step": 1390 + }, + { + "epoch": 0.04113224237063246, + "grad_norm": 24.0, + "learning_rate": 8.225235849056603e-07, + "loss": 3.0706, + "step": 1395 + }, + { + "epoch": 0.0412796697626419, + "grad_norm": 23.0, + "learning_rate": 8.254716981132074e-07, + "loss": 3.2926, + "step": 1400 + }, + { + "epoch": 0.041427097154651336, + "grad_norm": 23.375, + "learning_rate": 8.284198113207547e-07, + "loss": 3.1699, + "step": 1405 + }, + { + "epoch": 0.04157452454666077, + "grad_norm": 30.0, + "learning_rate": 8.313679245283019e-07, + "loss": 3.145, + "step": 1410 + }, + { + "epoch": 0.041721951938670204, + "grad_norm": 29.25, + "learning_rate": 8.34316037735849e-07, + "loss": 3.2222, + "step": 1415 + }, + { + "epoch": 0.04186937933067964, + "grad_norm": 36.5, + "learning_rate": 8.372641509433962e-07, + "loss": 3.2383, + "step": 1420 + }, + { + "epoch": 0.04201680672268908, + "grad_norm": 19.875, + "learning_rate": 8.402122641509433e-07, + "loss": 2.9331, + "step": 1425 + }, + { + "epoch": 0.04216423411469851, + "grad_norm": 24.375, + "learning_rate": 8.431603773584906e-07, + "loss": 3.076, + "step": 1430 + }, + { + "epoch": 0.04231166150670795, + "grad_norm": 24.125, + "learning_rate": 8.461084905660377e-07, + "loss": 3.1641, + "step": 1435 + }, + { + "epoch": 0.042459088898717384, + "grad_norm": 21.375, + "learning_rate": 8.490566037735849e-07, + "loss": 3.3244, + "step": 1440 + }, + { + "epoch": 0.042606516290726815, + "grad_norm": 21.125, + "learning_rate": 8.52004716981132e-07, + "loss": 3.1178, + "step": 1445 + }, + { + "epoch": 0.04275394368273625, + "grad_norm": 27.0, + "learning_rate": 8.549528301886793e-07, + "loss": 3.1739, + "step": 1450 + }, + { + "epoch": 0.04290137107474569, + "grad_norm": 21.625, + "learning_rate": 8.579009433962265e-07, + "loss": 3.2333, + "step": 1455 + }, + { + "epoch": 0.04304879846675512, + "grad_norm": 31.25, + "learning_rate": 8.608490566037735e-07, + "loss": 3.2602, + "step": 1460 + }, + { + "epoch": 0.04319622585876456, + "grad_norm": 22.0, + "learning_rate": 8.637971698113207e-07, + "loss": 3.2519, + "step": 1465 + }, + { + "epoch": 0.043343653250773995, + "grad_norm": 24.625, + "learning_rate": 8.667452830188678e-07, + "loss": 3.0988, + "step": 1470 + }, + { + "epoch": 0.04349108064278343, + "grad_norm": 28.125, + "learning_rate": 8.696933962264151e-07, + "loss": 3.1193, + "step": 1475 + }, + { + "epoch": 0.04363850803479286, + "grad_norm": 20.625, + "learning_rate": 8.726415094339622e-07, + "loss": 3.2805, + "step": 1480 + }, + { + "epoch": 0.0437859354268023, + "grad_norm": 18.875, + "learning_rate": 8.755896226415094e-07, + "loss": 3.2113, + "step": 1485 + }, + { + "epoch": 0.04393336281881174, + "grad_norm": 17.625, + "learning_rate": 8.785377358490565e-07, + "loss": 3.2982, + "step": 1490 + }, + { + "epoch": 0.04408079021082117, + "grad_norm": 17.375, + "learning_rate": 8.814858490566037e-07, + "loss": 3.187, + "step": 1495 + }, + { + "epoch": 0.044228217602830605, + "grad_norm": 21.75, + "learning_rate": 8.844339622641509e-07, + "loss": 3.1377, + "step": 1500 + }, + { + "epoch": 0.044228217602830605, + "eval_loss": 3.5992095470428467, + "eval_runtime": 4.6937, + "eval_samples_per_second": 84.368, + "eval_steps_per_second": 2.77, + "step": 1500 + }, + { + "epoch": 0.04437564499484004, + "grad_norm": 188.0, + "learning_rate": 8.873820754716981e-07, + "loss": 3.1779, + "step": 1505 + }, + { + "epoch": 0.04452307238684948, + "grad_norm": 45.5, + "learning_rate": 8.903301886792452e-07, + "loss": 3.2703, + "step": 1510 + }, + { + "epoch": 0.04467049977885891, + "grad_norm": 20.875, + "learning_rate": 8.932783018867924e-07, + "loss": 3.1839, + "step": 1515 + }, + { + "epoch": 0.04481792717086835, + "grad_norm": 19.375, + "learning_rate": 8.962264150943396e-07, + "loss": 3.0066, + "step": 1520 + }, + { + "epoch": 0.044965354562877785, + "grad_norm": 29.5, + "learning_rate": 8.991745283018868e-07, + "loss": 3.2096, + "step": 1525 + }, + { + "epoch": 0.045112781954887216, + "grad_norm": 26.875, + "learning_rate": 9.02122641509434e-07, + "loss": 3.0473, + "step": 1530 + }, + { + "epoch": 0.04526020934689665, + "grad_norm": 19.625, + "learning_rate": 9.050707547169811e-07, + "loss": 3.0378, + "step": 1535 + }, + { + "epoch": 0.04540763673890609, + "grad_norm": 17.5, + "learning_rate": 9.080188679245283e-07, + "loss": 3.2259, + "step": 1540 + }, + { + "epoch": 0.04555506413091552, + "grad_norm": 22.625, + "learning_rate": 9.109669811320755e-07, + "loss": 3.0831, + "step": 1545 + }, + { + "epoch": 0.04570249152292496, + "grad_norm": 19.125, + "learning_rate": 9.139150943396226e-07, + "loss": 3.1252, + "step": 1550 + }, + { + "epoch": 0.045849918914934396, + "grad_norm": 23.25, + "learning_rate": 9.168632075471697e-07, + "loss": 3.2609, + "step": 1555 + }, + { + "epoch": 0.04599734630694383, + "grad_norm": 27.625, + "learning_rate": 9.198113207547169e-07, + "loss": 3.1041, + "step": 1560 + }, + { + "epoch": 0.04614477369895326, + "grad_norm": 22.75, + "learning_rate": 9.22759433962264e-07, + "loss": 3.289, + "step": 1565 + }, + { + "epoch": 0.0462922010909627, + "grad_norm": 19.125, + "learning_rate": 9.257075471698113e-07, + "loss": 3.2432, + "step": 1570 + }, + { + "epoch": 0.04643962848297214, + "grad_norm": 16.5, + "learning_rate": 9.286556603773585e-07, + "loss": 3.0007, + "step": 1575 + }, + { + "epoch": 0.04658705587498157, + "grad_norm": 24.5, + "learning_rate": 9.316037735849056e-07, + "loss": 3.2563, + "step": 1580 + }, + { + "epoch": 0.046734483266991006, + "grad_norm": 19.75, + "learning_rate": 9.345518867924528e-07, + "loss": 3.0233, + "step": 1585 + }, + { + "epoch": 0.04688191065900044, + "grad_norm": 24.0, + "learning_rate": 9.374999999999999e-07, + "loss": 2.9052, + "step": 1590 + }, + { + "epoch": 0.04702933805100988, + "grad_norm": 21.375, + "learning_rate": 9.404481132075472e-07, + "loss": 3.157, + "step": 1595 + }, + { + "epoch": 0.04717676544301931, + "grad_norm": 37.0, + "learning_rate": 9.433962264150943e-07, + "loss": 2.9625, + "step": 1600 + }, + { + "epoch": 0.04732419283502875, + "grad_norm": 16.125, + "learning_rate": 9.463443396226415e-07, + "loss": 3.1684, + "step": 1605 + }, + { + "epoch": 0.047471620227038186, + "grad_norm": 17.0, + "learning_rate": 9.492924528301886e-07, + "loss": 3.1345, + "step": 1610 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 24.125, + "learning_rate": 9.522405660377358e-07, + "loss": 3.2707, + "step": 1615 + }, + { + "epoch": 0.047766475011057054, + "grad_norm": 19.375, + "learning_rate": 9.55188679245283e-07, + "loss": 3.1682, + "step": 1620 + }, + { + "epoch": 0.04791390240306649, + "grad_norm": 20.125, + "learning_rate": 9.581367924528302e-07, + "loss": 3.1835, + "step": 1625 + }, + { + "epoch": 0.04806132979507593, + "grad_norm": 19.75, + "learning_rate": 9.610849056603774e-07, + "loss": 3.0946, + "step": 1630 + }, + { + "epoch": 0.04820875718708536, + "grad_norm": 27.0, + "learning_rate": 9.640330188679245e-07, + "loss": 2.9732, + "step": 1635 + }, + { + "epoch": 0.048356184579094796, + "grad_norm": 28.0, + "learning_rate": 9.669811320754717e-07, + "loss": 3.0809, + "step": 1640 + }, + { + "epoch": 0.048503611971104234, + "grad_norm": 16.625, + "learning_rate": 9.699292452830188e-07, + "loss": 2.8662, + "step": 1645 + }, + { + "epoch": 0.048651039363113664, + "grad_norm": 25.0, + "learning_rate": 9.72877358490566e-07, + "loss": 3.1653, + "step": 1650 + }, + { + "epoch": 0.0487984667551231, + "grad_norm": 20.875, + "learning_rate": 9.758254716981131e-07, + "loss": 2.9102, + "step": 1655 + }, + { + "epoch": 0.04894589414713254, + "grad_norm": 15.625, + "learning_rate": 9.787735849056603e-07, + "loss": 3.0904, + "step": 1660 + }, + { + "epoch": 0.04909332153914197, + "grad_norm": 26.5, + "learning_rate": 9.817216981132074e-07, + "loss": 3.2188, + "step": 1665 + }, + { + "epoch": 0.04924074893115141, + "grad_norm": 20.0, + "learning_rate": 9.846698113207546e-07, + "loss": 3.0935, + "step": 1670 + }, + { + "epoch": 0.049388176323160844, + "grad_norm": 22.625, + "learning_rate": 9.876179245283017e-07, + "loss": 3.0112, + "step": 1675 + }, + { + "epoch": 0.04953560371517028, + "grad_norm": 28.875, + "learning_rate": 9.90566037735849e-07, + "loss": 3.0347, + "step": 1680 + }, + { + "epoch": 0.04968303110717971, + "grad_norm": 28.0, + "learning_rate": 9.935141509433963e-07, + "loss": 3.1383, + "step": 1685 + }, + { + "epoch": 0.04983045849918915, + "grad_norm": 22.625, + "learning_rate": 9.964622641509434e-07, + "loss": 3.195, + "step": 1690 + }, + { + "epoch": 0.04997788589119859, + "grad_norm": 17.625, + "learning_rate": 9.994103773584906e-07, + "loss": 3.0536, + "step": 1695 + }, + { + "epoch": 0.05012531328320802, + "grad_norm": 20.875, + "learning_rate": 1.0023584905660375e-06, + "loss": 3.1255, + "step": 1700 + }, + { + "epoch": 0.050272740675217455, + "grad_norm": 23.625, + "learning_rate": 1.0053066037735849e-06, + "loss": 3.0029, + "step": 1705 + }, + { + "epoch": 0.05042016806722689, + "grad_norm": 21.875, + "learning_rate": 1.008254716981132e-06, + "loss": 3.0566, + "step": 1710 + }, + { + "epoch": 0.05056759545923633, + "grad_norm": 22.875, + "learning_rate": 1.0112028301886792e-06, + "loss": 3.235, + "step": 1715 + }, + { + "epoch": 0.05071502285124576, + "grad_norm": 21.25, + "learning_rate": 1.0141509433962263e-06, + "loss": 3.0852, + "step": 1720 + }, + { + "epoch": 0.0508624502432552, + "grad_norm": 20.875, + "learning_rate": 1.0170990566037737e-06, + "loss": 2.9238, + "step": 1725 + }, + { + "epoch": 0.051009877635264635, + "grad_norm": 21.0, + "learning_rate": 1.0200471698113206e-06, + "loss": 2.9247, + "step": 1730 + }, + { + "epoch": 0.051157305027274065, + "grad_norm": 22.375, + "learning_rate": 1.022995283018868e-06, + "loss": 2.9932, + "step": 1735 + }, + { + "epoch": 0.0513047324192835, + "grad_norm": 21.625, + "learning_rate": 1.025943396226415e-06, + "loss": 3.1733, + "step": 1740 + }, + { + "epoch": 0.05145215981129294, + "grad_norm": 18.75, + "learning_rate": 1.0288915094339623e-06, + "loss": 3.0096, + "step": 1745 + }, + { + "epoch": 0.05159958720330237, + "grad_norm": 16.5, + "learning_rate": 1.0318396226415095e-06, + "loss": 3.0647, + "step": 1750 + }, + { + "epoch": 0.05174701459531181, + "grad_norm": 20.75, + "learning_rate": 1.0347877358490566e-06, + "loss": 3.0325, + "step": 1755 + }, + { + "epoch": 0.051894441987321245, + "grad_norm": 16.875, + "learning_rate": 1.0377358490566038e-06, + "loss": 3.0077, + "step": 1760 + }, + { + "epoch": 0.05204186937933068, + "grad_norm": 18.625, + "learning_rate": 1.040683962264151e-06, + "loss": 2.9983, + "step": 1765 + }, + { + "epoch": 0.05218929677134011, + "grad_norm": 21.75, + "learning_rate": 1.043632075471698e-06, + "loss": 3.103, + "step": 1770 + }, + { + "epoch": 0.05233672416334955, + "grad_norm": 19.375, + "learning_rate": 1.0465801886792452e-06, + "loss": 3.0497, + "step": 1775 + }, + { + "epoch": 0.05248415155535899, + "grad_norm": 27.125, + "learning_rate": 1.0495283018867924e-06, + "loss": 3.037, + "step": 1780 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 14.0625, + "learning_rate": 1.0524764150943395e-06, + "loss": 2.7835, + "step": 1785 + }, + { + "epoch": 0.052779006339377856, + "grad_norm": 20.75, + "learning_rate": 1.0554245283018867e-06, + "loss": 3.0535, + "step": 1790 + }, + { + "epoch": 0.05292643373138729, + "grad_norm": 19.375, + "learning_rate": 1.0583726415094338e-06, + "loss": 3.0061, + "step": 1795 + }, + { + "epoch": 0.05307386112339673, + "grad_norm": 18.375, + "learning_rate": 1.0613207547169812e-06, + "loss": 2.9059, + "step": 1800 + }, + { + "epoch": 0.05322128851540616, + "grad_norm": 16.875, + "learning_rate": 1.0642688679245282e-06, + "loss": 3.0449, + "step": 1805 + }, + { + "epoch": 0.0533687159074156, + "grad_norm": 19.875, + "learning_rate": 1.0672169811320755e-06, + "loss": 2.905, + "step": 1810 + }, + { + "epoch": 0.053516143299425036, + "grad_norm": 20.25, + "learning_rate": 1.0701650943396225e-06, + "loss": 3.2346, + "step": 1815 + }, + { + "epoch": 0.053663570691434466, + "grad_norm": 17.875, + "learning_rate": 1.0731132075471698e-06, + "loss": 2.9078, + "step": 1820 + }, + { + "epoch": 0.0538109980834439, + "grad_norm": 20.125, + "learning_rate": 1.076061320754717e-06, + "loss": 3.1477, + "step": 1825 + }, + { + "epoch": 0.05395842547545334, + "grad_norm": 24.0, + "learning_rate": 1.0790094339622641e-06, + "loss": 2.8281, + "step": 1830 + }, + { + "epoch": 0.05410585286746277, + "grad_norm": 20.625, + "learning_rate": 1.0819575471698113e-06, + "loss": 3.0376, + "step": 1835 + }, + { + "epoch": 0.05425328025947221, + "grad_norm": 15.6875, + "learning_rate": 1.0849056603773584e-06, + "loss": 2.8431, + "step": 1840 + }, + { + "epoch": 0.054400707651481646, + "grad_norm": 18.875, + "learning_rate": 1.0878537735849056e-06, + "loss": 2.9487, + "step": 1845 + }, + { + "epoch": 0.05454813504349108, + "grad_norm": 30.75, + "learning_rate": 1.090801886792453e-06, + "loss": 2.9563, + "step": 1850 + }, + { + "epoch": 0.054695562435500514, + "grad_norm": 23.125, + "learning_rate": 1.09375e-06, + "loss": 2.871, + "step": 1855 + }, + { + "epoch": 0.05484298982750995, + "grad_norm": 17.0, + "learning_rate": 1.096698113207547e-06, + "loss": 3.029, + "step": 1860 + }, + { + "epoch": 0.05499041721951939, + "grad_norm": 14.625, + "learning_rate": 1.0996462264150942e-06, + "loss": 2.898, + "step": 1865 + }, + { + "epoch": 0.05513784461152882, + "grad_norm": 21.25, + "learning_rate": 1.1025943396226414e-06, + "loss": 2.8212, + "step": 1870 + }, + { + "epoch": 0.055285272003538256, + "grad_norm": 17.75, + "learning_rate": 1.1055424528301887e-06, + "loss": 2.9897, + "step": 1875 + }, + { + "epoch": 0.055432699395547694, + "grad_norm": 17.125, + "learning_rate": 1.1084905660377357e-06, + "loss": 2.8868, + "step": 1880 + }, + { + "epoch": 0.05558012678755713, + "grad_norm": 19.25, + "learning_rate": 1.111438679245283e-06, + "loss": 3.0397, + "step": 1885 + }, + { + "epoch": 0.05572755417956656, + "grad_norm": 19.75, + "learning_rate": 1.11438679245283e-06, + "loss": 2.8232, + "step": 1890 + }, + { + "epoch": 0.055874981571576, + "grad_norm": 36.75, + "learning_rate": 1.1173349056603773e-06, + "loss": 2.9727, + "step": 1895 + }, + { + "epoch": 0.056022408963585436, + "grad_norm": 39.25, + "learning_rate": 1.1202830188679245e-06, + "loss": 3.3028, + "step": 1900 + }, + { + "epoch": 0.05616983635559487, + "grad_norm": 19.25, + "learning_rate": 1.1232311320754717e-06, + "loss": 3.1002, + "step": 1905 + }, + { + "epoch": 0.056317263747604304, + "grad_norm": 24.0, + "learning_rate": 1.1261792452830188e-06, + "loss": 3.163, + "step": 1910 + }, + { + "epoch": 0.05646469113961374, + "grad_norm": 23.375, + "learning_rate": 1.129127358490566e-06, + "loss": 2.9574, + "step": 1915 + }, + { + "epoch": 0.05661211853162318, + "grad_norm": 62.0, + "learning_rate": 1.1320754716981131e-06, + "loss": 2.9711, + "step": 1920 + }, + { + "epoch": 0.05675954592363261, + "grad_norm": 20.875, + "learning_rate": 1.1350235849056605e-06, + "loss": 2.9719, + "step": 1925 + }, + { + "epoch": 0.05690697331564205, + "grad_norm": 21.0, + "learning_rate": 1.1379716981132074e-06, + "loss": 2.9451, + "step": 1930 + }, + { + "epoch": 0.057054400707651484, + "grad_norm": 19.5, + "learning_rate": 1.1409198113207548e-06, + "loss": 3.0775, + "step": 1935 + }, + { + "epoch": 0.057201828099660915, + "grad_norm": 18.0, + "learning_rate": 1.1438679245283017e-06, + "loss": 2.9611, + "step": 1940 + }, + { + "epoch": 0.05734925549167035, + "grad_norm": 19.5, + "learning_rate": 1.1468160377358489e-06, + "loss": 2.8335, + "step": 1945 + }, + { + "epoch": 0.05749668288367979, + "grad_norm": 19.125, + "learning_rate": 1.1497641509433962e-06, + "loss": 2.9033, + "step": 1950 + }, + { + "epoch": 0.05764411027568922, + "grad_norm": 19.75, + "learning_rate": 1.1527122641509432e-06, + "loss": 2.957, + "step": 1955 + }, + { + "epoch": 0.05779153766769866, + "grad_norm": 21.375, + "learning_rate": 1.1556603773584906e-06, + "loss": 3.0282, + "step": 1960 + }, + { + "epoch": 0.057938965059708095, + "grad_norm": 26.375, + "learning_rate": 1.1586084905660375e-06, + "loss": 2.9575, + "step": 1965 + }, + { + "epoch": 0.05808639245171753, + "grad_norm": 16.375, + "learning_rate": 1.1615566037735849e-06, + "loss": 2.8847, + "step": 1970 + }, + { + "epoch": 0.05823381984372696, + "grad_norm": 21.625, + "learning_rate": 1.164504716981132e-06, + "loss": 2.8438, + "step": 1975 + }, + { + "epoch": 0.0583812472357364, + "grad_norm": 26.375, + "learning_rate": 1.1674528301886792e-06, + "loss": 3.0832, + "step": 1980 + }, + { + "epoch": 0.05852867462774584, + "grad_norm": 17.5, + "learning_rate": 1.1704009433962263e-06, + "loss": 3.0941, + "step": 1985 + }, + { + "epoch": 0.05867610201975527, + "grad_norm": 16.25, + "learning_rate": 1.1733490566037737e-06, + "loss": 2.716, + "step": 1990 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 71.0, + "learning_rate": 1.1762971698113206e-06, + "loss": 2.9973, + "step": 1995 + }, + { + "epoch": 0.05897095680377414, + "grad_norm": 24.625, + "learning_rate": 1.179245283018868e-06, + "loss": 2.9267, + "step": 2000 + }, + { + "epoch": 0.05897095680377414, + "eval_loss": 3.27067494392395, + "eval_runtime": 4.6982, + "eval_samples_per_second": 84.287, + "eval_steps_per_second": 2.767, + "step": 2000 + }, + { + "epoch": 0.05911838419578358, + "grad_norm": 16.5, + "learning_rate": 1.182193396226415e-06, + "loss": 2.7164, + "step": 2005 + }, + { + "epoch": 0.05926581158779301, + "grad_norm": 14.8125, + "learning_rate": 1.1851415094339623e-06, + "loss": 2.8899, + "step": 2010 + }, + { + "epoch": 0.05941323897980245, + "grad_norm": 18.75, + "learning_rate": 1.1880896226415095e-06, + "loss": 2.926, + "step": 2015 + }, + { + "epoch": 0.059560666371811885, + "grad_norm": 23.125, + "learning_rate": 1.1910377358490566e-06, + "loss": 2.853, + "step": 2020 + }, + { + "epoch": 0.059708093763821316, + "grad_norm": 17.75, + "learning_rate": 1.1939858490566038e-06, + "loss": 2.8584, + "step": 2025 + }, + { + "epoch": 0.05985552115583075, + "grad_norm": 20.75, + "learning_rate": 1.196933962264151e-06, + "loss": 2.8365, + "step": 2030 + }, + { + "epoch": 0.06000294854784019, + "grad_norm": 19.0, + "learning_rate": 1.199882075471698e-06, + "loss": 3.0571, + "step": 2035 + }, + { + "epoch": 0.06015037593984962, + "grad_norm": 15.6875, + "learning_rate": 1.2028301886792452e-06, + "loss": 2.7813, + "step": 2040 + }, + { + "epoch": 0.06029780333185906, + "grad_norm": 16.125, + "learning_rate": 1.2057783018867924e-06, + "loss": 2.744, + "step": 2045 + }, + { + "epoch": 0.060445230723868495, + "grad_norm": 18.125, + "learning_rate": 1.2087264150943395e-06, + "loss": 2.8005, + "step": 2050 + }, + { + "epoch": 0.06059265811587793, + "grad_norm": 22.875, + "learning_rate": 1.2116745283018867e-06, + "loss": 2.9454, + "step": 2055 + }, + { + "epoch": 0.06074008550788736, + "grad_norm": 16.875, + "learning_rate": 1.2146226415094338e-06, + "loss": 3.0735, + "step": 2060 + }, + { + "epoch": 0.0608875128998968, + "grad_norm": 40.75, + "learning_rate": 1.2175707547169812e-06, + "loss": 2.9832, + "step": 2065 + }, + { + "epoch": 0.06103494029190624, + "grad_norm": 22.375, + "learning_rate": 1.2205188679245281e-06, + "loss": 3.0261, + "step": 2070 + }, + { + "epoch": 0.06118236768391567, + "grad_norm": 19.375, + "learning_rate": 1.2234669811320755e-06, + "loss": 3.0268, + "step": 2075 + }, + { + "epoch": 0.061329795075925106, + "grad_norm": 22.5, + "learning_rate": 1.2264150943396225e-06, + "loss": 2.9181, + "step": 2080 + }, + { + "epoch": 0.06147722246793454, + "grad_norm": 43.75, + "learning_rate": 1.2293632075471698e-06, + "loss": 2.7782, + "step": 2085 + }, + { + "epoch": 0.06162464985994398, + "grad_norm": 15.125, + "learning_rate": 1.232311320754717e-06, + "loss": 2.8703, + "step": 2090 + }, + { + "epoch": 0.06177207725195341, + "grad_norm": 21.75, + "learning_rate": 1.2352594339622641e-06, + "loss": 2.972, + "step": 2095 + }, + { + "epoch": 0.06191950464396285, + "grad_norm": 16.75, + "learning_rate": 1.2382075471698113e-06, + "loss": 2.8651, + "step": 2100 + }, + { + "epoch": 0.062066932035972286, + "grad_norm": 23.375, + "learning_rate": 1.2411556603773584e-06, + "loss": 3.1965, + "step": 2105 + }, + { + "epoch": 0.062214359427981716, + "grad_norm": 21.125, + "learning_rate": 1.2441037735849056e-06, + "loss": 2.9804, + "step": 2110 + }, + { + "epoch": 0.062361786819991154, + "grad_norm": 23.125, + "learning_rate": 1.247051886792453e-06, + "loss": 2.9106, + "step": 2115 + }, + { + "epoch": 0.06250921421200059, + "grad_norm": 19.5, + "learning_rate": 1.2499999999999999e-06, + "loss": 2.9487, + "step": 2120 + }, + { + "epoch": 0.06265664160401002, + "grad_norm": 43.5, + "learning_rate": 1.252948113207547e-06, + "loss": 2.9301, + "step": 2125 + }, + { + "epoch": 0.06280406899601947, + "grad_norm": 17.75, + "learning_rate": 1.2558962264150942e-06, + "loss": 2.8569, + "step": 2130 + }, + { + "epoch": 0.0629514963880289, + "grad_norm": 14.625, + "learning_rate": 1.2588443396226414e-06, + "loss": 2.851, + "step": 2135 + }, + { + "epoch": 0.06309892378003833, + "grad_norm": 19.0, + "learning_rate": 1.2617924528301887e-06, + "loss": 2.959, + "step": 2140 + }, + { + "epoch": 0.06324635117204777, + "grad_norm": 16.0, + "learning_rate": 1.2647405660377357e-06, + "loss": 3.0111, + "step": 2145 + }, + { + "epoch": 0.0633937785640572, + "grad_norm": 21.25, + "learning_rate": 1.267688679245283e-06, + "loss": 2.9492, + "step": 2150 + }, + { + "epoch": 0.06354120595606663, + "grad_norm": 27.875, + "learning_rate": 1.27063679245283e-06, + "loss": 2.8556, + "step": 2155 + }, + { + "epoch": 0.06368863334807608, + "grad_norm": 292.0, + "learning_rate": 1.2735849056603773e-06, + "loss": 3.0161, + "step": 2160 + }, + { + "epoch": 0.0638360607400855, + "grad_norm": 27.625, + "learning_rate": 1.2765330188679245e-06, + "loss": 2.8943, + "step": 2165 + }, + { + "epoch": 0.06398348813209494, + "grad_norm": 20.5, + "learning_rate": 1.2794811320754716e-06, + "loss": 2.9569, + "step": 2170 + }, + { + "epoch": 0.06413091552410438, + "grad_norm": 16.375, + "learning_rate": 1.2824292452830188e-06, + "loss": 2.8933, + "step": 2175 + }, + { + "epoch": 0.06427834291611381, + "grad_norm": 21.75, + "learning_rate": 1.2853773584905662e-06, + "loss": 2.8857, + "step": 2180 + }, + { + "epoch": 0.06442577030812324, + "grad_norm": 25.375, + "learning_rate": 1.288325471698113e-06, + "loss": 2.7119, + "step": 2185 + }, + { + "epoch": 0.06457319770013269, + "grad_norm": 15.5, + "learning_rate": 1.2912735849056605e-06, + "loss": 2.9744, + "step": 2190 + }, + { + "epoch": 0.06472062509214212, + "grad_norm": 28.0, + "learning_rate": 1.2942216981132074e-06, + "loss": 2.8828, + "step": 2195 + }, + { + "epoch": 0.06486805248415156, + "grad_norm": 17.625, + "learning_rate": 1.2971698113207548e-06, + "loss": 2.9426, + "step": 2200 + }, + { + "epoch": 0.06501547987616099, + "grad_norm": 16.75, + "learning_rate": 1.300117924528302e-06, + "loss": 2.9991, + "step": 2205 + }, + { + "epoch": 0.06516290726817042, + "grad_norm": 32.5, + "learning_rate": 1.3030660377358489e-06, + "loss": 2.9827, + "step": 2210 + }, + { + "epoch": 0.06531033466017987, + "grad_norm": 33.0, + "learning_rate": 1.3060141509433962e-06, + "loss": 2.7431, + "step": 2215 + }, + { + "epoch": 0.0654577620521893, + "grad_norm": 18.125, + "learning_rate": 1.3089622641509432e-06, + "loss": 2.8863, + "step": 2220 + }, + { + "epoch": 0.06560518944419873, + "grad_norm": 16.625, + "learning_rate": 1.3119103773584905e-06, + "loss": 2.8273, + "step": 2225 + }, + { + "epoch": 0.06575261683620817, + "grad_norm": 17.5, + "learning_rate": 1.3148584905660377e-06, + "loss": 2.8624, + "step": 2230 + }, + { + "epoch": 0.0659000442282176, + "grad_norm": 21.0, + "learning_rate": 1.3178066037735848e-06, + "loss": 2.8465, + "step": 2235 + }, + { + "epoch": 0.06604747162022703, + "grad_norm": 25.375, + "learning_rate": 1.320754716981132e-06, + "loss": 2.8271, + "step": 2240 + }, + { + "epoch": 0.06619489901223648, + "grad_norm": 16.375, + "learning_rate": 1.3237028301886792e-06, + "loss": 2.8661, + "step": 2245 + }, + { + "epoch": 0.06634232640424591, + "grad_norm": 15.75, + "learning_rate": 1.3266509433962263e-06, + "loss": 2.8245, + "step": 2250 + }, + { + "epoch": 0.06648975379625534, + "grad_norm": 16.25, + "learning_rate": 1.3295990566037737e-06, + "loss": 2.8016, + "step": 2255 + }, + { + "epoch": 0.06663718118826478, + "grad_norm": 85.5, + "learning_rate": 1.3325471698113206e-06, + "loss": 3.0405, + "step": 2260 + }, + { + "epoch": 0.06678460858027421, + "grad_norm": 16.5, + "learning_rate": 1.335495283018868e-06, + "loss": 2.941, + "step": 2265 + }, + { + "epoch": 0.06693203597228366, + "grad_norm": 18.375, + "learning_rate": 1.338443396226415e-06, + "loss": 2.9167, + "step": 2270 + }, + { + "epoch": 0.06707946336429309, + "grad_norm": 28.25, + "learning_rate": 1.3413915094339623e-06, + "loss": 2.9053, + "step": 2275 + }, + { + "epoch": 0.06722689075630252, + "grad_norm": 20.125, + "learning_rate": 1.3443396226415094e-06, + "loss": 2.9119, + "step": 2280 + }, + { + "epoch": 0.06737431814831196, + "grad_norm": 26.75, + "learning_rate": 1.3472877358490566e-06, + "loss": 2.8339, + "step": 2285 + }, + { + "epoch": 0.06752174554032139, + "grad_norm": 16.75, + "learning_rate": 1.3502358490566037e-06, + "loss": 2.8336, + "step": 2290 + }, + { + "epoch": 0.06766917293233082, + "grad_norm": 14.875, + "learning_rate": 1.353183962264151e-06, + "loss": 2.7363, + "step": 2295 + }, + { + "epoch": 0.06781660032434027, + "grad_norm": 15.625, + "learning_rate": 1.356132075471698e-06, + "loss": 2.8677, + "step": 2300 + }, + { + "epoch": 0.0679640277163497, + "grad_norm": 18.125, + "learning_rate": 1.3590801886792452e-06, + "loss": 2.8287, + "step": 2305 + }, + { + "epoch": 0.06811145510835913, + "grad_norm": 17.25, + "learning_rate": 1.3620283018867924e-06, + "loss": 2.8503, + "step": 2310 + }, + { + "epoch": 0.06825888250036857, + "grad_norm": 18.5, + "learning_rate": 1.3649764150943395e-06, + "loss": 2.8452, + "step": 2315 + }, + { + "epoch": 0.068406309892378, + "grad_norm": 20.125, + "learning_rate": 1.3679245283018867e-06, + "loss": 2.8188, + "step": 2320 + }, + { + "epoch": 0.06855373728438743, + "grad_norm": 21.875, + "learning_rate": 1.3708726415094338e-06, + "loss": 2.7956, + "step": 2325 + }, + { + "epoch": 0.06870116467639688, + "grad_norm": 24.875, + "learning_rate": 1.3738207547169812e-06, + "loss": 2.8185, + "step": 2330 + }, + { + "epoch": 0.06884859206840631, + "grad_norm": 18.25, + "learning_rate": 1.3767688679245281e-06, + "loss": 2.7711, + "step": 2335 + }, + { + "epoch": 0.06899601946041574, + "grad_norm": 20.875, + "learning_rate": 1.3797169811320755e-06, + "loss": 2.9115, + "step": 2340 + }, + { + "epoch": 0.06914344685242518, + "grad_norm": 18.125, + "learning_rate": 1.3826650943396224e-06, + "loss": 2.8137, + "step": 2345 + }, + { + "epoch": 0.06929087424443461, + "grad_norm": 25.5, + "learning_rate": 1.3856132075471698e-06, + "loss": 2.9505, + "step": 2350 + }, + { + "epoch": 0.06943830163644406, + "grad_norm": 24.375, + "learning_rate": 1.388561320754717e-06, + "loss": 2.8544, + "step": 2355 + }, + { + "epoch": 0.06958572902845349, + "grad_norm": 24.875, + "learning_rate": 1.3915094339622641e-06, + "loss": 2.9782, + "step": 2360 + }, + { + "epoch": 0.06973315642046292, + "grad_norm": 19.125, + "learning_rate": 1.3944575471698113e-06, + "loss": 3.0521, + "step": 2365 + }, + { + "epoch": 0.06988058381247236, + "grad_norm": 16.625, + "learning_rate": 1.3974056603773584e-06, + "loss": 2.7751, + "step": 2370 + }, + { + "epoch": 0.0700280112044818, + "grad_norm": 21.375, + "learning_rate": 1.4003537735849056e-06, + "loss": 2.7304, + "step": 2375 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 32.25, + "learning_rate": 1.403301886792453e-06, + "loss": 2.8306, + "step": 2380 + }, + { + "epoch": 0.07032286598850067, + "grad_norm": 20.375, + "learning_rate": 1.4062499999999999e-06, + "loss": 2.9704, + "step": 2385 + }, + { + "epoch": 0.0704702933805101, + "grad_norm": 27.875, + "learning_rate": 1.409198113207547e-06, + "loss": 2.8213, + "step": 2390 + }, + { + "epoch": 0.07061772077251953, + "grad_norm": 27.875, + "learning_rate": 1.4121462264150942e-06, + "loss": 2.8982, + "step": 2395 + }, + { + "epoch": 0.07076514816452897, + "grad_norm": 29.375, + "learning_rate": 1.4150943396226413e-06, + "loss": 2.8757, + "step": 2400 + }, + { + "epoch": 0.0709125755565384, + "grad_norm": 18.125, + "learning_rate": 1.4180424528301887e-06, + "loss": 2.7914, + "step": 2405 + }, + { + "epoch": 0.07106000294854783, + "grad_norm": 63.0, + "learning_rate": 1.4209905660377356e-06, + "loss": 2.7504, + "step": 2410 + }, + { + "epoch": 0.07120743034055728, + "grad_norm": 16.0, + "learning_rate": 1.423938679245283e-06, + "loss": 2.8089, + "step": 2415 + }, + { + "epoch": 0.07135485773256671, + "grad_norm": 19.125, + "learning_rate": 1.42688679245283e-06, + "loss": 2.7972, + "step": 2420 + }, + { + "epoch": 0.07150228512457614, + "grad_norm": 15.0, + "learning_rate": 1.4298349056603773e-06, + "loss": 2.7051, + "step": 2425 + }, + { + "epoch": 0.07164971251658558, + "grad_norm": 15.1875, + "learning_rate": 1.4327830188679245e-06, + "loss": 2.8048, + "step": 2430 + }, + { + "epoch": 0.07179713990859501, + "grad_norm": 18.375, + "learning_rate": 1.4357311320754716e-06, + "loss": 2.7196, + "step": 2435 + }, + { + "epoch": 0.07194456730060446, + "grad_norm": 16.375, + "learning_rate": 1.4386792452830188e-06, + "loss": 2.695, + "step": 2440 + }, + { + "epoch": 0.07209199469261389, + "grad_norm": 18.25, + "learning_rate": 1.4416273584905661e-06, + "loss": 2.9735, + "step": 2445 + }, + { + "epoch": 0.07223942208462332, + "grad_norm": 18.25, + "learning_rate": 1.444575471698113e-06, + "loss": 2.8195, + "step": 2450 + }, + { + "epoch": 0.07238684947663276, + "grad_norm": 17.875, + "learning_rate": 1.4475235849056605e-06, + "loss": 2.7522, + "step": 2455 + }, + { + "epoch": 0.0725342768686422, + "grad_norm": 17.125, + "learning_rate": 1.4504716981132074e-06, + "loss": 2.8065, + "step": 2460 + }, + { + "epoch": 0.07268170426065163, + "grad_norm": 15.625, + "learning_rate": 1.4534198113207548e-06, + "loss": 2.8917, + "step": 2465 + }, + { + "epoch": 0.07282913165266107, + "grad_norm": 19.125, + "learning_rate": 1.456367924528302e-06, + "loss": 2.8464, + "step": 2470 + }, + { + "epoch": 0.0729765590446705, + "grad_norm": 16.375, + "learning_rate": 1.4593160377358489e-06, + "loss": 2.9117, + "step": 2475 + }, + { + "epoch": 0.07312398643667993, + "grad_norm": 16.375, + "learning_rate": 1.4622641509433962e-06, + "loss": 2.6589, + "step": 2480 + }, + { + "epoch": 0.07327141382868937, + "grad_norm": 21.75, + "learning_rate": 1.4652122641509432e-06, + "loss": 2.915, + "step": 2485 + }, + { + "epoch": 0.0734188412206988, + "grad_norm": 16.875, + "learning_rate": 1.4681603773584905e-06, + "loss": 2.7783, + "step": 2490 + }, + { + "epoch": 0.07356626861270824, + "grad_norm": 16.875, + "learning_rate": 1.4711084905660377e-06, + "loss": 2.7156, + "step": 2495 + }, + { + "epoch": 0.07371369600471768, + "grad_norm": 17.0, + "learning_rate": 1.4740566037735848e-06, + "loss": 2.6785, + "step": 2500 + }, + { + "epoch": 0.07371369600471768, + "eval_loss": 3.0259971618652344, + "eval_runtime": 4.7168, + "eval_samples_per_second": 83.956, + "eval_steps_per_second": 2.756, + "step": 2500 + }, + { + "epoch": 0.07386112339672711, + "grad_norm": 11.6875, + "learning_rate": 1.477004716981132e-06, + "loss": 2.6697, + "step": 2505 + }, + { + "epoch": 0.07400855078873654, + "grad_norm": 19.125, + "learning_rate": 1.4799528301886791e-06, + "loss": 2.9352, + "step": 2510 + }, + { + "epoch": 0.07415597818074599, + "grad_norm": 24.5, + "learning_rate": 1.4829009433962263e-06, + "loss": 2.8369, + "step": 2515 + }, + { + "epoch": 0.07430340557275542, + "grad_norm": 33.25, + "learning_rate": 1.4858490566037737e-06, + "loss": 2.7892, + "step": 2520 + }, + { + "epoch": 0.07445083296476486, + "grad_norm": 23.0, + "learning_rate": 1.4887971698113206e-06, + "loss": 2.6221, + "step": 2525 + }, + { + "epoch": 0.07459826035677429, + "grad_norm": 17.5, + "learning_rate": 1.491745283018868e-06, + "loss": 2.7536, + "step": 2530 + }, + { + "epoch": 0.07474568774878372, + "grad_norm": 20.0, + "learning_rate": 1.494693396226415e-06, + "loss": 2.8727, + "step": 2535 + }, + { + "epoch": 0.07489311514079317, + "grad_norm": 13.6875, + "learning_rate": 1.4976415094339623e-06, + "loss": 2.6381, + "step": 2540 + }, + { + "epoch": 0.0750405425328026, + "grad_norm": 24.625, + "learning_rate": 1.5005896226415094e-06, + "loss": 2.8, + "step": 2545 + }, + { + "epoch": 0.07518796992481203, + "grad_norm": 21.875, + "learning_rate": 1.5035377358490566e-06, + "loss": 2.6405, + "step": 2550 + }, + { + "epoch": 0.07533539731682147, + "grad_norm": 13.75, + "learning_rate": 1.5064858490566037e-06, + "loss": 2.7266, + "step": 2555 + }, + { + "epoch": 0.0754828247088309, + "grad_norm": 15.25, + "learning_rate": 1.5094339622641509e-06, + "loss": 2.7492, + "step": 2560 + }, + { + "epoch": 0.07563025210084033, + "grad_norm": 19.5, + "learning_rate": 1.512382075471698e-06, + "loss": 2.7432, + "step": 2565 + }, + { + "epoch": 0.07577767949284978, + "grad_norm": 21.0, + "learning_rate": 1.5153301886792452e-06, + "loss": 2.8012, + "step": 2570 + }, + { + "epoch": 0.0759251068848592, + "grad_norm": 18.5, + "learning_rate": 1.5182783018867923e-06, + "loss": 2.8801, + "step": 2575 + }, + { + "epoch": 0.07607253427686864, + "grad_norm": 15.9375, + "learning_rate": 1.5212264150943395e-06, + "loss": 2.6939, + "step": 2580 + }, + { + "epoch": 0.07621996166887808, + "grad_norm": 16.375, + "learning_rate": 1.5241745283018867e-06, + "loss": 2.7358, + "step": 2585 + }, + { + "epoch": 0.07636738906088751, + "grad_norm": 15.375, + "learning_rate": 1.5271226415094338e-06, + "loss": 2.7957, + "step": 2590 + }, + { + "epoch": 0.07651481645289694, + "grad_norm": 18.375, + "learning_rate": 1.5300707547169812e-06, + "loss": 2.8, + "step": 2595 + }, + { + "epoch": 0.07666224384490639, + "grad_norm": 20.625, + "learning_rate": 1.5330188679245281e-06, + "loss": 2.7041, + "step": 2600 + }, + { + "epoch": 0.07680967123691582, + "grad_norm": 17.75, + "learning_rate": 1.5359669811320755e-06, + "loss": 2.8524, + "step": 2605 + }, + { + "epoch": 0.07695709862892526, + "grad_norm": 20.625, + "learning_rate": 1.5389150943396224e-06, + "loss": 2.7402, + "step": 2610 + }, + { + "epoch": 0.07710452602093469, + "grad_norm": 21.25, + "learning_rate": 1.5418632075471698e-06, + "loss": 2.8244, + "step": 2615 + }, + { + "epoch": 0.07725195341294412, + "grad_norm": 24.125, + "learning_rate": 1.544811320754717e-06, + "loss": 2.7226, + "step": 2620 + }, + { + "epoch": 0.07739938080495357, + "grad_norm": 19.625, + "learning_rate": 1.547759433962264e-06, + "loss": 2.637, + "step": 2625 + }, + { + "epoch": 0.077546808196963, + "grad_norm": 22.375, + "learning_rate": 1.5507075471698112e-06, + "loss": 2.6678, + "step": 2630 + }, + { + "epoch": 0.07769423558897243, + "grad_norm": 19.875, + "learning_rate": 1.5536556603773586e-06, + "loss": 2.5859, + "step": 2635 + }, + { + "epoch": 0.07784166298098187, + "grad_norm": 19.75, + "learning_rate": 1.5566037735849056e-06, + "loss": 2.5884, + "step": 2640 + }, + { + "epoch": 0.0779890903729913, + "grad_norm": 15.25, + "learning_rate": 1.559551886792453e-06, + "loss": 2.8667, + "step": 2645 + }, + { + "epoch": 0.07813651776500073, + "grad_norm": 17.0, + "learning_rate": 1.5624999999999999e-06, + "loss": 2.6761, + "step": 2650 + }, + { + "epoch": 0.07828394515701018, + "grad_norm": 21.75, + "learning_rate": 1.565448113207547e-06, + "loss": 2.6761, + "step": 2655 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 32.25, + "learning_rate": 1.5683962264150944e-06, + "loss": 2.7592, + "step": 2660 + }, + { + "epoch": 0.07857879994102904, + "grad_norm": 43.0, + "learning_rate": 1.5713443396226413e-06, + "loss": 2.7657, + "step": 2665 + }, + { + "epoch": 0.07872622733303848, + "grad_norm": 22.875, + "learning_rate": 1.5742924528301887e-06, + "loss": 2.8494, + "step": 2670 + }, + { + "epoch": 0.07887365472504791, + "grad_norm": 18.25, + "learning_rate": 1.5772405660377356e-06, + "loss": 2.7864, + "step": 2675 + }, + { + "epoch": 0.07902108211705734, + "grad_norm": 24.5, + "learning_rate": 1.580188679245283e-06, + "loss": 2.598, + "step": 2680 + }, + { + "epoch": 0.07916850950906679, + "grad_norm": 27.25, + "learning_rate": 1.5831367924528301e-06, + "loss": 2.751, + "step": 2685 + }, + { + "epoch": 0.07931593690107622, + "grad_norm": 17.625, + "learning_rate": 1.5860849056603773e-06, + "loss": 2.7719, + "step": 2690 + }, + { + "epoch": 0.07946336429308566, + "grad_norm": 20.75, + "learning_rate": 1.5890330188679245e-06, + "loss": 2.6851, + "step": 2695 + }, + { + "epoch": 0.07961079168509509, + "grad_norm": 16.25, + "learning_rate": 1.5919811320754716e-06, + "loss": 2.7268, + "step": 2700 + }, + { + "epoch": 0.07975821907710452, + "grad_norm": 20.75, + "learning_rate": 1.5949292452830188e-06, + "loss": 2.7603, + "step": 2705 + }, + { + "epoch": 0.07990564646911397, + "grad_norm": 17.875, + "learning_rate": 1.5978773584905661e-06, + "loss": 2.8362, + "step": 2710 + }, + { + "epoch": 0.0800530738611234, + "grad_norm": 20.375, + "learning_rate": 1.600825471698113e-06, + "loss": 2.7863, + "step": 2715 + }, + { + "epoch": 0.08020050125313283, + "grad_norm": 16.5, + "learning_rate": 1.6037735849056604e-06, + "loss": 2.6775, + "step": 2720 + }, + { + "epoch": 0.08034792864514227, + "grad_norm": 20.625, + "learning_rate": 1.6067216981132074e-06, + "loss": 2.7235, + "step": 2725 + }, + { + "epoch": 0.0804953560371517, + "grad_norm": 15.5625, + "learning_rate": 1.6096698113207547e-06, + "loss": 2.655, + "step": 2730 + }, + { + "epoch": 0.08064278342916113, + "grad_norm": 23.625, + "learning_rate": 1.612617924528302e-06, + "loss": 2.8043, + "step": 2735 + }, + { + "epoch": 0.08079021082117058, + "grad_norm": 23.375, + "learning_rate": 1.6155660377358488e-06, + "loss": 2.6753, + "step": 2740 + }, + { + "epoch": 0.08093763821318001, + "grad_norm": 16.75, + "learning_rate": 1.6185141509433962e-06, + "loss": 2.7216, + "step": 2745 + }, + { + "epoch": 0.08108506560518944, + "grad_norm": 29.75, + "learning_rate": 1.6214622641509431e-06, + "loss": 2.893, + "step": 2750 + }, + { + "epoch": 0.08123249299719888, + "grad_norm": 18.625, + "learning_rate": 1.6244103773584905e-06, + "loss": 2.7302, + "step": 2755 + }, + { + "epoch": 0.08137992038920831, + "grad_norm": 17.0, + "learning_rate": 1.6273584905660377e-06, + "loss": 2.7501, + "step": 2760 + }, + { + "epoch": 0.08152734778121776, + "grad_norm": 19.5, + "learning_rate": 1.6303066037735848e-06, + "loss": 2.7171, + "step": 2765 + }, + { + "epoch": 0.08167477517322719, + "grad_norm": 14.9375, + "learning_rate": 1.633254716981132e-06, + "loss": 2.7143, + "step": 2770 + }, + { + "epoch": 0.08182220256523662, + "grad_norm": 21.625, + "learning_rate": 1.6362028301886791e-06, + "loss": 2.632, + "step": 2775 + }, + { + "epoch": 0.08196962995724606, + "grad_norm": 17.25, + "learning_rate": 1.6391509433962263e-06, + "loss": 2.7041, + "step": 2780 + }, + { + "epoch": 0.08211705734925549, + "grad_norm": 27.25, + "learning_rate": 1.6420990566037736e-06, + "loss": 2.8002, + "step": 2785 + }, + { + "epoch": 0.08226448474126492, + "grad_norm": 31.25, + "learning_rate": 1.6450471698113206e-06, + "loss": 2.5959, + "step": 2790 + }, + { + "epoch": 0.08241191213327437, + "grad_norm": 19.875, + "learning_rate": 1.647995283018868e-06, + "loss": 2.7606, + "step": 2795 + }, + { + "epoch": 0.0825593395252838, + "grad_norm": 18.5, + "learning_rate": 1.6509433962264149e-06, + "loss": 3.0455, + "step": 2800 + }, + { + "epoch": 0.08270676691729323, + "grad_norm": 22.875, + "learning_rate": 1.6538915094339623e-06, + "loss": 2.7094, + "step": 2805 + }, + { + "epoch": 0.08285419430930267, + "grad_norm": 17.375, + "learning_rate": 1.6568396226415094e-06, + "loss": 2.4643, + "step": 2810 + }, + { + "epoch": 0.0830016217013121, + "grad_norm": 22.0, + "learning_rate": 1.6597877358490566e-06, + "loss": 2.6835, + "step": 2815 + }, + { + "epoch": 0.08314904909332153, + "grad_norm": 18.125, + "learning_rate": 1.6627358490566037e-06, + "loss": 2.6437, + "step": 2820 + }, + { + "epoch": 0.08329647648533098, + "grad_norm": 22.625, + "learning_rate": 1.6656839622641509e-06, + "loss": 2.6712, + "step": 2825 + }, + { + "epoch": 0.08344390387734041, + "grad_norm": 17.0, + "learning_rate": 1.668632075471698e-06, + "loss": 2.7535, + "step": 2830 + }, + { + "epoch": 0.08359133126934984, + "grad_norm": 24.375, + "learning_rate": 1.6715801886792452e-06, + "loss": 2.6405, + "step": 2835 + }, + { + "epoch": 0.08373875866135928, + "grad_norm": 16.875, + "learning_rate": 1.6745283018867923e-06, + "loss": 2.5578, + "step": 2840 + }, + { + "epoch": 0.08388618605336871, + "grad_norm": 17.375, + "learning_rate": 1.6774764150943395e-06, + "loss": 2.662, + "step": 2845 + }, + { + "epoch": 0.08403361344537816, + "grad_norm": 18.0, + "learning_rate": 1.6804245283018866e-06, + "loss": 2.5877, + "step": 2850 + }, + { + "epoch": 0.08418104083738759, + "grad_norm": 22.75, + "learning_rate": 1.6833726415094338e-06, + "loss": 2.7346, + "step": 2855 + }, + { + "epoch": 0.08432846822939702, + "grad_norm": 25.0, + "learning_rate": 1.6863207547169812e-06, + "loss": 2.7402, + "step": 2860 + }, + { + "epoch": 0.08447589562140646, + "grad_norm": 19.0, + "learning_rate": 1.689268867924528e-06, + "loss": 2.6026, + "step": 2865 + }, + { + "epoch": 0.0846233230134159, + "grad_norm": 23.625, + "learning_rate": 1.6922169811320755e-06, + "loss": 2.6259, + "step": 2870 + }, + { + "epoch": 0.08477075040542532, + "grad_norm": 16.625, + "learning_rate": 1.6951650943396224e-06, + "loss": 2.67, + "step": 2875 + }, + { + "epoch": 0.08491817779743477, + "grad_norm": 22.875, + "learning_rate": 1.6981132075471698e-06, + "loss": 2.4129, + "step": 2880 + }, + { + "epoch": 0.0850656051894442, + "grad_norm": 129.0, + "learning_rate": 1.701061320754717e-06, + "loss": 2.8064, + "step": 2885 + }, + { + "epoch": 0.08521303258145363, + "grad_norm": 13.625, + "learning_rate": 1.704009433962264e-06, + "loss": 2.6248, + "step": 2890 + }, + { + "epoch": 0.08536045997346307, + "grad_norm": 16.125, + "learning_rate": 1.7069575471698112e-06, + "loss": 2.6568, + "step": 2895 + }, + { + "epoch": 0.0855078873654725, + "grad_norm": 15.75, + "learning_rate": 1.7099056603773586e-06, + "loss": 2.7321, + "step": 2900 + }, + { + "epoch": 0.08565531475748193, + "grad_norm": 34.5, + "learning_rate": 1.7128537735849055e-06, + "loss": 2.6696, + "step": 2905 + }, + { + "epoch": 0.08580274214949138, + "grad_norm": 18.375, + "learning_rate": 1.715801886792453e-06, + "loss": 2.7116, + "step": 2910 + }, + { + "epoch": 0.08595016954150081, + "grad_norm": 16.125, + "learning_rate": 1.7187499999999998e-06, + "loss": 2.66, + "step": 2915 + }, + { + "epoch": 0.08609759693351024, + "grad_norm": 18.25, + "learning_rate": 1.721698113207547e-06, + "loss": 2.7062, + "step": 2920 + }, + { + "epoch": 0.08624502432551968, + "grad_norm": 19.75, + "learning_rate": 1.7246462264150944e-06, + "loss": 2.667, + "step": 2925 + }, + { + "epoch": 0.08639245171752911, + "grad_norm": 13.125, + "learning_rate": 1.7275943396226413e-06, + "loss": 2.6131, + "step": 2930 + }, + { + "epoch": 0.08653987910953856, + "grad_norm": 20.25, + "learning_rate": 1.7305424528301887e-06, + "loss": 2.6455, + "step": 2935 + }, + { + "epoch": 0.08668730650154799, + "grad_norm": 18.625, + "learning_rate": 1.7334905660377356e-06, + "loss": 2.6537, + "step": 2940 + }, + { + "epoch": 0.08683473389355742, + "grad_norm": 16.75, + "learning_rate": 1.736438679245283e-06, + "loss": 2.6189, + "step": 2945 + }, + { + "epoch": 0.08698216128556686, + "grad_norm": 19.5, + "learning_rate": 1.7393867924528301e-06, + "loss": 2.6757, + "step": 2950 + }, + { + "epoch": 0.0871295886775763, + "grad_norm": 20.5, + "learning_rate": 1.7423349056603773e-06, + "loss": 2.613, + "step": 2955 + }, + { + "epoch": 0.08727701606958572, + "grad_norm": 23.125, + "learning_rate": 1.7452830188679244e-06, + "loss": 2.6377, + "step": 2960 + }, + { + "epoch": 0.08742444346159517, + "grad_norm": 23.25, + "learning_rate": 1.7482311320754716e-06, + "loss": 2.7463, + "step": 2965 + }, + { + "epoch": 0.0875718708536046, + "grad_norm": 69.5, + "learning_rate": 1.7511792452830188e-06, + "loss": 2.7704, + "step": 2970 + }, + { + "epoch": 0.08771929824561403, + "grad_norm": 15.8125, + "learning_rate": 1.7541273584905661e-06, + "loss": 2.593, + "step": 2975 + }, + { + "epoch": 0.08786672563762347, + "grad_norm": 53.25, + "learning_rate": 1.757075471698113e-06, + "loss": 2.5997, + "step": 2980 + }, + { + "epoch": 0.0880141530296329, + "grad_norm": 33.75, + "learning_rate": 1.7600235849056604e-06, + "loss": 2.7404, + "step": 2985 + }, + { + "epoch": 0.08816158042164234, + "grad_norm": 19.5, + "learning_rate": 1.7629716981132074e-06, + "loss": 2.5596, + "step": 2990 + }, + { + "epoch": 0.08830900781365178, + "grad_norm": 24.625, + "learning_rate": 1.7659198113207547e-06, + "loss": 2.6808, + "step": 2995 + }, + { + "epoch": 0.08845643520566121, + "grad_norm": 20.75, + "learning_rate": 1.7688679245283019e-06, + "loss": 2.534, + "step": 3000 + }, + { + "epoch": 0.08845643520566121, + "eval_loss": 2.8255984783172607, + "eval_runtime": 4.7091, + "eval_samples_per_second": 84.093, + "eval_steps_per_second": 2.761, + "step": 3000 + }, + { + "epoch": 0.08860386259767064, + "grad_norm": 15.75, + "learning_rate": 1.7718160377358488e-06, + "loss": 2.6962, + "step": 3005 + }, + { + "epoch": 0.08875128998968008, + "grad_norm": 19.125, + "learning_rate": 1.7747641509433962e-06, + "loss": 2.5769, + "step": 3010 + }, + { + "epoch": 0.08889871738168952, + "grad_norm": 23.0, + "learning_rate": 1.7777122641509431e-06, + "loss": 2.5602, + "step": 3015 + }, + { + "epoch": 0.08904614477369896, + "grad_norm": 16.0, + "learning_rate": 1.7806603773584905e-06, + "loss": 2.5782, + "step": 3020 + }, + { + "epoch": 0.08919357216570839, + "grad_norm": 19.375, + "learning_rate": 1.7836084905660377e-06, + "loss": 2.6285, + "step": 3025 + }, + { + "epoch": 0.08934099955771782, + "grad_norm": 21.25, + "learning_rate": 1.7865566037735848e-06, + "loss": 2.6416, + "step": 3030 + }, + { + "epoch": 0.08948842694972726, + "grad_norm": 19.5, + "learning_rate": 1.789504716981132e-06, + "loss": 2.5563, + "step": 3035 + }, + { + "epoch": 0.0896358543417367, + "grad_norm": 19.5, + "learning_rate": 1.7924528301886791e-06, + "loss": 2.6188, + "step": 3040 + }, + { + "epoch": 0.08978328173374613, + "grad_norm": 15.75, + "learning_rate": 1.7954009433962263e-06, + "loss": 2.6018, + "step": 3045 + }, + { + "epoch": 0.08993070912575557, + "grad_norm": 19.625, + "learning_rate": 1.7983490566037736e-06, + "loss": 2.6601, + "step": 3050 + }, + { + "epoch": 0.090078136517765, + "grad_norm": 22.375, + "learning_rate": 1.8012971698113206e-06, + "loss": 2.6206, + "step": 3055 + }, + { + "epoch": 0.09022556390977443, + "grad_norm": 25.625, + "learning_rate": 1.804245283018868e-06, + "loss": 2.7032, + "step": 3060 + }, + { + "epoch": 0.09037299130178388, + "grad_norm": 16.75, + "learning_rate": 1.8071933962264149e-06, + "loss": 2.6075, + "step": 3065 + }, + { + "epoch": 0.0905204186937933, + "grad_norm": 18.875, + "learning_rate": 1.8101415094339622e-06, + "loss": 2.7258, + "step": 3070 + }, + { + "epoch": 0.09066784608580274, + "grad_norm": 23.375, + "learning_rate": 1.8130896226415094e-06, + "loss": 2.7048, + "step": 3075 + }, + { + "epoch": 0.09081527347781218, + "grad_norm": 19.75, + "learning_rate": 1.8160377358490566e-06, + "loss": 2.4501, + "step": 3080 + }, + { + "epoch": 0.09096270086982161, + "grad_norm": 19.125, + "learning_rate": 1.8189858490566037e-06, + "loss": 2.5144, + "step": 3085 + }, + { + "epoch": 0.09111012826183104, + "grad_norm": 17.375, + "learning_rate": 1.821933962264151e-06, + "loss": 2.7287, + "step": 3090 + }, + { + "epoch": 0.09125755565384049, + "grad_norm": 15.125, + "learning_rate": 1.824882075471698e-06, + "loss": 2.6906, + "step": 3095 + }, + { + "epoch": 0.09140498304584992, + "grad_norm": 16.875, + "learning_rate": 1.8278301886792452e-06, + "loss": 2.6328, + "step": 3100 + }, + { + "epoch": 0.09155241043785936, + "grad_norm": 28.375, + "learning_rate": 1.8307783018867923e-06, + "loss": 2.7216, + "step": 3105 + }, + { + "epoch": 0.09169983782986879, + "grad_norm": 20.375, + "learning_rate": 1.8337264150943395e-06, + "loss": 2.6709, + "step": 3110 + }, + { + "epoch": 0.09184726522187822, + "grad_norm": 17.625, + "learning_rate": 1.8366745283018868e-06, + "loss": 2.6247, + "step": 3115 + }, + { + "epoch": 0.09199469261388767, + "grad_norm": 14.625, + "learning_rate": 1.8396226415094338e-06, + "loss": 2.704, + "step": 3120 + }, + { + "epoch": 0.0921421200058971, + "grad_norm": 20.5, + "learning_rate": 1.8425707547169811e-06, + "loss": 2.6575, + "step": 3125 + }, + { + "epoch": 0.09228954739790653, + "grad_norm": 16.75, + "learning_rate": 1.845518867924528e-06, + "loss": 2.6053, + "step": 3130 + }, + { + "epoch": 0.09243697478991597, + "grad_norm": 18.875, + "learning_rate": 1.8484669811320755e-06, + "loss": 2.6402, + "step": 3135 + }, + { + "epoch": 0.0925844021819254, + "grad_norm": 24.375, + "learning_rate": 1.8514150943396226e-06, + "loss": 2.6393, + "step": 3140 + }, + { + "epoch": 0.09273182957393483, + "grad_norm": 23.125, + "learning_rate": 1.8543632075471698e-06, + "loss": 2.7078, + "step": 3145 + }, + { + "epoch": 0.09287925696594428, + "grad_norm": 18.5, + "learning_rate": 1.857311320754717e-06, + "loss": 2.6088, + "step": 3150 + }, + { + "epoch": 0.0930266843579537, + "grad_norm": 17.0, + "learning_rate": 1.860259433962264e-06, + "loss": 2.6411, + "step": 3155 + }, + { + "epoch": 0.09317411174996314, + "grad_norm": 21.25, + "learning_rate": 1.8632075471698112e-06, + "loss": 2.5609, + "step": 3160 + }, + { + "epoch": 0.09332153914197258, + "grad_norm": 19.0, + "learning_rate": 1.8661556603773586e-06, + "loss": 2.7937, + "step": 3165 + }, + { + "epoch": 0.09346896653398201, + "grad_norm": 17.75, + "learning_rate": 1.8691037735849055e-06, + "loss": 2.5774, + "step": 3170 + }, + { + "epoch": 0.09361639392599144, + "grad_norm": 29.125, + "learning_rate": 1.8720518867924529e-06, + "loss": 2.6118, + "step": 3175 + }, + { + "epoch": 0.09376382131800089, + "grad_norm": 34.0, + "learning_rate": 1.8749999999999998e-06, + "loss": 2.7062, + "step": 3180 + }, + { + "epoch": 0.09391124871001032, + "grad_norm": 21.375, + "learning_rate": 1.877948113207547e-06, + "loss": 2.4245, + "step": 3185 + }, + { + "epoch": 0.09405867610201976, + "grad_norm": 21.0, + "learning_rate": 1.8808962264150944e-06, + "loss": 2.575, + "step": 3190 + }, + { + "epoch": 0.09420610349402919, + "grad_norm": 19.75, + "learning_rate": 1.8838443396226413e-06, + "loss": 2.6233, + "step": 3195 + }, + { + "epoch": 0.09435353088603862, + "grad_norm": 17.0, + "learning_rate": 1.8867924528301887e-06, + "loss": 2.63, + "step": 3200 + }, + { + "epoch": 0.09450095827804807, + "grad_norm": 15.5, + "learning_rate": 1.8897405660377356e-06, + "loss": 2.4527, + "step": 3205 + }, + { + "epoch": 0.0946483856700575, + "grad_norm": 28.125, + "learning_rate": 1.892688679245283e-06, + "loss": 2.6324, + "step": 3210 + }, + { + "epoch": 0.09479581306206693, + "grad_norm": 13.5625, + "learning_rate": 1.8956367924528301e-06, + "loss": 2.5413, + "step": 3215 + }, + { + "epoch": 0.09494324045407637, + "grad_norm": 20.625, + "learning_rate": 1.8985849056603773e-06, + "loss": 2.7314, + "step": 3220 + }, + { + "epoch": 0.0950906678460858, + "grad_norm": 18.375, + "learning_rate": 1.9015330188679244e-06, + "loss": 2.5598, + "step": 3225 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 25.75, + "learning_rate": 1.9044811320754716e-06, + "loss": 2.5823, + "step": 3230 + }, + { + "epoch": 0.09538552263010468, + "grad_norm": 14.375, + "learning_rate": 1.907429245283019e-06, + "loss": 2.6562, + "step": 3235 + }, + { + "epoch": 0.09553295002211411, + "grad_norm": 24.75, + "learning_rate": 1.910377358490566e-06, + "loss": 2.6288, + "step": 3240 + }, + { + "epoch": 0.09568037741412354, + "grad_norm": 17.625, + "learning_rate": 1.9133254716981133e-06, + "loss": 2.5194, + "step": 3245 + }, + { + "epoch": 0.09582780480613298, + "grad_norm": 15.4375, + "learning_rate": 1.9162735849056604e-06, + "loss": 2.5139, + "step": 3250 + }, + { + "epoch": 0.09597523219814241, + "grad_norm": 20.375, + "learning_rate": 1.9192216981132076e-06, + "loss": 2.6683, + "step": 3255 + }, + { + "epoch": 0.09612265959015186, + "grad_norm": 24.375, + "learning_rate": 1.9221698113207547e-06, + "loss": 2.4635, + "step": 3260 + }, + { + "epoch": 0.09627008698216129, + "grad_norm": 29.75, + "learning_rate": 1.925117924528302e-06, + "loss": 2.6588, + "step": 3265 + }, + { + "epoch": 0.09641751437417072, + "grad_norm": 15.375, + "learning_rate": 1.928066037735849e-06, + "loss": 2.4663, + "step": 3270 + }, + { + "epoch": 0.09656494176618016, + "grad_norm": 17.25, + "learning_rate": 1.931014150943396e-06, + "loss": 2.5586, + "step": 3275 + }, + { + "epoch": 0.09671236915818959, + "grad_norm": 16.75, + "learning_rate": 1.9339622641509433e-06, + "loss": 2.4494, + "step": 3280 + }, + { + "epoch": 0.09685979655019902, + "grad_norm": 17.75, + "learning_rate": 1.9369103773584905e-06, + "loss": 2.513, + "step": 3285 + }, + { + "epoch": 0.09700722394220847, + "grad_norm": 18.375, + "learning_rate": 1.9398584905660376e-06, + "loss": 2.5481, + "step": 3290 + }, + { + "epoch": 0.0971546513342179, + "grad_norm": 15.1875, + "learning_rate": 1.942806603773585e-06, + "loss": 2.5519, + "step": 3295 + }, + { + "epoch": 0.09730207872622733, + "grad_norm": 56.5, + "learning_rate": 1.945754716981132e-06, + "loss": 2.4955, + "step": 3300 + }, + { + "epoch": 0.09744950611823677, + "grad_norm": 20.25, + "learning_rate": 1.948702830188679e-06, + "loss": 2.5694, + "step": 3305 + }, + { + "epoch": 0.0975969335102462, + "grad_norm": 29.875, + "learning_rate": 1.9516509433962263e-06, + "loss": 2.5207, + "step": 3310 + }, + { + "epoch": 0.09774436090225563, + "grad_norm": 17.0, + "learning_rate": 1.9545990566037734e-06, + "loss": 2.5684, + "step": 3315 + }, + { + "epoch": 0.09789178829426508, + "grad_norm": 18.375, + "learning_rate": 1.9575471698113206e-06, + "loss": 2.5669, + "step": 3320 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 18.875, + "learning_rate": 1.9604952830188677e-06, + "loss": 2.4171, + "step": 3325 + }, + { + "epoch": 0.09818664307828394, + "grad_norm": 77.5, + "learning_rate": 1.963443396226415e-06, + "loss": 2.6574, + "step": 3330 + }, + { + "epoch": 0.09833407047029338, + "grad_norm": 84.5, + "learning_rate": 1.9663915094339624e-06, + "loss": 2.574, + "step": 3335 + }, + { + "epoch": 0.09848149786230281, + "grad_norm": 28.125, + "learning_rate": 1.969339622641509e-06, + "loss": 2.6212, + "step": 3340 + }, + { + "epoch": 0.09862892525431226, + "grad_norm": 30.375, + "learning_rate": 1.9722877358490568e-06, + "loss": 2.421, + "step": 3345 + }, + { + "epoch": 0.09877635264632169, + "grad_norm": 22.875, + "learning_rate": 1.9752358490566035e-06, + "loss": 2.5433, + "step": 3350 + }, + { + "epoch": 0.09892378003833112, + "grad_norm": 24.0, + "learning_rate": 1.978183962264151e-06, + "loss": 2.733, + "step": 3355 + }, + { + "epoch": 0.09907120743034056, + "grad_norm": 16.875, + "learning_rate": 1.981132075471698e-06, + "loss": 2.599, + "step": 3360 + }, + { + "epoch": 0.09921863482235, + "grad_norm": 16.625, + "learning_rate": 1.984080188679245e-06, + "loss": 2.5051, + "step": 3365 + }, + { + "epoch": 0.09936606221435942, + "grad_norm": 22.875, + "learning_rate": 1.9870283018867925e-06, + "loss": 2.4694, + "step": 3370 + }, + { + "epoch": 0.09951348960636887, + "grad_norm": 19.625, + "learning_rate": 1.9899764150943392e-06, + "loss": 2.5643, + "step": 3375 + }, + { + "epoch": 0.0996609169983783, + "grad_norm": 19.75, + "learning_rate": 1.992924528301887e-06, + "loss": 2.4453, + "step": 3380 + }, + { + "epoch": 0.09980834439038773, + "grad_norm": 19.875, + "learning_rate": 1.995872641509434e-06, + "loss": 2.4782, + "step": 3385 + }, + { + "epoch": 0.09995577178239717, + "grad_norm": 14.125, + "learning_rate": 1.998820754716981e-06, + "loss": 2.5011, + "step": 3390 + }, + { + "epoch": 0.1001031991744066, + "grad_norm": 16.625, + "learning_rate": 1.999999952328609e-06, + "loss": 2.6024, + "step": 3395 + }, + { + "epoch": 0.10025062656641603, + "grad_norm": 17.5, + "learning_rate": 1.9999996610034596e-06, + "loss": 2.5227, + "step": 3400 + }, + { + "epoch": 0.10039805395842548, + "grad_norm": 17.0, + "learning_rate": 1.9999991048373438e-06, + "loss": 2.4102, + "step": 3405 + }, + { + "epoch": 0.10054548135043491, + "grad_norm": 22.375, + "learning_rate": 1.9999982838304092e-06, + "loss": 2.5443, + "step": 3410 + }, + { + "epoch": 0.10069290874244434, + "grad_norm": 19.25, + "learning_rate": 1.9999971979828727e-06, + "loss": 2.5334, + "step": 3415 + }, + { + "epoch": 0.10084033613445378, + "grad_norm": 18.5, + "learning_rate": 1.999995847295022e-06, + "loss": 2.6654, + "step": 3420 + }, + { + "epoch": 0.10098776352646321, + "grad_norm": 18.0, + "learning_rate": 1.999994231767215e-06, + "loss": 2.5571, + "step": 3425 + }, + { + "epoch": 0.10113519091847266, + "grad_norm": 13.5, + "learning_rate": 1.99999235139988e-06, + "loss": 2.5254, + "step": 3430 + }, + { + "epoch": 0.10128261831048209, + "grad_norm": 27.375, + "learning_rate": 1.999990206193514e-06, + "loss": 2.4695, + "step": 3435 + }, + { + "epoch": 0.10143004570249152, + "grad_norm": 22.0, + "learning_rate": 1.999987796148686e-06, + "loss": 2.4044, + "step": 3440 + }, + { + "epoch": 0.10157747309450096, + "grad_norm": 16.75, + "learning_rate": 1.9999851212660336e-06, + "loss": 2.4752, + "step": 3445 + }, + { + "epoch": 0.1017249004865104, + "grad_norm": 18.75, + "learning_rate": 1.9999821815462655e-06, + "loss": 2.6967, + "step": 3450 + }, + { + "epoch": 0.10187232787851982, + "grad_norm": 25.0, + "learning_rate": 1.9999789769901606e-06, + "loss": 2.5363, + "step": 3455 + }, + { + "epoch": 0.10201975527052927, + "grad_norm": 26.375, + "learning_rate": 1.9999755075985674e-06, + "loss": 2.5621, + "step": 3460 + }, + { + "epoch": 0.1021671826625387, + "grad_norm": 19.375, + "learning_rate": 1.9999717733724043e-06, + "loss": 2.5803, + "step": 3465 + }, + { + "epoch": 0.10231461005454813, + "grad_norm": 22.625, + "learning_rate": 1.9999677743126607e-06, + "loss": 2.6209, + "step": 3470 + }, + { + "epoch": 0.10246203744655757, + "grad_norm": 17.625, + "learning_rate": 1.999963510420396e-06, + "loss": 2.4312, + "step": 3475 + }, + { + "epoch": 0.102609464838567, + "grad_norm": 41.0, + "learning_rate": 1.999958981696739e-06, + "loss": 2.5988, + "step": 3480 + }, + { + "epoch": 0.10275689223057644, + "grad_norm": 21.375, + "learning_rate": 1.999954188142889e-06, + "loss": 2.5389, + "step": 3485 + }, + { + "epoch": 0.10290431962258588, + "grad_norm": 16.0, + "learning_rate": 1.9999491297601154e-06, + "loss": 2.5496, + "step": 3490 + }, + { + "epoch": 0.10305174701459531, + "grad_norm": 29.0, + "learning_rate": 1.9999438065497587e-06, + "loss": 2.5188, + "step": 3495 + }, + { + "epoch": 0.10319917440660474, + "grad_norm": 18.125, + "learning_rate": 1.999938218513228e-06, + "loss": 2.4835, + "step": 3500 + }, + { + "epoch": 0.10319917440660474, + "eval_loss": 2.641951322555542, + "eval_runtime": 4.7225, + "eval_samples_per_second": 83.855, + "eval_steps_per_second": 2.753, + "step": 3500 + }, + { + "epoch": 0.10334660179861418, + "grad_norm": 19.625, + "learning_rate": 1.9999323656520037e-06, + "loss": 2.4558, + "step": 3505 + }, + { + "epoch": 0.10349402919062362, + "grad_norm": 17.375, + "learning_rate": 1.999926247967635e-06, + "loss": 2.4677, + "step": 3510 + }, + { + "epoch": 0.10364145658263306, + "grad_norm": 18.0, + "learning_rate": 1.999919865461743e-06, + "loss": 2.4587, + "step": 3515 + }, + { + "epoch": 0.10378888397464249, + "grad_norm": 20.875, + "learning_rate": 1.999913218136018e-06, + "loss": 2.5211, + "step": 3520 + }, + { + "epoch": 0.10393631136665192, + "grad_norm": 16.25, + "learning_rate": 1.99990630599222e-06, + "loss": 2.552, + "step": 3525 + }, + { + "epoch": 0.10408373875866136, + "grad_norm": 16.875, + "learning_rate": 1.99989912903218e-06, + "loss": 2.3828, + "step": 3530 + }, + { + "epoch": 0.1042311661506708, + "grad_norm": 19.625, + "learning_rate": 1.999891687257799e-06, + "loss": 2.4554, + "step": 3535 + }, + { + "epoch": 0.10437859354268023, + "grad_norm": 18.5, + "learning_rate": 1.9998839806710466e-06, + "loss": 2.5472, + "step": 3540 + }, + { + "epoch": 0.10452602093468967, + "grad_norm": 17.5, + "learning_rate": 1.9998760092739654e-06, + "loss": 2.4378, + "step": 3545 + }, + { + "epoch": 0.1046734483266991, + "grad_norm": 18.5, + "learning_rate": 1.999867773068666e-06, + "loss": 2.4833, + "step": 3550 + }, + { + "epoch": 0.10482087571870853, + "grad_norm": 18.5, + "learning_rate": 1.999859272057329e-06, + "loss": 2.6006, + "step": 3555 + }, + { + "epoch": 0.10496830311071798, + "grad_norm": 20.75, + "learning_rate": 1.999850506242207e-06, + "loss": 2.5066, + "step": 3560 + }, + { + "epoch": 0.1051157305027274, + "grad_norm": 26.875, + "learning_rate": 1.9998414756256208e-06, + "loss": 2.5387, + "step": 3565 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 34.0, + "learning_rate": 1.9998321802099614e-06, + "loss": 2.5366, + "step": 3570 + }, + { + "epoch": 0.10541058528674628, + "grad_norm": 28.375, + "learning_rate": 1.9998226199976925e-06, + "loss": 2.6194, + "step": 3575 + }, + { + "epoch": 0.10555801267875571, + "grad_norm": 18.0, + "learning_rate": 1.9998127949913444e-06, + "loss": 2.4925, + "step": 3580 + }, + { + "epoch": 0.10570544007076514, + "grad_norm": 15.375, + "learning_rate": 1.9998027051935198e-06, + "loss": 2.4019, + "step": 3585 + }, + { + "epoch": 0.10585286746277459, + "grad_norm": 17.25, + "learning_rate": 1.999792350606891e-06, + "loss": 2.4669, + "step": 3590 + }, + { + "epoch": 0.10600029485478402, + "grad_norm": 17.375, + "learning_rate": 1.9997817312342e-06, + "loss": 2.6052, + "step": 3595 + }, + { + "epoch": 0.10614772224679346, + "grad_norm": 17.25, + "learning_rate": 1.9997708470782596e-06, + "loss": 2.5472, + "step": 3600 + }, + { + "epoch": 0.10629514963880289, + "grad_norm": 15.25, + "learning_rate": 1.9997596981419517e-06, + "loss": 2.4028, + "step": 3605 + }, + { + "epoch": 0.10644257703081232, + "grad_norm": 16.5, + "learning_rate": 1.99974828442823e-06, + "loss": 2.5456, + "step": 3610 + }, + { + "epoch": 0.10659000442282177, + "grad_norm": 16.375, + "learning_rate": 1.9997366059401166e-06, + "loss": 2.5217, + "step": 3615 + }, + { + "epoch": 0.1067374318148312, + "grad_norm": 17.5, + "learning_rate": 1.9997246626807045e-06, + "loss": 2.5225, + "step": 3620 + }, + { + "epoch": 0.10688485920684063, + "grad_norm": 18.25, + "learning_rate": 1.9997124546531566e-06, + "loss": 2.4429, + "step": 3625 + }, + { + "epoch": 0.10703228659885007, + "grad_norm": 27.5, + "learning_rate": 1.999699981860707e-06, + "loss": 2.5007, + "step": 3630 + }, + { + "epoch": 0.1071797139908595, + "grad_norm": 17.0, + "learning_rate": 1.999687244306658e-06, + "loss": 2.4672, + "step": 3635 + }, + { + "epoch": 0.10732714138286893, + "grad_norm": 21.125, + "learning_rate": 1.9996742419943834e-06, + "loss": 2.4403, + "step": 3640 + }, + { + "epoch": 0.10747456877487838, + "grad_norm": 20.625, + "learning_rate": 1.9996609749273268e-06, + "loss": 2.6091, + "step": 3645 + }, + { + "epoch": 0.1076219961668878, + "grad_norm": 22.25, + "learning_rate": 1.999647443109002e-06, + "loss": 2.4657, + "step": 3650 + }, + { + "epoch": 0.10776942355889724, + "grad_norm": 22.0, + "learning_rate": 1.9996336465429923e-06, + "loss": 2.4482, + "step": 3655 + }, + { + "epoch": 0.10791685095090668, + "grad_norm": 18.0, + "learning_rate": 1.999619585232952e-06, + "loss": 2.4894, + "step": 3660 + }, + { + "epoch": 0.10806427834291611, + "grad_norm": 13.25, + "learning_rate": 1.999605259182605e-06, + "loss": 2.4985, + "step": 3665 + }, + { + "epoch": 0.10821170573492554, + "grad_norm": 19.375, + "learning_rate": 1.999590668395745e-06, + "loss": 2.5532, + "step": 3670 + }, + { + "epoch": 0.10835913312693499, + "grad_norm": 17.0, + "learning_rate": 1.9995758128762376e-06, + "loss": 2.5648, + "step": 3675 + }, + { + "epoch": 0.10850656051894442, + "grad_norm": 15.9375, + "learning_rate": 1.9995606926280157e-06, + "loss": 2.4608, + "step": 3680 + }, + { + "epoch": 0.10865398791095386, + "grad_norm": 25.875, + "learning_rate": 1.999545307655084e-06, + "loss": 2.5516, + "step": 3685 + }, + { + "epoch": 0.10880141530296329, + "grad_norm": 15.1875, + "learning_rate": 1.999529657961518e-06, + "loss": 2.5005, + "step": 3690 + }, + { + "epoch": 0.10894884269497272, + "grad_norm": 19.125, + "learning_rate": 1.999513743551461e-06, + "loss": 2.6021, + "step": 3695 + }, + { + "epoch": 0.10909627008698217, + "grad_norm": 18.875, + "learning_rate": 1.999497564429129e-06, + "loss": 2.5128, + "step": 3700 + }, + { + "epoch": 0.1092436974789916, + "grad_norm": 19.25, + "learning_rate": 1.9994811205988063e-06, + "loss": 2.496, + "step": 3705 + }, + { + "epoch": 0.10939112487100103, + "grad_norm": 20.5, + "learning_rate": 1.999464412064848e-06, + "loss": 2.3362, + "step": 3710 + }, + { + "epoch": 0.10953855226301047, + "grad_norm": 21.25, + "learning_rate": 1.9994474388316794e-06, + "loss": 2.5267, + "step": 3715 + }, + { + "epoch": 0.1096859796550199, + "grad_norm": 21.5, + "learning_rate": 1.9994302009037957e-06, + "loss": 2.4618, + "step": 3720 + }, + { + "epoch": 0.10983340704702933, + "grad_norm": 17.625, + "learning_rate": 1.9994126982857614e-06, + "loss": 2.47, + "step": 3725 + }, + { + "epoch": 0.10998083443903878, + "grad_norm": 16.0, + "learning_rate": 1.999394930982213e-06, + "loss": 2.5356, + "step": 3730 + }, + { + "epoch": 0.11012826183104821, + "grad_norm": 18.625, + "learning_rate": 1.9993768989978558e-06, + "loss": 2.4532, + "step": 3735 + }, + { + "epoch": 0.11027568922305764, + "grad_norm": 15.5625, + "learning_rate": 1.9993586023374645e-06, + "loss": 2.5099, + "step": 3740 + }, + { + "epoch": 0.11042311661506708, + "grad_norm": 26.0, + "learning_rate": 1.9993400410058864e-06, + "loss": 2.4453, + "step": 3745 + }, + { + "epoch": 0.11057054400707651, + "grad_norm": 17.0, + "learning_rate": 1.999321215008036e-06, + "loss": 2.5786, + "step": 3750 + }, + { + "epoch": 0.11071797139908596, + "grad_norm": 15.3125, + "learning_rate": 1.9993021243488994e-06, + "loss": 2.5121, + "step": 3755 + }, + { + "epoch": 0.11086539879109539, + "grad_norm": 14.3125, + "learning_rate": 1.999282769033533e-06, + "loss": 2.5239, + "step": 3760 + }, + { + "epoch": 0.11101282618310482, + "grad_norm": 32.5, + "learning_rate": 1.9992631490670623e-06, + "loss": 2.3592, + "step": 3765 + }, + { + "epoch": 0.11116025357511426, + "grad_norm": 15.6875, + "learning_rate": 1.9992432644546836e-06, + "loss": 2.4312, + "step": 3770 + }, + { + "epoch": 0.11130768096712369, + "grad_norm": 38.5, + "learning_rate": 1.999223115201664e-06, + "loss": 2.5293, + "step": 3775 + }, + { + "epoch": 0.11145510835913312, + "grad_norm": 17.375, + "learning_rate": 1.9992027013133393e-06, + "loss": 2.381, + "step": 3780 + }, + { + "epoch": 0.11160253575114257, + "grad_norm": 19.125, + "learning_rate": 1.999182022795116e-06, + "loss": 2.4684, + "step": 3785 + }, + { + "epoch": 0.111749963143152, + "grad_norm": 22.625, + "learning_rate": 1.9991610796524697e-06, + "loss": 2.367, + "step": 3790 + }, + { + "epoch": 0.11189739053516143, + "grad_norm": 18.125, + "learning_rate": 1.999139871890948e-06, + "loss": 2.5411, + "step": 3795 + }, + { + "epoch": 0.11204481792717087, + "grad_norm": 20.625, + "learning_rate": 1.999118399516168e-06, + "loss": 2.4369, + "step": 3800 + }, + { + "epoch": 0.1121922453191803, + "grad_norm": 29.75, + "learning_rate": 1.9990966625338154e-06, + "loss": 2.588, + "step": 3805 + }, + { + "epoch": 0.11233967271118973, + "grad_norm": 30.5, + "learning_rate": 1.9990746609496476e-06, + "loss": 2.4131, + "step": 3810 + }, + { + "epoch": 0.11248710010319918, + "grad_norm": 12.3125, + "learning_rate": 1.9990523947694917e-06, + "loss": 2.3787, + "step": 3815 + }, + { + "epoch": 0.11263452749520861, + "grad_norm": 27.125, + "learning_rate": 1.999029863999244e-06, + "loss": 2.4659, + "step": 3820 + }, + { + "epoch": 0.11278195488721804, + "grad_norm": 16.875, + "learning_rate": 1.9990070686448725e-06, + "loss": 2.4582, + "step": 3825 + }, + { + "epoch": 0.11292938227922748, + "grad_norm": 19.125, + "learning_rate": 1.9989840087124134e-06, + "loss": 2.3936, + "step": 3830 + }, + { + "epoch": 0.11307680967123691, + "grad_norm": 20.375, + "learning_rate": 1.9989606842079745e-06, + "loss": 2.5457, + "step": 3835 + }, + { + "epoch": 0.11322423706324636, + "grad_norm": 25.875, + "learning_rate": 1.998937095137733e-06, + "loss": 2.5145, + "step": 3840 + }, + { + "epoch": 0.11337166445525579, + "grad_norm": 20.0, + "learning_rate": 1.998913241507936e-06, + "loss": 2.5409, + "step": 3845 + }, + { + "epoch": 0.11351909184726522, + "grad_norm": 16.875, + "learning_rate": 1.998889123324901e-06, + "loss": 2.4614, + "step": 3850 + }, + { + "epoch": 0.11366651923927466, + "grad_norm": 25.875, + "learning_rate": 1.998864740595016e-06, + "loss": 2.432, + "step": 3855 + }, + { + "epoch": 0.1138139466312841, + "grad_norm": 19.375, + "learning_rate": 1.998840093324738e-06, + "loss": 2.5605, + "step": 3860 + }, + { + "epoch": 0.11396137402329352, + "grad_norm": 24.625, + "learning_rate": 1.998815181520595e-06, + "loss": 2.411, + "step": 3865 + }, + { + "epoch": 0.11410880141530297, + "grad_norm": 18.875, + "learning_rate": 1.9987900051891843e-06, + "loss": 2.5734, + "step": 3870 + }, + { + "epoch": 0.1142562288073124, + "grad_norm": 15.75, + "learning_rate": 1.9987645643371733e-06, + "loss": 2.4893, + "step": 3875 + }, + { + "epoch": 0.11440365619932183, + "grad_norm": 17.5, + "learning_rate": 1.998738858971301e-06, + "loss": 2.5551, + "step": 3880 + }, + { + "epoch": 0.11455108359133127, + "grad_norm": 17.125, + "learning_rate": 1.9987128890983736e-06, + "loss": 2.5463, + "step": 3885 + }, + { + "epoch": 0.1146985109833407, + "grad_norm": 16.25, + "learning_rate": 1.9986866547252704e-06, + "loss": 2.3747, + "step": 3890 + }, + { + "epoch": 0.11484593837535013, + "grad_norm": 17.625, + "learning_rate": 1.9986601558589393e-06, + "loss": 2.5215, + "step": 3895 + }, + { + "epoch": 0.11499336576735958, + "grad_norm": 15.3125, + "learning_rate": 1.9986333925063968e-06, + "loss": 2.4341, + "step": 3900 + }, + { + "epoch": 0.11514079315936901, + "grad_norm": 15.25, + "learning_rate": 1.9986063646747325e-06, + "loss": 2.5542, + "step": 3905 + }, + { + "epoch": 0.11528822055137844, + "grad_norm": 15.8125, + "learning_rate": 1.998579072371104e-06, + "loss": 2.5368, + "step": 3910 + }, + { + "epoch": 0.11543564794338788, + "grad_norm": 23.625, + "learning_rate": 1.998551515602739e-06, + "loss": 2.4132, + "step": 3915 + }, + { + "epoch": 0.11558307533539731, + "grad_norm": 17.625, + "learning_rate": 1.9985236943769358e-06, + "loss": 2.5002, + "step": 3920 + }, + { + "epoch": 0.11573050272740676, + "grad_norm": 15.625, + "learning_rate": 1.9984956087010635e-06, + "loss": 2.2823, + "step": 3925 + }, + { + "epoch": 0.11587793011941619, + "grad_norm": 18.25, + "learning_rate": 1.9984672585825592e-06, + "loss": 2.5808, + "step": 3930 + }, + { + "epoch": 0.11602535751142562, + "grad_norm": 17.0, + "learning_rate": 1.9984386440289315e-06, + "loss": 2.5604, + "step": 3935 + }, + { + "epoch": 0.11617278490343506, + "grad_norm": 18.5, + "learning_rate": 1.998409765047759e-06, + "loss": 2.482, + "step": 3940 + }, + { + "epoch": 0.1163202122954445, + "grad_norm": 14.5, + "learning_rate": 1.99838062164669e-06, + "loss": 2.4246, + "step": 3945 + }, + { + "epoch": 0.11646763968745392, + "grad_norm": 16.0, + "learning_rate": 1.9983512138334425e-06, + "loss": 2.4198, + "step": 3950 + }, + { + "epoch": 0.11661506707946337, + "grad_norm": 16.25, + "learning_rate": 1.998321541615805e-06, + "loss": 2.434, + "step": 3955 + }, + { + "epoch": 0.1167624944714728, + "grad_norm": 24.75, + "learning_rate": 1.9982916050016364e-06, + "loss": 2.4377, + "step": 3960 + }, + { + "epoch": 0.11690992186348223, + "grad_norm": 21.625, + "learning_rate": 1.9982614039988643e-06, + "loss": 2.4218, + "step": 3965 + }, + { + "epoch": 0.11705734925549167, + "grad_norm": 17.5, + "learning_rate": 1.9982309386154884e-06, + "loss": 2.4098, + "step": 3970 + }, + { + "epoch": 0.1172047766475011, + "grad_norm": 16.5, + "learning_rate": 1.998200208859576e-06, + "loss": 2.3641, + "step": 3975 + }, + { + "epoch": 0.11735220403951054, + "grad_norm": 17.125, + "learning_rate": 1.9981692147392655e-06, + "loss": 2.4643, + "step": 3980 + }, + { + "epoch": 0.11749963143151998, + "grad_norm": 15.625, + "learning_rate": 1.998137956262767e-06, + "loss": 2.3954, + "step": 3985 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 44.75, + "learning_rate": 1.9981064334383577e-06, + "loss": 2.4361, + "step": 3990 + }, + { + "epoch": 0.11779448621553884, + "grad_norm": 30.875, + "learning_rate": 1.998074646274386e-06, + "loss": 2.5092, + "step": 3995 + }, + { + "epoch": 0.11794191360754828, + "grad_norm": 16.25, + "learning_rate": 1.998042594779271e-06, + "loss": 2.5085, + "step": 4000 + }, + { + "epoch": 0.11794191360754828, + "eval_loss": 2.489173173904419, + "eval_runtime": 4.7135, + "eval_samples_per_second": 84.014, + "eval_steps_per_second": 2.758, + "step": 4000 + }, + { + "epoch": 0.11808934099955772, + "grad_norm": 17.875, + "learning_rate": 1.9980102789615014e-06, + "loss": 2.5217, + "step": 4005 + }, + { + "epoch": 0.11823676839156716, + "grad_norm": 15.5625, + "learning_rate": 1.9979776988296353e-06, + "loss": 2.383, + "step": 4010 + }, + { + "epoch": 0.11838419578357659, + "grad_norm": 66.5, + "learning_rate": 1.997944854392301e-06, + "loss": 2.5012, + "step": 4015 + }, + { + "epoch": 0.11853162317558602, + "grad_norm": 18.125, + "learning_rate": 1.997911745658198e-06, + "loss": 2.4182, + "step": 4020 + }, + { + "epoch": 0.11867905056759546, + "grad_norm": 22.75, + "learning_rate": 1.9978783726360945e-06, + "loss": 2.4518, + "step": 4025 + }, + { + "epoch": 0.1188264779596049, + "grad_norm": 15.75, + "learning_rate": 1.9978447353348287e-06, + "loss": 2.3372, + "step": 4030 + }, + { + "epoch": 0.11897390535161433, + "grad_norm": 15.875, + "learning_rate": 1.9978108337633092e-06, + "loss": 2.4263, + "step": 4035 + }, + { + "epoch": 0.11912133274362377, + "grad_norm": 17.125, + "learning_rate": 1.9977766679305143e-06, + "loss": 2.547, + "step": 4040 + }, + { + "epoch": 0.1192687601356332, + "grad_norm": 16.5, + "learning_rate": 1.9977422378454936e-06, + "loss": 2.3247, + "step": 4045 + }, + { + "epoch": 0.11941618752764263, + "grad_norm": 15.5625, + "learning_rate": 1.9977075435173646e-06, + "loss": 2.3781, + "step": 4050 + }, + { + "epoch": 0.11956361491965208, + "grad_norm": 23.75, + "learning_rate": 1.997672584955316e-06, + "loss": 2.3924, + "step": 4055 + }, + { + "epoch": 0.1197110423116615, + "grad_norm": 21.75, + "learning_rate": 1.997637362168606e-06, + "loss": 2.4931, + "step": 4060 + }, + { + "epoch": 0.11985846970367094, + "grad_norm": 20.375, + "learning_rate": 1.997601875166564e-06, + "loss": 2.4232, + "step": 4065 + }, + { + "epoch": 0.12000589709568038, + "grad_norm": 13.6875, + "learning_rate": 1.9975661239585874e-06, + "loss": 2.3576, + "step": 4070 + }, + { + "epoch": 0.12015332448768981, + "grad_norm": 16.875, + "learning_rate": 1.997530108554145e-06, + "loss": 2.439, + "step": 4075 + }, + { + "epoch": 0.12030075187969924, + "grad_norm": 19.75, + "learning_rate": 1.997493828962775e-06, + "loss": 2.4518, + "step": 4080 + }, + { + "epoch": 0.12044817927170869, + "grad_norm": 18.125, + "learning_rate": 1.997457285194086e-06, + "loss": 2.4058, + "step": 4085 + }, + { + "epoch": 0.12059560666371812, + "grad_norm": 15.4375, + "learning_rate": 1.9974204772577557e-06, + "loss": 2.4125, + "step": 4090 + }, + { + "epoch": 0.12074303405572756, + "grad_norm": 15.5, + "learning_rate": 1.9973834051635332e-06, + "loss": 2.4462, + "step": 4095 + }, + { + "epoch": 0.12089046144773699, + "grad_norm": 14.25, + "learning_rate": 1.9973460689212366e-06, + "loss": 2.3354, + "step": 4100 + }, + { + "epoch": 0.12103788883974642, + "grad_norm": 16.25, + "learning_rate": 1.997308468540753e-06, + "loss": 2.3519, + "step": 4105 + }, + { + "epoch": 0.12118531623175587, + "grad_norm": 16.375, + "learning_rate": 1.997270604032042e-06, + "loss": 2.3724, + "step": 4110 + }, + { + "epoch": 0.1213327436237653, + "grad_norm": 25.75, + "learning_rate": 1.9972324754051306e-06, + "loss": 2.5401, + "step": 4115 + }, + { + "epoch": 0.12148017101577473, + "grad_norm": 17.0, + "learning_rate": 1.9971940826701175e-06, + "loss": 2.4093, + "step": 4120 + }, + { + "epoch": 0.12162759840778417, + "grad_norm": 17.25, + "learning_rate": 1.99715542583717e-06, + "loss": 2.4163, + "step": 4125 + }, + { + "epoch": 0.1217750257997936, + "grad_norm": 14.625, + "learning_rate": 1.9971165049165266e-06, + "loss": 2.397, + "step": 4130 + }, + { + "epoch": 0.12192245319180303, + "grad_norm": 20.625, + "learning_rate": 1.997077319918495e-06, + "loss": 2.4253, + "step": 4135 + }, + { + "epoch": 0.12206988058381248, + "grad_norm": 19.625, + "learning_rate": 1.9970378708534527e-06, + "loss": 2.3918, + "step": 4140 + }, + { + "epoch": 0.1222173079758219, + "grad_norm": 13.1875, + "learning_rate": 1.9969981577318476e-06, + "loss": 2.4003, + "step": 4145 + }, + { + "epoch": 0.12236473536783134, + "grad_norm": 16.375, + "learning_rate": 1.9969581805641977e-06, + "loss": 2.3088, + "step": 4150 + }, + { + "epoch": 0.12251216275984078, + "grad_norm": 17.875, + "learning_rate": 1.99691793936109e-06, + "loss": 2.3631, + "step": 4155 + }, + { + "epoch": 0.12265959015185021, + "grad_norm": 13.0625, + "learning_rate": 1.9968774341331828e-06, + "loss": 2.3745, + "step": 4160 + }, + { + "epoch": 0.12280701754385964, + "grad_norm": 13.625, + "learning_rate": 1.9968366648912024e-06, + "loss": 2.3856, + "step": 4165 + }, + { + "epoch": 0.12295444493586909, + "grad_norm": 22.5, + "learning_rate": 1.9967956316459473e-06, + "loss": 2.5149, + "step": 4170 + }, + { + "epoch": 0.12310187232787852, + "grad_norm": 23.5, + "learning_rate": 1.9967543344082845e-06, + "loss": 2.3066, + "step": 4175 + }, + { + "epoch": 0.12324929971988796, + "grad_norm": 14.125, + "learning_rate": 1.996712773189151e-06, + "loss": 2.3431, + "step": 4180 + }, + { + "epoch": 0.12339672711189739, + "grad_norm": 19.625, + "learning_rate": 1.996670947999554e-06, + "loss": 2.3903, + "step": 4185 + }, + { + "epoch": 0.12354415450390682, + "grad_norm": 17.625, + "learning_rate": 1.9966288588505705e-06, + "loss": 2.4746, + "step": 4190 + }, + { + "epoch": 0.12369158189591627, + "grad_norm": 15.6875, + "learning_rate": 1.9965865057533474e-06, + "loss": 2.313, + "step": 4195 + }, + { + "epoch": 0.1238390092879257, + "grad_norm": 17.375, + "learning_rate": 1.996543888719101e-06, + "loss": 2.5405, + "step": 4200 + }, + { + "epoch": 0.12398643667993513, + "grad_norm": 16.0, + "learning_rate": 1.996501007759119e-06, + "loss": 2.3146, + "step": 4205 + }, + { + "epoch": 0.12413386407194457, + "grad_norm": 17.125, + "learning_rate": 1.996457862884758e-06, + "loss": 2.2757, + "step": 4210 + }, + { + "epoch": 0.124281291463954, + "grad_norm": 17.875, + "learning_rate": 1.996414454107444e-06, + "loss": 2.3661, + "step": 4215 + }, + { + "epoch": 0.12442871885596343, + "grad_norm": 18.375, + "learning_rate": 1.996370781438674e-06, + "loss": 2.3062, + "step": 4220 + }, + { + "epoch": 0.12457614624797288, + "grad_norm": 18.875, + "learning_rate": 1.9963268448900133e-06, + "loss": 2.4657, + "step": 4225 + }, + { + "epoch": 0.12472357363998231, + "grad_norm": 16.375, + "learning_rate": 1.9962826444730992e-06, + "loss": 2.4002, + "step": 4230 + }, + { + "epoch": 0.12487100103199174, + "grad_norm": 16.75, + "learning_rate": 1.996238180199637e-06, + "loss": 2.3283, + "step": 4235 + }, + { + "epoch": 0.12501842842400118, + "grad_norm": 14.375, + "learning_rate": 1.996193452081403e-06, + "loss": 2.3856, + "step": 4240 + }, + { + "epoch": 0.1251658558160106, + "grad_norm": 15.4375, + "learning_rate": 1.996148460130243e-06, + "loss": 2.3721, + "step": 4245 + }, + { + "epoch": 0.12531328320802004, + "grad_norm": 18.625, + "learning_rate": 1.9961032043580726e-06, + "loss": 2.4488, + "step": 4250 + }, + { + "epoch": 0.12546071060002947, + "grad_norm": 27.125, + "learning_rate": 1.9960576847768784e-06, + "loss": 2.3926, + "step": 4255 + }, + { + "epoch": 0.12560813799203893, + "grad_norm": 17.625, + "learning_rate": 1.996011901398714e-06, + "loss": 2.3488, + "step": 4260 + }, + { + "epoch": 0.12575556538404836, + "grad_norm": 16.375, + "learning_rate": 1.995965854235706e-06, + "loss": 2.441, + "step": 4265 + }, + { + "epoch": 0.1259029927760578, + "grad_norm": 15.25, + "learning_rate": 1.9959195433000496e-06, + "loss": 2.4117, + "step": 4270 + }, + { + "epoch": 0.12605042016806722, + "grad_norm": 14.75, + "learning_rate": 1.995872968604009e-06, + "loss": 2.3304, + "step": 4275 + }, + { + "epoch": 0.12619784756007665, + "grad_norm": 14.3125, + "learning_rate": 1.9958261301599195e-06, + "loss": 2.4009, + "step": 4280 + }, + { + "epoch": 0.12634527495208608, + "grad_norm": 17.25, + "learning_rate": 1.995779027980187e-06, + "loss": 2.4338, + "step": 4285 + }, + { + "epoch": 0.12649270234409554, + "grad_norm": 14.4375, + "learning_rate": 1.9957316620772842e-06, + "loss": 2.4749, + "step": 4290 + }, + { + "epoch": 0.12664012973610497, + "grad_norm": 18.375, + "learning_rate": 1.9956840324637564e-06, + "loss": 2.3095, + "step": 4295 + }, + { + "epoch": 0.1267875571281144, + "grad_norm": 20.0, + "learning_rate": 1.9956361391522177e-06, + "loss": 2.3714, + "step": 4300 + }, + { + "epoch": 0.12693498452012383, + "grad_norm": 22.125, + "learning_rate": 1.995587982155353e-06, + "loss": 2.4341, + "step": 4305 + }, + { + "epoch": 0.12708241191213326, + "grad_norm": 16.625, + "learning_rate": 1.995539561485915e-06, + "loss": 2.3509, + "step": 4310 + }, + { + "epoch": 0.12722983930414272, + "grad_norm": 16.0, + "learning_rate": 1.9954908771567287e-06, + "loss": 2.4267, + "step": 4315 + }, + { + "epoch": 0.12737726669615215, + "grad_norm": 14.8125, + "learning_rate": 1.9954419291806865e-06, + "loss": 2.3689, + "step": 4320 + }, + { + "epoch": 0.12752469408816158, + "grad_norm": 15.4375, + "learning_rate": 1.995392717570753e-06, + "loss": 2.3899, + "step": 4325 + }, + { + "epoch": 0.127672121480171, + "grad_norm": 17.25, + "learning_rate": 1.9953432423399606e-06, + "loss": 2.2691, + "step": 4330 + }, + { + "epoch": 0.12781954887218044, + "grad_norm": 13.5625, + "learning_rate": 1.9952935035014126e-06, + "loss": 2.4342, + "step": 4335 + }, + { + "epoch": 0.12796697626418987, + "grad_norm": 22.0, + "learning_rate": 1.995243501068282e-06, + "loss": 2.2938, + "step": 4340 + }, + { + "epoch": 0.12811440365619933, + "grad_norm": 22.75, + "learning_rate": 1.9951932350538113e-06, + "loss": 2.3834, + "step": 4345 + }, + { + "epoch": 0.12826183104820876, + "grad_norm": 14.875, + "learning_rate": 1.9951427054713137e-06, + "loss": 2.5038, + "step": 4350 + }, + { + "epoch": 0.1284092584402182, + "grad_norm": 16.875, + "learning_rate": 1.9950919123341707e-06, + "loss": 2.4017, + "step": 4355 + }, + { + "epoch": 0.12855668583222762, + "grad_norm": 21.375, + "learning_rate": 1.9950408556558344e-06, + "loss": 2.3696, + "step": 4360 + }, + { + "epoch": 0.12870411322423705, + "grad_norm": 13.875, + "learning_rate": 1.9949895354498272e-06, + "loss": 2.3681, + "step": 4365 + }, + { + "epoch": 0.12885154061624648, + "grad_norm": 13.0, + "learning_rate": 1.9949379517297404e-06, + "loss": 2.4246, + "step": 4370 + }, + { + "epoch": 0.12899896800825594, + "grad_norm": 14.5625, + "learning_rate": 1.994886104509236e-06, + "loss": 2.4722, + "step": 4375 + }, + { + "epoch": 0.12914639540026537, + "grad_norm": 17.25, + "learning_rate": 1.994833993802045e-06, + "loss": 2.3309, + "step": 4380 + }, + { + "epoch": 0.1292938227922748, + "grad_norm": 16.5, + "learning_rate": 1.994781619621968e-06, + "loss": 2.385, + "step": 4385 + }, + { + "epoch": 0.12944125018428423, + "grad_norm": 13.75, + "learning_rate": 1.9947289819828764e-06, + "loss": 2.301, + "step": 4390 + }, + { + "epoch": 0.12958867757629366, + "grad_norm": 17.0, + "learning_rate": 1.9946760808987106e-06, + "loss": 2.4786, + "step": 4395 + }, + { + "epoch": 0.12973610496830312, + "grad_norm": 15.6875, + "learning_rate": 1.994622916383481e-06, + "loss": 2.2656, + "step": 4400 + }, + { + "epoch": 0.12988353236031255, + "grad_norm": 14.5625, + "learning_rate": 1.994569488451268e-06, + "loss": 2.1932, + "step": 4405 + }, + { + "epoch": 0.13003095975232198, + "grad_norm": 14.875, + "learning_rate": 1.9945157971162207e-06, + "loss": 2.3135, + "step": 4410 + }, + { + "epoch": 0.13017838714433141, + "grad_norm": 22.625, + "learning_rate": 1.99446184239256e-06, + "loss": 2.3792, + "step": 4415 + }, + { + "epoch": 0.13032581453634084, + "grad_norm": 20.625, + "learning_rate": 1.9944076242945744e-06, + "loss": 2.403, + "step": 4420 + }, + { + "epoch": 0.13047324192835028, + "grad_norm": 15.5, + "learning_rate": 1.9943531428366233e-06, + "loss": 2.4211, + "step": 4425 + }, + { + "epoch": 0.13062066932035973, + "grad_norm": 16.75, + "learning_rate": 1.9942983980331355e-06, + "loss": 2.3437, + "step": 4430 + }, + { + "epoch": 0.13076809671236916, + "grad_norm": 12.0625, + "learning_rate": 1.99424338989861e-06, + "loss": 2.3165, + "step": 4435 + }, + { + "epoch": 0.1309155241043786, + "grad_norm": 25.0, + "learning_rate": 1.994188118447615e-06, + "loss": 2.396, + "step": 4440 + }, + { + "epoch": 0.13106295149638802, + "grad_norm": 19.125, + "learning_rate": 1.9941325836947888e-06, + "loss": 2.3832, + "step": 4445 + }, + { + "epoch": 0.13121037888839746, + "grad_norm": 16.0, + "learning_rate": 1.9940767856548395e-06, + "loss": 2.3981, + "step": 4450 + }, + { + "epoch": 0.13135780628040689, + "grad_norm": 17.5, + "learning_rate": 1.994020724342544e-06, + "loss": 2.3459, + "step": 4455 + }, + { + "epoch": 0.13150523367241634, + "grad_norm": 15.625, + "learning_rate": 1.99396439977275e-06, + "loss": 2.4587, + "step": 4460 + }, + { + "epoch": 0.13165266106442577, + "grad_norm": 25.125, + "learning_rate": 1.9939078119603746e-06, + "loss": 2.4407, + "step": 4465 + }, + { + "epoch": 0.1318000884564352, + "grad_norm": 15.875, + "learning_rate": 1.9938509609204047e-06, + "loss": 2.3194, + "step": 4470 + }, + { + "epoch": 0.13194751584844464, + "grad_norm": 18.0, + "learning_rate": 1.9937938466678967e-06, + "loss": 2.4082, + "step": 4475 + }, + { + "epoch": 0.13209494324045407, + "grad_norm": 14.5625, + "learning_rate": 1.9937364692179764e-06, + "loss": 2.2631, + "step": 4480 + }, + { + "epoch": 0.13224237063246352, + "grad_norm": 15.6875, + "learning_rate": 1.993678828585841e-06, + "loss": 2.2747, + "step": 4485 + }, + { + "epoch": 0.13238979802447295, + "grad_norm": 15.75, + "learning_rate": 1.9936209247867542e-06, + "loss": 2.3008, + "step": 4490 + }, + { + "epoch": 0.13253722541648238, + "grad_norm": 17.0, + "learning_rate": 1.9935627578360526e-06, + "loss": 2.3542, + "step": 4495 + }, + { + "epoch": 0.13268465280849182, + "grad_norm": 15.0, + "learning_rate": 1.9935043277491407e-06, + "loss": 2.3161, + "step": 4500 + }, + { + "epoch": 0.13268465280849182, + "eval_loss": 2.3908541202545166, + "eval_runtime": 4.7136, + "eval_samples_per_second": 84.013, + "eval_steps_per_second": 2.758, + "step": 4500 + }, + { + "epoch": 0.13283208020050125, + "grad_norm": 15.0625, + "learning_rate": 1.9934456345414938e-06, + "loss": 2.3238, + "step": 4505 + }, + { + "epoch": 0.13297950759251068, + "grad_norm": 17.5, + "learning_rate": 1.9933866782286553e-06, + "loss": 2.2844, + "step": 4510 + }, + { + "epoch": 0.13312693498452013, + "grad_norm": 15.1875, + "learning_rate": 1.99332745882624e-06, + "loss": 2.3055, + "step": 4515 + }, + { + "epoch": 0.13327436237652956, + "grad_norm": 15.0, + "learning_rate": 1.9932679763499313e-06, + "loss": 2.4481, + "step": 4520 + }, + { + "epoch": 0.133421789768539, + "grad_norm": 13.4375, + "learning_rate": 1.9932082308154833e-06, + "loss": 2.2749, + "step": 4525 + }, + { + "epoch": 0.13356921716054843, + "grad_norm": 16.5, + "learning_rate": 1.993148222238718e-06, + "loss": 2.5095, + "step": 4530 + }, + { + "epoch": 0.13371664455255786, + "grad_norm": 13.875, + "learning_rate": 1.9930879506355285e-06, + "loss": 2.4653, + "step": 4535 + }, + { + "epoch": 0.13386407194456731, + "grad_norm": 16.375, + "learning_rate": 1.9930274160218773e-06, + "loss": 2.3229, + "step": 4540 + }, + { + "epoch": 0.13401149933657674, + "grad_norm": 21.625, + "learning_rate": 1.9929666184137964e-06, + "loss": 2.3757, + "step": 4545 + }, + { + "epoch": 0.13415892672858618, + "grad_norm": 16.75, + "learning_rate": 1.992905557827388e-06, + "loss": 2.4258, + "step": 4550 + }, + { + "epoch": 0.1343063541205956, + "grad_norm": 13.75, + "learning_rate": 1.992844234278823e-06, + "loss": 2.3937, + "step": 4555 + }, + { + "epoch": 0.13445378151260504, + "grad_norm": 19.375, + "learning_rate": 1.9927826477843416e-06, + "loss": 2.4173, + "step": 4560 + }, + { + "epoch": 0.13460120890461447, + "grad_norm": 14.75, + "learning_rate": 1.992720798360256e-06, + "loss": 2.3872, + "step": 4565 + }, + { + "epoch": 0.13474863629662392, + "grad_norm": 15.75, + "learning_rate": 1.9926586860229455e-06, + "loss": 2.3229, + "step": 4570 + }, + { + "epoch": 0.13489606368863336, + "grad_norm": 13.1875, + "learning_rate": 1.99259631078886e-06, + "loss": 2.3331, + "step": 4575 + }, + { + "epoch": 0.13504349108064279, + "grad_norm": 14.6875, + "learning_rate": 1.9925336726745196e-06, + "loss": 2.3182, + "step": 4580 + }, + { + "epoch": 0.13519091847265222, + "grad_norm": 16.375, + "learning_rate": 1.992470771696513e-06, + "loss": 2.2982, + "step": 4585 + }, + { + "epoch": 0.13533834586466165, + "grad_norm": 15.625, + "learning_rate": 1.992407607871499e-06, + "loss": 2.3287, + "step": 4590 + }, + { + "epoch": 0.13548577325667108, + "grad_norm": 17.5, + "learning_rate": 1.992344181216206e-06, + "loss": 2.4427, + "step": 4595 + }, + { + "epoch": 0.13563320064868054, + "grad_norm": 17.625, + "learning_rate": 1.9922804917474316e-06, + "loss": 2.3961, + "step": 4600 + }, + { + "epoch": 0.13578062804068997, + "grad_norm": 16.625, + "learning_rate": 1.9922165394820445e-06, + "loss": 2.4671, + "step": 4605 + }, + { + "epoch": 0.1359280554326994, + "grad_norm": 22.25, + "learning_rate": 1.9921523244369805e-06, + "loss": 2.4358, + "step": 4610 + }, + { + "epoch": 0.13607548282470883, + "grad_norm": 16.125, + "learning_rate": 1.9920878466292473e-06, + "loss": 2.4312, + "step": 4615 + }, + { + "epoch": 0.13622291021671826, + "grad_norm": 18.5, + "learning_rate": 1.9920231060759207e-06, + "loss": 2.5213, + "step": 4620 + }, + { + "epoch": 0.13637033760872772, + "grad_norm": 14.8125, + "learning_rate": 1.9919581027941476e-06, + "loss": 2.2932, + "step": 4625 + }, + { + "epoch": 0.13651776500073715, + "grad_norm": 20.875, + "learning_rate": 1.9918928368011426e-06, + "loss": 2.3134, + "step": 4630 + }, + { + "epoch": 0.13666519239274658, + "grad_norm": 16.25, + "learning_rate": 1.991827308114191e-06, + "loss": 2.3506, + "step": 4635 + }, + { + "epoch": 0.136812619784756, + "grad_norm": 14.5, + "learning_rate": 1.9917615167506477e-06, + "loss": 2.3828, + "step": 4640 + }, + { + "epoch": 0.13696004717676544, + "grad_norm": 40.25, + "learning_rate": 1.9916954627279373e-06, + "loss": 2.3665, + "step": 4645 + }, + { + "epoch": 0.13710747456877487, + "grad_norm": 21.75, + "learning_rate": 1.9916291460635522e-06, + "loss": 2.3163, + "step": 4650 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 14.75, + "learning_rate": 1.9915625667750577e-06, + "loss": 2.3346, + "step": 4655 + }, + { + "epoch": 0.13740232935279376, + "grad_norm": 18.125, + "learning_rate": 1.991495724880085e-06, + "loss": 2.3689, + "step": 4660 + }, + { + "epoch": 0.1375497567448032, + "grad_norm": 16.0, + "learning_rate": 1.991428620396338e-06, + "loss": 2.3924, + "step": 4665 + }, + { + "epoch": 0.13769718413681262, + "grad_norm": 15.5, + "learning_rate": 1.9913612533415877e-06, + "loss": 2.2315, + "step": 4670 + }, + { + "epoch": 0.13784461152882205, + "grad_norm": 16.875, + "learning_rate": 1.9912936237336764e-06, + "loss": 2.3822, + "step": 4675 + }, + { + "epoch": 0.13799203892083148, + "grad_norm": 21.5, + "learning_rate": 1.9912257315905145e-06, + "loss": 2.3552, + "step": 4680 + }, + { + "epoch": 0.13813946631284094, + "grad_norm": 19.5, + "learning_rate": 1.991157576930083e-06, + "loss": 2.4338, + "step": 4685 + }, + { + "epoch": 0.13828689370485037, + "grad_norm": 20.875, + "learning_rate": 1.991089159770432e-06, + "loss": 2.3953, + "step": 4690 + }, + { + "epoch": 0.1384343210968598, + "grad_norm": 50.0, + "learning_rate": 1.9910204801296814e-06, + "loss": 2.3419, + "step": 4695 + }, + { + "epoch": 0.13858174848886923, + "grad_norm": 15.25, + "learning_rate": 1.99095153802602e-06, + "loss": 2.342, + "step": 4700 + }, + { + "epoch": 0.13872917588087866, + "grad_norm": 23.25, + "learning_rate": 1.9908823334777067e-06, + "loss": 2.4395, + "step": 4705 + }, + { + "epoch": 0.13887660327288812, + "grad_norm": 17.0, + "learning_rate": 1.9908128665030697e-06, + "loss": 2.3248, + "step": 4710 + }, + { + "epoch": 0.13902403066489755, + "grad_norm": 17.5, + "learning_rate": 1.990743137120507e-06, + "loss": 2.3639, + "step": 4715 + }, + { + "epoch": 0.13917145805690698, + "grad_norm": 18.25, + "learning_rate": 1.990673145348485e-06, + "loss": 2.3686, + "step": 4720 + }, + { + "epoch": 0.1393188854489164, + "grad_norm": 15.5, + "learning_rate": 1.990602891205541e-06, + "loss": 2.3286, + "step": 4725 + }, + { + "epoch": 0.13946631284092584, + "grad_norm": 19.0, + "learning_rate": 1.9905323747102813e-06, + "loss": 2.3668, + "step": 4730 + }, + { + "epoch": 0.13961374023293527, + "grad_norm": 14.125, + "learning_rate": 1.9904615958813814e-06, + "loss": 2.3007, + "step": 4735 + }, + { + "epoch": 0.13976116762494473, + "grad_norm": 14.6875, + "learning_rate": 1.990390554737586e-06, + "loss": 2.3387, + "step": 4740 + }, + { + "epoch": 0.13990859501695416, + "grad_norm": 17.625, + "learning_rate": 1.9903192512977104e-06, + "loss": 2.3229, + "step": 4745 + }, + { + "epoch": 0.1400560224089636, + "grad_norm": 15.75, + "learning_rate": 1.9902476855806382e-06, + "loss": 2.3936, + "step": 4750 + }, + { + "epoch": 0.14020344980097302, + "grad_norm": 16.375, + "learning_rate": 1.990175857605323e-06, + "loss": 2.4185, + "step": 4755 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 13.125, + "learning_rate": 1.9901037673907884e-06, + "loss": 2.2939, + "step": 4760 + }, + { + "epoch": 0.14049830458499188, + "grad_norm": 16.25, + "learning_rate": 1.990031414956126e-06, + "loss": 2.2509, + "step": 4765 + }, + { + "epoch": 0.14064573197700134, + "grad_norm": 14.1875, + "learning_rate": 1.989958800320498e-06, + "loss": 2.255, + "step": 4770 + }, + { + "epoch": 0.14079315936901077, + "grad_norm": 15.3125, + "learning_rate": 1.989885923503136e-06, + "loss": 2.3749, + "step": 4775 + }, + { + "epoch": 0.1409405867610202, + "grad_norm": 14.0625, + "learning_rate": 1.98981278452334e-06, + "loss": 2.3896, + "step": 4780 + }, + { + "epoch": 0.14108801415302963, + "grad_norm": 15.4375, + "learning_rate": 1.989739383400481e-06, + "loss": 2.1821, + "step": 4785 + }, + { + "epoch": 0.14123544154503906, + "grad_norm": 17.5, + "learning_rate": 1.989665720153999e-06, + "loss": 2.3354, + "step": 4790 + }, + { + "epoch": 0.14138286893704852, + "grad_norm": 14.9375, + "learning_rate": 1.989591794803402e-06, + "loss": 2.2849, + "step": 4795 + }, + { + "epoch": 0.14153029632905795, + "grad_norm": 15.1875, + "learning_rate": 1.9895176073682685e-06, + "loss": 2.3225, + "step": 4800 + }, + { + "epoch": 0.14167772372106738, + "grad_norm": 12.625, + "learning_rate": 1.9894431578682474e-06, + "loss": 2.3356, + "step": 4805 + }, + { + "epoch": 0.1418251511130768, + "grad_norm": 25.0, + "learning_rate": 1.989368446323055e-06, + "loss": 2.3309, + "step": 4810 + }, + { + "epoch": 0.14197257850508624, + "grad_norm": 17.875, + "learning_rate": 1.989293472752479e-06, + "loss": 2.2506, + "step": 4815 + }, + { + "epoch": 0.14212000589709567, + "grad_norm": 13.875, + "learning_rate": 1.989218237176374e-06, + "loss": 2.2593, + "step": 4820 + }, + { + "epoch": 0.14226743328910513, + "grad_norm": 17.0, + "learning_rate": 1.989142739614667e-06, + "loss": 2.3378, + "step": 4825 + }, + { + "epoch": 0.14241486068111456, + "grad_norm": 14.1875, + "learning_rate": 1.9890669800873518e-06, + "loss": 2.2224, + "step": 4830 + }, + { + "epoch": 0.142562288073124, + "grad_norm": 12.9375, + "learning_rate": 1.9889909586144927e-06, + "loss": 2.2454, + "step": 4835 + }, + { + "epoch": 0.14270971546513342, + "grad_norm": 20.0, + "learning_rate": 1.988914675216224e-06, + "loss": 2.3326, + "step": 4840 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 14.5, + "learning_rate": 1.9888381299127484e-06, + "loss": 2.289, + "step": 4845 + }, + { + "epoch": 0.14300457024915228, + "grad_norm": 16.0, + "learning_rate": 1.9887613227243377e-06, + "loss": 2.3622, + "step": 4850 + }, + { + "epoch": 0.14315199764116174, + "grad_norm": 20.0, + "learning_rate": 1.9886842536713342e-06, + "loss": 2.4091, + "step": 4855 + }, + { + "epoch": 0.14329942503317117, + "grad_norm": 13.4375, + "learning_rate": 1.988606922774149e-06, + "loss": 2.3741, + "step": 4860 + }, + { + "epoch": 0.1434468524251806, + "grad_norm": 16.375, + "learning_rate": 1.9885293300532623e-06, + "loss": 2.3865, + "step": 4865 + }, + { + "epoch": 0.14359427981719003, + "grad_norm": 13.625, + "learning_rate": 1.9884514755292236e-06, + "loss": 2.297, + "step": 4870 + }, + { + "epoch": 0.14374170720919946, + "grad_norm": 14.8125, + "learning_rate": 1.988373359222652e-06, + "loss": 2.3604, + "step": 4875 + }, + { + "epoch": 0.14388913460120892, + "grad_norm": 17.375, + "learning_rate": 1.9882949811542362e-06, + "loss": 2.2758, + "step": 4880 + }, + { + "epoch": 0.14403656199321835, + "grad_norm": 15.75, + "learning_rate": 1.9882163413447337e-06, + "loss": 2.4295, + "step": 4885 + }, + { + "epoch": 0.14418398938522778, + "grad_norm": 16.625, + "learning_rate": 1.9881374398149715e-06, + "loss": 2.4267, + "step": 4890 + }, + { + "epoch": 0.1443314167772372, + "grad_norm": 18.0, + "learning_rate": 1.988058276585847e-06, + "loss": 2.3796, + "step": 4895 + }, + { + "epoch": 0.14447884416924664, + "grad_norm": 32.0, + "learning_rate": 1.9879788516783242e-06, + "loss": 2.2386, + "step": 4900 + }, + { + "epoch": 0.14462627156125607, + "grad_norm": 18.0, + "learning_rate": 1.9878991651134388e-06, + "loss": 2.3415, + "step": 4905 + }, + { + "epoch": 0.14477369895326553, + "grad_norm": 13.75, + "learning_rate": 1.9878192169122957e-06, + "loss": 2.2535, + "step": 4910 + }, + { + "epoch": 0.14492112634527496, + "grad_norm": 17.75, + "learning_rate": 1.9877390070960677e-06, + "loss": 2.2624, + "step": 4915 + }, + { + "epoch": 0.1450685537372844, + "grad_norm": 17.125, + "learning_rate": 1.9876585356859977e-06, + "loss": 2.3719, + "step": 4920 + }, + { + "epoch": 0.14521598112929382, + "grad_norm": 25.125, + "learning_rate": 1.987577802703398e-06, + "loss": 2.3524, + "step": 4925 + }, + { + "epoch": 0.14536340852130325, + "grad_norm": 18.75, + "learning_rate": 1.98749680816965e-06, + "loss": 2.3082, + "step": 4930 + }, + { + "epoch": 0.14551083591331268, + "grad_norm": 29.125, + "learning_rate": 1.9874155521062047e-06, + "loss": 2.2639, + "step": 4935 + }, + { + "epoch": 0.14565826330532214, + "grad_norm": 13.375, + "learning_rate": 1.9873340345345816e-06, + "loss": 2.2192, + "step": 4940 + }, + { + "epoch": 0.14580569069733157, + "grad_norm": 15.875, + "learning_rate": 1.9872522554763698e-06, + "loss": 2.4222, + "step": 4945 + }, + { + "epoch": 0.145953118089341, + "grad_norm": 17.625, + "learning_rate": 1.987170214953228e-06, + "loss": 2.4073, + "step": 4950 + }, + { + "epoch": 0.14610054548135043, + "grad_norm": 16.0, + "learning_rate": 1.9870879129868842e-06, + "loss": 2.3876, + "step": 4955 + }, + { + "epoch": 0.14624797287335986, + "grad_norm": 14.25, + "learning_rate": 1.987005349599135e-06, + "loss": 2.3663, + "step": 4960 + }, + { + "epoch": 0.14639540026536932, + "grad_norm": 18.875, + "learning_rate": 1.9869225248118463e-06, + "loss": 2.5066, + "step": 4965 + }, + { + "epoch": 0.14654282765737875, + "grad_norm": 12.5, + "learning_rate": 1.9868394386469535e-06, + "loss": 2.2031, + "step": 4970 + }, + { + "epoch": 0.14669025504938818, + "grad_norm": 13.625, + "learning_rate": 1.986756091126462e-06, + "loss": 2.308, + "step": 4975 + }, + { + "epoch": 0.1468376824413976, + "grad_norm": 17.75, + "learning_rate": 1.986672482272445e-06, + "loss": 2.3624, + "step": 4980 + }, + { + "epoch": 0.14698510983340704, + "grad_norm": 16.875, + "learning_rate": 1.9865886121070463e-06, + "loss": 2.3513, + "step": 4985 + }, + { + "epoch": 0.14713253722541647, + "grad_norm": 12.4375, + "learning_rate": 1.986504480652477e-06, + "loss": 2.2523, + "step": 4990 + }, + { + "epoch": 0.14727996461742593, + "grad_norm": 13.6875, + "learning_rate": 1.986420087931019e-06, + "loss": 2.3941, + "step": 4995 + }, + { + "epoch": 0.14742739200943536, + "grad_norm": 15.8125, + "learning_rate": 1.9863354339650234e-06, + "loss": 2.2635, + "step": 5000 + }, + { + "epoch": 0.14742739200943536, + "eval_loss": 2.3236687183380127, + "eval_runtime": 4.7176, + "eval_samples_per_second": 83.94, + "eval_steps_per_second": 2.756, + "step": 5000 + } + ], + "logging_steps": 5, + "max_steps": 33915, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3986311890783961e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}