{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2948547840188707, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.948547840188707e-05, "grad_norm": 4288.0, "learning_rate": 5.89622641509434e-10, "loss": 5.8964, "step": 1 }, { "epoch": 0.00014742739200943535, "grad_norm": 5152.0, "learning_rate": 2.9481132075471697e-09, "loss": 7.2418, "step": 5 }, { "epoch": 0.0002948547840188707, "grad_norm": 3872.0, "learning_rate": 5.896226415094339e-09, "loss": 6.7926, "step": 10 }, { "epoch": 0.0004422821760283061, "grad_norm": 2704.0, "learning_rate": 8.844339622641509e-09, "loss": 7.0144, "step": 15 }, { "epoch": 0.0005897095680377414, "grad_norm": 5824.0, "learning_rate": 1.1792452830188679e-08, "loss": 7.431, "step": 20 }, { "epoch": 0.0007371369600471767, "grad_norm": 3488.0, "learning_rate": 1.4740566037735849e-08, "loss": 7.3504, "step": 25 }, { "epoch": 0.0008845643520566122, "grad_norm": 4448.0, "learning_rate": 1.7688679245283017e-08, "loss": 6.8362, "step": 30 }, { "epoch": 0.0010319917440660474, "grad_norm": 3984.0, "learning_rate": 2.0636792452830187e-08, "loss": 7.2579, "step": 35 }, { "epoch": 0.0011794191360754828, "grad_norm": 6112.0, "learning_rate": 2.3584905660377358e-08, "loss": 7.0144, "step": 40 }, { "epoch": 0.0013268465280849183, "grad_norm": 3536.0, "learning_rate": 2.6533018867924528e-08, "loss": 6.6585, "step": 45 }, { "epoch": 0.0014742739200943535, "grad_norm": 5184.0, "learning_rate": 2.9481132075471698e-08, "loss": 7.1249, "step": 50 }, { "epoch": 0.001621701312103789, "grad_norm": 4736.0, "learning_rate": 3.242924528301887e-08, "loss": 7.0672, "step": 55 }, { "epoch": 0.0017691287041132243, "grad_norm": 3888.0, "learning_rate": 3.5377358490566035e-08, "loss": 6.9418, "step": 60 }, { "epoch": 0.0019165560961226596, "grad_norm": 4512.0, "learning_rate": 3.83254716981132e-08, "loss": 6.7915, "step": 65 }, { "epoch": 0.0020639834881320948, "grad_norm": 4544.0, "learning_rate": 4.1273584905660375e-08, "loss": 6.9867, "step": 70 }, { "epoch": 0.0022114108801415304, "grad_norm": 4160.0, "learning_rate": 4.422169811320754e-08, "loss": 6.9295, "step": 75 }, { "epoch": 0.0023588382721509656, "grad_norm": 5024.0, "learning_rate": 4.7169811320754715e-08, "loss": 7.1398, "step": 80 }, { "epoch": 0.002506265664160401, "grad_norm": 6688.0, "learning_rate": 5.011792452830189e-08, "loss": 6.8533, "step": 85 }, { "epoch": 0.0026536930561698365, "grad_norm": 4128.0, "learning_rate": 5.3066037735849055e-08, "loss": 6.8373, "step": 90 }, { "epoch": 0.0028011204481792717, "grad_norm": 4512.0, "learning_rate": 5.601415094339622e-08, "loss": 6.9857, "step": 95 }, { "epoch": 0.002948547840188707, "grad_norm": 4704.0, "learning_rate": 5.8962264150943396e-08, "loss": 7.3043, "step": 100 }, { "epoch": 0.0030959752321981426, "grad_norm": 5824.0, "learning_rate": 6.191037735849057e-08, "loss": 7.4446, "step": 105 }, { "epoch": 0.003243402624207578, "grad_norm": 5600.0, "learning_rate": 6.485849056603774e-08, "loss": 6.4237, "step": 110 }, { "epoch": 0.003390830016217013, "grad_norm": 3360.0, "learning_rate": 6.78066037735849e-08, "loss": 7.1134, "step": 115 }, { "epoch": 0.0035382574082264487, "grad_norm": 5120.0, "learning_rate": 7.075471698113207e-08, "loss": 6.541, "step": 120 }, { "epoch": 0.003685684800235884, "grad_norm": 4016.0, "learning_rate": 7.370283018867925e-08, "loss": 7.1745, "step": 125 }, { "epoch": 0.003833112192245319, "grad_norm": 4096.0, "learning_rate": 7.66509433962264e-08, "loss": 6.6107, "step": 130 }, { "epoch": 0.003980539584254755, "grad_norm": 6784.0, "learning_rate": 7.959905660377358e-08, "loss": 7.1709, "step": 135 }, { "epoch": 0.0041279669762641896, "grad_norm": 4800.0, "learning_rate": 8.254716981132075e-08, "loss": 6.7303, "step": 140 }, { "epoch": 0.004275394368273625, "grad_norm": 5472.0, "learning_rate": 8.549528301886792e-08, "loss": 6.6023, "step": 145 }, { "epoch": 0.004422821760283061, "grad_norm": 5952.0, "learning_rate": 8.844339622641508e-08, "loss": 7.3574, "step": 150 }, { "epoch": 0.004570249152292496, "grad_norm": 4480.0, "learning_rate": 9.139150943396226e-08, "loss": 6.4739, "step": 155 }, { "epoch": 0.004717676544301931, "grad_norm": 2928.0, "learning_rate": 9.433962264150943e-08, "loss": 6.5303, "step": 160 }, { "epoch": 0.004865103936311367, "grad_norm": 4064.0, "learning_rate": 9.72877358490566e-08, "loss": 6.7455, "step": 165 }, { "epoch": 0.005012531328320802, "grad_norm": 2928.0, "learning_rate": 1.0023584905660378e-07, "loss": 6.2466, "step": 170 }, { "epoch": 0.005159958720330237, "grad_norm": 3536.0, "learning_rate": 1.0318396226415093e-07, "loss": 6.493, "step": 175 }, { "epoch": 0.005307386112339673, "grad_norm": 3792.0, "learning_rate": 1.0613207547169811e-07, "loss": 6.1627, "step": 180 }, { "epoch": 0.005454813504349108, "grad_norm": 5568.0, "learning_rate": 1.0908018867924528e-07, "loss": 6.7054, "step": 185 }, { "epoch": 0.0056022408963585435, "grad_norm": 5312.0, "learning_rate": 1.1202830188679244e-07, "loss": 6.3761, "step": 190 }, { "epoch": 0.005749668288367979, "grad_norm": 3888.0, "learning_rate": 1.1497641509433961e-07, "loss": 6.4957, "step": 195 }, { "epoch": 0.005897095680377414, "grad_norm": 4016.0, "learning_rate": 1.1792452830188679e-07, "loss": 6.3387, "step": 200 }, { "epoch": 0.0060445230723868495, "grad_norm": 3712.0, "learning_rate": 1.2087264150943396e-07, "loss": 6.8239, "step": 205 }, { "epoch": 0.006191950464396285, "grad_norm": 2224.0, "learning_rate": 1.2382075471698114e-07, "loss": 6.4956, "step": 210 }, { "epoch": 0.00633937785640572, "grad_norm": 4736.0, "learning_rate": 1.267688679245283e-07, "loss": 6.2678, "step": 215 }, { "epoch": 0.006486805248415156, "grad_norm": 4320.0, "learning_rate": 1.2971698113207547e-07, "loss": 6.4021, "step": 220 }, { "epoch": 0.006634232640424591, "grad_norm": 4032.0, "learning_rate": 1.3266509433962265e-07, "loss": 5.8306, "step": 225 }, { "epoch": 0.006781660032434026, "grad_norm": 3664.0, "learning_rate": 1.356132075471698e-07, "loss": 6.7351, "step": 230 }, { "epoch": 0.006929087424443462, "grad_norm": 2032.0, "learning_rate": 1.3856132075471696e-07, "loss": 6.3457, "step": 235 }, { "epoch": 0.007076514816452897, "grad_norm": 2784.0, "learning_rate": 1.4150943396226414e-07, "loss": 6.1769, "step": 240 }, { "epoch": 0.007223942208462332, "grad_norm": 1464.0, "learning_rate": 1.4445754716981132e-07, "loss": 5.8652, "step": 245 }, { "epoch": 0.007371369600471768, "grad_norm": 2008.0, "learning_rate": 1.474056603773585e-07, "loss": 6.0493, "step": 250 }, { "epoch": 0.007518796992481203, "grad_norm": 1040.0, "learning_rate": 1.5035377358490565e-07, "loss": 5.9201, "step": 255 }, { "epoch": 0.007666224384490638, "grad_norm": 1816.0, "learning_rate": 1.533018867924528e-07, "loss": 6.2471, "step": 260 }, { "epoch": 0.007813651776500074, "grad_norm": 1432.0, "learning_rate": 1.5624999999999999e-07, "loss": 5.8522, "step": 265 }, { "epoch": 0.00796107916850951, "grad_norm": 1816.0, "learning_rate": 1.5919811320754717e-07, "loss": 5.9513, "step": 270 }, { "epoch": 0.008108506560518945, "grad_norm": 520.0, "learning_rate": 1.6214622641509435e-07, "loss": 5.784, "step": 275 }, { "epoch": 0.008255933952528379, "grad_norm": 1272.0, "learning_rate": 1.650943396226415e-07, "loss": 5.6907, "step": 280 }, { "epoch": 0.008403361344537815, "grad_norm": 852.0, "learning_rate": 1.6804245283018868e-07, "loss": 5.6086, "step": 285 }, { "epoch": 0.00855078873654725, "grad_norm": 266.0, "learning_rate": 1.7099056603773583e-07, "loss": 5.3121, "step": 290 }, { "epoch": 0.008698216128556686, "grad_norm": 756.0, "learning_rate": 1.7393867924528301e-07, "loss": 5.6198, "step": 295 }, { "epoch": 0.008845643520566122, "grad_norm": 568.0, "learning_rate": 1.7688679245283017e-07, "loss": 5.5308, "step": 300 }, { "epoch": 0.008993070912575557, "grad_norm": 644.0, "learning_rate": 1.7983490566037735e-07, "loss": 5.522, "step": 305 }, { "epoch": 0.009140498304584991, "grad_norm": 596.0, "learning_rate": 1.8278301886792453e-07, "loss": 5.6054, "step": 310 }, { "epoch": 0.009287925696594427, "grad_norm": 384.0, "learning_rate": 1.857311320754717e-07, "loss": 5.5085, "step": 315 }, { "epoch": 0.009435353088603863, "grad_norm": 504.0, "learning_rate": 1.8867924528301886e-07, "loss": 5.6657, "step": 320 }, { "epoch": 0.009582780480613298, "grad_norm": 460.0, "learning_rate": 1.9162735849056601e-07, "loss": 5.7185, "step": 325 }, { "epoch": 0.009730207872622734, "grad_norm": 141.0, "learning_rate": 1.945754716981132e-07, "loss": 5.5482, "step": 330 }, { "epoch": 0.009877635264632168, "grad_norm": 560.0, "learning_rate": 1.9752358490566037e-07, "loss": 5.378, "step": 335 }, { "epoch": 0.010025062656641603, "grad_norm": 512.0, "learning_rate": 2.0047169811320755e-07, "loss": 5.6247, "step": 340 }, { "epoch": 0.010172490048651039, "grad_norm": 528.0, "learning_rate": 2.034198113207547e-07, "loss": 5.1692, "step": 345 }, { "epoch": 0.010319917440660475, "grad_norm": 624.0, "learning_rate": 2.0636792452830186e-07, "loss": 5.43, "step": 350 }, { "epoch": 0.01046734483266991, "grad_norm": 592.0, "learning_rate": 2.0931603773584904e-07, "loss": 5.2031, "step": 355 }, { "epoch": 0.010614772224679346, "grad_norm": 396.0, "learning_rate": 2.1226415094339622e-07, "loss": 5.1502, "step": 360 }, { "epoch": 0.01076219961668878, "grad_norm": 516.0, "learning_rate": 2.1521226415094338e-07, "loss": 5.826, "step": 365 }, { "epoch": 0.010909627008698216, "grad_norm": 760.0, "learning_rate": 2.1816037735849056e-07, "loss": 5.5087, "step": 370 }, { "epoch": 0.011057054400707651, "grad_norm": 420.0, "learning_rate": 2.2110849056603774e-07, "loss": 5.146, "step": 375 }, { "epoch": 0.011204481792717087, "grad_norm": 192.0, "learning_rate": 2.240566037735849e-07, "loss": 5.3402, "step": 380 }, { "epoch": 0.011351909184726523, "grad_norm": 370.0, "learning_rate": 2.2700471698113207e-07, "loss": 5.3036, "step": 385 }, { "epoch": 0.011499336576735958, "grad_norm": 352.0, "learning_rate": 2.2995283018867922e-07, "loss": 5.1917, "step": 390 }, { "epoch": 0.011646763968745392, "grad_norm": 414.0, "learning_rate": 2.329009433962264e-07, "loss": 5.0797, "step": 395 }, { "epoch": 0.011794191360754828, "grad_norm": 484.0, "learning_rate": 2.3584905660377358e-07, "loss": 5.296, "step": 400 }, { "epoch": 0.011941618752764263, "grad_norm": 508.0, "learning_rate": 2.3879716981132076e-07, "loss": 5.026, "step": 405 }, { "epoch": 0.012089046144773699, "grad_norm": 450.0, "learning_rate": 2.417452830188679e-07, "loss": 4.9075, "step": 410 }, { "epoch": 0.012236473536783135, "grad_norm": 382.0, "learning_rate": 2.4469339622641507e-07, "loss": 5.0301, "step": 415 }, { "epoch": 0.01238390092879257, "grad_norm": 384.0, "learning_rate": 2.476415094339623e-07, "loss": 4.9027, "step": 420 }, { "epoch": 0.012531328320802004, "grad_norm": 188.0, "learning_rate": 2.505896226415094e-07, "loss": 5.0141, "step": 425 }, { "epoch": 0.01267875571281144, "grad_norm": 157.0, "learning_rate": 2.535377358490566e-07, "loss": 4.5994, "step": 430 }, { "epoch": 0.012826183104820876, "grad_norm": 386.0, "learning_rate": 2.5648584905660374e-07, "loss": 4.8006, "step": 435 }, { "epoch": 0.012973610496830311, "grad_norm": 270.0, "learning_rate": 2.5943396226415094e-07, "loss": 4.4348, "step": 440 }, { "epoch": 0.013121037888839747, "grad_norm": 161.0, "learning_rate": 2.623820754716981e-07, "loss": 4.7565, "step": 445 }, { "epoch": 0.013268465280849183, "grad_norm": 186.0, "learning_rate": 2.653301886792453e-07, "loss": 4.7097, "step": 450 }, { "epoch": 0.013415892672858616, "grad_norm": 115.0, "learning_rate": 2.6827830188679246e-07, "loss": 4.3228, "step": 455 }, { "epoch": 0.013563320064868052, "grad_norm": 83.0, "learning_rate": 2.712264150943396e-07, "loss": 4.52, "step": 460 }, { "epoch": 0.013710747456877488, "grad_norm": 142.0, "learning_rate": 2.7417452830188676e-07, "loss": 4.4627, "step": 465 }, { "epoch": 0.013858174848886923, "grad_norm": 247.0, "learning_rate": 2.771226415094339e-07, "loss": 4.2847, "step": 470 }, { "epoch": 0.014005602240896359, "grad_norm": 59.0, "learning_rate": 2.800707547169811e-07, "loss": 4.5636, "step": 475 }, { "epoch": 0.014153029632905795, "grad_norm": 168.0, "learning_rate": 2.830188679245283e-07, "loss": 4.3123, "step": 480 }, { "epoch": 0.014300457024915229, "grad_norm": 106.0, "learning_rate": 2.8596698113207543e-07, "loss": 4.3122, "step": 485 }, { "epoch": 0.014447884416924664, "grad_norm": 97.0, "learning_rate": 2.8891509433962264e-07, "loss": 4.2267, "step": 490 }, { "epoch": 0.0145953118089341, "grad_norm": 73.0, "learning_rate": 2.918632075471698e-07, "loss": 4.1379, "step": 495 }, { "epoch": 0.014742739200943536, "grad_norm": 60.25, "learning_rate": 2.94811320754717e-07, "loss": 4.0958, "step": 500 }, { "epoch": 0.014742739200943536, "eval_loss": 5.273393630981445, "eval_runtime": 4.7076, "eval_samples_per_second": 84.119, "eval_steps_per_second": 2.761, "step": 500 }, { "epoch": 0.014890166592952971, "grad_norm": 77.0, "learning_rate": 2.9775943396226415e-07, "loss": 3.849, "step": 505 }, { "epoch": 0.015037593984962405, "grad_norm": 83.0, "learning_rate": 3.007075471698113e-07, "loss": 4.2059, "step": 510 }, { "epoch": 0.01518502137697184, "grad_norm": 120.0, "learning_rate": 3.0365566037735846e-07, "loss": 4.1134, "step": 515 }, { "epoch": 0.015332448768981276, "grad_norm": 58.25, "learning_rate": 3.066037735849056e-07, "loss": 3.9189, "step": 520 }, { "epoch": 0.015479876160990712, "grad_norm": 77.0, "learning_rate": 3.095518867924528e-07, "loss": 4.1662, "step": 525 }, { "epoch": 0.015627303553000148, "grad_norm": 120.5, "learning_rate": 3.1249999999999997e-07, "loss": 4.0004, "step": 530 }, { "epoch": 0.01577473094500958, "grad_norm": 51.75, "learning_rate": 3.154481132075472e-07, "loss": 4.1284, "step": 535 }, { "epoch": 0.01592215833701902, "grad_norm": 56.0, "learning_rate": 3.1839622641509433e-07, "loss": 4.1131, "step": 540 }, { "epoch": 0.016069585729028453, "grad_norm": 87.0, "learning_rate": 3.2134433962264154e-07, "loss": 4.0497, "step": 545 }, { "epoch": 0.01621701312103789, "grad_norm": 53.25, "learning_rate": 3.242924528301887e-07, "loss": 3.8763, "step": 550 }, { "epoch": 0.016364440513047324, "grad_norm": 80.5, "learning_rate": 3.272405660377358e-07, "loss": 4.1176, "step": 555 }, { "epoch": 0.016511867905056758, "grad_norm": 72.0, "learning_rate": 3.30188679245283e-07, "loss": 4.0426, "step": 560 }, { "epoch": 0.016659295297066196, "grad_norm": 52.25, "learning_rate": 3.3313679245283015e-07, "loss": 3.7246, "step": 565 }, { "epoch": 0.01680672268907563, "grad_norm": 50.0, "learning_rate": 3.3608490566037736e-07, "loss": 3.914, "step": 570 }, { "epoch": 0.016954150081085067, "grad_norm": 48.25, "learning_rate": 3.390330188679245e-07, "loss": 4.1672, "step": 575 }, { "epoch": 0.0171015774730945, "grad_norm": 56.5, "learning_rate": 3.4198113207547167e-07, "loss": 3.7885, "step": 580 }, { "epoch": 0.017249004865103935, "grad_norm": 130.0, "learning_rate": 3.449292452830189e-07, "loss": 3.9511, "step": 585 }, { "epoch": 0.017396432257113372, "grad_norm": 69.5, "learning_rate": 3.4787735849056603e-07, "loss": 4.0633, "step": 590 }, { "epoch": 0.017543859649122806, "grad_norm": 49.5, "learning_rate": 3.5082547169811323e-07, "loss": 3.801, "step": 595 }, { "epoch": 0.017691287041132243, "grad_norm": 38.0, "learning_rate": 3.5377358490566033e-07, "loss": 3.6217, "step": 600 }, { "epoch": 0.017838714433141677, "grad_norm": 35.75, "learning_rate": 3.567216981132075e-07, "loss": 3.9652, "step": 605 }, { "epoch": 0.017986141825151115, "grad_norm": 52.0, "learning_rate": 3.596698113207547e-07, "loss": 3.7182, "step": 610 }, { "epoch": 0.01813356921716055, "grad_norm": 58.5, "learning_rate": 3.6261792452830185e-07, "loss": 3.6423, "step": 615 }, { "epoch": 0.018280996609169983, "grad_norm": 46.25, "learning_rate": 3.6556603773584905e-07, "loss": 3.7992, "step": 620 }, { "epoch": 0.01842842400117942, "grad_norm": 42.25, "learning_rate": 3.685141509433962e-07, "loss": 3.8397, "step": 625 }, { "epoch": 0.018575851393188854, "grad_norm": 36.0, "learning_rate": 3.714622641509434e-07, "loss": 3.6173, "step": 630 }, { "epoch": 0.01872327878519829, "grad_norm": 49.75, "learning_rate": 3.7441037735849057e-07, "loss": 4.0317, "step": 635 }, { "epoch": 0.018870706177207725, "grad_norm": 38.5, "learning_rate": 3.773584905660377e-07, "loss": 3.7633, "step": 640 }, { "epoch": 0.01901813356921716, "grad_norm": 41.75, "learning_rate": 3.803066037735849e-07, "loss": 3.8188, "step": 645 }, { "epoch": 0.019165560961226596, "grad_norm": 63.5, "learning_rate": 3.8325471698113203e-07, "loss": 3.7537, "step": 650 }, { "epoch": 0.01931298835323603, "grad_norm": 49.25, "learning_rate": 3.8620283018867924e-07, "loss": 3.6774, "step": 655 }, { "epoch": 0.019460415745245468, "grad_norm": 39.0, "learning_rate": 3.891509433962264e-07, "loss": 3.7567, "step": 660 }, { "epoch": 0.0196078431372549, "grad_norm": 39.0, "learning_rate": 3.920990566037736e-07, "loss": 3.756, "step": 665 }, { "epoch": 0.019755270529264336, "grad_norm": 69.0, "learning_rate": 3.9504716981132075e-07, "loss": 3.6765, "step": 670 }, { "epoch": 0.019902697921273773, "grad_norm": 60.75, "learning_rate": 3.979952830188679e-07, "loss": 3.949, "step": 675 }, { "epoch": 0.020050125313283207, "grad_norm": 47.25, "learning_rate": 4.009433962264151e-07, "loss": 3.7218, "step": 680 }, { "epoch": 0.020197552705292644, "grad_norm": 93.5, "learning_rate": 4.038915094339622e-07, "loss": 3.6829, "step": 685 }, { "epoch": 0.020344980097302078, "grad_norm": 39.0, "learning_rate": 4.068396226415094e-07, "loss": 3.5724, "step": 690 }, { "epoch": 0.020492407489311516, "grad_norm": 62.0, "learning_rate": 4.0978773584905657e-07, "loss": 3.6344, "step": 695 }, { "epoch": 0.02063983488132095, "grad_norm": 196.0, "learning_rate": 4.127358490566037e-07, "loss": 3.5317, "step": 700 }, { "epoch": 0.020787262273330383, "grad_norm": 41.0, "learning_rate": 4.1568396226415093e-07, "loss": 3.9269, "step": 705 }, { "epoch": 0.02093468966533982, "grad_norm": 49.0, "learning_rate": 4.186320754716981e-07, "loss": 3.7463, "step": 710 }, { "epoch": 0.021082117057349255, "grad_norm": 61.75, "learning_rate": 4.215801886792453e-07, "loss": 3.426, "step": 715 }, { "epoch": 0.021229544449358692, "grad_norm": 40.25, "learning_rate": 4.2452830188679244e-07, "loss": 3.5511, "step": 720 }, { "epoch": 0.021376971841368126, "grad_norm": 60.25, "learning_rate": 4.2747641509433965e-07, "loss": 3.8714, "step": 725 }, { "epoch": 0.02152439923337756, "grad_norm": 67.0, "learning_rate": 4.3042452830188675e-07, "loss": 3.756, "step": 730 }, { "epoch": 0.021671826625386997, "grad_norm": 47.0, "learning_rate": 4.333726415094339e-07, "loss": 3.548, "step": 735 }, { "epoch": 0.02181925401739643, "grad_norm": 32.25, "learning_rate": 4.363207547169811e-07, "loss": 3.6436, "step": 740 }, { "epoch": 0.02196668140940587, "grad_norm": 54.75, "learning_rate": 4.3926886792452826e-07, "loss": 3.4437, "step": 745 }, { "epoch": 0.022114108801415303, "grad_norm": 55.75, "learning_rate": 4.4221698113207547e-07, "loss": 3.6474, "step": 750 }, { "epoch": 0.02226153619342474, "grad_norm": 49.0, "learning_rate": 4.451650943396226e-07, "loss": 3.7664, "step": 755 }, { "epoch": 0.022408963585434174, "grad_norm": 37.0, "learning_rate": 4.481132075471698e-07, "loss": 3.4262, "step": 760 }, { "epoch": 0.022556390977443608, "grad_norm": 72.0, "learning_rate": 4.51061320754717e-07, "loss": 3.5507, "step": 765 }, { "epoch": 0.022703818369453045, "grad_norm": 34.75, "learning_rate": 4.5400943396226414e-07, "loss": 3.6294, "step": 770 }, { "epoch": 0.02285124576146248, "grad_norm": 71.0, "learning_rate": 4.569575471698113e-07, "loss": 3.3822, "step": 775 }, { "epoch": 0.022998673153471916, "grad_norm": 57.5, "learning_rate": 4.5990566037735845e-07, "loss": 3.6301, "step": 780 }, { "epoch": 0.02314610054548135, "grad_norm": 44.5, "learning_rate": 4.6285377358490565e-07, "loss": 3.5542, "step": 785 }, { "epoch": 0.023293527937490784, "grad_norm": 30.25, "learning_rate": 4.658018867924528e-07, "loss": 3.4738, "step": 790 }, { "epoch": 0.02344095532950022, "grad_norm": 36.25, "learning_rate": 4.6874999999999996e-07, "loss": 3.6581, "step": 795 }, { "epoch": 0.023588382721509656, "grad_norm": 27.5, "learning_rate": 4.7169811320754717e-07, "loss": 3.4342, "step": 800 }, { "epoch": 0.023735810113519093, "grad_norm": 36.5, "learning_rate": 4.746462264150943e-07, "loss": 3.6826, "step": 805 }, { "epoch": 0.023883237505528527, "grad_norm": 41.75, "learning_rate": 4.775943396226415e-07, "loss": 3.485, "step": 810 }, { "epoch": 0.024030664897537964, "grad_norm": 171.0, "learning_rate": 4.805424528301887e-07, "loss": 3.5775, "step": 815 }, { "epoch": 0.024178092289547398, "grad_norm": 50.25, "learning_rate": 4.834905660377358e-07, "loss": 3.6154, "step": 820 }, { "epoch": 0.024325519681556832, "grad_norm": 28.875, "learning_rate": 4.86438679245283e-07, "loss": 3.6943, "step": 825 }, { "epoch": 0.02447294707356627, "grad_norm": 35.75, "learning_rate": 4.893867924528301e-07, "loss": 3.7264, "step": 830 }, { "epoch": 0.024620374465575703, "grad_norm": 46.5, "learning_rate": 4.923349056603773e-07, "loss": 3.4934, "step": 835 }, { "epoch": 0.02476780185758514, "grad_norm": 42.0, "learning_rate": 4.952830188679246e-07, "loss": 3.5727, "step": 840 }, { "epoch": 0.024915229249594575, "grad_norm": 27.5, "learning_rate": 4.982311320754717e-07, "loss": 3.3648, "step": 845 }, { "epoch": 0.02506265664160401, "grad_norm": 22.75, "learning_rate": 5.011792452830188e-07, "loss": 3.5408, "step": 850 }, { "epoch": 0.025210084033613446, "grad_norm": 44.75, "learning_rate": 5.04127358490566e-07, "loss": 3.7565, "step": 855 }, { "epoch": 0.02535751142562288, "grad_norm": 33.75, "learning_rate": 5.070754716981132e-07, "loss": 3.5236, "step": 860 }, { "epoch": 0.025504938817632317, "grad_norm": 79.0, "learning_rate": 5.100235849056603e-07, "loss": 3.4544, "step": 865 }, { "epoch": 0.02565236620964175, "grad_norm": 54.75, "learning_rate": 5.129716981132075e-07, "loss": 3.3027, "step": 870 }, { "epoch": 0.025799793601651185, "grad_norm": 37.0, "learning_rate": 5.159198113207547e-07, "loss": 3.2541, "step": 875 }, { "epoch": 0.025947220993660623, "grad_norm": 33.0, "learning_rate": 5.188679245283019e-07, "loss": 3.5423, "step": 880 }, { "epoch": 0.026094648385670056, "grad_norm": 31.125, "learning_rate": 5.21816037735849e-07, "loss": 3.5534, "step": 885 }, { "epoch": 0.026242075777679494, "grad_norm": 30.25, "learning_rate": 5.247641509433962e-07, "loss": 3.6136, "step": 890 }, { "epoch": 0.026389503169688928, "grad_norm": 29.125, "learning_rate": 5.277122641509433e-07, "loss": 3.3698, "step": 895 }, { "epoch": 0.026536930561698365, "grad_norm": 30.5, "learning_rate": 5.306603773584906e-07, "loss": 3.4885, "step": 900 }, { "epoch": 0.0266843579537078, "grad_norm": 24.375, "learning_rate": 5.336084905660378e-07, "loss": 3.3895, "step": 905 }, { "epoch": 0.026831785345717233, "grad_norm": 34.5, "learning_rate": 5.365566037735849e-07, "loss": 3.5425, "step": 910 }, { "epoch": 0.02697921273772667, "grad_norm": 53.5, "learning_rate": 5.395047169811321e-07, "loss": 3.5077, "step": 915 }, { "epoch": 0.027126640129736104, "grad_norm": 25.375, "learning_rate": 5.424528301886792e-07, "loss": 3.3913, "step": 920 }, { "epoch": 0.02727406752174554, "grad_norm": 27.75, "learning_rate": 5.454009433962265e-07, "loss": 3.467, "step": 925 }, { "epoch": 0.027421494913754976, "grad_norm": 53.0, "learning_rate": 5.483490566037735e-07, "loss": 3.7339, "step": 930 }, { "epoch": 0.02756892230576441, "grad_norm": 34.0, "learning_rate": 5.512971698113207e-07, "loss": 3.4162, "step": 935 }, { "epoch": 0.027716349697773847, "grad_norm": 26.125, "learning_rate": 5.542452830188678e-07, "loss": 3.4891, "step": 940 }, { "epoch": 0.02786377708978328, "grad_norm": 30.625, "learning_rate": 5.57193396226415e-07, "loss": 3.6314, "step": 945 }, { "epoch": 0.028011204481792718, "grad_norm": 26.5, "learning_rate": 5.601415094339622e-07, "loss": 3.5523, "step": 950 }, { "epoch": 0.028158631873802152, "grad_norm": 33.75, "learning_rate": 5.630896226415094e-07, "loss": 3.3893, "step": 955 }, { "epoch": 0.02830605926581159, "grad_norm": 32.0, "learning_rate": 5.660377358490566e-07, "loss": 3.3843, "step": 960 }, { "epoch": 0.028453486657821023, "grad_norm": 29.5, "learning_rate": 5.689858490566037e-07, "loss": 3.5737, "step": 965 }, { "epoch": 0.028600914049830457, "grad_norm": 71.0, "learning_rate": 5.719339622641509e-07, "loss": 3.2828, "step": 970 }, { "epoch": 0.028748341441839895, "grad_norm": 42.25, "learning_rate": 5.748820754716981e-07, "loss": 3.414, "step": 975 }, { "epoch": 0.02889576883384933, "grad_norm": 30.0, "learning_rate": 5.778301886792453e-07, "loss": 3.3214, "step": 980 }, { "epoch": 0.029043196225858766, "grad_norm": 24.0, "learning_rate": 5.807783018867924e-07, "loss": 3.3956, "step": 985 }, { "epoch": 0.0291906236178682, "grad_norm": 29.0, "learning_rate": 5.837264150943396e-07, "loss": 3.4536, "step": 990 }, { "epoch": 0.029338051009877634, "grad_norm": 73.0, "learning_rate": 5.866745283018868e-07, "loss": 3.8505, "step": 995 }, { "epoch": 0.02948547840188707, "grad_norm": 28.875, "learning_rate": 5.89622641509434e-07, "loss": 3.4702, "step": 1000 }, { "epoch": 0.02948547840188707, "eval_loss": 4.123366832733154, "eval_runtime": 4.7173, "eval_samples_per_second": 83.946, "eval_steps_per_second": 2.756, "step": 1000 }, { "epoch": 0.029632905793896505, "grad_norm": 40.5, "learning_rate": 5.925707547169812e-07, "loss": 3.4618, "step": 1005 }, { "epoch": 0.029780333185905943, "grad_norm": 31.0, "learning_rate": 5.955188679245283e-07, "loss": 3.3067, "step": 1010 }, { "epoch": 0.029927760577915376, "grad_norm": 25.5, "learning_rate": 5.984669811320755e-07, "loss": 3.2988, "step": 1015 }, { "epoch": 0.03007518796992481, "grad_norm": 24.875, "learning_rate": 6.014150943396226e-07, "loss": 3.4574, "step": 1020 }, { "epoch": 0.030222615361934248, "grad_norm": 34.0, "learning_rate": 6.043632075471698e-07, "loss": 3.4777, "step": 1025 }, { "epoch": 0.03037004275394368, "grad_norm": 21.375, "learning_rate": 6.073113207547169e-07, "loss": 3.2491, "step": 1030 }, { "epoch": 0.03051747014595312, "grad_norm": 32.75, "learning_rate": 6.102594339622641e-07, "loss": 3.4012, "step": 1035 }, { "epoch": 0.030664897537962553, "grad_norm": 20.625, "learning_rate": 6.132075471698112e-07, "loss": 3.2219, "step": 1040 }, { "epoch": 0.03081232492997199, "grad_norm": 28.625, "learning_rate": 6.161556603773585e-07, "loss": 3.4702, "step": 1045 }, { "epoch": 0.030959752321981424, "grad_norm": 32.5, "learning_rate": 6.191037735849056e-07, "loss": 3.2885, "step": 1050 }, { "epoch": 0.031107179713990858, "grad_norm": 28.0, "learning_rate": 6.220518867924528e-07, "loss": 3.414, "step": 1055 }, { "epoch": 0.031254607106000296, "grad_norm": 24.125, "learning_rate": 6.249999999999999e-07, "loss": 3.35, "step": 1060 }, { "epoch": 0.03140203449800973, "grad_norm": 26.25, "learning_rate": 6.279481132075471e-07, "loss": 3.4552, "step": 1065 }, { "epoch": 0.03154946189001916, "grad_norm": 78.0, "learning_rate": 6.308962264150944e-07, "loss": 3.2664, "step": 1070 }, { "epoch": 0.0316968892820286, "grad_norm": 27.375, "learning_rate": 6.338443396226415e-07, "loss": 3.4228, "step": 1075 }, { "epoch": 0.03184431667403804, "grad_norm": 32.25, "learning_rate": 6.367924528301887e-07, "loss": 3.2762, "step": 1080 }, { "epoch": 0.03199174406604747, "grad_norm": 25.875, "learning_rate": 6.397405660377358e-07, "loss": 3.2858, "step": 1085 }, { "epoch": 0.032139171458056906, "grad_norm": 27.875, "learning_rate": 6.426886792452831e-07, "loss": 3.3251, "step": 1090 }, { "epoch": 0.03228659885006634, "grad_norm": 21.5, "learning_rate": 6.456367924528302e-07, "loss": 3.3962, "step": 1095 }, { "epoch": 0.03243402624207578, "grad_norm": 20.5, "learning_rate": 6.485849056603774e-07, "loss": 3.4683, "step": 1100 }, { "epoch": 0.03258145363408521, "grad_norm": 23.75, "learning_rate": 6.515330188679244e-07, "loss": 3.2019, "step": 1105 }, { "epoch": 0.03272888102609465, "grad_norm": 55.5, "learning_rate": 6.544811320754716e-07, "loss": 3.4679, "step": 1110 }, { "epoch": 0.032876308418104086, "grad_norm": 36.75, "learning_rate": 6.574292452830188e-07, "loss": 3.3551, "step": 1115 }, { "epoch": 0.033023735810113516, "grad_norm": 23.5, "learning_rate": 6.60377358490566e-07, "loss": 3.3352, "step": 1120 }, { "epoch": 0.033171163202122954, "grad_norm": 31.5, "learning_rate": 6.633254716981132e-07, "loss": 3.4598, "step": 1125 }, { "epoch": 0.03331859059413239, "grad_norm": 36.0, "learning_rate": 6.662735849056603e-07, "loss": 3.5186, "step": 1130 }, { "epoch": 0.03346601798614183, "grad_norm": 34.75, "learning_rate": 6.692216981132075e-07, "loss": 3.3776, "step": 1135 }, { "epoch": 0.03361344537815126, "grad_norm": 45.0, "learning_rate": 6.721698113207547e-07, "loss": 3.4942, "step": 1140 }, { "epoch": 0.033760872770160696, "grad_norm": 21.0, "learning_rate": 6.751179245283019e-07, "loss": 3.4764, "step": 1145 }, { "epoch": 0.033908300162170134, "grad_norm": 21.875, "learning_rate": 6.78066037735849e-07, "loss": 3.3914, "step": 1150 }, { "epoch": 0.034055727554179564, "grad_norm": 22.375, "learning_rate": 6.810141509433962e-07, "loss": 3.2916, "step": 1155 }, { "epoch": 0.034203154946189, "grad_norm": 24.875, "learning_rate": 6.839622641509433e-07, "loss": 3.3966, "step": 1160 }, { "epoch": 0.03435058233819844, "grad_norm": 32.25, "learning_rate": 6.869103773584906e-07, "loss": 3.2619, "step": 1165 }, { "epoch": 0.03449800973020787, "grad_norm": 24.875, "learning_rate": 6.898584905660377e-07, "loss": 3.2777, "step": 1170 }, { "epoch": 0.03464543712221731, "grad_norm": 20.875, "learning_rate": 6.928066037735849e-07, "loss": 3.228, "step": 1175 }, { "epoch": 0.034792864514226744, "grad_norm": 24.0, "learning_rate": 6.957547169811321e-07, "loss": 3.2484, "step": 1180 }, { "epoch": 0.03494029190623618, "grad_norm": 21.5, "learning_rate": 6.987028301886792e-07, "loss": 3.2644, "step": 1185 }, { "epoch": 0.03508771929824561, "grad_norm": 21.0, "learning_rate": 7.016509433962265e-07, "loss": 3.0695, "step": 1190 }, { "epoch": 0.03523514669025505, "grad_norm": 35.25, "learning_rate": 7.045990566037735e-07, "loss": 3.3176, "step": 1195 }, { "epoch": 0.03538257408226449, "grad_norm": 19.25, "learning_rate": 7.075471698113207e-07, "loss": 3.2097, "step": 1200 }, { "epoch": 0.03553000147427392, "grad_norm": 25.875, "learning_rate": 7.104952830188678e-07, "loss": 3.3202, "step": 1205 }, { "epoch": 0.035677428866283355, "grad_norm": 27.25, "learning_rate": 7.13443396226415e-07, "loss": 3.1499, "step": 1210 }, { "epoch": 0.03582485625829279, "grad_norm": 34.25, "learning_rate": 7.163915094339622e-07, "loss": 3.308, "step": 1215 }, { "epoch": 0.03597228365030223, "grad_norm": 23.875, "learning_rate": 7.193396226415094e-07, "loss": 3.0438, "step": 1220 }, { "epoch": 0.03611971104231166, "grad_norm": 26.5, "learning_rate": 7.222877358490565e-07, "loss": 3.2286, "step": 1225 }, { "epoch": 0.0362671384343211, "grad_norm": 25.0, "learning_rate": 7.252358490566037e-07, "loss": 3.1494, "step": 1230 }, { "epoch": 0.036414565826330535, "grad_norm": 33.5, "learning_rate": 7.28183962264151e-07, "loss": 3.3187, "step": 1235 }, { "epoch": 0.036561993218339965, "grad_norm": 20.625, "learning_rate": 7.311320754716981e-07, "loss": 3.2469, "step": 1240 }, { "epoch": 0.0367094206103494, "grad_norm": 20.75, "learning_rate": 7.340801886792453e-07, "loss": 3.2862, "step": 1245 }, { "epoch": 0.03685684800235884, "grad_norm": 27.875, "learning_rate": 7.370283018867924e-07, "loss": 3.2839, "step": 1250 }, { "epoch": 0.03700427539436827, "grad_norm": 20.75, "learning_rate": 7.399764150943396e-07, "loss": 3.3897, "step": 1255 }, { "epoch": 0.03715170278637771, "grad_norm": 27.125, "learning_rate": 7.429245283018868e-07, "loss": 3.1699, "step": 1260 }, { "epoch": 0.037299130178387145, "grad_norm": 21.75, "learning_rate": 7.45872641509434e-07, "loss": 3.163, "step": 1265 }, { "epoch": 0.03744655757039658, "grad_norm": 21.625, "learning_rate": 7.488207547169811e-07, "loss": 3.244, "step": 1270 }, { "epoch": 0.03759398496240601, "grad_norm": 27.75, "learning_rate": 7.517688679245283e-07, "loss": 3.2307, "step": 1275 }, { "epoch": 0.03774141235441545, "grad_norm": 40.75, "learning_rate": 7.547169811320754e-07, "loss": 3.2905, "step": 1280 }, { "epoch": 0.03788883974642489, "grad_norm": 25.25, "learning_rate": 7.576650943396226e-07, "loss": 3.3318, "step": 1285 }, { "epoch": 0.03803626713843432, "grad_norm": 32.5, "learning_rate": 7.606132075471698e-07, "loss": 3.2618, "step": 1290 }, { "epoch": 0.038183694530443756, "grad_norm": 33.5, "learning_rate": 7.635613207547169e-07, "loss": 3.3272, "step": 1295 }, { "epoch": 0.03833112192245319, "grad_norm": 28.125, "learning_rate": 7.665094339622641e-07, "loss": 3.1578, "step": 1300 }, { "epoch": 0.03847854931446263, "grad_norm": 21.375, "learning_rate": 7.694575471698112e-07, "loss": 3.0873, "step": 1305 }, { "epoch": 0.03862597670647206, "grad_norm": 23.75, "learning_rate": 7.724056603773585e-07, "loss": 3.3165, "step": 1310 }, { "epoch": 0.0387734040984815, "grad_norm": 21.5, "learning_rate": 7.753537735849056e-07, "loss": 3.1587, "step": 1315 }, { "epoch": 0.038920831490490936, "grad_norm": 20.5, "learning_rate": 7.783018867924528e-07, "loss": 3.3898, "step": 1320 }, { "epoch": 0.039068258882500366, "grad_norm": 34.0, "learning_rate": 7.812499999999999e-07, "loss": 3.1372, "step": 1325 }, { "epoch": 0.0392156862745098, "grad_norm": 18.875, "learning_rate": 7.841981132075472e-07, "loss": 3.2379, "step": 1330 }, { "epoch": 0.03936311366651924, "grad_norm": 21.125, "learning_rate": 7.871462264150943e-07, "loss": 3.3005, "step": 1335 }, { "epoch": 0.03951054105852867, "grad_norm": 23.25, "learning_rate": 7.900943396226415e-07, "loss": 3.1946, "step": 1340 }, { "epoch": 0.03965796845053811, "grad_norm": 23.375, "learning_rate": 7.930424528301887e-07, "loss": 3.0714, "step": 1345 }, { "epoch": 0.039805395842547546, "grad_norm": 28.375, "learning_rate": 7.959905660377358e-07, "loss": 3.3614, "step": 1350 }, { "epoch": 0.03995282323455698, "grad_norm": 24.25, "learning_rate": 7.989386792452831e-07, "loss": 3.1599, "step": 1355 }, { "epoch": 0.040100250626566414, "grad_norm": 20.375, "learning_rate": 8.018867924528302e-07, "loss": 3.1418, "step": 1360 }, { "epoch": 0.04024767801857585, "grad_norm": 23.5, "learning_rate": 8.048349056603774e-07, "loss": 2.9725, "step": 1365 }, { "epoch": 0.04039510541058529, "grad_norm": 29.0, "learning_rate": 8.077830188679244e-07, "loss": 3.0553, "step": 1370 }, { "epoch": 0.04054253280259472, "grad_norm": 29.0, "learning_rate": 8.107311320754716e-07, "loss": 3.1955, "step": 1375 }, { "epoch": 0.040689960194604156, "grad_norm": 21.75, "learning_rate": 8.136792452830188e-07, "loss": 3.1639, "step": 1380 }, { "epoch": 0.040837387586613594, "grad_norm": 33.0, "learning_rate": 8.16627358490566e-07, "loss": 3.1981, "step": 1385 }, { "epoch": 0.04098481497862303, "grad_norm": 25.625, "learning_rate": 8.195754716981131e-07, "loss": 3.081, "step": 1390 }, { "epoch": 0.04113224237063246, "grad_norm": 24.0, "learning_rate": 8.225235849056603e-07, "loss": 3.0706, "step": 1395 }, { "epoch": 0.0412796697626419, "grad_norm": 23.0, "learning_rate": 8.254716981132074e-07, "loss": 3.2926, "step": 1400 }, { "epoch": 0.041427097154651336, "grad_norm": 23.375, "learning_rate": 8.284198113207547e-07, "loss": 3.1699, "step": 1405 }, { "epoch": 0.04157452454666077, "grad_norm": 30.0, "learning_rate": 8.313679245283019e-07, "loss": 3.145, "step": 1410 }, { "epoch": 0.041721951938670204, "grad_norm": 29.25, "learning_rate": 8.34316037735849e-07, "loss": 3.2222, "step": 1415 }, { "epoch": 0.04186937933067964, "grad_norm": 36.5, "learning_rate": 8.372641509433962e-07, "loss": 3.2383, "step": 1420 }, { "epoch": 0.04201680672268908, "grad_norm": 19.875, "learning_rate": 8.402122641509433e-07, "loss": 2.9331, "step": 1425 }, { "epoch": 0.04216423411469851, "grad_norm": 24.375, "learning_rate": 8.431603773584906e-07, "loss": 3.076, "step": 1430 }, { "epoch": 0.04231166150670795, "grad_norm": 24.125, "learning_rate": 8.461084905660377e-07, "loss": 3.1641, "step": 1435 }, { "epoch": 0.042459088898717384, "grad_norm": 21.375, "learning_rate": 8.490566037735849e-07, "loss": 3.3244, "step": 1440 }, { "epoch": 0.042606516290726815, "grad_norm": 21.125, "learning_rate": 8.52004716981132e-07, "loss": 3.1178, "step": 1445 }, { "epoch": 0.04275394368273625, "grad_norm": 27.0, "learning_rate": 8.549528301886793e-07, "loss": 3.1739, "step": 1450 }, { "epoch": 0.04290137107474569, "grad_norm": 21.625, "learning_rate": 8.579009433962265e-07, "loss": 3.2333, "step": 1455 }, { "epoch": 0.04304879846675512, "grad_norm": 31.25, "learning_rate": 8.608490566037735e-07, "loss": 3.2602, "step": 1460 }, { "epoch": 0.04319622585876456, "grad_norm": 22.0, "learning_rate": 8.637971698113207e-07, "loss": 3.2519, "step": 1465 }, { "epoch": 0.043343653250773995, "grad_norm": 24.625, "learning_rate": 8.667452830188678e-07, "loss": 3.0988, "step": 1470 }, { "epoch": 0.04349108064278343, "grad_norm": 28.125, "learning_rate": 8.696933962264151e-07, "loss": 3.1193, "step": 1475 }, { "epoch": 0.04363850803479286, "grad_norm": 20.625, "learning_rate": 8.726415094339622e-07, "loss": 3.2805, "step": 1480 }, { "epoch": 0.0437859354268023, "grad_norm": 18.875, "learning_rate": 8.755896226415094e-07, "loss": 3.2113, "step": 1485 }, { "epoch": 0.04393336281881174, "grad_norm": 17.625, "learning_rate": 8.785377358490565e-07, "loss": 3.2982, "step": 1490 }, { "epoch": 0.04408079021082117, "grad_norm": 17.375, "learning_rate": 8.814858490566037e-07, "loss": 3.187, "step": 1495 }, { "epoch": 0.044228217602830605, "grad_norm": 21.75, "learning_rate": 8.844339622641509e-07, "loss": 3.1377, "step": 1500 }, { "epoch": 0.044228217602830605, "eval_loss": 3.5992095470428467, "eval_runtime": 4.6937, "eval_samples_per_second": 84.368, "eval_steps_per_second": 2.77, "step": 1500 }, { "epoch": 0.04437564499484004, "grad_norm": 188.0, "learning_rate": 8.873820754716981e-07, "loss": 3.1779, "step": 1505 }, { "epoch": 0.04452307238684948, "grad_norm": 45.5, "learning_rate": 8.903301886792452e-07, "loss": 3.2703, "step": 1510 }, { "epoch": 0.04467049977885891, "grad_norm": 20.875, "learning_rate": 8.932783018867924e-07, "loss": 3.1839, "step": 1515 }, { "epoch": 0.04481792717086835, "grad_norm": 19.375, "learning_rate": 8.962264150943396e-07, "loss": 3.0066, "step": 1520 }, { "epoch": 0.044965354562877785, "grad_norm": 29.5, "learning_rate": 8.991745283018868e-07, "loss": 3.2096, "step": 1525 }, { "epoch": 0.045112781954887216, "grad_norm": 26.875, "learning_rate": 9.02122641509434e-07, "loss": 3.0473, "step": 1530 }, { "epoch": 0.04526020934689665, "grad_norm": 19.625, "learning_rate": 9.050707547169811e-07, "loss": 3.0378, "step": 1535 }, { "epoch": 0.04540763673890609, "grad_norm": 17.5, "learning_rate": 9.080188679245283e-07, "loss": 3.2259, "step": 1540 }, { "epoch": 0.04555506413091552, "grad_norm": 22.625, "learning_rate": 9.109669811320755e-07, "loss": 3.0831, "step": 1545 }, { "epoch": 0.04570249152292496, "grad_norm": 19.125, "learning_rate": 9.139150943396226e-07, "loss": 3.1252, "step": 1550 }, { "epoch": 0.045849918914934396, "grad_norm": 23.25, "learning_rate": 9.168632075471697e-07, "loss": 3.2609, "step": 1555 }, { "epoch": 0.04599734630694383, "grad_norm": 27.625, "learning_rate": 9.198113207547169e-07, "loss": 3.1041, "step": 1560 }, { "epoch": 0.04614477369895326, "grad_norm": 22.75, "learning_rate": 9.22759433962264e-07, "loss": 3.289, "step": 1565 }, { "epoch": 0.0462922010909627, "grad_norm": 19.125, "learning_rate": 9.257075471698113e-07, "loss": 3.2432, "step": 1570 }, { "epoch": 0.04643962848297214, "grad_norm": 16.5, "learning_rate": 9.286556603773585e-07, "loss": 3.0007, "step": 1575 }, { "epoch": 0.04658705587498157, "grad_norm": 24.5, "learning_rate": 9.316037735849056e-07, "loss": 3.2563, "step": 1580 }, { "epoch": 0.046734483266991006, "grad_norm": 19.75, "learning_rate": 9.345518867924528e-07, "loss": 3.0233, "step": 1585 }, { "epoch": 0.04688191065900044, "grad_norm": 24.0, "learning_rate": 9.374999999999999e-07, "loss": 2.9052, "step": 1590 }, { "epoch": 0.04702933805100988, "grad_norm": 21.375, "learning_rate": 9.404481132075472e-07, "loss": 3.157, "step": 1595 }, { "epoch": 0.04717676544301931, "grad_norm": 37.0, "learning_rate": 9.433962264150943e-07, "loss": 2.9625, "step": 1600 }, { "epoch": 0.04732419283502875, "grad_norm": 16.125, "learning_rate": 9.463443396226415e-07, "loss": 3.1684, "step": 1605 }, { "epoch": 0.047471620227038186, "grad_norm": 17.0, "learning_rate": 9.492924528301886e-07, "loss": 3.1345, "step": 1610 }, { "epoch": 0.047619047619047616, "grad_norm": 24.125, "learning_rate": 9.522405660377358e-07, "loss": 3.2707, "step": 1615 }, { "epoch": 0.047766475011057054, "grad_norm": 19.375, "learning_rate": 9.55188679245283e-07, "loss": 3.1682, "step": 1620 }, { "epoch": 0.04791390240306649, "grad_norm": 20.125, "learning_rate": 9.581367924528302e-07, "loss": 3.1835, "step": 1625 }, { "epoch": 0.04806132979507593, "grad_norm": 19.75, "learning_rate": 9.610849056603774e-07, "loss": 3.0946, "step": 1630 }, { "epoch": 0.04820875718708536, "grad_norm": 27.0, "learning_rate": 9.640330188679245e-07, "loss": 2.9732, "step": 1635 }, { "epoch": 0.048356184579094796, "grad_norm": 28.0, "learning_rate": 9.669811320754717e-07, "loss": 3.0809, "step": 1640 }, { "epoch": 0.048503611971104234, "grad_norm": 16.625, "learning_rate": 9.699292452830188e-07, "loss": 2.8662, "step": 1645 }, { "epoch": 0.048651039363113664, "grad_norm": 25.0, "learning_rate": 9.72877358490566e-07, "loss": 3.1653, "step": 1650 }, { "epoch": 0.0487984667551231, "grad_norm": 20.875, "learning_rate": 9.758254716981131e-07, "loss": 2.9102, "step": 1655 }, { "epoch": 0.04894589414713254, "grad_norm": 15.625, "learning_rate": 9.787735849056603e-07, "loss": 3.0904, "step": 1660 }, { "epoch": 0.04909332153914197, "grad_norm": 26.5, "learning_rate": 9.817216981132074e-07, "loss": 3.2188, "step": 1665 }, { "epoch": 0.04924074893115141, "grad_norm": 20.0, "learning_rate": 9.846698113207546e-07, "loss": 3.0935, "step": 1670 }, { "epoch": 0.049388176323160844, "grad_norm": 22.625, "learning_rate": 9.876179245283017e-07, "loss": 3.0112, "step": 1675 }, { "epoch": 0.04953560371517028, "grad_norm": 28.875, "learning_rate": 9.90566037735849e-07, "loss": 3.0347, "step": 1680 }, { "epoch": 0.04968303110717971, "grad_norm": 28.0, "learning_rate": 9.935141509433963e-07, "loss": 3.1383, "step": 1685 }, { "epoch": 0.04983045849918915, "grad_norm": 22.625, "learning_rate": 9.964622641509434e-07, "loss": 3.195, "step": 1690 }, { "epoch": 0.04997788589119859, "grad_norm": 17.625, "learning_rate": 9.994103773584906e-07, "loss": 3.0536, "step": 1695 }, { "epoch": 0.05012531328320802, "grad_norm": 20.875, "learning_rate": 1.0023584905660375e-06, "loss": 3.1255, "step": 1700 }, { "epoch": 0.050272740675217455, "grad_norm": 23.625, "learning_rate": 1.0053066037735849e-06, "loss": 3.0029, "step": 1705 }, { "epoch": 0.05042016806722689, "grad_norm": 21.875, "learning_rate": 1.008254716981132e-06, "loss": 3.0566, "step": 1710 }, { "epoch": 0.05056759545923633, "grad_norm": 22.875, "learning_rate": 1.0112028301886792e-06, "loss": 3.235, "step": 1715 }, { "epoch": 0.05071502285124576, "grad_norm": 21.25, "learning_rate": 1.0141509433962263e-06, "loss": 3.0852, "step": 1720 }, { "epoch": 0.0508624502432552, "grad_norm": 20.875, "learning_rate": 1.0170990566037737e-06, "loss": 2.9238, "step": 1725 }, { "epoch": 0.051009877635264635, "grad_norm": 21.0, "learning_rate": 1.0200471698113206e-06, "loss": 2.9247, "step": 1730 }, { "epoch": 0.051157305027274065, "grad_norm": 22.375, "learning_rate": 1.022995283018868e-06, "loss": 2.9932, "step": 1735 }, { "epoch": 0.0513047324192835, "grad_norm": 21.625, "learning_rate": 1.025943396226415e-06, "loss": 3.1733, "step": 1740 }, { "epoch": 0.05145215981129294, "grad_norm": 18.75, "learning_rate": 1.0288915094339623e-06, "loss": 3.0096, "step": 1745 }, { "epoch": 0.05159958720330237, "grad_norm": 16.5, "learning_rate": 1.0318396226415095e-06, "loss": 3.0647, "step": 1750 }, { "epoch": 0.05174701459531181, "grad_norm": 20.75, "learning_rate": 1.0347877358490566e-06, "loss": 3.0325, "step": 1755 }, { "epoch": 0.051894441987321245, "grad_norm": 16.875, "learning_rate": 1.0377358490566038e-06, "loss": 3.0077, "step": 1760 }, { "epoch": 0.05204186937933068, "grad_norm": 18.625, "learning_rate": 1.040683962264151e-06, "loss": 2.9983, "step": 1765 }, { "epoch": 0.05218929677134011, "grad_norm": 21.75, "learning_rate": 1.043632075471698e-06, "loss": 3.103, "step": 1770 }, { "epoch": 0.05233672416334955, "grad_norm": 19.375, "learning_rate": 1.0465801886792452e-06, "loss": 3.0497, "step": 1775 }, { "epoch": 0.05248415155535899, "grad_norm": 27.125, "learning_rate": 1.0495283018867924e-06, "loss": 3.037, "step": 1780 }, { "epoch": 0.05263157894736842, "grad_norm": 14.0625, "learning_rate": 1.0524764150943395e-06, "loss": 2.7835, "step": 1785 }, { "epoch": 0.052779006339377856, "grad_norm": 20.75, "learning_rate": 1.0554245283018867e-06, "loss": 3.0535, "step": 1790 }, { "epoch": 0.05292643373138729, "grad_norm": 19.375, "learning_rate": 1.0583726415094338e-06, "loss": 3.0061, "step": 1795 }, { "epoch": 0.05307386112339673, "grad_norm": 18.375, "learning_rate": 1.0613207547169812e-06, "loss": 2.9059, "step": 1800 }, { "epoch": 0.05322128851540616, "grad_norm": 16.875, "learning_rate": 1.0642688679245282e-06, "loss": 3.0449, "step": 1805 }, { "epoch": 0.0533687159074156, "grad_norm": 19.875, "learning_rate": 1.0672169811320755e-06, "loss": 2.905, "step": 1810 }, { "epoch": 0.053516143299425036, "grad_norm": 20.25, "learning_rate": 1.0701650943396225e-06, "loss": 3.2346, "step": 1815 }, { "epoch": 0.053663570691434466, "grad_norm": 17.875, "learning_rate": 1.0731132075471698e-06, "loss": 2.9078, "step": 1820 }, { "epoch": 0.0538109980834439, "grad_norm": 20.125, "learning_rate": 1.076061320754717e-06, "loss": 3.1477, "step": 1825 }, { "epoch": 0.05395842547545334, "grad_norm": 24.0, "learning_rate": 1.0790094339622641e-06, "loss": 2.8281, "step": 1830 }, { "epoch": 0.05410585286746277, "grad_norm": 20.625, "learning_rate": 1.0819575471698113e-06, "loss": 3.0376, "step": 1835 }, { "epoch": 0.05425328025947221, "grad_norm": 15.6875, "learning_rate": 1.0849056603773584e-06, "loss": 2.8431, "step": 1840 }, { "epoch": 0.054400707651481646, "grad_norm": 18.875, "learning_rate": 1.0878537735849056e-06, "loss": 2.9487, "step": 1845 }, { "epoch": 0.05454813504349108, "grad_norm": 30.75, "learning_rate": 1.090801886792453e-06, "loss": 2.9563, "step": 1850 }, { "epoch": 0.054695562435500514, "grad_norm": 23.125, "learning_rate": 1.09375e-06, "loss": 2.871, "step": 1855 }, { "epoch": 0.05484298982750995, "grad_norm": 17.0, "learning_rate": 1.096698113207547e-06, "loss": 3.029, "step": 1860 }, { "epoch": 0.05499041721951939, "grad_norm": 14.625, "learning_rate": 1.0996462264150942e-06, "loss": 2.898, "step": 1865 }, { "epoch": 0.05513784461152882, "grad_norm": 21.25, "learning_rate": 1.1025943396226414e-06, "loss": 2.8212, "step": 1870 }, { "epoch": 0.055285272003538256, "grad_norm": 17.75, "learning_rate": 1.1055424528301887e-06, "loss": 2.9897, "step": 1875 }, { "epoch": 0.055432699395547694, "grad_norm": 17.125, "learning_rate": 1.1084905660377357e-06, "loss": 2.8868, "step": 1880 }, { "epoch": 0.05558012678755713, "grad_norm": 19.25, "learning_rate": 1.111438679245283e-06, "loss": 3.0397, "step": 1885 }, { "epoch": 0.05572755417956656, "grad_norm": 19.75, "learning_rate": 1.11438679245283e-06, "loss": 2.8232, "step": 1890 }, { "epoch": 0.055874981571576, "grad_norm": 36.75, "learning_rate": 1.1173349056603773e-06, "loss": 2.9727, "step": 1895 }, { "epoch": 0.056022408963585436, "grad_norm": 39.25, "learning_rate": 1.1202830188679245e-06, "loss": 3.3028, "step": 1900 }, { "epoch": 0.05616983635559487, "grad_norm": 19.25, "learning_rate": 1.1232311320754717e-06, "loss": 3.1002, "step": 1905 }, { "epoch": 0.056317263747604304, "grad_norm": 24.0, "learning_rate": 1.1261792452830188e-06, "loss": 3.163, "step": 1910 }, { "epoch": 0.05646469113961374, "grad_norm": 23.375, "learning_rate": 1.129127358490566e-06, "loss": 2.9574, "step": 1915 }, { "epoch": 0.05661211853162318, "grad_norm": 62.0, "learning_rate": 1.1320754716981131e-06, "loss": 2.9711, "step": 1920 }, { "epoch": 0.05675954592363261, "grad_norm": 20.875, "learning_rate": 1.1350235849056605e-06, "loss": 2.9719, "step": 1925 }, { "epoch": 0.05690697331564205, "grad_norm": 21.0, "learning_rate": 1.1379716981132074e-06, "loss": 2.9451, "step": 1930 }, { "epoch": 0.057054400707651484, "grad_norm": 19.5, "learning_rate": 1.1409198113207548e-06, "loss": 3.0775, "step": 1935 }, { "epoch": 0.057201828099660915, "grad_norm": 18.0, "learning_rate": 1.1438679245283017e-06, "loss": 2.9611, "step": 1940 }, { "epoch": 0.05734925549167035, "grad_norm": 19.5, "learning_rate": 1.1468160377358489e-06, "loss": 2.8335, "step": 1945 }, { "epoch": 0.05749668288367979, "grad_norm": 19.125, "learning_rate": 1.1497641509433962e-06, "loss": 2.9033, "step": 1950 }, { "epoch": 0.05764411027568922, "grad_norm": 19.75, "learning_rate": 1.1527122641509432e-06, "loss": 2.957, "step": 1955 }, { "epoch": 0.05779153766769866, "grad_norm": 21.375, "learning_rate": 1.1556603773584906e-06, "loss": 3.0282, "step": 1960 }, { "epoch": 0.057938965059708095, "grad_norm": 26.375, "learning_rate": 1.1586084905660375e-06, "loss": 2.9575, "step": 1965 }, { "epoch": 0.05808639245171753, "grad_norm": 16.375, "learning_rate": 1.1615566037735849e-06, "loss": 2.8847, "step": 1970 }, { "epoch": 0.05823381984372696, "grad_norm": 21.625, "learning_rate": 1.164504716981132e-06, "loss": 2.8438, "step": 1975 }, { "epoch": 0.0583812472357364, "grad_norm": 26.375, "learning_rate": 1.1674528301886792e-06, "loss": 3.0832, "step": 1980 }, { "epoch": 0.05852867462774584, "grad_norm": 17.5, "learning_rate": 1.1704009433962263e-06, "loss": 3.0941, "step": 1985 }, { "epoch": 0.05867610201975527, "grad_norm": 16.25, "learning_rate": 1.1733490566037737e-06, "loss": 2.716, "step": 1990 }, { "epoch": 0.058823529411764705, "grad_norm": 71.0, "learning_rate": 1.1762971698113206e-06, "loss": 2.9973, "step": 1995 }, { "epoch": 0.05897095680377414, "grad_norm": 24.625, "learning_rate": 1.179245283018868e-06, "loss": 2.9267, "step": 2000 }, { "epoch": 0.05897095680377414, "eval_loss": 3.27067494392395, "eval_runtime": 4.6982, "eval_samples_per_second": 84.287, "eval_steps_per_second": 2.767, "step": 2000 }, { "epoch": 0.05911838419578358, "grad_norm": 16.5, "learning_rate": 1.182193396226415e-06, "loss": 2.7164, "step": 2005 }, { "epoch": 0.05926581158779301, "grad_norm": 14.8125, "learning_rate": 1.1851415094339623e-06, "loss": 2.8899, "step": 2010 }, { "epoch": 0.05941323897980245, "grad_norm": 18.75, "learning_rate": 1.1880896226415095e-06, "loss": 2.926, "step": 2015 }, { "epoch": 0.059560666371811885, "grad_norm": 23.125, "learning_rate": 1.1910377358490566e-06, "loss": 2.853, "step": 2020 }, { "epoch": 0.059708093763821316, "grad_norm": 17.75, "learning_rate": 1.1939858490566038e-06, "loss": 2.8584, "step": 2025 }, { "epoch": 0.05985552115583075, "grad_norm": 20.75, "learning_rate": 1.196933962264151e-06, "loss": 2.8365, "step": 2030 }, { "epoch": 0.06000294854784019, "grad_norm": 19.0, "learning_rate": 1.199882075471698e-06, "loss": 3.0571, "step": 2035 }, { "epoch": 0.06015037593984962, "grad_norm": 15.6875, "learning_rate": 1.2028301886792452e-06, "loss": 2.7813, "step": 2040 }, { "epoch": 0.06029780333185906, "grad_norm": 16.125, "learning_rate": 1.2057783018867924e-06, "loss": 2.744, "step": 2045 }, { "epoch": 0.060445230723868495, "grad_norm": 18.125, "learning_rate": 1.2087264150943395e-06, "loss": 2.8005, "step": 2050 }, { "epoch": 0.06059265811587793, "grad_norm": 22.875, "learning_rate": 1.2116745283018867e-06, "loss": 2.9454, "step": 2055 }, { "epoch": 0.06074008550788736, "grad_norm": 16.875, "learning_rate": 1.2146226415094338e-06, "loss": 3.0735, "step": 2060 }, { "epoch": 0.0608875128998968, "grad_norm": 40.75, "learning_rate": 1.2175707547169812e-06, "loss": 2.9832, "step": 2065 }, { "epoch": 0.06103494029190624, "grad_norm": 22.375, "learning_rate": 1.2205188679245281e-06, "loss": 3.0261, "step": 2070 }, { "epoch": 0.06118236768391567, "grad_norm": 19.375, "learning_rate": 1.2234669811320755e-06, "loss": 3.0268, "step": 2075 }, { "epoch": 0.061329795075925106, "grad_norm": 22.5, "learning_rate": 1.2264150943396225e-06, "loss": 2.9181, "step": 2080 }, { "epoch": 0.06147722246793454, "grad_norm": 43.75, "learning_rate": 1.2293632075471698e-06, "loss": 2.7782, "step": 2085 }, { "epoch": 0.06162464985994398, "grad_norm": 15.125, "learning_rate": 1.232311320754717e-06, "loss": 2.8703, "step": 2090 }, { "epoch": 0.06177207725195341, "grad_norm": 21.75, "learning_rate": 1.2352594339622641e-06, "loss": 2.972, "step": 2095 }, { "epoch": 0.06191950464396285, "grad_norm": 16.75, "learning_rate": 1.2382075471698113e-06, "loss": 2.8651, "step": 2100 }, { "epoch": 0.062066932035972286, "grad_norm": 23.375, "learning_rate": 1.2411556603773584e-06, "loss": 3.1965, "step": 2105 }, { "epoch": 0.062214359427981716, "grad_norm": 21.125, "learning_rate": 1.2441037735849056e-06, "loss": 2.9804, "step": 2110 }, { "epoch": 0.062361786819991154, "grad_norm": 23.125, "learning_rate": 1.247051886792453e-06, "loss": 2.9106, "step": 2115 }, { "epoch": 0.06250921421200059, "grad_norm": 19.5, "learning_rate": 1.2499999999999999e-06, "loss": 2.9487, "step": 2120 }, { "epoch": 0.06265664160401002, "grad_norm": 43.5, "learning_rate": 1.252948113207547e-06, "loss": 2.9301, "step": 2125 }, { "epoch": 0.06280406899601947, "grad_norm": 17.75, "learning_rate": 1.2558962264150942e-06, "loss": 2.8569, "step": 2130 }, { "epoch": 0.0629514963880289, "grad_norm": 14.625, "learning_rate": 1.2588443396226414e-06, "loss": 2.851, "step": 2135 }, { "epoch": 0.06309892378003833, "grad_norm": 19.0, "learning_rate": 1.2617924528301887e-06, "loss": 2.959, "step": 2140 }, { "epoch": 0.06324635117204777, "grad_norm": 16.0, "learning_rate": 1.2647405660377357e-06, "loss": 3.0111, "step": 2145 }, { "epoch": 0.0633937785640572, "grad_norm": 21.25, "learning_rate": 1.267688679245283e-06, "loss": 2.9492, "step": 2150 }, { "epoch": 0.06354120595606663, "grad_norm": 27.875, "learning_rate": 1.27063679245283e-06, "loss": 2.8556, "step": 2155 }, { "epoch": 0.06368863334807608, "grad_norm": 292.0, "learning_rate": 1.2735849056603773e-06, "loss": 3.0161, "step": 2160 }, { "epoch": 0.0638360607400855, "grad_norm": 27.625, "learning_rate": 1.2765330188679245e-06, "loss": 2.8943, "step": 2165 }, { "epoch": 0.06398348813209494, "grad_norm": 20.5, "learning_rate": 1.2794811320754716e-06, "loss": 2.9569, "step": 2170 }, { "epoch": 0.06413091552410438, "grad_norm": 16.375, "learning_rate": 1.2824292452830188e-06, "loss": 2.8933, "step": 2175 }, { "epoch": 0.06427834291611381, "grad_norm": 21.75, "learning_rate": 1.2853773584905662e-06, "loss": 2.8857, "step": 2180 }, { "epoch": 0.06442577030812324, "grad_norm": 25.375, "learning_rate": 1.288325471698113e-06, "loss": 2.7119, "step": 2185 }, { "epoch": 0.06457319770013269, "grad_norm": 15.5, "learning_rate": 1.2912735849056605e-06, "loss": 2.9744, "step": 2190 }, { "epoch": 0.06472062509214212, "grad_norm": 28.0, "learning_rate": 1.2942216981132074e-06, "loss": 2.8828, "step": 2195 }, { "epoch": 0.06486805248415156, "grad_norm": 17.625, "learning_rate": 1.2971698113207548e-06, "loss": 2.9426, "step": 2200 }, { "epoch": 0.06501547987616099, "grad_norm": 16.75, "learning_rate": 1.300117924528302e-06, "loss": 2.9991, "step": 2205 }, { "epoch": 0.06516290726817042, "grad_norm": 32.5, "learning_rate": 1.3030660377358489e-06, "loss": 2.9827, "step": 2210 }, { "epoch": 0.06531033466017987, "grad_norm": 33.0, "learning_rate": 1.3060141509433962e-06, "loss": 2.7431, "step": 2215 }, { "epoch": 0.0654577620521893, "grad_norm": 18.125, "learning_rate": 1.3089622641509432e-06, "loss": 2.8863, "step": 2220 }, { "epoch": 0.06560518944419873, "grad_norm": 16.625, "learning_rate": 1.3119103773584905e-06, "loss": 2.8273, "step": 2225 }, { "epoch": 0.06575261683620817, "grad_norm": 17.5, "learning_rate": 1.3148584905660377e-06, "loss": 2.8624, "step": 2230 }, { "epoch": 0.0659000442282176, "grad_norm": 21.0, "learning_rate": 1.3178066037735848e-06, "loss": 2.8465, "step": 2235 }, { "epoch": 0.06604747162022703, "grad_norm": 25.375, "learning_rate": 1.320754716981132e-06, "loss": 2.8271, "step": 2240 }, { "epoch": 0.06619489901223648, "grad_norm": 16.375, "learning_rate": 1.3237028301886792e-06, "loss": 2.8661, "step": 2245 }, { "epoch": 0.06634232640424591, "grad_norm": 15.75, "learning_rate": 1.3266509433962263e-06, "loss": 2.8245, "step": 2250 }, { "epoch": 0.06648975379625534, "grad_norm": 16.25, "learning_rate": 1.3295990566037737e-06, "loss": 2.8016, "step": 2255 }, { "epoch": 0.06663718118826478, "grad_norm": 85.5, "learning_rate": 1.3325471698113206e-06, "loss": 3.0405, "step": 2260 }, { "epoch": 0.06678460858027421, "grad_norm": 16.5, "learning_rate": 1.335495283018868e-06, "loss": 2.941, "step": 2265 }, { "epoch": 0.06693203597228366, "grad_norm": 18.375, "learning_rate": 1.338443396226415e-06, "loss": 2.9167, "step": 2270 }, { "epoch": 0.06707946336429309, "grad_norm": 28.25, "learning_rate": 1.3413915094339623e-06, "loss": 2.9053, "step": 2275 }, { "epoch": 0.06722689075630252, "grad_norm": 20.125, "learning_rate": 1.3443396226415094e-06, "loss": 2.9119, "step": 2280 }, { "epoch": 0.06737431814831196, "grad_norm": 26.75, "learning_rate": 1.3472877358490566e-06, "loss": 2.8339, "step": 2285 }, { "epoch": 0.06752174554032139, "grad_norm": 16.75, "learning_rate": 1.3502358490566037e-06, "loss": 2.8336, "step": 2290 }, { "epoch": 0.06766917293233082, "grad_norm": 14.875, "learning_rate": 1.353183962264151e-06, "loss": 2.7363, "step": 2295 }, { "epoch": 0.06781660032434027, "grad_norm": 15.625, "learning_rate": 1.356132075471698e-06, "loss": 2.8677, "step": 2300 }, { "epoch": 0.0679640277163497, "grad_norm": 18.125, "learning_rate": 1.3590801886792452e-06, "loss": 2.8287, "step": 2305 }, { "epoch": 0.06811145510835913, "grad_norm": 17.25, "learning_rate": 1.3620283018867924e-06, "loss": 2.8503, "step": 2310 }, { "epoch": 0.06825888250036857, "grad_norm": 18.5, "learning_rate": 1.3649764150943395e-06, "loss": 2.8452, "step": 2315 }, { "epoch": 0.068406309892378, "grad_norm": 20.125, "learning_rate": 1.3679245283018867e-06, "loss": 2.8188, "step": 2320 }, { "epoch": 0.06855373728438743, "grad_norm": 21.875, "learning_rate": 1.3708726415094338e-06, "loss": 2.7956, "step": 2325 }, { "epoch": 0.06870116467639688, "grad_norm": 24.875, "learning_rate": 1.3738207547169812e-06, "loss": 2.8185, "step": 2330 }, { "epoch": 0.06884859206840631, "grad_norm": 18.25, "learning_rate": 1.3767688679245281e-06, "loss": 2.7711, "step": 2335 }, { "epoch": 0.06899601946041574, "grad_norm": 20.875, "learning_rate": 1.3797169811320755e-06, "loss": 2.9115, "step": 2340 }, { "epoch": 0.06914344685242518, "grad_norm": 18.125, "learning_rate": 1.3826650943396224e-06, "loss": 2.8137, "step": 2345 }, { "epoch": 0.06929087424443461, "grad_norm": 25.5, "learning_rate": 1.3856132075471698e-06, "loss": 2.9505, "step": 2350 }, { "epoch": 0.06943830163644406, "grad_norm": 24.375, "learning_rate": 1.388561320754717e-06, "loss": 2.8544, "step": 2355 }, { "epoch": 0.06958572902845349, "grad_norm": 24.875, "learning_rate": 1.3915094339622641e-06, "loss": 2.9782, "step": 2360 }, { "epoch": 0.06973315642046292, "grad_norm": 19.125, "learning_rate": 1.3944575471698113e-06, "loss": 3.0521, "step": 2365 }, { "epoch": 0.06988058381247236, "grad_norm": 16.625, "learning_rate": 1.3974056603773584e-06, "loss": 2.7751, "step": 2370 }, { "epoch": 0.0700280112044818, "grad_norm": 21.375, "learning_rate": 1.4003537735849056e-06, "loss": 2.7304, "step": 2375 }, { "epoch": 0.07017543859649122, "grad_norm": 32.25, "learning_rate": 1.403301886792453e-06, "loss": 2.8306, "step": 2380 }, { "epoch": 0.07032286598850067, "grad_norm": 20.375, "learning_rate": 1.4062499999999999e-06, "loss": 2.9704, "step": 2385 }, { "epoch": 0.0704702933805101, "grad_norm": 27.875, "learning_rate": 1.409198113207547e-06, "loss": 2.8213, "step": 2390 }, { "epoch": 0.07061772077251953, "grad_norm": 27.875, "learning_rate": 1.4121462264150942e-06, "loss": 2.8982, "step": 2395 }, { "epoch": 0.07076514816452897, "grad_norm": 29.375, "learning_rate": 1.4150943396226413e-06, "loss": 2.8757, "step": 2400 }, { "epoch": 0.0709125755565384, "grad_norm": 18.125, "learning_rate": 1.4180424528301887e-06, "loss": 2.7914, "step": 2405 }, { "epoch": 0.07106000294854783, "grad_norm": 63.0, "learning_rate": 1.4209905660377356e-06, "loss": 2.7504, "step": 2410 }, { "epoch": 0.07120743034055728, "grad_norm": 16.0, "learning_rate": 1.423938679245283e-06, "loss": 2.8089, "step": 2415 }, { "epoch": 0.07135485773256671, "grad_norm": 19.125, "learning_rate": 1.42688679245283e-06, "loss": 2.7972, "step": 2420 }, { "epoch": 0.07150228512457614, "grad_norm": 15.0, "learning_rate": 1.4298349056603773e-06, "loss": 2.7051, "step": 2425 }, { "epoch": 0.07164971251658558, "grad_norm": 15.1875, "learning_rate": 1.4327830188679245e-06, "loss": 2.8048, "step": 2430 }, { "epoch": 0.07179713990859501, "grad_norm": 18.375, "learning_rate": 1.4357311320754716e-06, "loss": 2.7196, "step": 2435 }, { "epoch": 0.07194456730060446, "grad_norm": 16.375, "learning_rate": 1.4386792452830188e-06, "loss": 2.695, "step": 2440 }, { "epoch": 0.07209199469261389, "grad_norm": 18.25, "learning_rate": 1.4416273584905661e-06, "loss": 2.9735, "step": 2445 }, { "epoch": 0.07223942208462332, "grad_norm": 18.25, "learning_rate": 1.444575471698113e-06, "loss": 2.8195, "step": 2450 }, { "epoch": 0.07238684947663276, "grad_norm": 17.875, "learning_rate": 1.4475235849056605e-06, "loss": 2.7522, "step": 2455 }, { "epoch": 0.0725342768686422, "grad_norm": 17.125, "learning_rate": 1.4504716981132074e-06, "loss": 2.8065, "step": 2460 }, { "epoch": 0.07268170426065163, "grad_norm": 15.625, "learning_rate": 1.4534198113207548e-06, "loss": 2.8917, "step": 2465 }, { "epoch": 0.07282913165266107, "grad_norm": 19.125, "learning_rate": 1.456367924528302e-06, "loss": 2.8464, "step": 2470 }, { "epoch": 0.0729765590446705, "grad_norm": 16.375, "learning_rate": 1.4593160377358489e-06, "loss": 2.9117, "step": 2475 }, { "epoch": 0.07312398643667993, "grad_norm": 16.375, "learning_rate": 1.4622641509433962e-06, "loss": 2.6589, "step": 2480 }, { "epoch": 0.07327141382868937, "grad_norm": 21.75, "learning_rate": 1.4652122641509432e-06, "loss": 2.915, "step": 2485 }, { "epoch": 0.0734188412206988, "grad_norm": 16.875, "learning_rate": 1.4681603773584905e-06, "loss": 2.7783, "step": 2490 }, { "epoch": 0.07356626861270824, "grad_norm": 16.875, "learning_rate": 1.4711084905660377e-06, "loss": 2.7156, "step": 2495 }, { "epoch": 0.07371369600471768, "grad_norm": 17.0, "learning_rate": 1.4740566037735848e-06, "loss": 2.6785, "step": 2500 }, { "epoch": 0.07371369600471768, "eval_loss": 3.0259971618652344, "eval_runtime": 4.7168, "eval_samples_per_second": 83.956, "eval_steps_per_second": 2.756, "step": 2500 }, { "epoch": 0.07386112339672711, "grad_norm": 11.6875, "learning_rate": 1.477004716981132e-06, "loss": 2.6697, "step": 2505 }, { "epoch": 0.07400855078873654, "grad_norm": 19.125, "learning_rate": 1.4799528301886791e-06, "loss": 2.9352, "step": 2510 }, { "epoch": 0.07415597818074599, "grad_norm": 24.5, "learning_rate": 1.4829009433962263e-06, "loss": 2.8369, "step": 2515 }, { "epoch": 0.07430340557275542, "grad_norm": 33.25, "learning_rate": 1.4858490566037737e-06, "loss": 2.7892, "step": 2520 }, { "epoch": 0.07445083296476486, "grad_norm": 23.0, "learning_rate": 1.4887971698113206e-06, "loss": 2.6221, "step": 2525 }, { "epoch": 0.07459826035677429, "grad_norm": 17.5, "learning_rate": 1.491745283018868e-06, "loss": 2.7536, "step": 2530 }, { "epoch": 0.07474568774878372, "grad_norm": 20.0, "learning_rate": 1.494693396226415e-06, "loss": 2.8727, "step": 2535 }, { "epoch": 0.07489311514079317, "grad_norm": 13.6875, "learning_rate": 1.4976415094339623e-06, "loss": 2.6381, "step": 2540 }, { "epoch": 0.0750405425328026, "grad_norm": 24.625, "learning_rate": 1.5005896226415094e-06, "loss": 2.8, "step": 2545 }, { "epoch": 0.07518796992481203, "grad_norm": 21.875, "learning_rate": 1.5035377358490566e-06, "loss": 2.6405, "step": 2550 }, { "epoch": 0.07533539731682147, "grad_norm": 13.75, "learning_rate": 1.5064858490566037e-06, "loss": 2.7266, "step": 2555 }, { "epoch": 0.0754828247088309, "grad_norm": 15.25, "learning_rate": 1.5094339622641509e-06, "loss": 2.7492, "step": 2560 }, { "epoch": 0.07563025210084033, "grad_norm": 19.5, "learning_rate": 1.512382075471698e-06, "loss": 2.7432, "step": 2565 }, { "epoch": 0.07577767949284978, "grad_norm": 21.0, "learning_rate": 1.5153301886792452e-06, "loss": 2.8012, "step": 2570 }, { "epoch": 0.0759251068848592, "grad_norm": 18.5, "learning_rate": 1.5182783018867923e-06, "loss": 2.8801, "step": 2575 }, { "epoch": 0.07607253427686864, "grad_norm": 15.9375, "learning_rate": 1.5212264150943395e-06, "loss": 2.6939, "step": 2580 }, { "epoch": 0.07621996166887808, "grad_norm": 16.375, "learning_rate": 1.5241745283018867e-06, "loss": 2.7358, "step": 2585 }, { "epoch": 0.07636738906088751, "grad_norm": 15.375, "learning_rate": 1.5271226415094338e-06, "loss": 2.7957, "step": 2590 }, { "epoch": 0.07651481645289694, "grad_norm": 18.375, "learning_rate": 1.5300707547169812e-06, "loss": 2.8, "step": 2595 }, { "epoch": 0.07666224384490639, "grad_norm": 20.625, "learning_rate": 1.5330188679245281e-06, "loss": 2.7041, "step": 2600 }, { "epoch": 0.07680967123691582, "grad_norm": 17.75, "learning_rate": 1.5359669811320755e-06, "loss": 2.8524, "step": 2605 }, { "epoch": 0.07695709862892526, "grad_norm": 20.625, "learning_rate": 1.5389150943396224e-06, "loss": 2.7402, "step": 2610 }, { "epoch": 0.07710452602093469, "grad_norm": 21.25, "learning_rate": 1.5418632075471698e-06, "loss": 2.8244, "step": 2615 }, { "epoch": 0.07725195341294412, "grad_norm": 24.125, "learning_rate": 1.544811320754717e-06, "loss": 2.7226, "step": 2620 }, { "epoch": 0.07739938080495357, "grad_norm": 19.625, "learning_rate": 1.547759433962264e-06, "loss": 2.637, "step": 2625 }, { "epoch": 0.077546808196963, "grad_norm": 22.375, "learning_rate": 1.5507075471698112e-06, "loss": 2.6678, "step": 2630 }, { "epoch": 0.07769423558897243, "grad_norm": 19.875, "learning_rate": 1.5536556603773586e-06, "loss": 2.5859, "step": 2635 }, { "epoch": 0.07784166298098187, "grad_norm": 19.75, "learning_rate": 1.5566037735849056e-06, "loss": 2.5884, "step": 2640 }, { "epoch": 0.0779890903729913, "grad_norm": 15.25, "learning_rate": 1.559551886792453e-06, "loss": 2.8667, "step": 2645 }, { "epoch": 0.07813651776500073, "grad_norm": 17.0, "learning_rate": 1.5624999999999999e-06, "loss": 2.6761, "step": 2650 }, { "epoch": 0.07828394515701018, "grad_norm": 21.75, "learning_rate": 1.565448113207547e-06, "loss": 2.6761, "step": 2655 }, { "epoch": 0.0784313725490196, "grad_norm": 32.25, "learning_rate": 1.5683962264150944e-06, "loss": 2.7592, "step": 2660 }, { "epoch": 0.07857879994102904, "grad_norm": 43.0, "learning_rate": 1.5713443396226413e-06, "loss": 2.7657, "step": 2665 }, { "epoch": 0.07872622733303848, "grad_norm": 22.875, "learning_rate": 1.5742924528301887e-06, "loss": 2.8494, "step": 2670 }, { "epoch": 0.07887365472504791, "grad_norm": 18.25, "learning_rate": 1.5772405660377356e-06, "loss": 2.7864, "step": 2675 }, { "epoch": 0.07902108211705734, "grad_norm": 24.5, "learning_rate": 1.580188679245283e-06, "loss": 2.598, "step": 2680 }, { "epoch": 0.07916850950906679, "grad_norm": 27.25, "learning_rate": 1.5831367924528301e-06, "loss": 2.751, "step": 2685 }, { "epoch": 0.07931593690107622, "grad_norm": 17.625, "learning_rate": 1.5860849056603773e-06, "loss": 2.7719, "step": 2690 }, { "epoch": 0.07946336429308566, "grad_norm": 20.75, "learning_rate": 1.5890330188679245e-06, "loss": 2.6851, "step": 2695 }, { "epoch": 0.07961079168509509, "grad_norm": 16.25, "learning_rate": 1.5919811320754716e-06, "loss": 2.7268, "step": 2700 }, { "epoch": 0.07975821907710452, "grad_norm": 20.75, "learning_rate": 1.5949292452830188e-06, "loss": 2.7603, "step": 2705 }, { "epoch": 0.07990564646911397, "grad_norm": 17.875, "learning_rate": 1.5978773584905661e-06, "loss": 2.8362, "step": 2710 }, { "epoch": 0.0800530738611234, "grad_norm": 20.375, "learning_rate": 1.600825471698113e-06, "loss": 2.7863, "step": 2715 }, { "epoch": 0.08020050125313283, "grad_norm": 16.5, "learning_rate": 1.6037735849056604e-06, "loss": 2.6775, "step": 2720 }, { "epoch": 0.08034792864514227, "grad_norm": 20.625, "learning_rate": 1.6067216981132074e-06, "loss": 2.7235, "step": 2725 }, { "epoch": 0.0804953560371517, "grad_norm": 15.5625, "learning_rate": 1.6096698113207547e-06, "loss": 2.655, "step": 2730 }, { "epoch": 0.08064278342916113, "grad_norm": 23.625, "learning_rate": 1.612617924528302e-06, "loss": 2.8043, "step": 2735 }, { "epoch": 0.08079021082117058, "grad_norm": 23.375, "learning_rate": 1.6155660377358488e-06, "loss": 2.6753, "step": 2740 }, { "epoch": 0.08093763821318001, "grad_norm": 16.75, "learning_rate": 1.6185141509433962e-06, "loss": 2.7216, "step": 2745 }, { "epoch": 0.08108506560518944, "grad_norm": 29.75, "learning_rate": 1.6214622641509431e-06, "loss": 2.893, "step": 2750 }, { "epoch": 0.08123249299719888, "grad_norm": 18.625, "learning_rate": 1.6244103773584905e-06, "loss": 2.7302, "step": 2755 }, { "epoch": 0.08137992038920831, "grad_norm": 17.0, "learning_rate": 1.6273584905660377e-06, "loss": 2.7501, "step": 2760 }, { "epoch": 0.08152734778121776, "grad_norm": 19.5, "learning_rate": 1.6303066037735848e-06, "loss": 2.7171, "step": 2765 }, { "epoch": 0.08167477517322719, "grad_norm": 14.9375, "learning_rate": 1.633254716981132e-06, "loss": 2.7143, "step": 2770 }, { "epoch": 0.08182220256523662, "grad_norm": 21.625, "learning_rate": 1.6362028301886791e-06, "loss": 2.632, "step": 2775 }, { "epoch": 0.08196962995724606, "grad_norm": 17.25, "learning_rate": 1.6391509433962263e-06, "loss": 2.7041, "step": 2780 }, { "epoch": 0.08211705734925549, "grad_norm": 27.25, "learning_rate": 1.6420990566037736e-06, "loss": 2.8002, "step": 2785 }, { "epoch": 0.08226448474126492, "grad_norm": 31.25, "learning_rate": 1.6450471698113206e-06, "loss": 2.5959, "step": 2790 }, { "epoch": 0.08241191213327437, "grad_norm": 19.875, "learning_rate": 1.647995283018868e-06, "loss": 2.7606, "step": 2795 }, { "epoch": 0.0825593395252838, "grad_norm": 18.5, "learning_rate": 1.6509433962264149e-06, "loss": 3.0455, "step": 2800 }, { "epoch": 0.08270676691729323, "grad_norm": 22.875, "learning_rate": 1.6538915094339623e-06, "loss": 2.7094, "step": 2805 }, { "epoch": 0.08285419430930267, "grad_norm": 17.375, "learning_rate": 1.6568396226415094e-06, "loss": 2.4643, "step": 2810 }, { "epoch": 0.0830016217013121, "grad_norm": 22.0, "learning_rate": 1.6597877358490566e-06, "loss": 2.6835, "step": 2815 }, { "epoch": 0.08314904909332153, "grad_norm": 18.125, "learning_rate": 1.6627358490566037e-06, "loss": 2.6437, "step": 2820 }, { "epoch": 0.08329647648533098, "grad_norm": 22.625, "learning_rate": 1.6656839622641509e-06, "loss": 2.6712, "step": 2825 }, { "epoch": 0.08344390387734041, "grad_norm": 17.0, "learning_rate": 1.668632075471698e-06, "loss": 2.7535, "step": 2830 }, { "epoch": 0.08359133126934984, "grad_norm": 24.375, "learning_rate": 1.6715801886792452e-06, "loss": 2.6405, "step": 2835 }, { "epoch": 0.08373875866135928, "grad_norm": 16.875, "learning_rate": 1.6745283018867923e-06, "loss": 2.5578, "step": 2840 }, { "epoch": 0.08388618605336871, "grad_norm": 17.375, "learning_rate": 1.6774764150943395e-06, "loss": 2.662, "step": 2845 }, { "epoch": 0.08403361344537816, "grad_norm": 18.0, "learning_rate": 1.6804245283018866e-06, "loss": 2.5877, "step": 2850 }, { "epoch": 0.08418104083738759, "grad_norm": 22.75, "learning_rate": 1.6833726415094338e-06, "loss": 2.7346, "step": 2855 }, { "epoch": 0.08432846822939702, "grad_norm": 25.0, "learning_rate": 1.6863207547169812e-06, "loss": 2.7402, "step": 2860 }, { "epoch": 0.08447589562140646, "grad_norm": 19.0, "learning_rate": 1.689268867924528e-06, "loss": 2.6026, "step": 2865 }, { "epoch": 0.0846233230134159, "grad_norm": 23.625, "learning_rate": 1.6922169811320755e-06, "loss": 2.6259, "step": 2870 }, { "epoch": 0.08477075040542532, "grad_norm": 16.625, "learning_rate": 1.6951650943396224e-06, "loss": 2.67, "step": 2875 }, { "epoch": 0.08491817779743477, "grad_norm": 22.875, "learning_rate": 1.6981132075471698e-06, "loss": 2.4129, "step": 2880 }, { "epoch": 0.0850656051894442, "grad_norm": 129.0, "learning_rate": 1.701061320754717e-06, "loss": 2.8064, "step": 2885 }, { "epoch": 0.08521303258145363, "grad_norm": 13.625, "learning_rate": 1.704009433962264e-06, "loss": 2.6248, "step": 2890 }, { "epoch": 0.08536045997346307, "grad_norm": 16.125, "learning_rate": 1.7069575471698112e-06, "loss": 2.6568, "step": 2895 }, { "epoch": 0.0855078873654725, "grad_norm": 15.75, "learning_rate": 1.7099056603773586e-06, "loss": 2.7321, "step": 2900 }, { "epoch": 0.08565531475748193, "grad_norm": 34.5, "learning_rate": 1.7128537735849055e-06, "loss": 2.6696, "step": 2905 }, { "epoch": 0.08580274214949138, "grad_norm": 18.375, "learning_rate": 1.715801886792453e-06, "loss": 2.7116, "step": 2910 }, { "epoch": 0.08595016954150081, "grad_norm": 16.125, "learning_rate": 1.7187499999999998e-06, "loss": 2.66, "step": 2915 }, { "epoch": 0.08609759693351024, "grad_norm": 18.25, "learning_rate": 1.721698113207547e-06, "loss": 2.7062, "step": 2920 }, { "epoch": 0.08624502432551968, "grad_norm": 19.75, "learning_rate": 1.7246462264150944e-06, "loss": 2.667, "step": 2925 }, { "epoch": 0.08639245171752911, "grad_norm": 13.125, "learning_rate": 1.7275943396226413e-06, "loss": 2.6131, "step": 2930 }, { "epoch": 0.08653987910953856, "grad_norm": 20.25, "learning_rate": 1.7305424528301887e-06, "loss": 2.6455, "step": 2935 }, { "epoch": 0.08668730650154799, "grad_norm": 18.625, "learning_rate": 1.7334905660377356e-06, "loss": 2.6537, "step": 2940 }, { "epoch": 0.08683473389355742, "grad_norm": 16.75, "learning_rate": 1.736438679245283e-06, "loss": 2.6189, "step": 2945 }, { "epoch": 0.08698216128556686, "grad_norm": 19.5, "learning_rate": 1.7393867924528301e-06, "loss": 2.6757, "step": 2950 }, { "epoch": 0.0871295886775763, "grad_norm": 20.5, "learning_rate": 1.7423349056603773e-06, "loss": 2.613, "step": 2955 }, { "epoch": 0.08727701606958572, "grad_norm": 23.125, "learning_rate": 1.7452830188679244e-06, "loss": 2.6377, "step": 2960 }, { "epoch": 0.08742444346159517, "grad_norm": 23.25, "learning_rate": 1.7482311320754716e-06, "loss": 2.7463, "step": 2965 }, { "epoch": 0.0875718708536046, "grad_norm": 69.5, "learning_rate": 1.7511792452830188e-06, "loss": 2.7704, "step": 2970 }, { "epoch": 0.08771929824561403, "grad_norm": 15.8125, "learning_rate": 1.7541273584905661e-06, "loss": 2.593, "step": 2975 }, { "epoch": 0.08786672563762347, "grad_norm": 53.25, "learning_rate": 1.757075471698113e-06, "loss": 2.5997, "step": 2980 }, { "epoch": 0.0880141530296329, "grad_norm": 33.75, "learning_rate": 1.7600235849056604e-06, "loss": 2.7404, "step": 2985 }, { "epoch": 0.08816158042164234, "grad_norm": 19.5, "learning_rate": 1.7629716981132074e-06, "loss": 2.5596, "step": 2990 }, { "epoch": 0.08830900781365178, "grad_norm": 24.625, "learning_rate": 1.7659198113207547e-06, "loss": 2.6808, "step": 2995 }, { "epoch": 0.08845643520566121, "grad_norm": 20.75, "learning_rate": 1.7688679245283019e-06, "loss": 2.534, "step": 3000 }, { "epoch": 0.08845643520566121, "eval_loss": 2.8255984783172607, "eval_runtime": 4.7091, "eval_samples_per_second": 84.093, "eval_steps_per_second": 2.761, "step": 3000 }, { "epoch": 0.08860386259767064, "grad_norm": 15.75, "learning_rate": 1.7718160377358488e-06, "loss": 2.6962, "step": 3005 }, { "epoch": 0.08875128998968008, "grad_norm": 19.125, "learning_rate": 1.7747641509433962e-06, "loss": 2.5769, "step": 3010 }, { "epoch": 0.08889871738168952, "grad_norm": 23.0, "learning_rate": 1.7777122641509431e-06, "loss": 2.5602, "step": 3015 }, { "epoch": 0.08904614477369896, "grad_norm": 16.0, "learning_rate": 1.7806603773584905e-06, "loss": 2.5782, "step": 3020 }, { "epoch": 0.08919357216570839, "grad_norm": 19.375, "learning_rate": 1.7836084905660377e-06, "loss": 2.6285, "step": 3025 }, { "epoch": 0.08934099955771782, "grad_norm": 21.25, "learning_rate": 1.7865566037735848e-06, "loss": 2.6416, "step": 3030 }, { "epoch": 0.08948842694972726, "grad_norm": 19.5, "learning_rate": 1.789504716981132e-06, "loss": 2.5563, "step": 3035 }, { "epoch": 0.0896358543417367, "grad_norm": 19.5, "learning_rate": 1.7924528301886791e-06, "loss": 2.6188, "step": 3040 }, { "epoch": 0.08978328173374613, "grad_norm": 15.75, "learning_rate": 1.7954009433962263e-06, "loss": 2.6018, "step": 3045 }, { "epoch": 0.08993070912575557, "grad_norm": 19.625, "learning_rate": 1.7983490566037736e-06, "loss": 2.6601, "step": 3050 }, { "epoch": 0.090078136517765, "grad_norm": 22.375, "learning_rate": 1.8012971698113206e-06, "loss": 2.6206, "step": 3055 }, { "epoch": 0.09022556390977443, "grad_norm": 25.625, "learning_rate": 1.804245283018868e-06, "loss": 2.7032, "step": 3060 }, { "epoch": 0.09037299130178388, "grad_norm": 16.75, "learning_rate": 1.8071933962264149e-06, "loss": 2.6075, "step": 3065 }, { "epoch": 0.0905204186937933, "grad_norm": 18.875, "learning_rate": 1.8101415094339622e-06, "loss": 2.7258, "step": 3070 }, { "epoch": 0.09066784608580274, "grad_norm": 23.375, "learning_rate": 1.8130896226415094e-06, "loss": 2.7048, "step": 3075 }, { "epoch": 0.09081527347781218, "grad_norm": 19.75, "learning_rate": 1.8160377358490566e-06, "loss": 2.4501, "step": 3080 }, { "epoch": 0.09096270086982161, "grad_norm": 19.125, "learning_rate": 1.8189858490566037e-06, "loss": 2.5144, "step": 3085 }, { "epoch": 0.09111012826183104, "grad_norm": 17.375, "learning_rate": 1.821933962264151e-06, "loss": 2.7287, "step": 3090 }, { "epoch": 0.09125755565384049, "grad_norm": 15.125, "learning_rate": 1.824882075471698e-06, "loss": 2.6906, "step": 3095 }, { "epoch": 0.09140498304584992, "grad_norm": 16.875, "learning_rate": 1.8278301886792452e-06, "loss": 2.6328, "step": 3100 }, { "epoch": 0.09155241043785936, "grad_norm": 28.375, "learning_rate": 1.8307783018867923e-06, "loss": 2.7216, "step": 3105 }, { "epoch": 0.09169983782986879, "grad_norm": 20.375, "learning_rate": 1.8337264150943395e-06, "loss": 2.6709, "step": 3110 }, { "epoch": 0.09184726522187822, "grad_norm": 17.625, "learning_rate": 1.8366745283018868e-06, "loss": 2.6247, "step": 3115 }, { "epoch": 0.09199469261388767, "grad_norm": 14.625, "learning_rate": 1.8396226415094338e-06, "loss": 2.704, "step": 3120 }, { "epoch": 0.0921421200058971, "grad_norm": 20.5, "learning_rate": 1.8425707547169811e-06, "loss": 2.6575, "step": 3125 }, { "epoch": 0.09228954739790653, "grad_norm": 16.75, "learning_rate": 1.845518867924528e-06, "loss": 2.6053, "step": 3130 }, { "epoch": 0.09243697478991597, "grad_norm": 18.875, "learning_rate": 1.8484669811320755e-06, "loss": 2.6402, "step": 3135 }, { "epoch": 0.0925844021819254, "grad_norm": 24.375, "learning_rate": 1.8514150943396226e-06, "loss": 2.6393, "step": 3140 }, { "epoch": 0.09273182957393483, "grad_norm": 23.125, "learning_rate": 1.8543632075471698e-06, "loss": 2.7078, "step": 3145 }, { "epoch": 0.09287925696594428, "grad_norm": 18.5, "learning_rate": 1.857311320754717e-06, "loss": 2.6088, "step": 3150 }, { "epoch": 0.0930266843579537, "grad_norm": 17.0, "learning_rate": 1.860259433962264e-06, "loss": 2.6411, "step": 3155 }, { "epoch": 0.09317411174996314, "grad_norm": 21.25, "learning_rate": 1.8632075471698112e-06, "loss": 2.5609, "step": 3160 }, { "epoch": 0.09332153914197258, "grad_norm": 19.0, "learning_rate": 1.8661556603773586e-06, "loss": 2.7937, "step": 3165 }, { "epoch": 0.09346896653398201, "grad_norm": 17.75, "learning_rate": 1.8691037735849055e-06, "loss": 2.5774, "step": 3170 }, { "epoch": 0.09361639392599144, "grad_norm": 29.125, "learning_rate": 1.8720518867924529e-06, "loss": 2.6118, "step": 3175 }, { "epoch": 0.09376382131800089, "grad_norm": 34.0, "learning_rate": 1.8749999999999998e-06, "loss": 2.7062, "step": 3180 }, { "epoch": 0.09391124871001032, "grad_norm": 21.375, "learning_rate": 1.877948113207547e-06, "loss": 2.4245, "step": 3185 }, { "epoch": 0.09405867610201976, "grad_norm": 21.0, "learning_rate": 1.8808962264150944e-06, "loss": 2.575, "step": 3190 }, { "epoch": 0.09420610349402919, "grad_norm": 19.75, "learning_rate": 1.8838443396226413e-06, "loss": 2.6233, "step": 3195 }, { "epoch": 0.09435353088603862, "grad_norm": 17.0, "learning_rate": 1.8867924528301887e-06, "loss": 2.63, "step": 3200 }, { "epoch": 0.09450095827804807, "grad_norm": 15.5, "learning_rate": 1.8897405660377356e-06, "loss": 2.4527, "step": 3205 }, { "epoch": 0.0946483856700575, "grad_norm": 28.125, "learning_rate": 1.892688679245283e-06, "loss": 2.6324, "step": 3210 }, { "epoch": 0.09479581306206693, "grad_norm": 13.5625, "learning_rate": 1.8956367924528301e-06, "loss": 2.5413, "step": 3215 }, { "epoch": 0.09494324045407637, "grad_norm": 20.625, "learning_rate": 1.8985849056603773e-06, "loss": 2.7314, "step": 3220 }, { "epoch": 0.0950906678460858, "grad_norm": 18.375, "learning_rate": 1.9015330188679244e-06, "loss": 2.5598, "step": 3225 }, { "epoch": 0.09523809523809523, "grad_norm": 25.75, "learning_rate": 1.9044811320754716e-06, "loss": 2.5823, "step": 3230 }, { "epoch": 0.09538552263010468, "grad_norm": 14.375, "learning_rate": 1.907429245283019e-06, "loss": 2.6562, "step": 3235 }, { "epoch": 0.09553295002211411, "grad_norm": 24.75, "learning_rate": 1.910377358490566e-06, "loss": 2.6288, "step": 3240 }, { "epoch": 0.09568037741412354, "grad_norm": 17.625, "learning_rate": 1.9133254716981133e-06, "loss": 2.5194, "step": 3245 }, { "epoch": 0.09582780480613298, "grad_norm": 15.4375, "learning_rate": 1.9162735849056604e-06, "loss": 2.5139, "step": 3250 }, { "epoch": 0.09597523219814241, "grad_norm": 20.375, "learning_rate": 1.9192216981132076e-06, "loss": 2.6683, "step": 3255 }, { "epoch": 0.09612265959015186, "grad_norm": 24.375, "learning_rate": 1.9221698113207547e-06, "loss": 2.4635, "step": 3260 }, { "epoch": 0.09627008698216129, "grad_norm": 29.75, "learning_rate": 1.925117924528302e-06, "loss": 2.6588, "step": 3265 }, { "epoch": 0.09641751437417072, "grad_norm": 15.375, "learning_rate": 1.928066037735849e-06, "loss": 2.4663, "step": 3270 }, { "epoch": 0.09656494176618016, "grad_norm": 17.25, "learning_rate": 1.931014150943396e-06, "loss": 2.5586, "step": 3275 }, { "epoch": 0.09671236915818959, "grad_norm": 16.75, "learning_rate": 1.9339622641509433e-06, "loss": 2.4494, "step": 3280 }, { "epoch": 0.09685979655019902, "grad_norm": 17.75, "learning_rate": 1.9369103773584905e-06, "loss": 2.513, "step": 3285 }, { "epoch": 0.09700722394220847, "grad_norm": 18.375, "learning_rate": 1.9398584905660376e-06, "loss": 2.5481, "step": 3290 }, { "epoch": 0.0971546513342179, "grad_norm": 15.1875, "learning_rate": 1.942806603773585e-06, "loss": 2.5519, "step": 3295 }, { "epoch": 0.09730207872622733, "grad_norm": 56.5, "learning_rate": 1.945754716981132e-06, "loss": 2.4955, "step": 3300 }, { "epoch": 0.09744950611823677, "grad_norm": 20.25, "learning_rate": 1.948702830188679e-06, "loss": 2.5694, "step": 3305 }, { "epoch": 0.0975969335102462, "grad_norm": 29.875, "learning_rate": 1.9516509433962263e-06, "loss": 2.5207, "step": 3310 }, { "epoch": 0.09774436090225563, "grad_norm": 17.0, "learning_rate": 1.9545990566037734e-06, "loss": 2.5684, "step": 3315 }, { "epoch": 0.09789178829426508, "grad_norm": 18.375, "learning_rate": 1.9575471698113206e-06, "loss": 2.5669, "step": 3320 }, { "epoch": 0.09803921568627451, "grad_norm": 18.875, "learning_rate": 1.9604952830188677e-06, "loss": 2.4171, "step": 3325 }, { "epoch": 0.09818664307828394, "grad_norm": 77.5, "learning_rate": 1.963443396226415e-06, "loss": 2.6574, "step": 3330 }, { "epoch": 0.09833407047029338, "grad_norm": 84.5, "learning_rate": 1.9663915094339624e-06, "loss": 2.574, "step": 3335 }, { "epoch": 0.09848149786230281, "grad_norm": 28.125, "learning_rate": 1.969339622641509e-06, "loss": 2.6212, "step": 3340 }, { "epoch": 0.09862892525431226, "grad_norm": 30.375, "learning_rate": 1.9722877358490568e-06, "loss": 2.421, "step": 3345 }, { "epoch": 0.09877635264632169, "grad_norm": 22.875, "learning_rate": 1.9752358490566035e-06, "loss": 2.5433, "step": 3350 }, { "epoch": 0.09892378003833112, "grad_norm": 24.0, "learning_rate": 1.978183962264151e-06, "loss": 2.733, "step": 3355 }, { "epoch": 0.09907120743034056, "grad_norm": 16.875, "learning_rate": 1.981132075471698e-06, "loss": 2.599, "step": 3360 }, { "epoch": 0.09921863482235, "grad_norm": 16.625, "learning_rate": 1.984080188679245e-06, "loss": 2.5051, "step": 3365 }, { "epoch": 0.09936606221435942, "grad_norm": 22.875, "learning_rate": 1.9870283018867925e-06, "loss": 2.4694, "step": 3370 }, { "epoch": 0.09951348960636887, "grad_norm": 19.625, "learning_rate": 1.9899764150943392e-06, "loss": 2.5643, "step": 3375 }, { "epoch": 0.0996609169983783, "grad_norm": 19.75, "learning_rate": 1.992924528301887e-06, "loss": 2.4453, "step": 3380 }, { "epoch": 0.09980834439038773, "grad_norm": 19.875, "learning_rate": 1.995872641509434e-06, "loss": 2.4782, "step": 3385 }, { "epoch": 0.09995577178239717, "grad_norm": 14.125, "learning_rate": 1.998820754716981e-06, "loss": 2.5011, "step": 3390 }, { "epoch": 0.1001031991744066, "grad_norm": 16.625, "learning_rate": 1.999999952328609e-06, "loss": 2.6024, "step": 3395 }, { "epoch": 0.10025062656641603, "grad_norm": 17.5, "learning_rate": 1.9999996610034596e-06, "loss": 2.5227, "step": 3400 }, { "epoch": 0.10039805395842548, "grad_norm": 17.0, "learning_rate": 1.9999991048373438e-06, "loss": 2.4102, "step": 3405 }, { "epoch": 0.10054548135043491, "grad_norm": 22.375, "learning_rate": 1.9999982838304092e-06, "loss": 2.5443, "step": 3410 }, { "epoch": 0.10069290874244434, "grad_norm": 19.25, "learning_rate": 1.9999971979828727e-06, "loss": 2.5334, "step": 3415 }, { "epoch": 0.10084033613445378, "grad_norm": 18.5, "learning_rate": 1.999995847295022e-06, "loss": 2.6654, "step": 3420 }, { "epoch": 0.10098776352646321, "grad_norm": 18.0, "learning_rate": 1.999994231767215e-06, "loss": 2.5571, "step": 3425 }, { "epoch": 0.10113519091847266, "grad_norm": 13.5, "learning_rate": 1.99999235139988e-06, "loss": 2.5254, "step": 3430 }, { "epoch": 0.10128261831048209, "grad_norm": 27.375, "learning_rate": 1.999990206193514e-06, "loss": 2.4695, "step": 3435 }, { "epoch": 0.10143004570249152, "grad_norm": 22.0, "learning_rate": 1.999987796148686e-06, "loss": 2.4044, "step": 3440 }, { "epoch": 0.10157747309450096, "grad_norm": 16.75, "learning_rate": 1.9999851212660336e-06, "loss": 2.4752, "step": 3445 }, { "epoch": 0.1017249004865104, "grad_norm": 18.75, "learning_rate": 1.9999821815462655e-06, "loss": 2.6967, "step": 3450 }, { "epoch": 0.10187232787851982, "grad_norm": 25.0, "learning_rate": 1.9999789769901606e-06, "loss": 2.5363, "step": 3455 }, { "epoch": 0.10201975527052927, "grad_norm": 26.375, "learning_rate": 1.9999755075985674e-06, "loss": 2.5621, "step": 3460 }, { "epoch": 0.1021671826625387, "grad_norm": 19.375, "learning_rate": 1.9999717733724043e-06, "loss": 2.5803, "step": 3465 }, { "epoch": 0.10231461005454813, "grad_norm": 22.625, "learning_rate": 1.9999677743126607e-06, "loss": 2.6209, "step": 3470 }, { "epoch": 0.10246203744655757, "grad_norm": 17.625, "learning_rate": 1.999963510420396e-06, "loss": 2.4312, "step": 3475 }, { "epoch": 0.102609464838567, "grad_norm": 41.0, "learning_rate": 1.999958981696739e-06, "loss": 2.5988, "step": 3480 }, { "epoch": 0.10275689223057644, "grad_norm": 21.375, "learning_rate": 1.999954188142889e-06, "loss": 2.5389, "step": 3485 }, { "epoch": 0.10290431962258588, "grad_norm": 16.0, "learning_rate": 1.9999491297601154e-06, "loss": 2.5496, "step": 3490 }, { "epoch": 0.10305174701459531, "grad_norm": 29.0, "learning_rate": 1.9999438065497587e-06, "loss": 2.5188, "step": 3495 }, { "epoch": 0.10319917440660474, "grad_norm": 18.125, "learning_rate": 1.999938218513228e-06, "loss": 2.4835, "step": 3500 }, { "epoch": 0.10319917440660474, "eval_loss": 2.641951322555542, "eval_runtime": 4.7225, "eval_samples_per_second": 83.855, "eval_steps_per_second": 2.753, "step": 3500 }, { "epoch": 0.10334660179861418, "grad_norm": 19.625, "learning_rate": 1.9999323656520037e-06, "loss": 2.4558, "step": 3505 }, { "epoch": 0.10349402919062362, "grad_norm": 17.375, "learning_rate": 1.999926247967635e-06, "loss": 2.4677, "step": 3510 }, { "epoch": 0.10364145658263306, "grad_norm": 18.0, "learning_rate": 1.999919865461743e-06, "loss": 2.4587, "step": 3515 }, { "epoch": 0.10378888397464249, "grad_norm": 20.875, "learning_rate": 1.999913218136018e-06, "loss": 2.5211, "step": 3520 }, { "epoch": 0.10393631136665192, "grad_norm": 16.25, "learning_rate": 1.99990630599222e-06, "loss": 2.552, "step": 3525 }, { "epoch": 0.10408373875866136, "grad_norm": 16.875, "learning_rate": 1.99989912903218e-06, "loss": 2.3828, "step": 3530 }, { "epoch": 0.1042311661506708, "grad_norm": 19.625, "learning_rate": 1.999891687257799e-06, "loss": 2.4554, "step": 3535 }, { "epoch": 0.10437859354268023, "grad_norm": 18.5, "learning_rate": 1.9998839806710466e-06, "loss": 2.5472, "step": 3540 }, { "epoch": 0.10452602093468967, "grad_norm": 17.5, "learning_rate": 1.9998760092739654e-06, "loss": 2.4378, "step": 3545 }, { "epoch": 0.1046734483266991, "grad_norm": 18.5, "learning_rate": 1.999867773068666e-06, "loss": 2.4833, "step": 3550 }, { "epoch": 0.10482087571870853, "grad_norm": 18.5, "learning_rate": 1.999859272057329e-06, "loss": 2.6006, "step": 3555 }, { "epoch": 0.10496830311071798, "grad_norm": 20.75, "learning_rate": 1.999850506242207e-06, "loss": 2.5066, "step": 3560 }, { "epoch": 0.1051157305027274, "grad_norm": 26.875, "learning_rate": 1.9998414756256208e-06, "loss": 2.5387, "step": 3565 }, { "epoch": 0.10526315789473684, "grad_norm": 34.0, "learning_rate": 1.9998321802099614e-06, "loss": 2.5366, "step": 3570 }, { "epoch": 0.10541058528674628, "grad_norm": 28.375, "learning_rate": 1.9998226199976925e-06, "loss": 2.6194, "step": 3575 }, { "epoch": 0.10555801267875571, "grad_norm": 18.0, "learning_rate": 1.9998127949913444e-06, "loss": 2.4925, "step": 3580 }, { "epoch": 0.10570544007076514, "grad_norm": 15.375, "learning_rate": 1.9998027051935198e-06, "loss": 2.4019, "step": 3585 }, { "epoch": 0.10585286746277459, "grad_norm": 17.25, "learning_rate": 1.999792350606891e-06, "loss": 2.4669, "step": 3590 }, { "epoch": 0.10600029485478402, "grad_norm": 17.375, "learning_rate": 1.9997817312342e-06, "loss": 2.6052, "step": 3595 }, { "epoch": 0.10614772224679346, "grad_norm": 17.25, "learning_rate": 1.9997708470782596e-06, "loss": 2.5472, "step": 3600 }, { "epoch": 0.10629514963880289, "grad_norm": 15.25, "learning_rate": 1.9997596981419517e-06, "loss": 2.4028, "step": 3605 }, { "epoch": 0.10644257703081232, "grad_norm": 16.5, "learning_rate": 1.99974828442823e-06, "loss": 2.5456, "step": 3610 }, { "epoch": 0.10659000442282177, "grad_norm": 16.375, "learning_rate": 1.9997366059401166e-06, "loss": 2.5217, "step": 3615 }, { "epoch": 0.1067374318148312, "grad_norm": 17.5, "learning_rate": 1.9997246626807045e-06, "loss": 2.5225, "step": 3620 }, { "epoch": 0.10688485920684063, "grad_norm": 18.25, "learning_rate": 1.9997124546531566e-06, "loss": 2.4429, "step": 3625 }, { "epoch": 0.10703228659885007, "grad_norm": 27.5, "learning_rate": 1.999699981860707e-06, "loss": 2.5007, "step": 3630 }, { "epoch": 0.1071797139908595, "grad_norm": 17.0, "learning_rate": 1.999687244306658e-06, "loss": 2.4672, "step": 3635 }, { "epoch": 0.10732714138286893, "grad_norm": 21.125, "learning_rate": 1.9996742419943834e-06, "loss": 2.4403, "step": 3640 }, { "epoch": 0.10747456877487838, "grad_norm": 20.625, "learning_rate": 1.9996609749273268e-06, "loss": 2.6091, "step": 3645 }, { "epoch": 0.1076219961668878, "grad_norm": 22.25, "learning_rate": 1.999647443109002e-06, "loss": 2.4657, "step": 3650 }, { "epoch": 0.10776942355889724, "grad_norm": 22.0, "learning_rate": 1.9996336465429923e-06, "loss": 2.4482, "step": 3655 }, { "epoch": 0.10791685095090668, "grad_norm": 18.0, "learning_rate": 1.999619585232952e-06, "loss": 2.4894, "step": 3660 }, { "epoch": 0.10806427834291611, "grad_norm": 13.25, "learning_rate": 1.999605259182605e-06, "loss": 2.4985, "step": 3665 }, { "epoch": 0.10821170573492554, "grad_norm": 19.375, "learning_rate": 1.999590668395745e-06, "loss": 2.5532, "step": 3670 }, { "epoch": 0.10835913312693499, "grad_norm": 17.0, "learning_rate": 1.9995758128762376e-06, "loss": 2.5648, "step": 3675 }, { "epoch": 0.10850656051894442, "grad_norm": 15.9375, "learning_rate": 1.9995606926280157e-06, "loss": 2.4608, "step": 3680 }, { "epoch": 0.10865398791095386, "grad_norm": 25.875, "learning_rate": 1.999545307655084e-06, "loss": 2.5516, "step": 3685 }, { "epoch": 0.10880141530296329, "grad_norm": 15.1875, "learning_rate": 1.999529657961518e-06, "loss": 2.5005, "step": 3690 }, { "epoch": 0.10894884269497272, "grad_norm": 19.125, "learning_rate": 1.999513743551461e-06, "loss": 2.6021, "step": 3695 }, { "epoch": 0.10909627008698217, "grad_norm": 18.875, "learning_rate": 1.999497564429129e-06, "loss": 2.5128, "step": 3700 }, { "epoch": 0.1092436974789916, "grad_norm": 19.25, "learning_rate": 1.9994811205988063e-06, "loss": 2.496, "step": 3705 }, { "epoch": 0.10939112487100103, "grad_norm": 20.5, "learning_rate": 1.999464412064848e-06, "loss": 2.3362, "step": 3710 }, { "epoch": 0.10953855226301047, "grad_norm": 21.25, "learning_rate": 1.9994474388316794e-06, "loss": 2.5267, "step": 3715 }, { "epoch": 0.1096859796550199, "grad_norm": 21.5, "learning_rate": 1.9994302009037957e-06, "loss": 2.4618, "step": 3720 }, { "epoch": 0.10983340704702933, "grad_norm": 17.625, "learning_rate": 1.9994126982857614e-06, "loss": 2.47, "step": 3725 }, { "epoch": 0.10998083443903878, "grad_norm": 16.0, "learning_rate": 1.999394930982213e-06, "loss": 2.5356, "step": 3730 }, { "epoch": 0.11012826183104821, "grad_norm": 18.625, "learning_rate": 1.9993768989978558e-06, "loss": 2.4532, "step": 3735 }, { "epoch": 0.11027568922305764, "grad_norm": 15.5625, "learning_rate": 1.9993586023374645e-06, "loss": 2.5099, "step": 3740 }, { "epoch": 0.11042311661506708, "grad_norm": 26.0, "learning_rate": 1.9993400410058864e-06, "loss": 2.4453, "step": 3745 }, { "epoch": 0.11057054400707651, "grad_norm": 17.0, "learning_rate": 1.999321215008036e-06, "loss": 2.5786, "step": 3750 }, { "epoch": 0.11071797139908596, "grad_norm": 15.3125, "learning_rate": 1.9993021243488994e-06, "loss": 2.5121, "step": 3755 }, { "epoch": 0.11086539879109539, "grad_norm": 14.3125, "learning_rate": 1.999282769033533e-06, "loss": 2.5239, "step": 3760 }, { "epoch": 0.11101282618310482, "grad_norm": 32.5, "learning_rate": 1.9992631490670623e-06, "loss": 2.3592, "step": 3765 }, { "epoch": 0.11116025357511426, "grad_norm": 15.6875, "learning_rate": 1.9992432644546836e-06, "loss": 2.4312, "step": 3770 }, { "epoch": 0.11130768096712369, "grad_norm": 38.5, "learning_rate": 1.999223115201664e-06, "loss": 2.5293, "step": 3775 }, { "epoch": 0.11145510835913312, "grad_norm": 17.375, "learning_rate": 1.9992027013133393e-06, "loss": 2.381, "step": 3780 }, { "epoch": 0.11160253575114257, "grad_norm": 19.125, "learning_rate": 1.999182022795116e-06, "loss": 2.4684, "step": 3785 }, { "epoch": 0.111749963143152, "grad_norm": 22.625, "learning_rate": 1.9991610796524697e-06, "loss": 2.367, "step": 3790 }, { "epoch": 0.11189739053516143, "grad_norm": 18.125, "learning_rate": 1.999139871890948e-06, "loss": 2.5411, "step": 3795 }, { "epoch": 0.11204481792717087, "grad_norm": 20.625, "learning_rate": 1.999118399516168e-06, "loss": 2.4369, "step": 3800 }, { "epoch": 0.1121922453191803, "grad_norm": 29.75, "learning_rate": 1.9990966625338154e-06, "loss": 2.588, "step": 3805 }, { "epoch": 0.11233967271118973, "grad_norm": 30.5, "learning_rate": 1.9990746609496476e-06, "loss": 2.4131, "step": 3810 }, { "epoch": 0.11248710010319918, "grad_norm": 12.3125, "learning_rate": 1.9990523947694917e-06, "loss": 2.3787, "step": 3815 }, { "epoch": 0.11263452749520861, "grad_norm": 27.125, "learning_rate": 1.999029863999244e-06, "loss": 2.4659, "step": 3820 }, { "epoch": 0.11278195488721804, "grad_norm": 16.875, "learning_rate": 1.9990070686448725e-06, "loss": 2.4582, "step": 3825 }, { "epoch": 0.11292938227922748, "grad_norm": 19.125, "learning_rate": 1.9989840087124134e-06, "loss": 2.3936, "step": 3830 }, { "epoch": 0.11307680967123691, "grad_norm": 20.375, "learning_rate": 1.9989606842079745e-06, "loss": 2.5457, "step": 3835 }, { "epoch": 0.11322423706324636, "grad_norm": 25.875, "learning_rate": 1.998937095137733e-06, "loss": 2.5145, "step": 3840 }, { "epoch": 0.11337166445525579, "grad_norm": 20.0, "learning_rate": 1.998913241507936e-06, "loss": 2.5409, "step": 3845 }, { "epoch": 0.11351909184726522, "grad_norm": 16.875, "learning_rate": 1.998889123324901e-06, "loss": 2.4614, "step": 3850 }, { "epoch": 0.11366651923927466, "grad_norm": 25.875, "learning_rate": 1.998864740595016e-06, "loss": 2.432, "step": 3855 }, { "epoch": 0.1138139466312841, "grad_norm": 19.375, "learning_rate": 1.998840093324738e-06, "loss": 2.5605, "step": 3860 }, { "epoch": 0.11396137402329352, "grad_norm": 24.625, "learning_rate": 1.998815181520595e-06, "loss": 2.411, "step": 3865 }, { "epoch": 0.11410880141530297, "grad_norm": 18.875, "learning_rate": 1.9987900051891843e-06, "loss": 2.5734, "step": 3870 }, { "epoch": 0.1142562288073124, "grad_norm": 15.75, "learning_rate": 1.9987645643371733e-06, "loss": 2.4893, "step": 3875 }, { "epoch": 0.11440365619932183, "grad_norm": 17.5, "learning_rate": 1.998738858971301e-06, "loss": 2.5551, "step": 3880 }, { "epoch": 0.11455108359133127, "grad_norm": 17.125, "learning_rate": 1.9987128890983736e-06, "loss": 2.5463, "step": 3885 }, { "epoch": 0.1146985109833407, "grad_norm": 16.25, "learning_rate": 1.9986866547252704e-06, "loss": 2.3747, "step": 3890 }, { "epoch": 0.11484593837535013, "grad_norm": 17.625, "learning_rate": 1.9986601558589393e-06, "loss": 2.5215, "step": 3895 }, { "epoch": 0.11499336576735958, "grad_norm": 15.3125, "learning_rate": 1.9986333925063968e-06, "loss": 2.4341, "step": 3900 }, { "epoch": 0.11514079315936901, "grad_norm": 15.25, "learning_rate": 1.9986063646747325e-06, "loss": 2.5542, "step": 3905 }, { "epoch": 0.11528822055137844, "grad_norm": 15.8125, "learning_rate": 1.998579072371104e-06, "loss": 2.5368, "step": 3910 }, { "epoch": 0.11543564794338788, "grad_norm": 23.625, "learning_rate": 1.998551515602739e-06, "loss": 2.4132, "step": 3915 }, { "epoch": 0.11558307533539731, "grad_norm": 17.625, "learning_rate": 1.9985236943769358e-06, "loss": 2.5002, "step": 3920 }, { "epoch": 0.11573050272740676, "grad_norm": 15.625, "learning_rate": 1.9984956087010635e-06, "loss": 2.2823, "step": 3925 }, { "epoch": 0.11587793011941619, "grad_norm": 18.25, "learning_rate": 1.9984672585825592e-06, "loss": 2.5808, "step": 3930 }, { "epoch": 0.11602535751142562, "grad_norm": 17.0, "learning_rate": 1.9984386440289315e-06, "loss": 2.5604, "step": 3935 }, { "epoch": 0.11617278490343506, "grad_norm": 18.5, "learning_rate": 1.998409765047759e-06, "loss": 2.482, "step": 3940 }, { "epoch": 0.1163202122954445, "grad_norm": 14.5, "learning_rate": 1.99838062164669e-06, "loss": 2.4246, "step": 3945 }, { "epoch": 0.11646763968745392, "grad_norm": 16.0, "learning_rate": 1.9983512138334425e-06, "loss": 2.4198, "step": 3950 }, { "epoch": 0.11661506707946337, "grad_norm": 16.25, "learning_rate": 1.998321541615805e-06, "loss": 2.434, "step": 3955 }, { "epoch": 0.1167624944714728, "grad_norm": 24.75, "learning_rate": 1.9982916050016364e-06, "loss": 2.4377, "step": 3960 }, { "epoch": 0.11690992186348223, "grad_norm": 21.625, "learning_rate": 1.9982614039988643e-06, "loss": 2.4218, "step": 3965 }, { "epoch": 0.11705734925549167, "grad_norm": 17.5, "learning_rate": 1.9982309386154884e-06, "loss": 2.4098, "step": 3970 }, { "epoch": 0.1172047766475011, "grad_norm": 16.5, "learning_rate": 1.998200208859576e-06, "loss": 2.3641, "step": 3975 }, { "epoch": 0.11735220403951054, "grad_norm": 17.125, "learning_rate": 1.9981692147392655e-06, "loss": 2.4643, "step": 3980 }, { "epoch": 0.11749963143151998, "grad_norm": 15.625, "learning_rate": 1.998137956262767e-06, "loss": 2.3954, "step": 3985 }, { "epoch": 0.11764705882352941, "grad_norm": 44.75, "learning_rate": 1.9981064334383577e-06, "loss": 2.4361, "step": 3990 }, { "epoch": 0.11779448621553884, "grad_norm": 30.875, "learning_rate": 1.998074646274386e-06, "loss": 2.5092, "step": 3995 }, { "epoch": 0.11794191360754828, "grad_norm": 16.25, "learning_rate": 1.998042594779271e-06, "loss": 2.5085, "step": 4000 }, { "epoch": 0.11794191360754828, "eval_loss": 2.489173173904419, "eval_runtime": 4.7135, "eval_samples_per_second": 84.014, "eval_steps_per_second": 2.758, "step": 4000 }, { "epoch": 0.11808934099955772, "grad_norm": 17.875, "learning_rate": 1.9980102789615014e-06, "loss": 2.5217, "step": 4005 }, { "epoch": 0.11823676839156716, "grad_norm": 15.5625, "learning_rate": 1.9979776988296353e-06, "loss": 2.383, "step": 4010 }, { "epoch": 0.11838419578357659, "grad_norm": 66.5, "learning_rate": 1.997944854392301e-06, "loss": 2.5012, "step": 4015 }, { "epoch": 0.11853162317558602, "grad_norm": 18.125, "learning_rate": 1.997911745658198e-06, "loss": 2.4182, "step": 4020 }, { "epoch": 0.11867905056759546, "grad_norm": 22.75, "learning_rate": 1.9978783726360945e-06, "loss": 2.4518, "step": 4025 }, { "epoch": 0.1188264779596049, "grad_norm": 15.75, "learning_rate": 1.9978447353348287e-06, "loss": 2.3372, "step": 4030 }, { "epoch": 0.11897390535161433, "grad_norm": 15.875, "learning_rate": 1.9978108337633092e-06, "loss": 2.4263, "step": 4035 }, { "epoch": 0.11912133274362377, "grad_norm": 17.125, "learning_rate": 1.9977766679305143e-06, "loss": 2.547, "step": 4040 }, { "epoch": 0.1192687601356332, "grad_norm": 16.5, "learning_rate": 1.9977422378454936e-06, "loss": 2.3247, "step": 4045 }, { "epoch": 0.11941618752764263, "grad_norm": 15.5625, "learning_rate": 1.9977075435173646e-06, "loss": 2.3781, "step": 4050 }, { "epoch": 0.11956361491965208, "grad_norm": 23.75, "learning_rate": 1.997672584955316e-06, "loss": 2.3924, "step": 4055 }, { "epoch": 0.1197110423116615, "grad_norm": 21.75, "learning_rate": 1.997637362168606e-06, "loss": 2.4931, "step": 4060 }, { "epoch": 0.11985846970367094, "grad_norm": 20.375, "learning_rate": 1.997601875166564e-06, "loss": 2.4232, "step": 4065 }, { "epoch": 0.12000589709568038, "grad_norm": 13.6875, "learning_rate": 1.9975661239585874e-06, "loss": 2.3576, "step": 4070 }, { "epoch": 0.12015332448768981, "grad_norm": 16.875, "learning_rate": 1.997530108554145e-06, "loss": 2.439, "step": 4075 }, { "epoch": 0.12030075187969924, "grad_norm": 19.75, "learning_rate": 1.997493828962775e-06, "loss": 2.4518, "step": 4080 }, { "epoch": 0.12044817927170869, "grad_norm": 18.125, "learning_rate": 1.997457285194086e-06, "loss": 2.4058, "step": 4085 }, { "epoch": 0.12059560666371812, "grad_norm": 15.4375, "learning_rate": 1.9974204772577557e-06, "loss": 2.4125, "step": 4090 }, { "epoch": 0.12074303405572756, "grad_norm": 15.5, "learning_rate": 1.9973834051635332e-06, "loss": 2.4462, "step": 4095 }, { "epoch": 0.12089046144773699, "grad_norm": 14.25, "learning_rate": 1.9973460689212366e-06, "loss": 2.3354, "step": 4100 }, { "epoch": 0.12103788883974642, "grad_norm": 16.25, "learning_rate": 1.997308468540753e-06, "loss": 2.3519, "step": 4105 }, { "epoch": 0.12118531623175587, "grad_norm": 16.375, "learning_rate": 1.997270604032042e-06, "loss": 2.3724, "step": 4110 }, { "epoch": 0.1213327436237653, "grad_norm": 25.75, "learning_rate": 1.9972324754051306e-06, "loss": 2.5401, "step": 4115 }, { "epoch": 0.12148017101577473, "grad_norm": 17.0, "learning_rate": 1.9971940826701175e-06, "loss": 2.4093, "step": 4120 }, { "epoch": 0.12162759840778417, "grad_norm": 17.25, "learning_rate": 1.99715542583717e-06, "loss": 2.4163, "step": 4125 }, { "epoch": 0.1217750257997936, "grad_norm": 14.625, "learning_rate": 1.9971165049165266e-06, "loss": 2.397, "step": 4130 }, { "epoch": 0.12192245319180303, "grad_norm": 20.625, "learning_rate": 1.997077319918495e-06, "loss": 2.4253, "step": 4135 }, { "epoch": 0.12206988058381248, "grad_norm": 19.625, "learning_rate": 1.9970378708534527e-06, "loss": 2.3918, "step": 4140 }, { "epoch": 0.1222173079758219, "grad_norm": 13.1875, "learning_rate": 1.9969981577318476e-06, "loss": 2.4003, "step": 4145 }, { "epoch": 0.12236473536783134, "grad_norm": 16.375, "learning_rate": 1.9969581805641977e-06, "loss": 2.3088, "step": 4150 }, { "epoch": 0.12251216275984078, "grad_norm": 17.875, "learning_rate": 1.99691793936109e-06, "loss": 2.3631, "step": 4155 }, { "epoch": 0.12265959015185021, "grad_norm": 13.0625, "learning_rate": 1.9968774341331828e-06, "loss": 2.3745, "step": 4160 }, { "epoch": 0.12280701754385964, "grad_norm": 13.625, "learning_rate": 1.9968366648912024e-06, "loss": 2.3856, "step": 4165 }, { "epoch": 0.12295444493586909, "grad_norm": 22.5, "learning_rate": 1.9967956316459473e-06, "loss": 2.5149, "step": 4170 }, { "epoch": 0.12310187232787852, "grad_norm": 23.5, "learning_rate": 1.9967543344082845e-06, "loss": 2.3066, "step": 4175 }, { "epoch": 0.12324929971988796, "grad_norm": 14.125, "learning_rate": 1.996712773189151e-06, "loss": 2.3431, "step": 4180 }, { "epoch": 0.12339672711189739, "grad_norm": 19.625, "learning_rate": 1.996670947999554e-06, "loss": 2.3903, "step": 4185 }, { "epoch": 0.12354415450390682, "grad_norm": 17.625, "learning_rate": 1.9966288588505705e-06, "loss": 2.4746, "step": 4190 }, { "epoch": 0.12369158189591627, "grad_norm": 15.6875, "learning_rate": 1.9965865057533474e-06, "loss": 2.313, "step": 4195 }, { "epoch": 0.1238390092879257, "grad_norm": 17.375, "learning_rate": 1.996543888719101e-06, "loss": 2.5405, "step": 4200 }, { "epoch": 0.12398643667993513, "grad_norm": 16.0, "learning_rate": 1.996501007759119e-06, "loss": 2.3146, "step": 4205 }, { "epoch": 0.12413386407194457, "grad_norm": 17.125, "learning_rate": 1.996457862884758e-06, "loss": 2.2757, "step": 4210 }, { "epoch": 0.124281291463954, "grad_norm": 17.875, "learning_rate": 1.996414454107444e-06, "loss": 2.3661, "step": 4215 }, { "epoch": 0.12442871885596343, "grad_norm": 18.375, "learning_rate": 1.996370781438674e-06, "loss": 2.3062, "step": 4220 }, { "epoch": 0.12457614624797288, "grad_norm": 18.875, "learning_rate": 1.9963268448900133e-06, "loss": 2.4657, "step": 4225 }, { "epoch": 0.12472357363998231, "grad_norm": 16.375, "learning_rate": 1.9962826444730992e-06, "loss": 2.4002, "step": 4230 }, { "epoch": 0.12487100103199174, "grad_norm": 16.75, "learning_rate": 1.996238180199637e-06, "loss": 2.3283, "step": 4235 }, { "epoch": 0.12501842842400118, "grad_norm": 14.375, "learning_rate": 1.996193452081403e-06, "loss": 2.3856, "step": 4240 }, { "epoch": 0.1251658558160106, "grad_norm": 15.4375, "learning_rate": 1.996148460130243e-06, "loss": 2.3721, "step": 4245 }, { "epoch": 0.12531328320802004, "grad_norm": 18.625, "learning_rate": 1.9961032043580726e-06, "loss": 2.4488, "step": 4250 }, { "epoch": 0.12546071060002947, "grad_norm": 27.125, "learning_rate": 1.9960576847768784e-06, "loss": 2.3926, "step": 4255 }, { "epoch": 0.12560813799203893, "grad_norm": 17.625, "learning_rate": 1.996011901398714e-06, "loss": 2.3488, "step": 4260 }, { "epoch": 0.12575556538404836, "grad_norm": 16.375, "learning_rate": 1.995965854235706e-06, "loss": 2.441, "step": 4265 }, { "epoch": 0.1259029927760578, "grad_norm": 15.25, "learning_rate": 1.9959195433000496e-06, "loss": 2.4117, "step": 4270 }, { "epoch": 0.12605042016806722, "grad_norm": 14.75, "learning_rate": 1.995872968604009e-06, "loss": 2.3304, "step": 4275 }, { "epoch": 0.12619784756007665, "grad_norm": 14.3125, "learning_rate": 1.9958261301599195e-06, "loss": 2.4009, "step": 4280 }, { "epoch": 0.12634527495208608, "grad_norm": 17.25, "learning_rate": 1.995779027980187e-06, "loss": 2.4338, "step": 4285 }, { "epoch": 0.12649270234409554, "grad_norm": 14.4375, "learning_rate": 1.9957316620772842e-06, "loss": 2.4749, "step": 4290 }, { "epoch": 0.12664012973610497, "grad_norm": 18.375, "learning_rate": 1.9956840324637564e-06, "loss": 2.3095, "step": 4295 }, { "epoch": 0.1267875571281144, "grad_norm": 20.0, "learning_rate": 1.9956361391522177e-06, "loss": 2.3714, "step": 4300 }, { "epoch": 0.12693498452012383, "grad_norm": 22.125, "learning_rate": 1.995587982155353e-06, "loss": 2.4341, "step": 4305 }, { "epoch": 0.12708241191213326, "grad_norm": 16.625, "learning_rate": 1.995539561485915e-06, "loss": 2.3509, "step": 4310 }, { "epoch": 0.12722983930414272, "grad_norm": 16.0, "learning_rate": 1.9954908771567287e-06, "loss": 2.4267, "step": 4315 }, { "epoch": 0.12737726669615215, "grad_norm": 14.8125, "learning_rate": 1.9954419291806865e-06, "loss": 2.3689, "step": 4320 }, { "epoch": 0.12752469408816158, "grad_norm": 15.4375, "learning_rate": 1.995392717570753e-06, "loss": 2.3899, "step": 4325 }, { "epoch": 0.127672121480171, "grad_norm": 17.25, "learning_rate": 1.9953432423399606e-06, "loss": 2.2691, "step": 4330 }, { "epoch": 0.12781954887218044, "grad_norm": 13.5625, "learning_rate": 1.9952935035014126e-06, "loss": 2.4342, "step": 4335 }, { "epoch": 0.12796697626418987, "grad_norm": 22.0, "learning_rate": 1.995243501068282e-06, "loss": 2.2938, "step": 4340 }, { "epoch": 0.12811440365619933, "grad_norm": 22.75, "learning_rate": 1.9951932350538113e-06, "loss": 2.3834, "step": 4345 }, { "epoch": 0.12826183104820876, "grad_norm": 14.875, "learning_rate": 1.9951427054713137e-06, "loss": 2.5038, "step": 4350 }, { "epoch": 0.1284092584402182, "grad_norm": 16.875, "learning_rate": 1.9950919123341707e-06, "loss": 2.4017, "step": 4355 }, { "epoch": 0.12855668583222762, "grad_norm": 21.375, "learning_rate": 1.9950408556558344e-06, "loss": 2.3696, "step": 4360 }, { "epoch": 0.12870411322423705, "grad_norm": 13.875, "learning_rate": 1.9949895354498272e-06, "loss": 2.3681, "step": 4365 }, { "epoch": 0.12885154061624648, "grad_norm": 13.0, "learning_rate": 1.9949379517297404e-06, "loss": 2.4246, "step": 4370 }, { "epoch": 0.12899896800825594, "grad_norm": 14.5625, "learning_rate": 1.994886104509236e-06, "loss": 2.4722, "step": 4375 }, { "epoch": 0.12914639540026537, "grad_norm": 17.25, "learning_rate": 1.994833993802045e-06, "loss": 2.3309, "step": 4380 }, { "epoch": 0.1292938227922748, "grad_norm": 16.5, "learning_rate": 1.994781619621968e-06, "loss": 2.385, "step": 4385 }, { "epoch": 0.12944125018428423, "grad_norm": 13.75, "learning_rate": 1.9947289819828764e-06, "loss": 2.301, "step": 4390 }, { "epoch": 0.12958867757629366, "grad_norm": 17.0, "learning_rate": 1.9946760808987106e-06, "loss": 2.4786, "step": 4395 }, { "epoch": 0.12973610496830312, "grad_norm": 15.6875, "learning_rate": 1.994622916383481e-06, "loss": 2.2656, "step": 4400 }, { "epoch": 0.12988353236031255, "grad_norm": 14.5625, "learning_rate": 1.994569488451268e-06, "loss": 2.1932, "step": 4405 }, { "epoch": 0.13003095975232198, "grad_norm": 14.875, "learning_rate": 1.9945157971162207e-06, "loss": 2.3135, "step": 4410 }, { "epoch": 0.13017838714433141, "grad_norm": 22.625, "learning_rate": 1.99446184239256e-06, "loss": 2.3792, "step": 4415 }, { "epoch": 0.13032581453634084, "grad_norm": 20.625, "learning_rate": 1.9944076242945744e-06, "loss": 2.403, "step": 4420 }, { "epoch": 0.13047324192835028, "grad_norm": 15.5, "learning_rate": 1.9943531428366233e-06, "loss": 2.4211, "step": 4425 }, { "epoch": 0.13062066932035973, "grad_norm": 16.75, "learning_rate": 1.9942983980331355e-06, "loss": 2.3437, "step": 4430 }, { "epoch": 0.13076809671236916, "grad_norm": 12.0625, "learning_rate": 1.99424338989861e-06, "loss": 2.3165, "step": 4435 }, { "epoch": 0.1309155241043786, "grad_norm": 25.0, "learning_rate": 1.994188118447615e-06, "loss": 2.396, "step": 4440 }, { "epoch": 0.13106295149638802, "grad_norm": 19.125, "learning_rate": 1.9941325836947888e-06, "loss": 2.3832, "step": 4445 }, { "epoch": 0.13121037888839746, "grad_norm": 16.0, "learning_rate": 1.9940767856548395e-06, "loss": 2.3981, "step": 4450 }, { "epoch": 0.13135780628040689, "grad_norm": 17.5, "learning_rate": 1.994020724342544e-06, "loss": 2.3459, "step": 4455 }, { "epoch": 0.13150523367241634, "grad_norm": 15.625, "learning_rate": 1.99396439977275e-06, "loss": 2.4587, "step": 4460 }, { "epoch": 0.13165266106442577, "grad_norm": 25.125, "learning_rate": 1.9939078119603746e-06, "loss": 2.4407, "step": 4465 }, { "epoch": 0.1318000884564352, "grad_norm": 15.875, "learning_rate": 1.9938509609204047e-06, "loss": 2.3194, "step": 4470 }, { "epoch": 0.13194751584844464, "grad_norm": 18.0, "learning_rate": 1.9937938466678967e-06, "loss": 2.4082, "step": 4475 }, { "epoch": 0.13209494324045407, "grad_norm": 14.5625, "learning_rate": 1.9937364692179764e-06, "loss": 2.2631, "step": 4480 }, { "epoch": 0.13224237063246352, "grad_norm": 15.6875, "learning_rate": 1.993678828585841e-06, "loss": 2.2747, "step": 4485 }, { "epoch": 0.13238979802447295, "grad_norm": 15.75, "learning_rate": 1.9936209247867542e-06, "loss": 2.3008, "step": 4490 }, { "epoch": 0.13253722541648238, "grad_norm": 17.0, "learning_rate": 1.9935627578360526e-06, "loss": 2.3542, "step": 4495 }, { "epoch": 0.13268465280849182, "grad_norm": 15.0, "learning_rate": 1.9935043277491407e-06, "loss": 2.3161, "step": 4500 }, { "epoch": 0.13268465280849182, "eval_loss": 2.3908541202545166, "eval_runtime": 4.7136, "eval_samples_per_second": 84.013, "eval_steps_per_second": 2.758, "step": 4500 }, { "epoch": 0.13283208020050125, "grad_norm": 15.0625, "learning_rate": 1.9934456345414938e-06, "loss": 2.3238, "step": 4505 }, { "epoch": 0.13297950759251068, "grad_norm": 17.5, "learning_rate": 1.9933866782286553e-06, "loss": 2.2844, "step": 4510 }, { "epoch": 0.13312693498452013, "grad_norm": 15.1875, "learning_rate": 1.99332745882624e-06, "loss": 2.3055, "step": 4515 }, { "epoch": 0.13327436237652956, "grad_norm": 15.0, "learning_rate": 1.9932679763499313e-06, "loss": 2.4481, "step": 4520 }, { "epoch": 0.133421789768539, "grad_norm": 13.4375, "learning_rate": 1.9932082308154833e-06, "loss": 2.2749, "step": 4525 }, { "epoch": 0.13356921716054843, "grad_norm": 16.5, "learning_rate": 1.993148222238718e-06, "loss": 2.5095, "step": 4530 }, { "epoch": 0.13371664455255786, "grad_norm": 13.875, "learning_rate": 1.9930879506355285e-06, "loss": 2.4653, "step": 4535 }, { "epoch": 0.13386407194456731, "grad_norm": 16.375, "learning_rate": 1.9930274160218773e-06, "loss": 2.3229, "step": 4540 }, { "epoch": 0.13401149933657674, "grad_norm": 21.625, "learning_rate": 1.9929666184137964e-06, "loss": 2.3757, "step": 4545 }, { "epoch": 0.13415892672858618, "grad_norm": 16.75, "learning_rate": 1.992905557827388e-06, "loss": 2.4258, "step": 4550 }, { "epoch": 0.1343063541205956, "grad_norm": 13.75, "learning_rate": 1.992844234278823e-06, "loss": 2.3937, "step": 4555 }, { "epoch": 0.13445378151260504, "grad_norm": 19.375, "learning_rate": 1.9927826477843416e-06, "loss": 2.4173, "step": 4560 }, { "epoch": 0.13460120890461447, "grad_norm": 14.75, "learning_rate": 1.992720798360256e-06, "loss": 2.3872, "step": 4565 }, { "epoch": 0.13474863629662392, "grad_norm": 15.75, "learning_rate": 1.9926586860229455e-06, "loss": 2.3229, "step": 4570 }, { "epoch": 0.13489606368863336, "grad_norm": 13.1875, "learning_rate": 1.99259631078886e-06, "loss": 2.3331, "step": 4575 }, { "epoch": 0.13504349108064279, "grad_norm": 14.6875, "learning_rate": 1.9925336726745196e-06, "loss": 2.3182, "step": 4580 }, { "epoch": 0.13519091847265222, "grad_norm": 16.375, "learning_rate": 1.992470771696513e-06, "loss": 2.2982, "step": 4585 }, { "epoch": 0.13533834586466165, "grad_norm": 15.625, "learning_rate": 1.992407607871499e-06, "loss": 2.3287, "step": 4590 }, { "epoch": 0.13548577325667108, "grad_norm": 17.5, "learning_rate": 1.992344181216206e-06, "loss": 2.4427, "step": 4595 }, { "epoch": 0.13563320064868054, "grad_norm": 17.625, "learning_rate": 1.9922804917474316e-06, "loss": 2.3961, "step": 4600 }, { "epoch": 0.13578062804068997, "grad_norm": 16.625, "learning_rate": 1.9922165394820445e-06, "loss": 2.4671, "step": 4605 }, { "epoch": 0.1359280554326994, "grad_norm": 22.25, "learning_rate": 1.9921523244369805e-06, "loss": 2.4358, "step": 4610 }, { "epoch": 0.13607548282470883, "grad_norm": 16.125, "learning_rate": 1.9920878466292473e-06, "loss": 2.4312, "step": 4615 }, { "epoch": 0.13622291021671826, "grad_norm": 18.5, "learning_rate": 1.9920231060759207e-06, "loss": 2.5213, "step": 4620 }, { "epoch": 0.13637033760872772, "grad_norm": 14.8125, "learning_rate": 1.9919581027941476e-06, "loss": 2.2932, "step": 4625 }, { "epoch": 0.13651776500073715, "grad_norm": 20.875, "learning_rate": 1.9918928368011426e-06, "loss": 2.3134, "step": 4630 }, { "epoch": 0.13666519239274658, "grad_norm": 16.25, "learning_rate": 1.991827308114191e-06, "loss": 2.3506, "step": 4635 }, { "epoch": 0.136812619784756, "grad_norm": 14.5, "learning_rate": 1.9917615167506477e-06, "loss": 2.3828, "step": 4640 }, { "epoch": 0.13696004717676544, "grad_norm": 40.25, "learning_rate": 1.9916954627279373e-06, "loss": 2.3665, "step": 4645 }, { "epoch": 0.13710747456877487, "grad_norm": 21.75, "learning_rate": 1.9916291460635522e-06, "loss": 2.3163, "step": 4650 }, { "epoch": 0.13725490196078433, "grad_norm": 14.75, "learning_rate": 1.9915625667750577e-06, "loss": 2.3346, "step": 4655 }, { "epoch": 0.13740232935279376, "grad_norm": 18.125, "learning_rate": 1.991495724880085e-06, "loss": 2.3689, "step": 4660 }, { "epoch": 0.1375497567448032, "grad_norm": 16.0, "learning_rate": 1.991428620396338e-06, "loss": 2.3924, "step": 4665 }, { "epoch": 0.13769718413681262, "grad_norm": 15.5, "learning_rate": 1.9913612533415877e-06, "loss": 2.2315, "step": 4670 }, { "epoch": 0.13784461152882205, "grad_norm": 16.875, "learning_rate": 1.9912936237336764e-06, "loss": 2.3822, "step": 4675 }, { "epoch": 0.13799203892083148, "grad_norm": 21.5, "learning_rate": 1.9912257315905145e-06, "loss": 2.3552, "step": 4680 }, { "epoch": 0.13813946631284094, "grad_norm": 19.5, "learning_rate": 1.991157576930083e-06, "loss": 2.4338, "step": 4685 }, { "epoch": 0.13828689370485037, "grad_norm": 20.875, "learning_rate": 1.991089159770432e-06, "loss": 2.3953, "step": 4690 }, { "epoch": 0.1384343210968598, "grad_norm": 50.0, "learning_rate": 1.9910204801296814e-06, "loss": 2.3419, "step": 4695 }, { "epoch": 0.13858174848886923, "grad_norm": 15.25, "learning_rate": 1.99095153802602e-06, "loss": 2.342, "step": 4700 }, { "epoch": 0.13872917588087866, "grad_norm": 23.25, "learning_rate": 1.9908823334777067e-06, "loss": 2.4395, "step": 4705 }, { "epoch": 0.13887660327288812, "grad_norm": 17.0, "learning_rate": 1.9908128665030697e-06, "loss": 2.3248, "step": 4710 }, { "epoch": 0.13902403066489755, "grad_norm": 17.5, "learning_rate": 1.990743137120507e-06, "loss": 2.3639, "step": 4715 }, { "epoch": 0.13917145805690698, "grad_norm": 18.25, "learning_rate": 1.990673145348485e-06, "loss": 2.3686, "step": 4720 }, { "epoch": 0.1393188854489164, "grad_norm": 15.5, "learning_rate": 1.990602891205541e-06, "loss": 2.3286, "step": 4725 }, { "epoch": 0.13946631284092584, "grad_norm": 19.0, "learning_rate": 1.9905323747102813e-06, "loss": 2.3668, "step": 4730 }, { "epoch": 0.13961374023293527, "grad_norm": 14.125, "learning_rate": 1.9904615958813814e-06, "loss": 2.3007, "step": 4735 }, { "epoch": 0.13976116762494473, "grad_norm": 14.6875, "learning_rate": 1.990390554737586e-06, "loss": 2.3387, "step": 4740 }, { "epoch": 0.13990859501695416, "grad_norm": 17.625, "learning_rate": 1.9903192512977104e-06, "loss": 2.3229, "step": 4745 }, { "epoch": 0.1400560224089636, "grad_norm": 15.75, "learning_rate": 1.9902476855806382e-06, "loss": 2.3936, "step": 4750 }, { "epoch": 0.14020344980097302, "grad_norm": 16.375, "learning_rate": 1.990175857605323e-06, "loss": 2.4185, "step": 4755 }, { "epoch": 0.14035087719298245, "grad_norm": 13.125, "learning_rate": 1.9901037673907884e-06, "loss": 2.2939, "step": 4760 }, { "epoch": 0.14049830458499188, "grad_norm": 16.25, "learning_rate": 1.990031414956126e-06, "loss": 2.2509, "step": 4765 }, { "epoch": 0.14064573197700134, "grad_norm": 14.1875, "learning_rate": 1.989958800320498e-06, "loss": 2.255, "step": 4770 }, { "epoch": 0.14079315936901077, "grad_norm": 15.3125, "learning_rate": 1.989885923503136e-06, "loss": 2.3749, "step": 4775 }, { "epoch": 0.1409405867610202, "grad_norm": 14.0625, "learning_rate": 1.98981278452334e-06, "loss": 2.3896, "step": 4780 }, { "epoch": 0.14108801415302963, "grad_norm": 15.4375, "learning_rate": 1.989739383400481e-06, "loss": 2.1821, "step": 4785 }, { "epoch": 0.14123544154503906, "grad_norm": 17.5, "learning_rate": 1.989665720153999e-06, "loss": 2.3354, "step": 4790 }, { "epoch": 0.14138286893704852, "grad_norm": 14.9375, "learning_rate": 1.989591794803402e-06, "loss": 2.2849, "step": 4795 }, { "epoch": 0.14153029632905795, "grad_norm": 15.1875, "learning_rate": 1.9895176073682685e-06, "loss": 2.3225, "step": 4800 }, { "epoch": 0.14167772372106738, "grad_norm": 12.625, "learning_rate": 1.9894431578682474e-06, "loss": 2.3356, "step": 4805 }, { "epoch": 0.1418251511130768, "grad_norm": 25.0, "learning_rate": 1.989368446323055e-06, "loss": 2.3309, "step": 4810 }, { "epoch": 0.14197257850508624, "grad_norm": 17.875, "learning_rate": 1.989293472752479e-06, "loss": 2.2506, "step": 4815 }, { "epoch": 0.14212000589709567, "grad_norm": 13.875, "learning_rate": 1.989218237176374e-06, "loss": 2.2593, "step": 4820 }, { "epoch": 0.14226743328910513, "grad_norm": 17.0, "learning_rate": 1.989142739614667e-06, "loss": 2.3378, "step": 4825 }, { "epoch": 0.14241486068111456, "grad_norm": 14.1875, "learning_rate": 1.9890669800873518e-06, "loss": 2.2224, "step": 4830 }, { "epoch": 0.142562288073124, "grad_norm": 12.9375, "learning_rate": 1.9889909586144927e-06, "loss": 2.2454, "step": 4835 }, { "epoch": 0.14270971546513342, "grad_norm": 20.0, "learning_rate": 1.988914675216224e-06, "loss": 2.3326, "step": 4840 }, { "epoch": 0.14285714285714285, "grad_norm": 14.5, "learning_rate": 1.9888381299127484e-06, "loss": 2.289, "step": 4845 }, { "epoch": 0.14300457024915228, "grad_norm": 16.0, "learning_rate": 1.9887613227243377e-06, "loss": 2.3622, "step": 4850 }, { "epoch": 0.14315199764116174, "grad_norm": 20.0, "learning_rate": 1.9886842536713342e-06, "loss": 2.4091, "step": 4855 }, { "epoch": 0.14329942503317117, "grad_norm": 13.4375, "learning_rate": 1.988606922774149e-06, "loss": 2.3741, "step": 4860 }, { "epoch": 0.1434468524251806, "grad_norm": 16.375, "learning_rate": 1.9885293300532623e-06, "loss": 2.3865, "step": 4865 }, { "epoch": 0.14359427981719003, "grad_norm": 13.625, "learning_rate": 1.9884514755292236e-06, "loss": 2.297, "step": 4870 }, { "epoch": 0.14374170720919946, "grad_norm": 14.8125, "learning_rate": 1.988373359222652e-06, "loss": 2.3604, "step": 4875 }, { "epoch": 0.14388913460120892, "grad_norm": 17.375, "learning_rate": 1.9882949811542362e-06, "loss": 2.2758, "step": 4880 }, { "epoch": 0.14403656199321835, "grad_norm": 15.75, "learning_rate": 1.9882163413447337e-06, "loss": 2.4295, "step": 4885 }, { "epoch": 0.14418398938522778, "grad_norm": 16.625, "learning_rate": 1.9881374398149715e-06, "loss": 2.4267, "step": 4890 }, { "epoch": 0.1443314167772372, "grad_norm": 18.0, "learning_rate": 1.988058276585847e-06, "loss": 2.3796, "step": 4895 }, { "epoch": 0.14447884416924664, "grad_norm": 32.0, "learning_rate": 1.9879788516783242e-06, "loss": 2.2386, "step": 4900 }, { "epoch": 0.14462627156125607, "grad_norm": 18.0, "learning_rate": 1.9878991651134388e-06, "loss": 2.3415, "step": 4905 }, { "epoch": 0.14477369895326553, "grad_norm": 13.75, "learning_rate": 1.9878192169122957e-06, "loss": 2.2535, "step": 4910 }, { "epoch": 0.14492112634527496, "grad_norm": 17.75, "learning_rate": 1.9877390070960677e-06, "loss": 2.2624, "step": 4915 }, { "epoch": 0.1450685537372844, "grad_norm": 17.125, "learning_rate": 1.9876585356859977e-06, "loss": 2.3719, "step": 4920 }, { "epoch": 0.14521598112929382, "grad_norm": 25.125, "learning_rate": 1.987577802703398e-06, "loss": 2.3524, "step": 4925 }, { "epoch": 0.14536340852130325, "grad_norm": 18.75, "learning_rate": 1.98749680816965e-06, "loss": 2.3082, "step": 4930 }, { "epoch": 0.14551083591331268, "grad_norm": 29.125, "learning_rate": 1.9874155521062047e-06, "loss": 2.2639, "step": 4935 }, { "epoch": 0.14565826330532214, "grad_norm": 13.375, "learning_rate": 1.9873340345345816e-06, "loss": 2.2192, "step": 4940 }, { "epoch": 0.14580569069733157, "grad_norm": 15.875, "learning_rate": 1.9872522554763698e-06, "loss": 2.4222, "step": 4945 }, { "epoch": 0.145953118089341, "grad_norm": 17.625, "learning_rate": 1.987170214953228e-06, "loss": 2.4073, "step": 4950 }, { "epoch": 0.14610054548135043, "grad_norm": 16.0, "learning_rate": 1.9870879129868842e-06, "loss": 2.3876, "step": 4955 }, { "epoch": 0.14624797287335986, "grad_norm": 14.25, "learning_rate": 1.987005349599135e-06, "loss": 2.3663, "step": 4960 }, { "epoch": 0.14639540026536932, "grad_norm": 18.875, "learning_rate": 1.9869225248118463e-06, "loss": 2.5066, "step": 4965 }, { "epoch": 0.14654282765737875, "grad_norm": 12.5, "learning_rate": 1.9868394386469535e-06, "loss": 2.2031, "step": 4970 }, { "epoch": 0.14669025504938818, "grad_norm": 13.625, "learning_rate": 1.986756091126462e-06, "loss": 2.308, "step": 4975 }, { "epoch": 0.1468376824413976, "grad_norm": 17.75, "learning_rate": 1.986672482272445e-06, "loss": 2.3624, "step": 4980 }, { "epoch": 0.14698510983340704, "grad_norm": 16.875, "learning_rate": 1.9865886121070463e-06, "loss": 2.3513, "step": 4985 }, { "epoch": 0.14713253722541647, "grad_norm": 12.4375, "learning_rate": 1.986504480652477e-06, "loss": 2.2523, "step": 4990 }, { "epoch": 0.14727996461742593, "grad_norm": 13.6875, "learning_rate": 1.986420087931019e-06, "loss": 2.3941, "step": 4995 }, { "epoch": 0.14742739200943536, "grad_norm": 15.8125, "learning_rate": 1.9863354339650234e-06, "loss": 2.2635, "step": 5000 }, { "epoch": 0.14742739200943536, "eval_loss": 2.3236687183380127, "eval_runtime": 4.7176, "eval_samples_per_second": 83.94, "eval_steps_per_second": 2.756, "step": 5000 }, { "epoch": 0.1475748194014448, "grad_norm": 23.375, "learning_rate": 1.98625051877691e-06, "loss": 2.3127, "step": 5005 }, { "epoch": 0.14772224679345422, "grad_norm": 13.4375, "learning_rate": 1.9861653423891667e-06, "loss": 2.3383, "step": 5010 }, { "epoch": 0.14786967418546365, "grad_norm": 15.75, "learning_rate": 1.9860799048243535e-06, "loss": 2.3119, "step": 5015 }, { "epoch": 0.14801710157747308, "grad_norm": 17.125, "learning_rate": 1.9859942061050965e-06, "loss": 2.3144, "step": 5020 }, { "epoch": 0.14816452896948254, "grad_norm": 13.5625, "learning_rate": 1.985908246254093e-06, "loss": 2.166, "step": 5025 }, { "epoch": 0.14831195636149197, "grad_norm": 15.625, "learning_rate": 1.985822025294108e-06, "loss": 2.2456, "step": 5030 }, { "epoch": 0.1484593837535014, "grad_norm": 15.3125, "learning_rate": 1.9857355432479763e-06, "loss": 2.3008, "step": 5035 }, { "epoch": 0.14860681114551083, "grad_norm": 16.25, "learning_rate": 1.9856488001386026e-06, "loss": 2.3406, "step": 5040 }, { "epoch": 0.14875423853752026, "grad_norm": 15.8125, "learning_rate": 1.9855617959889598e-06, "loss": 2.3188, "step": 5045 }, { "epoch": 0.14890166592952972, "grad_norm": 19.25, "learning_rate": 1.9854745308220895e-06, "loss": 2.3037, "step": 5050 }, { "epoch": 0.14904909332153915, "grad_norm": 16.875, "learning_rate": 1.985387004661104e-06, "loss": 2.2972, "step": 5055 }, { "epoch": 0.14919652071354858, "grad_norm": 15.25, "learning_rate": 1.9852992175291837e-06, "loss": 2.3653, "step": 5060 }, { "epoch": 0.149343948105558, "grad_norm": 18.375, "learning_rate": 1.985211169449578e-06, "loss": 2.3043, "step": 5065 }, { "epoch": 0.14949137549756744, "grad_norm": 15.4375, "learning_rate": 1.9851228604456056e-06, "loss": 2.3394, "step": 5070 }, { "epoch": 0.14963880288957687, "grad_norm": 15.125, "learning_rate": 1.985034290540654e-06, "loss": 2.2842, "step": 5075 }, { "epoch": 0.14978623028158633, "grad_norm": 13.5, "learning_rate": 1.984945459758181e-06, "loss": 2.2723, "step": 5080 }, { "epoch": 0.14993365767359576, "grad_norm": 15.25, "learning_rate": 1.984856368121712e-06, "loss": 2.2761, "step": 5085 }, { "epoch": 0.1500810850656052, "grad_norm": 13.875, "learning_rate": 1.9847670156548424e-06, "loss": 2.2788, "step": 5090 }, { "epoch": 0.15022851245761462, "grad_norm": 15.0625, "learning_rate": 1.984677402381236e-06, "loss": 2.3517, "step": 5095 }, { "epoch": 0.15037593984962405, "grad_norm": 14.125, "learning_rate": 1.9845875283246267e-06, "loss": 2.2871, "step": 5100 }, { "epoch": 0.15052336724163348, "grad_norm": 15.6875, "learning_rate": 1.984497393508817e-06, "loss": 2.3584, "step": 5105 }, { "epoch": 0.15067079463364294, "grad_norm": 15.625, "learning_rate": 1.984406997957677e-06, "loss": 2.4279, "step": 5110 }, { "epoch": 0.15081822202565237, "grad_norm": 16.0, "learning_rate": 1.984316341695148e-06, "loss": 2.3569, "step": 5115 }, { "epoch": 0.1509656494176618, "grad_norm": 26.0, "learning_rate": 1.9842254247452402e-06, "loss": 2.2929, "step": 5120 }, { "epoch": 0.15111307680967123, "grad_norm": 12.625, "learning_rate": 1.984134247132031e-06, "loss": 2.2725, "step": 5125 }, { "epoch": 0.15126050420168066, "grad_norm": 15.1875, "learning_rate": 1.984042808879668e-06, "loss": 2.2815, "step": 5130 }, { "epoch": 0.15140793159369012, "grad_norm": 17.5, "learning_rate": 1.983951110012369e-06, "loss": 2.2957, "step": 5135 }, { "epoch": 0.15155535898569955, "grad_norm": 12.375, "learning_rate": 1.9838591505544182e-06, "loss": 2.2805, "step": 5140 }, { "epoch": 0.15170278637770898, "grad_norm": 16.75, "learning_rate": 1.983766930530171e-06, "loss": 2.2924, "step": 5145 }, { "epoch": 0.1518502137697184, "grad_norm": 14.9375, "learning_rate": 1.9836744499640515e-06, "loss": 2.2116, "step": 5150 }, { "epoch": 0.15199764116172784, "grad_norm": 15.125, "learning_rate": 1.983581708880551e-06, "loss": 2.3455, "step": 5155 }, { "epoch": 0.15214506855373727, "grad_norm": 17.5, "learning_rate": 1.983488707304232e-06, "loss": 2.35, "step": 5160 }, { "epoch": 0.15229249594574673, "grad_norm": 14.625, "learning_rate": 1.9833954452597255e-06, "loss": 2.2939, "step": 5165 }, { "epoch": 0.15243992333775616, "grad_norm": 14.75, "learning_rate": 1.9833019227717306e-06, "loss": 2.3036, "step": 5170 }, { "epoch": 0.1525873507297656, "grad_norm": 14.75, "learning_rate": 1.9832081398650158e-06, "loss": 2.353, "step": 5175 }, { "epoch": 0.15273477812177502, "grad_norm": 13.375, "learning_rate": 1.9831140965644187e-06, "loss": 2.1931, "step": 5180 }, { "epoch": 0.15288220551378445, "grad_norm": 16.0, "learning_rate": 1.9830197928948464e-06, "loss": 2.3679, "step": 5185 }, { "epoch": 0.15302963290579388, "grad_norm": 13.6875, "learning_rate": 1.9829252288812735e-06, "loss": 2.1672, "step": 5190 }, { "epoch": 0.15317706029780334, "grad_norm": 14.5, "learning_rate": 1.982830404548745e-06, "loss": 2.1524, "step": 5195 }, { "epoch": 0.15332448768981277, "grad_norm": 17.5, "learning_rate": 1.9827353199223744e-06, "loss": 2.4355, "step": 5200 }, { "epoch": 0.1534719150818222, "grad_norm": 15.0625, "learning_rate": 1.9826399750273432e-06, "loss": 2.3482, "step": 5205 }, { "epoch": 0.15361934247383163, "grad_norm": 16.875, "learning_rate": 1.9825443698889035e-06, "loss": 2.3487, "step": 5210 }, { "epoch": 0.15376676986584106, "grad_norm": 12.3125, "learning_rate": 1.982448504532375e-06, "loss": 2.2663, "step": 5215 }, { "epoch": 0.15391419725785052, "grad_norm": 14.6875, "learning_rate": 1.9823523789831474e-06, "loss": 2.3294, "step": 5220 }, { "epoch": 0.15406162464985995, "grad_norm": 16.25, "learning_rate": 1.982255993266678e-06, "loss": 2.2712, "step": 5225 }, { "epoch": 0.15420905204186938, "grad_norm": 14.9375, "learning_rate": 1.9821593474084938e-06, "loss": 2.3677, "step": 5230 }, { "epoch": 0.1543564794338788, "grad_norm": 14.5, "learning_rate": 1.982062441434191e-06, "loss": 2.2574, "step": 5235 }, { "epoch": 0.15450390682588824, "grad_norm": 49.0, "learning_rate": 1.9819652753694336e-06, "loss": 2.209, "step": 5240 }, { "epoch": 0.15465133421789767, "grad_norm": 12.25, "learning_rate": 1.9818678492399557e-06, "loss": 2.2998, "step": 5245 }, { "epoch": 0.15479876160990713, "grad_norm": 17.125, "learning_rate": 1.98177016307156e-06, "loss": 2.1183, "step": 5250 }, { "epoch": 0.15494618900191656, "grad_norm": 15.625, "learning_rate": 1.981672216890117e-06, "loss": 2.2239, "step": 5255 }, { "epoch": 0.155093616393926, "grad_norm": 16.375, "learning_rate": 1.9815740107215676e-06, "loss": 2.2356, "step": 5260 }, { "epoch": 0.15524104378593542, "grad_norm": 16.875, "learning_rate": 1.9814755445919204e-06, "loss": 2.3464, "step": 5265 }, { "epoch": 0.15538847117794485, "grad_norm": 14.6875, "learning_rate": 1.9813768185272536e-06, "loss": 2.3359, "step": 5270 }, { "epoch": 0.15553589856995428, "grad_norm": 15.25, "learning_rate": 1.981277832553713e-06, "loss": 2.3663, "step": 5275 }, { "epoch": 0.15568332596196374, "grad_norm": 15.9375, "learning_rate": 1.9811785866975153e-06, "loss": 2.2262, "step": 5280 }, { "epoch": 0.15583075335397317, "grad_norm": 15.375, "learning_rate": 1.9810790809849446e-06, "loss": 2.2454, "step": 5285 }, { "epoch": 0.1559781807459826, "grad_norm": 15.9375, "learning_rate": 1.980979315442354e-06, "loss": 2.289, "step": 5290 }, { "epoch": 0.15612560813799203, "grad_norm": 15.375, "learning_rate": 1.980879290096165e-06, "loss": 2.2416, "step": 5295 }, { "epoch": 0.15627303553000146, "grad_norm": 12.9375, "learning_rate": 1.9807790049728692e-06, "loss": 2.2994, "step": 5300 }, { "epoch": 0.15642046292201092, "grad_norm": 14.5625, "learning_rate": 1.9806784600990255e-06, "loss": 2.3597, "step": 5305 }, { "epoch": 0.15656789031402035, "grad_norm": 21.625, "learning_rate": 1.980577655501263e-06, "loss": 2.3614, "step": 5310 }, { "epoch": 0.15671531770602978, "grad_norm": 12.375, "learning_rate": 1.9804765912062786e-06, "loss": 2.2174, "step": 5315 }, { "epoch": 0.1568627450980392, "grad_norm": 13.4375, "learning_rate": 1.9803752672408385e-06, "loss": 2.2548, "step": 5320 }, { "epoch": 0.15701017249004864, "grad_norm": 15.5625, "learning_rate": 1.9802736836317767e-06, "loss": 2.3221, "step": 5325 }, { "epoch": 0.15715759988205807, "grad_norm": 19.25, "learning_rate": 1.9801718404059973e-06, "loss": 2.4205, "step": 5330 }, { "epoch": 0.15730502727406753, "grad_norm": 15.0625, "learning_rate": 1.9800697375904727e-06, "loss": 2.3176, "step": 5335 }, { "epoch": 0.15745245466607696, "grad_norm": 15.0, "learning_rate": 1.9799673752122436e-06, "loss": 2.3065, "step": 5340 }, { "epoch": 0.1575998820580864, "grad_norm": 37.5, "learning_rate": 1.9798647532984197e-06, "loss": 2.3744, "step": 5345 }, { "epoch": 0.15774730945009582, "grad_norm": 15.875, "learning_rate": 1.97976187187618e-06, "loss": 2.222, "step": 5350 }, { "epoch": 0.15789473684210525, "grad_norm": 22.0, "learning_rate": 1.979658730972771e-06, "loss": 2.2452, "step": 5355 }, { "epoch": 0.15804216423411468, "grad_norm": 16.5, "learning_rate": 1.9795553306155096e-06, "loss": 2.3767, "step": 5360 }, { "epoch": 0.15818959162612414, "grad_norm": 15.875, "learning_rate": 1.9794516708317792e-06, "loss": 2.2898, "step": 5365 }, { "epoch": 0.15833701901813357, "grad_norm": 15.1875, "learning_rate": 1.9793477516490343e-06, "loss": 2.3616, "step": 5370 }, { "epoch": 0.158484446410143, "grad_norm": 15.375, "learning_rate": 1.979243573094796e-06, "loss": 2.2351, "step": 5375 }, { "epoch": 0.15863187380215243, "grad_norm": 15.4375, "learning_rate": 1.979139135196656e-06, "loss": 2.3049, "step": 5380 }, { "epoch": 0.15877930119416186, "grad_norm": 16.625, "learning_rate": 1.9790344379822735e-06, "loss": 2.3058, "step": 5385 }, { "epoch": 0.15892672858617132, "grad_norm": 19.75, "learning_rate": 1.978929481479376e-06, "loss": 2.4101, "step": 5390 }, { "epoch": 0.15907415597818075, "grad_norm": 17.25, "learning_rate": 1.9788242657157613e-06, "loss": 2.2751, "step": 5395 }, { "epoch": 0.15922158337019018, "grad_norm": 16.0, "learning_rate": 1.9787187907192936e-06, "loss": 2.253, "step": 5400 }, { "epoch": 0.15936901076219961, "grad_norm": 15.5, "learning_rate": 1.978613056517908e-06, "loss": 2.3013, "step": 5405 }, { "epoch": 0.15951643815420904, "grad_norm": 15.4375, "learning_rate": 1.9785070631396072e-06, "loss": 2.2712, "step": 5410 }, { "epoch": 0.15966386554621848, "grad_norm": 31.0, "learning_rate": 1.978400810612462e-06, "loss": 2.3801, "step": 5415 }, { "epoch": 0.15981129293822793, "grad_norm": 12.6875, "learning_rate": 1.978294298964613e-06, "loss": 2.1809, "step": 5420 }, { "epoch": 0.15995872033023736, "grad_norm": 15.5625, "learning_rate": 1.978187528224269e-06, "loss": 2.1765, "step": 5425 }, { "epoch": 0.1601061477222468, "grad_norm": 13.875, "learning_rate": 1.978080498419706e-06, "loss": 2.2463, "step": 5430 }, { "epoch": 0.16025357511425622, "grad_norm": 16.75, "learning_rate": 1.9779732095792715e-06, "loss": 2.1921, "step": 5435 }, { "epoch": 0.16040100250626566, "grad_norm": 15.5625, "learning_rate": 1.977865661731379e-06, "loss": 2.3557, "step": 5440 }, { "epoch": 0.16054842989827509, "grad_norm": 12.0, "learning_rate": 1.977757854904512e-06, "loss": 2.2406, "step": 5445 }, { "epoch": 0.16069585729028454, "grad_norm": 16.5, "learning_rate": 1.9776497891272222e-06, "loss": 2.2514, "step": 5450 }, { "epoch": 0.16084328468229397, "grad_norm": 15.3125, "learning_rate": 1.9775414644281296e-06, "loss": 2.3495, "step": 5455 }, { "epoch": 0.1609907120743034, "grad_norm": 17.5, "learning_rate": 1.977432880835923e-06, "loss": 2.3667, "step": 5460 }, { "epoch": 0.16113813946631284, "grad_norm": 14.4375, "learning_rate": 1.97732403837936e-06, "loss": 2.2498, "step": 5465 }, { "epoch": 0.16128556685832227, "grad_norm": 15.5, "learning_rate": 1.9772149370872666e-06, "loss": 2.2269, "step": 5470 }, { "epoch": 0.16143299425033172, "grad_norm": 16.125, "learning_rate": 1.9771055769885366e-06, "loss": 2.3518, "step": 5475 }, { "epoch": 0.16158042164234115, "grad_norm": 14.625, "learning_rate": 1.9769959581121343e-06, "loss": 2.1878, "step": 5480 }, { "epoch": 0.16172784903435058, "grad_norm": 24.0, "learning_rate": 1.9768860804870905e-06, "loss": 2.3007, "step": 5485 }, { "epoch": 0.16187527642636002, "grad_norm": 15.0625, "learning_rate": 1.9767759441425055e-06, "loss": 2.3047, "step": 5490 }, { "epoch": 0.16202270381836945, "grad_norm": 25.25, "learning_rate": 1.9766655491075473e-06, "loss": 2.2359, "step": 5495 }, { "epoch": 0.16217013121037888, "grad_norm": 16.375, "learning_rate": 1.976554895411454e-06, "loss": 2.3287, "step": 5500 }, { "epoch": 0.16217013121037888, "eval_loss": 2.275634765625, "eval_runtime": 4.7203, "eval_samples_per_second": 83.894, "eval_steps_per_second": 2.754, "step": 5500 }, { "epoch": 0.16231755860238833, "grad_norm": 16.375, "learning_rate": 1.976443983083531e-06, "loss": 2.1762, "step": 5505 }, { "epoch": 0.16246498599439776, "grad_norm": 15.625, "learning_rate": 1.9763328121531517e-06, "loss": 2.3403, "step": 5510 }, { "epoch": 0.1626124133864072, "grad_norm": 29.125, "learning_rate": 1.9762213826497595e-06, "loss": 2.2859, "step": 5515 }, { "epoch": 0.16275984077841663, "grad_norm": 20.25, "learning_rate": 1.9761096946028654e-06, "loss": 2.2856, "step": 5520 }, { "epoch": 0.16290726817042606, "grad_norm": 14.0625, "learning_rate": 1.9759977480420485e-06, "loss": 2.282, "step": 5525 }, { "epoch": 0.16305469556243551, "grad_norm": 17.25, "learning_rate": 1.975885542996958e-06, "loss": 2.3487, "step": 5530 }, { "epoch": 0.16320212295444494, "grad_norm": 15.5625, "learning_rate": 1.9757730794973088e-06, "loss": 2.2834, "step": 5535 }, { "epoch": 0.16334955034645438, "grad_norm": 15.875, "learning_rate": 1.975660357572887e-06, "loss": 2.3061, "step": 5540 }, { "epoch": 0.1634969777384638, "grad_norm": 14.875, "learning_rate": 1.975547377253546e-06, "loss": 2.3091, "step": 5545 }, { "epoch": 0.16364440513047324, "grad_norm": 18.75, "learning_rate": 1.9754341385692067e-06, "loss": 2.3004, "step": 5550 }, { "epoch": 0.16379183252248267, "grad_norm": 15.5625, "learning_rate": 1.97532064154986e-06, "loss": 2.2878, "step": 5555 }, { "epoch": 0.16393925991449212, "grad_norm": 14.125, "learning_rate": 1.975206886225565e-06, "loss": 2.2775, "step": 5560 }, { "epoch": 0.16408668730650156, "grad_norm": 16.0, "learning_rate": 1.975092872626448e-06, "loss": 2.273, "step": 5565 }, { "epoch": 0.16423411469851099, "grad_norm": 14.75, "learning_rate": 1.9749786007827046e-06, "loss": 2.2756, "step": 5570 }, { "epoch": 0.16438154209052042, "grad_norm": 20.625, "learning_rate": 1.9748640707245992e-06, "loss": 2.2598, "step": 5575 }, { "epoch": 0.16452896948252985, "grad_norm": 24.25, "learning_rate": 1.9747492824824638e-06, "loss": 2.2965, "step": 5580 }, { "epoch": 0.16467639687453928, "grad_norm": 25.375, "learning_rate": 1.9746342360866987e-06, "loss": 2.2723, "step": 5585 }, { "epoch": 0.16482382426654874, "grad_norm": 17.625, "learning_rate": 1.974518931567773e-06, "loss": 2.4182, "step": 5590 }, { "epoch": 0.16497125165855817, "grad_norm": 13.25, "learning_rate": 1.9744033689562244e-06, "loss": 2.3571, "step": 5595 }, { "epoch": 0.1651186790505676, "grad_norm": 15.4375, "learning_rate": 1.9742875482826583e-06, "loss": 2.2016, "step": 5600 }, { "epoch": 0.16526610644257703, "grad_norm": 15.1875, "learning_rate": 1.974171469577749e-06, "loss": 2.1913, "step": 5605 }, { "epoch": 0.16541353383458646, "grad_norm": 15.125, "learning_rate": 1.9740551328722387e-06, "loss": 2.3939, "step": 5610 }, { "epoch": 0.16556096122659592, "grad_norm": 16.125, "learning_rate": 1.9739385381969387e-06, "loss": 2.31, "step": 5615 }, { "epoch": 0.16570838861860535, "grad_norm": 14.375, "learning_rate": 1.9738216855827276e-06, "loss": 2.3633, "step": 5620 }, { "epoch": 0.16585581601061478, "grad_norm": 15.5625, "learning_rate": 1.9737045750605523e-06, "loss": 2.297, "step": 5625 }, { "epoch": 0.1660032434026242, "grad_norm": 17.125, "learning_rate": 1.9735872066614294e-06, "loss": 2.3581, "step": 5630 }, { "epoch": 0.16615067079463364, "grad_norm": 17.0, "learning_rate": 1.973469580416442e-06, "loss": 2.2865, "step": 5635 }, { "epoch": 0.16629809818664307, "grad_norm": 13.375, "learning_rate": 1.9733516963567433e-06, "loss": 2.245, "step": 5640 }, { "epoch": 0.16644552557865253, "grad_norm": 17.625, "learning_rate": 1.973233554513553e-06, "loss": 2.3806, "step": 5645 }, { "epoch": 0.16659295297066196, "grad_norm": 15.4375, "learning_rate": 1.9731151549181603e-06, "loss": 2.204, "step": 5650 }, { "epoch": 0.1667403803626714, "grad_norm": 15.375, "learning_rate": 1.9729964976019223e-06, "loss": 2.1747, "step": 5655 }, { "epoch": 0.16688780775468082, "grad_norm": 15.25, "learning_rate": 1.972877582596264e-06, "loss": 2.134, "step": 5660 }, { "epoch": 0.16703523514669025, "grad_norm": 15.8125, "learning_rate": 1.9727584099326796e-06, "loss": 2.3692, "step": 5665 }, { "epoch": 0.16718266253869968, "grad_norm": 18.375, "learning_rate": 1.9726389796427303e-06, "loss": 2.3848, "step": 5670 }, { "epoch": 0.16733008993070914, "grad_norm": 17.875, "learning_rate": 1.9725192917580466e-06, "loss": 2.2684, "step": 5675 }, { "epoch": 0.16747751732271857, "grad_norm": 15.5625, "learning_rate": 1.9723993463103265e-06, "loss": 2.3341, "step": 5680 }, { "epoch": 0.167624944714728, "grad_norm": 15.25, "learning_rate": 1.9722791433313364e-06, "loss": 2.3528, "step": 5685 }, { "epoch": 0.16777237210673743, "grad_norm": 16.75, "learning_rate": 1.972158682852911e-06, "loss": 2.2617, "step": 5690 }, { "epoch": 0.16791979949874686, "grad_norm": 10.9375, "learning_rate": 1.9720379649069537e-06, "loss": 2.2348, "step": 5695 }, { "epoch": 0.16806722689075632, "grad_norm": 13.0625, "learning_rate": 1.9719169895254347e-06, "loss": 2.2609, "step": 5700 }, { "epoch": 0.16821465428276575, "grad_norm": 15.3125, "learning_rate": 1.971795756740394e-06, "loss": 2.3166, "step": 5705 }, { "epoch": 0.16836208167477518, "grad_norm": 13.9375, "learning_rate": 1.9716742665839387e-06, "loss": 2.1999, "step": 5710 }, { "epoch": 0.1685095090667846, "grad_norm": 15.0625, "learning_rate": 1.9715525190882444e-06, "loss": 2.2247, "step": 5715 }, { "epoch": 0.16865693645879404, "grad_norm": 15.625, "learning_rate": 1.9714305142855545e-06, "loss": 2.2155, "step": 5720 }, { "epoch": 0.16880436385080347, "grad_norm": 18.125, "learning_rate": 1.971308252208182e-06, "loss": 2.2774, "step": 5725 }, { "epoch": 0.16895179124281293, "grad_norm": 17.5, "learning_rate": 1.9711857328885056e-06, "loss": 2.3131, "step": 5730 }, { "epoch": 0.16909921863482236, "grad_norm": 14.75, "learning_rate": 1.971062956358974e-06, "loss": 2.1557, "step": 5735 }, { "epoch": 0.1692466460268318, "grad_norm": 19.0, "learning_rate": 1.9709399226521034e-06, "loss": 2.3339, "step": 5740 }, { "epoch": 0.16939407341884122, "grad_norm": 15.1875, "learning_rate": 1.9708166318004785e-06, "loss": 2.2882, "step": 5745 }, { "epoch": 0.16954150081085065, "grad_norm": 20.375, "learning_rate": 1.9706930838367513e-06, "loss": 2.2205, "step": 5750 }, { "epoch": 0.16968892820286008, "grad_norm": 13.625, "learning_rate": 1.9705692787936427e-06, "loss": 2.2539, "step": 5755 }, { "epoch": 0.16983635559486954, "grad_norm": 13.5625, "learning_rate": 1.970445216703941e-06, "loss": 2.2197, "step": 5760 }, { "epoch": 0.16998378298687897, "grad_norm": 15.375, "learning_rate": 1.9703208976005035e-06, "loss": 2.3474, "step": 5765 }, { "epoch": 0.1701312103788884, "grad_norm": 15.125, "learning_rate": 1.9701963215162546e-06, "loss": 2.2846, "step": 5770 }, { "epoch": 0.17027863777089783, "grad_norm": 13.875, "learning_rate": 1.9700714884841872e-06, "loss": 2.2496, "step": 5775 }, { "epoch": 0.17042606516290726, "grad_norm": 14.125, "learning_rate": 1.9699463985373623e-06, "loss": 2.2557, "step": 5780 }, { "epoch": 0.17057349255491672, "grad_norm": 15.8125, "learning_rate": 1.9698210517089085e-06, "loss": 2.3008, "step": 5785 }, { "epoch": 0.17072091994692615, "grad_norm": 17.125, "learning_rate": 1.9696954480320237e-06, "loss": 2.3636, "step": 5790 }, { "epoch": 0.17086834733893558, "grad_norm": 16.0, "learning_rate": 1.9695695875399717e-06, "loss": 2.3082, "step": 5795 }, { "epoch": 0.171015774730945, "grad_norm": 14.0625, "learning_rate": 1.9694434702660866e-06, "loss": 2.1362, "step": 5800 }, { "epoch": 0.17116320212295444, "grad_norm": 18.625, "learning_rate": 1.9693170962437686e-06, "loss": 2.196, "step": 5805 }, { "epoch": 0.17131062951496387, "grad_norm": 13.3125, "learning_rate": 1.9691904655064873e-06, "loss": 2.2226, "step": 5810 }, { "epoch": 0.17145805690697333, "grad_norm": 15.8125, "learning_rate": 1.9690635780877794e-06, "loss": 2.2897, "step": 5815 }, { "epoch": 0.17160548429898276, "grad_norm": 15.5, "learning_rate": 1.96893643402125e-06, "loss": 2.3065, "step": 5820 }, { "epoch": 0.1717529116909922, "grad_norm": 16.5, "learning_rate": 1.968809033340572e-06, "loss": 2.2296, "step": 5825 }, { "epoch": 0.17190033908300162, "grad_norm": 12.5625, "learning_rate": 1.9686813760794865e-06, "loss": 2.3175, "step": 5830 }, { "epoch": 0.17204776647501105, "grad_norm": 16.125, "learning_rate": 1.9685534622718023e-06, "loss": 2.2041, "step": 5835 }, { "epoch": 0.17219519386702048, "grad_norm": 18.5, "learning_rate": 1.9684252919513963e-06, "loss": 2.2254, "step": 5840 }, { "epoch": 0.17234262125902994, "grad_norm": 16.5, "learning_rate": 1.9682968651522133e-06, "loss": 2.3361, "step": 5845 }, { "epoch": 0.17249004865103937, "grad_norm": 13.625, "learning_rate": 1.9681681819082655e-06, "loss": 2.275, "step": 5850 }, { "epoch": 0.1726374760430488, "grad_norm": 13.9375, "learning_rate": 1.968039242253634e-06, "loss": 2.199, "step": 5855 }, { "epoch": 0.17278490343505823, "grad_norm": 20.5, "learning_rate": 1.9679100462224673e-06, "loss": 2.2255, "step": 5860 }, { "epoch": 0.17293233082706766, "grad_norm": 19.75, "learning_rate": 1.967780593848982e-06, "loss": 2.3403, "step": 5865 }, { "epoch": 0.17307975821907712, "grad_norm": 18.75, "learning_rate": 1.9676508851674616e-06, "loss": 2.2646, "step": 5870 }, { "epoch": 0.17322718561108655, "grad_norm": 15.625, "learning_rate": 1.9675209202122587e-06, "loss": 2.3766, "step": 5875 }, { "epoch": 0.17337461300309598, "grad_norm": 14.3125, "learning_rate": 1.967390699017794e-06, "loss": 2.266, "step": 5880 }, { "epoch": 0.1735220403951054, "grad_norm": 32.5, "learning_rate": 1.9672602216185545e-06, "loss": 2.1764, "step": 5885 }, { "epoch": 0.17366946778711484, "grad_norm": 15.4375, "learning_rate": 1.9671294880490966e-06, "loss": 2.1586, "step": 5890 }, { "epoch": 0.17381689517912427, "grad_norm": 15.0, "learning_rate": 1.9669984983440434e-06, "loss": 2.2782, "step": 5895 }, { "epoch": 0.17396432257113373, "grad_norm": 19.125, "learning_rate": 1.9668672525380865e-06, "loss": 2.1917, "step": 5900 }, { "epoch": 0.17411174996314316, "grad_norm": 14.625, "learning_rate": 1.9667357506659856e-06, "loss": 2.29, "step": 5905 }, { "epoch": 0.1742591773551526, "grad_norm": 13.9375, "learning_rate": 1.9666039927625673e-06, "loss": 2.2455, "step": 5910 }, { "epoch": 0.17440660474716202, "grad_norm": 16.5, "learning_rate": 1.9664719788627267e-06, "loss": 2.1826, "step": 5915 }, { "epoch": 0.17455403213917145, "grad_norm": 14.1875, "learning_rate": 1.9663397090014265e-06, "loss": 2.2362, "step": 5920 }, { "epoch": 0.17470145953118088, "grad_norm": 20.75, "learning_rate": 1.9662071832136973e-06, "loss": 2.2673, "step": 5925 }, { "epoch": 0.17484888692319034, "grad_norm": 17.625, "learning_rate": 1.966074401534637e-06, "loss": 2.2222, "step": 5930 }, { "epoch": 0.17499631431519977, "grad_norm": 18.875, "learning_rate": 1.9659413639994124e-06, "loss": 2.2023, "step": 5935 }, { "epoch": 0.1751437417072092, "grad_norm": 16.125, "learning_rate": 1.965808070643256e-06, "loss": 2.2542, "step": 5940 }, { "epoch": 0.17529116909921863, "grad_norm": 17.0, "learning_rate": 1.965674521501471e-06, "loss": 2.2869, "step": 5945 }, { "epoch": 0.17543859649122806, "grad_norm": 16.75, "learning_rate": 1.965540716609425e-06, "loss": 2.2464, "step": 5950 }, { "epoch": 0.17558602388323752, "grad_norm": 14.75, "learning_rate": 1.965406656002556e-06, "loss": 2.242, "step": 5955 }, { "epoch": 0.17573345127524695, "grad_norm": 12.5, "learning_rate": 1.965272339716369e-06, "loss": 2.2244, "step": 5960 }, { "epoch": 0.17588087866725638, "grad_norm": 17.75, "learning_rate": 1.965137767786436e-06, "loss": 2.2567, "step": 5965 }, { "epoch": 0.1760283060592658, "grad_norm": 15.0625, "learning_rate": 1.9650029402483974e-06, "loss": 2.258, "step": 5970 }, { "epoch": 0.17617573345127524, "grad_norm": 18.375, "learning_rate": 1.9648678571379603e-06, "loss": 2.268, "step": 5975 }, { "epoch": 0.17632316084328467, "grad_norm": 14.8125, "learning_rate": 1.9647325184909014e-06, "loss": 2.2409, "step": 5980 }, { "epoch": 0.17647058823529413, "grad_norm": 15.75, "learning_rate": 1.9645969243430632e-06, "loss": 2.3359, "step": 5985 }, { "epoch": 0.17661801562730356, "grad_norm": 17.625, "learning_rate": 1.9644610747303567e-06, "loss": 2.3641, "step": 5990 }, { "epoch": 0.176765443019313, "grad_norm": 17.125, "learning_rate": 1.9643249696887613e-06, "loss": 2.2575, "step": 5995 }, { "epoch": 0.17691287041132242, "grad_norm": 15.5625, "learning_rate": 1.9641886092543215e-06, "loss": 2.3008, "step": 6000 }, { "epoch": 0.17691287041132242, "eval_loss": 2.2403786182403564, "eval_runtime": 4.7252, "eval_samples_per_second": 83.807, "eval_steps_per_second": 2.751, "step": 6000 }, { "epoch": 0.17706029780333185, "grad_norm": 13.25, "learning_rate": 1.9640519934631527e-06, "loss": 2.1941, "step": 6005 }, { "epoch": 0.17720772519534128, "grad_norm": 14.375, "learning_rate": 1.9639151223514356e-06, "loss": 2.3091, "step": 6010 }, { "epoch": 0.17735515258735074, "grad_norm": 19.375, "learning_rate": 1.9637779959554193e-06, "loss": 2.3316, "step": 6015 }, { "epoch": 0.17750257997936017, "grad_norm": 14.75, "learning_rate": 1.963640614311421e-06, "loss": 2.3345, "step": 6020 }, { "epoch": 0.1776500073713696, "grad_norm": 17.125, "learning_rate": 1.9635029774558245e-06, "loss": 2.1962, "step": 6025 }, { "epoch": 0.17779743476337903, "grad_norm": 15.1875, "learning_rate": 1.9633650854250818e-06, "loss": 2.2806, "step": 6030 }, { "epoch": 0.17794486215538846, "grad_norm": 17.25, "learning_rate": 1.9632269382557123e-06, "loss": 2.2432, "step": 6035 }, { "epoch": 0.17809228954739792, "grad_norm": 14.5625, "learning_rate": 1.9630885359843034e-06, "loss": 2.2453, "step": 6040 }, { "epoch": 0.17823971693940735, "grad_norm": 13.25, "learning_rate": 1.9629498786475094e-06, "loss": 2.328, "step": 6045 }, { "epoch": 0.17838714433141678, "grad_norm": 17.0, "learning_rate": 1.9628109662820525e-06, "loss": 2.2208, "step": 6050 }, { "epoch": 0.1785345717234262, "grad_norm": 14.8125, "learning_rate": 1.9626717989247222e-06, "loss": 2.2077, "step": 6055 }, { "epoch": 0.17868199911543564, "grad_norm": 14.75, "learning_rate": 1.9625323766123764e-06, "loss": 2.2507, "step": 6060 }, { "epoch": 0.17882942650744507, "grad_norm": 14.3125, "learning_rate": 1.9623926993819394e-06, "loss": 2.2847, "step": 6065 }, { "epoch": 0.17897685389945453, "grad_norm": 16.0, "learning_rate": 1.962252767270403e-06, "loss": 2.2177, "step": 6070 }, { "epoch": 0.17912428129146396, "grad_norm": 17.875, "learning_rate": 1.9621125803148275e-06, "loss": 2.2478, "step": 6075 }, { "epoch": 0.1792717086834734, "grad_norm": 14.875, "learning_rate": 1.9619721385523404e-06, "loss": 2.1835, "step": 6080 }, { "epoch": 0.17941913607548282, "grad_norm": 18.125, "learning_rate": 1.961831442020136e-06, "loss": 2.3067, "step": 6085 }, { "epoch": 0.17956656346749225, "grad_norm": 17.125, "learning_rate": 1.961690490755477e-06, "loss": 2.2633, "step": 6090 }, { "epoch": 0.17971399085950168, "grad_norm": 14.1875, "learning_rate": 1.961549284795692e-06, "loss": 2.2044, "step": 6095 }, { "epoch": 0.17986141825151114, "grad_norm": 13.0, "learning_rate": 1.9614078241781797e-06, "loss": 2.1931, "step": 6100 }, { "epoch": 0.18000884564352057, "grad_norm": 17.125, "learning_rate": 1.961266108940403e-06, "loss": 2.2065, "step": 6105 }, { "epoch": 0.18015627303553, "grad_norm": 13.8125, "learning_rate": 1.9611241391198956e-06, "loss": 2.2238, "step": 6110 }, { "epoch": 0.18030370042753943, "grad_norm": 15.1875, "learning_rate": 1.9609819147542555e-06, "loss": 2.2079, "step": 6115 }, { "epoch": 0.18045112781954886, "grad_norm": 13.875, "learning_rate": 1.9608394358811505e-06, "loss": 2.176, "step": 6120 }, { "epoch": 0.18059855521155832, "grad_norm": 15.5, "learning_rate": 1.9606967025383147e-06, "loss": 2.3294, "step": 6125 }, { "epoch": 0.18074598260356775, "grad_norm": 13.375, "learning_rate": 1.9605537147635493e-06, "loss": 2.1568, "step": 6130 }, { "epoch": 0.18089340999557718, "grad_norm": 12.8125, "learning_rate": 1.960410472594723e-06, "loss": 2.1856, "step": 6135 }, { "epoch": 0.1810408373875866, "grad_norm": 14.6875, "learning_rate": 1.9602669760697737e-06, "loss": 2.3059, "step": 6140 }, { "epoch": 0.18118826477959604, "grad_norm": 15.875, "learning_rate": 1.960123225226704e-06, "loss": 2.305, "step": 6145 }, { "epoch": 0.18133569217160547, "grad_norm": 16.375, "learning_rate": 1.9599792201035852e-06, "loss": 2.3145, "step": 6150 }, { "epoch": 0.18148311956361493, "grad_norm": 14.8125, "learning_rate": 1.959834960738556e-06, "loss": 2.2752, "step": 6155 }, { "epoch": 0.18163054695562436, "grad_norm": 14.4375, "learning_rate": 1.9596904471698223e-06, "loss": 2.2703, "step": 6160 }, { "epoch": 0.1817779743476338, "grad_norm": 19.75, "learning_rate": 1.9595456794356564e-06, "loss": 2.1955, "step": 6165 }, { "epoch": 0.18192540173964322, "grad_norm": 19.5, "learning_rate": 1.9594006575743997e-06, "loss": 2.3101, "step": 6170 }, { "epoch": 0.18207282913165265, "grad_norm": 17.0, "learning_rate": 1.9592553816244596e-06, "loss": 2.153, "step": 6175 }, { "epoch": 0.18222025652366208, "grad_norm": 15.375, "learning_rate": 1.959109851624311e-06, "loss": 2.2548, "step": 6180 }, { "epoch": 0.18236768391567154, "grad_norm": 16.25, "learning_rate": 1.9589640676124963e-06, "loss": 2.1377, "step": 6185 }, { "epoch": 0.18251511130768097, "grad_norm": 15.875, "learning_rate": 1.9588180296276254e-06, "loss": 2.2501, "step": 6190 }, { "epoch": 0.1826625386996904, "grad_norm": 14.3125, "learning_rate": 1.9586717377083748e-06, "loss": 2.277, "step": 6195 }, { "epoch": 0.18280996609169983, "grad_norm": 12.0, "learning_rate": 1.9585251918934884e-06, "loss": 2.1835, "step": 6200 }, { "epoch": 0.18295739348370926, "grad_norm": 15.1875, "learning_rate": 1.958378392221778e-06, "loss": 2.0964, "step": 6205 }, { "epoch": 0.18310482087571872, "grad_norm": 15.5625, "learning_rate": 1.958231338732122e-06, "loss": 2.1945, "step": 6210 }, { "epoch": 0.18325224826772815, "grad_norm": 15.375, "learning_rate": 1.9580840314634665e-06, "loss": 2.2639, "step": 6215 }, { "epoch": 0.18339967565973758, "grad_norm": 15.6875, "learning_rate": 1.957936470454824e-06, "loss": 2.2378, "step": 6220 }, { "epoch": 0.183547103051747, "grad_norm": 14.0625, "learning_rate": 1.957788655745275e-06, "loss": 2.2162, "step": 6225 }, { "epoch": 0.18369453044375644, "grad_norm": 14.1875, "learning_rate": 1.9576405873739664e-06, "loss": 2.2687, "step": 6230 }, { "epoch": 0.18384195783576587, "grad_norm": 14.6875, "learning_rate": 1.9574922653801138e-06, "loss": 2.2551, "step": 6235 }, { "epoch": 0.18398938522777533, "grad_norm": 15.875, "learning_rate": 1.957343689802998e-06, "loss": 2.2938, "step": 6240 }, { "epoch": 0.18413681261978476, "grad_norm": 13.4375, "learning_rate": 1.9571948606819687e-06, "loss": 2.2451, "step": 6245 }, { "epoch": 0.1842842400117942, "grad_norm": 17.75, "learning_rate": 1.9570457780564415e-06, "loss": 2.3438, "step": 6250 }, { "epoch": 0.18443166740380362, "grad_norm": 14.375, "learning_rate": 1.956896441965899e-06, "loss": 2.2574, "step": 6255 }, { "epoch": 0.18457909479581305, "grad_norm": 18.625, "learning_rate": 1.956746852449893e-06, "loss": 2.259, "step": 6260 }, { "epoch": 0.18472652218782248, "grad_norm": 16.125, "learning_rate": 1.95659700954804e-06, "loss": 2.2494, "step": 6265 }, { "epoch": 0.18487394957983194, "grad_norm": 21.375, "learning_rate": 1.9564469133000244e-06, "loss": 2.373, "step": 6270 }, { "epoch": 0.18502137697184137, "grad_norm": 13.4375, "learning_rate": 1.9562965637455984e-06, "loss": 2.2845, "step": 6275 }, { "epoch": 0.1851688043638508, "grad_norm": 19.125, "learning_rate": 1.95614596092458e-06, "loss": 2.1535, "step": 6280 }, { "epoch": 0.18531623175586023, "grad_norm": 15.5, "learning_rate": 1.955995104876856e-06, "loss": 2.2633, "step": 6285 }, { "epoch": 0.18546365914786966, "grad_norm": 15.5, "learning_rate": 1.9558439956423788e-06, "loss": 2.3352, "step": 6290 }, { "epoch": 0.18561108653987912, "grad_norm": 19.25, "learning_rate": 1.955692633261168e-06, "loss": 2.3492, "step": 6295 }, { "epoch": 0.18575851393188855, "grad_norm": 15.8125, "learning_rate": 1.9555410177733108e-06, "loss": 2.2918, "step": 6300 }, { "epoch": 0.18590594132389798, "grad_norm": 13.4375, "learning_rate": 1.9553891492189613e-06, "loss": 2.3091, "step": 6305 }, { "epoch": 0.1860533687159074, "grad_norm": 12.6875, "learning_rate": 1.9552370276383406e-06, "loss": 2.2345, "step": 6310 }, { "epoch": 0.18620079610791684, "grad_norm": 15.25, "learning_rate": 1.9550846530717368e-06, "loss": 2.1788, "step": 6315 }, { "epoch": 0.18634822349992627, "grad_norm": 15.8125, "learning_rate": 1.9549320255595044e-06, "loss": 2.2549, "step": 6320 }, { "epoch": 0.18649565089193573, "grad_norm": 13.75, "learning_rate": 1.954779145142066e-06, "loss": 2.2416, "step": 6325 }, { "epoch": 0.18664307828394516, "grad_norm": 12.8125, "learning_rate": 1.9546260118599103e-06, "loss": 2.1621, "step": 6330 }, { "epoch": 0.1867905056759546, "grad_norm": 21.0, "learning_rate": 1.9544726257535936e-06, "loss": 2.3335, "step": 6335 }, { "epoch": 0.18693793306796402, "grad_norm": 16.625, "learning_rate": 1.9543189868637383e-06, "loss": 2.2599, "step": 6340 }, { "epoch": 0.18708536045997345, "grad_norm": 15.25, "learning_rate": 1.954165095231035e-06, "loss": 2.1533, "step": 6345 }, { "epoch": 0.18723278785198288, "grad_norm": 13.9375, "learning_rate": 1.95401095089624e-06, "loss": 2.1882, "step": 6350 }, { "epoch": 0.18738021524399234, "grad_norm": 15.8125, "learning_rate": 1.9538565539001774e-06, "loss": 2.2502, "step": 6355 }, { "epoch": 0.18752764263600177, "grad_norm": 13.5625, "learning_rate": 1.953701904283737e-06, "loss": 2.2742, "step": 6360 }, { "epoch": 0.1876750700280112, "grad_norm": 14.375, "learning_rate": 1.9535470020878776e-06, "loss": 2.2428, "step": 6365 }, { "epoch": 0.18782249742002063, "grad_norm": 13.125, "learning_rate": 1.953391847353623e-06, "loss": 2.1909, "step": 6370 }, { "epoch": 0.18796992481203006, "grad_norm": 20.125, "learning_rate": 1.9532364401220645e-06, "loss": 2.3489, "step": 6375 }, { "epoch": 0.18811735220403952, "grad_norm": 18.0, "learning_rate": 1.9530807804343603e-06, "loss": 2.224, "step": 6380 }, { "epoch": 0.18826477959604895, "grad_norm": 15.0, "learning_rate": 1.952924868331736e-06, "loss": 2.2268, "step": 6385 }, { "epoch": 0.18841220698805838, "grad_norm": 13.625, "learning_rate": 1.9527687038554828e-06, "loss": 2.1472, "step": 6390 }, { "epoch": 0.18855963438006781, "grad_norm": 17.75, "learning_rate": 1.9526122870469603e-06, "loss": 2.3077, "step": 6395 }, { "epoch": 0.18870706177207724, "grad_norm": 14.0, "learning_rate": 1.952455617947593e-06, "loss": 2.3211, "step": 6400 }, { "epoch": 0.18885448916408668, "grad_norm": 16.375, "learning_rate": 1.952298696598874e-06, "loss": 2.2684, "step": 6405 }, { "epoch": 0.18900191655609613, "grad_norm": 18.75, "learning_rate": 1.952141523042363e-06, "loss": 2.3083, "step": 6410 }, { "epoch": 0.18914934394810556, "grad_norm": 15.4375, "learning_rate": 1.951984097319685e-06, "loss": 2.3116, "step": 6415 }, { "epoch": 0.189296771340115, "grad_norm": 13.5, "learning_rate": 1.9518264194725333e-06, "loss": 2.2399, "step": 6420 }, { "epoch": 0.18944419873212442, "grad_norm": 15.5, "learning_rate": 1.9516684895426676e-06, "loss": 2.2946, "step": 6425 }, { "epoch": 0.18959162612413386, "grad_norm": 14.0, "learning_rate": 1.9515103075719133e-06, "loss": 2.2819, "step": 6430 }, { "epoch": 0.18973905351614329, "grad_norm": 17.125, "learning_rate": 1.951351873602165e-06, "loss": 2.3172, "step": 6435 }, { "epoch": 0.18988648090815274, "grad_norm": 13.6875, "learning_rate": 1.9511931876753813e-06, "loss": 2.1765, "step": 6440 }, { "epoch": 0.19003390830016217, "grad_norm": 13.75, "learning_rate": 1.9510342498335893e-06, "loss": 2.1027, "step": 6445 }, { "epoch": 0.1901813356921716, "grad_norm": 13.875, "learning_rate": 1.9508750601188823e-06, "loss": 2.1639, "step": 6450 }, { "epoch": 0.19032876308418104, "grad_norm": 14.3125, "learning_rate": 1.95071561857342e-06, "loss": 2.1655, "step": 6455 }, { "epoch": 0.19047619047619047, "grad_norm": 18.375, "learning_rate": 1.9505559252394292e-06, "loss": 2.2215, "step": 6460 }, { "epoch": 0.19062361786819992, "grad_norm": 15.75, "learning_rate": 1.9503959801592035e-06, "loss": 2.3045, "step": 6465 }, { "epoch": 0.19077104526020935, "grad_norm": 22.25, "learning_rate": 1.950235783375102e-06, "loss": 2.3103, "step": 6470 }, { "epoch": 0.19091847265221878, "grad_norm": 15.8125, "learning_rate": 1.9500753349295524e-06, "loss": 2.2239, "step": 6475 }, { "epoch": 0.19106590004422822, "grad_norm": 13.4375, "learning_rate": 1.9499146348650477e-06, "loss": 2.2765, "step": 6480 }, { "epoch": 0.19121332743623765, "grad_norm": 13.25, "learning_rate": 1.949753683224148e-06, "loss": 2.1889, "step": 6485 }, { "epoch": 0.19136075482824708, "grad_norm": 16.375, "learning_rate": 1.9495924800494796e-06, "loss": 2.1884, "step": 6490 }, { "epoch": 0.19150818222025653, "grad_norm": 16.875, "learning_rate": 1.9494310253837357e-06, "loss": 2.2143, "step": 6495 }, { "epoch": 0.19165560961226596, "grad_norm": 15.625, "learning_rate": 1.9492693192696766e-06, "loss": 2.2323, "step": 6500 }, { "epoch": 0.19165560961226596, "eval_loss": 2.214259147644043, "eval_runtime": 4.7188, "eval_samples_per_second": 83.92, "eval_steps_per_second": 2.755, "step": 6500 }, { "epoch": 0.1918030370042754, "grad_norm": 19.75, "learning_rate": 1.949107361750128e-06, "loss": 2.2991, "step": 6505 }, { "epoch": 0.19195046439628483, "grad_norm": 16.125, "learning_rate": 1.948945152867984e-06, "loss": 2.3435, "step": 6510 }, { "epoch": 0.19209789178829426, "grad_norm": 13.4375, "learning_rate": 1.948782692666203e-06, "loss": 2.2187, "step": 6515 }, { "epoch": 0.19224531918030371, "grad_norm": 15.6875, "learning_rate": 1.9486199811878116e-06, "loss": 2.2188, "step": 6520 }, { "epoch": 0.19239274657231314, "grad_norm": 12.875, "learning_rate": 1.9484570184759027e-06, "loss": 2.2358, "step": 6525 }, { "epoch": 0.19254017396432258, "grad_norm": 18.0, "learning_rate": 1.9482938045736353e-06, "loss": 2.2281, "step": 6530 }, { "epoch": 0.192687601356332, "grad_norm": 13.875, "learning_rate": 1.948130339524235e-06, "loss": 2.2475, "step": 6535 }, { "epoch": 0.19283502874834144, "grad_norm": 15.5625, "learning_rate": 1.9479666233709945e-06, "loss": 2.1642, "step": 6540 }, { "epoch": 0.19298245614035087, "grad_norm": 16.125, "learning_rate": 1.947802656157272e-06, "loss": 2.2591, "step": 6545 }, { "epoch": 0.19312988353236032, "grad_norm": 17.375, "learning_rate": 1.9476384379264933e-06, "loss": 2.2818, "step": 6550 }, { "epoch": 0.19327731092436976, "grad_norm": 21.875, "learning_rate": 1.9474739687221494e-06, "loss": 2.1193, "step": 6555 }, { "epoch": 0.19342473831637919, "grad_norm": 14.4375, "learning_rate": 1.9473092485877994e-06, "loss": 2.1854, "step": 6560 }, { "epoch": 0.19357216570838862, "grad_norm": 14.625, "learning_rate": 1.9471442775670673e-06, "loss": 2.2582, "step": 6565 }, { "epoch": 0.19371959310039805, "grad_norm": 16.75, "learning_rate": 1.9469790557036443e-06, "loss": 2.2225, "step": 6570 }, { "epoch": 0.19386702049240748, "grad_norm": 17.0, "learning_rate": 1.9468135830412886e-06, "loss": 2.2606, "step": 6575 }, { "epoch": 0.19401444788441694, "grad_norm": 16.875, "learning_rate": 1.946647859623823e-06, "loss": 2.1635, "step": 6580 }, { "epoch": 0.19416187527642637, "grad_norm": 14.9375, "learning_rate": 1.9464818854951388e-06, "loss": 2.3414, "step": 6585 }, { "epoch": 0.1943093026684358, "grad_norm": 14.8125, "learning_rate": 1.9463156606991918e-06, "loss": 2.4052, "step": 6590 }, { "epoch": 0.19445673006044523, "grad_norm": 16.5, "learning_rate": 1.9461491852800065e-06, "loss": 2.129, "step": 6595 }, { "epoch": 0.19460415745245466, "grad_norm": 15.9375, "learning_rate": 1.9459824592816716e-06, "loss": 2.3136, "step": 6600 }, { "epoch": 0.19475158484446412, "grad_norm": 15.125, "learning_rate": 1.9458154827483427e-06, "loss": 2.1992, "step": 6605 }, { "epoch": 0.19489901223647355, "grad_norm": 17.375, "learning_rate": 1.9456482557242427e-06, "loss": 2.2089, "step": 6610 }, { "epoch": 0.19504643962848298, "grad_norm": 17.875, "learning_rate": 1.94548077825366e-06, "loss": 2.1713, "step": 6615 }, { "epoch": 0.1951938670204924, "grad_norm": 14.5, "learning_rate": 1.945313050380949e-06, "loss": 2.2417, "step": 6620 }, { "epoch": 0.19534129441250184, "grad_norm": 14.0625, "learning_rate": 1.945145072150532e-06, "loss": 2.3048, "step": 6625 }, { "epoch": 0.19548872180451127, "grad_norm": 16.75, "learning_rate": 1.9449768436068953e-06, "loss": 2.1964, "step": 6630 }, { "epoch": 0.19563614919652073, "grad_norm": 14.6875, "learning_rate": 1.944808364794594e-06, "loss": 2.2846, "step": 6635 }, { "epoch": 0.19578357658853016, "grad_norm": 17.75, "learning_rate": 1.944639635758247e-06, "loss": 2.2495, "step": 6640 }, { "epoch": 0.1959310039805396, "grad_norm": 15.1875, "learning_rate": 1.944470656542541e-06, "loss": 2.1223, "step": 6645 }, { "epoch": 0.19607843137254902, "grad_norm": 17.5, "learning_rate": 1.944301427192229e-06, "loss": 2.2621, "step": 6650 }, { "epoch": 0.19622585876455845, "grad_norm": 104.0, "learning_rate": 1.94413194775213e-06, "loss": 2.0909, "step": 6655 }, { "epoch": 0.19637328615656788, "grad_norm": 14.5625, "learning_rate": 1.9439622182671282e-06, "loss": 2.2395, "step": 6660 }, { "epoch": 0.19652071354857734, "grad_norm": 14.125, "learning_rate": 1.943792238782176e-06, "loss": 2.2649, "step": 6665 }, { "epoch": 0.19666814094058677, "grad_norm": 14.75, "learning_rate": 1.9436220093422907e-06, "loss": 2.3661, "step": 6670 }, { "epoch": 0.1968155683325962, "grad_norm": 16.375, "learning_rate": 1.9434515299925557e-06, "loss": 2.349, "step": 6675 }, { "epoch": 0.19696299572460563, "grad_norm": 16.625, "learning_rate": 1.943280800778121e-06, "loss": 2.2771, "step": 6680 }, { "epoch": 0.19711042311661506, "grad_norm": 14.9375, "learning_rate": 1.9431098217442027e-06, "loss": 2.2262, "step": 6685 }, { "epoch": 0.19725785050862452, "grad_norm": 14.375, "learning_rate": 1.942938592936083e-06, "loss": 2.2435, "step": 6690 }, { "epoch": 0.19740527790063395, "grad_norm": 13.4375, "learning_rate": 1.9427671143991103e-06, "loss": 2.1591, "step": 6695 }, { "epoch": 0.19755270529264338, "grad_norm": 14.75, "learning_rate": 1.9425953861787e-06, "loss": 2.296, "step": 6700 }, { "epoch": 0.1977001326846528, "grad_norm": 17.875, "learning_rate": 1.942423408320332e-06, "loss": 2.2397, "step": 6705 }, { "epoch": 0.19784756007666224, "grad_norm": 12.25, "learning_rate": 1.9422511808695525e-06, "loss": 2.2015, "step": 6710 }, { "epoch": 0.19799498746867167, "grad_norm": 15.25, "learning_rate": 1.942078703871976e-06, "loss": 2.2699, "step": 6715 }, { "epoch": 0.19814241486068113, "grad_norm": 14.8125, "learning_rate": 1.94190597737328e-06, "loss": 2.0769, "step": 6720 }, { "epoch": 0.19828984225269056, "grad_norm": 14.375, "learning_rate": 1.9417330014192103e-06, "loss": 2.0982, "step": 6725 }, { "epoch": 0.1984372696447, "grad_norm": 15.875, "learning_rate": 1.941559776055578e-06, "loss": 2.2998, "step": 6730 }, { "epoch": 0.19858469703670942, "grad_norm": 12.4375, "learning_rate": 1.94138630132826e-06, "loss": 2.1314, "step": 6735 }, { "epoch": 0.19873212442871885, "grad_norm": 18.5, "learning_rate": 1.9412125772832e-06, "loss": 2.2776, "step": 6740 }, { "epoch": 0.19887955182072828, "grad_norm": 13.1875, "learning_rate": 1.9410386039664067e-06, "loss": 2.1857, "step": 6745 }, { "epoch": 0.19902697921273774, "grad_norm": 14.4375, "learning_rate": 1.940864381423956e-06, "loss": 2.2383, "step": 6750 }, { "epoch": 0.19917440660474717, "grad_norm": 12.9375, "learning_rate": 1.9406899097019883e-06, "loss": 2.2486, "step": 6755 }, { "epoch": 0.1993218339967566, "grad_norm": 16.875, "learning_rate": 1.940515188846712e-06, "loss": 2.3064, "step": 6760 }, { "epoch": 0.19946926138876603, "grad_norm": 14.0, "learning_rate": 1.9403402189043994e-06, "loss": 2.1471, "step": 6765 }, { "epoch": 0.19961668878077546, "grad_norm": 14.0625, "learning_rate": 1.9401649999213904e-06, "loss": 2.1606, "step": 6770 }, { "epoch": 0.19976411617278492, "grad_norm": 19.0, "learning_rate": 1.9399895319440893e-06, "loss": 2.2425, "step": 6775 }, { "epoch": 0.19991154356479435, "grad_norm": 16.375, "learning_rate": 1.9398138150189683e-06, "loss": 2.1473, "step": 6780 }, { "epoch": 0.20005897095680378, "grad_norm": 13.375, "learning_rate": 1.9396378491925636e-06, "loss": 2.1876, "step": 6785 }, { "epoch": 0.2002063983488132, "grad_norm": 16.125, "learning_rate": 1.939461634511479e-06, "loss": 2.2899, "step": 6790 }, { "epoch": 0.20035382574082264, "grad_norm": 12.625, "learning_rate": 1.9392851710223823e-06, "loss": 2.1244, "step": 6795 }, { "epoch": 0.20050125313283207, "grad_norm": 11.9375, "learning_rate": 1.9391084587720093e-06, "loss": 2.257, "step": 6800 }, { "epoch": 0.20064868052484153, "grad_norm": 28.875, "learning_rate": 1.93893149780716e-06, "loss": 2.2221, "step": 6805 }, { "epoch": 0.20079610791685096, "grad_norm": 12.0625, "learning_rate": 1.9387542881747016e-06, "loss": 2.1064, "step": 6810 }, { "epoch": 0.2009435353088604, "grad_norm": 13.125, "learning_rate": 1.9385768299215656e-06, "loss": 2.1955, "step": 6815 }, { "epoch": 0.20109096270086982, "grad_norm": 15.8125, "learning_rate": 1.938399123094751e-06, "loss": 2.2424, "step": 6820 }, { "epoch": 0.20123839009287925, "grad_norm": 14.25, "learning_rate": 1.9382211677413213e-06, "loss": 2.1846, "step": 6825 }, { "epoch": 0.20138581748488868, "grad_norm": 14.375, "learning_rate": 1.938042963908407e-06, "loss": 2.2189, "step": 6830 }, { "epoch": 0.20153324487689814, "grad_norm": 13.1875, "learning_rate": 1.937864511643203e-06, "loss": 2.3278, "step": 6835 }, { "epoch": 0.20168067226890757, "grad_norm": 15.25, "learning_rate": 1.9376858109929713e-06, "loss": 2.2339, "step": 6840 }, { "epoch": 0.201828099660917, "grad_norm": 16.375, "learning_rate": 1.937506862005039e-06, "loss": 2.2715, "step": 6845 }, { "epoch": 0.20197552705292643, "grad_norm": 14.0625, "learning_rate": 1.9373276647267996e-06, "loss": 2.0827, "step": 6850 }, { "epoch": 0.20212295444493586, "grad_norm": 14.125, "learning_rate": 1.9371482192057114e-06, "loss": 2.2222, "step": 6855 }, { "epoch": 0.20227038183694532, "grad_norm": 14.5, "learning_rate": 1.936968525489299e-06, "loss": 2.2901, "step": 6860 }, { "epoch": 0.20241780922895475, "grad_norm": 12.0625, "learning_rate": 1.936788583625153e-06, "loss": 2.0957, "step": 6865 }, { "epoch": 0.20256523662096418, "grad_norm": 12.6875, "learning_rate": 1.936608393660929e-06, "loss": 2.1918, "step": 6870 }, { "epoch": 0.2027126640129736, "grad_norm": 19.625, "learning_rate": 1.9364279556443486e-06, "loss": 2.2613, "step": 6875 }, { "epoch": 0.20286009140498304, "grad_norm": 14.75, "learning_rate": 1.9362472696231994e-06, "loss": 2.2336, "step": 6880 }, { "epoch": 0.20300751879699247, "grad_norm": 12.75, "learning_rate": 1.9360663356453344e-06, "loss": 2.2267, "step": 6885 }, { "epoch": 0.20315494618900193, "grad_norm": 16.375, "learning_rate": 1.935885153758673e-06, "loss": 2.311, "step": 6890 }, { "epoch": 0.20330237358101136, "grad_norm": 14.625, "learning_rate": 1.9357037240111985e-06, "loss": 2.2432, "step": 6895 }, { "epoch": 0.2034498009730208, "grad_norm": 11.75, "learning_rate": 1.9355220464509617e-06, "loss": 2.17, "step": 6900 }, { "epoch": 0.20359722836503022, "grad_norm": 14.875, "learning_rate": 1.935340121126078e-06, "loss": 2.1693, "step": 6905 }, { "epoch": 0.20374465575703965, "grad_norm": 14.875, "learning_rate": 1.9351579480847288e-06, "loss": 2.2669, "step": 6910 }, { "epoch": 0.20389208314904908, "grad_norm": 17.25, "learning_rate": 1.934975527375161e-06, "loss": 2.2202, "step": 6915 }, { "epoch": 0.20403951054105854, "grad_norm": 13.4375, "learning_rate": 1.9347928590456874e-06, "loss": 2.2297, "step": 6920 }, { "epoch": 0.20418693793306797, "grad_norm": 15.625, "learning_rate": 1.9346099431446853e-06, "loss": 2.1914, "step": 6925 }, { "epoch": 0.2043343653250774, "grad_norm": 13.625, "learning_rate": 1.9344267797205988e-06, "loss": 2.2208, "step": 6930 }, { "epoch": 0.20448179271708683, "grad_norm": 13.25, "learning_rate": 1.934243368821937e-06, "loss": 2.1804, "step": 6935 }, { "epoch": 0.20462922010909626, "grad_norm": 17.0, "learning_rate": 1.934059710497275e-06, "loss": 2.2179, "step": 6940 }, { "epoch": 0.20477664750110572, "grad_norm": 15.1875, "learning_rate": 1.9338758047952527e-06, "loss": 2.1055, "step": 6945 }, { "epoch": 0.20492407489311515, "grad_norm": 14.625, "learning_rate": 1.9336916517645757e-06, "loss": 2.2436, "step": 6950 }, { "epoch": 0.20507150228512458, "grad_norm": 16.625, "learning_rate": 1.933507251454016e-06, "loss": 2.1477, "step": 6955 }, { "epoch": 0.205218929677134, "grad_norm": 15.5625, "learning_rate": 1.933322603912409e-06, "loss": 2.1234, "step": 6960 }, { "epoch": 0.20536635706914344, "grad_norm": 14.4375, "learning_rate": 1.933137709188659e-06, "loss": 2.257, "step": 6965 }, { "epoch": 0.20551378446115287, "grad_norm": 13.875, "learning_rate": 1.932952567331732e-06, "loss": 2.1056, "step": 6970 }, { "epoch": 0.20566121185316233, "grad_norm": 17.125, "learning_rate": 1.9327671783906614e-06, "loss": 2.161, "step": 6975 }, { "epoch": 0.20580863924517176, "grad_norm": 25.875, "learning_rate": 1.9325815424145465e-06, "loss": 2.1589, "step": 6980 }, { "epoch": 0.2059560666371812, "grad_norm": 16.625, "learning_rate": 1.9323956594525514e-06, "loss": 2.2228, "step": 6985 }, { "epoch": 0.20610349402919062, "grad_norm": 18.0, "learning_rate": 1.9322095295539045e-06, "loss": 2.2412, "step": 6990 }, { "epoch": 0.20625092142120005, "grad_norm": 15.125, "learning_rate": 1.9320231527679014e-06, "loss": 2.2868, "step": 6995 }, { "epoch": 0.20639834881320948, "grad_norm": 14.1875, "learning_rate": 1.931836529143902e-06, "loss": 2.1774, "step": 7000 }, { "epoch": 0.20639834881320948, "eval_loss": 2.193103075027466, "eval_runtime": 4.7184, "eval_samples_per_second": 83.927, "eval_steps_per_second": 2.755, "step": 7000 }, { "epoch": 0.20654577620521894, "grad_norm": 15.0625, "learning_rate": 1.9316496587313323e-06, "loss": 2.3368, "step": 7005 }, { "epoch": 0.20669320359722837, "grad_norm": 13.625, "learning_rate": 1.931462541579683e-06, "loss": 2.3225, "step": 7010 }, { "epoch": 0.2068406309892378, "grad_norm": 14.0625, "learning_rate": 1.9312751777385103e-06, "loss": 2.2313, "step": 7015 }, { "epoch": 0.20698805838124723, "grad_norm": 15.25, "learning_rate": 1.931087567257436e-06, "loss": 2.2718, "step": 7020 }, { "epoch": 0.20713548577325666, "grad_norm": 13.375, "learning_rate": 1.9308997101861474e-06, "loss": 2.1488, "step": 7025 }, { "epoch": 0.20728291316526612, "grad_norm": 14.25, "learning_rate": 1.930711606574396e-06, "loss": 2.2197, "step": 7030 }, { "epoch": 0.20743034055727555, "grad_norm": 16.125, "learning_rate": 1.930523256472e-06, "loss": 2.1245, "step": 7035 }, { "epoch": 0.20757776794928498, "grad_norm": 21.625, "learning_rate": 1.9303346599288415e-06, "loss": 2.2834, "step": 7040 }, { "epoch": 0.2077251953412944, "grad_norm": 13.1875, "learning_rate": 1.9301458169948695e-06, "loss": 2.145, "step": 7045 }, { "epoch": 0.20787262273330384, "grad_norm": 13.875, "learning_rate": 1.929956727720097e-06, "loss": 2.206, "step": 7050 }, { "epoch": 0.20802005012531327, "grad_norm": 13.4375, "learning_rate": 1.9297673921546026e-06, "loss": 2.1265, "step": 7055 }, { "epoch": 0.20816747751732273, "grad_norm": 15.3125, "learning_rate": 1.92957781034853e-06, "loss": 2.283, "step": 7060 }, { "epoch": 0.20831490490933216, "grad_norm": 15.125, "learning_rate": 1.929387982352088e-06, "loss": 2.2967, "step": 7065 }, { "epoch": 0.2084623323013416, "grad_norm": 17.0, "learning_rate": 1.9291979082155514e-06, "loss": 2.2077, "step": 7070 }, { "epoch": 0.20860975969335102, "grad_norm": 11.75, "learning_rate": 1.9290075879892593e-06, "loss": 2.1595, "step": 7075 }, { "epoch": 0.20875718708536045, "grad_norm": 15.0, "learning_rate": 1.9288170217236167e-06, "loss": 2.272, "step": 7080 }, { "epoch": 0.20890461447736988, "grad_norm": 13.9375, "learning_rate": 1.928626209469093e-06, "loss": 2.2495, "step": 7085 }, { "epoch": 0.20905204186937934, "grad_norm": 15.8125, "learning_rate": 1.9284351512762235e-06, "loss": 2.202, "step": 7090 }, { "epoch": 0.20919946926138877, "grad_norm": 17.5, "learning_rate": 1.9282438471956074e-06, "loss": 2.238, "step": 7095 }, { "epoch": 0.2093468966533982, "grad_norm": 19.5, "learning_rate": 1.9280522972779105e-06, "loss": 2.2096, "step": 7100 }, { "epoch": 0.20949432404540763, "grad_norm": 15.5, "learning_rate": 1.9278605015738635e-06, "loss": 2.3443, "step": 7105 }, { "epoch": 0.20964175143741706, "grad_norm": 15.3125, "learning_rate": 1.927668460134261e-06, "loss": 2.0769, "step": 7110 }, { "epoch": 0.20978917882942652, "grad_norm": 17.125, "learning_rate": 1.927476173009964e-06, "loss": 2.1903, "step": 7115 }, { "epoch": 0.20993660622143595, "grad_norm": 18.75, "learning_rate": 1.9272836402518975e-06, "loss": 2.3107, "step": 7120 }, { "epoch": 0.21008403361344538, "grad_norm": 15.6875, "learning_rate": 1.927090861911053e-06, "loss": 2.3182, "step": 7125 }, { "epoch": 0.2102314610054548, "grad_norm": 16.75, "learning_rate": 1.9268978380384846e-06, "loss": 2.2175, "step": 7130 }, { "epoch": 0.21037888839746424, "grad_norm": 15.5625, "learning_rate": 1.926704568685314e-06, "loss": 2.219, "step": 7135 }, { "epoch": 0.21052631578947367, "grad_norm": 15.1875, "learning_rate": 1.9265110539027273e-06, "loss": 2.1772, "step": 7140 }, { "epoch": 0.21067374318148313, "grad_norm": 16.5, "learning_rate": 1.9263172937419742e-06, "loss": 2.2305, "step": 7145 }, { "epoch": 0.21082117057349256, "grad_norm": 25.5, "learning_rate": 1.9261232882543706e-06, "loss": 2.3339, "step": 7150 }, { "epoch": 0.210968597965502, "grad_norm": 13.375, "learning_rate": 1.9259290374912976e-06, "loss": 2.0968, "step": 7155 }, { "epoch": 0.21111602535751142, "grad_norm": 12.8125, "learning_rate": 1.9257345415042e-06, "loss": 2.1461, "step": 7160 }, { "epoch": 0.21126345274952085, "grad_norm": 13.75, "learning_rate": 1.9255398003445887e-06, "loss": 2.2249, "step": 7165 }, { "epoch": 0.21141088014153028, "grad_norm": 16.75, "learning_rate": 1.9253448140640392e-06, "loss": 2.3032, "step": 7170 }, { "epoch": 0.21155830753353974, "grad_norm": 14.625, "learning_rate": 1.925149582714192e-06, "loss": 2.1457, "step": 7175 }, { "epoch": 0.21170573492554917, "grad_norm": 14.75, "learning_rate": 1.9249541063467524e-06, "loss": 2.1607, "step": 7180 }, { "epoch": 0.2118531623175586, "grad_norm": 16.375, "learning_rate": 1.92475838501349e-06, "loss": 2.1474, "step": 7185 }, { "epoch": 0.21200058970956803, "grad_norm": 14.875, "learning_rate": 1.92456241876624e-06, "loss": 2.2473, "step": 7190 }, { "epoch": 0.21214801710157746, "grad_norm": 13.75, "learning_rate": 1.9243662076569034e-06, "loss": 2.3273, "step": 7195 }, { "epoch": 0.21229544449358692, "grad_norm": 26.5, "learning_rate": 1.924169751737443e-06, "loss": 2.253, "step": 7200 }, { "epoch": 0.21244287188559635, "grad_norm": 14.6875, "learning_rate": 1.9239730510598906e-06, "loss": 2.1251, "step": 7205 }, { "epoch": 0.21259029927760578, "grad_norm": 14.25, "learning_rate": 1.923776105676339e-06, "loss": 2.1633, "step": 7210 }, { "epoch": 0.2127377266696152, "grad_norm": 14.875, "learning_rate": 1.923578915638948e-06, "loss": 2.2504, "step": 7215 }, { "epoch": 0.21288515406162464, "grad_norm": 14.375, "learning_rate": 1.9233814809999417e-06, "loss": 2.0792, "step": 7220 }, { "epoch": 0.21303258145363407, "grad_norm": 13.25, "learning_rate": 1.9231838018116084e-06, "loss": 2.0842, "step": 7225 }, { "epoch": 0.21318000884564353, "grad_norm": 15.875, "learning_rate": 1.922985878126302e-06, "loss": 2.373, "step": 7230 }, { "epoch": 0.21332743623765296, "grad_norm": 14.0, "learning_rate": 1.9227877099964413e-06, "loss": 2.2181, "step": 7235 }, { "epoch": 0.2134748636296624, "grad_norm": 14.5625, "learning_rate": 1.9225892974745083e-06, "loss": 2.2031, "step": 7240 }, { "epoch": 0.21362229102167182, "grad_norm": 13.6875, "learning_rate": 1.9223906406130515e-06, "loss": 2.0738, "step": 7245 }, { "epoch": 0.21376971841368125, "grad_norm": 13.375, "learning_rate": 1.9221917394646833e-06, "loss": 2.1385, "step": 7250 }, { "epoch": 0.21391714580569068, "grad_norm": 13.875, "learning_rate": 1.9219925940820813e-06, "loss": 2.2051, "step": 7255 }, { "epoch": 0.21406457319770014, "grad_norm": 13.875, "learning_rate": 1.9217932045179864e-06, "loss": 2.206, "step": 7260 }, { "epoch": 0.21421200058970957, "grad_norm": 13.3125, "learning_rate": 1.921593570825206e-06, "loss": 2.2147, "step": 7265 }, { "epoch": 0.214359427981719, "grad_norm": 16.125, "learning_rate": 1.921393693056611e-06, "loss": 2.1987, "step": 7270 }, { "epoch": 0.21450685537372843, "grad_norm": 19.375, "learning_rate": 1.921193571265137e-06, "loss": 2.268, "step": 7275 }, { "epoch": 0.21465428276573786, "grad_norm": 13.875, "learning_rate": 1.9209932055037844e-06, "loss": 2.2881, "step": 7280 }, { "epoch": 0.21480171015774732, "grad_norm": 16.125, "learning_rate": 1.920792595825619e-06, "loss": 2.1479, "step": 7285 }, { "epoch": 0.21494913754975675, "grad_norm": 16.0, "learning_rate": 1.92059174228377e-06, "loss": 2.2282, "step": 7290 }, { "epoch": 0.21509656494176618, "grad_norm": 14.5, "learning_rate": 1.9203906449314315e-06, "loss": 2.1761, "step": 7295 }, { "epoch": 0.2152439923337756, "grad_norm": 15.9375, "learning_rate": 1.920189303821862e-06, "loss": 2.2016, "step": 7300 }, { "epoch": 0.21539141972578504, "grad_norm": 17.25, "learning_rate": 1.9199877190083863e-06, "loss": 2.0975, "step": 7305 }, { "epoch": 0.21553884711779447, "grad_norm": 16.375, "learning_rate": 1.9197858905443916e-06, "loss": 2.2305, "step": 7310 }, { "epoch": 0.21568627450980393, "grad_norm": 13.1875, "learning_rate": 1.91958381848333e-06, "loss": 2.1979, "step": 7315 }, { "epoch": 0.21583370190181336, "grad_norm": 15.6875, "learning_rate": 1.919381502878718e-06, "loss": 2.2607, "step": 7320 }, { "epoch": 0.2159811292938228, "grad_norm": 11.9375, "learning_rate": 1.9191789437841384e-06, "loss": 2.1914, "step": 7325 }, { "epoch": 0.21612855668583222, "grad_norm": 15.1875, "learning_rate": 1.9189761412532365e-06, "loss": 2.2348, "step": 7330 }, { "epoch": 0.21627598407784165, "grad_norm": 14.0, "learning_rate": 1.9187730953397225e-06, "loss": 2.2635, "step": 7335 }, { "epoch": 0.21642341146985108, "grad_norm": 14.1875, "learning_rate": 1.918569806097372e-06, "loss": 2.2494, "step": 7340 }, { "epoch": 0.21657083886186054, "grad_norm": 16.375, "learning_rate": 1.9183662735800237e-06, "loss": 2.2639, "step": 7345 }, { "epoch": 0.21671826625386997, "grad_norm": 16.125, "learning_rate": 1.9181624978415814e-06, "loss": 2.2761, "step": 7350 }, { "epoch": 0.2168656936458794, "grad_norm": 15.5625, "learning_rate": 1.917958478936014e-06, "loss": 2.3698, "step": 7355 }, { "epoch": 0.21701312103788883, "grad_norm": 15.5, "learning_rate": 1.917754216917353e-06, "loss": 2.2325, "step": 7360 }, { "epoch": 0.21716054842989826, "grad_norm": 19.375, "learning_rate": 1.917549711839696e-06, "loss": 2.2518, "step": 7365 }, { "epoch": 0.21730797582190772, "grad_norm": 14.75, "learning_rate": 1.9173449637572042e-06, "loss": 2.2386, "step": 7370 }, { "epoch": 0.21745540321391715, "grad_norm": 13.25, "learning_rate": 1.9171399727241035e-06, "loss": 2.2323, "step": 7375 }, { "epoch": 0.21760283060592658, "grad_norm": 15.6875, "learning_rate": 1.9169347387946836e-06, "loss": 2.207, "step": 7380 }, { "epoch": 0.21775025799793601, "grad_norm": 19.0, "learning_rate": 1.916729262023299e-06, "loss": 2.2985, "step": 7385 }, { "epoch": 0.21789768538994544, "grad_norm": 15.875, "learning_rate": 1.916523542464369e-06, "loss": 2.156, "step": 7390 }, { "epoch": 0.21804511278195488, "grad_norm": 13.3125, "learning_rate": 1.916317580172376e-06, "loss": 2.1725, "step": 7395 }, { "epoch": 0.21819254017396433, "grad_norm": 14.0625, "learning_rate": 1.9161113752018666e-06, "loss": 2.2422, "step": 7400 }, { "epoch": 0.21833996756597376, "grad_norm": 15.4375, "learning_rate": 1.915904927607454e-06, "loss": 2.2404, "step": 7405 }, { "epoch": 0.2184873949579832, "grad_norm": 14.5625, "learning_rate": 1.915698237443812e-06, "loss": 2.3074, "step": 7410 }, { "epoch": 0.21863482234999262, "grad_norm": 15.25, "learning_rate": 1.915491304765682e-06, "loss": 2.1644, "step": 7415 }, { "epoch": 0.21878224974200206, "grad_norm": 20.625, "learning_rate": 1.9152841296278687e-06, "loss": 2.2966, "step": 7420 }, { "epoch": 0.21892967713401149, "grad_norm": 13.4375, "learning_rate": 1.915076712085239e-06, "loss": 2.223, "step": 7425 }, { "epoch": 0.21907710452602094, "grad_norm": 17.625, "learning_rate": 1.9148690521927267e-06, "loss": 2.2195, "step": 7430 }, { "epoch": 0.21922453191803037, "grad_norm": 15.0, "learning_rate": 1.9146611500053282e-06, "loss": 2.2043, "step": 7435 }, { "epoch": 0.2193719593100398, "grad_norm": 15.0, "learning_rate": 1.914453005578105e-06, "loss": 2.1418, "step": 7440 }, { "epoch": 0.21951938670204924, "grad_norm": 18.0, "learning_rate": 1.9142446189661818e-06, "loss": 2.1438, "step": 7445 }, { "epoch": 0.21966681409405867, "grad_norm": 15.25, "learning_rate": 1.9140359902247485e-06, "loss": 2.3102, "step": 7450 }, { "epoch": 0.21981424148606812, "grad_norm": 13.75, "learning_rate": 1.9138271194090576e-06, "loss": 2.1613, "step": 7455 }, { "epoch": 0.21996166887807755, "grad_norm": 16.5, "learning_rate": 1.9136180065744278e-06, "loss": 2.2573, "step": 7460 }, { "epoch": 0.22010909627008698, "grad_norm": 14.1875, "learning_rate": 1.91340865177624e-06, "loss": 2.1977, "step": 7465 }, { "epoch": 0.22025652366209642, "grad_norm": 13.75, "learning_rate": 1.91319905506994e-06, "loss": 2.2246, "step": 7470 }, { "epoch": 0.22040395105410585, "grad_norm": 14.125, "learning_rate": 1.9129892165110383e-06, "loss": 2.2511, "step": 7475 }, { "epoch": 0.22055137844611528, "grad_norm": 13.375, "learning_rate": 1.9127791361551077e-06, "loss": 2.1218, "step": 7480 }, { "epoch": 0.22069880583812473, "grad_norm": 14.0, "learning_rate": 1.912568814057787e-06, "loss": 2.1278, "step": 7485 }, { "epoch": 0.22084623323013416, "grad_norm": 14.875, "learning_rate": 1.9123582502747776e-06, "loss": 2.177, "step": 7490 }, { "epoch": 0.2209936606221436, "grad_norm": 18.0, "learning_rate": 1.9121474448618455e-06, "loss": 2.1431, "step": 7495 }, { "epoch": 0.22114108801415303, "grad_norm": 16.0, "learning_rate": 1.911936397874821e-06, "loss": 2.2285, "step": 7500 }, { "epoch": 0.22114108801415303, "eval_loss": 2.1763086318969727, "eval_runtime": 4.7087, "eval_samples_per_second": 84.099, "eval_steps_per_second": 2.761, "step": 7500 }, { "epoch": 0.22128851540616246, "grad_norm": 15.6875, "learning_rate": 1.911725109369598e-06, "loss": 2.2166, "step": 7505 }, { "epoch": 0.22143594279817191, "grad_norm": 27.875, "learning_rate": 1.9115135794021336e-06, "loss": 2.2402, "step": 7510 }, { "epoch": 0.22158337019018134, "grad_norm": 14.4375, "learning_rate": 1.91130180802845e-06, "loss": 2.204, "step": 7515 }, { "epoch": 0.22173079758219078, "grad_norm": 14.1875, "learning_rate": 1.911089795304634e-06, "loss": 2.1249, "step": 7520 }, { "epoch": 0.2218782249742002, "grad_norm": 13.8125, "learning_rate": 1.9108775412868333e-06, "loss": 2.1547, "step": 7525 }, { "epoch": 0.22202565236620964, "grad_norm": 15.5625, "learning_rate": 1.9106650460312634e-06, "loss": 2.2753, "step": 7530 }, { "epoch": 0.22217307975821907, "grad_norm": 14.75, "learning_rate": 1.9104523095942003e-06, "loss": 2.1348, "step": 7535 }, { "epoch": 0.22232050715022852, "grad_norm": 15.25, "learning_rate": 1.910239332031986e-06, "loss": 2.2014, "step": 7540 }, { "epoch": 0.22246793454223796, "grad_norm": 15.4375, "learning_rate": 1.910026113401026e-06, "loss": 2.2529, "step": 7545 }, { "epoch": 0.22261536193424739, "grad_norm": 25.5, "learning_rate": 1.909812653757789e-06, "loss": 2.2115, "step": 7550 }, { "epoch": 0.22276278932625682, "grad_norm": 14.9375, "learning_rate": 1.909598953158808e-06, "loss": 2.228, "step": 7555 }, { "epoch": 0.22291021671826625, "grad_norm": 14.3125, "learning_rate": 1.909385011660679e-06, "loss": 2.2131, "step": 7560 }, { "epoch": 0.22305764411027568, "grad_norm": 14.25, "learning_rate": 1.9091708293200635e-06, "loss": 2.1549, "step": 7565 }, { "epoch": 0.22320507150228514, "grad_norm": 13.875, "learning_rate": 1.908956406193685e-06, "loss": 2.1435, "step": 7570 }, { "epoch": 0.22335249889429457, "grad_norm": 15.0625, "learning_rate": 1.908741742338332e-06, "loss": 2.2743, "step": 7575 }, { "epoch": 0.223499926286304, "grad_norm": 23.625, "learning_rate": 1.908526837810857e-06, "loss": 2.2482, "step": 7580 }, { "epoch": 0.22364735367831343, "grad_norm": 15.375, "learning_rate": 1.9083116926681735e-06, "loss": 2.348, "step": 7585 }, { "epoch": 0.22379478107032286, "grad_norm": 15.625, "learning_rate": 1.908096306967263e-06, "loss": 2.1831, "step": 7590 }, { "epoch": 0.22394220846233232, "grad_norm": 14.0, "learning_rate": 1.907880680765167e-06, "loss": 2.2471, "step": 7595 }, { "epoch": 0.22408963585434175, "grad_norm": 15.5, "learning_rate": 1.9076648141189925e-06, "loss": 2.1452, "step": 7600 }, { "epoch": 0.22423706324635118, "grad_norm": 14.375, "learning_rate": 1.9074487070859102e-06, "loss": 2.2139, "step": 7605 }, { "epoch": 0.2243844906383606, "grad_norm": 13.375, "learning_rate": 1.907232359723154e-06, "loss": 2.0523, "step": 7610 }, { "epoch": 0.22453191803037004, "grad_norm": 14.1875, "learning_rate": 1.9070157720880213e-06, "loss": 2.2727, "step": 7615 }, { "epoch": 0.22467934542237947, "grad_norm": 15.625, "learning_rate": 1.9067989442378738e-06, "loss": 2.2582, "step": 7620 }, { "epoch": 0.22482677281438893, "grad_norm": 14.8125, "learning_rate": 1.9065818762301363e-06, "loss": 2.1734, "step": 7625 }, { "epoch": 0.22497420020639836, "grad_norm": 14.4375, "learning_rate": 1.906364568122297e-06, "loss": 2.1759, "step": 7630 }, { "epoch": 0.2251216275984078, "grad_norm": 14.625, "learning_rate": 1.9061470199719083e-06, "loss": 2.1728, "step": 7635 }, { "epoch": 0.22526905499041722, "grad_norm": 16.625, "learning_rate": 1.9059292318365855e-06, "loss": 2.3187, "step": 7640 }, { "epoch": 0.22541648238242665, "grad_norm": 15.0, "learning_rate": 1.9057112037740084e-06, "loss": 2.1401, "step": 7645 }, { "epoch": 0.22556390977443608, "grad_norm": 15.0, "learning_rate": 1.9054929358419195e-06, "loss": 2.1785, "step": 7650 }, { "epoch": 0.22571133716644554, "grad_norm": 15.875, "learning_rate": 1.9052744280981251e-06, "loss": 2.3024, "step": 7655 }, { "epoch": 0.22585876455845497, "grad_norm": 15.3125, "learning_rate": 1.9050556806004955e-06, "loss": 2.2835, "step": 7660 }, { "epoch": 0.2260061919504644, "grad_norm": 14.4375, "learning_rate": 1.904836693406963e-06, "loss": 2.2379, "step": 7665 }, { "epoch": 0.22615361934247383, "grad_norm": 13.5, "learning_rate": 1.9046174665755252e-06, "loss": 2.1806, "step": 7670 }, { "epoch": 0.22630104673448326, "grad_norm": 15.0, "learning_rate": 1.904398000164242e-06, "loss": 2.2357, "step": 7675 }, { "epoch": 0.22644847412649272, "grad_norm": 19.125, "learning_rate": 1.9041782942312374e-06, "loss": 2.1981, "step": 7680 }, { "epoch": 0.22659590151850215, "grad_norm": 15.0, "learning_rate": 1.9039583488346987e-06, "loss": 2.2131, "step": 7685 }, { "epoch": 0.22674332891051158, "grad_norm": 15.375, "learning_rate": 1.9037381640328757e-06, "loss": 2.1719, "step": 7690 }, { "epoch": 0.226890756302521, "grad_norm": 15.0625, "learning_rate": 1.9035177398840832e-06, "loss": 2.308, "step": 7695 }, { "epoch": 0.22703818369453044, "grad_norm": 10.875, "learning_rate": 1.9032970764466981e-06, "loss": 2.139, "step": 7700 }, { "epoch": 0.22718561108653987, "grad_norm": 16.625, "learning_rate": 1.9030761737791612e-06, "loss": 2.1771, "step": 7705 }, { "epoch": 0.22733303847854933, "grad_norm": 16.0, "learning_rate": 1.9028550319399765e-06, "loss": 2.1578, "step": 7710 }, { "epoch": 0.22748046587055876, "grad_norm": 15.9375, "learning_rate": 1.9026336509877119e-06, "loss": 2.174, "step": 7715 }, { "epoch": 0.2276278932625682, "grad_norm": 13.625, "learning_rate": 1.9024120309809978e-06, "loss": 2.1622, "step": 7720 }, { "epoch": 0.22777532065457762, "grad_norm": 19.375, "learning_rate": 1.9021901719785282e-06, "loss": 2.2129, "step": 7725 }, { "epoch": 0.22792274804658705, "grad_norm": 17.0, "learning_rate": 1.9019680740390607e-06, "loss": 2.2368, "step": 7730 }, { "epoch": 0.22807017543859648, "grad_norm": 13.25, "learning_rate": 1.9017457372214158e-06, "loss": 2.1988, "step": 7735 }, { "epoch": 0.22821760283060594, "grad_norm": 12.625, "learning_rate": 1.9015231615844773e-06, "loss": 2.1546, "step": 7740 }, { "epoch": 0.22836503022261537, "grad_norm": 16.5, "learning_rate": 1.9013003471871932e-06, "loss": 2.2133, "step": 7745 }, { "epoch": 0.2285124576146248, "grad_norm": 13.875, "learning_rate": 1.9010772940885727e-06, "loss": 2.1367, "step": 7750 }, { "epoch": 0.22865988500663423, "grad_norm": 14.3125, "learning_rate": 1.90085400234769e-06, "loss": 2.2353, "step": 7755 }, { "epoch": 0.22880731239864366, "grad_norm": 14.1875, "learning_rate": 1.9006304720236818e-06, "loss": 2.1319, "step": 7760 }, { "epoch": 0.22895473979065312, "grad_norm": 15.625, "learning_rate": 1.9004067031757484e-06, "loss": 2.1314, "step": 7765 }, { "epoch": 0.22910216718266255, "grad_norm": 18.5, "learning_rate": 1.9001826958631529e-06, "loss": 2.1489, "step": 7770 }, { "epoch": 0.22924959457467198, "grad_norm": 15.5, "learning_rate": 1.8999584501452213e-06, "loss": 2.2059, "step": 7775 }, { "epoch": 0.2293970219666814, "grad_norm": 13.8125, "learning_rate": 1.8997339660813433e-06, "loss": 2.0898, "step": 7780 }, { "epoch": 0.22954444935869084, "grad_norm": 13.375, "learning_rate": 1.8995092437309714e-06, "loss": 2.1954, "step": 7785 }, { "epoch": 0.22969187675070027, "grad_norm": 15.1875, "learning_rate": 1.8992842831536215e-06, "loss": 2.2219, "step": 7790 }, { "epoch": 0.22983930414270973, "grad_norm": 20.5, "learning_rate": 1.8990590844088723e-06, "loss": 2.333, "step": 7795 }, { "epoch": 0.22998673153471916, "grad_norm": 31.5, "learning_rate": 1.8988336475563654e-06, "loss": 2.0667, "step": 7800 }, { "epoch": 0.2301341589267286, "grad_norm": 14.8125, "learning_rate": 1.8986079726558064e-06, "loss": 2.2898, "step": 7805 }, { "epoch": 0.23028158631873802, "grad_norm": 15.6875, "learning_rate": 1.8983820597669626e-06, "loss": 2.2644, "step": 7810 }, { "epoch": 0.23042901371074745, "grad_norm": 13.625, "learning_rate": 1.8981559089496652e-06, "loss": 2.2789, "step": 7815 }, { "epoch": 0.23057644110275688, "grad_norm": 14.0625, "learning_rate": 1.8979295202638086e-06, "loss": 2.1742, "step": 7820 }, { "epoch": 0.23072386849476634, "grad_norm": 13.9375, "learning_rate": 1.8977028937693493e-06, "loss": 2.2069, "step": 7825 }, { "epoch": 0.23087129588677577, "grad_norm": 15.625, "learning_rate": 1.8974760295263075e-06, "loss": 2.1841, "step": 7830 }, { "epoch": 0.2310187232787852, "grad_norm": 17.875, "learning_rate": 1.8972489275947662e-06, "loss": 2.2387, "step": 7835 }, { "epoch": 0.23116615067079463, "grad_norm": 13.8125, "learning_rate": 1.8970215880348713e-06, "loss": 2.2129, "step": 7840 }, { "epoch": 0.23131357806280406, "grad_norm": 17.75, "learning_rate": 1.8967940109068316e-06, "loss": 2.1558, "step": 7845 }, { "epoch": 0.23146100545481352, "grad_norm": 14.625, "learning_rate": 1.896566196270919e-06, "loss": 2.2171, "step": 7850 }, { "epoch": 0.23160843284682295, "grad_norm": 15.875, "learning_rate": 1.896338144187468e-06, "loss": 2.2006, "step": 7855 }, { "epoch": 0.23175586023883238, "grad_norm": 16.125, "learning_rate": 1.896109854716876e-06, "loss": 2.1893, "step": 7860 }, { "epoch": 0.2319032876308418, "grad_norm": 16.125, "learning_rate": 1.895881327919604e-06, "loss": 2.1611, "step": 7865 }, { "epoch": 0.23205071502285124, "grad_norm": 15.3125, "learning_rate": 1.8956525638561749e-06, "loss": 2.1714, "step": 7870 }, { "epoch": 0.23219814241486067, "grad_norm": 16.25, "learning_rate": 1.8954235625871747e-06, "loss": 2.0627, "step": 7875 }, { "epoch": 0.23234556980687013, "grad_norm": 16.625, "learning_rate": 1.8951943241732526e-06, "loss": 2.1204, "step": 7880 }, { "epoch": 0.23249299719887956, "grad_norm": 13.5, "learning_rate": 1.8949648486751202e-06, "loss": 2.2194, "step": 7885 }, { "epoch": 0.232640424590889, "grad_norm": 13.5, "learning_rate": 1.8947351361535523e-06, "loss": 2.1443, "step": 7890 }, { "epoch": 0.23278785198289842, "grad_norm": 13.875, "learning_rate": 1.8945051866693856e-06, "loss": 2.244, "step": 7895 }, { "epoch": 0.23293527937490785, "grad_norm": 15.9375, "learning_rate": 1.8942750002835205e-06, "loss": 2.1937, "step": 7900 }, { "epoch": 0.23308270676691728, "grad_norm": 17.375, "learning_rate": 1.89404457705692e-06, "loss": 2.3146, "step": 7905 }, { "epoch": 0.23323013415892674, "grad_norm": 15.0625, "learning_rate": 1.8938139170506095e-06, "loss": 2.1182, "step": 7910 }, { "epoch": 0.23337756155093617, "grad_norm": 16.625, "learning_rate": 1.8935830203256772e-06, "loss": 2.272, "step": 7915 }, { "epoch": 0.2335249889429456, "grad_norm": 16.25, "learning_rate": 1.893351886943274e-06, "loss": 2.1908, "step": 7920 }, { "epoch": 0.23367241633495503, "grad_norm": 15.4375, "learning_rate": 1.8931205169646136e-06, "loss": 2.2338, "step": 7925 }, { "epoch": 0.23381984372696446, "grad_norm": 12.0625, "learning_rate": 1.8928889104509721e-06, "loss": 2.1454, "step": 7930 }, { "epoch": 0.23396727111897392, "grad_norm": 14.5625, "learning_rate": 1.8926570674636888e-06, "loss": 2.2695, "step": 7935 }, { "epoch": 0.23411469851098335, "grad_norm": 22.25, "learning_rate": 1.8924249880641647e-06, "loss": 2.1593, "step": 7940 }, { "epoch": 0.23426212590299278, "grad_norm": 14.125, "learning_rate": 1.8921926723138644e-06, "loss": 2.1392, "step": 7945 }, { "epoch": 0.2344095532950022, "grad_norm": 22.875, "learning_rate": 1.8919601202743146e-06, "loss": 2.1933, "step": 7950 }, { "epoch": 0.23455698068701164, "grad_norm": 14.8125, "learning_rate": 1.8917273320071044e-06, "loss": 2.1975, "step": 7955 }, { "epoch": 0.23470440807902107, "grad_norm": 16.0, "learning_rate": 1.8914943075738856e-06, "loss": 2.1972, "step": 7960 }, { "epoch": 0.23485183547103053, "grad_norm": 15.75, "learning_rate": 1.891261047036373e-06, "loss": 2.2886, "step": 7965 }, { "epoch": 0.23499926286303996, "grad_norm": 14.5625, "learning_rate": 1.8910275504563434e-06, "loss": 2.2055, "step": 7970 }, { "epoch": 0.2351466902550494, "grad_norm": 16.625, "learning_rate": 1.890793817895636e-06, "loss": 2.1969, "step": 7975 }, { "epoch": 0.23529411764705882, "grad_norm": 16.0, "learning_rate": 1.8905598494161535e-06, "loss": 2.1731, "step": 7980 }, { "epoch": 0.23544154503906825, "grad_norm": 14.625, "learning_rate": 1.8903256450798594e-06, "loss": 2.2552, "step": 7985 }, { "epoch": 0.23558897243107768, "grad_norm": 15.75, "learning_rate": 1.890091204948782e-06, "loss": 2.2014, "step": 7990 }, { "epoch": 0.23573639982308714, "grad_norm": 16.375, "learning_rate": 1.889856529085009e-06, "loss": 2.2227, "step": 7995 }, { "epoch": 0.23588382721509657, "grad_norm": 13.875, "learning_rate": 1.8896216175506932e-06, "loss": 2.1592, "step": 8000 }, { "epoch": 0.23588382721509657, "eval_loss": 2.1633877754211426, "eval_runtime": 4.717, "eval_samples_per_second": 83.952, "eval_steps_per_second": 2.756, "step": 8000 }, { "epoch": 0.236031254607106, "grad_norm": 11.625, "learning_rate": 1.889386470408049e-06, "loss": 2.1509, "step": 8005 }, { "epoch": 0.23617868199911543, "grad_norm": 13.3125, "learning_rate": 1.8891510877193522e-06, "loss": 2.2325, "step": 8010 }, { "epoch": 0.23632610939112486, "grad_norm": 14.25, "learning_rate": 1.8889154695469424e-06, "loss": 2.2458, "step": 8015 }, { "epoch": 0.23647353678313432, "grad_norm": 15.5625, "learning_rate": 1.888679615953221e-06, "loss": 2.2179, "step": 8020 }, { "epoch": 0.23662096417514375, "grad_norm": 13.5, "learning_rate": 1.8884435270006516e-06, "loss": 2.1758, "step": 8025 }, { "epoch": 0.23676839156715318, "grad_norm": 15.875, "learning_rate": 1.8882072027517598e-06, "loss": 2.1682, "step": 8030 }, { "epoch": 0.2369158189591626, "grad_norm": 14.5625, "learning_rate": 1.8879706432691344e-06, "loss": 2.1355, "step": 8035 }, { "epoch": 0.23706324635117204, "grad_norm": 18.875, "learning_rate": 1.8877338486154263e-06, "loss": 2.1882, "step": 8040 }, { "epoch": 0.23721067374318147, "grad_norm": 17.125, "learning_rate": 1.8874968188533482e-06, "loss": 2.2153, "step": 8045 }, { "epoch": 0.23735810113519093, "grad_norm": 14.625, "learning_rate": 1.887259554045675e-06, "loss": 2.1607, "step": 8050 }, { "epoch": 0.23750552852720036, "grad_norm": 15.5625, "learning_rate": 1.8870220542552445e-06, "loss": 2.1877, "step": 8055 }, { "epoch": 0.2376529559192098, "grad_norm": 53.5, "learning_rate": 1.8867843195449563e-06, "loss": 2.1403, "step": 8060 }, { "epoch": 0.23780038331121922, "grad_norm": 12.125, "learning_rate": 1.8865463499777724e-06, "loss": 2.1845, "step": 8065 }, { "epoch": 0.23794781070322865, "grad_norm": 14.75, "learning_rate": 1.886308145616717e-06, "loss": 2.1896, "step": 8070 }, { "epoch": 0.23809523809523808, "grad_norm": 16.25, "learning_rate": 1.8860697065248759e-06, "loss": 2.2754, "step": 8075 }, { "epoch": 0.23824266548724754, "grad_norm": 15.8125, "learning_rate": 1.8858310327653982e-06, "loss": 2.178, "step": 8080 }, { "epoch": 0.23839009287925697, "grad_norm": 14.625, "learning_rate": 1.885592124401494e-06, "loss": 2.2494, "step": 8085 }, { "epoch": 0.2385375202712664, "grad_norm": 16.0, "learning_rate": 1.8853529814964365e-06, "loss": 2.1538, "step": 8090 }, { "epoch": 0.23868494766327583, "grad_norm": 15.9375, "learning_rate": 1.88511360411356e-06, "loss": 2.2637, "step": 8095 }, { "epoch": 0.23883237505528526, "grad_norm": 15.6875, "learning_rate": 1.884873992316262e-06, "loss": 2.2347, "step": 8100 }, { "epoch": 0.23897980244729472, "grad_norm": 15.8125, "learning_rate": 1.8846341461680013e-06, "loss": 2.2894, "step": 8105 }, { "epoch": 0.23912722983930415, "grad_norm": 15.375, "learning_rate": 1.884394065732299e-06, "loss": 2.2071, "step": 8110 }, { "epoch": 0.23927465723131358, "grad_norm": 16.625, "learning_rate": 1.8841537510727383e-06, "loss": 2.2701, "step": 8115 }, { "epoch": 0.239422084623323, "grad_norm": 27.75, "learning_rate": 1.8839132022529642e-06, "loss": 2.1806, "step": 8120 }, { "epoch": 0.23956951201533244, "grad_norm": 15.25, "learning_rate": 1.8836724193366844e-06, "loss": 2.1222, "step": 8125 }, { "epoch": 0.23971693940734187, "grad_norm": 21.0, "learning_rate": 1.8834314023876675e-06, "loss": 2.2144, "step": 8130 }, { "epoch": 0.23986436679935133, "grad_norm": 16.5, "learning_rate": 1.8831901514697452e-06, "loss": 2.1489, "step": 8135 }, { "epoch": 0.24001179419136076, "grad_norm": 13.75, "learning_rate": 1.8829486666468104e-06, "loss": 2.1968, "step": 8140 }, { "epoch": 0.2401592215833702, "grad_norm": 16.0, "learning_rate": 1.882706947982818e-06, "loss": 2.148, "step": 8145 }, { "epoch": 0.24030664897537962, "grad_norm": 15.625, "learning_rate": 1.8824649955417853e-06, "loss": 2.0118, "step": 8150 }, { "epoch": 0.24045407636738905, "grad_norm": 14.5, "learning_rate": 1.8822228093877912e-06, "loss": 2.2747, "step": 8155 }, { "epoch": 0.24060150375939848, "grad_norm": 15.6875, "learning_rate": 1.8819803895849767e-06, "loss": 2.2399, "step": 8160 }, { "epoch": 0.24074893115140794, "grad_norm": 16.75, "learning_rate": 1.881737736197544e-06, "loss": 2.2886, "step": 8165 }, { "epoch": 0.24089635854341737, "grad_norm": 22.625, "learning_rate": 1.881494849289758e-06, "loss": 2.1599, "step": 8170 }, { "epoch": 0.2410437859354268, "grad_norm": 13.875, "learning_rate": 1.8812517289259454e-06, "loss": 2.1261, "step": 8175 }, { "epoch": 0.24119121332743623, "grad_norm": 14.75, "learning_rate": 1.881008375170494e-06, "loss": 2.2204, "step": 8180 }, { "epoch": 0.24133864071944566, "grad_norm": 14.0, "learning_rate": 1.880764788087854e-06, "loss": 2.1926, "step": 8185 }, { "epoch": 0.24148606811145512, "grad_norm": 15.75, "learning_rate": 1.8805209677425374e-06, "loss": 2.1834, "step": 8190 }, { "epoch": 0.24163349550346455, "grad_norm": 15.125, "learning_rate": 1.8802769141991177e-06, "loss": 2.2266, "step": 8195 }, { "epoch": 0.24178092289547398, "grad_norm": 15.625, "learning_rate": 1.8800326275222304e-06, "loss": 2.3082, "step": 8200 }, { "epoch": 0.2419283502874834, "grad_norm": 15.5, "learning_rate": 1.8797881077765724e-06, "loss": 2.2098, "step": 8205 }, { "epoch": 0.24207577767949284, "grad_norm": 14.5, "learning_rate": 1.8795433550269028e-06, "loss": 2.139, "step": 8210 }, { "epoch": 0.24222320507150227, "grad_norm": 25.625, "learning_rate": 1.8792983693380424e-06, "loss": 2.2472, "step": 8215 }, { "epoch": 0.24237063246351173, "grad_norm": 15.5, "learning_rate": 1.879053150774873e-06, "loss": 2.2395, "step": 8220 }, { "epoch": 0.24251805985552116, "grad_norm": 14.25, "learning_rate": 1.8788076994023387e-06, "loss": 2.1807, "step": 8225 }, { "epoch": 0.2426654872475306, "grad_norm": 14.625, "learning_rate": 1.8785620152854453e-06, "loss": 2.2557, "step": 8230 }, { "epoch": 0.24281291463954002, "grad_norm": 14.8125, "learning_rate": 1.8783160984892598e-06, "loss": 2.1958, "step": 8235 }, { "epoch": 0.24296034203154945, "grad_norm": 14.5, "learning_rate": 1.878069949078911e-06, "loss": 2.2767, "step": 8240 }, { "epoch": 0.24310776942355888, "grad_norm": 14.25, "learning_rate": 1.8778235671195897e-06, "loss": 2.3034, "step": 8245 }, { "epoch": 0.24325519681556834, "grad_norm": 15.375, "learning_rate": 1.877576952676548e-06, "loss": 2.2439, "step": 8250 }, { "epoch": 0.24340262420757777, "grad_norm": 16.5, "learning_rate": 1.877330105815099e-06, "loss": 2.1794, "step": 8255 }, { "epoch": 0.2435500515995872, "grad_norm": 16.0, "learning_rate": 1.8770830266006182e-06, "loss": 2.212, "step": 8260 }, { "epoch": 0.24369747899159663, "grad_norm": 15.5, "learning_rate": 1.8768357150985424e-06, "loss": 2.0937, "step": 8265 }, { "epoch": 0.24384490638360606, "grad_norm": 16.375, "learning_rate": 1.8765881713743696e-06, "loss": 2.2065, "step": 8270 }, { "epoch": 0.24399233377561552, "grad_norm": 13.625, "learning_rate": 1.87634039549366e-06, "loss": 2.3326, "step": 8275 }, { "epoch": 0.24413976116762495, "grad_norm": 14.6875, "learning_rate": 1.876092387522034e-06, "loss": 2.1759, "step": 8280 }, { "epoch": 0.24428718855963438, "grad_norm": 17.0, "learning_rate": 1.875844147525175e-06, "loss": 2.2042, "step": 8285 }, { "epoch": 0.2444346159516438, "grad_norm": 25.375, "learning_rate": 1.875595675568827e-06, "loss": 2.2385, "step": 8290 }, { "epoch": 0.24458204334365324, "grad_norm": 15.125, "learning_rate": 1.8753469717187956e-06, "loss": 2.1559, "step": 8295 }, { "epoch": 0.24472947073566267, "grad_norm": 15.125, "learning_rate": 1.8750980360409478e-06, "loss": 2.1629, "step": 8300 }, { "epoch": 0.24487689812767213, "grad_norm": 13.8125, "learning_rate": 1.8748488686012118e-06, "loss": 2.136, "step": 8305 }, { "epoch": 0.24502432551968156, "grad_norm": 16.0, "learning_rate": 1.8745994694655775e-06, "loss": 2.1955, "step": 8310 }, { "epoch": 0.245171752911691, "grad_norm": 13.375, "learning_rate": 1.874349838700096e-06, "loss": 2.2115, "step": 8315 }, { "epoch": 0.24531918030370042, "grad_norm": 16.5, "learning_rate": 1.8740999763708798e-06, "loss": 2.1912, "step": 8320 }, { "epoch": 0.24546660769570985, "grad_norm": 14.4375, "learning_rate": 1.8738498825441025e-06, "loss": 2.247, "step": 8325 }, { "epoch": 0.24561403508771928, "grad_norm": 37.75, "learning_rate": 1.8735995572859995e-06, "loss": 2.2053, "step": 8330 }, { "epoch": 0.24576146247972874, "grad_norm": 15.5, "learning_rate": 1.8733490006628672e-06, "loss": 2.1562, "step": 8335 }, { "epoch": 0.24590888987173817, "grad_norm": 13.0, "learning_rate": 1.873098212741063e-06, "loss": 2.1676, "step": 8340 }, { "epoch": 0.2460563172637476, "grad_norm": 13.75, "learning_rate": 1.8728471935870062e-06, "loss": 2.1155, "step": 8345 }, { "epoch": 0.24620374465575703, "grad_norm": 11.8125, "learning_rate": 1.8725959432671767e-06, "loss": 2.1968, "step": 8350 }, { "epoch": 0.24635117204776646, "grad_norm": 14.1875, "learning_rate": 1.8723444618481162e-06, "loss": 2.2811, "step": 8355 }, { "epoch": 0.24649859943977592, "grad_norm": 14.375, "learning_rate": 1.872092749396427e-06, "loss": 2.177, "step": 8360 }, { "epoch": 0.24664602683178535, "grad_norm": 14.9375, "learning_rate": 1.8718408059787727e-06, "loss": 2.1862, "step": 8365 }, { "epoch": 0.24679345422379478, "grad_norm": 14.3125, "learning_rate": 1.8715886316618787e-06, "loss": 2.2133, "step": 8370 }, { "epoch": 0.24694088161580421, "grad_norm": 11.5625, "learning_rate": 1.8713362265125313e-06, "loss": 2.0847, "step": 8375 }, { "epoch": 0.24708830900781364, "grad_norm": 16.75, "learning_rate": 1.871083590597577e-06, "loss": 2.2498, "step": 8380 }, { "epoch": 0.24723573639982308, "grad_norm": 18.375, "learning_rate": 1.8708307239839248e-06, "loss": 2.2175, "step": 8385 }, { "epoch": 0.24738316379183253, "grad_norm": 20.5, "learning_rate": 1.8705776267385436e-06, "loss": 2.2745, "step": 8390 }, { "epoch": 0.24753059118384196, "grad_norm": 15.0625, "learning_rate": 1.8703242989284647e-06, "loss": 2.0961, "step": 8395 }, { "epoch": 0.2476780185758514, "grad_norm": 17.875, "learning_rate": 1.8700707406207793e-06, "loss": 2.2088, "step": 8400 }, { "epoch": 0.24782544596786082, "grad_norm": 14.4375, "learning_rate": 1.8698169518826397e-06, "loss": 2.1112, "step": 8405 }, { "epoch": 0.24797287335987026, "grad_norm": 16.75, "learning_rate": 1.86956293278126e-06, "loss": 2.2312, "step": 8410 }, { "epoch": 0.24812030075187969, "grad_norm": 15.8125, "learning_rate": 1.8693086833839148e-06, "loss": 2.1858, "step": 8415 }, { "epoch": 0.24826772814388914, "grad_norm": 14.75, "learning_rate": 1.86905420375794e-06, "loss": 2.2127, "step": 8420 }, { "epoch": 0.24841515553589857, "grad_norm": 14.75, "learning_rate": 1.868799493970732e-06, "loss": 2.1444, "step": 8425 }, { "epoch": 0.248562582927908, "grad_norm": 14.625, "learning_rate": 1.8685445540897483e-06, "loss": 2.2396, "step": 8430 }, { "epoch": 0.24871001031991744, "grad_norm": 31.125, "learning_rate": 1.8682893841825074e-06, "loss": 2.1866, "step": 8435 }, { "epoch": 0.24885743771192687, "grad_norm": 14.5, "learning_rate": 1.8680339843165891e-06, "loss": 2.1701, "step": 8440 }, { "epoch": 0.24900486510393632, "grad_norm": 14.5, "learning_rate": 1.8677783545596338e-06, "loss": 2.1813, "step": 8445 }, { "epoch": 0.24915229249594575, "grad_norm": 15.0625, "learning_rate": 1.8675224949793424e-06, "loss": 2.2112, "step": 8450 }, { "epoch": 0.24929971988795518, "grad_norm": 15.8125, "learning_rate": 1.8672664056434773e-06, "loss": 2.2627, "step": 8455 }, { "epoch": 0.24944714727996462, "grad_norm": 14.75, "learning_rate": 1.8670100866198613e-06, "loss": 2.2006, "step": 8460 }, { "epoch": 0.24959457467197405, "grad_norm": 15.0, "learning_rate": 1.866753537976378e-06, "loss": 2.1875, "step": 8465 }, { "epoch": 0.24974200206398348, "grad_norm": 13.625, "learning_rate": 1.8664967597809729e-06, "loss": 2.3218, "step": 8470 }, { "epoch": 0.24988942945599293, "grad_norm": 15.1875, "learning_rate": 1.8662397521016503e-06, "loss": 2.2054, "step": 8475 }, { "epoch": 0.25003685684800236, "grad_norm": 17.375, "learning_rate": 1.8659825150064773e-06, "loss": 2.2152, "step": 8480 }, { "epoch": 0.25018428424001177, "grad_norm": 16.25, "learning_rate": 1.86572504856358e-06, "loss": 2.0906, "step": 8485 }, { "epoch": 0.2503317116320212, "grad_norm": 14.875, "learning_rate": 1.8654673528411466e-06, "loss": 2.1785, "step": 8490 }, { "epoch": 0.2504791390240307, "grad_norm": 17.375, "learning_rate": 1.8652094279074255e-06, "loss": 2.3136, "step": 8495 }, { "epoch": 0.2506265664160401, "grad_norm": 14.0625, "learning_rate": 1.8649512738307258e-06, "loss": 2.1993, "step": 8500 }, { "epoch": 0.2506265664160401, "eval_loss": 2.151705265045166, "eval_runtime": 4.7212, "eval_samples_per_second": 83.877, "eval_steps_per_second": 2.754, "step": 8500 }, { "epoch": 0.25077399380804954, "grad_norm": 17.125, "learning_rate": 1.864692890679417e-06, "loss": 2.1964, "step": 8505 }, { "epoch": 0.25092142120005895, "grad_norm": 16.75, "learning_rate": 1.86443427852193e-06, "loss": 2.2787, "step": 8510 }, { "epoch": 0.2510688485920684, "grad_norm": 14.625, "learning_rate": 1.8641754374267558e-06, "loss": 2.1101, "step": 8515 }, { "epoch": 0.25121627598407786, "grad_norm": 14.75, "learning_rate": 1.863916367462446e-06, "loss": 2.26, "step": 8520 }, { "epoch": 0.25136370337608727, "grad_norm": 14.125, "learning_rate": 1.8636570686976127e-06, "loss": 2.1509, "step": 8525 }, { "epoch": 0.2515111307680967, "grad_norm": 16.125, "learning_rate": 1.8633975412009294e-06, "loss": 2.2034, "step": 8530 }, { "epoch": 0.2516585581601061, "grad_norm": 15.125, "learning_rate": 1.8631377850411293e-06, "loss": 2.1605, "step": 8535 }, { "epoch": 0.2518059855521156, "grad_norm": 15.75, "learning_rate": 1.8628778002870069e-06, "loss": 2.1062, "step": 8540 }, { "epoch": 0.25195341294412504, "grad_norm": 17.0, "learning_rate": 1.8626175870074165e-06, "loss": 2.1868, "step": 8545 }, { "epoch": 0.25210084033613445, "grad_norm": 17.125, "learning_rate": 1.862357145271273e-06, "loss": 2.2456, "step": 8550 }, { "epoch": 0.2522482677281439, "grad_norm": 15.5625, "learning_rate": 1.8620964751475525e-06, "loss": 2.2008, "step": 8555 }, { "epoch": 0.2523956951201533, "grad_norm": 21.625, "learning_rate": 1.861835576705291e-06, "loss": 2.1544, "step": 8560 }, { "epoch": 0.25254312251216277, "grad_norm": 14.625, "learning_rate": 1.8615744500135855e-06, "loss": 2.18, "step": 8565 }, { "epoch": 0.25269054990417217, "grad_norm": 13.5625, "learning_rate": 1.8613130951415924e-06, "loss": 2.1628, "step": 8570 }, { "epoch": 0.2528379772961816, "grad_norm": 14.625, "learning_rate": 1.8610515121585296e-06, "loss": 2.1433, "step": 8575 }, { "epoch": 0.2529854046881911, "grad_norm": 12.875, "learning_rate": 1.8607897011336746e-06, "loss": 2.1122, "step": 8580 }, { "epoch": 0.2531328320802005, "grad_norm": 14.25, "learning_rate": 1.8605276621363664e-06, "loss": 2.1748, "step": 8585 }, { "epoch": 0.25328025947220995, "grad_norm": 19.625, "learning_rate": 1.8602653952360032e-06, "loss": 2.1691, "step": 8590 }, { "epoch": 0.25342768686421935, "grad_norm": 14.8125, "learning_rate": 1.8600029005020442e-06, "loss": 2.2135, "step": 8595 }, { "epoch": 0.2535751142562288, "grad_norm": 14.5625, "learning_rate": 1.8597401780040088e-06, "loss": 2.2858, "step": 8600 }, { "epoch": 0.25372254164823826, "grad_norm": 21.0, "learning_rate": 1.8594772278114764e-06, "loss": 2.2727, "step": 8605 }, { "epoch": 0.25386996904024767, "grad_norm": 16.125, "learning_rate": 1.8592140499940876e-06, "loss": 2.1415, "step": 8610 }, { "epoch": 0.2540173964322571, "grad_norm": 12.5625, "learning_rate": 1.858950644621542e-06, "loss": 2.1434, "step": 8615 }, { "epoch": 0.25416482382426653, "grad_norm": 13.375, "learning_rate": 1.858687011763601e-06, "loss": 2.2154, "step": 8620 }, { "epoch": 0.254312251216276, "grad_norm": 15.5, "learning_rate": 1.8584231514900842e-06, "loss": 2.2386, "step": 8625 }, { "epoch": 0.25445967860828544, "grad_norm": 16.0, "learning_rate": 1.8581590638708739e-06, "loss": 2.2126, "step": 8630 }, { "epoch": 0.25460710600029485, "grad_norm": 19.125, "learning_rate": 1.8578947489759105e-06, "loss": 2.2209, "step": 8635 }, { "epoch": 0.2547545333923043, "grad_norm": 14.9375, "learning_rate": 1.8576302068751958e-06, "loss": 2.2278, "step": 8640 }, { "epoch": 0.2549019607843137, "grad_norm": 14.6875, "learning_rate": 1.8573654376387915e-06, "loss": 2.0973, "step": 8645 }, { "epoch": 0.25504938817632317, "grad_norm": 13.8125, "learning_rate": 1.857100441336819e-06, "loss": 2.1817, "step": 8650 }, { "epoch": 0.25519681556833257, "grad_norm": 17.125, "learning_rate": 1.8568352180394603e-06, "loss": 2.161, "step": 8655 }, { "epoch": 0.255344242960342, "grad_norm": 14.9375, "learning_rate": 1.8565697678169578e-06, "loss": 2.2521, "step": 8660 }, { "epoch": 0.2554916703523515, "grad_norm": 14.1875, "learning_rate": 1.8563040907396132e-06, "loss": 2.1843, "step": 8665 }, { "epoch": 0.2556390977443609, "grad_norm": 16.75, "learning_rate": 1.8560381868777886e-06, "loss": 2.1212, "step": 8670 }, { "epoch": 0.25578652513637035, "grad_norm": 15.5625, "learning_rate": 1.8557720563019068e-06, "loss": 2.2056, "step": 8675 }, { "epoch": 0.25593395252837975, "grad_norm": 12.0625, "learning_rate": 1.8555056990824496e-06, "loss": 2.0907, "step": 8680 }, { "epoch": 0.2560813799203892, "grad_norm": 16.75, "learning_rate": 1.8552391152899599e-06, "loss": 2.3035, "step": 8685 }, { "epoch": 0.25622880731239867, "grad_norm": 15.375, "learning_rate": 1.854972304995039e-06, "loss": 2.2537, "step": 8690 }, { "epoch": 0.25637623470440807, "grad_norm": 17.25, "learning_rate": 1.8547052682683505e-06, "loss": 2.2827, "step": 8695 }, { "epoch": 0.2565236620964175, "grad_norm": 14.9375, "learning_rate": 1.8544380051806157e-06, "loss": 2.1187, "step": 8700 }, { "epoch": 0.25667108948842693, "grad_norm": 14.9375, "learning_rate": 1.8541705158026173e-06, "loss": 2.1568, "step": 8705 }, { "epoch": 0.2568185168804364, "grad_norm": 15.875, "learning_rate": 1.8539028002051973e-06, "loss": 2.1486, "step": 8710 }, { "epoch": 0.25696594427244585, "grad_norm": 15.125, "learning_rate": 1.8536348584592578e-06, "loss": 2.2145, "step": 8715 }, { "epoch": 0.25711337166445525, "grad_norm": 13.0625, "learning_rate": 1.8533666906357607e-06, "loss": 2.1246, "step": 8720 }, { "epoch": 0.2572607990564647, "grad_norm": 14.8125, "learning_rate": 1.8530982968057285e-06, "loss": 2.2059, "step": 8725 }, { "epoch": 0.2574082264484741, "grad_norm": 14.125, "learning_rate": 1.852829677040242e-06, "loss": 2.2232, "step": 8730 }, { "epoch": 0.25755565384048357, "grad_norm": 14.3125, "learning_rate": 1.8525608314104431e-06, "loss": 2.2328, "step": 8735 }, { "epoch": 0.25770308123249297, "grad_norm": 15.5, "learning_rate": 1.8522917599875334e-06, "loss": 2.2464, "step": 8740 }, { "epoch": 0.25785050862450243, "grad_norm": 15.375, "learning_rate": 1.8520224628427736e-06, "loss": 2.2532, "step": 8745 }, { "epoch": 0.2579979360165119, "grad_norm": 16.125, "learning_rate": 1.8517529400474848e-06, "loss": 2.1911, "step": 8750 }, { "epoch": 0.2581453634085213, "grad_norm": 14.6875, "learning_rate": 1.8514831916730482e-06, "loss": 2.2427, "step": 8755 }, { "epoch": 0.25829279080053075, "grad_norm": 13.625, "learning_rate": 1.8512132177909034e-06, "loss": 2.1896, "step": 8760 }, { "epoch": 0.25844021819254015, "grad_norm": 13.0625, "learning_rate": 1.8509430184725513e-06, "loss": 2.0831, "step": 8765 }, { "epoch": 0.2585876455845496, "grad_norm": 12.9375, "learning_rate": 1.8506725937895515e-06, "loss": 2.1604, "step": 8770 }, { "epoch": 0.25873507297655907, "grad_norm": 28.875, "learning_rate": 1.8504019438135235e-06, "loss": 2.2493, "step": 8775 }, { "epoch": 0.25888250036856847, "grad_norm": 15.1875, "learning_rate": 1.8501310686161463e-06, "loss": 2.1755, "step": 8780 }, { "epoch": 0.2590299277605779, "grad_norm": 14.3125, "learning_rate": 1.8498599682691592e-06, "loss": 2.2261, "step": 8785 }, { "epoch": 0.25917735515258733, "grad_norm": 18.625, "learning_rate": 1.8495886428443605e-06, "loss": 2.1731, "step": 8790 }, { "epoch": 0.2593247825445968, "grad_norm": 15.125, "learning_rate": 1.8493170924136083e-06, "loss": 2.3196, "step": 8795 }, { "epoch": 0.25947220993660625, "grad_norm": 16.75, "learning_rate": 1.8490453170488202e-06, "loss": 2.2446, "step": 8800 }, { "epoch": 0.25961963732861565, "grad_norm": 14.875, "learning_rate": 1.8487733168219739e-06, "loss": 2.1648, "step": 8805 }, { "epoch": 0.2597670647206251, "grad_norm": 14.3125, "learning_rate": 1.8485010918051059e-06, "loss": 2.2215, "step": 8810 }, { "epoch": 0.2599144921126345, "grad_norm": 17.5, "learning_rate": 1.8482286420703125e-06, "loss": 2.3307, "step": 8815 }, { "epoch": 0.26006191950464397, "grad_norm": 14.375, "learning_rate": 1.84795596768975e-06, "loss": 2.1618, "step": 8820 }, { "epoch": 0.26020934689665337, "grad_norm": 14.9375, "learning_rate": 1.847683068735633e-06, "loss": 2.2691, "step": 8825 }, { "epoch": 0.26035677428866283, "grad_norm": 11.0625, "learning_rate": 1.8474099452802369e-06, "loss": 2.117, "step": 8830 }, { "epoch": 0.2605042016806723, "grad_norm": 14.3125, "learning_rate": 1.8471365973958962e-06, "loss": 2.0912, "step": 8835 }, { "epoch": 0.2606516290726817, "grad_norm": 14.8125, "learning_rate": 1.846863025155004e-06, "loss": 2.1718, "step": 8840 }, { "epoch": 0.26079905646469115, "grad_norm": 17.0, "learning_rate": 1.8465892286300137e-06, "loss": 2.0818, "step": 8845 }, { "epoch": 0.26094648385670055, "grad_norm": 16.875, "learning_rate": 1.8463152078934383e-06, "loss": 2.2438, "step": 8850 }, { "epoch": 0.26109391124871, "grad_norm": 13.9375, "learning_rate": 1.8460409630178493e-06, "loss": 2.2129, "step": 8855 }, { "epoch": 0.26124133864071947, "grad_norm": 15.3125, "learning_rate": 1.8457664940758782e-06, "loss": 2.1616, "step": 8860 }, { "epoch": 0.26138876603272887, "grad_norm": 14.5, "learning_rate": 1.8454918011402155e-06, "loss": 2.2635, "step": 8865 }, { "epoch": 0.26153619342473833, "grad_norm": 14.0625, "learning_rate": 1.8452168842836114e-06, "loss": 2.2376, "step": 8870 }, { "epoch": 0.26168362081674773, "grad_norm": 13.5, "learning_rate": 1.8449417435788748e-06, "loss": 2.1005, "step": 8875 }, { "epoch": 0.2618310482087572, "grad_norm": 17.5, "learning_rate": 1.8446663790988742e-06, "loss": 2.164, "step": 8880 }, { "epoch": 0.26197847560076665, "grad_norm": 15.5, "learning_rate": 1.844390790916538e-06, "loss": 2.1707, "step": 8885 }, { "epoch": 0.26212590299277605, "grad_norm": 20.25, "learning_rate": 1.844114979104853e-06, "loss": 2.2746, "step": 8890 }, { "epoch": 0.2622733303847855, "grad_norm": 13.8125, "learning_rate": 1.843838943736865e-06, "loss": 2.1086, "step": 8895 }, { "epoch": 0.2624207577767949, "grad_norm": 16.125, "learning_rate": 1.8435626848856805e-06, "loss": 2.3454, "step": 8900 }, { "epoch": 0.26256818516880437, "grad_norm": 12.5625, "learning_rate": 1.8432862026244633e-06, "loss": 2.188, "step": 8905 }, { "epoch": 0.26271561256081377, "grad_norm": 14.8125, "learning_rate": 1.8430094970264374e-06, "loss": 2.1443, "step": 8910 }, { "epoch": 0.26286303995282323, "grad_norm": 18.375, "learning_rate": 1.8427325681648861e-06, "loss": 2.1986, "step": 8915 }, { "epoch": 0.2630104673448327, "grad_norm": 15.5, "learning_rate": 1.8424554161131515e-06, "loss": 2.1675, "step": 8920 }, { "epoch": 0.2631578947368421, "grad_norm": 13.4375, "learning_rate": 1.8421780409446347e-06, "loss": 2.2025, "step": 8925 }, { "epoch": 0.26330532212885155, "grad_norm": 14.125, "learning_rate": 1.841900442732796e-06, "loss": 2.1276, "step": 8930 }, { "epoch": 0.26345274952086095, "grad_norm": 17.75, "learning_rate": 1.841622621551155e-06, "loss": 2.1602, "step": 8935 }, { "epoch": 0.2636001769128704, "grad_norm": 13.875, "learning_rate": 1.8413445774732901e-06, "loss": 2.2945, "step": 8940 }, { "epoch": 0.26374760430487987, "grad_norm": 15.375, "learning_rate": 1.8410663105728387e-06, "loss": 2.0937, "step": 8945 }, { "epoch": 0.26389503169688927, "grad_norm": 17.75, "learning_rate": 1.840787820923497e-06, "loss": 2.2, "step": 8950 }, { "epoch": 0.26404245908889873, "grad_norm": 14.625, "learning_rate": 1.8405091085990213e-06, "loss": 2.1271, "step": 8955 }, { "epoch": 0.26418988648090813, "grad_norm": 25.75, "learning_rate": 1.840230173673225e-06, "loss": 2.1916, "step": 8960 }, { "epoch": 0.2643373138729176, "grad_norm": 17.25, "learning_rate": 1.8399510162199824e-06, "loss": 2.2481, "step": 8965 }, { "epoch": 0.26448474126492705, "grad_norm": 15.8125, "learning_rate": 1.8396716363132255e-06, "loss": 2.1464, "step": 8970 }, { "epoch": 0.26463216865693645, "grad_norm": 17.0, "learning_rate": 1.8393920340269458e-06, "loss": 2.1157, "step": 8975 }, { "epoch": 0.2647795960489459, "grad_norm": 14.3125, "learning_rate": 1.8391122094351933e-06, "loss": 2.0538, "step": 8980 }, { "epoch": 0.2649270234409553, "grad_norm": 12.6875, "learning_rate": 1.8388321626120769e-06, "loss": 2.2269, "step": 8985 }, { "epoch": 0.26507445083296477, "grad_norm": 16.75, "learning_rate": 1.8385518936317645e-06, "loss": 2.1711, "step": 8990 }, { "epoch": 0.26522187822497423, "grad_norm": 13.75, "learning_rate": 1.838271402568483e-06, "loss": 2.1552, "step": 8995 }, { "epoch": 0.26536930561698363, "grad_norm": 14.3125, "learning_rate": 1.837990689496518e-06, "loss": 2.1522, "step": 9000 }, { "epoch": 0.26536930561698363, "eval_loss": 2.142622232437134, "eval_runtime": 4.7105, "eval_samples_per_second": 84.067, "eval_steps_per_second": 2.76, "step": 9000 }, { "epoch": 0.2655167330089931, "grad_norm": 17.5, "learning_rate": 1.837709754490214e-06, "loss": 2.1945, "step": 9005 }, { "epoch": 0.2656641604010025, "grad_norm": 16.125, "learning_rate": 1.8374285976239734e-06, "loss": 2.2451, "step": 9010 }, { "epoch": 0.26581158779301195, "grad_norm": 15.625, "learning_rate": 1.8371472189722589e-06, "loss": 2.2319, "step": 9015 }, { "epoch": 0.26595901518502135, "grad_norm": 19.25, "learning_rate": 1.8368656186095905e-06, "loss": 2.2219, "step": 9020 }, { "epoch": 0.2661064425770308, "grad_norm": 22.125, "learning_rate": 1.8365837966105481e-06, "loss": 2.1642, "step": 9025 }, { "epoch": 0.26625386996904027, "grad_norm": 15.6875, "learning_rate": 1.8363017530497693e-06, "loss": 2.2582, "step": 9030 }, { "epoch": 0.26640129736104967, "grad_norm": 14.0625, "learning_rate": 1.836019488001951e-06, "loss": 2.1031, "step": 9035 }, { "epoch": 0.26654872475305913, "grad_norm": 15.6875, "learning_rate": 1.8357370015418488e-06, "loss": 2.225, "step": 9040 }, { "epoch": 0.26669615214506853, "grad_norm": 15.0, "learning_rate": 1.835454293744276e-06, "loss": 2.254, "step": 9045 }, { "epoch": 0.266843579537078, "grad_norm": 14.9375, "learning_rate": 1.8351713646841059e-06, "loss": 2.2191, "step": 9050 }, { "epoch": 0.26699100692908745, "grad_norm": 14.6875, "learning_rate": 1.8348882144362694e-06, "loss": 2.2358, "step": 9055 }, { "epoch": 0.26713843432109685, "grad_norm": 12.8125, "learning_rate": 1.8346048430757566e-06, "loss": 2.1893, "step": 9060 }, { "epoch": 0.2672858617131063, "grad_norm": 15.9375, "learning_rate": 1.8343212506776155e-06, "loss": 2.1395, "step": 9065 }, { "epoch": 0.2674332891051157, "grad_norm": 16.375, "learning_rate": 1.8340374373169533e-06, "loss": 2.1742, "step": 9070 }, { "epoch": 0.26758071649712517, "grad_norm": 18.125, "learning_rate": 1.8337534030689355e-06, "loss": 2.1369, "step": 9075 }, { "epoch": 0.26772814388913463, "grad_norm": 22.75, "learning_rate": 1.8334691480087856e-06, "loss": 2.1244, "step": 9080 }, { "epoch": 0.26787557128114403, "grad_norm": 13.6875, "learning_rate": 1.8331846722117864e-06, "loss": 2.227, "step": 9085 }, { "epoch": 0.2680229986731535, "grad_norm": 15.75, "learning_rate": 1.8328999757532788e-06, "loss": 2.1094, "step": 9090 }, { "epoch": 0.2681704260651629, "grad_norm": 13.75, "learning_rate": 1.8326150587086617e-06, "loss": 2.1565, "step": 9095 }, { "epoch": 0.26831785345717235, "grad_norm": 14.375, "learning_rate": 1.832329921153393e-06, "loss": 2.272, "step": 9100 }, { "epoch": 0.26846528084918175, "grad_norm": 17.375, "learning_rate": 1.8320445631629892e-06, "loss": 2.2073, "step": 9105 }, { "epoch": 0.2686127082411912, "grad_norm": 10.8125, "learning_rate": 1.8317589848130246e-06, "loss": 2.052, "step": 9110 }, { "epoch": 0.26876013563320067, "grad_norm": 14.0625, "learning_rate": 1.831473186179132e-06, "loss": 2.1517, "step": 9115 }, { "epoch": 0.2689075630252101, "grad_norm": 12.75, "learning_rate": 1.8311871673370023e-06, "loss": 2.0941, "step": 9120 }, { "epoch": 0.26905499041721953, "grad_norm": 17.125, "learning_rate": 1.8309009283623854e-06, "loss": 2.2009, "step": 9125 }, { "epoch": 0.26920241780922893, "grad_norm": 16.75, "learning_rate": 1.8306144693310893e-06, "loss": 2.0812, "step": 9130 }, { "epoch": 0.2693498452012384, "grad_norm": 12.75, "learning_rate": 1.8303277903189798e-06, "loss": 2.1815, "step": 9135 }, { "epoch": 0.26949727259324785, "grad_norm": 14.3125, "learning_rate": 1.8300408914019813e-06, "loss": 2.1776, "step": 9140 }, { "epoch": 0.26964469998525725, "grad_norm": 16.125, "learning_rate": 1.8297537726560766e-06, "loss": 2.2445, "step": 9145 }, { "epoch": 0.2697921273772667, "grad_norm": 15.875, "learning_rate": 1.8294664341573063e-06, "loss": 2.2909, "step": 9150 }, { "epoch": 0.2699395547692761, "grad_norm": 16.625, "learning_rate": 1.8291788759817695e-06, "loss": 2.2258, "step": 9155 }, { "epoch": 0.27008698216128557, "grad_norm": 12.0625, "learning_rate": 1.8288910982056237e-06, "loss": 2.1185, "step": 9160 }, { "epoch": 0.27023440955329503, "grad_norm": 14.5625, "learning_rate": 1.8286031009050837e-06, "loss": 2.1964, "step": 9165 }, { "epoch": 0.27038183694530443, "grad_norm": 11.8125, "learning_rate": 1.8283148841564234e-06, "loss": 2.1557, "step": 9170 }, { "epoch": 0.2705292643373139, "grad_norm": 14.6875, "learning_rate": 1.8280264480359747e-06, "loss": 2.1727, "step": 9175 }, { "epoch": 0.2706766917293233, "grad_norm": 14.25, "learning_rate": 1.8277377926201268e-06, "loss": 2.1334, "step": 9180 }, { "epoch": 0.27082411912133275, "grad_norm": 14.9375, "learning_rate": 1.8274489179853273e-06, "loss": 2.0905, "step": 9185 }, { "epoch": 0.27097154651334215, "grad_norm": 17.5, "learning_rate": 1.827159824208083e-06, "loss": 2.2571, "step": 9190 }, { "epoch": 0.2711189739053516, "grad_norm": 15.125, "learning_rate": 1.826870511364957e-06, "loss": 2.1973, "step": 9195 }, { "epoch": 0.27126640129736107, "grad_norm": 12.4375, "learning_rate": 1.8265809795325713e-06, "loss": 2.0869, "step": 9200 }, { "epoch": 0.2714138286893705, "grad_norm": 27.0, "learning_rate": 1.8262912287876065e-06, "loss": 2.1648, "step": 9205 }, { "epoch": 0.27156125608137993, "grad_norm": 20.5, "learning_rate": 1.8260012592067995e-06, "loss": 2.1417, "step": 9210 }, { "epoch": 0.27170868347338933, "grad_norm": 16.0, "learning_rate": 1.825711070866947e-06, "loss": 2.224, "step": 9215 }, { "epoch": 0.2718561108653988, "grad_norm": 15.625, "learning_rate": 1.825420663844902e-06, "loss": 2.1834, "step": 9220 }, { "epoch": 0.27200353825740825, "grad_norm": 15.25, "learning_rate": 1.8251300382175767e-06, "loss": 2.0962, "step": 9225 }, { "epoch": 0.27215096564941765, "grad_norm": 13.8125, "learning_rate": 1.824839194061941e-06, "loss": 2.1178, "step": 9230 }, { "epoch": 0.2722983930414271, "grad_norm": 21.625, "learning_rate": 1.824548131455022e-06, "loss": 2.2477, "step": 9235 }, { "epoch": 0.2724458204334365, "grad_norm": 17.5, "learning_rate": 1.8242568504739046e-06, "loss": 2.1384, "step": 9240 }, { "epoch": 0.272593247825446, "grad_norm": 17.125, "learning_rate": 1.8239653511957326e-06, "loss": 2.2245, "step": 9245 }, { "epoch": 0.27274067521745543, "grad_norm": 15.1875, "learning_rate": 1.8236736336977065e-06, "loss": 2.2292, "step": 9250 }, { "epoch": 0.27288810260946483, "grad_norm": 16.125, "learning_rate": 1.8233816980570857e-06, "loss": 2.2241, "step": 9255 }, { "epoch": 0.2730355300014743, "grad_norm": 16.875, "learning_rate": 1.8230895443511861e-06, "loss": 2.2429, "step": 9260 }, { "epoch": 0.2731829573934837, "grad_norm": 12.25, "learning_rate": 1.8227971726573825e-06, "loss": 2.2361, "step": 9265 }, { "epoch": 0.27333038478549315, "grad_norm": 13.75, "learning_rate": 1.8225045830531068e-06, "loss": 2.2299, "step": 9270 }, { "epoch": 0.27347781217750256, "grad_norm": 11.8125, "learning_rate": 1.8222117756158486e-06, "loss": 2.1748, "step": 9275 }, { "epoch": 0.273625239569512, "grad_norm": 14.4375, "learning_rate": 1.8219187504231553e-06, "loss": 2.1427, "step": 9280 }, { "epoch": 0.27377266696152147, "grad_norm": 16.5, "learning_rate": 1.821625507552632e-06, "loss": 2.2049, "step": 9285 }, { "epoch": 0.2739200943535309, "grad_norm": 13.6875, "learning_rate": 1.8213320470819413e-06, "loss": 2.1321, "step": 9290 }, { "epoch": 0.27406752174554033, "grad_norm": 13.125, "learning_rate": 1.821038369088804e-06, "loss": 2.057, "step": 9295 }, { "epoch": 0.27421494913754973, "grad_norm": 15.0625, "learning_rate": 1.820744473650998e-06, "loss": 2.0797, "step": 9300 }, { "epoch": 0.2743623765295592, "grad_norm": 15.3125, "learning_rate": 1.8204503608463586e-06, "loss": 2.1928, "step": 9305 }, { "epoch": 0.27450980392156865, "grad_norm": 15.4375, "learning_rate": 1.8201560307527793e-06, "loss": 2.2871, "step": 9310 }, { "epoch": 0.27465723131357805, "grad_norm": 15.4375, "learning_rate": 1.8198614834482107e-06, "loss": 2.2493, "step": 9315 }, { "epoch": 0.2748046587055875, "grad_norm": 14.9375, "learning_rate": 1.8195667190106607e-06, "loss": 2.1325, "step": 9320 }, { "epoch": 0.2749520860975969, "grad_norm": 18.5, "learning_rate": 1.8192717375181954e-06, "loss": 2.2074, "step": 9325 }, { "epoch": 0.2750995134896064, "grad_norm": 14.75, "learning_rate": 1.8189765390489375e-06, "loss": 2.1769, "step": 9330 }, { "epoch": 0.27524694088161583, "grad_norm": 14.75, "learning_rate": 1.8186811236810686e-06, "loss": 2.3256, "step": 9335 }, { "epoch": 0.27539436827362523, "grad_norm": 13.875, "learning_rate": 1.818385491492826e-06, "loss": 2.1801, "step": 9340 }, { "epoch": 0.2755417956656347, "grad_norm": 15.1875, "learning_rate": 1.8180896425625054e-06, "loss": 2.1626, "step": 9345 }, { "epoch": 0.2756892230576441, "grad_norm": 17.25, "learning_rate": 1.81779357696846e-06, "loss": 2.126, "step": 9350 }, { "epoch": 0.27583665044965355, "grad_norm": 14.875, "learning_rate": 1.8174972947890998e-06, "loss": 2.0943, "step": 9355 }, { "epoch": 0.27598407784166296, "grad_norm": 18.75, "learning_rate": 1.8172007961028928e-06, "loss": 2.2155, "step": 9360 }, { "epoch": 0.2761315052336724, "grad_norm": 13.5, "learning_rate": 1.8169040809883639e-06, "loss": 2.1628, "step": 9365 }, { "epoch": 0.2762789326256819, "grad_norm": 15.0625, "learning_rate": 1.8166071495240952e-06, "loss": 2.1454, "step": 9370 }, { "epoch": 0.2764263600176913, "grad_norm": 14.125, "learning_rate": 1.816310001788727e-06, "loss": 2.1551, "step": 9375 }, { "epoch": 0.27657378740970073, "grad_norm": 14.6875, "learning_rate": 1.8160126378609553e-06, "loss": 2.1669, "step": 9380 }, { "epoch": 0.27672121480171014, "grad_norm": 15.6875, "learning_rate": 1.8157150578195348e-06, "loss": 2.1653, "step": 9385 }, { "epoch": 0.2768686421937196, "grad_norm": 16.875, "learning_rate": 1.8154172617432772e-06, "loss": 2.2091, "step": 9390 }, { "epoch": 0.27701606958572905, "grad_norm": 14.5, "learning_rate": 1.8151192497110505e-06, "loss": 2.2048, "step": 9395 }, { "epoch": 0.27716349697773845, "grad_norm": 17.25, "learning_rate": 1.814821021801781e-06, "loss": 2.2176, "step": 9400 }, { "epoch": 0.2773109243697479, "grad_norm": 14.125, "learning_rate": 1.8145225780944515e-06, "loss": 2.2279, "step": 9405 }, { "epoch": 0.2774583517617573, "grad_norm": 15.3125, "learning_rate": 1.8142239186681022e-06, "loss": 2.1464, "step": 9410 }, { "epoch": 0.2776057791537668, "grad_norm": 13.4375, "learning_rate": 1.8139250436018303e-06, "loss": 2.2263, "step": 9415 }, { "epoch": 0.27775320654577623, "grad_norm": 16.125, "learning_rate": 1.81362595297479e-06, "loss": 2.1134, "step": 9420 }, { "epoch": 0.27790063393778563, "grad_norm": 15.25, "learning_rate": 1.8133266468661934e-06, "loss": 2.148, "step": 9425 }, { "epoch": 0.2780480613297951, "grad_norm": 14.625, "learning_rate": 1.8130271253553084e-06, "loss": 2.1851, "step": 9430 }, { "epoch": 0.2781954887218045, "grad_norm": 16.25, "learning_rate": 1.8127273885214609e-06, "loss": 2.2678, "step": 9435 }, { "epoch": 0.27834291611381395, "grad_norm": 16.375, "learning_rate": 1.8124274364440337e-06, "loss": 2.1842, "step": 9440 }, { "epoch": 0.27849034350582336, "grad_norm": 14.6875, "learning_rate": 1.8121272692024658e-06, "loss": 2.2537, "step": 9445 }, { "epoch": 0.2786377708978328, "grad_norm": 14.75, "learning_rate": 1.8118268868762546e-06, "loss": 2.1291, "step": 9450 }, { "epoch": 0.2787851982898423, "grad_norm": 15.0, "learning_rate": 1.811526289544953e-06, "loss": 2.1512, "step": 9455 }, { "epoch": 0.2789326256818517, "grad_norm": 14.5625, "learning_rate": 1.8112254772881717e-06, "loss": 2.1319, "step": 9460 }, { "epoch": 0.27908005307386113, "grad_norm": 14.8125, "learning_rate": 1.8109244501855782e-06, "loss": 2.1444, "step": 9465 }, { "epoch": 0.27922748046587054, "grad_norm": 13.875, "learning_rate": 1.810623208316897e-06, "loss": 2.1868, "step": 9470 }, { "epoch": 0.27937490785788, "grad_norm": 15.0625, "learning_rate": 1.8103217517619094e-06, "loss": 2.2162, "step": 9475 }, { "epoch": 0.27952233524988945, "grad_norm": 16.125, "learning_rate": 1.810020080600453e-06, "loss": 2.2668, "step": 9480 }, { "epoch": 0.27966976264189886, "grad_norm": 16.0, "learning_rate": 1.809718194912423e-06, "loss": 2.2179, "step": 9485 }, { "epoch": 0.2798171900339083, "grad_norm": 14.875, "learning_rate": 1.809416094777771e-06, "loss": 2.1202, "step": 9490 }, { "epoch": 0.2799646174259177, "grad_norm": 10.75, "learning_rate": 1.8091137802765058e-06, "loss": 2.0973, "step": 9495 }, { "epoch": 0.2801120448179272, "grad_norm": 12.0625, "learning_rate": 1.8088112514886923e-06, "loss": 2.0952, "step": 9500 }, { "epoch": 0.2801120448179272, "eval_loss": 2.1350300312042236, "eval_runtime": 4.7104, "eval_samples_per_second": 84.069, "eval_steps_per_second": 2.76, "step": 9500 }, { "epoch": 0.28025947220993663, "grad_norm": 16.5, "learning_rate": 1.8085085084944526e-06, "loss": 2.1947, "step": 9505 }, { "epoch": 0.28040689960194604, "grad_norm": 16.875, "learning_rate": 1.808205551373966e-06, "loss": 2.2008, "step": 9510 }, { "epoch": 0.2805543269939555, "grad_norm": 16.125, "learning_rate": 1.8079023802074674e-06, "loss": 2.244, "step": 9515 }, { "epoch": 0.2807017543859649, "grad_norm": 14.9375, "learning_rate": 1.807598995075249e-06, "loss": 2.2662, "step": 9520 }, { "epoch": 0.28084918177797435, "grad_norm": 15.75, "learning_rate": 1.80729539605766e-06, "loss": 2.1337, "step": 9525 }, { "epoch": 0.28099660916998376, "grad_norm": 14.375, "learning_rate": 1.8069915832351057e-06, "loss": 2.1507, "step": 9530 }, { "epoch": 0.2811440365619932, "grad_norm": 13.25, "learning_rate": 1.8066875566880482e-06, "loss": 2.1803, "step": 9535 }, { "epoch": 0.2812914639540027, "grad_norm": 11.25, "learning_rate": 1.8063833164970061e-06, "loss": 2.0851, "step": 9540 }, { "epoch": 0.2814388913460121, "grad_norm": 13.5, "learning_rate": 1.8060788627425548e-06, "loss": 2.2345, "step": 9545 }, { "epoch": 0.28158631873802153, "grad_norm": 15.875, "learning_rate": 1.8057741955053261e-06, "loss": 2.3918, "step": 9550 }, { "epoch": 0.28173374613003094, "grad_norm": 14.6875, "learning_rate": 1.8054693148660088e-06, "loss": 2.1558, "step": 9555 }, { "epoch": 0.2818811735220404, "grad_norm": 15.0625, "learning_rate": 1.805164220905347e-06, "loss": 2.1571, "step": 9560 }, { "epoch": 0.28202860091404985, "grad_norm": 15.0, "learning_rate": 1.8048589137041427e-06, "loss": 2.0988, "step": 9565 }, { "epoch": 0.28217602830605926, "grad_norm": 15.3125, "learning_rate": 1.8045533933432538e-06, "loss": 2.1293, "step": 9570 }, { "epoch": 0.2823234556980687, "grad_norm": 24.25, "learning_rate": 1.8042476599035944e-06, "loss": 2.1763, "step": 9575 }, { "epoch": 0.2824708830900781, "grad_norm": 14.5, "learning_rate": 1.8039417134661354e-06, "loss": 2.1462, "step": 9580 }, { "epoch": 0.2826183104820876, "grad_norm": 12.125, "learning_rate": 1.8036355541119038e-06, "loss": 2.2079, "step": 9585 }, { "epoch": 0.28276573787409703, "grad_norm": 14.5, "learning_rate": 1.8033291819219833e-06, "loss": 2.1601, "step": 9590 }, { "epoch": 0.28291316526610644, "grad_norm": 14.375, "learning_rate": 1.803022596977514e-06, "loss": 2.2201, "step": 9595 }, { "epoch": 0.2830605926581159, "grad_norm": 59.25, "learning_rate": 1.802715799359692e-06, "loss": 2.1743, "step": 9600 }, { "epoch": 0.2832080200501253, "grad_norm": 15.0625, "learning_rate": 1.8024087891497697e-06, "loss": 2.2809, "step": 9605 }, { "epoch": 0.28335544744213476, "grad_norm": 18.0, "learning_rate": 1.8021015664290563e-06, "loss": 2.0981, "step": 9610 }, { "epoch": 0.28350287483414416, "grad_norm": 14.25, "learning_rate": 1.8017941312789172e-06, "loss": 2.3601, "step": 9615 }, { "epoch": 0.2836503022261536, "grad_norm": 12.8125, "learning_rate": 1.8014864837807732e-06, "loss": 2.0582, "step": 9620 }, { "epoch": 0.2837977296181631, "grad_norm": 15.4375, "learning_rate": 1.8011786240161025e-06, "loss": 2.1897, "step": 9625 }, { "epoch": 0.2839451570101725, "grad_norm": 14.9375, "learning_rate": 1.8008705520664388e-06, "loss": 2.1527, "step": 9630 }, { "epoch": 0.28409258440218194, "grad_norm": 17.25, "learning_rate": 1.8005622680133722e-06, "loss": 2.1402, "step": 9635 }, { "epoch": 0.28424001179419134, "grad_norm": 15.6875, "learning_rate": 1.8002537719385492e-06, "loss": 2.2136, "step": 9640 }, { "epoch": 0.2843874391862008, "grad_norm": 15.375, "learning_rate": 1.7999450639236716e-06, "loss": 2.2145, "step": 9645 }, { "epoch": 0.28453486657821025, "grad_norm": 14.0625, "learning_rate": 1.799636144050499e-06, "loss": 2.2849, "step": 9650 }, { "epoch": 0.28468229397021966, "grad_norm": 17.25, "learning_rate": 1.7993270124008454e-06, "loss": 2.1804, "step": 9655 }, { "epoch": 0.2848297213622291, "grad_norm": 14.0625, "learning_rate": 1.7990176690565815e-06, "loss": 2.1777, "step": 9660 }, { "epoch": 0.2849771487542385, "grad_norm": 14.5625, "learning_rate": 1.798708114099634e-06, "loss": 2.1158, "step": 9665 }, { "epoch": 0.285124576146248, "grad_norm": 14.0, "learning_rate": 1.7983983476119864e-06, "loss": 2.1636, "step": 9670 }, { "epoch": 0.28527200353825743, "grad_norm": 13.6875, "learning_rate": 1.798088369675677e-06, "loss": 2.1837, "step": 9675 }, { "epoch": 0.28541943093026684, "grad_norm": 15.25, "learning_rate": 1.797778180372801e-06, "loss": 2.2862, "step": 9680 }, { "epoch": 0.2855668583222763, "grad_norm": 17.0, "learning_rate": 1.7974677797855092e-06, "loss": 2.2452, "step": 9685 }, { "epoch": 0.2857142857142857, "grad_norm": 15.875, "learning_rate": 1.7971571679960081e-06, "loss": 2.1044, "step": 9690 }, { "epoch": 0.28586171310629516, "grad_norm": 13.5625, "learning_rate": 1.7968463450865608e-06, "loss": 2.1527, "step": 9695 }, { "epoch": 0.28600914049830456, "grad_norm": 16.125, "learning_rate": 1.796535311139486e-06, "loss": 2.1591, "step": 9700 }, { "epoch": 0.286156567890314, "grad_norm": 12.6875, "learning_rate": 1.796224066237158e-06, "loss": 2.2229, "step": 9705 }, { "epoch": 0.2863039952823235, "grad_norm": 16.125, "learning_rate": 1.7959126104620074e-06, "loss": 2.2106, "step": 9710 }, { "epoch": 0.2864514226743329, "grad_norm": 16.625, "learning_rate": 1.7956009438965204e-06, "loss": 2.1492, "step": 9715 }, { "epoch": 0.28659885006634234, "grad_norm": 13.9375, "learning_rate": 1.7952890666232391e-06, "loss": 2.1762, "step": 9720 }, { "epoch": 0.28674627745835174, "grad_norm": 15.0, "learning_rate": 1.7949769787247617e-06, "loss": 2.1311, "step": 9725 }, { "epoch": 0.2868937048503612, "grad_norm": 14.625, "learning_rate": 1.7946646802837416e-06, "loss": 2.0461, "step": 9730 }, { "epoch": 0.28704113224237066, "grad_norm": 18.5, "learning_rate": 1.7943521713828883e-06, "loss": 2.113, "step": 9735 }, { "epoch": 0.28718855963438006, "grad_norm": 17.25, "learning_rate": 1.7940394521049667e-06, "loss": 2.2751, "step": 9740 }, { "epoch": 0.2873359870263895, "grad_norm": 13.25, "learning_rate": 1.7937265225327983e-06, "loss": 2.1439, "step": 9745 }, { "epoch": 0.2874834144183989, "grad_norm": 18.125, "learning_rate": 1.793413382749259e-06, "loss": 2.2289, "step": 9750 }, { "epoch": 0.2876308418104084, "grad_norm": 15.4375, "learning_rate": 1.7931000328372818e-06, "loss": 2.3197, "step": 9755 }, { "epoch": 0.28777826920241784, "grad_norm": 15.0, "learning_rate": 1.7927864728798543e-06, "loss": 2.1508, "step": 9760 }, { "epoch": 0.28792569659442724, "grad_norm": 12.0, "learning_rate": 1.7924727029600198e-06, "loss": 2.1825, "step": 9765 }, { "epoch": 0.2880731239864367, "grad_norm": 13.125, "learning_rate": 1.7921587231608777e-06, "loss": 2.1113, "step": 9770 }, { "epoch": 0.2882205513784461, "grad_norm": 13.9375, "learning_rate": 1.791844533565583e-06, "loss": 2.1209, "step": 9775 }, { "epoch": 0.28836797877045556, "grad_norm": 13.6875, "learning_rate": 1.7915301342573455e-06, "loss": 2.2531, "step": 9780 }, { "epoch": 0.28851540616246496, "grad_norm": 13.25, "learning_rate": 1.7912155253194311e-06, "loss": 2.1772, "step": 9785 }, { "epoch": 0.2886628335544744, "grad_norm": 13.375, "learning_rate": 1.7909007068351617e-06, "loss": 1.9749, "step": 9790 }, { "epoch": 0.2888102609464839, "grad_norm": 14.6875, "learning_rate": 1.7905856788879135e-06, "loss": 2.2297, "step": 9795 }, { "epoch": 0.2889576883384933, "grad_norm": 21.125, "learning_rate": 1.7902704415611194e-06, "loss": 2.1577, "step": 9800 }, { "epoch": 0.28910511573050274, "grad_norm": 15.4375, "learning_rate": 1.7899549949382667e-06, "loss": 2.1224, "step": 9805 }, { "epoch": 0.28925254312251214, "grad_norm": 15.375, "learning_rate": 1.789639339102899e-06, "loss": 2.1734, "step": 9810 }, { "epoch": 0.2893999705145216, "grad_norm": 14.5625, "learning_rate": 1.7893234741386148e-06, "loss": 1.9936, "step": 9815 }, { "epoch": 0.28954739790653106, "grad_norm": 15.625, "learning_rate": 1.7890074001290678e-06, "loss": 2.1629, "step": 9820 }, { "epoch": 0.28969482529854046, "grad_norm": 15.125, "learning_rate": 1.7886911171579678e-06, "loss": 2.1329, "step": 9825 }, { "epoch": 0.2898422526905499, "grad_norm": 13.5625, "learning_rate": 1.7883746253090792e-06, "loss": 2.2002, "step": 9830 }, { "epoch": 0.2899896800825593, "grad_norm": 18.5, "learning_rate": 1.7880579246662225e-06, "loss": 2.1516, "step": 9835 }, { "epoch": 0.2901371074745688, "grad_norm": 17.0, "learning_rate": 1.7877410153132727e-06, "loss": 2.3072, "step": 9840 }, { "epoch": 0.29028453486657824, "grad_norm": 15.4375, "learning_rate": 1.7874238973341602e-06, "loss": 2.094, "step": 9845 }, { "epoch": 0.29043196225858764, "grad_norm": 14.0, "learning_rate": 1.7871065708128712e-06, "loss": 2.1111, "step": 9850 }, { "epoch": 0.2905793896505971, "grad_norm": 15.625, "learning_rate": 1.7867890358334465e-06, "loss": 2.2635, "step": 9855 }, { "epoch": 0.2907268170426065, "grad_norm": 17.625, "learning_rate": 1.7864712924799829e-06, "loss": 2.0714, "step": 9860 }, { "epoch": 0.29087424443461596, "grad_norm": 14.875, "learning_rate": 1.7861533408366315e-06, "loss": 2.1918, "step": 9865 }, { "epoch": 0.29102167182662536, "grad_norm": 15.125, "learning_rate": 1.7858351809875992e-06, "loss": 2.1891, "step": 9870 }, { "epoch": 0.2911690992186348, "grad_norm": 13.5625, "learning_rate": 1.7855168130171471e-06, "loss": 2.2214, "step": 9875 }, { "epoch": 0.2913165266106443, "grad_norm": 13.875, "learning_rate": 1.785198237009593e-06, "loss": 2.1657, "step": 9880 }, { "epoch": 0.2914639540026537, "grad_norm": 15.3125, "learning_rate": 1.7848794530493083e-06, "loss": 2.2377, "step": 9885 }, { "epoch": 0.29161138139466314, "grad_norm": 14.3125, "learning_rate": 1.7845604612207206e-06, "loss": 2.1852, "step": 9890 }, { "epoch": 0.29175880878667254, "grad_norm": 15.0625, "learning_rate": 1.7842412616083116e-06, "loss": 2.2394, "step": 9895 }, { "epoch": 0.291906236178682, "grad_norm": 16.5, "learning_rate": 1.7839218542966185e-06, "loss": 2.2163, "step": 9900 }, { "epoch": 0.29205366357069146, "grad_norm": 14.0625, "learning_rate": 1.7836022393702334e-06, "loss": 2.1405, "step": 9905 }, { "epoch": 0.29220109096270086, "grad_norm": 18.625, "learning_rate": 1.783282416913804e-06, "loss": 2.2301, "step": 9910 }, { "epoch": 0.2923485183547103, "grad_norm": 12.5625, "learning_rate": 1.782962387012032e-06, "loss": 2.0984, "step": 9915 }, { "epoch": 0.2924959457467197, "grad_norm": 14.0, "learning_rate": 1.7826421497496743e-06, "loss": 2.1405, "step": 9920 }, { "epoch": 0.2926433731387292, "grad_norm": 15.5, "learning_rate": 1.7823217052115434e-06, "loss": 2.2241, "step": 9925 }, { "epoch": 0.29279080053073864, "grad_norm": 15.4375, "learning_rate": 1.7820010534825057e-06, "loss": 2.1421, "step": 9930 }, { "epoch": 0.29293822792274804, "grad_norm": 16.625, "learning_rate": 1.7816801946474832e-06, "loss": 2.1865, "step": 9935 }, { "epoch": 0.2930856553147575, "grad_norm": 11.625, "learning_rate": 1.781359128791452e-06, "loss": 2.1026, "step": 9940 }, { "epoch": 0.2932330827067669, "grad_norm": 12.9375, "learning_rate": 1.7810378559994442e-06, "loss": 2.1224, "step": 9945 }, { "epoch": 0.29338051009877636, "grad_norm": 13.8125, "learning_rate": 1.7807163763565457e-06, "loss": 2.1026, "step": 9950 }, { "epoch": 0.29352793749078576, "grad_norm": 17.75, "learning_rate": 1.7803946899478972e-06, "loss": 2.1495, "step": 9955 }, { "epoch": 0.2936753648827952, "grad_norm": 14.5, "learning_rate": 1.7800727968586952e-06, "loss": 2.2219, "step": 9960 }, { "epoch": 0.2938227922748047, "grad_norm": 14.5, "learning_rate": 1.7797506971741899e-06, "loss": 2.1837, "step": 9965 }, { "epoch": 0.2939702196668141, "grad_norm": 12.1875, "learning_rate": 1.779428390979686e-06, "loss": 2.0926, "step": 9970 }, { "epoch": 0.29411764705882354, "grad_norm": 24.625, "learning_rate": 1.7791058783605442e-06, "loss": 2.192, "step": 9975 }, { "epoch": 0.29426507445083294, "grad_norm": 13.75, "learning_rate": 1.7787831594021787e-06, "loss": 2.2558, "step": 9980 }, { "epoch": 0.2944125018428424, "grad_norm": 15.125, "learning_rate": 1.7784602341900585e-06, "loss": 2.2184, "step": 9985 }, { "epoch": 0.29455992923485186, "grad_norm": 26.125, "learning_rate": 1.7781371028097079e-06, "loss": 2.2509, "step": 9990 }, { "epoch": 0.29470735662686126, "grad_norm": 14.9375, "learning_rate": 1.7778137653467052e-06, "loss": 2.2607, "step": 9995 }, { "epoch": 0.2948547840188707, "grad_norm": 13.875, "learning_rate": 1.7774902218866833e-06, "loss": 2.1507, "step": 10000 }, { "epoch": 0.2948547840188707, "eval_loss": 2.1286072731018066, "eval_runtime": 4.7121, "eval_samples_per_second": 84.038, "eval_steps_per_second": 2.759, "step": 10000 } ], "logging_steps": 5, "max_steps": 33915, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7972623782104793e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }