{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999783273010988, "eval_steps": 3000, "global_step": 34605, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014448465934129443, "grad_norm": 23.53125, "learning_rate": 2.2575809568531125e-10, "loss": 18.2618, "step": 50 }, { "epoch": 0.0028896931868258886, "grad_norm": 23.046875, "learning_rate": 4.515161913706225e-10, "loss": 18.3279, "step": 100 }, { "epoch": 0.004334539780238833, "grad_norm": 23.703125, "learning_rate": 6.772742870559338e-10, "loss": 18.2094, "step": 150 }, { "epoch": 0.005779386373651777, "grad_norm": 21.046875, "learning_rate": 9.03032382741245e-10, "loss": 18.0812, "step": 200 }, { "epoch": 0.007224232967064722, "grad_norm": 21.34375, "learning_rate": 1.1287904784265563e-09, "loss": 18.0178, "step": 250 }, { "epoch": 0.008669079560477666, "grad_norm": 23.390625, "learning_rate": 1.3545485741118676e-09, "loss": 18.0735, "step": 300 }, { "epoch": 0.01011392615389061, "grad_norm": 22.078125, "learning_rate": 1.5803066697971788e-09, "loss": 18.0169, "step": 350 }, { "epoch": 0.011558772747303554, "grad_norm": 19.1875, "learning_rate": 1.80606476548249e-09, "loss": 17.9299, "step": 400 }, { "epoch": 0.0130036193407165, "grad_norm": 22.75, "learning_rate": 2.0318228611678016e-09, "loss": 17.7586, "step": 450 }, { "epoch": 0.014448465934129445, "grad_norm": 20.984375, "learning_rate": 2.2575809568531127e-09, "loss": 17.7956, "step": 500 }, { "epoch": 0.015893312527542388, "grad_norm": 22.453125, "learning_rate": 2.4833390525384237e-09, "loss": 17.7873, "step": 550 }, { "epoch": 0.01733815912095533, "grad_norm": 19.71875, "learning_rate": 2.709097148223735e-09, "loss": 17.7643, "step": 600 }, { "epoch": 0.018783005714368278, "grad_norm": 20.1875, "learning_rate": 2.9348552439090465e-09, "loss": 17.6861, "step": 650 }, { "epoch": 0.02022785230778122, "grad_norm": 20.8125, "learning_rate": 3.1606133395943576e-09, "loss": 17.6883, "step": 700 }, { "epoch": 0.021672698901194165, "grad_norm": 23.875, "learning_rate": 3.386371435279669e-09, "loss": 17.6176, "step": 750 }, { "epoch": 0.02311754549460711, "grad_norm": 19.96875, "learning_rate": 3.61212953096498e-09, "loss": 17.6334, "step": 800 }, { "epoch": 0.024562392088020055, "grad_norm": 20.734375, "learning_rate": 3.837887626650292e-09, "loss": 17.5943, "step": 850 }, { "epoch": 0.026007238681433, "grad_norm": 21.09375, "learning_rate": 4.063645722335603e-09, "loss": 17.5436, "step": 900 }, { "epoch": 0.027452085274845942, "grad_norm": 19.34375, "learning_rate": 4.289403818020914e-09, "loss": 17.6069, "step": 950 }, { "epoch": 0.02889693186825889, "grad_norm": 23.109375, "learning_rate": 4.515161913706225e-09, "loss": 17.4956, "step": 1000 }, { "epoch": 0.030341778461671833, "grad_norm": 23.53125, "learning_rate": 4.740920009391537e-09, "loss": 17.624, "step": 1050 }, { "epoch": 0.031786625055084776, "grad_norm": 19.40625, "learning_rate": 4.966678105076847e-09, "loss": 17.6303, "step": 1100 }, { "epoch": 0.03323147164849772, "grad_norm": 20.9375, "learning_rate": 5.19243620076216e-09, "loss": 17.5501, "step": 1150 }, { "epoch": 0.03467631824191066, "grad_norm": 18.9375, "learning_rate": 5.41819429644747e-09, "loss": 17.5943, "step": 1200 }, { "epoch": 0.036121164835323606, "grad_norm": 25.625, "learning_rate": 5.643952392132782e-09, "loss": 17.4464, "step": 1250 }, { "epoch": 0.037566011428736557, "grad_norm": 18.203125, "learning_rate": 5.869710487818093e-09, "loss": 17.5614, "step": 1300 }, { "epoch": 0.0390108580221495, "grad_norm": 19.4375, "learning_rate": 6.095468583503404e-09, "loss": 17.3942, "step": 1350 }, { "epoch": 0.04045570461556244, "grad_norm": 20.71875, "learning_rate": 6.321226679188715e-09, "loss": 17.3659, "step": 1400 }, { "epoch": 0.04190055120897539, "grad_norm": 17.828125, "learning_rate": 6.546984774874027e-09, "loss": 17.432, "step": 1450 }, { "epoch": 0.04334539780238833, "grad_norm": 22.234375, "learning_rate": 6.772742870559338e-09, "loss": 17.5557, "step": 1500 }, { "epoch": 0.044790244395801274, "grad_norm": 21.0625, "learning_rate": 6.998500966244649e-09, "loss": 17.4979, "step": 1550 }, { "epoch": 0.04623509098921422, "grad_norm": 18.84375, "learning_rate": 7.22425906192996e-09, "loss": 17.2844, "step": 1600 }, { "epoch": 0.04767993758262717, "grad_norm": 22.40625, "learning_rate": 7.4500171576152714e-09, "loss": 17.3339, "step": 1650 }, { "epoch": 0.04912478417604011, "grad_norm": 19.390625, "learning_rate": 7.675775253300584e-09, "loss": 17.313, "step": 1700 }, { "epoch": 0.050569630769453054, "grad_norm": 19.328125, "learning_rate": 7.901533348985894e-09, "loss": 17.3682, "step": 1750 }, { "epoch": 0.052014477362866, "grad_norm": 22.421875, "learning_rate": 8.127291444671207e-09, "loss": 17.3144, "step": 1800 }, { "epoch": 0.05345932395627894, "grad_norm": 19.4375, "learning_rate": 8.353049540356517e-09, "loss": 17.3066, "step": 1850 }, { "epoch": 0.054904170549691884, "grad_norm": 19.984375, "learning_rate": 8.578807636041828e-09, "loss": 17.4404, "step": 1900 }, { "epoch": 0.05634901714310483, "grad_norm": 19.921875, "learning_rate": 8.804565731727138e-09, "loss": 17.3098, "step": 1950 }, { "epoch": 0.05779386373651778, "grad_norm": 18.578125, "learning_rate": 9.03032382741245e-09, "loss": 17.2569, "step": 2000 }, { "epoch": 0.05923871032993072, "grad_norm": 21.890625, "learning_rate": 9.256081923097763e-09, "loss": 17.3394, "step": 2050 }, { "epoch": 0.060683556923343665, "grad_norm": 25.4375, "learning_rate": 9.481840018783073e-09, "loss": 17.3198, "step": 2100 }, { "epoch": 0.06212840351675661, "grad_norm": 21.171875, "learning_rate": 9.707598114468384e-09, "loss": 17.3099, "step": 2150 }, { "epoch": 0.06357325011016955, "grad_norm": 26.203125, "learning_rate": 9.933356210153695e-09, "loss": 17.2845, "step": 2200 }, { "epoch": 0.0650180967035825, "grad_norm": 21.296875, "learning_rate": 1.0159114305839007e-08, "loss": 17.2587, "step": 2250 }, { "epoch": 0.06646294329699544, "grad_norm": 22.65625, "learning_rate": 1.038487240152432e-08, "loss": 17.3986, "step": 2300 }, { "epoch": 0.06790778989040838, "grad_norm": 23.625, "learning_rate": 1.061063049720963e-08, "loss": 17.3046, "step": 2350 }, { "epoch": 0.06935263648382133, "grad_norm": 21.203125, "learning_rate": 1.083638859289494e-08, "loss": 17.2043, "step": 2400 }, { "epoch": 0.07079748307723427, "grad_norm": 24.640625, "learning_rate": 1.1062146688580251e-08, "loss": 17.2274, "step": 2450 }, { "epoch": 0.07224232967064721, "grad_norm": 20.03125, "learning_rate": 1.1287904784265563e-08, "loss": 17.2168, "step": 2500 }, { "epoch": 0.07368717626406017, "grad_norm": 23.5, "learning_rate": 1.1513662879950874e-08, "loss": 17.1795, "step": 2550 }, { "epoch": 0.07513202285747311, "grad_norm": 19.890625, "learning_rate": 1.1739420975636186e-08, "loss": 17.2713, "step": 2600 }, { "epoch": 0.07657686945088606, "grad_norm": 20.796875, "learning_rate": 1.1965179071321498e-08, "loss": 17.371, "step": 2650 }, { "epoch": 0.078021716044299, "grad_norm": 21.640625, "learning_rate": 1.2190937167006807e-08, "loss": 17.199, "step": 2700 }, { "epoch": 0.07946656263771194, "grad_norm": 24.96875, "learning_rate": 1.241669526269212e-08, "loss": 17.2428, "step": 2750 }, { "epoch": 0.08091140923112489, "grad_norm": 18.640625, "learning_rate": 1.264245335837743e-08, "loss": 17.2682, "step": 2800 }, { "epoch": 0.08235625582453783, "grad_norm": 22.984375, "learning_rate": 1.2868211454062742e-08, "loss": 17.1985, "step": 2850 }, { "epoch": 0.08380110241795077, "grad_norm": 52.03125, "learning_rate": 1.3093969549748055e-08, "loss": 17.2148, "step": 2900 }, { "epoch": 0.08524594901136372, "grad_norm": 20.71875, "learning_rate": 1.3319727645433364e-08, "loss": 17.1465, "step": 2950 }, { "epoch": 0.08669079560477666, "grad_norm": 21.015625, "learning_rate": 1.3545485741118676e-08, "loss": 17.1231, "step": 3000 }, { "epoch": 0.08669079560477666, "eval_loss": 2.1503310203552246, "eval_runtime": 340.0537, "eval_samples_per_second": 2742.314, "eval_steps_per_second": 42.849, "step": 3000 }, { "epoch": 0.0881356421981896, "grad_norm": 19.109375, "learning_rate": 1.3771243836803987e-08, "loss": 17.2298, "step": 3050 }, { "epoch": 0.08958048879160255, "grad_norm": 20.453125, "learning_rate": 1.3997001932489299e-08, "loss": 17.0646, "step": 3100 }, { "epoch": 0.09102533538501549, "grad_norm": 22.234375, "learning_rate": 1.4222760028174611e-08, "loss": 17.2016, "step": 3150 }, { "epoch": 0.09247018197842843, "grad_norm": 18.921875, "learning_rate": 1.444851812385992e-08, "loss": 17.1826, "step": 3200 }, { "epoch": 0.09391502857184139, "grad_norm": 18.0, "learning_rate": 1.4674276219545232e-08, "loss": 17.1939, "step": 3250 }, { "epoch": 0.09535987516525433, "grad_norm": 19.859375, "learning_rate": 1.4900034315230543e-08, "loss": 17.104, "step": 3300 }, { "epoch": 0.09680472175866728, "grad_norm": 25.421875, "learning_rate": 1.5125792410915855e-08, "loss": 17.1392, "step": 3350 }, { "epoch": 0.09824956835208022, "grad_norm": 23.34375, "learning_rate": 1.5351550506601167e-08, "loss": 17.2487, "step": 3400 }, { "epoch": 0.09969441494549316, "grad_norm": 21.78125, "learning_rate": 1.5577308602286476e-08, "loss": 17.1119, "step": 3450 }, { "epoch": 0.10113926153890611, "grad_norm": 21.125, "learning_rate": 1.580306669797179e-08, "loss": 17.2458, "step": 3500 }, { "epoch": 0.10258410813231905, "grad_norm": 20.96875, "learning_rate": 1.6028824793657098e-08, "loss": 17.1168, "step": 3550 }, { "epoch": 0.104028954725732, "grad_norm": 23.03125, "learning_rate": 1.6254582889342413e-08, "loss": 17.1953, "step": 3600 }, { "epoch": 0.10547380131914494, "grad_norm": 20.828125, "learning_rate": 1.6480340985027722e-08, "loss": 17.1476, "step": 3650 }, { "epoch": 0.10691864791255788, "grad_norm": 20.09375, "learning_rate": 1.6706099080713034e-08, "loss": 17.1084, "step": 3700 }, { "epoch": 0.10836349450597083, "grad_norm": 21.5, "learning_rate": 1.6931857176398343e-08, "loss": 17.3216, "step": 3750 }, { "epoch": 0.10980834109938377, "grad_norm": 19.453125, "learning_rate": 1.7157615272083656e-08, "loss": 17.1359, "step": 3800 }, { "epoch": 0.11125318769279671, "grad_norm": 19.640625, "learning_rate": 1.7383373367768968e-08, "loss": 17.1561, "step": 3850 }, { "epoch": 0.11269803428620966, "grad_norm": 21.0, "learning_rate": 1.7609131463454277e-08, "loss": 17.0558, "step": 3900 }, { "epoch": 0.11414288087962261, "grad_norm": 22.21875, "learning_rate": 1.7834889559139592e-08, "loss": 17.149, "step": 3950 }, { "epoch": 0.11558772747303556, "grad_norm": 23.109375, "learning_rate": 1.80606476548249e-08, "loss": 17.0748, "step": 4000 }, { "epoch": 0.1170325740664485, "grad_norm": 19.265625, "learning_rate": 1.828640575051021e-08, "loss": 17.1038, "step": 4050 }, { "epoch": 0.11847742065986144, "grad_norm": 22.34375, "learning_rate": 1.8512163846195526e-08, "loss": 17.0105, "step": 4100 }, { "epoch": 0.11992226725327439, "grad_norm": 20.25, "learning_rate": 1.8737921941880835e-08, "loss": 17.0041, "step": 4150 }, { "epoch": 0.12136711384668733, "grad_norm": 23.484375, "learning_rate": 1.8963680037566147e-08, "loss": 17.1879, "step": 4200 }, { "epoch": 0.12281196044010027, "grad_norm": 23.859375, "learning_rate": 1.9189438133251456e-08, "loss": 17.0091, "step": 4250 }, { "epoch": 0.12425680703351322, "grad_norm": 23.609375, "learning_rate": 1.9415196228936768e-08, "loss": 16.9834, "step": 4300 }, { "epoch": 0.12570165362692617, "grad_norm": 20.171875, "learning_rate": 1.964095432462208e-08, "loss": 16.9839, "step": 4350 }, { "epoch": 0.1271465002203391, "grad_norm": 20.75, "learning_rate": 1.986671242030739e-08, "loss": 16.9732, "step": 4400 }, { "epoch": 0.12859134681375206, "grad_norm": 21.078125, "learning_rate": 2.0092470515992705e-08, "loss": 17.0603, "step": 4450 }, { "epoch": 0.130036193407165, "grad_norm": 19.796875, "learning_rate": 2.0318228611678014e-08, "loss": 17.0293, "step": 4500 }, { "epoch": 0.13148104000057795, "grad_norm": 21.53125, "learning_rate": 2.0543986707363323e-08, "loss": 17.0322, "step": 4550 }, { "epoch": 0.13292588659399088, "grad_norm": 23.171875, "learning_rate": 2.076974480304864e-08, "loss": 17.0431, "step": 4600 }, { "epoch": 0.13437073318740383, "grad_norm": 20.203125, "learning_rate": 2.0995502898733947e-08, "loss": 17.0135, "step": 4650 }, { "epoch": 0.13581557978081676, "grad_norm": 20.1875, "learning_rate": 2.122126099441926e-08, "loss": 16.905, "step": 4700 }, { "epoch": 0.13726042637422972, "grad_norm": 20.703125, "learning_rate": 2.144701909010457e-08, "loss": 17.082, "step": 4750 }, { "epoch": 0.13870527296764265, "grad_norm": 21.265625, "learning_rate": 2.167277718578988e-08, "loss": 16.9678, "step": 4800 }, { "epoch": 0.1401501195610556, "grad_norm": 20.640625, "learning_rate": 2.1898535281475193e-08, "loss": 17.0, "step": 4850 }, { "epoch": 0.14159496615446854, "grad_norm": 20.390625, "learning_rate": 2.2124293377160502e-08, "loss": 16.9941, "step": 4900 }, { "epoch": 0.1430398127478815, "grad_norm": 21.15625, "learning_rate": 2.2350051472845818e-08, "loss": 17.0649, "step": 4950 }, { "epoch": 0.14448465934129442, "grad_norm": 19.21875, "learning_rate": 2.2575809568531127e-08, "loss": 17.0574, "step": 5000 }, { "epoch": 0.14592950593470738, "grad_norm": 19.59375, "learning_rate": 2.2801567664216436e-08, "loss": 17.1142, "step": 5050 }, { "epoch": 0.14737435252812034, "grad_norm": 20.828125, "learning_rate": 2.3027325759901748e-08, "loss": 16.957, "step": 5100 }, { "epoch": 0.14881919912153327, "grad_norm": 21.125, "learning_rate": 2.325308385558706e-08, "loss": 17.1079, "step": 5150 }, { "epoch": 0.15026404571494623, "grad_norm": 21.890625, "learning_rate": 2.3478841951272372e-08, "loss": 17.0049, "step": 5200 }, { "epoch": 0.15170889230835916, "grad_norm": 22.828125, "learning_rate": 2.370460004695768e-08, "loss": 17.0316, "step": 5250 }, { "epoch": 0.1531537389017721, "grad_norm": 22.703125, "learning_rate": 2.3930358142642997e-08, "loss": 17.0376, "step": 5300 }, { "epoch": 0.15459858549518504, "grad_norm": 20.90625, "learning_rate": 2.4156116238328306e-08, "loss": 17.1354, "step": 5350 }, { "epoch": 0.156043432088598, "grad_norm": 22.609375, "learning_rate": 2.4381874334013615e-08, "loss": 16.9243, "step": 5400 }, { "epoch": 0.15748827868201093, "grad_norm": 20.859375, "learning_rate": 2.460763242969893e-08, "loss": 17.0623, "step": 5450 }, { "epoch": 0.1589331252754239, "grad_norm": 22.15625, "learning_rate": 2.483339052538424e-08, "loss": 17.0041, "step": 5500 }, { "epoch": 0.16037797186883682, "grad_norm": 23.078125, "learning_rate": 2.505914862106955e-08, "loss": 16.945, "step": 5550 }, { "epoch": 0.16182281846224977, "grad_norm": 20.5, "learning_rate": 2.528490671675486e-08, "loss": 16.886, "step": 5600 }, { "epoch": 0.1632676650556627, "grad_norm": 19.859375, "learning_rate": 2.5510664812440173e-08, "loss": 16.8898, "step": 5650 }, { "epoch": 0.16471251164907566, "grad_norm": 22.875, "learning_rate": 2.5736422908125485e-08, "loss": 17.0453, "step": 5700 }, { "epoch": 0.16615735824248862, "grad_norm": 21.015625, "learning_rate": 2.5962181003810794e-08, "loss": 17.01, "step": 5750 }, { "epoch": 0.16760220483590155, "grad_norm": 24.078125, "learning_rate": 2.618793909949611e-08, "loss": 17.0377, "step": 5800 }, { "epoch": 0.1690470514293145, "grad_norm": 19.53125, "learning_rate": 2.641369719518142e-08, "loss": 16.8983, "step": 5850 }, { "epoch": 0.17049189802272743, "grad_norm": 19.234375, "learning_rate": 2.6639455290866727e-08, "loss": 17.0153, "step": 5900 }, { "epoch": 0.1719367446161404, "grad_norm": 21.40625, "learning_rate": 2.6865213386552043e-08, "loss": 16.9406, "step": 5950 }, { "epoch": 0.17338159120955332, "grad_norm": 21.703125, "learning_rate": 2.7090971482237352e-08, "loss": 16.9213, "step": 6000 }, { "epoch": 0.17338159120955332, "eval_loss": 2.117004871368408, "eval_runtime": 351.2723, "eval_samples_per_second": 2654.733, "eval_steps_per_second": 41.481, "step": 6000 }, { "epoch": 0.17482643780296628, "grad_norm": 23.40625, "learning_rate": 2.7316729577922664e-08, "loss": 16.9717, "step": 6050 }, { "epoch": 0.1762712843963792, "grad_norm": 21.09375, "learning_rate": 2.7542487673607973e-08, "loss": 17.0021, "step": 6100 }, { "epoch": 0.17771613098979216, "grad_norm": 20.078125, "learning_rate": 2.7768245769293285e-08, "loss": 16.9975, "step": 6150 }, { "epoch": 0.1791609775832051, "grad_norm": 20.46875, "learning_rate": 2.7994003864978598e-08, "loss": 16.9298, "step": 6200 }, { "epoch": 0.18060582417661805, "grad_norm": 21.453125, "learning_rate": 2.8219761960663907e-08, "loss": 16.8852, "step": 6250 }, { "epoch": 0.18205067077003098, "grad_norm": 20.90625, "learning_rate": 2.8445520056349222e-08, "loss": 17.0017, "step": 6300 }, { "epoch": 0.18349551736344394, "grad_norm": 23.765625, "learning_rate": 2.867127815203453e-08, "loss": 16.9424, "step": 6350 }, { "epoch": 0.18494036395685687, "grad_norm": 21.328125, "learning_rate": 2.889703624771984e-08, "loss": 16.9559, "step": 6400 }, { "epoch": 0.18638521055026983, "grad_norm": 21.8125, "learning_rate": 2.9122794343405156e-08, "loss": 16.9595, "step": 6450 }, { "epoch": 0.18783005714368278, "grad_norm": 22.53125, "learning_rate": 2.9348552439090465e-08, "loss": 16.8973, "step": 6500 }, { "epoch": 0.1892749037370957, "grad_norm": 21.3125, "learning_rate": 2.9574310534775777e-08, "loss": 16.9388, "step": 6550 }, { "epoch": 0.19071975033050867, "grad_norm": 25.25, "learning_rate": 2.9800068630461086e-08, "loss": 16.8936, "step": 6600 }, { "epoch": 0.1921645969239216, "grad_norm": 21.921875, "learning_rate": 3.0025826726146395e-08, "loss": 16.9861, "step": 6650 }, { "epoch": 0.19360944351733456, "grad_norm": 22.34375, "learning_rate": 3.025158482183171e-08, "loss": 16.7772, "step": 6700 }, { "epoch": 0.19505429011074749, "grad_norm": 21.921875, "learning_rate": 3.047734291751702e-08, "loss": 16.8555, "step": 6750 }, { "epoch": 0.19649913670416044, "grad_norm": 25.28125, "learning_rate": 3.0703101013202335e-08, "loss": 16.7765, "step": 6800 }, { "epoch": 0.19794398329757337, "grad_norm": 21.625, "learning_rate": 3.0928859108887644e-08, "loss": 16.868, "step": 6850 }, { "epoch": 0.19938882989098633, "grad_norm": 25.046875, "learning_rate": 3.115461720457295e-08, "loss": 16.9386, "step": 6900 }, { "epoch": 0.20083367648439926, "grad_norm": 23.671875, "learning_rate": 3.138037530025826e-08, "loss": 16.9085, "step": 6950 }, { "epoch": 0.20227852307781222, "grad_norm": 22.234375, "learning_rate": 3.160613339594358e-08, "loss": 16.8305, "step": 7000 }, { "epoch": 0.20372336967122515, "grad_norm": 18.609375, "learning_rate": 3.183189149162889e-08, "loss": 16.7971, "step": 7050 }, { "epoch": 0.2051682162646381, "grad_norm": 21.78125, "learning_rate": 3.2057649587314195e-08, "loss": 16.8924, "step": 7100 }, { "epoch": 0.20661306285805103, "grad_norm": 19.921875, "learning_rate": 3.228340768299951e-08, "loss": 16.9471, "step": 7150 }, { "epoch": 0.208057909451464, "grad_norm": 20.75, "learning_rate": 3.2509165778684826e-08, "loss": 16.8681, "step": 7200 }, { "epoch": 0.20950275604487695, "grad_norm": 20.859375, "learning_rate": 3.2734923874370135e-08, "loss": 16.7762, "step": 7250 }, { "epoch": 0.21094760263828988, "grad_norm": 19.671875, "learning_rate": 3.2960681970055444e-08, "loss": 16.8033, "step": 7300 }, { "epoch": 0.21239244923170283, "grad_norm": 22.171875, "learning_rate": 3.318644006574075e-08, "loss": 16.9011, "step": 7350 }, { "epoch": 0.21383729582511576, "grad_norm": 20.09375, "learning_rate": 3.341219816142607e-08, "loss": 16.7137, "step": 7400 }, { "epoch": 0.21528214241852872, "grad_norm": 22.828125, "learning_rate": 3.363795625711138e-08, "loss": 16.8546, "step": 7450 }, { "epoch": 0.21672698901194165, "grad_norm": 22.640625, "learning_rate": 3.3863714352796687e-08, "loss": 16.9191, "step": 7500 }, { "epoch": 0.2181718356053546, "grad_norm": 28.5, "learning_rate": 3.4089472448482e-08, "loss": 16.8509, "step": 7550 }, { "epoch": 0.21961668219876754, "grad_norm": 20.484375, "learning_rate": 3.431523054416731e-08, "loss": 16.8247, "step": 7600 }, { "epoch": 0.2210615287921805, "grad_norm": 20.5625, "learning_rate": 3.454098863985262e-08, "loss": 16.7724, "step": 7650 }, { "epoch": 0.22250637538559342, "grad_norm": 22.0625, "learning_rate": 3.4766746735537936e-08, "loss": 16.7584, "step": 7700 }, { "epoch": 0.22395122197900638, "grad_norm": 20.375, "learning_rate": 3.499250483122325e-08, "loss": 16.7699, "step": 7750 }, { "epoch": 0.2253960685724193, "grad_norm": 20.65625, "learning_rate": 3.5218262926908553e-08, "loss": 16.8213, "step": 7800 }, { "epoch": 0.22684091516583227, "grad_norm": 22.171875, "learning_rate": 3.544402102259387e-08, "loss": 16.9104, "step": 7850 }, { "epoch": 0.22828576175924523, "grad_norm": 22.109375, "learning_rate": 3.5669779118279185e-08, "loss": 16.7777, "step": 7900 }, { "epoch": 0.22973060835265816, "grad_norm": 22.109375, "learning_rate": 3.589553721396449e-08, "loss": 16.7741, "step": 7950 }, { "epoch": 0.2311754549460711, "grad_norm": 20.90625, "learning_rate": 3.61212953096498e-08, "loss": 16.7486, "step": 8000 }, { "epoch": 0.23262030153948404, "grad_norm": 21.265625, "learning_rate": 3.634705340533512e-08, "loss": 16.7256, "step": 8050 }, { "epoch": 0.234065148132897, "grad_norm": 23.40625, "learning_rate": 3.657281150102042e-08, "loss": 16.7431, "step": 8100 }, { "epoch": 0.23550999472630993, "grad_norm": 21.0625, "learning_rate": 3.6798569596705736e-08, "loss": 16.777, "step": 8150 }, { "epoch": 0.2369548413197229, "grad_norm": 23.125, "learning_rate": 3.702432769239105e-08, "loss": 16.6821, "step": 8200 }, { "epoch": 0.23839968791313582, "grad_norm": 19.203125, "learning_rate": 3.725008578807636e-08, "loss": 16.7046, "step": 8250 }, { "epoch": 0.23984453450654877, "grad_norm": 21.015625, "learning_rate": 3.747584388376167e-08, "loss": 16.7624, "step": 8300 }, { "epoch": 0.2412893810999617, "grad_norm": 20.90625, "learning_rate": 3.770160197944698e-08, "loss": 16.801, "step": 8350 }, { "epoch": 0.24273422769337466, "grad_norm": 22.703125, "learning_rate": 3.7927360075132294e-08, "loss": 16.7837, "step": 8400 }, { "epoch": 0.2441790742867876, "grad_norm": 23.5, "learning_rate": 3.81531181708176e-08, "loss": 16.7856, "step": 8450 }, { "epoch": 0.24562392088020055, "grad_norm": 19.84375, "learning_rate": 3.837887626650291e-08, "loss": 16.7071, "step": 8500 }, { "epoch": 0.24706876747361348, "grad_norm": 21.78125, "learning_rate": 3.860463436218823e-08, "loss": 16.7162, "step": 8550 }, { "epoch": 0.24851361406702643, "grad_norm": 21.109375, "learning_rate": 3.8830392457873536e-08, "loss": 16.7331, "step": 8600 }, { "epoch": 0.2499584606604394, "grad_norm": 21.875, "learning_rate": 3.9056150553558845e-08, "loss": 16.7945, "step": 8650 }, { "epoch": 0.25140330725385235, "grad_norm": 21.1875, "learning_rate": 3.928190864924416e-08, "loss": 16.7675, "step": 8700 }, { "epoch": 0.2528481538472653, "grad_norm": 21.375, "learning_rate": 3.9507666744929476e-08, "loss": 16.7202, "step": 8750 }, { "epoch": 0.2542930004406782, "grad_norm": 21.25, "learning_rate": 3.973342484061478e-08, "loss": 16.6576, "step": 8800 }, { "epoch": 0.25573784703409114, "grad_norm": 20.84375, "learning_rate": 3.9959182936300094e-08, "loss": 16.6249, "step": 8850 }, { "epoch": 0.2571826936275041, "grad_norm": 20.703125, "learning_rate": 4.018494103198541e-08, "loss": 16.7702, "step": 8900 }, { "epoch": 0.25862754022091705, "grad_norm": 23.203125, "learning_rate": 4.041069912767071e-08, "loss": 16.687, "step": 8950 }, { "epoch": 0.26007238681433, "grad_norm": 24.96875, "learning_rate": 4.063645722335603e-08, "loss": 16.6843, "step": 9000 }, { "epoch": 0.26007238681433, "eval_loss": 2.087214946746826, "eval_runtime": 348.1727, "eval_samples_per_second": 2678.367, "eval_steps_per_second": 41.85, "step": 9000 }, { "epoch": 0.2615172334077429, "grad_norm": 22.578125, "learning_rate": 4.0862215319041343e-08, "loss": 16.6685, "step": 9050 }, { "epoch": 0.2629620800011559, "grad_norm": 21.125, "learning_rate": 4.1087973414726646e-08, "loss": 16.8334, "step": 9100 }, { "epoch": 0.2644069265945688, "grad_norm": 23.84375, "learning_rate": 4.131373151041196e-08, "loss": 16.5566, "step": 9150 }, { "epoch": 0.26585177318798175, "grad_norm": 18.640625, "learning_rate": 4.153948960609728e-08, "loss": 16.7514, "step": 9200 }, { "epoch": 0.2672966197813947, "grad_norm": 22.53125, "learning_rate": 4.1765247701782586e-08, "loss": 16.688, "step": 9250 }, { "epoch": 0.26874146637480767, "grad_norm": 23.46875, "learning_rate": 4.1991005797467895e-08, "loss": 16.6654, "step": 9300 }, { "epoch": 0.2701863129682206, "grad_norm": 20.984375, "learning_rate": 4.2216763893153204e-08, "loss": 16.7596, "step": 9350 }, { "epoch": 0.27163115956163353, "grad_norm": 21.9375, "learning_rate": 4.244252198883852e-08, "loss": 16.6031, "step": 9400 }, { "epoch": 0.2730760061550465, "grad_norm": 20.703125, "learning_rate": 4.266828008452383e-08, "loss": 16.702, "step": 9450 }, { "epoch": 0.27452085274845944, "grad_norm": 22.328125, "learning_rate": 4.289403818020914e-08, "loss": 16.6338, "step": 9500 }, { "epoch": 0.2759656993418724, "grad_norm": 20.0, "learning_rate": 4.311979627589445e-08, "loss": 16.7581, "step": 9550 }, { "epoch": 0.2774105459352853, "grad_norm": 20.46875, "learning_rate": 4.334555437157976e-08, "loss": 16.7627, "step": 9600 }, { "epoch": 0.2788553925286983, "grad_norm": 21.90625, "learning_rate": 4.357131246726507e-08, "loss": 16.7124, "step": 9650 }, { "epoch": 0.2803002391221112, "grad_norm": 22.46875, "learning_rate": 4.3797070562950386e-08, "loss": 16.7091, "step": 9700 }, { "epoch": 0.28174508571552415, "grad_norm": 23.078125, "learning_rate": 4.40228286586357e-08, "loss": 16.6876, "step": 9750 }, { "epoch": 0.2831899323089371, "grad_norm": 19.53125, "learning_rate": 4.4248586754321004e-08, "loss": 16.5888, "step": 9800 }, { "epoch": 0.28463477890235006, "grad_norm": 22.921875, "learning_rate": 4.447434485000632e-08, "loss": 16.7229, "step": 9850 }, { "epoch": 0.286079625495763, "grad_norm": 21.265625, "learning_rate": 4.4700102945691635e-08, "loss": 16.6842, "step": 9900 }, { "epoch": 0.2875244720891759, "grad_norm": 20.5625, "learning_rate": 4.492586104137694e-08, "loss": 16.614, "step": 9950 }, { "epoch": 0.28896931868258885, "grad_norm": 21.203125, "learning_rate": 4.515161913706225e-08, "loss": 16.6419, "step": 10000 }, { "epoch": 0.29041416527600183, "grad_norm": 20.375, "learning_rate": 4.537737723274757e-08, "loss": 16.6317, "step": 10050 }, { "epoch": 0.29185901186941476, "grad_norm": 21.375, "learning_rate": 4.560313532843287e-08, "loss": 16.6707, "step": 10100 }, { "epoch": 0.2933038584628277, "grad_norm": 23.4375, "learning_rate": 4.5828893424118187e-08, "loss": 16.6595, "step": 10150 }, { "epoch": 0.2947487050562407, "grad_norm": 20.703125, "learning_rate": 4.6054651519803496e-08, "loss": 16.639, "step": 10200 }, { "epoch": 0.2961935516496536, "grad_norm": 20.453125, "learning_rate": 4.628040961548881e-08, "loss": 16.6537, "step": 10250 }, { "epoch": 0.29763839824306654, "grad_norm": 21.234375, "learning_rate": 4.650616771117412e-08, "loss": 16.6099, "step": 10300 }, { "epoch": 0.29908324483647947, "grad_norm": 22.84375, "learning_rate": 4.673192580685943e-08, "loss": 16.6629, "step": 10350 }, { "epoch": 0.30052809142989245, "grad_norm": 20.796875, "learning_rate": 4.6957683902544745e-08, "loss": 16.6335, "step": 10400 }, { "epoch": 0.3019729380233054, "grad_norm": 21.65625, "learning_rate": 4.7183441998230054e-08, "loss": 16.6122, "step": 10450 }, { "epoch": 0.3034177846167183, "grad_norm": 23.046875, "learning_rate": 4.740920009391536e-08, "loss": 16.6171, "step": 10500 }, { "epoch": 0.30486263121013124, "grad_norm": 20.71875, "learning_rate": 4.763495818960068e-08, "loss": 16.5997, "step": 10550 }, { "epoch": 0.3063074778035442, "grad_norm": 22.71875, "learning_rate": 4.7860716285285994e-08, "loss": 16.615, "step": 10600 }, { "epoch": 0.30775232439695716, "grad_norm": 21.53125, "learning_rate": 4.8086474380971296e-08, "loss": 16.6133, "step": 10650 }, { "epoch": 0.3091971709903701, "grad_norm": 22.828125, "learning_rate": 4.831223247665661e-08, "loss": 16.5908, "step": 10700 }, { "epoch": 0.310642017583783, "grad_norm": 22.859375, "learning_rate": 4.853799057234193e-08, "loss": 16.5617, "step": 10750 }, { "epoch": 0.312086864177196, "grad_norm": 30.734375, "learning_rate": 4.876374866802723e-08, "loss": 16.5051, "step": 10800 }, { "epoch": 0.31353171077060893, "grad_norm": 20.203125, "learning_rate": 4.8989506763712545e-08, "loss": 16.6504, "step": 10850 }, { "epoch": 0.31497655736402186, "grad_norm": 26.859375, "learning_rate": 4.921526485939786e-08, "loss": 16.6061, "step": 10900 }, { "epoch": 0.31642140395743484, "grad_norm": 20.828125, "learning_rate": 4.944102295508316e-08, "loss": 16.6288, "step": 10950 }, { "epoch": 0.3178662505508478, "grad_norm": 25.515625, "learning_rate": 4.966678105076848e-08, "loss": 16.5228, "step": 11000 }, { "epoch": 0.3193110971442607, "grad_norm": 23.671875, "learning_rate": 4.9892539146453794e-08, "loss": 16.4985, "step": 11050 }, { "epoch": 0.32075594373767363, "grad_norm": 23.25, "learning_rate": 5.01182972421391e-08, "loss": 16.6149, "step": 11100 }, { "epoch": 0.3222007903310866, "grad_norm": 20.375, "learning_rate": 5.034405533782441e-08, "loss": 16.5877, "step": 11150 }, { "epoch": 0.32364563692449955, "grad_norm": 20.53125, "learning_rate": 5.056981343350972e-08, "loss": 16.4745, "step": 11200 }, { "epoch": 0.3250904835179125, "grad_norm": 19.71875, "learning_rate": 5.0795571529195036e-08, "loss": 16.5593, "step": 11250 }, { "epoch": 0.3265353301113254, "grad_norm": 20.53125, "learning_rate": 5.1021329624880345e-08, "loss": 16.4393, "step": 11300 }, { "epoch": 0.3279801767047384, "grad_norm": 22.078125, "learning_rate": 5.1247087720565654e-08, "loss": 16.5429, "step": 11350 }, { "epoch": 0.3294250232981513, "grad_norm": 20.5, "learning_rate": 5.147284581625097e-08, "loss": 16.4117, "step": 11400 }, { "epoch": 0.33086986989156425, "grad_norm": 23.640625, "learning_rate": 5.169860391193628e-08, "loss": 16.5424, "step": 11450 }, { "epoch": 0.33231471648497724, "grad_norm": 21.625, "learning_rate": 5.192436200762159e-08, "loss": 16.6651, "step": 11500 }, { "epoch": 0.33375956307839016, "grad_norm": 24.453125, "learning_rate": 5.2150120103306903e-08, "loss": 16.5083, "step": 11550 }, { "epoch": 0.3352044096718031, "grad_norm": 22.46875, "learning_rate": 5.237587819899222e-08, "loss": 16.4798, "step": 11600 }, { "epoch": 0.336649256265216, "grad_norm": 22.703125, "learning_rate": 5.260163629467752e-08, "loss": 16.6141, "step": 11650 }, { "epoch": 0.338094102858629, "grad_norm": 23.921875, "learning_rate": 5.282739439036284e-08, "loss": 16.6017, "step": 11700 }, { "epoch": 0.33953894945204194, "grad_norm": 24.140625, "learning_rate": 5.305315248604815e-08, "loss": 16.55, "step": 11750 }, { "epoch": 0.34098379604545487, "grad_norm": 21.9375, "learning_rate": 5.3278910581733455e-08, "loss": 16.5809, "step": 11800 }, { "epoch": 0.3424286426388678, "grad_norm": 22.3125, "learning_rate": 5.350466867741877e-08, "loss": 16.5002, "step": 11850 }, { "epoch": 0.3438734892322808, "grad_norm": 20.984375, "learning_rate": 5.3730426773104086e-08, "loss": 16.6775, "step": 11900 }, { "epoch": 0.3453183358256937, "grad_norm": 20.28125, "learning_rate": 5.395618486878939e-08, "loss": 16.5301, "step": 11950 }, { "epoch": 0.34676318241910664, "grad_norm": 21.578125, "learning_rate": 5.4181942964474704e-08, "loss": 16.501, "step": 12000 }, { "epoch": 0.34676318241910664, "eval_loss": 2.0641098022460938, "eval_runtime": 340.7339, "eval_samples_per_second": 2736.839, "eval_steps_per_second": 42.764, "step": 12000 }, { "epoch": 0.34820802901251957, "grad_norm": 22.203125, "learning_rate": 5.440770106016002e-08, "loss": 16.6513, "step": 12050 }, { "epoch": 0.34965287560593256, "grad_norm": 22.296875, "learning_rate": 5.463345915584533e-08, "loss": 16.584, "step": 12100 }, { "epoch": 0.3510977221993455, "grad_norm": 22.953125, "learning_rate": 5.485921725153064e-08, "loss": 16.5184, "step": 12150 }, { "epoch": 0.3525425687927584, "grad_norm": 19.984375, "learning_rate": 5.5084975347215946e-08, "loss": 16.4685, "step": 12200 }, { "epoch": 0.3539874153861714, "grad_norm": 22.515625, "learning_rate": 5.531073344290126e-08, "loss": 16.5087, "step": 12250 }, { "epoch": 0.35543226197958433, "grad_norm": 22.984375, "learning_rate": 5.553649153858657e-08, "loss": 16.4561, "step": 12300 }, { "epoch": 0.35687710857299726, "grad_norm": 20.421875, "learning_rate": 5.576224963427188e-08, "loss": 16.502, "step": 12350 }, { "epoch": 0.3583219551664102, "grad_norm": 21.234375, "learning_rate": 5.5988007729957195e-08, "loss": 16.4946, "step": 12400 }, { "epoch": 0.3597668017598232, "grad_norm": 20.953125, "learning_rate": 5.6213765825642504e-08, "loss": 16.4988, "step": 12450 }, { "epoch": 0.3612116483532361, "grad_norm": 23.421875, "learning_rate": 5.643952392132781e-08, "loss": 16.5329, "step": 12500 }, { "epoch": 0.36265649494664903, "grad_norm": 31.59375, "learning_rate": 5.666528201701313e-08, "loss": 16.4695, "step": 12550 }, { "epoch": 0.36410134154006196, "grad_norm": 22.203125, "learning_rate": 5.6891040112698444e-08, "loss": 16.4166, "step": 12600 }, { "epoch": 0.36554618813347495, "grad_norm": 23.828125, "learning_rate": 5.7116798208383747e-08, "loss": 16.3887, "step": 12650 }, { "epoch": 0.3669910347268879, "grad_norm": 25.0, "learning_rate": 5.734255630406906e-08, "loss": 16.3849, "step": 12700 }, { "epoch": 0.3684358813203008, "grad_norm": 21.875, "learning_rate": 5.756831439975438e-08, "loss": 16.489, "step": 12750 }, { "epoch": 0.36988072791371374, "grad_norm": 22.859375, "learning_rate": 5.779407249543968e-08, "loss": 16.4234, "step": 12800 }, { "epoch": 0.3713255745071267, "grad_norm": 20.53125, "learning_rate": 5.8019830591124996e-08, "loss": 16.3971, "step": 12850 }, { "epoch": 0.37277042110053965, "grad_norm": 23.0625, "learning_rate": 5.824558868681031e-08, "loss": 16.4946, "step": 12900 }, { "epoch": 0.3742152676939526, "grad_norm": 21.78125, "learning_rate": 5.8471346782495613e-08, "loss": 16.5603, "step": 12950 }, { "epoch": 0.37566011428736557, "grad_norm": 21.296875, "learning_rate": 5.869710487818093e-08, "loss": 16.3374, "step": 13000 }, { "epoch": 0.3771049608807785, "grad_norm": 22.90625, "learning_rate": 5.8922862973866245e-08, "loss": 16.3641, "step": 13050 }, { "epoch": 0.3785498074741914, "grad_norm": 22.84375, "learning_rate": 5.9148621069551554e-08, "loss": 16.4834, "step": 13100 }, { "epoch": 0.37999465406760435, "grad_norm": 20.40625, "learning_rate": 5.937437916523686e-08, "loss": 16.4651, "step": 13150 }, { "epoch": 0.38143950066101734, "grad_norm": 23.140625, "learning_rate": 5.960013726092217e-08, "loss": 16.5031, "step": 13200 }, { "epoch": 0.38288434725443027, "grad_norm": 22.625, "learning_rate": 5.982589535660749e-08, "loss": 16.3802, "step": 13250 }, { "epoch": 0.3843291938478432, "grad_norm": 20.375, "learning_rate": 6.005165345229279e-08, "loss": 16.3925, "step": 13300 }, { "epoch": 0.3857740404412561, "grad_norm": 20.734375, "learning_rate": 6.02774115479781e-08, "loss": 16.399, "step": 13350 }, { "epoch": 0.3872188870346691, "grad_norm": 22.5, "learning_rate": 6.050316964366342e-08, "loss": 16.3255, "step": 13400 }, { "epoch": 0.38866373362808204, "grad_norm": 22.0625, "learning_rate": 6.072892773934872e-08, "loss": 16.5031, "step": 13450 }, { "epoch": 0.39010858022149497, "grad_norm": 23.171875, "learning_rate": 6.095468583503404e-08, "loss": 16.3906, "step": 13500 }, { "epoch": 0.3915534268149079, "grad_norm": 23.46875, "learning_rate": 6.118044393071935e-08, "loss": 16.4107, "step": 13550 }, { "epoch": 0.3929982734083209, "grad_norm": 24.046875, "learning_rate": 6.140620202640467e-08, "loss": 16.4128, "step": 13600 }, { "epoch": 0.3944431200017338, "grad_norm": 23.53125, "learning_rate": 6.163196012208997e-08, "loss": 16.4372, "step": 13650 }, { "epoch": 0.39588796659514675, "grad_norm": 23.734375, "learning_rate": 6.185771821777529e-08, "loss": 16.5197, "step": 13700 }, { "epoch": 0.39733281318855973, "grad_norm": 22.0625, "learning_rate": 6.20834763134606e-08, "loss": 16.4171, "step": 13750 }, { "epoch": 0.39877765978197266, "grad_norm": 21.953125, "learning_rate": 6.23092344091459e-08, "loss": 16.3508, "step": 13800 }, { "epoch": 0.4002225063753856, "grad_norm": 23.796875, "learning_rate": 6.253499250483122e-08, "loss": 16.4303, "step": 13850 }, { "epoch": 0.4016673529687985, "grad_norm": 21.484375, "learning_rate": 6.276075060051652e-08, "loss": 16.3384, "step": 13900 }, { "epoch": 0.4031121995622115, "grad_norm": 21.3125, "learning_rate": 6.298650869620184e-08, "loss": 16.3742, "step": 13950 }, { "epoch": 0.40455704615562443, "grad_norm": 21.53125, "learning_rate": 6.321226679188715e-08, "loss": 16.3369, "step": 14000 }, { "epoch": 0.40600189274903736, "grad_norm": 24.578125, "learning_rate": 6.343802488757247e-08, "loss": 16.4018, "step": 14050 }, { "epoch": 0.4074467393424503, "grad_norm": 26.078125, "learning_rate": 6.366378298325779e-08, "loss": 16.4958, "step": 14100 }, { "epoch": 0.4088915859358633, "grad_norm": 20.296875, "learning_rate": 6.388954107894309e-08, "loss": 16.4418, "step": 14150 }, { "epoch": 0.4103364325292762, "grad_norm": 26.171875, "learning_rate": 6.411529917462839e-08, "loss": 16.3343, "step": 14200 }, { "epoch": 0.41178127912268914, "grad_norm": 19.671875, "learning_rate": 6.43410572703137e-08, "loss": 16.3505, "step": 14250 }, { "epoch": 0.41322612571610207, "grad_norm": 21.265625, "learning_rate": 6.456681536599902e-08, "loss": 16.3802, "step": 14300 }, { "epoch": 0.41467097230951505, "grad_norm": 22.59375, "learning_rate": 6.479257346168434e-08, "loss": 16.3253, "step": 14350 }, { "epoch": 0.416115818902928, "grad_norm": 24.015625, "learning_rate": 6.501833155736965e-08, "loss": 16.3624, "step": 14400 }, { "epoch": 0.4175606654963409, "grad_norm": 26.5, "learning_rate": 6.524408965305495e-08, "loss": 16.2562, "step": 14450 }, { "epoch": 0.4190055120897539, "grad_norm": 19.75, "learning_rate": 6.546984774874027e-08, "loss": 16.3554, "step": 14500 }, { "epoch": 0.4204503586831668, "grad_norm": 24.234375, "learning_rate": 6.569560584442557e-08, "loss": 16.4004, "step": 14550 }, { "epoch": 0.42189520527657975, "grad_norm": 22.9375, "learning_rate": 6.592136394011089e-08, "loss": 16.3374, "step": 14600 }, { "epoch": 0.4233400518699927, "grad_norm": 24.21875, "learning_rate": 6.61471220357962e-08, "loss": 16.2601, "step": 14650 }, { "epoch": 0.42478489846340567, "grad_norm": 23.203125, "learning_rate": 6.63728801314815e-08, "loss": 16.331, "step": 14700 }, { "epoch": 0.4262297450568186, "grad_norm": 23.640625, "learning_rate": 6.659863822716682e-08, "loss": 16.3419, "step": 14750 }, { "epoch": 0.42767459165023153, "grad_norm": 25.453125, "learning_rate": 6.682439632285214e-08, "loss": 16.2981, "step": 14800 }, { "epoch": 0.42911943824364446, "grad_norm": 25.765625, "learning_rate": 6.705015441853745e-08, "loss": 16.2609, "step": 14850 }, { "epoch": 0.43056428483705744, "grad_norm": 20.46875, "learning_rate": 6.727591251422276e-08, "loss": 16.3126, "step": 14900 }, { "epoch": 0.43200913143047037, "grad_norm": 23.25, "learning_rate": 6.750167060990807e-08, "loss": 16.2534, "step": 14950 }, { "epoch": 0.4334539780238833, "grad_norm": 21.4375, "learning_rate": 6.772742870559337e-08, "loss": 16.2914, "step": 15000 }, { "epoch": 0.4334539780238833, "eval_loss": 2.0395615100860596, "eval_runtime": 351.3281, "eval_samples_per_second": 2654.311, "eval_steps_per_second": 41.474, "step": 15000 }, { "epoch": 0.43489882461729623, "grad_norm": 20.796875, "learning_rate": 6.795318680127869e-08, "loss": 16.2964, "step": 15050 }, { "epoch": 0.4363436712107092, "grad_norm": 21.578125, "learning_rate": 6.8178944896964e-08, "loss": 16.3424, "step": 15100 }, { "epoch": 0.43778851780412215, "grad_norm": 21.703125, "learning_rate": 6.840470299264932e-08, "loss": 16.2815, "step": 15150 }, { "epoch": 0.4392333643975351, "grad_norm": 22.578125, "learning_rate": 6.863046108833462e-08, "loss": 16.2694, "step": 15200 }, { "epoch": 0.44067821099094806, "grad_norm": 26.828125, "learning_rate": 6.885621918401994e-08, "loss": 16.2776, "step": 15250 }, { "epoch": 0.442123057584361, "grad_norm": 21.828125, "learning_rate": 6.908197727970524e-08, "loss": 16.3022, "step": 15300 }, { "epoch": 0.4435679041777739, "grad_norm": 20.9375, "learning_rate": 6.930773537539056e-08, "loss": 16.2117, "step": 15350 }, { "epoch": 0.44501275077118685, "grad_norm": 25.59375, "learning_rate": 6.953349347107587e-08, "loss": 16.3209, "step": 15400 }, { "epoch": 0.44645759736459983, "grad_norm": 24.96875, "learning_rate": 6.975925156676119e-08, "loss": 16.2672, "step": 15450 }, { "epoch": 0.44790244395801276, "grad_norm": 20.359375, "learning_rate": 6.99850096624465e-08, "loss": 16.2478, "step": 15500 }, { "epoch": 0.4493472905514257, "grad_norm": 20.28125, "learning_rate": 7.021076775813179e-08, "loss": 16.2187, "step": 15550 }, { "epoch": 0.4507921371448386, "grad_norm": 21.1875, "learning_rate": 7.043652585381711e-08, "loss": 16.2327, "step": 15600 }, { "epoch": 0.4522369837382516, "grad_norm": 21.0625, "learning_rate": 7.066228394950242e-08, "loss": 16.321, "step": 15650 }, { "epoch": 0.45368183033166454, "grad_norm": 24.265625, "learning_rate": 7.088804204518774e-08, "loss": 16.2536, "step": 15700 }, { "epoch": 0.45512667692507747, "grad_norm": 22.953125, "learning_rate": 7.111380014087305e-08, "loss": 16.2889, "step": 15750 }, { "epoch": 0.45657152351849045, "grad_norm": 22.265625, "learning_rate": 7.133955823655837e-08, "loss": 16.2099, "step": 15800 }, { "epoch": 0.4580163701119034, "grad_norm": 21.25, "learning_rate": 7.156531633224367e-08, "loss": 16.2138, "step": 15850 }, { "epoch": 0.4594612167053163, "grad_norm": 22.125, "learning_rate": 7.179107442792897e-08, "loss": 16.2212, "step": 15900 }, { "epoch": 0.46090606329872924, "grad_norm": 21.515625, "learning_rate": 7.201683252361429e-08, "loss": 16.2495, "step": 15950 }, { "epoch": 0.4623509098921422, "grad_norm": 22.609375, "learning_rate": 7.22425906192996e-08, "loss": 16.2045, "step": 16000 }, { "epoch": 0.46379575648555516, "grad_norm": 22.15625, "learning_rate": 7.246834871498492e-08, "loss": 16.2562, "step": 16050 }, { "epoch": 0.4652406030789681, "grad_norm": 22.703125, "learning_rate": 7.269410681067024e-08, "loss": 16.2227, "step": 16100 }, { "epoch": 0.466685449672381, "grad_norm": 22.484375, "learning_rate": 7.291986490635554e-08, "loss": 16.2847, "step": 16150 }, { "epoch": 0.468130296265794, "grad_norm": 23.125, "learning_rate": 7.314562300204084e-08, "loss": 16.2231, "step": 16200 }, { "epoch": 0.46957514285920693, "grad_norm": 21.703125, "learning_rate": 7.337138109772616e-08, "loss": 16.2217, "step": 16250 }, { "epoch": 0.47101998945261986, "grad_norm": 21.34375, "learning_rate": 7.359713919341147e-08, "loss": 16.0962, "step": 16300 }, { "epoch": 0.4724648360460328, "grad_norm": 24.234375, "learning_rate": 7.382289728909679e-08, "loss": 16.1981, "step": 16350 }, { "epoch": 0.4739096826394458, "grad_norm": 22.09375, "learning_rate": 7.40486553847821e-08, "loss": 16.228, "step": 16400 }, { "epoch": 0.4753545292328587, "grad_norm": 20.671875, "learning_rate": 7.42744134804674e-08, "loss": 16.2862, "step": 16450 }, { "epoch": 0.47679937582627163, "grad_norm": 21.265625, "learning_rate": 7.450017157615272e-08, "loss": 16.3092, "step": 16500 }, { "epoch": 0.4782442224196846, "grad_norm": 24.90625, "learning_rate": 7.472592967183802e-08, "loss": 16.2476, "step": 16550 }, { "epoch": 0.47968906901309755, "grad_norm": 23.328125, "learning_rate": 7.495168776752334e-08, "loss": 16.1906, "step": 16600 }, { "epoch": 0.4811339156065105, "grad_norm": 23.140625, "learning_rate": 7.517744586320865e-08, "loss": 16.154, "step": 16650 }, { "epoch": 0.4825787621999234, "grad_norm": 22.234375, "learning_rate": 7.540320395889396e-08, "loss": 16.2017, "step": 16700 }, { "epoch": 0.4840236087933364, "grad_norm": 22.890625, "learning_rate": 7.562896205457927e-08, "loss": 16.345, "step": 16750 }, { "epoch": 0.4854684553867493, "grad_norm": 21.625, "learning_rate": 7.585472015026459e-08, "loss": 16.2125, "step": 16800 }, { "epoch": 0.48691330198016225, "grad_norm": 21.640625, "learning_rate": 7.60804782459499e-08, "loss": 16.217, "step": 16850 }, { "epoch": 0.4883581485735752, "grad_norm": 23.421875, "learning_rate": 7.63062363416352e-08, "loss": 16.2419, "step": 16900 }, { "epoch": 0.48980299516698816, "grad_norm": 25.9375, "learning_rate": 7.653199443732052e-08, "loss": 16.1463, "step": 16950 }, { "epoch": 0.4912478417604011, "grad_norm": 22.703125, "learning_rate": 7.675775253300582e-08, "loss": 16.0852, "step": 17000 }, { "epoch": 0.492692688353814, "grad_norm": 21.96875, "learning_rate": 7.698351062869114e-08, "loss": 16.2164, "step": 17050 }, { "epoch": 0.49413753494722695, "grad_norm": 25.6875, "learning_rate": 7.720926872437645e-08, "loss": 16.1581, "step": 17100 }, { "epoch": 0.49558238154063994, "grad_norm": 22.953125, "learning_rate": 7.743502682006177e-08, "loss": 16.1946, "step": 17150 }, { "epoch": 0.49702722813405287, "grad_norm": 21.984375, "learning_rate": 7.766078491574707e-08, "loss": 16.2602, "step": 17200 }, { "epoch": 0.4984720747274658, "grad_norm": 23.78125, "learning_rate": 7.788654301143239e-08, "loss": 16.1283, "step": 17250 }, { "epoch": 0.4999169213208788, "grad_norm": 21.0625, "learning_rate": 7.811230110711769e-08, "loss": 16.1301, "step": 17300 }, { "epoch": 0.5013617679142917, "grad_norm": 27.609375, "learning_rate": 7.8338059202803e-08, "loss": 16.0988, "step": 17350 }, { "epoch": 0.5028066145077047, "grad_norm": 24.4375, "learning_rate": 7.856381729848832e-08, "loss": 16.207, "step": 17400 }, { "epoch": 0.5042514611011176, "grad_norm": 23.765625, "learning_rate": 7.878957539417364e-08, "loss": 16.2054, "step": 17450 }, { "epoch": 0.5056963076945306, "grad_norm": 22.140625, "learning_rate": 7.901533348985895e-08, "loss": 16.173, "step": 17500 }, { "epoch": 0.5071411542879435, "grad_norm": 23.53125, "learning_rate": 7.924109158554424e-08, "loss": 16.043, "step": 17550 }, { "epoch": 0.5085860008813564, "grad_norm": 21.78125, "learning_rate": 7.946684968122956e-08, "loss": 16.246, "step": 17600 }, { "epoch": 0.5100308474747693, "grad_norm": 22.125, "learning_rate": 7.969260777691487e-08, "loss": 16.2203, "step": 17650 }, { "epoch": 0.5114756940681823, "grad_norm": 24.15625, "learning_rate": 7.991836587260019e-08, "loss": 16.1279, "step": 17700 }, { "epoch": 0.5129205406615952, "grad_norm": 22.1875, "learning_rate": 8.01441239682855e-08, "loss": 16.0409, "step": 17750 }, { "epoch": 0.5143653872550082, "grad_norm": 23.640625, "learning_rate": 8.036988206397082e-08, "loss": 16.0298, "step": 17800 }, { "epoch": 0.5158102338484212, "grad_norm": 24.015625, "learning_rate": 8.059564015965612e-08, "loss": 16.0894, "step": 17850 }, { "epoch": 0.5172550804418341, "grad_norm": 25.109375, "learning_rate": 8.082139825534142e-08, "loss": 16.1767, "step": 17900 }, { "epoch": 0.518699927035247, "grad_norm": 19.859375, "learning_rate": 8.104715635102674e-08, "loss": 16.0735, "step": 17950 }, { "epoch": 0.52014477362866, "grad_norm": 21.375, "learning_rate": 8.127291444671206e-08, "loss": 16.1829, "step": 18000 }, { "epoch": 0.52014477362866, "eval_loss": 2.015727996826172, "eval_runtime": 347.0309, "eval_samples_per_second": 2687.178, "eval_steps_per_second": 41.988, "step": 18000 }, { "epoch": 0.5215896202220729, "grad_norm": 20.765625, "learning_rate": 8.149867254239737e-08, "loss": 16.1783, "step": 18050 }, { "epoch": 0.5230344668154858, "grad_norm": 21.921875, "learning_rate": 8.172443063808269e-08, "loss": 16.0167, "step": 18100 }, { "epoch": 0.5244793134088989, "grad_norm": 21.1875, "learning_rate": 8.195018873376799e-08, "loss": 16.0192, "step": 18150 }, { "epoch": 0.5259241600023118, "grad_norm": 22.78125, "learning_rate": 8.217594682945329e-08, "loss": 16.1586, "step": 18200 }, { "epoch": 0.5273690065957247, "grad_norm": 21.15625, "learning_rate": 8.240170492513861e-08, "loss": 16.0389, "step": 18250 }, { "epoch": 0.5288138531891377, "grad_norm": 24.90625, "learning_rate": 8.262746302082392e-08, "loss": 16.1815, "step": 18300 }, { "epoch": 0.5302586997825506, "grad_norm": 21.171875, "learning_rate": 8.285322111650924e-08, "loss": 16.1257, "step": 18350 }, { "epoch": 0.5317035463759635, "grad_norm": 23.96875, "learning_rate": 8.307897921219455e-08, "loss": 16.1506, "step": 18400 }, { "epoch": 0.5331483929693764, "grad_norm": 23.3125, "learning_rate": 8.330473730787986e-08, "loss": 16.1117, "step": 18450 }, { "epoch": 0.5345932395627894, "grad_norm": 23.328125, "learning_rate": 8.353049540356517e-08, "loss": 16.1372, "step": 18500 }, { "epoch": 0.5360380861562024, "grad_norm": 21.265625, "learning_rate": 8.375625349925047e-08, "loss": 16.1619, "step": 18550 }, { "epoch": 0.5374829327496153, "grad_norm": 22.53125, "learning_rate": 8.398201159493579e-08, "loss": 16.0791, "step": 18600 }, { "epoch": 0.5389277793430283, "grad_norm": 27.203125, "learning_rate": 8.42077696906211e-08, "loss": 16.2036, "step": 18650 }, { "epoch": 0.5403726259364412, "grad_norm": 23.25, "learning_rate": 8.443352778630641e-08, "loss": 16.0342, "step": 18700 }, { "epoch": 0.5418174725298541, "grad_norm": 22.453125, "learning_rate": 8.465928588199172e-08, "loss": 16.1121, "step": 18750 }, { "epoch": 0.5432623191232671, "grad_norm": 27.1875, "learning_rate": 8.488504397767704e-08, "loss": 16.1368, "step": 18800 }, { "epoch": 0.54470716571668, "grad_norm": 24.609375, "learning_rate": 8.511080207336235e-08, "loss": 16.006, "step": 18850 }, { "epoch": 0.546152012310093, "grad_norm": 25.890625, "learning_rate": 8.533656016904766e-08, "loss": 16.0605, "step": 18900 }, { "epoch": 0.547596858903506, "grad_norm": 21.46875, "learning_rate": 8.556231826473297e-08, "loss": 16.1316, "step": 18950 }, { "epoch": 0.5490417054969189, "grad_norm": 21.75, "learning_rate": 8.578807636041827e-08, "loss": 16.0824, "step": 19000 }, { "epoch": 0.5504865520903318, "grad_norm": 23.796875, "learning_rate": 8.601383445610359e-08, "loss": 16.0387, "step": 19050 }, { "epoch": 0.5519313986837447, "grad_norm": 21.625, "learning_rate": 8.62395925517889e-08, "loss": 16.1834, "step": 19100 }, { "epoch": 0.5533762452771577, "grad_norm": 20.375, "learning_rate": 8.646535064747422e-08, "loss": 16.0675, "step": 19150 }, { "epoch": 0.5548210918705706, "grad_norm": 20.65625, "learning_rate": 8.669110874315952e-08, "loss": 15.9053, "step": 19200 }, { "epoch": 0.5562659384639835, "grad_norm": 21.984375, "learning_rate": 8.691686683884484e-08, "loss": 16.0319, "step": 19250 }, { "epoch": 0.5577107850573966, "grad_norm": 19.9375, "learning_rate": 8.714262493453014e-08, "loss": 16.0288, "step": 19300 }, { "epoch": 0.5591556316508095, "grad_norm": 21.234375, "learning_rate": 8.736838303021546e-08, "loss": 16.0782, "step": 19350 }, { "epoch": 0.5606004782442224, "grad_norm": 23.84375, "learning_rate": 8.759414112590077e-08, "loss": 15.987, "step": 19400 }, { "epoch": 0.5620453248376354, "grad_norm": 26.1875, "learning_rate": 8.781989922158609e-08, "loss": 16.1125, "step": 19450 }, { "epoch": 0.5634901714310483, "grad_norm": 22.640625, "learning_rate": 8.80456573172714e-08, "loss": 16.066, "step": 19500 }, { "epoch": 0.5649350180244612, "grad_norm": 23.9375, "learning_rate": 8.827141541295669e-08, "loss": 15.9326, "step": 19550 }, { "epoch": 0.5663798646178742, "grad_norm": 23.34375, "learning_rate": 8.849717350864201e-08, "loss": 16.0386, "step": 19600 }, { "epoch": 0.5678247112112872, "grad_norm": 22.703125, "learning_rate": 8.872293160432732e-08, "loss": 16.0198, "step": 19650 }, { "epoch": 0.5692695578047001, "grad_norm": 24.21875, "learning_rate": 8.894868970001264e-08, "loss": 15.9735, "step": 19700 }, { "epoch": 0.570714404398113, "grad_norm": 23.1875, "learning_rate": 8.917444779569795e-08, "loss": 15.9592, "step": 19750 }, { "epoch": 0.572159250991526, "grad_norm": 21.96875, "learning_rate": 8.940020589138327e-08, "loss": 16.0753, "step": 19800 }, { "epoch": 0.5736040975849389, "grad_norm": 23.59375, "learning_rate": 8.962596398706857e-08, "loss": 15.9445, "step": 19850 }, { "epoch": 0.5750489441783518, "grad_norm": 21.453125, "learning_rate": 8.985172208275388e-08, "loss": 15.9417, "step": 19900 }, { "epoch": 0.5764937907717648, "grad_norm": 26.203125, "learning_rate": 9.007748017843919e-08, "loss": 15.9153, "step": 19950 }, { "epoch": 0.5779386373651777, "grad_norm": 26.90625, "learning_rate": 9.03032382741245e-08, "loss": 16.0521, "step": 20000 }, { "epoch": 0.5793834839585907, "grad_norm": 24.5625, "learning_rate": 9.052899636980982e-08, "loss": 16.0634, "step": 20050 }, { "epoch": 0.5808283305520037, "grad_norm": 22.6875, "learning_rate": 9.075475446549514e-08, "loss": 16.0152, "step": 20100 }, { "epoch": 0.5822731771454166, "grad_norm": 21.15625, "learning_rate": 9.098051256118044e-08, "loss": 15.9701, "step": 20150 }, { "epoch": 0.5837180237388295, "grad_norm": 23.03125, "learning_rate": 9.120627065686574e-08, "loss": 15.9496, "step": 20200 }, { "epoch": 0.5851628703322425, "grad_norm": 22.53125, "learning_rate": 9.143202875255106e-08, "loss": 15.97, "step": 20250 }, { "epoch": 0.5866077169256554, "grad_norm": 23.046875, "learning_rate": 9.165778684823637e-08, "loss": 15.8332, "step": 20300 }, { "epoch": 0.5880525635190683, "grad_norm": 21.40625, "learning_rate": 9.188354494392169e-08, "loss": 15.977, "step": 20350 }, { "epoch": 0.5894974101124814, "grad_norm": 24.3125, "learning_rate": 9.210930303960699e-08, "loss": 15.9583, "step": 20400 }, { "epoch": 0.5909422567058943, "grad_norm": 23.359375, "learning_rate": 9.23350611352923e-08, "loss": 16.0266, "step": 20450 }, { "epoch": 0.5923871032993072, "grad_norm": 23.25, "learning_rate": 9.256081923097762e-08, "loss": 15.9804, "step": 20500 }, { "epoch": 0.5938319498927201, "grad_norm": 20.296875, "learning_rate": 9.278657732666292e-08, "loss": 15.9473, "step": 20550 }, { "epoch": 0.5952767964861331, "grad_norm": 21.1875, "learning_rate": 9.301233542234824e-08, "loss": 15.9547, "step": 20600 }, { "epoch": 0.596721643079546, "grad_norm": 23.40625, "learning_rate": 9.323809351803356e-08, "loss": 16.0047, "step": 20650 }, { "epoch": 0.5981664896729589, "grad_norm": 22.03125, "learning_rate": 9.346385161371886e-08, "loss": 15.9565, "step": 20700 }, { "epoch": 0.5996113362663719, "grad_norm": 23.96875, "learning_rate": 9.368960970940417e-08, "loss": 15.978, "step": 20750 }, { "epoch": 0.6010561828597849, "grad_norm": 26.078125, "learning_rate": 9.391536780508949e-08, "loss": 15.9615, "step": 20800 }, { "epoch": 0.6025010294531978, "grad_norm": 22.421875, "learning_rate": 9.41411259007748e-08, "loss": 15.9731, "step": 20850 }, { "epoch": 0.6039458760466108, "grad_norm": 28.59375, "learning_rate": 9.436688399646011e-08, "loss": 15.8916, "step": 20900 }, { "epoch": 0.6053907226400237, "grad_norm": 21.375, "learning_rate": 9.459264209214542e-08, "loss": 15.9134, "step": 20950 }, { "epoch": 0.6068355692334366, "grad_norm": 23.015625, "learning_rate": 9.481840018783072e-08, "loss": 15.9756, "step": 21000 }, { "epoch": 0.6068355692334366, "eval_loss": 1.990402102470398, "eval_runtime": 346.2542, "eval_samples_per_second": 2693.206, "eval_steps_per_second": 42.082, "step": 21000 }, { "epoch": 0.6082804158268496, "grad_norm": 21.765625, "learning_rate": 9.504415828351604e-08, "loss": 15.9846, "step": 21050 }, { "epoch": 0.6097252624202625, "grad_norm": 21.875, "learning_rate": 9.526991637920136e-08, "loss": 15.92, "step": 21100 }, { "epoch": 0.6111701090136755, "grad_norm": 23.140625, "learning_rate": 9.549567447488667e-08, "loss": 16.0141, "step": 21150 }, { "epoch": 0.6126149556070885, "grad_norm": 22.1875, "learning_rate": 9.572143257057199e-08, "loss": 15.9318, "step": 21200 }, { "epoch": 0.6140598022005014, "grad_norm": 24.421875, "learning_rate": 9.594719066625728e-08, "loss": 15.9115, "step": 21250 }, { "epoch": 0.6155046487939143, "grad_norm": 22.734375, "learning_rate": 9.617294876194259e-08, "loss": 15.847, "step": 21300 }, { "epoch": 0.6169494953873272, "grad_norm": 23.078125, "learning_rate": 9.639870685762791e-08, "loss": 16.0101, "step": 21350 }, { "epoch": 0.6183943419807402, "grad_norm": 21.265625, "learning_rate": 9.662446495331322e-08, "loss": 15.9588, "step": 21400 }, { "epoch": 0.6198391885741531, "grad_norm": 24.171875, "learning_rate": 9.685022304899854e-08, "loss": 15.9525, "step": 21450 }, { "epoch": 0.621284035167566, "grad_norm": 23.078125, "learning_rate": 9.707598114468385e-08, "loss": 15.9591, "step": 21500 }, { "epoch": 0.6227288817609791, "grad_norm": 26.1875, "learning_rate": 9.730173924036914e-08, "loss": 15.8973, "step": 21550 }, { "epoch": 0.624173728354392, "grad_norm": 22.1875, "learning_rate": 9.752749733605446e-08, "loss": 15.921, "step": 21600 }, { "epoch": 0.6256185749478049, "grad_norm": 34.90625, "learning_rate": 9.775325543173977e-08, "loss": 15.8763, "step": 21650 }, { "epoch": 0.6270634215412179, "grad_norm": 20.875, "learning_rate": 9.797901352742509e-08, "loss": 15.9879, "step": 21700 }, { "epoch": 0.6285082681346308, "grad_norm": 22.796875, "learning_rate": 9.82047716231104e-08, "loss": 15.8933, "step": 21750 }, { "epoch": 0.6299531147280437, "grad_norm": 25.84375, "learning_rate": 9.843052971879572e-08, "loss": 15.7943, "step": 21800 }, { "epoch": 0.6313979613214566, "grad_norm": 21.1875, "learning_rate": 9.865628781448102e-08, "loss": 15.842, "step": 21850 }, { "epoch": 0.6328428079148697, "grad_norm": 25.890625, "learning_rate": 9.888204591016633e-08, "loss": 15.9081, "step": 21900 }, { "epoch": 0.6342876545082826, "grad_norm": 20.859375, "learning_rate": 9.910780400585164e-08, "loss": 15.825, "step": 21950 }, { "epoch": 0.6357325011016955, "grad_norm": 20.953125, "learning_rate": 9.933356210153696e-08, "loss": 15.8846, "step": 22000 }, { "epoch": 0.6371773476951085, "grad_norm": 22.0, "learning_rate": 9.955932019722227e-08, "loss": 15.8721, "step": 22050 }, { "epoch": 0.6386221942885214, "grad_norm": 24.765625, "learning_rate": 9.978507829290759e-08, "loss": 15.7167, "step": 22100 }, { "epoch": 0.6400670408819343, "grad_norm": 23.984375, "learning_rate": 1.0001083638859289e-07, "loss": 15.8247, "step": 22150 }, { "epoch": 0.6415118874753473, "grad_norm": 19.5, "learning_rate": 1.002365944842782e-07, "loss": 15.8111, "step": 22200 }, { "epoch": 0.6429567340687602, "grad_norm": 25.640625, "learning_rate": 1.0046235257996351e-07, "loss": 15.9381, "step": 22250 }, { "epoch": 0.6444015806621732, "grad_norm": 19.84375, "learning_rate": 1.0068811067564882e-07, "loss": 15.7998, "step": 22300 }, { "epoch": 0.6458464272555862, "grad_norm": 21.296875, "learning_rate": 1.0091386877133414e-07, "loss": 15.8191, "step": 22350 }, { "epoch": 0.6472912738489991, "grad_norm": 22.0, "learning_rate": 1.0113962686701944e-07, "loss": 15.8098, "step": 22400 }, { "epoch": 0.648736120442412, "grad_norm": 24.46875, "learning_rate": 1.0136538496270476e-07, "loss": 15.8501, "step": 22450 }, { "epoch": 0.650180967035825, "grad_norm": 23.3125, "learning_rate": 1.0159114305839007e-07, "loss": 15.8631, "step": 22500 }, { "epoch": 0.6516258136292379, "grad_norm": 25.875, "learning_rate": 1.0181690115407538e-07, "loss": 15.6956, "step": 22550 }, { "epoch": 0.6530706602226508, "grad_norm": 24.703125, "learning_rate": 1.0204265924976069e-07, "loss": 15.8223, "step": 22600 }, { "epoch": 0.6545155068160639, "grad_norm": 22.15625, "learning_rate": 1.02268417345446e-07, "loss": 15.9404, "step": 22650 }, { "epoch": 0.6559603534094768, "grad_norm": 23.59375, "learning_rate": 1.0249417544113131e-07, "loss": 15.7929, "step": 22700 }, { "epoch": 0.6574052000028897, "grad_norm": 20.5, "learning_rate": 1.0271993353681662e-07, "loss": 15.8155, "step": 22750 }, { "epoch": 0.6588500465963026, "grad_norm": 22.984375, "learning_rate": 1.0294569163250194e-07, "loss": 15.8803, "step": 22800 }, { "epoch": 0.6602948931897156, "grad_norm": 22.859375, "learning_rate": 1.0317144972818726e-07, "loss": 15.8193, "step": 22850 }, { "epoch": 0.6617397397831285, "grad_norm": 24.109375, "learning_rate": 1.0339720782387256e-07, "loss": 15.6904, "step": 22900 }, { "epoch": 0.6631845863765414, "grad_norm": 28.3125, "learning_rate": 1.0362296591955787e-07, "loss": 15.8436, "step": 22950 }, { "epoch": 0.6646294329699545, "grad_norm": 24.234375, "learning_rate": 1.0384872401524318e-07, "loss": 15.7791, "step": 23000 }, { "epoch": 0.6660742795633674, "grad_norm": 22.40625, "learning_rate": 1.0407448211092849e-07, "loss": 15.8147, "step": 23050 }, { "epoch": 0.6675191261567803, "grad_norm": 21.90625, "learning_rate": 1.0430024020661381e-07, "loss": 15.8562, "step": 23100 }, { "epoch": 0.6689639727501933, "grad_norm": 22.296875, "learning_rate": 1.0452599830229912e-07, "loss": 15.8147, "step": 23150 }, { "epoch": 0.6704088193436062, "grad_norm": 24.0, "learning_rate": 1.0475175639798444e-07, "loss": 15.7467, "step": 23200 }, { "epoch": 0.6718536659370191, "grad_norm": 23.09375, "learning_rate": 1.0497751449366973e-07, "loss": 15.8176, "step": 23250 }, { "epoch": 0.673298512530432, "grad_norm": 29.296875, "learning_rate": 1.0520327258935504e-07, "loss": 15.7216, "step": 23300 }, { "epoch": 0.674743359123845, "grad_norm": 22.8125, "learning_rate": 1.0542903068504036e-07, "loss": 15.707, "step": 23350 }, { "epoch": 0.676188205717258, "grad_norm": 23.859375, "learning_rate": 1.0565478878072567e-07, "loss": 15.752, "step": 23400 }, { "epoch": 0.677633052310671, "grad_norm": 22.40625, "learning_rate": 1.0588054687641099e-07, "loss": 15.7979, "step": 23450 }, { "epoch": 0.6790778989040839, "grad_norm": 23.6875, "learning_rate": 1.061063049720963e-07, "loss": 15.7577, "step": 23500 }, { "epoch": 0.6805227454974968, "grad_norm": 20.421875, "learning_rate": 1.063320630677816e-07, "loss": 15.7758, "step": 23550 }, { "epoch": 0.6819675920909097, "grad_norm": 22.734375, "learning_rate": 1.0655782116346691e-07, "loss": 15.7973, "step": 23600 }, { "epoch": 0.6834124386843227, "grad_norm": 20.765625, "learning_rate": 1.0678357925915222e-07, "loss": 15.8134, "step": 23650 }, { "epoch": 0.6848572852777356, "grad_norm": 24.046875, "learning_rate": 1.0700933735483754e-07, "loss": 15.7493, "step": 23700 }, { "epoch": 0.6863021318711486, "grad_norm": 24.34375, "learning_rate": 1.0723509545052286e-07, "loss": 15.6879, "step": 23750 }, { "epoch": 0.6877469784645616, "grad_norm": 23.109375, "learning_rate": 1.0746085354620817e-07, "loss": 15.807, "step": 23800 }, { "epoch": 0.6891918250579745, "grad_norm": 25.34375, "learning_rate": 1.0768661164189347e-07, "loss": 15.8162, "step": 23850 }, { "epoch": 0.6906366716513874, "grad_norm": 21.140625, "learning_rate": 1.0791236973757878e-07, "loss": 15.7665, "step": 23900 }, { "epoch": 0.6920815182448004, "grad_norm": 22.796875, "learning_rate": 1.0813812783326409e-07, "loss": 15.7595, "step": 23950 }, { "epoch": 0.6935263648382133, "grad_norm": 22.578125, "learning_rate": 1.0836388592894941e-07, "loss": 15.7217, "step": 24000 }, { "epoch": 0.6935263648382133, "eval_loss": 1.9680598974227905, "eval_runtime": 341.6298, "eval_samples_per_second": 2729.662, "eval_steps_per_second": 42.651, "step": 24000 }, { "epoch": 0.6949712114316262, "grad_norm": 21.015625, "learning_rate": 1.0858964402463472e-07, "loss": 15.778, "step": 24050 }, { "epoch": 0.6964160580250391, "grad_norm": 24.84375, "learning_rate": 1.0881540212032004e-07, "loss": 15.7675, "step": 24100 }, { "epoch": 0.6978609046184522, "grad_norm": 26.171875, "learning_rate": 1.0904116021600534e-07, "loss": 15.7371, "step": 24150 }, { "epoch": 0.6993057512118651, "grad_norm": 20.859375, "learning_rate": 1.0926691831169066e-07, "loss": 15.8547, "step": 24200 }, { "epoch": 0.700750597805278, "grad_norm": 22.34375, "learning_rate": 1.0949267640737596e-07, "loss": 15.6903, "step": 24250 }, { "epoch": 0.702195444398691, "grad_norm": 23.140625, "learning_rate": 1.0971843450306127e-07, "loss": 15.7397, "step": 24300 }, { "epoch": 0.7036402909921039, "grad_norm": 21.953125, "learning_rate": 1.0994419259874659e-07, "loss": 15.6126, "step": 24350 }, { "epoch": 0.7050851375855168, "grad_norm": 25.8125, "learning_rate": 1.1016995069443189e-07, "loss": 15.8006, "step": 24400 }, { "epoch": 0.7065299841789298, "grad_norm": 25.0625, "learning_rate": 1.1039570879011721e-07, "loss": 15.6731, "step": 24450 }, { "epoch": 0.7079748307723428, "grad_norm": 26.265625, "learning_rate": 1.1062146688580252e-07, "loss": 15.7758, "step": 24500 }, { "epoch": 0.7094196773657557, "grad_norm": 23.390625, "learning_rate": 1.1084722498148783e-07, "loss": 15.7123, "step": 24550 }, { "epoch": 0.7108645239591687, "grad_norm": 21.984375, "learning_rate": 1.1107298307717314e-07, "loss": 15.7585, "step": 24600 }, { "epoch": 0.7123093705525816, "grad_norm": 23.765625, "learning_rate": 1.1129874117285846e-07, "loss": 15.7461, "step": 24650 }, { "epoch": 0.7137542171459945, "grad_norm": 22.265625, "learning_rate": 1.1152449926854376e-07, "loss": 15.7875, "step": 24700 }, { "epoch": 0.7151990637394074, "grad_norm": 22.375, "learning_rate": 1.1175025736422907e-07, "loss": 15.6815, "step": 24750 }, { "epoch": 0.7166439103328204, "grad_norm": 25.4375, "learning_rate": 1.1197601545991439e-07, "loss": 15.7047, "step": 24800 }, { "epoch": 0.7180887569262333, "grad_norm": 30.53125, "learning_rate": 1.122017735555997e-07, "loss": 15.7082, "step": 24850 }, { "epoch": 0.7195336035196463, "grad_norm": 23.859375, "learning_rate": 1.1242753165128501e-07, "loss": 15.6932, "step": 24900 }, { "epoch": 0.7209784501130593, "grad_norm": 21.890625, "learning_rate": 1.1265328974697032e-07, "loss": 15.7004, "step": 24950 }, { "epoch": 0.7224232967064722, "grad_norm": 22.21875, "learning_rate": 1.1287904784265563e-07, "loss": 15.5976, "step": 25000 }, { "epoch": 0.7238681432998851, "grad_norm": 20.765625, "learning_rate": 1.1310480593834094e-07, "loss": 15.651, "step": 25050 }, { "epoch": 0.7253129898932981, "grad_norm": 21.203125, "learning_rate": 1.1333056403402626e-07, "loss": 15.7333, "step": 25100 }, { "epoch": 0.726757836486711, "grad_norm": 22.25, "learning_rate": 1.1355632212971157e-07, "loss": 15.6802, "step": 25150 }, { "epoch": 0.7282026830801239, "grad_norm": 22.8125, "learning_rate": 1.1378208022539689e-07, "loss": 15.6639, "step": 25200 }, { "epoch": 0.729647529673537, "grad_norm": 21.140625, "learning_rate": 1.1400783832108218e-07, "loss": 15.6816, "step": 25250 }, { "epoch": 0.7310923762669499, "grad_norm": 23.0, "learning_rate": 1.1423359641676749e-07, "loss": 15.5984, "step": 25300 }, { "epoch": 0.7325372228603628, "grad_norm": 23.40625, "learning_rate": 1.1445935451245281e-07, "loss": 15.7119, "step": 25350 }, { "epoch": 0.7339820694537758, "grad_norm": 21.296875, "learning_rate": 1.1468511260813812e-07, "loss": 15.6212, "step": 25400 }, { "epoch": 0.7354269160471887, "grad_norm": 20.03125, "learning_rate": 1.1491087070382344e-07, "loss": 15.659, "step": 25450 }, { "epoch": 0.7368717626406016, "grad_norm": 22.984375, "learning_rate": 1.1513662879950876e-07, "loss": 15.771, "step": 25500 }, { "epoch": 0.7383166092340145, "grad_norm": 21.84375, "learning_rate": 1.1536238689519404e-07, "loss": 15.7338, "step": 25550 }, { "epoch": 0.7397614558274275, "grad_norm": 22.234375, "learning_rate": 1.1558814499087936e-07, "loss": 15.6507, "step": 25600 }, { "epoch": 0.7412063024208405, "grad_norm": 25.3125, "learning_rate": 1.1581390308656468e-07, "loss": 15.7084, "step": 25650 }, { "epoch": 0.7426511490142534, "grad_norm": 26.171875, "learning_rate": 1.1603966118224999e-07, "loss": 15.5601, "step": 25700 }, { "epoch": 0.7440959956076664, "grad_norm": 22.515625, "learning_rate": 1.1626541927793531e-07, "loss": 15.7191, "step": 25750 }, { "epoch": 0.7455408422010793, "grad_norm": 24.375, "learning_rate": 1.1649117737362062e-07, "loss": 15.6457, "step": 25800 }, { "epoch": 0.7469856887944922, "grad_norm": 23.640625, "learning_rate": 1.1671693546930592e-07, "loss": 15.572, "step": 25850 }, { "epoch": 0.7484305353879052, "grad_norm": 24.375, "learning_rate": 1.1694269356499123e-07, "loss": 15.6297, "step": 25900 }, { "epoch": 0.7498753819813181, "grad_norm": 23.8125, "learning_rate": 1.1716845166067654e-07, "loss": 15.6828, "step": 25950 }, { "epoch": 0.7513202285747311, "grad_norm": 23.953125, "learning_rate": 1.1739420975636186e-07, "loss": 15.5568, "step": 26000 }, { "epoch": 0.7527650751681441, "grad_norm": 25.421875, "learning_rate": 1.1761996785204717e-07, "loss": 15.6016, "step": 26050 }, { "epoch": 0.754209921761557, "grad_norm": 22.15625, "learning_rate": 1.1784572594773249e-07, "loss": 15.5887, "step": 26100 }, { "epoch": 0.7556547683549699, "grad_norm": 21.5625, "learning_rate": 1.1807148404341779e-07, "loss": 15.6077, "step": 26150 }, { "epoch": 0.7570996149483828, "grad_norm": 23.328125, "learning_rate": 1.1829724213910311e-07, "loss": 15.6592, "step": 26200 }, { "epoch": 0.7585444615417958, "grad_norm": 23.71875, "learning_rate": 1.1852300023478841e-07, "loss": 15.621, "step": 26250 }, { "epoch": 0.7599893081352087, "grad_norm": 24.5625, "learning_rate": 1.1874875833047373e-07, "loss": 15.5728, "step": 26300 }, { "epoch": 0.7614341547286216, "grad_norm": 23.21875, "learning_rate": 1.1897451642615904e-07, "loss": 15.6658, "step": 26350 }, { "epoch": 0.7628790013220347, "grad_norm": 24.96875, "learning_rate": 1.1920027452184434e-07, "loss": 15.4367, "step": 26400 }, { "epoch": 0.7643238479154476, "grad_norm": 23.296875, "learning_rate": 1.1942603261752967e-07, "loss": 15.6812, "step": 26450 }, { "epoch": 0.7657686945088605, "grad_norm": 21.21875, "learning_rate": 1.1965179071321497e-07, "loss": 15.4966, "step": 26500 }, { "epoch": 0.7672135411022735, "grad_norm": 21.859375, "learning_rate": 1.1987754880890028e-07, "loss": 15.6969, "step": 26550 }, { "epoch": 0.7686583876956864, "grad_norm": 21.234375, "learning_rate": 1.2010330690458558e-07, "loss": 15.5063, "step": 26600 }, { "epoch": 0.7701032342890993, "grad_norm": 30.65625, "learning_rate": 1.203290650002709e-07, "loss": 15.5682, "step": 26650 }, { "epoch": 0.7715480808825123, "grad_norm": 23.0625, "learning_rate": 1.205548230959562e-07, "loss": 15.5967, "step": 26700 }, { "epoch": 0.7729929274759253, "grad_norm": 22.171875, "learning_rate": 1.2078058119164154e-07, "loss": 15.5911, "step": 26750 }, { "epoch": 0.7744377740693382, "grad_norm": 25.8125, "learning_rate": 1.2100633928732684e-07, "loss": 15.6635, "step": 26800 }, { "epoch": 0.7758826206627512, "grad_norm": 23.40625, "learning_rate": 1.2123209738301214e-07, "loss": 15.5876, "step": 26850 }, { "epoch": 0.7773274672561641, "grad_norm": 21.15625, "learning_rate": 1.2145785547869745e-07, "loss": 15.5192, "step": 26900 }, { "epoch": 0.778772313849577, "grad_norm": 23.5625, "learning_rate": 1.2168361357438277e-07, "loss": 15.5746, "step": 26950 }, { "epoch": 0.7802171604429899, "grad_norm": 27.359375, "learning_rate": 1.2190937167006808e-07, "loss": 15.5407, "step": 27000 }, { "epoch": 0.7802171604429899, "eval_loss": 1.9437412023544312, "eval_runtime": 340.4, "eval_samples_per_second": 2739.524, "eval_steps_per_second": 42.806, "step": 27000 }, { "epoch": 0.7816620070364029, "grad_norm": 23.0625, "learning_rate": 1.221351297657534e-07, "loss": 15.609, "step": 27050 }, { "epoch": 0.7831068536298158, "grad_norm": 25.40625, "learning_rate": 1.223608878614387e-07, "loss": 15.6637, "step": 27100 }, { "epoch": 0.7845517002232288, "grad_norm": 23.90625, "learning_rate": 1.22586645957124e-07, "loss": 15.6405, "step": 27150 }, { "epoch": 0.7859965468166418, "grad_norm": 22.390625, "learning_rate": 1.2281240405280934e-07, "loss": 15.5515, "step": 27200 }, { "epoch": 0.7874413934100547, "grad_norm": 25.265625, "learning_rate": 1.2303816214849464e-07, "loss": 15.5254, "step": 27250 }, { "epoch": 0.7888862400034676, "grad_norm": 22.125, "learning_rate": 1.2326392024417994e-07, "loss": 15.5474, "step": 27300 }, { "epoch": 0.7903310865968806, "grad_norm": 23.03125, "learning_rate": 1.2348967833986527e-07, "loss": 15.554, "step": 27350 }, { "epoch": 0.7917759331902935, "grad_norm": 19.96875, "learning_rate": 1.2371543643555057e-07, "loss": 15.5717, "step": 27400 }, { "epoch": 0.7932207797837064, "grad_norm": 20.53125, "learning_rate": 1.2394119453123588e-07, "loss": 15.5454, "step": 27450 }, { "epoch": 0.7946656263771195, "grad_norm": 21.34375, "learning_rate": 1.241669526269212e-07, "loss": 15.5759, "step": 27500 }, { "epoch": 0.7961104729705324, "grad_norm": 23.9375, "learning_rate": 1.243927107226065e-07, "loss": 15.5199, "step": 27550 }, { "epoch": 0.7975553195639453, "grad_norm": 21.84375, "learning_rate": 1.246184688182918e-07, "loss": 15.4171, "step": 27600 }, { "epoch": 0.7990001661573582, "grad_norm": 22.234375, "learning_rate": 1.2484422691397714e-07, "loss": 15.5973, "step": 27650 }, { "epoch": 0.8004450127507712, "grad_norm": 22.421875, "learning_rate": 1.2506998500966244e-07, "loss": 15.4923, "step": 27700 }, { "epoch": 0.8018898593441841, "grad_norm": 21.34375, "learning_rate": 1.2529574310534774e-07, "loss": 15.509, "step": 27750 }, { "epoch": 0.803334705937597, "grad_norm": 24.359375, "learning_rate": 1.2552150120103305e-07, "loss": 15.5824, "step": 27800 }, { "epoch": 0.80477955253101, "grad_norm": 21.25, "learning_rate": 1.2574725929671838e-07, "loss": 15.5973, "step": 27850 }, { "epoch": 0.806224399124423, "grad_norm": 27.984375, "learning_rate": 1.2597301739240368e-07, "loss": 15.564, "step": 27900 }, { "epoch": 0.8076692457178359, "grad_norm": 33.71875, "learning_rate": 1.26198775488089e-07, "loss": 15.439, "step": 27950 }, { "epoch": 0.8091140923112489, "grad_norm": 24.09375, "learning_rate": 1.264245335837743e-07, "loss": 15.5574, "step": 28000 }, { "epoch": 0.8105589389046618, "grad_norm": 22.78125, "learning_rate": 1.266502916794596e-07, "loss": 15.5851, "step": 28050 }, { "epoch": 0.8120037854980747, "grad_norm": 21.125, "learning_rate": 1.2687604977514494e-07, "loss": 15.5334, "step": 28100 }, { "epoch": 0.8134486320914877, "grad_norm": 21.5625, "learning_rate": 1.2710180787083024e-07, "loss": 15.5789, "step": 28150 }, { "epoch": 0.8148934786849006, "grad_norm": 21.8125, "learning_rate": 1.2732756596651557e-07, "loss": 15.4793, "step": 28200 }, { "epoch": 0.8163383252783136, "grad_norm": 21.234375, "learning_rate": 1.2755332406220087e-07, "loss": 15.5015, "step": 28250 }, { "epoch": 0.8177831718717266, "grad_norm": 21.203125, "learning_rate": 1.2777908215788618e-07, "loss": 15.3711, "step": 28300 }, { "epoch": 0.8192280184651395, "grad_norm": 23.84375, "learning_rate": 1.280048402535715e-07, "loss": 15.4745, "step": 28350 }, { "epoch": 0.8206728650585524, "grad_norm": 22.03125, "learning_rate": 1.2823059834925678e-07, "loss": 15.5285, "step": 28400 }, { "epoch": 0.8221177116519653, "grad_norm": 20.9375, "learning_rate": 1.284563564449421e-07, "loss": 15.5214, "step": 28450 }, { "epoch": 0.8235625582453783, "grad_norm": 25.546875, "learning_rate": 1.286821145406274e-07, "loss": 15.4168, "step": 28500 }, { "epoch": 0.8250074048387912, "grad_norm": 24.265625, "learning_rate": 1.2890787263631271e-07, "loss": 15.5043, "step": 28550 }, { "epoch": 0.8264522514322041, "grad_norm": 23.265625, "learning_rate": 1.2913363073199804e-07, "loss": 15.4206, "step": 28600 }, { "epoch": 0.8278970980256172, "grad_norm": 22.0, "learning_rate": 1.2935938882768334e-07, "loss": 15.4444, "step": 28650 }, { "epoch": 0.8293419446190301, "grad_norm": 25.09375, "learning_rate": 1.2958514692336867e-07, "loss": 15.4043, "step": 28700 }, { "epoch": 0.830786791212443, "grad_norm": 21.046875, "learning_rate": 1.2981090501905398e-07, "loss": 15.5465, "step": 28750 }, { "epoch": 0.832231637805856, "grad_norm": 21.234375, "learning_rate": 1.300366631147393e-07, "loss": 15.4988, "step": 28800 }, { "epoch": 0.8336764843992689, "grad_norm": 21.046875, "learning_rate": 1.302624212104246e-07, "loss": 15.4368, "step": 28850 }, { "epoch": 0.8351213309926818, "grad_norm": 23.46875, "learning_rate": 1.304881793061099e-07, "loss": 15.4251, "step": 28900 }, { "epoch": 0.8365661775860948, "grad_norm": 23.046875, "learning_rate": 1.3071393740179524e-07, "loss": 15.4271, "step": 28950 }, { "epoch": 0.8380110241795078, "grad_norm": 21.0, "learning_rate": 1.3093969549748054e-07, "loss": 15.4439, "step": 29000 }, { "epoch": 0.8394558707729207, "grad_norm": 21.96875, "learning_rate": 1.3116545359316584e-07, "loss": 15.4197, "step": 29050 }, { "epoch": 0.8409007173663336, "grad_norm": 21.109375, "learning_rate": 1.3139121168885115e-07, "loss": 15.428, "step": 29100 }, { "epoch": 0.8423455639597466, "grad_norm": 21.984375, "learning_rate": 1.3161696978453645e-07, "loss": 15.3989, "step": 29150 }, { "epoch": 0.8437904105531595, "grad_norm": 21.921875, "learning_rate": 1.3184272788022178e-07, "loss": 15.4178, "step": 29200 }, { "epoch": 0.8452352571465724, "grad_norm": 20.765625, "learning_rate": 1.3206848597590708e-07, "loss": 15.3614, "step": 29250 }, { "epoch": 0.8466801037399854, "grad_norm": 21.390625, "learning_rate": 1.322942440715924e-07, "loss": 15.4306, "step": 29300 }, { "epoch": 0.8481249503333983, "grad_norm": 28.84375, "learning_rate": 1.325200021672777e-07, "loss": 15.4706, "step": 29350 }, { "epoch": 0.8495697969268113, "grad_norm": 22.921875, "learning_rate": 1.32745760262963e-07, "loss": 15.4703, "step": 29400 }, { "epoch": 0.8510146435202243, "grad_norm": 36.5625, "learning_rate": 1.3297151835864834e-07, "loss": 15.45, "step": 29450 }, { "epoch": 0.8524594901136372, "grad_norm": 24.25, "learning_rate": 1.3319727645433364e-07, "loss": 15.4233, "step": 29500 }, { "epoch": 0.8539043367070501, "grad_norm": 34.96875, "learning_rate": 1.3342303455001897e-07, "loss": 15.4693, "step": 29550 }, { "epoch": 0.8553491833004631, "grad_norm": 21.953125, "learning_rate": 1.3364879264570427e-07, "loss": 15.4402, "step": 29600 }, { "epoch": 0.856794029893876, "grad_norm": 22.40625, "learning_rate": 1.338745507413896e-07, "loss": 15.4761, "step": 29650 }, { "epoch": 0.8582388764872889, "grad_norm": 20.625, "learning_rate": 1.341003088370749e-07, "loss": 15.3796, "step": 29700 }, { "epoch": 0.859683723080702, "grad_norm": 21.328125, "learning_rate": 1.3432606693276018e-07, "loss": 15.3857, "step": 29750 }, { "epoch": 0.8611285696741149, "grad_norm": 23.375, "learning_rate": 1.345518250284455e-07, "loss": 15.414, "step": 29800 }, { "epoch": 0.8625734162675278, "grad_norm": 22.671875, "learning_rate": 1.347775831241308e-07, "loss": 15.3401, "step": 29850 }, { "epoch": 0.8640182628609407, "grad_norm": 22.65625, "learning_rate": 1.3500334121981614e-07, "loss": 15.346, "step": 29900 }, { "epoch": 0.8654631094543537, "grad_norm": 23.890625, "learning_rate": 1.3522909931550144e-07, "loss": 15.42, "step": 29950 }, { "epoch": 0.8669079560477666, "grad_norm": 20.515625, "learning_rate": 1.3545485741118675e-07, "loss": 15.389, "step": 30000 }, { "epoch": 0.8669079560477666, "eval_loss": 1.9219062328338623, "eval_runtime": 349.965, "eval_samples_per_second": 2664.65, "eval_steps_per_second": 41.636, "step": 30000 }, { "epoch": 0.8683528026411795, "grad_norm": 21.8125, "learning_rate": 1.3568061550687207e-07, "loss": 15.327, "step": 30050 }, { "epoch": 0.8697976492345925, "grad_norm": 22.8125, "learning_rate": 1.3590637360255738e-07, "loss": 15.3229, "step": 30100 }, { "epoch": 0.8712424958280055, "grad_norm": 23.671875, "learning_rate": 1.361321316982427e-07, "loss": 15.3576, "step": 30150 }, { "epoch": 0.8726873424214184, "grad_norm": 22.609375, "learning_rate": 1.36357889793928e-07, "loss": 15.3539, "step": 30200 }, { "epoch": 0.8741321890148314, "grad_norm": 20.65625, "learning_rate": 1.365836478896133e-07, "loss": 15.2693, "step": 30250 }, { "epoch": 0.8755770356082443, "grad_norm": 22.5625, "learning_rate": 1.3680940598529864e-07, "loss": 15.4018, "step": 30300 }, { "epoch": 0.8770218822016572, "grad_norm": 19.875, "learning_rate": 1.3703516408098394e-07, "loss": 15.4242, "step": 30350 }, { "epoch": 0.8784667287950702, "grad_norm": 27.234375, "learning_rate": 1.3726092217666924e-07, "loss": 15.3294, "step": 30400 }, { "epoch": 0.8799115753884831, "grad_norm": 23.375, "learning_rate": 1.3748668027235455e-07, "loss": 15.3841, "step": 30450 }, { "epoch": 0.8813564219818961, "grad_norm": 23.125, "learning_rate": 1.3771243836803988e-07, "loss": 15.3368, "step": 30500 }, { "epoch": 0.882801268575309, "grad_norm": 23.171875, "learning_rate": 1.3793819646372518e-07, "loss": 15.343, "step": 30550 }, { "epoch": 0.884246115168722, "grad_norm": 29.78125, "learning_rate": 1.3816395455941048e-07, "loss": 15.3782, "step": 30600 }, { "epoch": 0.8856909617621349, "grad_norm": 22.453125, "learning_rate": 1.383897126550958e-07, "loss": 15.4537, "step": 30650 }, { "epoch": 0.8871358083555478, "grad_norm": 21.265625, "learning_rate": 1.386154707507811e-07, "loss": 15.4048, "step": 30700 }, { "epoch": 0.8885806549489608, "grad_norm": 24.25, "learning_rate": 1.3884122884646644e-07, "loss": 15.3433, "step": 30750 }, { "epoch": 0.8900255015423737, "grad_norm": 25.1875, "learning_rate": 1.3906698694215174e-07, "loss": 15.3141, "step": 30800 }, { "epoch": 0.8914703481357866, "grad_norm": 24.25, "learning_rate": 1.3929274503783704e-07, "loss": 15.2703, "step": 30850 }, { "epoch": 0.8929151947291997, "grad_norm": 22.3125, "learning_rate": 1.3951850313352237e-07, "loss": 15.4022, "step": 30900 }, { "epoch": 0.8943600413226126, "grad_norm": 19.859375, "learning_rate": 1.3974426122920768e-07, "loss": 15.2936, "step": 30950 }, { "epoch": 0.8958048879160255, "grad_norm": 20.5, "learning_rate": 1.39970019324893e-07, "loss": 15.3219, "step": 31000 }, { "epoch": 0.8972497345094385, "grad_norm": 21.71875, "learning_rate": 1.4019577742057828e-07, "loss": 15.2468, "step": 31050 }, { "epoch": 0.8986945811028514, "grad_norm": 23.421875, "learning_rate": 1.4042153551626358e-07, "loss": 15.2591, "step": 31100 }, { "epoch": 0.9001394276962643, "grad_norm": 23.09375, "learning_rate": 1.406472936119489e-07, "loss": 15.3318, "step": 31150 }, { "epoch": 0.9015842742896772, "grad_norm": 24.09375, "learning_rate": 1.4087305170763421e-07, "loss": 15.2105, "step": 31200 }, { "epoch": 0.9030291208830903, "grad_norm": 22.671875, "learning_rate": 1.4109880980331954e-07, "loss": 15.2557, "step": 31250 }, { "epoch": 0.9044739674765032, "grad_norm": 22.0625, "learning_rate": 1.4132456789900484e-07, "loss": 15.4014, "step": 31300 }, { "epoch": 0.9059188140699161, "grad_norm": 21.796875, "learning_rate": 1.4155032599469017e-07, "loss": 15.2382, "step": 31350 }, { "epoch": 0.9073636606633291, "grad_norm": 24.5, "learning_rate": 1.4177608409037548e-07, "loss": 15.395, "step": 31400 }, { "epoch": 0.908808507256742, "grad_norm": 21.828125, "learning_rate": 1.4200184218606078e-07, "loss": 15.2785, "step": 31450 }, { "epoch": 0.9102533538501549, "grad_norm": 22.5625, "learning_rate": 1.422276002817461e-07, "loss": 15.2983, "step": 31500 }, { "epoch": 0.9116982004435679, "grad_norm": 21.328125, "learning_rate": 1.424533583774314e-07, "loss": 15.382, "step": 31550 }, { "epoch": 0.9131430470369809, "grad_norm": 21.3125, "learning_rate": 1.4267911647311674e-07, "loss": 15.2084, "step": 31600 }, { "epoch": 0.9145878936303938, "grad_norm": 22.6875, "learning_rate": 1.4290487456880204e-07, "loss": 15.2803, "step": 31650 }, { "epoch": 0.9160327402238068, "grad_norm": 20.953125, "learning_rate": 1.4313063266448734e-07, "loss": 15.3734, "step": 31700 }, { "epoch": 0.9174775868172197, "grad_norm": 22.765625, "learning_rate": 1.4335639076017265e-07, "loss": 15.3248, "step": 31750 }, { "epoch": 0.9189224334106326, "grad_norm": 21.640625, "learning_rate": 1.4358214885585795e-07, "loss": 15.2958, "step": 31800 }, { "epoch": 0.9203672800040456, "grad_norm": 21.53125, "learning_rate": 1.4380790695154328e-07, "loss": 15.2854, "step": 31850 }, { "epoch": 0.9218121265974585, "grad_norm": 21.265625, "learning_rate": 1.4403366504722858e-07, "loss": 15.2983, "step": 31900 }, { "epoch": 0.9232569731908714, "grad_norm": 26.265625, "learning_rate": 1.4425942314291388e-07, "loss": 15.1967, "step": 31950 }, { "epoch": 0.9247018197842845, "grad_norm": 21.078125, "learning_rate": 1.444851812385992e-07, "loss": 15.2861, "step": 32000 }, { "epoch": 0.9261466663776974, "grad_norm": 22.53125, "learning_rate": 1.447109393342845e-07, "loss": 15.203, "step": 32050 }, { "epoch": 0.9275915129711103, "grad_norm": 20.46875, "learning_rate": 1.4493669742996984e-07, "loss": 15.3343, "step": 32100 }, { "epoch": 0.9290363595645232, "grad_norm": 21.5625, "learning_rate": 1.4516245552565514e-07, "loss": 15.1377, "step": 32150 }, { "epoch": 0.9304812061579362, "grad_norm": 23.609375, "learning_rate": 1.4538821362134047e-07, "loss": 15.267, "step": 32200 }, { "epoch": 0.9319260527513491, "grad_norm": 22.59375, "learning_rate": 1.4561397171702577e-07, "loss": 15.3935, "step": 32250 }, { "epoch": 0.933370899344762, "grad_norm": 23.90625, "learning_rate": 1.4583972981271108e-07, "loss": 15.2605, "step": 32300 }, { "epoch": 0.9348157459381751, "grad_norm": 23.171875, "learning_rate": 1.460654879083964e-07, "loss": 15.2479, "step": 32350 }, { "epoch": 0.936260592531588, "grad_norm": 22.734375, "learning_rate": 1.4629124600408168e-07, "loss": 15.1586, "step": 32400 }, { "epoch": 0.9377054391250009, "grad_norm": 22.078125, "learning_rate": 1.46517004099767e-07, "loss": 15.2709, "step": 32450 }, { "epoch": 0.9391502857184139, "grad_norm": 20.484375, "learning_rate": 1.467427621954523e-07, "loss": 15.3932, "step": 32500 }, { "epoch": 0.9405951323118268, "grad_norm": 20.171875, "learning_rate": 1.4696852029113762e-07, "loss": 15.3021, "step": 32550 }, { "epoch": 0.9420399789052397, "grad_norm": 20.375, "learning_rate": 1.4719427838682294e-07, "loss": 15.3676, "step": 32600 }, { "epoch": 0.9434848254986526, "grad_norm": 18.40625, "learning_rate": 1.4742003648250825e-07, "loss": 15.225, "step": 32650 }, { "epoch": 0.9449296720920656, "grad_norm": 22.921875, "learning_rate": 1.4764579457819357e-07, "loss": 15.2833, "step": 32700 }, { "epoch": 0.9463745186854786, "grad_norm": 21.359375, "learning_rate": 1.4787155267387888e-07, "loss": 15.2648, "step": 32750 }, { "epoch": 0.9478193652788915, "grad_norm": 18.03125, "learning_rate": 1.480973107695642e-07, "loss": 15.1701, "step": 32800 }, { "epoch": 0.9492642118723045, "grad_norm": 20.734375, "learning_rate": 1.483230688652495e-07, "loss": 15.3169, "step": 32850 }, { "epoch": 0.9507090584657174, "grad_norm": 30.40625, "learning_rate": 1.485488269609348e-07, "loss": 15.1193, "step": 32900 }, { "epoch": 0.9521539050591303, "grad_norm": 18.96875, "learning_rate": 1.4877458505662014e-07, "loss": 15.1477, "step": 32950 }, { "epoch": 0.9535987516525433, "grad_norm": 22.125, "learning_rate": 1.4900034315230544e-07, "loss": 15.1363, "step": 33000 }, { "epoch": 0.9535987516525433, "eval_loss": 1.9005507230758667, "eval_runtime": 343.9939, "eval_samples_per_second": 2710.903, "eval_steps_per_second": 42.358, "step": 33000 }, { "epoch": 0.9550435982459562, "grad_norm": 20.171875, "learning_rate": 1.4922610124799077e-07, "loss": 15.2704, "step": 33050 }, { "epoch": 0.9564884448393692, "grad_norm": 25.21875, "learning_rate": 1.4945185934367605e-07, "loss": 15.2413, "step": 33100 }, { "epoch": 0.9579332914327822, "grad_norm": 20.984375, "learning_rate": 1.4967761743936135e-07, "loss": 15.0358, "step": 33150 }, { "epoch": 0.9593781380261951, "grad_norm": 22.65625, "learning_rate": 1.4990337553504668e-07, "loss": 15.148, "step": 33200 }, { "epoch": 0.960822984619608, "grad_norm": 24.671875, "learning_rate": 1.5012913363073198e-07, "loss": 15.0575, "step": 33250 }, { "epoch": 0.962267831213021, "grad_norm": 21.28125, "learning_rate": 1.503548917264173e-07, "loss": 15.1119, "step": 33300 }, { "epoch": 0.9637126778064339, "grad_norm": 24.21875, "learning_rate": 1.505806498221026e-07, "loss": 15.21, "step": 33350 }, { "epoch": 0.9651575243998468, "grad_norm": 21.6875, "learning_rate": 1.5080640791778791e-07, "loss": 15.1355, "step": 33400 }, { "epoch": 0.9666023709932597, "grad_norm": 24.390625, "learning_rate": 1.5103216601347324e-07, "loss": 15.2218, "step": 33450 }, { "epoch": 0.9680472175866728, "grad_norm": 19.25, "learning_rate": 1.5125792410915854e-07, "loss": 15.1256, "step": 33500 }, { "epoch": 0.9694920641800857, "grad_norm": 19.984375, "learning_rate": 1.5148368220484387e-07, "loss": 15.1171, "step": 33550 }, { "epoch": 0.9709369107734986, "grad_norm": 19.640625, "learning_rate": 1.5170944030052918e-07, "loss": 15.0999, "step": 33600 }, { "epoch": 0.9723817573669116, "grad_norm": 24.265625, "learning_rate": 1.519351983962145e-07, "loss": 15.2255, "step": 33650 }, { "epoch": 0.9738266039603245, "grad_norm": 25.546875, "learning_rate": 1.521609564918998e-07, "loss": 15.0743, "step": 33700 }, { "epoch": 0.9752714505537374, "grad_norm": 21.578125, "learning_rate": 1.5238671458758508e-07, "loss": 15.145, "step": 33750 }, { "epoch": 0.9767162971471504, "grad_norm": 24.46875, "learning_rate": 1.526124726832704e-07, "loss": 15.2408, "step": 33800 }, { "epoch": 0.9781611437405634, "grad_norm": 21.984375, "learning_rate": 1.5283823077895571e-07, "loss": 15.1413, "step": 33850 }, { "epoch": 0.9796059903339763, "grad_norm": 21.828125, "learning_rate": 1.5306398887464104e-07, "loss": 15.1452, "step": 33900 }, { "epoch": 0.9810508369273893, "grad_norm": 22.125, "learning_rate": 1.5328974697032635e-07, "loss": 15.1786, "step": 33950 }, { "epoch": 0.9824956835208022, "grad_norm": 27.046875, "learning_rate": 1.5351550506601165e-07, "loss": 15.069, "step": 34000 }, { "epoch": 0.9839405301142151, "grad_norm": 21.65625, "learning_rate": 1.5374126316169698e-07, "loss": 15.1776, "step": 34050 }, { "epoch": 0.985385376707628, "grad_norm": 21.953125, "learning_rate": 1.5396702125738228e-07, "loss": 15.1561, "step": 34100 }, { "epoch": 0.986830223301041, "grad_norm": 25.75, "learning_rate": 1.541927793530676e-07, "loss": 15.2242, "step": 34150 }, { "epoch": 0.9882750698944539, "grad_norm": 23.484375, "learning_rate": 1.544185374487529e-07, "loss": 15.1583, "step": 34200 }, { "epoch": 0.989719916487867, "grad_norm": 27.984375, "learning_rate": 1.546442955444382e-07, "loss": 15.1159, "step": 34250 }, { "epoch": 0.9911647630812799, "grad_norm": 21.34375, "learning_rate": 1.5487005364012354e-07, "loss": 15.0641, "step": 34300 }, { "epoch": 0.9926096096746928, "grad_norm": 20.25, "learning_rate": 1.5509581173580884e-07, "loss": 15.139, "step": 34350 }, { "epoch": 0.9940544562681057, "grad_norm": 21.640625, "learning_rate": 1.5532156983149415e-07, "loss": 15.0966, "step": 34400 }, { "epoch": 0.9954993028615187, "grad_norm": 22.125, "learning_rate": 1.5554732792717945e-07, "loss": 15.1072, "step": 34450 }, { "epoch": 0.9969441494549316, "grad_norm": 21.859375, "learning_rate": 1.5577308602286478e-07, "loss": 15.1392, "step": 34500 }, { "epoch": 0.9983889960483445, "grad_norm": 21.703125, "learning_rate": 1.5599884411855008e-07, "loss": 15.2097, "step": 34550 }, { "epoch": 0.9998338426417576, "grad_norm": 23.140625, "learning_rate": 1.5622460221423538e-07, "loss": 15.0667, "step": 34600 } ], "logging_steps": 50, "max_steps": 34605, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.042588809533587e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }