Training in progress, step 34605, checkpoint (commit 83e69d3, verified)
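
The file below appears to be a Hugging Face Transformers trainer_state.json from a training checkpoint: log_history interleaves training records (keyed by loss, grad_norm, learning_rate) with periodic evaluation records (keyed by eval_loss and throughput fields) emitted every eval_steps = 3000 steps. As a minimal sketch of how one might consume this log, the Python snippet below splits the two record types and reports the lowest evaluation loss seen so far; the local filename trainer_state.json is an assumption, not part of the original file.

# Minimal sketch (not from the original file): load the trainer state,
# assuming it is saved locally as "trainer_state.json" (filename assumed),
# and split log_history into training vs. evaluation records.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
train_logs = [e for e in history if "loss" in e]       # training records
eval_logs = [e for e in history if "eval_loss" in e]   # evaluation records

print(f"{len(train_logs)} train entries, {len(eval_logs)} eval entries")
if eval_logs:
    best = min(eval_logs, key=lambda e: e["eval_loss"])
    print(f"lowest eval_loss so far: {best['eval_loss']:.4f} at step {best['step']}")
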
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999783273010988,
"eval_steps": 3000,
"global_step": 34605,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014448465934129443,
"grad_norm": 23.53125,
"learning_rate": 2.2575809568531125e-10,
"loss": 18.2618,
"step": 50
},
{
"epoch": 0.0028896931868258886,
"grad_norm": 23.046875,
"learning_rate": 4.515161913706225e-10,
"loss": 18.3279,
"step": 100
},
{
"epoch": 0.004334539780238833,
"grad_norm": 23.703125,
"learning_rate": 6.772742870559338e-10,
"loss": 18.2094,
"step": 150
},
{
"epoch": 0.005779386373651777,
"grad_norm": 21.046875,
"learning_rate": 9.03032382741245e-10,
"loss": 18.0812,
"step": 200
},
{
"epoch": 0.007224232967064722,
"grad_norm": 21.34375,
"learning_rate": 1.1287904784265563e-09,
"loss": 18.0178,
"step": 250
},
{
"epoch": 0.008669079560477666,
"grad_norm": 23.390625,
"learning_rate": 1.3545485741118676e-09,
"loss": 18.0735,
"step": 300
},
{
"epoch": 0.01011392615389061,
"grad_norm": 22.078125,
"learning_rate": 1.5803066697971788e-09,
"loss": 18.0169,
"step": 350
},
{
"epoch": 0.011558772747303554,
"grad_norm": 19.1875,
"learning_rate": 1.80606476548249e-09,
"loss": 17.9299,
"step": 400
},
{
"epoch": 0.0130036193407165,
"grad_norm": 22.75,
"learning_rate": 2.0318228611678016e-09,
"loss": 17.7586,
"step": 450
},
{
"epoch": 0.014448465934129445,
"grad_norm": 20.984375,
"learning_rate": 2.2575809568531127e-09,
"loss": 17.7956,
"step": 500
},
{
"epoch": 0.015893312527542388,
"grad_norm": 22.453125,
"learning_rate": 2.4833390525384237e-09,
"loss": 17.7873,
"step": 550
},
{
"epoch": 0.01733815912095533,
"grad_norm": 19.71875,
"learning_rate": 2.709097148223735e-09,
"loss": 17.7643,
"step": 600
},
{
"epoch": 0.018783005714368278,
"grad_norm": 20.1875,
"learning_rate": 2.9348552439090465e-09,
"loss": 17.6861,
"step": 650
},
{
"epoch": 0.02022785230778122,
"grad_norm": 20.8125,
"learning_rate": 3.1606133395943576e-09,
"loss": 17.6883,
"step": 700
},
{
"epoch": 0.021672698901194165,
"grad_norm": 23.875,
"learning_rate": 3.386371435279669e-09,
"loss": 17.6176,
"step": 750
},
{
"epoch": 0.02311754549460711,
"grad_norm": 19.96875,
"learning_rate": 3.61212953096498e-09,
"loss": 17.6334,
"step": 800
},
{
"epoch": 0.024562392088020055,
"grad_norm": 20.734375,
"learning_rate": 3.837887626650292e-09,
"loss": 17.5943,
"step": 850
},
{
"epoch": 0.026007238681433,
"grad_norm": 21.09375,
"learning_rate": 4.063645722335603e-09,
"loss": 17.5436,
"step": 900
},
{
"epoch": 0.027452085274845942,
"grad_norm": 19.34375,
"learning_rate": 4.289403818020914e-09,
"loss": 17.6069,
"step": 950
},
{
"epoch": 0.02889693186825889,
"grad_norm": 23.109375,
"learning_rate": 4.515161913706225e-09,
"loss": 17.4956,
"step": 1000
},
{
"epoch": 0.030341778461671833,
"grad_norm": 23.53125,
"learning_rate": 4.740920009391537e-09,
"loss": 17.624,
"step": 1050
},
{
"epoch": 0.031786625055084776,
"grad_norm": 19.40625,
"learning_rate": 4.966678105076847e-09,
"loss": 17.6303,
"step": 1100
},
{
"epoch": 0.03323147164849772,
"grad_norm": 20.9375,
"learning_rate": 5.19243620076216e-09,
"loss": 17.5501,
"step": 1150
},
{
"epoch": 0.03467631824191066,
"grad_norm": 18.9375,
"learning_rate": 5.41819429644747e-09,
"loss": 17.5943,
"step": 1200
},
{
"epoch": 0.036121164835323606,
"grad_norm": 25.625,
"learning_rate": 5.643952392132782e-09,
"loss": 17.4464,
"step": 1250
},
{
"epoch": 0.037566011428736557,
"grad_norm": 18.203125,
"learning_rate": 5.869710487818093e-09,
"loss": 17.5614,
"step": 1300
},
{
"epoch": 0.0390108580221495,
"grad_norm": 19.4375,
"learning_rate": 6.095468583503404e-09,
"loss": 17.3942,
"step": 1350
},
{
"epoch": 0.04045570461556244,
"grad_norm": 20.71875,
"learning_rate": 6.321226679188715e-09,
"loss": 17.3659,
"step": 1400
},
{
"epoch": 0.04190055120897539,
"grad_norm": 17.828125,
"learning_rate": 6.546984774874027e-09,
"loss": 17.432,
"step": 1450
},
{
"epoch": 0.04334539780238833,
"grad_norm": 22.234375,
"learning_rate": 6.772742870559338e-09,
"loss": 17.5557,
"step": 1500
},
{
"epoch": 0.044790244395801274,
"grad_norm": 21.0625,
"learning_rate": 6.998500966244649e-09,
"loss": 17.4979,
"step": 1550
},
{
"epoch": 0.04623509098921422,
"grad_norm": 18.84375,
"learning_rate": 7.22425906192996e-09,
"loss": 17.2844,
"step": 1600
},
{
"epoch": 0.04767993758262717,
"grad_norm": 22.40625,
"learning_rate": 7.4500171576152714e-09,
"loss": 17.3339,
"step": 1650
},
{
"epoch": 0.04912478417604011,
"grad_norm": 19.390625,
"learning_rate": 7.675775253300584e-09,
"loss": 17.313,
"step": 1700
},
{
"epoch": 0.050569630769453054,
"grad_norm": 19.328125,
"learning_rate": 7.901533348985894e-09,
"loss": 17.3682,
"step": 1750
},
{
"epoch": 0.052014477362866,
"grad_norm": 22.421875,
"learning_rate": 8.127291444671207e-09,
"loss": 17.3144,
"step": 1800
},
{
"epoch": 0.05345932395627894,
"grad_norm": 19.4375,
"learning_rate": 8.353049540356517e-09,
"loss": 17.3066,
"step": 1850
},
{
"epoch": 0.054904170549691884,
"grad_norm": 19.984375,
"learning_rate": 8.578807636041828e-09,
"loss": 17.4404,
"step": 1900
},
{
"epoch": 0.05634901714310483,
"grad_norm": 19.921875,
"learning_rate": 8.804565731727138e-09,
"loss": 17.3098,
"step": 1950
},
{
"epoch": 0.05779386373651778,
"grad_norm": 18.578125,
"learning_rate": 9.03032382741245e-09,
"loss": 17.2569,
"step": 2000
},
{
"epoch": 0.05923871032993072,
"grad_norm": 21.890625,
"learning_rate": 9.256081923097763e-09,
"loss": 17.3394,
"step": 2050
},
{
"epoch": 0.060683556923343665,
"grad_norm": 25.4375,
"learning_rate": 9.481840018783073e-09,
"loss": 17.3198,
"step": 2100
},
{
"epoch": 0.06212840351675661,
"grad_norm": 21.171875,
"learning_rate": 9.707598114468384e-09,
"loss": 17.3099,
"step": 2150
},
{
"epoch": 0.06357325011016955,
"grad_norm": 26.203125,
"learning_rate": 9.933356210153695e-09,
"loss": 17.2845,
"step": 2200
},
{
"epoch": 0.0650180967035825,
"grad_norm": 21.296875,
"learning_rate": 1.0159114305839007e-08,
"loss": 17.2587,
"step": 2250
},
{
"epoch": 0.06646294329699544,
"grad_norm": 22.65625,
"learning_rate": 1.038487240152432e-08,
"loss": 17.3986,
"step": 2300
},
{
"epoch": 0.06790778989040838,
"grad_norm": 23.625,
"learning_rate": 1.061063049720963e-08,
"loss": 17.3046,
"step": 2350
},
{
"epoch": 0.06935263648382133,
"grad_norm": 21.203125,
"learning_rate": 1.083638859289494e-08,
"loss": 17.2043,
"step": 2400
},
{
"epoch": 0.07079748307723427,
"grad_norm": 24.640625,
"learning_rate": 1.1062146688580251e-08,
"loss": 17.2274,
"step": 2450
},
{
"epoch": 0.07224232967064721,
"grad_norm": 20.03125,
"learning_rate": 1.1287904784265563e-08,
"loss": 17.2168,
"step": 2500
},
{
"epoch": 0.07368717626406017,
"grad_norm": 23.5,
"learning_rate": 1.1513662879950874e-08,
"loss": 17.1795,
"step": 2550
},
{
"epoch": 0.07513202285747311,
"grad_norm": 19.890625,
"learning_rate": 1.1739420975636186e-08,
"loss": 17.2713,
"step": 2600
},
{
"epoch": 0.07657686945088606,
"grad_norm": 20.796875,
"learning_rate": 1.1965179071321498e-08,
"loss": 17.371,
"step": 2650
},
{
"epoch": 0.078021716044299,
"grad_norm": 21.640625,
"learning_rate": 1.2190937167006807e-08,
"loss": 17.199,
"step": 2700
},
{
"epoch": 0.07946656263771194,
"grad_norm": 24.96875,
"learning_rate": 1.241669526269212e-08,
"loss": 17.2428,
"step": 2750
},
{
"epoch": 0.08091140923112489,
"grad_norm": 18.640625,
"learning_rate": 1.264245335837743e-08,
"loss": 17.2682,
"step": 2800
},
{
"epoch": 0.08235625582453783,
"grad_norm": 22.984375,
"learning_rate": 1.2868211454062742e-08,
"loss": 17.1985,
"step": 2850
},
{
"epoch": 0.08380110241795077,
"grad_norm": 52.03125,
"learning_rate": 1.3093969549748055e-08,
"loss": 17.2148,
"step": 2900
},
{
"epoch": 0.08524594901136372,
"grad_norm": 20.71875,
"learning_rate": 1.3319727645433364e-08,
"loss": 17.1465,
"step": 2950
},
{
"epoch": 0.08669079560477666,
"grad_norm": 21.015625,
"learning_rate": 1.3545485741118676e-08,
"loss": 17.1231,
"step": 3000
},
{
"epoch": 0.08669079560477666,
"eval_loss": 2.1503310203552246,
"eval_runtime": 340.0537,
"eval_samples_per_second": 2742.314,
"eval_steps_per_second": 42.849,
"step": 3000
},
{
"epoch": 0.0881356421981896,
"grad_norm": 19.109375,
"learning_rate": 1.3771243836803987e-08,
"loss": 17.2298,
"step": 3050
},
{
"epoch": 0.08958048879160255,
"grad_norm": 20.453125,
"learning_rate": 1.3997001932489299e-08,
"loss": 17.0646,
"step": 3100
},
{
"epoch": 0.09102533538501549,
"grad_norm": 22.234375,
"learning_rate": 1.4222760028174611e-08,
"loss": 17.2016,
"step": 3150
},
{
"epoch": 0.09247018197842843,
"grad_norm": 18.921875,
"learning_rate": 1.444851812385992e-08,
"loss": 17.1826,
"step": 3200
},
{
"epoch": 0.09391502857184139,
"grad_norm": 18.0,
"learning_rate": 1.4674276219545232e-08,
"loss": 17.1939,
"step": 3250
},
{
"epoch": 0.09535987516525433,
"grad_norm": 19.859375,
"learning_rate": 1.4900034315230543e-08,
"loss": 17.104,
"step": 3300
},
{
"epoch": 0.09680472175866728,
"grad_norm": 25.421875,
"learning_rate": 1.5125792410915855e-08,
"loss": 17.1392,
"step": 3350
},
{
"epoch": 0.09824956835208022,
"grad_norm": 23.34375,
"learning_rate": 1.5351550506601167e-08,
"loss": 17.2487,
"step": 3400
},
{
"epoch": 0.09969441494549316,
"grad_norm": 21.78125,
"learning_rate": 1.5577308602286476e-08,
"loss": 17.1119,
"step": 3450
},
{
"epoch": 0.10113926153890611,
"grad_norm": 21.125,
"learning_rate": 1.580306669797179e-08,
"loss": 17.2458,
"step": 3500
},
{
"epoch": 0.10258410813231905,
"grad_norm": 20.96875,
"learning_rate": 1.6028824793657098e-08,
"loss": 17.1168,
"step": 3550
},
{
"epoch": 0.104028954725732,
"grad_norm": 23.03125,
"learning_rate": 1.6254582889342413e-08,
"loss": 17.1953,
"step": 3600
},
{
"epoch": 0.10547380131914494,
"grad_norm": 20.828125,
"learning_rate": 1.6480340985027722e-08,
"loss": 17.1476,
"step": 3650
},
{
"epoch": 0.10691864791255788,
"grad_norm": 20.09375,
"learning_rate": 1.6706099080713034e-08,
"loss": 17.1084,
"step": 3700
},
{
"epoch": 0.10836349450597083,
"grad_norm": 21.5,
"learning_rate": 1.6931857176398343e-08,
"loss": 17.3216,
"step": 3750
},
{
"epoch": 0.10980834109938377,
"grad_norm": 19.453125,
"learning_rate": 1.7157615272083656e-08,
"loss": 17.1359,
"step": 3800
},
{
"epoch": 0.11125318769279671,
"grad_norm": 19.640625,
"learning_rate": 1.7383373367768968e-08,
"loss": 17.1561,
"step": 3850
},
{
"epoch": 0.11269803428620966,
"grad_norm": 21.0,
"learning_rate": 1.7609131463454277e-08,
"loss": 17.0558,
"step": 3900
},
{
"epoch": 0.11414288087962261,
"grad_norm": 22.21875,
"learning_rate": 1.7834889559139592e-08,
"loss": 17.149,
"step": 3950
},
{
"epoch": 0.11558772747303556,
"grad_norm": 23.109375,
"learning_rate": 1.80606476548249e-08,
"loss": 17.0748,
"step": 4000
},
{
"epoch": 0.1170325740664485,
"grad_norm": 19.265625,
"learning_rate": 1.828640575051021e-08,
"loss": 17.1038,
"step": 4050
},
{
"epoch": 0.11847742065986144,
"grad_norm": 22.34375,
"learning_rate": 1.8512163846195526e-08,
"loss": 17.0105,
"step": 4100
},
{
"epoch": 0.11992226725327439,
"grad_norm": 20.25,
"learning_rate": 1.8737921941880835e-08,
"loss": 17.0041,
"step": 4150
},
{
"epoch": 0.12136711384668733,
"grad_norm": 23.484375,
"learning_rate": 1.8963680037566147e-08,
"loss": 17.1879,
"step": 4200
},
{
"epoch": 0.12281196044010027,
"grad_norm": 23.859375,
"learning_rate": 1.9189438133251456e-08,
"loss": 17.0091,
"step": 4250
},
{
"epoch": 0.12425680703351322,
"grad_norm": 23.609375,
"learning_rate": 1.9415196228936768e-08,
"loss": 16.9834,
"step": 4300
},
{
"epoch": 0.12570165362692617,
"grad_norm": 20.171875,
"learning_rate": 1.964095432462208e-08,
"loss": 16.9839,
"step": 4350
},
{
"epoch": 0.1271465002203391,
"grad_norm": 20.75,
"learning_rate": 1.986671242030739e-08,
"loss": 16.9732,
"step": 4400
},
{
"epoch": 0.12859134681375206,
"grad_norm": 21.078125,
"learning_rate": 2.0092470515992705e-08,
"loss": 17.0603,
"step": 4450
},
{
"epoch": 0.130036193407165,
"grad_norm": 19.796875,
"learning_rate": 2.0318228611678014e-08,
"loss": 17.0293,
"step": 4500
},
{
"epoch": 0.13148104000057795,
"grad_norm": 21.53125,
"learning_rate": 2.0543986707363323e-08,
"loss": 17.0322,
"step": 4550
},
{
"epoch": 0.13292588659399088,
"grad_norm": 23.171875,
"learning_rate": 2.076974480304864e-08,
"loss": 17.0431,
"step": 4600
},
{
"epoch": 0.13437073318740383,
"grad_norm": 20.203125,
"learning_rate": 2.0995502898733947e-08,
"loss": 17.0135,
"step": 4650
},
{
"epoch": 0.13581557978081676,
"grad_norm": 20.1875,
"learning_rate": 2.122126099441926e-08,
"loss": 16.905,
"step": 4700
},
{
"epoch": 0.13726042637422972,
"grad_norm": 20.703125,
"learning_rate": 2.144701909010457e-08,
"loss": 17.082,
"step": 4750
},
{
"epoch": 0.13870527296764265,
"grad_norm": 21.265625,
"learning_rate": 2.167277718578988e-08,
"loss": 16.9678,
"step": 4800
},
{
"epoch": 0.1401501195610556,
"grad_norm": 20.640625,
"learning_rate": 2.1898535281475193e-08,
"loss": 17.0,
"step": 4850
},
{
"epoch": 0.14159496615446854,
"grad_norm": 20.390625,
"learning_rate": 2.2124293377160502e-08,
"loss": 16.9941,
"step": 4900
},
{
"epoch": 0.1430398127478815,
"grad_norm": 21.15625,
"learning_rate": 2.2350051472845818e-08,
"loss": 17.0649,
"step": 4950
},
{
"epoch": 0.14448465934129442,
"grad_norm": 19.21875,
"learning_rate": 2.2575809568531127e-08,
"loss": 17.0574,
"step": 5000
},
{
"epoch": 0.14592950593470738,
"grad_norm": 19.59375,
"learning_rate": 2.2801567664216436e-08,
"loss": 17.1142,
"step": 5050
},
{
"epoch": 0.14737435252812034,
"grad_norm": 20.828125,
"learning_rate": 2.3027325759901748e-08,
"loss": 16.957,
"step": 5100
},
{
"epoch": 0.14881919912153327,
"grad_norm": 21.125,
"learning_rate": 2.325308385558706e-08,
"loss": 17.1079,
"step": 5150
},
{
"epoch": 0.15026404571494623,
"grad_norm": 21.890625,
"learning_rate": 2.3478841951272372e-08,
"loss": 17.0049,
"step": 5200
},
{
"epoch": 0.15170889230835916,
"grad_norm": 22.828125,
"learning_rate": 2.370460004695768e-08,
"loss": 17.0316,
"step": 5250
},
{
"epoch": 0.1531537389017721,
"grad_norm": 22.703125,
"learning_rate": 2.3930358142642997e-08,
"loss": 17.0376,
"step": 5300
},
{
"epoch": 0.15459858549518504,
"grad_norm": 20.90625,
"learning_rate": 2.4156116238328306e-08,
"loss": 17.1354,
"step": 5350
},
{
"epoch": 0.156043432088598,
"grad_norm": 22.609375,
"learning_rate": 2.4381874334013615e-08,
"loss": 16.9243,
"step": 5400
},
{
"epoch": 0.15748827868201093,
"grad_norm": 20.859375,
"learning_rate": 2.460763242969893e-08,
"loss": 17.0623,
"step": 5450
},
{
"epoch": 0.1589331252754239,
"grad_norm": 22.15625,
"learning_rate": 2.483339052538424e-08,
"loss": 17.0041,
"step": 5500
},
{
"epoch": 0.16037797186883682,
"grad_norm": 23.078125,
"learning_rate": 2.505914862106955e-08,
"loss": 16.945,
"step": 5550
},
{
"epoch": 0.16182281846224977,
"grad_norm": 20.5,
"learning_rate": 2.528490671675486e-08,
"loss": 16.886,
"step": 5600
},
{
"epoch": 0.1632676650556627,
"grad_norm": 19.859375,
"learning_rate": 2.5510664812440173e-08,
"loss": 16.8898,
"step": 5650
},
{
"epoch": 0.16471251164907566,
"grad_norm": 22.875,
"learning_rate": 2.5736422908125485e-08,
"loss": 17.0453,
"step": 5700
},
{
"epoch": 0.16615735824248862,
"grad_norm": 21.015625,
"learning_rate": 2.5962181003810794e-08,
"loss": 17.01,
"step": 5750
},
{
"epoch": 0.16760220483590155,
"grad_norm": 24.078125,
"learning_rate": 2.618793909949611e-08,
"loss": 17.0377,
"step": 5800
},
{
"epoch": 0.1690470514293145,
"grad_norm": 19.53125,
"learning_rate": 2.641369719518142e-08,
"loss": 16.8983,
"step": 5850
},
{
"epoch": 0.17049189802272743,
"grad_norm": 19.234375,
"learning_rate": 2.6639455290866727e-08,
"loss": 17.0153,
"step": 5900
},
{
"epoch": 0.1719367446161404,
"grad_norm": 21.40625,
"learning_rate": 2.6865213386552043e-08,
"loss": 16.9406,
"step": 5950
},
{
"epoch": 0.17338159120955332,
"grad_norm": 21.703125,
"learning_rate": 2.7090971482237352e-08,
"loss": 16.9213,
"step": 6000
},
{
"epoch": 0.17338159120955332,
"eval_loss": 2.117004871368408,
"eval_runtime": 351.2723,
"eval_samples_per_second": 2654.733,
"eval_steps_per_second": 41.481,
"step": 6000
},
{
"epoch": 0.17482643780296628,
"grad_norm": 23.40625,
"learning_rate": 2.7316729577922664e-08,
"loss": 16.9717,
"step": 6050
},
{
"epoch": 0.1762712843963792,
"grad_norm": 21.09375,
"learning_rate": 2.7542487673607973e-08,
"loss": 17.0021,
"step": 6100
},
{
"epoch": 0.17771613098979216,
"grad_norm": 20.078125,
"learning_rate": 2.7768245769293285e-08,
"loss": 16.9975,
"step": 6150
},
{
"epoch": 0.1791609775832051,
"grad_norm": 20.46875,
"learning_rate": 2.7994003864978598e-08,
"loss": 16.9298,
"step": 6200
},
{
"epoch": 0.18060582417661805,
"grad_norm": 21.453125,
"learning_rate": 2.8219761960663907e-08,
"loss": 16.8852,
"step": 6250
},
{
"epoch": 0.18205067077003098,
"grad_norm": 20.90625,
"learning_rate": 2.8445520056349222e-08,
"loss": 17.0017,
"step": 6300
},
{
"epoch": 0.18349551736344394,
"grad_norm": 23.765625,
"learning_rate": 2.867127815203453e-08,
"loss": 16.9424,
"step": 6350
},
{
"epoch": 0.18494036395685687,
"grad_norm": 21.328125,
"learning_rate": 2.889703624771984e-08,
"loss": 16.9559,
"step": 6400
},
{
"epoch": 0.18638521055026983,
"grad_norm": 21.8125,
"learning_rate": 2.9122794343405156e-08,
"loss": 16.9595,
"step": 6450
},
{
"epoch": 0.18783005714368278,
"grad_norm": 22.53125,
"learning_rate": 2.9348552439090465e-08,
"loss": 16.8973,
"step": 6500
},
{
"epoch": 0.1892749037370957,
"grad_norm": 21.3125,
"learning_rate": 2.9574310534775777e-08,
"loss": 16.9388,
"step": 6550
},
{
"epoch": 0.19071975033050867,
"grad_norm": 25.25,
"learning_rate": 2.9800068630461086e-08,
"loss": 16.8936,
"step": 6600
},
{
"epoch": 0.1921645969239216,
"grad_norm": 21.921875,
"learning_rate": 3.0025826726146395e-08,
"loss": 16.9861,
"step": 6650
},
{
"epoch": 0.19360944351733456,
"grad_norm": 22.34375,
"learning_rate": 3.025158482183171e-08,
"loss": 16.7772,
"step": 6700
},
{
"epoch": 0.19505429011074749,
"grad_norm": 21.921875,
"learning_rate": 3.047734291751702e-08,
"loss": 16.8555,
"step": 6750
},
{
"epoch": 0.19649913670416044,
"grad_norm": 25.28125,
"learning_rate": 3.0703101013202335e-08,
"loss": 16.7765,
"step": 6800
},
{
"epoch": 0.19794398329757337,
"grad_norm": 21.625,
"learning_rate": 3.0928859108887644e-08,
"loss": 16.868,
"step": 6850
},
{
"epoch": 0.19938882989098633,
"grad_norm": 25.046875,
"learning_rate": 3.115461720457295e-08,
"loss": 16.9386,
"step": 6900
},
{
"epoch": 0.20083367648439926,
"grad_norm": 23.671875,
"learning_rate": 3.138037530025826e-08,
"loss": 16.9085,
"step": 6950
},
{
"epoch": 0.20227852307781222,
"grad_norm": 22.234375,
"learning_rate": 3.160613339594358e-08,
"loss": 16.8305,
"step": 7000
},
{
"epoch": 0.20372336967122515,
"grad_norm": 18.609375,
"learning_rate": 3.183189149162889e-08,
"loss": 16.7971,
"step": 7050
},
{
"epoch": 0.2051682162646381,
"grad_norm": 21.78125,
"learning_rate": 3.2057649587314195e-08,
"loss": 16.8924,
"step": 7100
},
{
"epoch": 0.20661306285805103,
"grad_norm": 19.921875,
"learning_rate": 3.228340768299951e-08,
"loss": 16.9471,
"step": 7150
},
{
"epoch": 0.208057909451464,
"grad_norm": 20.75,
"learning_rate": 3.2509165778684826e-08,
"loss": 16.8681,
"step": 7200
},
{
"epoch": 0.20950275604487695,
"grad_norm": 20.859375,
"learning_rate": 3.2734923874370135e-08,
"loss": 16.7762,
"step": 7250
},
{
"epoch": 0.21094760263828988,
"grad_norm": 19.671875,
"learning_rate": 3.2960681970055444e-08,
"loss": 16.8033,
"step": 7300
},
{
"epoch": 0.21239244923170283,
"grad_norm": 22.171875,
"learning_rate": 3.318644006574075e-08,
"loss": 16.9011,
"step": 7350
},
{
"epoch": 0.21383729582511576,
"grad_norm": 20.09375,
"learning_rate": 3.341219816142607e-08,
"loss": 16.7137,
"step": 7400
},
{
"epoch": 0.21528214241852872,
"grad_norm": 22.828125,
"learning_rate": 3.363795625711138e-08,
"loss": 16.8546,
"step": 7450
},
{
"epoch": 0.21672698901194165,
"grad_norm": 22.640625,
"learning_rate": 3.3863714352796687e-08,
"loss": 16.9191,
"step": 7500
},
{
"epoch": 0.2181718356053546,
"grad_norm": 28.5,
"learning_rate": 3.4089472448482e-08,
"loss": 16.8509,
"step": 7550
},
{
"epoch": 0.21961668219876754,
"grad_norm": 20.484375,
"learning_rate": 3.431523054416731e-08,
"loss": 16.8247,
"step": 7600
},
{
"epoch": 0.2210615287921805,
"grad_norm": 20.5625,
"learning_rate": 3.454098863985262e-08,
"loss": 16.7724,
"step": 7650
},
{
"epoch": 0.22250637538559342,
"grad_norm": 22.0625,
"learning_rate": 3.4766746735537936e-08,
"loss": 16.7584,
"step": 7700
},
{
"epoch": 0.22395122197900638,
"grad_norm": 20.375,
"learning_rate": 3.499250483122325e-08,
"loss": 16.7699,
"step": 7750
},
{
"epoch": 0.2253960685724193,
"grad_norm": 20.65625,
"learning_rate": 3.5218262926908553e-08,
"loss": 16.8213,
"step": 7800
},
{
"epoch": 0.22684091516583227,
"grad_norm": 22.171875,
"learning_rate": 3.544402102259387e-08,
"loss": 16.9104,
"step": 7850
},
{
"epoch": 0.22828576175924523,
"grad_norm": 22.109375,
"learning_rate": 3.5669779118279185e-08,
"loss": 16.7777,
"step": 7900
},
{
"epoch": 0.22973060835265816,
"grad_norm": 22.109375,
"learning_rate": 3.589553721396449e-08,
"loss": 16.7741,
"step": 7950
},
{
"epoch": 0.2311754549460711,
"grad_norm": 20.90625,
"learning_rate": 3.61212953096498e-08,
"loss": 16.7486,
"step": 8000
},
{
"epoch": 0.23262030153948404,
"grad_norm": 21.265625,
"learning_rate": 3.634705340533512e-08,
"loss": 16.7256,
"step": 8050
},
{
"epoch": 0.234065148132897,
"grad_norm": 23.40625,
"learning_rate": 3.657281150102042e-08,
"loss": 16.7431,
"step": 8100
},
{
"epoch": 0.23550999472630993,
"grad_norm": 21.0625,
"learning_rate": 3.6798569596705736e-08,
"loss": 16.777,
"step": 8150
},
{
"epoch": 0.2369548413197229,
"grad_norm": 23.125,
"learning_rate": 3.702432769239105e-08,
"loss": 16.6821,
"step": 8200
},
{
"epoch": 0.23839968791313582,
"grad_norm": 19.203125,
"learning_rate": 3.725008578807636e-08,
"loss": 16.7046,
"step": 8250
},
{
"epoch": 0.23984453450654877,
"grad_norm": 21.015625,
"learning_rate": 3.747584388376167e-08,
"loss": 16.7624,
"step": 8300
},
{
"epoch": 0.2412893810999617,
"grad_norm": 20.90625,
"learning_rate": 3.770160197944698e-08,
"loss": 16.801,
"step": 8350
},
{
"epoch": 0.24273422769337466,
"grad_norm": 22.703125,
"learning_rate": 3.7927360075132294e-08,
"loss": 16.7837,
"step": 8400
},
{
"epoch": 0.2441790742867876,
"grad_norm": 23.5,
"learning_rate": 3.81531181708176e-08,
"loss": 16.7856,
"step": 8450
},
{
"epoch": 0.24562392088020055,
"grad_norm": 19.84375,
"learning_rate": 3.837887626650291e-08,
"loss": 16.7071,
"step": 8500
},
{
"epoch": 0.24706876747361348,
"grad_norm": 21.78125,
"learning_rate": 3.860463436218823e-08,
"loss": 16.7162,
"step": 8550
},
{
"epoch": 0.24851361406702643,
"grad_norm": 21.109375,
"learning_rate": 3.8830392457873536e-08,
"loss": 16.7331,
"step": 8600
},
{
"epoch": 0.2499584606604394,
"grad_norm": 21.875,
"learning_rate": 3.9056150553558845e-08,
"loss": 16.7945,
"step": 8650
},
{
"epoch": 0.25140330725385235,
"grad_norm": 21.1875,
"learning_rate": 3.928190864924416e-08,
"loss": 16.7675,
"step": 8700
},
{
"epoch": 0.2528481538472653,
"grad_norm": 21.375,
"learning_rate": 3.9507666744929476e-08,
"loss": 16.7202,
"step": 8750
},
{
"epoch": 0.2542930004406782,
"grad_norm": 21.25,
"learning_rate": 3.973342484061478e-08,
"loss": 16.6576,
"step": 8800
},
{
"epoch": 0.25573784703409114,
"grad_norm": 20.84375,
"learning_rate": 3.9959182936300094e-08,
"loss": 16.6249,
"step": 8850
},
{
"epoch": 0.2571826936275041,
"grad_norm": 20.703125,
"learning_rate": 4.018494103198541e-08,
"loss": 16.7702,
"step": 8900
},
{
"epoch": 0.25862754022091705,
"grad_norm": 23.203125,
"learning_rate": 4.041069912767071e-08,
"loss": 16.687,
"step": 8950
},
{
"epoch": 0.26007238681433,
"grad_norm": 24.96875,
"learning_rate": 4.063645722335603e-08,
"loss": 16.6843,
"step": 9000
},
{
"epoch": 0.26007238681433,
"eval_loss": 2.087214946746826,
"eval_runtime": 348.1727,
"eval_samples_per_second": 2678.367,
"eval_steps_per_second": 41.85,
"step": 9000
},
{
"epoch": 0.2615172334077429,
"grad_norm": 22.578125,
"learning_rate": 4.0862215319041343e-08,
"loss": 16.6685,
"step": 9050
},
{
"epoch": 0.2629620800011559,
"grad_norm": 21.125,
"learning_rate": 4.1087973414726646e-08,
"loss": 16.8334,
"step": 9100
},
{
"epoch": 0.2644069265945688,
"grad_norm": 23.84375,
"learning_rate": 4.131373151041196e-08,
"loss": 16.5566,
"step": 9150
},
{
"epoch": 0.26585177318798175,
"grad_norm": 18.640625,
"learning_rate": 4.153948960609728e-08,
"loss": 16.7514,
"step": 9200
},
{
"epoch": 0.2672966197813947,
"grad_norm": 22.53125,
"learning_rate": 4.1765247701782586e-08,
"loss": 16.688,
"step": 9250
},
{
"epoch": 0.26874146637480767,
"grad_norm": 23.46875,
"learning_rate": 4.1991005797467895e-08,
"loss": 16.6654,
"step": 9300
},
{
"epoch": 0.2701863129682206,
"grad_norm": 20.984375,
"learning_rate": 4.2216763893153204e-08,
"loss": 16.7596,
"step": 9350
},
{
"epoch": 0.27163115956163353,
"grad_norm": 21.9375,
"learning_rate": 4.244252198883852e-08,
"loss": 16.6031,
"step": 9400
},
{
"epoch": 0.2730760061550465,
"grad_norm": 20.703125,
"learning_rate": 4.266828008452383e-08,
"loss": 16.702,
"step": 9450
},
{
"epoch": 0.27452085274845944,
"grad_norm": 22.328125,
"learning_rate": 4.289403818020914e-08,
"loss": 16.6338,
"step": 9500
},
{
"epoch": 0.2759656993418724,
"grad_norm": 20.0,
"learning_rate": 4.311979627589445e-08,
"loss": 16.7581,
"step": 9550
},
{
"epoch": 0.2774105459352853,
"grad_norm": 20.46875,
"learning_rate": 4.334555437157976e-08,
"loss": 16.7627,
"step": 9600
},
{
"epoch": 0.2788553925286983,
"grad_norm": 21.90625,
"learning_rate": 4.357131246726507e-08,
"loss": 16.7124,
"step": 9650
},
{
"epoch": 0.2803002391221112,
"grad_norm": 22.46875,
"learning_rate": 4.3797070562950386e-08,
"loss": 16.7091,
"step": 9700
},
{
"epoch": 0.28174508571552415,
"grad_norm": 23.078125,
"learning_rate": 4.40228286586357e-08,
"loss": 16.6876,
"step": 9750
},
{
"epoch": 0.2831899323089371,
"grad_norm": 19.53125,
"learning_rate": 4.4248586754321004e-08,
"loss": 16.5888,
"step": 9800
},
{
"epoch": 0.28463477890235006,
"grad_norm": 22.921875,
"learning_rate": 4.447434485000632e-08,
"loss": 16.7229,
"step": 9850
},
{
"epoch": 0.286079625495763,
"grad_norm": 21.265625,
"learning_rate": 4.4700102945691635e-08,
"loss": 16.6842,
"step": 9900
},
{
"epoch": 0.2875244720891759,
"grad_norm": 20.5625,
"learning_rate": 4.492586104137694e-08,
"loss": 16.614,
"step": 9950
},
{
"epoch": 0.28896931868258885,
"grad_norm": 21.203125,
"learning_rate": 4.515161913706225e-08,
"loss": 16.6419,
"step": 10000
},
{
"epoch": 0.29041416527600183,
"grad_norm": 20.375,
"learning_rate": 4.537737723274757e-08,
"loss": 16.6317,
"step": 10050
},
{
"epoch": 0.29185901186941476,
"grad_norm": 21.375,
"learning_rate": 4.560313532843287e-08,
"loss": 16.6707,
"step": 10100
},
{
"epoch": 0.2933038584628277,
"grad_norm": 23.4375,
"learning_rate": 4.5828893424118187e-08,
"loss": 16.6595,
"step": 10150
},
{
"epoch": 0.2947487050562407,
"grad_norm": 20.703125,
"learning_rate": 4.6054651519803496e-08,
"loss": 16.639,
"step": 10200
},
{
"epoch": 0.2961935516496536,
"grad_norm": 20.453125,
"learning_rate": 4.628040961548881e-08,
"loss": 16.6537,
"step": 10250
},
{
"epoch": 0.29763839824306654,
"grad_norm": 21.234375,
"learning_rate": 4.650616771117412e-08,
"loss": 16.6099,
"step": 10300
},
{
"epoch": 0.29908324483647947,
"grad_norm": 22.84375,
"learning_rate": 4.673192580685943e-08,
"loss": 16.6629,
"step": 10350
},
{
"epoch": 0.30052809142989245,
"grad_norm": 20.796875,
"learning_rate": 4.6957683902544745e-08,
"loss": 16.6335,
"step": 10400
},
{
"epoch": 0.3019729380233054,
"grad_norm": 21.65625,
"learning_rate": 4.7183441998230054e-08,
"loss": 16.6122,
"step": 10450
},
{
"epoch": 0.3034177846167183,
"grad_norm": 23.046875,
"learning_rate": 4.740920009391536e-08,
"loss": 16.6171,
"step": 10500
},
{
"epoch": 0.30486263121013124,
"grad_norm": 20.71875,
"learning_rate": 4.763495818960068e-08,
"loss": 16.5997,
"step": 10550
},
{
"epoch": 0.3063074778035442,
"grad_norm": 22.71875,
"learning_rate": 4.7860716285285994e-08,
"loss": 16.615,
"step": 10600
},
{
"epoch": 0.30775232439695716,
"grad_norm": 21.53125,
"learning_rate": 4.8086474380971296e-08,
"loss": 16.6133,
"step": 10650
},
{
"epoch": 0.3091971709903701,
"grad_norm": 22.828125,
"learning_rate": 4.831223247665661e-08,
"loss": 16.5908,
"step": 10700
},
{
"epoch": 0.310642017583783,
"grad_norm": 22.859375,
"learning_rate": 4.853799057234193e-08,
"loss": 16.5617,
"step": 10750
},
{
"epoch": 0.312086864177196,
"grad_norm": 30.734375,
"learning_rate": 4.876374866802723e-08,
"loss": 16.5051,
"step": 10800
},
{
"epoch": 0.31353171077060893,
"grad_norm": 20.203125,
"learning_rate": 4.8989506763712545e-08,
"loss": 16.6504,
"step": 10850
},
{
"epoch": 0.31497655736402186,
"grad_norm": 26.859375,
"learning_rate": 4.921526485939786e-08,
"loss": 16.6061,
"step": 10900
},
{
"epoch": 0.31642140395743484,
"grad_norm": 20.828125,
"learning_rate": 4.944102295508316e-08,
"loss": 16.6288,
"step": 10950
},
{
"epoch": 0.3178662505508478,
"grad_norm": 25.515625,
"learning_rate": 4.966678105076848e-08,
"loss": 16.5228,
"step": 11000
},
{
"epoch": 0.3193110971442607,
"grad_norm": 23.671875,
"learning_rate": 4.9892539146453794e-08,
"loss": 16.4985,
"step": 11050
},
{
"epoch": 0.32075594373767363,
"grad_norm": 23.25,
"learning_rate": 5.01182972421391e-08,
"loss": 16.6149,
"step": 11100
},
{
"epoch": 0.3222007903310866,
"grad_norm": 20.375,
"learning_rate": 5.034405533782441e-08,
"loss": 16.5877,
"step": 11150
},
{
"epoch": 0.32364563692449955,
"grad_norm": 20.53125,
"learning_rate": 5.056981343350972e-08,
"loss": 16.4745,
"step": 11200
},
{
"epoch": 0.3250904835179125,
"grad_norm": 19.71875,
"learning_rate": 5.0795571529195036e-08,
"loss": 16.5593,
"step": 11250
},
{
"epoch": 0.3265353301113254,
"grad_norm": 20.53125,
"learning_rate": 5.1021329624880345e-08,
"loss": 16.4393,
"step": 11300
},
{
"epoch": 0.3279801767047384,
"grad_norm": 22.078125,
"learning_rate": 5.1247087720565654e-08,
"loss": 16.5429,
"step": 11350
},
{
"epoch": 0.3294250232981513,
"grad_norm": 20.5,
"learning_rate": 5.147284581625097e-08,
"loss": 16.4117,
"step": 11400
},
{
"epoch": 0.33086986989156425,
"grad_norm": 23.640625,
"learning_rate": 5.169860391193628e-08,
"loss": 16.5424,
"step": 11450
},
{
"epoch": 0.33231471648497724,
"grad_norm": 21.625,
"learning_rate": 5.192436200762159e-08,
"loss": 16.6651,
"step": 11500
},
{
"epoch": 0.33375956307839016,
"grad_norm": 24.453125,
"learning_rate": 5.2150120103306903e-08,
"loss": 16.5083,
"step": 11550
},
{
"epoch": 0.3352044096718031,
"grad_norm": 22.46875,
"learning_rate": 5.237587819899222e-08,
"loss": 16.4798,
"step": 11600
},
{
"epoch": 0.336649256265216,
"grad_norm": 22.703125,
"learning_rate": 5.260163629467752e-08,
"loss": 16.6141,
"step": 11650
},
{
"epoch": 0.338094102858629,
"grad_norm": 23.921875,
"learning_rate": 5.282739439036284e-08,
"loss": 16.6017,
"step": 11700
},
{
"epoch": 0.33953894945204194,
"grad_norm": 24.140625,
"learning_rate": 5.305315248604815e-08,
"loss": 16.55,
"step": 11750
},
{
"epoch": 0.34098379604545487,
"grad_norm": 21.9375,
"learning_rate": 5.3278910581733455e-08,
"loss": 16.5809,
"step": 11800
},
{
"epoch": 0.3424286426388678,
"grad_norm": 22.3125,
"learning_rate": 5.350466867741877e-08,
"loss": 16.5002,
"step": 11850
},
{
"epoch": 0.3438734892322808,
"grad_norm": 20.984375,
"learning_rate": 5.3730426773104086e-08,
"loss": 16.6775,
"step": 11900
},
{
"epoch": 0.3453183358256937,
"grad_norm": 20.28125,
"learning_rate": 5.395618486878939e-08,
"loss": 16.5301,
"step": 11950
},
{
"epoch": 0.34676318241910664,
"grad_norm": 21.578125,
"learning_rate": 5.4181942964474704e-08,
"loss": 16.501,
"step": 12000
},
{
"epoch": 0.34676318241910664,
"eval_loss": 2.0641098022460938,
"eval_runtime": 340.7339,
"eval_samples_per_second": 2736.839,
"eval_steps_per_second": 42.764,
"step": 12000
},
{
"epoch": 0.34820802901251957,
"grad_norm": 22.203125,
"learning_rate": 5.440770106016002e-08,
"loss": 16.6513,
"step": 12050
},
{
"epoch": 0.34965287560593256,
"grad_norm": 22.296875,
"learning_rate": 5.463345915584533e-08,
"loss": 16.584,
"step": 12100
},
{
"epoch": 0.3510977221993455,
"grad_norm": 22.953125,
"learning_rate": 5.485921725153064e-08,
"loss": 16.5184,
"step": 12150
},
{
"epoch": 0.3525425687927584,
"grad_norm": 19.984375,
"learning_rate": 5.5084975347215946e-08,
"loss": 16.4685,
"step": 12200
},
{
"epoch": 0.3539874153861714,
"grad_norm": 22.515625,
"learning_rate": 5.531073344290126e-08,
"loss": 16.5087,
"step": 12250
},
{
"epoch": 0.35543226197958433,
"grad_norm": 22.984375,
"learning_rate": 5.553649153858657e-08,
"loss": 16.4561,
"step": 12300
},
{
"epoch": 0.35687710857299726,
"grad_norm": 20.421875,
"learning_rate": 5.576224963427188e-08,
"loss": 16.502,
"step": 12350
},
{
"epoch": 0.3583219551664102,
"grad_norm": 21.234375,
"learning_rate": 5.5988007729957195e-08,
"loss": 16.4946,
"step": 12400
},
{
"epoch": 0.3597668017598232,
"grad_norm": 20.953125,
"learning_rate": 5.6213765825642504e-08,
"loss": 16.4988,
"step": 12450
},
{
"epoch": 0.3612116483532361,
"grad_norm": 23.421875,
"learning_rate": 5.643952392132781e-08,
"loss": 16.5329,
"step": 12500
},
{
"epoch": 0.36265649494664903,
"grad_norm": 31.59375,
"learning_rate": 5.666528201701313e-08,
"loss": 16.4695,
"step": 12550
},
{
"epoch": 0.36410134154006196,
"grad_norm": 22.203125,
"learning_rate": 5.6891040112698444e-08,
"loss": 16.4166,
"step": 12600
},
{
"epoch": 0.36554618813347495,
"grad_norm": 23.828125,
"learning_rate": 5.7116798208383747e-08,
"loss": 16.3887,
"step": 12650
},
{
"epoch": 0.3669910347268879,
"grad_norm": 25.0,
"learning_rate": 5.734255630406906e-08,
"loss": 16.3849,
"step": 12700
},
{
"epoch": 0.3684358813203008,
"grad_norm": 21.875,
"learning_rate": 5.756831439975438e-08,
"loss": 16.489,
"step": 12750
},
{
"epoch": 0.36988072791371374,
"grad_norm": 22.859375,
"learning_rate": 5.779407249543968e-08,
"loss": 16.4234,
"step": 12800
},
{
"epoch": 0.3713255745071267,
"grad_norm": 20.53125,
"learning_rate": 5.8019830591124996e-08,
"loss": 16.3971,
"step": 12850
},
{
"epoch": 0.37277042110053965,
"grad_norm": 23.0625,
"learning_rate": 5.824558868681031e-08,
"loss": 16.4946,
"step": 12900
},
{
"epoch": 0.3742152676939526,
"grad_norm": 21.78125,
"learning_rate": 5.8471346782495613e-08,
"loss": 16.5603,
"step": 12950
},
{
"epoch": 0.37566011428736557,
"grad_norm": 21.296875,
"learning_rate": 5.869710487818093e-08,
"loss": 16.3374,
"step": 13000
},
{
"epoch": 0.3771049608807785,
"grad_norm": 22.90625,
"learning_rate": 5.8922862973866245e-08,
"loss": 16.3641,
"step": 13050
},
{
"epoch": 0.3785498074741914,
"grad_norm": 22.84375,
"learning_rate": 5.9148621069551554e-08,
"loss": 16.4834,
"step": 13100
},
{
"epoch": 0.37999465406760435,
"grad_norm": 20.40625,
"learning_rate": 5.937437916523686e-08,
"loss": 16.4651,
"step": 13150
},
{
"epoch": 0.38143950066101734,
"grad_norm": 23.140625,
"learning_rate": 5.960013726092217e-08,
"loss": 16.5031,
"step": 13200
},
{
"epoch": 0.38288434725443027,
"grad_norm": 22.625,
"learning_rate": 5.982589535660749e-08,
"loss": 16.3802,
"step": 13250
},
{
"epoch": 0.3843291938478432,
"grad_norm": 20.375,
"learning_rate": 6.005165345229279e-08,
"loss": 16.3925,
"step": 13300
},
{
"epoch": 0.3857740404412561,
"grad_norm": 20.734375,
"learning_rate": 6.02774115479781e-08,
"loss": 16.399,
"step": 13350
},
{
"epoch": 0.3872188870346691,
"grad_norm": 22.5,
"learning_rate": 6.050316964366342e-08,
"loss": 16.3255,
"step": 13400
},
{
"epoch": 0.38866373362808204,
"grad_norm": 22.0625,
"learning_rate": 6.072892773934872e-08,
"loss": 16.5031,
"step": 13450
},
{
"epoch": 0.39010858022149497,
"grad_norm": 23.171875,
"learning_rate": 6.095468583503404e-08,
"loss": 16.3906,
"step": 13500
},
{
"epoch": 0.3915534268149079,
"grad_norm": 23.46875,
"learning_rate": 6.118044393071935e-08,
"loss": 16.4107,
"step": 13550
},
{
"epoch": 0.3929982734083209,
"grad_norm": 24.046875,
"learning_rate": 6.140620202640467e-08,
"loss": 16.4128,
"step": 13600
},
{
"epoch": 0.3944431200017338,
"grad_norm": 23.53125,
"learning_rate": 6.163196012208997e-08,
"loss": 16.4372,
"step": 13650
},
{
"epoch": 0.39588796659514675,
"grad_norm": 23.734375,
"learning_rate": 6.185771821777529e-08,
"loss": 16.5197,
"step": 13700
},
{
"epoch": 0.39733281318855973,
"grad_norm": 22.0625,
"learning_rate": 6.20834763134606e-08,
"loss": 16.4171,
"step": 13750
},
{
"epoch": 0.39877765978197266,
"grad_norm": 21.953125,
"learning_rate": 6.23092344091459e-08,
"loss": 16.3508,
"step": 13800
},
{
"epoch": 0.4002225063753856,
"grad_norm": 23.796875,
"learning_rate": 6.253499250483122e-08,
"loss": 16.4303,
"step": 13850
},
{
"epoch": 0.4016673529687985,
"grad_norm": 21.484375,
"learning_rate": 6.276075060051652e-08,
"loss": 16.3384,
"step": 13900
},
{
"epoch": 0.4031121995622115,
"grad_norm": 21.3125,
"learning_rate": 6.298650869620184e-08,
"loss": 16.3742,
"step": 13950
},
{
"epoch": 0.40455704615562443,
"grad_norm": 21.53125,
"learning_rate": 6.321226679188715e-08,
"loss": 16.3369,
"step": 14000
},
{
"epoch": 0.40600189274903736,
"grad_norm": 24.578125,
"learning_rate": 6.343802488757247e-08,
"loss": 16.4018,
"step": 14050
},
{
"epoch": 0.4074467393424503,
"grad_norm": 26.078125,
"learning_rate": 6.366378298325779e-08,
"loss": 16.4958,
"step": 14100
},
{
"epoch": 0.4088915859358633,
"grad_norm": 20.296875,
"learning_rate": 6.388954107894309e-08,
"loss": 16.4418,
"step": 14150
},
{
"epoch": 0.4103364325292762,
"grad_norm": 26.171875,
"learning_rate": 6.411529917462839e-08,
"loss": 16.3343,
"step": 14200
},
{
"epoch": 0.41178127912268914,
"grad_norm": 19.671875,
"learning_rate": 6.43410572703137e-08,
"loss": 16.3505,
"step": 14250
},
{
"epoch": 0.41322612571610207,
"grad_norm": 21.265625,
"learning_rate": 6.456681536599902e-08,
"loss": 16.3802,
"step": 14300
},
{
"epoch": 0.41467097230951505,
"grad_norm": 22.59375,
"learning_rate": 6.479257346168434e-08,
"loss": 16.3253,
"step": 14350
},
{
"epoch": 0.416115818902928,
"grad_norm": 24.015625,
"learning_rate": 6.501833155736965e-08,
"loss": 16.3624,
"step": 14400
},
{
"epoch": 0.4175606654963409,
"grad_norm": 26.5,
"learning_rate": 6.524408965305495e-08,
"loss": 16.2562,
"step": 14450
},
{
"epoch": 0.4190055120897539,
"grad_norm": 19.75,
"learning_rate": 6.546984774874027e-08,
"loss": 16.3554,
"step": 14500
},
{
"epoch": 0.4204503586831668,
"grad_norm": 24.234375,
"learning_rate": 6.569560584442557e-08,
"loss": 16.4004,
"step": 14550
},
{
"epoch": 0.42189520527657975,
"grad_norm": 22.9375,
"learning_rate": 6.592136394011089e-08,
"loss": 16.3374,
"step": 14600
},
{
"epoch": 0.4233400518699927,
"grad_norm": 24.21875,
"learning_rate": 6.61471220357962e-08,
"loss": 16.2601,
"step": 14650
},
{
"epoch": 0.42478489846340567,
"grad_norm": 23.203125,
"learning_rate": 6.63728801314815e-08,
"loss": 16.331,
"step": 14700
},
{
"epoch": 0.4262297450568186,
"grad_norm": 23.640625,
"learning_rate": 6.659863822716682e-08,
"loss": 16.3419,
"step": 14750
},
{
"epoch": 0.42767459165023153,
"grad_norm": 25.453125,
"learning_rate": 6.682439632285214e-08,
"loss": 16.2981,
"step": 14800
},
{
"epoch": 0.42911943824364446,
"grad_norm": 25.765625,
"learning_rate": 6.705015441853745e-08,
"loss": 16.2609,
"step": 14850
},
{
"epoch": 0.43056428483705744,
"grad_norm": 20.46875,
"learning_rate": 6.727591251422276e-08,
"loss": 16.3126,
"step": 14900
},
{
"epoch": 0.43200913143047037,
"grad_norm": 23.25,
"learning_rate": 6.750167060990807e-08,
"loss": 16.2534,
"step": 14950
},
{
"epoch": 0.4334539780238833,
"grad_norm": 21.4375,
"learning_rate": 6.772742870559337e-08,
"loss": 16.2914,
"step": 15000
},
{
"epoch": 0.4334539780238833,
"eval_loss": 2.0395615100860596,
"eval_runtime": 351.3281,
"eval_samples_per_second": 2654.311,
"eval_steps_per_second": 41.474,
"step": 15000
},
{
"epoch": 0.43489882461729623,
"grad_norm": 20.796875,
"learning_rate": 6.795318680127869e-08,
"loss": 16.2964,
"step": 15050
},
{
"epoch": 0.4363436712107092,
"grad_norm": 21.578125,
"learning_rate": 6.8178944896964e-08,
"loss": 16.3424,
"step": 15100
},
{
"epoch": 0.43778851780412215,
"grad_norm": 21.703125,
"learning_rate": 6.840470299264932e-08,
"loss": 16.2815,
"step": 15150
},
{
"epoch": 0.4392333643975351,
"grad_norm": 22.578125,
"learning_rate": 6.863046108833462e-08,
"loss": 16.2694,
"step": 15200
},
{
"epoch": 0.44067821099094806,
"grad_norm": 26.828125,
"learning_rate": 6.885621918401994e-08,
"loss": 16.2776,
"step": 15250
},
{
"epoch": 0.442123057584361,
"grad_norm": 21.828125,
"learning_rate": 6.908197727970524e-08,
"loss": 16.3022,
"step": 15300
},
{
"epoch": 0.4435679041777739,
"grad_norm": 20.9375,
"learning_rate": 6.930773537539056e-08,
"loss": 16.2117,
"step": 15350
},
{
"epoch": 0.44501275077118685,
"grad_norm": 25.59375,
"learning_rate": 6.953349347107587e-08,
"loss": 16.3209,
"step": 15400
},
{
"epoch": 0.44645759736459983,
"grad_norm": 24.96875,
"learning_rate": 6.975925156676119e-08,
"loss": 16.2672,
"step": 15450
},
{
"epoch": 0.44790244395801276,
"grad_norm": 20.359375,
"learning_rate": 6.99850096624465e-08,
"loss": 16.2478,
"step": 15500
},
{
"epoch": 0.4493472905514257,
"grad_norm": 20.28125,
"learning_rate": 7.021076775813179e-08,
"loss": 16.2187,
"step": 15550
},
{
"epoch": 0.4507921371448386,
"grad_norm": 21.1875,
"learning_rate": 7.043652585381711e-08,
"loss": 16.2327,
"step": 15600
},
{
"epoch": 0.4522369837382516,
"grad_norm": 21.0625,
"learning_rate": 7.066228394950242e-08,
"loss": 16.321,
"step": 15650
},
{
"epoch": 0.45368183033166454,
"grad_norm": 24.265625,
"learning_rate": 7.088804204518774e-08,
"loss": 16.2536,
"step": 15700
},
{
"epoch": 0.45512667692507747,
"grad_norm": 22.953125,
"learning_rate": 7.111380014087305e-08,
"loss": 16.2889,
"step": 15750
},
{
"epoch": 0.45657152351849045,
"grad_norm": 22.265625,
"learning_rate": 7.133955823655837e-08,
"loss": 16.2099,
"step": 15800
},
{
"epoch": 0.4580163701119034,
"grad_norm": 21.25,
"learning_rate": 7.156531633224367e-08,
"loss": 16.2138,
"step": 15850
},
{
"epoch": 0.4594612167053163,
"grad_norm": 22.125,
"learning_rate": 7.179107442792897e-08,
"loss": 16.2212,
"step": 15900
},
{
"epoch": 0.46090606329872924,
"grad_norm": 21.515625,
"learning_rate": 7.201683252361429e-08,
"loss": 16.2495,
"step": 15950
},
{
"epoch": 0.4623509098921422,
"grad_norm": 22.609375,
"learning_rate": 7.22425906192996e-08,
"loss": 16.2045,
"step": 16000
},
{
"epoch": 0.46379575648555516,
"grad_norm": 22.15625,
"learning_rate": 7.246834871498492e-08,
"loss": 16.2562,
"step": 16050
},
{
"epoch": 0.4652406030789681,
"grad_norm": 22.703125,
"learning_rate": 7.269410681067024e-08,
"loss": 16.2227,
"step": 16100
},
{
"epoch": 0.466685449672381,
"grad_norm": 22.484375,
"learning_rate": 7.291986490635554e-08,
"loss": 16.2847,
"step": 16150
},
{
"epoch": 0.468130296265794,
"grad_norm": 23.125,
"learning_rate": 7.314562300204084e-08,
"loss": 16.2231,
"step": 16200
},
{
"epoch": 0.46957514285920693,
"grad_norm": 21.703125,
"learning_rate": 7.337138109772616e-08,
"loss": 16.2217,
"step": 16250
},
{
"epoch": 0.47101998945261986,
"grad_norm": 21.34375,
"learning_rate": 7.359713919341147e-08,
"loss": 16.0962,
"step": 16300
},
{
"epoch": 0.4724648360460328,
"grad_norm": 24.234375,
"learning_rate": 7.382289728909679e-08,
"loss": 16.1981,
"step": 16350
},
{
"epoch": 0.4739096826394458,
"grad_norm": 22.09375,
"learning_rate": 7.40486553847821e-08,
"loss": 16.228,
"step": 16400
},
{
"epoch": 0.4753545292328587,
"grad_norm": 20.671875,
"learning_rate": 7.42744134804674e-08,
"loss": 16.2862,
"step": 16450
},
{
"epoch": 0.47679937582627163,
"grad_norm": 21.265625,
"learning_rate": 7.450017157615272e-08,
"loss": 16.3092,
"step": 16500
},
{
"epoch": 0.4782442224196846,
"grad_norm": 24.90625,
"learning_rate": 7.472592967183802e-08,
"loss": 16.2476,
"step": 16550
},
{
"epoch": 0.47968906901309755,
"grad_norm": 23.328125,
"learning_rate": 7.495168776752334e-08,
"loss": 16.1906,
"step": 16600
},
{
"epoch": 0.4811339156065105,
"grad_norm": 23.140625,
"learning_rate": 7.517744586320865e-08,
"loss": 16.154,
"step": 16650
},
{
"epoch": 0.4825787621999234,
"grad_norm": 22.234375,
"learning_rate": 7.540320395889396e-08,
"loss": 16.2017,
"step": 16700
},
{
"epoch": 0.4840236087933364,
"grad_norm": 22.890625,
"learning_rate": 7.562896205457927e-08,
"loss": 16.345,
"step": 16750
},
{
"epoch": 0.4854684553867493,
"grad_norm": 21.625,
"learning_rate": 7.585472015026459e-08,
"loss": 16.2125,
"step": 16800
},
{
"epoch": 0.48691330198016225,
"grad_norm": 21.640625,
"learning_rate": 7.60804782459499e-08,
"loss": 16.217,
"step": 16850
},
{
"epoch": 0.4883581485735752,
"grad_norm": 23.421875,
"learning_rate": 7.63062363416352e-08,
"loss": 16.2419,
"step": 16900
},
{
"epoch": 0.48980299516698816,
"grad_norm": 25.9375,
"learning_rate": 7.653199443732052e-08,
"loss": 16.1463,
"step": 16950
},
{
"epoch": 0.4912478417604011,
"grad_norm": 22.703125,
"learning_rate": 7.675775253300582e-08,
"loss": 16.0852,
"step": 17000
},
{
"epoch": 0.492692688353814,
"grad_norm": 21.96875,
"learning_rate": 7.698351062869114e-08,
"loss": 16.2164,
"step": 17050
},
{
"epoch": 0.49413753494722695,
"grad_norm": 25.6875,
"learning_rate": 7.720926872437645e-08,
"loss": 16.1581,
"step": 17100
},
{
"epoch": 0.49558238154063994,
"grad_norm": 22.953125,
"learning_rate": 7.743502682006177e-08,
"loss": 16.1946,
"step": 17150
},
{
"epoch": 0.49702722813405287,
"grad_norm": 21.984375,
"learning_rate": 7.766078491574707e-08,
"loss": 16.2602,
"step": 17200
},
{
"epoch": 0.4984720747274658,
"grad_norm": 23.78125,
"learning_rate": 7.788654301143239e-08,
"loss": 16.1283,
"step": 17250
},
{
"epoch": 0.4999169213208788,
"grad_norm": 21.0625,
"learning_rate": 7.811230110711769e-08,
"loss": 16.1301,
"step": 17300
},
{
"epoch": 0.5013617679142917,
"grad_norm": 27.609375,
"learning_rate": 7.8338059202803e-08,
"loss": 16.0988,
"step": 17350
},
{
"epoch": 0.5028066145077047,
"grad_norm": 24.4375,
"learning_rate": 7.856381729848832e-08,
"loss": 16.207,
"step": 17400
},
{
"epoch": 0.5042514611011176,
"grad_norm": 23.765625,
"learning_rate": 7.878957539417364e-08,
"loss": 16.2054,
"step": 17450
},
{
"epoch": 0.5056963076945306,
"grad_norm": 22.140625,
"learning_rate": 7.901533348985895e-08,
"loss": 16.173,
"step": 17500
},
{
"epoch": 0.5071411542879435,
"grad_norm": 23.53125,
"learning_rate": 7.924109158554424e-08,
"loss": 16.043,
"step": 17550
},
{
"epoch": 0.5085860008813564,
"grad_norm": 21.78125,
"learning_rate": 7.946684968122956e-08,
"loss": 16.246,
"step": 17600
},
{
"epoch": 0.5100308474747693,
"grad_norm": 22.125,
"learning_rate": 7.969260777691487e-08,
"loss": 16.2203,
"step": 17650
},
{
"epoch": 0.5114756940681823,
"grad_norm": 24.15625,
"learning_rate": 7.991836587260019e-08,
"loss": 16.1279,
"step": 17700
},
{
"epoch": 0.5129205406615952,
"grad_norm": 22.1875,
"learning_rate": 8.01441239682855e-08,
"loss": 16.0409,
"step": 17750
},
{
"epoch": 0.5143653872550082,
"grad_norm": 23.640625,
"learning_rate": 8.036988206397082e-08,
"loss": 16.0298,
"step": 17800
},
{
"epoch": 0.5158102338484212,
"grad_norm": 24.015625,
"learning_rate": 8.059564015965612e-08,
"loss": 16.0894,
"step": 17850
},
{
"epoch": 0.5172550804418341,
"grad_norm": 25.109375,
"learning_rate": 8.082139825534142e-08,
"loss": 16.1767,
"step": 17900
},
{
"epoch": 0.518699927035247,
"grad_norm": 19.859375,
"learning_rate": 8.104715635102674e-08,
"loss": 16.0735,
"step": 17950
},
{
"epoch": 0.52014477362866,
"grad_norm": 21.375,
"learning_rate": 8.127291444671206e-08,
"loss": 16.1829,
"step": 18000
},
{
"epoch": 0.52014477362866,
"eval_loss": 2.015727996826172,
"eval_runtime": 347.0309,
"eval_samples_per_second": 2687.178,
"eval_steps_per_second": 41.988,
"step": 18000
},
{
"epoch": 0.5215896202220729,
"grad_norm": 20.765625,
"learning_rate": 8.149867254239737e-08,
"loss": 16.1783,
"step": 18050
},
{
"epoch": 0.5230344668154858,
"grad_norm": 21.921875,
"learning_rate": 8.172443063808269e-08,
"loss": 16.0167,
"step": 18100
},
{
"epoch": 0.5244793134088989,
"grad_norm": 21.1875,
"learning_rate": 8.195018873376799e-08,
"loss": 16.0192,
"step": 18150
},
{
"epoch": 0.5259241600023118,
"grad_norm": 22.78125,
"learning_rate": 8.217594682945329e-08,
"loss": 16.1586,
"step": 18200
},
{
"epoch": 0.5273690065957247,
"grad_norm": 21.15625,
"learning_rate": 8.240170492513861e-08,
"loss": 16.0389,
"step": 18250
},
{
"epoch": 0.5288138531891377,
"grad_norm": 24.90625,
"learning_rate": 8.262746302082392e-08,
"loss": 16.1815,
"step": 18300
},
{
"epoch": 0.5302586997825506,
"grad_norm": 21.171875,
"learning_rate": 8.285322111650924e-08,
"loss": 16.1257,
"step": 18350
},
{
"epoch": 0.5317035463759635,
"grad_norm": 23.96875,
"learning_rate": 8.307897921219455e-08,
"loss": 16.1506,
"step": 18400
},
{
"epoch": 0.5331483929693764,
"grad_norm": 23.3125,
"learning_rate": 8.330473730787986e-08,
"loss": 16.1117,
"step": 18450
},
{
"epoch": 0.5345932395627894,
"grad_norm": 23.328125,
"learning_rate": 8.353049540356517e-08,
"loss": 16.1372,
"step": 18500
},
{
"epoch": 0.5360380861562024,
"grad_norm": 21.265625,
"learning_rate": 8.375625349925047e-08,
"loss": 16.1619,
"step": 18550
},
{
"epoch": 0.5374829327496153,
"grad_norm": 22.53125,
"learning_rate": 8.398201159493579e-08,
"loss": 16.0791,
"step": 18600
},
{
"epoch": 0.5389277793430283,
"grad_norm": 27.203125,
"learning_rate": 8.42077696906211e-08,
"loss": 16.2036,
"step": 18650
},
{
"epoch": 0.5403726259364412,
"grad_norm": 23.25,
"learning_rate": 8.443352778630641e-08,
"loss": 16.0342,
"step": 18700
},
{
"epoch": 0.5418174725298541,
"grad_norm": 22.453125,
"learning_rate": 8.465928588199172e-08,
"loss": 16.1121,
"step": 18750
},
{
"epoch": 0.5432623191232671,
"grad_norm": 27.1875,
"learning_rate": 8.488504397767704e-08,
"loss": 16.1368,
"step": 18800
},
{
"epoch": 0.54470716571668,
"grad_norm": 24.609375,
"learning_rate": 8.511080207336235e-08,
"loss": 16.006,
"step": 18850
},
{
"epoch": 0.546152012310093,
"grad_norm": 25.890625,
"learning_rate": 8.533656016904766e-08,
"loss": 16.0605,
"step": 18900
},
{
"epoch": 0.547596858903506,
"grad_norm": 21.46875,
"learning_rate": 8.556231826473297e-08,
"loss": 16.1316,
"step": 18950
},
{
"epoch": 0.5490417054969189,
"grad_norm": 21.75,
"learning_rate": 8.578807636041827e-08,
"loss": 16.0824,
"step": 19000
},
{
"epoch": 0.5504865520903318,
"grad_norm": 23.796875,
"learning_rate": 8.601383445610359e-08,
"loss": 16.0387,
"step": 19050
},
{
"epoch": 0.5519313986837447,
"grad_norm": 21.625,
"learning_rate": 8.62395925517889e-08,
"loss": 16.1834,
"step": 19100
},
{
"epoch": 0.5533762452771577,
"grad_norm": 20.375,
"learning_rate": 8.646535064747422e-08,
"loss": 16.0675,
"step": 19150
},
{
"epoch": 0.5548210918705706,
"grad_norm": 20.65625,
"learning_rate": 8.669110874315952e-08,
"loss": 15.9053,
"step": 19200
},
{
"epoch": 0.5562659384639835,
"grad_norm": 21.984375,
"learning_rate": 8.691686683884484e-08,
"loss": 16.0319,
"step": 19250
},
{
"epoch": 0.5577107850573966,
"grad_norm": 19.9375,
"learning_rate": 8.714262493453014e-08,
"loss": 16.0288,
"step": 19300
},
{
"epoch": 0.5591556316508095,
"grad_norm": 21.234375,
"learning_rate": 8.736838303021546e-08,
"loss": 16.0782,
"step": 19350
},
{
"epoch": 0.5606004782442224,
"grad_norm": 23.84375,
"learning_rate": 8.759414112590077e-08,
"loss": 15.987,
"step": 19400
},
{
"epoch": 0.5620453248376354,
"grad_norm": 26.1875,
"learning_rate": 8.781989922158609e-08,
"loss": 16.1125,
"step": 19450
},
{
"epoch": 0.5634901714310483,
"grad_norm": 22.640625,
"learning_rate": 8.80456573172714e-08,
"loss": 16.066,
"step": 19500
},
{
"epoch": 0.5649350180244612,
"grad_norm": 23.9375,
"learning_rate": 8.827141541295669e-08,
"loss": 15.9326,
"step": 19550
},
{
"epoch": 0.5663798646178742,
"grad_norm": 23.34375,
"learning_rate": 8.849717350864201e-08,
"loss": 16.0386,
"step": 19600
},
{
"epoch": 0.5678247112112872,
"grad_norm": 22.703125,
"learning_rate": 8.872293160432732e-08,
"loss": 16.0198,
"step": 19650
},
{
"epoch": 0.5692695578047001,
"grad_norm": 24.21875,
"learning_rate": 8.894868970001264e-08,
"loss": 15.9735,
"step": 19700
},
{
"epoch": 0.570714404398113,
"grad_norm": 23.1875,
"learning_rate": 8.917444779569795e-08,
"loss": 15.9592,
"step": 19750
},
{
"epoch": 0.572159250991526,
"grad_norm": 21.96875,
"learning_rate": 8.940020589138327e-08,
"loss": 16.0753,
"step": 19800
},
{
"epoch": 0.5736040975849389,
"grad_norm": 23.59375,
"learning_rate": 8.962596398706857e-08,
"loss": 15.9445,
"step": 19850
},
{
"epoch": 0.5750489441783518,
"grad_norm": 21.453125,
"learning_rate": 8.985172208275388e-08,
"loss": 15.9417,
"step": 19900
},
{
"epoch": 0.5764937907717648,
"grad_norm": 26.203125,
"learning_rate": 9.007748017843919e-08,
"loss": 15.9153,
"step": 19950
},
{
"epoch": 0.5779386373651777,
"grad_norm": 26.90625,
"learning_rate": 9.03032382741245e-08,
"loss": 16.0521,
"step": 20000
},
{
"epoch": 0.5793834839585907,
"grad_norm": 24.5625,
"learning_rate": 9.052899636980982e-08,
"loss": 16.0634,
"step": 20050
},
{
"epoch": 0.5808283305520037,
"grad_norm": 22.6875,
"learning_rate": 9.075475446549514e-08,
"loss": 16.0152,
"step": 20100
},
{
"epoch": 0.5822731771454166,
"grad_norm": 21.15625,
"learning_rate": 9.098051256118044e-08,
"loss": 15.9701,
"step": 20150
},
{
"epoch": 0.5837180237388295,
"grad_norm": 23.03125,
"learning_rate": 9.120627065686574e-08,
"loss": 15.9496,
"step": 20200
},
{
"epoch": 0.5851628703322425,
"grad_norm": 22.53125,
"learning_rate": 9.143202875255106e-08,
"loss": 15.97,
"step": 20250
},
{
"epoch": 0.5866077169256554,
"grad_norm": 23.046875,
"learning_rate": 9.165778684823637e-08,
"loss": 15.8332,
"step": 20300
},
{
"epoch": 0.5880525635190683,
"grad_norm": 21.40625,
"learning_rate": 9.188354494392169e-08,
"loss": 15.977,
"step": 20350
},
{
"epoch": 0.5894974101124814,
"grad_norm": 24.3125,
"learning_rate": 9.210930303960699e-08,
"loss": 15.9583,
"step": 20400
},
{
"epoch": 0.5909422567058943,
"grad_norm": 23.359375,
"learning_rate": 9.23350611352923e-08,
"loss": 16.0266,
"step": 20450
},
{
"epoch": 0.5923871032993072,
"grad_norm": 23.25,
"learning_rate": 9.256081923097762e-08,
"loss": 15.9804,
"step": 20500
},
{
"epoch": 0.5938319498927201,
"grad_norm": 20.296875,
"learning_rate": 9.278657732666292e-08,
"loss": 15.9473,
"step": 20550
},
{
"epoch": 0.5952767964861331,
"grad_norm": 21.1875,
"learning_rate": 9.301233542234824e-08,
"loss": 15.9547,
"step": 20600
},
{
"epoch": 0.596721643079546,
"grad_norm": 23.40625,
"learning_rate": 9.323809351803356e-08,
"loss": 16.0047,
"step": 20650
},
{
"epoch": 0.5981664896729589,
"grad_norm": 22.03125,
"learning_rate": 9.346385161371886e-08,
"loss": 15.9565,
"step": 20700
},
{
"epoch": 0.5996113362663719,
"grad_norm": 23.96875,
"learning_rate": 9.368960970940417e-08,
"loss": 15.978,
"step": 20750
},
{
"epoch": 0.6010561828597849,
"grad_norm": 26.078125,
"learning_rate": 9.391536780508949e-08,
"loss": 15.9615,
"step": 20800
},
{
"epoch": 0.6025010294531978,
"grad_norm": 22.421875,
"learning_rate": 9.41411259007748e-08,
"loss": 15.9731,
"step": 20850
},
{
"epoch": 0.6039458760466108,
"grad_norm": 28.59375,
"learning_rate": 9.436688399646011e-08,
"loss": 15.8916,
"step": 20900
},
{
"epoch": 0.6053907226400237,
"grad_norm": 21.375,
"learning_rate": 9.459264209214542e-08,
"loss": 15.9134,
"step": 20950
},
{
"epoch": 0.6068355692334366,
"grad_norm": 23.015625,
"learning_rate": 9.481840018783072e-08,
"loss": 15.9756,
"step": 21000
},
{
"epoch": 0.6068355692334366,
"eval_loss": 1.990402102470398,
"eval_runtime": 346.2542,
"eval_samples_per_second": 2693.206,
"eval_steps_per_second": 42.082,
"step": 21000
},
{
"epoch": 0.6082804158268496,
"grad_norm": 21.765625,
"learning_rate": 9.504415828351604e-08,
"loss": 15.9846,
"step": 21050
},
{
"epoch": 0.6097252624202625,
"grad_norm": 21.875,
"learning_rate": 9.526991637920136e-08,
"loss": 15.92,
"step": 21100
},
{
"epoch": 0.6111701090136755,
"grad_norm": 23.140625,
"learning_rate": 9.549567447488667e-08,
"loss": 16.0141,
"step": 21150
},
{
"epoch": 0.6126149556070885,
"grad_norm": 22.1875,
"learning_rate": 9.572143257057199e-08,
"loss": 15.9318,
"step": 21200
},
{
"epoch": 0.6140598022005014,
"grad_norm": 24.421875,
"learning_rate": 9.594719066625728e-08,
"loss": 15.9115,
"step": 21250
},
{
"epoch": 0.6155046487939143,
"grad_norm": 22.734375,
"learning_rate": 9.617294876194259e-08,
"loss": 15.847,
"step": 21300
},
{
"epoch": 0.6169494953873272,
"grad_norm": 23.078125,
"learning_rate": 9.639870685762791e-08,
"loss": 16.0101,
"step": 21350
},
{
"epoch": 0.6183943419807402,
"grad_norm": 21.265625,
"learning_rate": 9.662446495331322e-08,
"loss": 15.9588,
"step": 21400
},
{
"epoch": 0.6198391885741531,
"grad_norm": 24.171875,
"learning_rate": 9.685022304899854e-08,
"loss": 15.9525,
"step": 21450
},
{
"epoch": 0.621284035167566,
"grad_norm": 23.078125,
"learning_rate": 9.707598114468385e-08,
"loss": 15.9591,
"step": 21500
},
{
"epoch": 0.6227288817609791,
"grad_norm": 26.1875,
"learning_rate": 9.730173924036914e-08,
"loss": 15.8973,
"step": 21550
},
{
"epoch": 0.624173728354392,
"grad_norm": 22.1875,
"learning_rate": 9.752749733605446e-08,
"loss": 15.921,
"step": 21600
},
{
"epoch": 0.6256185749478049,
"grad_norm": 34.90625,
"learning_rate": 9.775325543173977e-08,
"loss": 15.8763,
"step": 21650
},
{
"epoch": 0.6270634215412179,
"grad_norm": 20.875,
"learning_rate": 9.797901352742509e-08,
"loss": 15.9879,
"step": 21700
},
{
"epoch": 0.6285082681346308,
"grad_norm": 22.796875,
"learning_rate": 9.82047716231104e-08,
"loss": 15.8933,
"step": 21750
},
{
"epoch": 0.6299531147280437,
"grad_norm": 25.84375,
"learning_rate": 9.843052971879572e-08,
"loss": 15.7943,
"step": 21800
},
{
"epoch": 0.6313979613214566,
"grad_norm": 21.1875,
"learning_rate": 9.865628781448102e-08,
"loss": 15.842,
"step": 21850
},
{
"epoch": 0.6328428079148697,
"grad_norm": 25.890625,
"learning_rate": 9.888204591016633e-08,
"loss": 15.9081,
"step": 21900
},
{
"epoch": 0.6342876545082826,
"grad_norm": 20.859375,
"learning_rate": 9.910780400585164e-08,
"loss": 15.825,
"step": 21950
},
{
"epoch": 0.6357325011016955,
"grad_norm": 20.953125,
"learning_rate": 9.933356210153696e-08,
"loss": 15.8846,
"step": 22000
},
{
"epoch": 0.6371773476951085,
"grad_norm": 22.0,
"learning_rate": 9.955932019722227e-08,
"loss": 15.8721,
"step": 22050
},
{
"epoch": 0.6386221942885214,
"grad_norm": 24.765625,
"learning_rate": 9.978507829290759e-08,
"loss": 15.7167,
"step": 22100
},
{
"epoch": 0.6400670408819343,
"grad_norm": 23.984375,
"learning_rate": 1.0001083638859289e-07,
"loss": 15.8247,
"step": 22150
},
{
"epoch": 0.6415118874753473,
"grad_norm": 19.5,
"learning_rate": 1.002365944842782e-07,
"loss": 15.8111,
"step": 22200
},
{
"epoch": 0.6429567340687602,
"grad_norm": 25.640625,
"learning_rate": 1.0046235257996351e-07,
"loss": 15.9381,
"step": 22250
},
{
"epoch": 0.6444015806621732,
"grad_norm": 19.84375,
"learning_rate": 1.0068811067564882e-07,
"loss": 15.7998,
"step": 22300
},
{
"epoch": 0.6458464272555862,
"grad_norm": 21.296875,
"learning_rate": 1.0091386877133414e-07,
"loss": 15.8191,
"step": 22350
},
{
"epoch": 0.6472912738489991,
"grad_norm": 22.0,
"learning_rate": 1.0113962686701944e-07,
"loss": 15.8098,
"step": 22400
},
{
"epoch": 0.648736120442412,
"grad_norm": 24.46875,
"learning_rate": 1.0136538496270476e-07,
"loss": 15.8501,
"step": 22450
},
{
"epoch": 0.650180967035825,
"grad_norm": 23.3125,
"learning_rate": 1.0159114305839007e-07,
"loss": 15.8631,
"step": 22500
},
{
"epoch": 0.6516258136292379,
"grad_norm": 25.875,
"learning_rate": 1.0181690115407538e-07,
"loss": 15.6956,
"step": 22550
},
{
"epoch": 0.6530706602226508,
"grad_norm": 24.703125,
"learning_rate": 1.0204265924976069e-07,
"loss": 15.8223,
"step": 22600
},
{
"epoch": 0.6545155068160639,
"grad_norm": 22.15625,
"learning_rate": 1.02268417345446e-07,
"loss": 15.9404,
"step": 22650
},
{
"epoch": 0.6559603534094768,
"grad_norm": 23.59375,
"learning_rate": 1.0249417544113131e-07,
"loss": 15.7929,
"step": 22700
},
{
"epoch": 0.6574052000028897,
"grad_norm": 20.5,
"learning_rate": 1.0271993353681662e-07,
"loss": 15.8155,
"step": 22750
},
{
"epoch": 0.6588500465963026,
"grad_norm": 22.984375,
"learning_rate": 1.0294569163250194e-07,
"loss": 15.8803,
"step": 22800
},
{
"epoch": 0.6602948931897156,
"grad_norm": 22.859375,
"learning_rate": 1.0317144972818726e-07,
"loss": 15.8193,
"step": 22850
},
{
"epoch": 0.6617397397831285,
"grad_norm": 24.109375,
"learning_rate": 1.0339720782387256e-07,
"loss": 15.6904,
"step": 22900
},
{
"epoch": 0.6631845863765414,
"grad_norm": 28.3125,
"learning_rate": 1.0362296591955787e-07,
"loss": 15.8436,
"step": 22950
},
{
"epoch": 0.6646294329699545,
"grad_norm": 24.234375,
"learning_rate": 1.0384872401524318e-07,
"loss": 15.7791,
"step": 23000
},
{
"epoch": 0.6660742795633674,
"grad_norm": 22.40625,
"learning_rate": 1.0407448211092849e-07,
"loss": 15.8147,
"step": 23050
},
{
"epoch": 0.6675191261567803,
"grad_norm": 21.90625,
"learning_rate": 1.0430024020661381e-07,
"loss": 15.8562,
"step": 23100
},
{
"epoch": 0.6689639727501933,
"grad_norm": 22.296875,
"learning_rate": 1.0452599830229912e-07,
"loss": 15.8147,
"step": 23150
},
{
"epoch": 0.6704088193436062,
"grad_norm": 24.0,
"learning_rate": 1.0475175639798444e-07,
"loss": 15.7467,
"step": 23200
},
{
"epoch": 0.6718536659370191,
"grad_norm": 23.09375,
"learning_rate": 1.0497751449366973e-07,
"loss": 15.8176,
"step": 23250
},
{
"epoch": 0.673298512530432,
"grad_norm": 29.296875,
"learning_rate": 1.0520327258935504e-07,
"loss": 15.7216,
"step": 23300
},
{
"epoch": 0.674743359123845,
"grad_norm": 22.8125,
"learning_rate": 1.0542903068504036e-07,
"loss": 15.707,
"step": 23350
},
{
"epoch": 0.676188205717258,
"grad_norm": 23.859375,
"learning_rate": 1.0565478878072567e-07,
"loss": 15.752,
"step": 23400
},
{
"epoch": 0.677633052310671,
"grad_norm": 22.40625,
"learning_rate": 1.0588054687641099e-07,
"loss": 15.7979,
"step": 23450
},
{
"epoch": 0.6790778989040839,
"grad_norm": 23.6875,
"learning_rate": 1.061063049720963e-07,
"loss": 15.7577,
"step": 23500
},
{
"epoch": 0.6805227454974968,
"grad_norm": 20.421875,
"learning_rate": 1.063320630677816e-07,
"loss": 15.7758,
"step": 23550
},
{
"epoch": 0.6819675920909097,
"grad_norm": 22.734375,
"learning_rate": 1.0655782116346691e-07,
"loss": 15.7973,
"step": 23600
},
{
"epoch": 0.6834124386843227,
"grad_norm": 20.765625,
"learning_rate": 1.0678357925915222e-07,
"loss": 15.8134,
"step": 23650
},
{
"epoch": 0.6848572852777356,
"grad_norm": 24.046875,
"learning_rate": 1.0700933735483754e-07,
"loss": 15.7493,
"step": 23700
},
{
"epoch": 0.6863021318711486,
"grad_norm": 24.34375,
"learning_rate": 1.0723509545052286e-07,
"loss": 15.6879,
"step": 23750
},
{
"epoch": 0.6877469784645616,
"grad_norm": 23.109375,
"learning_rate": 1.0746085354620817e-07,
"loss": 15.807,
"step": 23800
},
{
"epoch": 0.6891918250579745,
"grad_norm": 25.34375,
"learning_rate": 1.0768661164189347e-07,
"loss": 15.8162,
"step": 23850
},
{
"epoch": 0.6906366716513874,
"grad_norm": 21.140625,
"learning_rate": 1.0791236973757878e-07,
"loss": 15.7665,
"step": 23900
},
{
"epoch": 0.6920815182448004,
"grad_norm": 22.796875,
"learning_rate": 1.0813812783326409e-07,
"loss": 15.7595,
"step": 23950
},
{
"epoch": 0.6935263648382133,
"grad_norm": 22.578125,
"learning_rate": 1.0836388592894941e-07,
"loss": 15.7217,
"step": 24000
},
{
"epoch": 0.6935263648382133,
"eval_loss": 1.9680598974227905,
"eval_runtime": 341.6298,
"eval_samples_per_second": 2729.662,
"eval_steps_per_second": 42.651,
"step": 24000
},
{
"epoch": 0.6949712114316262,
"grad_norm": 21.015625,
"learning_rate": 1.0858964402463472e-07,
"loss": 15.778,
"step": 24050
},
{
"epoch": 0.6964160580250391,
"grad_norm": 24.84375,
"learning_rate": 1.0881540212032004e-07,
"loss": 15.7675,
"step": 24100
},
{
"epoch": 0.6978609046184522,
"grad_norm": 26.171875,
"learning_rate": 1.0904116021600534e-07,
"loss": 15.7371,
"step": 24150
},
{
"epoch": 0.6993057512118651,
"grad_norm": 20.859375,
"learning_rate": 1.0926691831169066e-07,
"loss": 15.8547,
"step": 24200
},
{
"epoch": 0.700750597805278,
"grad_norm": 22.34375,
"learning_rate": 1.0949267640737596e-07,
"loss": 15.6903,
"step": 24250
},
{
"epoch": 0.702195444398691,
"grad_norm": 23.140625,
"learning_rate": 1.0971843450306127e-07,
"loss": 15.7397,
"step": 24300
},
{
"epoch": 0.7036402909921039,
"grad_norm": 21.953125,
"learning_rate": 1.0994419259874659e-07,
"loss": 15.6126,
"step": 24350
},
{
"epoch": 0.7050851375855168,
"grad_norm": 25.8125,
"learning_rate": 1.1016995069443189e-07,
"loss": 15.8006,
"step": 24400
},
{
"epoch": 0.7065299841789298,
"grad_norm": 25.0625,
"learning_rate": 1.1039570879011721e-07,
"loss": 15.6731,
"step": 24450
},
{
"epoch": 0.7079748307723428,
"grad_norm": 26.265625,
"learning_rate": 1.1062146688580252e-07,
"loss": 15.7758,
"step": 24500
},
{
"epoch": 0.7094196773657557,
"grad_norm": 23.390625,
"learning_rate": 1.1084722498148783e-07,
"loss": 15.7123,
"step": 24550
},
{
"epoch": 0.7108645239591687,
"grad_norm": 21.984375,
"learning_rate": 1.1107298307717314e-07,
"loss": 15.7585,
"step": 24600
},
{
"epoch": 0.7123093705525816,
"grad_norm": 23.765625,
"learning_rate": 1.1129874117285846e-07,
"loss": 15.7461,
"step": 24650
},
{
"epoch": 0.7137542171459945,
"grad_norm": 22.265625,
"learning_rate": 1.1152449926854376e-07,
"loss": 15.7875,
"step": 24700
},
{
"epoch": 0.7151990637394074,
"grad_norm": 22.375,
"learning_rate": 1.1175025736422907e-07,
"loss": 15.6815,
"step": 24750
},
{
"epoch": 0.7166439103328204,
"grad_norm": 25.4375,
"learning_rate": 1.1197601545991439e-07,
"loss": 15.7047,
"step": 24800
},
{
"epoch": 0.7180887569262333,
"grad_norm": 30.53125,
"learning_rate": 1.122017735555997e-07,
"loss": 15.7082,
"step": 24850
},
{
"epoch": 0.7195336035196463,
"grad_norm": 23.859375,
"learning_rate": 1.1242753165128501e-07,
"loss": 15.6932,
"step": 24900
},
{
"epoch": 0.7209784501130593,
"grad_norm": 21.890625,
"learning_rate": 1.1265328974697032e-07,
"loss": 15.7004,
"step": 24950
},
{
"epoch": 0.7224232967064722,
"grad_norm": 22.21875,
"learning_rate": 1.1287904784265563e-07,
"loss": 15.5976,
"step": 25000
},
{
"epoch": 0.7238681432998851,
"grad_norm": 20.765625,
"learning_rate": 1.1310480593834094e-07,
"loss": 15.651,
"step": 25050
},
{
"epoch": 0.7253129898932981,
"grad_norm": 21.203125,
"learning_rate": 1.1333056403402626e-07,
"loss": 15.7333,
"step": 25100
},
{
"epoch": 0.726757836486711,
"grad_norm": 22.25,
"learning_rate": 1.1355632212971157e-07,
"loss": 15.6802,
"step": 25150
},
{
"epoch": 0.7282026830801239,
"grad_norm": 22.8125,
"learning_rate": 1.1378208022539689e-07,
"loss": 15.6639,
"step": 25200
},
{
"epoch": 0.729647529673537,
"grad_norm": 21.140625,
"learning_rate": 1.1400783832108218e-07,
"loss": 15.6816,
"step": 25250
},
{
"epoch": 0.7310923762669499,
"grad_norm": 23.0,
"learning_rate": 1.1423359641676749e-07,
"loss": 15.5984,
"step": 25300
},
{
"epoch": 0.7325372228603628,
"grad_norm": 23.40625,
"learning_rate": 1.1445935451245281e-07,
"loss": 15.7119,
"step": 25350
},
{
"epoch": 0.7339820694537758,
"grad_norm": 21.296875,
"learning_rate": 1.1468511260813812e-07,
"loss": 15.6212,
"step": 25400
},
{
"epoch": 0.7354269160471887,
"grad_norm": 20.03125,
"learning_rate": 1.1491087070382344e-07,
"loss": 15.659,
"step": 25450
},
{
"epoch": 0.7368717626406016,
"grad_norm": 22.984375,
"learning_rate": 1.1513662879950876e-07,
"loss": 15.771,
"step": 25500
},
{
"epoch": 0.7383166092340145,
"grad_norm": 21.84375,
"learning_rate": 1.1536238689519404e-07,
"loss": 15.7338,
"step": 25550
},
{
"epoch": 0.7397614558274275,
"grad_norm": 22.234375,
"learning_rate": 1.1558814499087936e-07,
"loss": 15.6507,
"step": 25600
},
{
"epoch": 0.7412063024208405,
"grad_norm": 25.3125,
"learning_rate": 1.1581390308656468e-07,
"loss": 15.7084,
"step": 25650
},
{
"epoch": 0.7426511490142534,
"grad_norm": 26.171875,
"learning_rate": 1.1603966118224999e-07,
"loss": 15.5601,
"step": 25700
},
{
"epoch": 0.7440959956076664,
"grad_norm": 22.515625,
"learning_rate": 1.1626541927793531e-07,
"loss": 15.7191,
"step": 25750
},
{
"epoch": 0.7455408422010793,
"grad_norm": 24.375,
"learning_rate": 1.1649117737362062e-07,
"loss": 15.6457,
"step": 25800
},
{
"epoch": 0.7469856887944922,
"grad_norm": 23.640625,
"learning_rate": 1.1671693546930592e-07,
"loss": 15.572,
"step": 25850
},
{
"epoch": 0.7484305353879052,
"grad_norm": 24.375,
"learning_rate": 1.1694269356499123e-07,
"loss": 15.6297,
"step": 25900
},
{
"epoch": 0.7498753819813181,
"grad_norm": 23.8125,
"learning_rate": 1.1716845166067654e-07,
"loss": 15.6828,
"step": 25950
},
{
"epoch": 0.7513202285747311,
"grad_norm": 23.953125,
"learning_rate": 1.1739420975636186e-07,
"loss": 15.5568,
"step": 26000
},
{
"epoch": 0.7527650751681441,
"grad_norm": 25.421875,
"learning_rate": 1.1761996785204717e-07,
"loss": 15.6016,
"step": 26050
},
{
"epoch": 0.754209921761557,
"grad_norm": 22.15625,
"learning_rate": 1.1784572594773249e-07,
"loss": 15.5887,
"step": 26100
},
{
"epoch": 0.7556547683549699,
"grad_norm": 21.5625,
"learning_rate": 1.1807148404341779e-07,
"loss": 15.6077,
"step": 26150
},
{
"epoch": 0.7570996149483828,
"grad_norm": 23.328125,
"learning_rate": 1.1829724213910311e-07,
"loss": 15.6592,
"step": 26200
},
{
"epoch": 0.7585444615417958,
"grad_norm": 23.71875,
"learning_rate": 1.1852300023478841e-07,
"loss": 15.621,
"step": 26250
},
{
"epoch": 0.7599893081352087,
"grad_norm": 24.5625,
"learning_rate": 1.1874875833047373e-07,
"loss": 15.5728,
"step": 26300
},
{
"epoch": 0.7614341547286216,
"grad_norm": 23.21875,
"learning_rate": 1.1897451642615904e-07,
"loss": 15.6658,
"step": 26350
},
{
"epoch": 0.7628790013220347,
"grad_norm": 24.96875,
"learning_rate": 1.1920027452184434e-07,
"loss": 15.4367,
"step": 26400
},
{
"epoch": 0.7643238479154476,
"grad_norm": 23.296875,
"learning_rate": 1.1942603261752967e-07,
"loss": 15.6812,
"step": 26450
},
{
"epoch": 0.7657686945088605,
"grad_norm": 21.21875,
"learning_rate": 1.1965179071321497e-07,
"loss": 15.4966,
"step": 26500
},
{
"epoch": 0.7672135411022735,
"grad_norm": 21.859375,
"learning_rate": 1.1987754880890028e-07,
"loss": 15.6969,
"step": 26550
},
{
"epoch": 0.7686583876956864,
"grad_norm": 21.234375,
"learning_rate": 1.2010330690458558e-07,
"loss": 15.5063,
"step": 26600
},
{
"epoch": 0.7701032342890993,
"grad_norm": 30.65625,
"learning_rate": 1.203290650002709e-07,
"loss": 15.5682,
"step": 26650
},
{
"epoch": 0.7715480808825123,
"grad_norm": 23.0625,
"learning_rate": 1.205548230959562e-07,
"loss": 15.5967,
"step": 26700
},
{
"epoch": 0.7729929274759253,
"grad_norm": 22.171875,
"learning_rate": 1.2078058119164154e-07,
"loss": 15.5911,
"step": 26750
},
{
"epoch": 0.7744377740693382,
"grad_norm": 25.8125,
"learning_rate": 1.2100633928732684e-07,
"loss": 15.6635,
"step": 26800
},
{
"epoch": 0.7758826206627512,
"grad_norm": 23.40625,
"learning_rate": 1.2123209738301214e-07,
"loss": 15.5876,
"step": 26850
},
{
"epoch": 0.7773274672561641,
"grad_norm": 21.15625,
"learning_rate": 1.2145785547869745e-07,
"loss": 15.5192,
"step": 26900
},
{
"epoch": 0.778772313849577,
"grad_norm": 23.5625,
"learning_rate": 1.2168361357438277e-07,
"loss": 15.5746,
"step": 26950
},
{
"epoch": 0.7802171604429899,
"grad_norm": 27.359375,
"learning_rate": 1.2190937167006808e-07,
"loss": 15.5407,
"step": 27000
},
{
"epoch": 0.7802171604429899,
"eval_loss": 1.9437412023544312,
"eval_runtime": 340.4,
"eval_samples_per_second": 2739.524,
"eval_steps_per_second": 42.806,
"step": 27000
},
{
"epoch": 0.7816620070364029,
"grad_norm": 23.0625,
"learning_rate": 1.221351297657534e-07,
"loss": 15.609,
"step": 27050
},
{
"epoch": 0.7831068536298158,
"grad_norm": 25.40625,
"learning_rate": 1.223608878614387e-07,
"loss": 15.6637,
"step": 27100
},
{
"epoch": 0.7845517002232288,
"grad_norm": 23.90625,
"learning_rate": 1.22586645957124e-07,
"loss": 15.6405,
"step": 27150
},
{
"epoch": 0.7859965468166418,
"grad_norm": 22.390625,
"learning_rate": 1.2281240405280934e-07,
"loss": 15.5515,
"step": 27200
},
{
"epoch": 0.7874413934100547,
"grad_norm": 25.265625,
"learning_rate": 1.2303816214849464e-07,
"loss": 15.5254,
"step": 27250
},
{
"epoch": 0.7888862400034676,
"grad_norm": 22.125,
"learning_rate": 1.2326392024417994e-07,
"loss": 15.5474,
"step": 27300
},
{
"epoch": 0.7903310865968806,
"grad_norm": 23.03125,
"learning_rate": 1.2348967833986527e-07,
"loss": 15.554,
"step": 27350
},
{
"epoch": 0.7917759331902935,
"grad_norm": 19.96875,
"learning_rate": 1.2371543643555057e-07,
"loss": 15.5717,
"step": 27400
},
{
"epoch": 0.7932207797837064,
"grad_norm": 20.53125,
"learning_rate": 1.2394119453123588e-07,
"loss": 15.5454,
"step": 27450
},
{
"epoch": 0.7946656263771195,
"grad_norm": 21.34375,
"learning_rate": 1.241669526269212e-07,
"loss": 15.5759,
"step": 27500
},
{
"epoch": 0.7961104729705324,
"grad_norm": 23.9375,
"learning_rate": 1.243927107226065e-07,
"loss": 15.5199,
"step": 27550
},
{
"epoch": 0.7975553195639453,
"grad_norm": 21.84375,
"learning_rate": 1.246184688182918e-07,
"loss": 15.4171,
"step": 27600
},
{
"epoch": 0.7990001661573582,
"grad_norm": 22.234375,
"learning_rate": 1.2484422691397714e-07,
"loss": 15.5973,
"step": 27650
},
{
"epoch": 0.8004450127507712,
"grad_norm": 22.421875,
"learning_rate": 1.2506998500966244e-07,
"loss": 15.4923,
"step": 27700
},
{
"epoch": 0.8018898593441841,
"grad_norm": 21.34375,
"learning_rate": 1.2529574310534774e-07,
"loss": 15.509,
"step": 27750
},
{
"epoch": 0.803334705937597,
"grad_norm": 24.359375,
"learning_rate": 1.2552150120103305e-07,
"loss": 15.5824,
"step": 27800
},
{
"epoch": 0.80477955253101,
"grad_norm": 21.25,
"learning_rate": 1.2574725929671838e-07,
"loss": 15.5973,
"step": 27850
},
{
"epoch": 0.806224399124423,
"grad_norm": 27.984375,
"learning_rate": 1.2597301739240368e-07,
"loss": 15.564,
"step": 27900
},
{
"epoch": 0.8076692457178359,
"grad_norm": 33.71875,
"learning_rate": 1.26198775488089e-07,
"loss": 15.439,
"step": 27950
},
{
"epoch": 0.8091140923112489,
"grad_norm": 24.09375,
"learning_rate": 1.264245335837743e-07,
"loss": 15.5574,
"step": 28000
},
{
"epoch": 0.8105589389046618,
"grad_norm": 22.78125,
"learning_rate": 1.266502916794596e-07,
"loss": 15.5851,
"step": 28050
},
{
"epoch": 0.8120037854980747,
"grad_norm": 21.125,
"learning_rate": 1.2687604977514494e-07,
"loss": 15.5334,
"step": 28100
},
{
"epoch": 0.8134486320914877,
"grad_norm": 21.5625,
"learning_rate": 1.2710180787083024e-07,
"loss": 15.5789,
"step": 28150
},
{
"epoch": 0.8148934786849006,
"grad_norm": 21.8125,
"learning_rate": 1.2732756596651557e-07,
"loss": 15.4793,
"step": 28200
},
{
"epoch": 0.8163383252783136,
"grad_norm": 21.234375,
"learning_rate": 1.2755332406220087e-07,
"loss": 15.5015,
"step": 28250
},
{
"epoch": 0.8177831718717266,
"grad_norm": 21.203125,
"learning_rate": 1.2777908215788618e-07,
"loss": 15.3711,
"step": 28300
},
{
"epoch": 0.8192280184651395,
"grad_norm": 23.84375,
"learning_rate": 1.280048402535715e-07,
"loss": 15.4745,
"step": 28350
},
{
"epoch": 0.8206728650585524,
"grad_norm": 22.03125,
"learning_rate": 1.2823059834925678e-07,
"loss": 15.5285,
"step": 28400
},
{
"epoch": 0.8221177116519653,
"grad_norm": 20.9375,
"learning_rate": 1.284563564449421e-07,
"loss": 15.5214,
"step": 28450
},
{
"epoch": 0.8235625582453783,
"grad_norm": 25.546875,
"learning_rate": 1.286821145406274e-07,
"loss": 15.4168,
"step": 28500
},
{
"epoch": 0.8250074048387912,
"grad_norm": 24.265625,
"learning_rate": 1.2890787263631271e-07,
"loss": 15.5043,
"step": 28550
},
{
"epoch": 0.8264522514322041,
"grad_norm": 23.265625,
"learning_rate": 1.2913363073199804e-07,
"loss": 15.4206,
"step": 28600
},
{
"epoch": 0.8278970980256172,
"grad_norm": 22.0,
"learning_rate": 1.2935938882768334e-07,
"loss": 15.4444,
"step": 28650
},
{
"epoch": 0.8293419446190301,
"grad_norm": 25.09375,
"learning_rate": 1.2958514692336867e-07,
"loss": 15.4043,
"step": 28700
},
{
"epoch": 0.830786791212443,
"grad_norm": 21.046875,
"learning_rate": 1.2981090501905398e-07,
"loss": 15.5465,
"step": 28750
},
{
"epoch": 0.832231637805856,
"grad_norm": 21.234375,
"learning_rate": 1.300366631147393e-07,
"loss": 15.4988,
"step": 28800
},
{
"epoch": 0.8336764843992689,
"grad_norm": 21.046875,
"learning_rate": 1.302624212104246e-07,
"loss": 15.4368,
"step": 28850
},
{
"epoch": 0.8351213309926818,
"grad_norm": 23.46875,
"learning_rate": 1.304881793061099e-07,
"loss": 15.4251,
"step": 28900
},
{
"epoch": 0.8365661775860948,
"grad_norm": 23.046875,
"learning_rate": 1.3071393740179524e-07,
"loss": 15.4271,
"step": 28950
},
{
"epoch": 0.8380110241795078,
"grad_norm": 21.0,
"learning_rate": 1.3093969549748054e-07,
"loss": 15.4439,
"step": 29000
},
{
"epoch": 0.8394558707729207,
"grad_norm": 21.96875,
"learning_rate": 1.3116545359316584e-07,
"loss": 15.4197,
"step": 29050
},
{
"epoch": 0.8409007173663336,
"grad_norm": 21.109375,
"learning_rate": 1.3139121168885115e-07,
"loss": 15.428,
"step": 29100
},
{
"epoch": 0.8423455639597466,
"grad_norm": 21.984375,
"learning_rate": 1.3161696978453645e-07,
"loss": 15.3989,
"step": 29150
},
{
"epoch": 0.8437904105531595,
"grad_norm": 21.921875,
"learning_rate": 1.3184272788022178e-07,
"loss": 15.4178,
"step": 29200
},
{
"epoch": 0.8452352571465724,
"grad_norm": 20.765625,
"learning_rate": 1.3206848597590708e-07,
"loss": 15.3614,
"step": 29250
},
{
"epoch": 0.8466801037399854,
"grad_norm": 21.390625,
"learning_rate": 1.322942440715924e-07,
"loss": 15.4306,
"step": 29300
},
{
"epoch": 0.8481249503333983,
"grad_norm": 28.84375,
"learning_rate": 1.325200021672777e-07,
"loss": 15.4706,
"step": 29350
},
{
"epoch": 0.8495697969268113,
"grad_norm": 22.921875,
"learning_rate": 1.32745760262963e-07,
"loss": 15.4703,
"step": 29400
},
{
"epoch": 0.8510146435202243,
"grad_norm": 36.5625,
"learning_rate": 1.3297151835864834e-07,
"loss": 15.45,
"step": 29450
},
{
"epoch": 0.8524594901136372,
"grad_norm": 24.25,
"learning_rate": 1.3319727645433364e-07,
"loss": 15.4233,
"step": 29500
},
{
"epoch": 0.8539043367070501,
"grad_norm": 34.96875,
"learning_rate": 1.3342303455001897e-07,
"loss": 15.4693,
"step": 29550
},
{
"epoch": 0.8553491833004631,
"grad_norm": 21.953125,
"learning_rate": 1.3364879264570427e-07,
"loss": 15.4402,
"step": 29600
},
{
"epoch": 0.856794029893876,
"grad_norm": 22.40625,
"learning_rate": 1.338745507413896e-07,
"loss": 15.4761,
"step": 29650
},
{
"epoch": 0.8582388764872889,
"grad_norm": 20.625,
"learning_rate": 1.341003088370749e-07,
"loss": 15.3796,
"step": 29700
},
{
"epoch": 0.859683723080702,
"grad_norm": 21.328125,
"learning_rate": 1.3432606693276018e-07,
"loss": 15.3857,
"step": 29750
},
{
"epoch": 0.8611285696741149,
"grad_norm": 23.375,
"learning_rate": 1.345518250284455e-07,
"loss": 15.414,
"step": 29800
},
{
"epoch": 0.8625734162675278,
"grad_norm": 22.671875,
"learning_rate": 1.347775831241308e-07,
"loss": 15.3401,
"step": 29850
},
{
"epoch": 0.8640182628609407,
"grad_norm": 22.65625,
"learning_rate": 1.3500334121981614e-07,
"loss": 15.346,
"step": 29900
},
{
"epoch": 0.8654631094543537,
"grad_norm": 23.890625,
"learning_rate": 1.3522909931550144e-07,
"loss": 15.42,
"step": 29950
},
{
"epoch": 0.8669079560477666,
"grad_norm": 20.515625,
"learning_rate": 1.3545485741118675e-07,
"loss": 15.389,
"step": 30000
},
{
"epoch": 0.8669079560477666,
"eval_loss": 1.9219062328338623,
"eval_runtime": 349.965,
"eval_samples_per_second": 2664.65,
"eval_steps_per_second": 41.636,
"step": 30000
},
{
"epoch": 0.8683528026411795,
"grad_norm": 21.8125,
"learning_rate": 1.3568061550687207e-07,
"loss": 15.327,
"step": 30050
},
{
"epoch": 0.8697976492345925,
"grad_norm": 22.8125,
"learning_rate": 1.3590637360255738e-07,
"loss": 15.3229,
"step": 30100
},
{
"epoch": 0.8712424958280055,
"grad_norm": 23.671875,
"learning_rate": 1.361321316982427e-07,
"loss": 15.3576,
"step": 30150
},
{
"epoch": 0.8726873424214184,
"grad_norm": 22.609375,
"learning_rate": 1.36357889793928e-07,
"loss": 15.3539,
"step": 30200
},
{
"epoch": 0.8741321890148314,
"grad_norm": 20.65625,
"learning_rate": 1.365836478896133e-07,
"loss": 15.2693,
"step": 30250
},
{
"epoch": 0.8755770356082443,
"grad_norm": 22.5625,
"learning_rate": 1.3680940598529864e-07,
"loss": 15.4018,
"step": 30300
},
{
"epoch": 0.8770218822016572,
"grad_norm": 19.875,
"learning_rate": 1.3703516408098394e-07,
"loss": 15.4242,
"step": 30350
},
{
"epoch": 0.8784667287950702,
"grad_norm": 27.234375,
"learning_rate": 1.3726092217666924e-07,
"loss": 15.3294,
"step": 30400
},
{
"epoch": 0.8799115753884831,
"grad_norm": 23.375,
"learning_rate": 1.3748668027235455e-07,
"loss": 15.3841,
"step": 30450
},
{
"epoch": 0.8813564219818961,
"grad_norm": 23.125,
"learning_rate": 1.3771243836803988e-07,
"loss": 15.3368,
"step": 30500
},
{
"epoch": 0.882801268575309,
"grad_norm": 23.171875,
"learning_rate": 1.3793819646372518e-07,
"loss": 15.343,
"step": 30550
},
{
"epoch": 0.884246115168722,
"grad_norm": 29.78125,
"learning_rate": 1.3816395455941048e-07,
"loss": 15.3782,
"step": 30600
},
{
"epoch": 0.8856909617621349,
"grad_norm": 22.453125,
"learning_rate": 1.383897126550958e-07,
"loss": 15.4537,
"step": 30650
},
{
"epoch": 0.8871358083555478,
"grad_norm": 21.265625,
"learning_rate": 1.386154707507811e-07,
"loss": 15.4048,
"step": 30700
},
{
"epoch": 0.8885806549489608,
"grad_norm": 24.25,
"learning_rate": 1.3884122884646644e-07,
"loss": 15.3433,
"step": 30750
},
{
"epoch": 0.8900255015423737,
"grad_norm": 25.1875,
"learning_rate": 1.3906698694215174e-07,
"loss": 15.3141,
"step": 30800
},
{
"epoch": 0.8914703481357866,
"grad_norm": 24.25,
"learning_rate": 1.3929274503783704e-07,
"loss": 15.2703,
"step": 30850
},
{
"epoch": 0.8929151947291997,
"grad_norm": 22.3125,
"learning_rate": 1.3951850313352237e-07,
"loss": 15.4022,
"step": 30900
},
{
"epoch": 0.8943600413226126,
"grad_norm": 19.859375,
"learning_rate": 1.3974426122920768e-07,
"loss": 15.2936,
"step": 30950
},
{
"epoch": 0.8958048879160255,
"grad_norm": 20.5,
"learning_rate": 1.39970019324893e-07,
"loss": 15.3219,
"step": 31000
},
{
"epoch": 0.8972497345094385,
"grad_norm": 21.71875,
"learning_rate": 1.4019577742057828e-07,
"loss": 15.2468,
"step": 31050
},
{
"epoch": 0.8986945811028514,
"grad_norm": 23.421875,
"learning_rate": 1.4042153551626358e-07,
"loss": 15.2591,
"step": 31100
},
{
"epoch": 0.9001394276962643,
"grad_norm": 23.09375,
"learning_rate": 1.406472936119489e-07,
"loss": 15.3318,
"step": 31150
},
{
"epoch": 0.9015842742896772,
"grad_norm": 24.09375,
"learning_rate": 1.4087305170763421e-07,
"loss": 15.2105,
"step": 31200
},
{
"epoch": 0.9030291208830903,
"grad_norm": 22.671875,
"learning_rate": 1.4109880980331954e-07,
"loss": 15.2557,
"step": 31250
},
{
"epoch": 0.9044739674765032,
"grad_norm": 22.0625,
"learning_rate": 1.4132456789900484e-07,
"loss": 15.4014,
"step": 31300
},
{
"epoch": 0.9059188140699161,
"grad_norm": 21.796875,
"learning_rate": 1.4155032599469017e-07,
"loss": 15.2382,
"step": 31350
},
{
"epoch": 0.9073636606633291,
"grad_norm": 24.5,
"learning_rate": 1.4177608409037548e-07,
"loss": 15.395,
"step": 31400
},
{
"epoch": 0.908808507256742,
"grad_norm": 21.828125,
"learning_rate": 1.4200184218606078e-07,
"loss": 15.2785,
"step": 31450
},
{
"epoch": 0.9102533538501549,
"grad_norm": 22.5625,
"learning_rate": 1.422276002817461e-07,
"loss": 15.2983,
"step": 31500
},
{
"epoch": 0.9116982004435679,
"grad_norm": 21.328125,
"learning_rate": 1.424533583774314e-07,
"loss": 15.382,
"step": 31550
},
{
"epoch": 0.9131430470369809,
"grad_norm": 21.3125,
"learning_rate": 1.4267911647311674e-07,
"loss": 15.2084,
"step": 31600
},
{
"epoch": 0.9145878936303938,
"grad_norm": 22.6875,
"learning_rate": 1.4290487456880204e-07,
"loss": 15.2803,
"step": 31650
},
{
"epoch": 0.9160327402238068,
"grad_norm": 20.953125,
"learning_rate": 1.4313063266448734e-07,
"loss": 15.3734,
"step": 31700
},
{
"epoch": 0.9174775868172197,
"grad_norm": 22.765625,
"learning_rate": 1.4335639076017265e-07,
"loss": 15.3248,
"step": 31750
},
{
"epoch": 0.9189224334106326,
"grad_norm": 21.640625,
"learning_rate": 1.4358214885585795e-07,
"loss": 15.2958,
"step": 31800
},
{
"epoch": 0.9203672800040456,
"grad_norm": 21.53125,
"learning_rate": 1.4380790695154328e-07,
"loss": 15.2854,
"step": 31850
},
{
"epoch": 0.9218121265974585,
"grad_norm": 21.265625,
"learning_rate": 1.4403366504722858e-07,
"loss": 15.2983,
"step": 31900
},
{
"epoch": 0.9232569731908714,
"grad_norm": 26.265625,
"learning_rate": 1.4425942314291388e-07,
"loss": 15.1967,
"step": 31950
},
{
"epoch": 0.9247018197842845,
"grad_norm": 21.078125,
"learning_rate": 1.444851812385992e-07,
"loss": 15.2861,
"step": 32000
},
{
"epoch": 0.9261466663776974,
"grad_norm": 22.53125,
"learning_rate": 1.447109393342845e-07,
"loss": 15.203,
"step": 32050
},
{
"epoch": 0.9275915129711103,
"grad_norm": 20.46875,
"learning_rate": 1.4493669742996984e-07,
"loss": 15.3343,
"step": 32100
},
{
"epoch": 0.9290363595645232,
"grad_norm": 21.5625,
"learning_rate": 1.4516245552565514e-07,
"loss": 15.1377,
"step": 32150
},
{
"epoch": 0.9304812061579362,
"grad_norm": 23.609375,
"learning_rate": 1.4538821362134047e-07,
"loss": 15.267,
"step": 32200
},
{
"epoch": 0.9319260527513491,
"grad_norm": 22.59375,
"learning_rate": 1.4561397171702577e-07,
"loss": 15.3935,
"step": 32250
},
{
"epoch": 0.933370899344762,
"grad_norm": 23.90625,
"learning_rate": 1.4583972981271108e-07,
"loss": 15.2605,
"step": 32300
},
{
"epoch": 0.9348157459381751,
"grad_norm": 23.171875,
"learning_rate": 1.460654879083964e-07,
"loss": 15.2479,
"step": 32350
},
{
"epoch": 0.936260592531588,
"grad_norm": 22.734375,
"learning_rate": 1.4629124600408168e-07,
"loss": 15.1586,
"step": 32400
},
{
"epoch": 0.9377054391250009,
"grad_norm": 22.078125,
"learning_rate": 1.46517004099767e-07,
"loss": 15.2709,
"step": 32450
},
{
"epoch": 0.9391502857184139,
"grad_norm": 20.484375,
"learning_rate": 1.467427621954523e-07,
"loss": 15.3932,
"step": 32500
},
{
"epoch": 0.9405951323118268,
"grad_norm": 20.171875,
"learning_rate": 1.4696852029113762e-07,
"loss": 15.3021,
"step": 32550
},
{
"epoch": 0.9420399789052397,
"grad_norm": 20.375,
"learning_rate": 1.4719427838682294e-07,
"loss": 15.3676,
"step": 32600
},
{
"epoch": 0.9434848254986526,
"grad_norm": 18.40625,
"learning_rate": 1.4742003648250825e-07,
"loss": 15.225,
"step": 32650
},
{
"epoch": 0.9449296720920656,
"grad_norm": 22.921875,
"learning_rate": 1.4764579457819357e-07,
"loss": 15.2833,
"step": 32700
},
{
"epoch": 0.9463745186854786,
"grad_norm": 21.359375,
"learning_rate": 1.4787155267387888e-07,
"loss": 15.2648,
"step": 32750
},
{
"epoch": 0.9478193652788915,
"grad_norm": 18.03125,
"learning_rate": 1.480973107695642e-07,
"loss": 15.1701,
"step": 32800
},
{
"epoch": 0.9492642118723045,
"grad_norm": 20.734375,
"learning_rate": 1.483230688652495e-07,
"loss": 15.3169,
"step": 32850
},
{
"epoch": 0.9507090584657174,
"grad_norm": 30.40625,
"learning_rate": 1.485488269609348e-07,
"loss": 15.1193,
"step": 32900
},
{
"epoch": 0.9521539050591303,
"grad_norm": 18.96875,
"learning_rate": 1.4877458505662014e-07,
"loss": 15.1477,
"step": 32950
},
{
"epoch": 0.9535987516525433,
"grad_norm": 22.125,
"learning_rate": 1.4900034315230544e-07,
"loss": 15.1363,
"step": 33000
},
{
"epoch": 0.9535987516525433,
"eval_loss": 1.9005507230758667,
"eval_runtime": 343.9939,
"eval_samples_per_second": 2710.903,
"eval_steps_per_second": 42.358,
"step": 33000
},
{
"epoch": 0.9550435982459562,
"grad_norm": 20.171875,
"learning_rate": 1.4922610124799077e-07,
"loss": 15.2704,
"step": 33050
},
{
"epoch": 0.9564884448393692,
"grad_norm": 25.21875,
"learning_rate": 1.4945185934367605e-07,
"loss": 15.2413,
"step": 33100
},
{
"epoch": 0.9579332914327822,
"grad_norm": 20.984375,
"learning_rate": 1.4967761743936135e-07,
"loss": 15.0358,
"step": 33150
},
{
"epoch": 0.9593781380261951,
"grad_norm": 22.65625,
"learning_rate": 1.4990337553504668e-07,
"loss": 15.148,
"step": 33200
},
{
"epoch": 0.960822984619608,
"grad_norm": 24.671875,
"learning_rate": 1.5012913363073198e-07,
"loss": 15.0575,
"step": 33250
},
{
"epoch": 0.962267831213021,
"grad_norm": 21.28125,
"learning_rate": 1.503548917264173e-07,
"loss": 15.1119,
"step": 33300
},
{
"epoch": 0.9637126778064339,
"grad_norm": 24.21875,
"learning_rate": 1.505806498221026e-07,
"loss": 15.21,
"step": 33350
},
{
"epoch": 0.9651575243998468,
"grad_norm": 21.6875,
"learning_rate": 1.5080640791778791e-07,
"loss": 15.1355,
"step": 33400
},
{
"epoch": 0.9666023709932597,
"grad_norm": 24.390625,
"learning_rate": 1.5103216601347324e-07,
"loss": 15.2218,
"step": 33450
},
{
"epoch": 0.9680472175866728,
"grad_norm": 19.25,
"learning_rate": 1.5125792410915854e-07,
"loss": 15.1256,
"step": 33500
},
{
"epoch": 0.9694920641800857,
"grad_norm": 19.984375,
"learning_rate": 1.5148368220484387e-07,
"loss": 15.1171,
"step": 33550
},
{
"epoch": 0.9709369107734986,
"grad_norm": 19.640625,
"learning_rate": 1.5170944030052918e-07,
"loss": 15.0999,
"step": 33600
},
{
"epoch": 0.9723817573669116,
"grad_norm": 24.265625,
"learning_rate": 1.519351983962145e-07,
"loss": 15.2255,
"step": 33650
},
{
"epoch": 0.9738266039603245,
"grad_norm": 25.546875,
"learning_rate": 1.521609564918998e-07,
"loss": 15.0743,
"step": 33700
},
{
"epoch": 0.9752714505537374,
"grad_norm": 21.578125,
"learning_rate": 1.5238671458758508e-07,
"loss": 15.145,
"step": 33750
},
{
"epoch": 0.9767162971471504,
"grad_norm": 24.46875,
"learning_rate": 1.526124726832704e-07,
"loss": 15.2408,
"step": 33800
},
{
"epoch": 0.9781611437405634,
"grad_norm": 21.984375,
"learning_rate": 1.5283823077895571e-07,
"loss": 15.1413,
"step": 33850
},
{
"epoch": 0.9796059903339763,
"grad_norm": 21.828125,
"learning_rate": 1.5306398887464104e-07,
"loss": 15.1452,
"step": 33900
},
{
"epoch": 0.9810508369273893,
"grad_norm": 22.125,
"learning_rate": 1.5328974697032635e-07,
"loss": 15.1786,
"step": 33950
},
{
"epoch": 0.9824956835208022,
"grad_norm": 27.046875,
"learning_rate": 1.5351550506601165e-07,
"loss": 15.069,
"step": 34000
},
{
"epoch": 0.9839405301142151,
"grad_norm": 21.65625,
"learning_rate": 1.5374126316169698e-07,
"loss": 15.1776,
"step": 34050
},
{
"epoch": 0.985385376707628,
"grad_norm": 21.953125,
"learning_rate": 1.5396702125738228e-07,
"loss": 15.1561,
"step": 34100
},
{
"epoch": 0.986830223301041,
"grad_norm": 25.75,
"learning_rate": 1.541927793530676e-07,
"loss": 15.2242,
"step": 34150
},
{
"epoch": 0.9882750698944539,
"grad_norm": 23.484375,
"learning_rate": 1.544185374487529e-07,
"loss": 15.1583,
"step": 34200
},
{
"epoch": 0.989719916487867,
"grad_norm": 27.984375,
"learning_rate": 1.546442955444382e-07,
"loss": 15.1159,
"step": 34250
},
{
"epoch": 0.9911647630812799,
"grad_norm": 21.34375,
"learning_rate": 1.5487005364012354e-07,
"loss": 15.0641,
"step": 34300
},
{
"epoch": 0.9926096096746928,
"grad_norm": 20.25,
"learning_rate": 1.5509581173580884e-07,
"loss": 15.139,
"step": 34350
},
{
"epoch": 0.9940544562681057,
"grad_norm": 21.640625,
"learning_rate": 1.5532156983149415e-07,
"loss": 15.0966,
"step": 34400
},
{
"epoch": 0.9954993028615187,
"grad_norm": 22.125,
"learning_rate": 1.5554732792717945e-07,
"loss": 15.1072,
"step": 34450
},
{
"epoch": 0.9969441494549316,
"grad_norm": 21.859375,
"learning_rate": 1.5577308602286478e-07,
"loss": 15.1392,
"step": 34500
},
{
"epoch": 0.9983889960483445,
"grad_norm": 21.703125,
"learning_rate": 1.5599884411855008e-07,
"loss": 15.2097,
"step": 34550
},
{
"epoch": 0.9998338426417576,
"grad_norm": 23.140625,
"learning_rate": 1.5622460221423538e-07,
"loss": 15.0667,
"step": 34600
}
],
"logging_steps": 50,
"max_steps": 34605,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.042588809533587e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}