{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 978,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002044989775051125,
"grad_norm": 2.8995674216310645,
"learning_rate": 9.999974203447434e-06,
"loss": 0.097,
"step": 1
},
{
"epoch": 0.00408997955010225,
"grad_norm": 2.0878590818900724,
"learning_rate": 9.999896814055916e-06,
"loss": 0.0793,
"step": 2
},
{
"epoch": 0.006134969325153374,
"grad_norm": 3.252004392065858,
"learning_rate": 9.999767832624e-06,
"loss": 0.1446,
"step": 3
},
{
"epoch": 0.0081799591002045,
"grad_norm": 2.1719657405930333,
"learning_rate": 9.999587260482597e-06,
"loss": 0.0606,
"step": 4
},
{
"epoch": 0.010224948875255624,
"grad_norm": 1.5951295112804458,
"learning_rate": 9.999355099494961e-06,
"loss": 0.0543,
"step": 5
},
{
"epoch": 0.012269938650306749,
"grad_norm": 2.0082268910826957,
"learning_rate": 9.999071352056676e-06,
"loss": 0.0752,
"step": 6
},
{
"epoch": 0.014314928425357873,
"grad_norm": 1.9536326911273243,
"learning_rate": 9.998736021095621e-06,
"loss": 0.0453,
"step": 7
},
{
"epoch": 0.016359918200409,
"grad_norm": 2.13634714300749,
"learning_rate": 9.99834911007195e-06,
"loss": 0.0732,
"step": 8
},
{
"epoch": 0.018404907975460124,
"grad_norm": 1.920732150945499,
"learning_rate": 9.99791062297805e-06,
"loss": 0.0541,
"step": 9
},
{
"epoch": 0.02044989775051125,
"grad_norm": 2.1324187216203034,
"learning_rate": 9.99742056433851e-06,
"loss": 0.0549,
"step": 10
},
{
"epoch": 0.022494887525562373,
"grad_norm": 2.919114524687416,
"learning_rate": 9.99687893921005e-06,
"loss": 0.0895,
"step": 11
},
{
"epoch": 0.024539877300613498,
"grad_norm": 1.899625115074746,
"learning_rate": 9.996285753181499e-06,
"loss": 0.0589,
"step": 12
},
{
"epoch": 0.026584867075664622,
"grad_norm": 2.5554509832362973,
"learning_rate": 9.99564101237372e-06,
"loss": 0.0785,
"step": 13
},
{
"epoch": 0.028629856850715747,
"grad_norm": 2.4318482065803666,
"learning_rate": 9.994944723439546e-06,
"loss": 0.0784,
"step": 14
},
{
"epoch": 0.03067484662576687,
"grad_norm": 3.583468004202154,
"learning_rate": 9.994196893563722e-06,
"loss": 0.1125,
"step": 15
},
{
"epoch": 0.032719836400818,
"grad_norm": 1.4181812641718199,
"learning_rate": 9.993397530462818e-06,
"loss": 0.0397,
"step": 16
},
{
"epoch": 0.034764826175869123,
"grad_norm": 1.8010048779280416,
"learning_rate": 9.99254664238516e-06,
"loss": 0.0575,
"step": 17
},
{
"epoch": 0.03680981595092025,
"grad_norm": 2.1503927037059385,
"learning_rate": 9.991644238110741e-06,
"loss": 0.0665,
"step": 18
},
{
"epoch": 0.03885480572597137,
"grad_norm": 1.8100049883218121,
"learning_rate": 9.990690326951126e-06,
"loss": 0.0682,
"step": 19
},
{
"epoch": 0.0408997955010225,
"grad_norm": 2.3966939056398266,
"learning_rate": 9.989684918749365e-06,
"loss": 0.0846,
"step": 20
},
{
"epoch": 0.04294478527607362,
"grad_norm": 1.918166279143656,
"learning_rate": 9.988628023879883e-06,
"loss": 0.0668,
"step": 21
},
{
"epoch": 0.044989775051124746,
"grad_norm": 1.7912977784419148,
"learning_rate": 9.98751965324838e-06,
"loss": 0.0729,
"step": 22
},
{
"epoch": 0.04703476482617587,
"grad_norm": 1.9098490695074073,
"learning_rate": 9.986359818291706e-06,
"loss": 0.0733,
"step": 23
},
{
"epoch": 0.049079754601226995,
"grad_norm": 2.2200718894862805,
"learning_rate": 9.985148530977767e-06,
"loss": 0.0723,
"step": 24
},
{
"epoch": 0.05112474437627812,
"grad_norm": 1.8085849304791404,
"learning_rate": 9.983885803805373e-06,
"loss": 0.0713,
"step": 25
},
{
"epoch": 0.053169734151329244,
"grad_norm": 2.5900909947296507,
"learning_rate": 9.982571649804126e-06,
"loss": 0.0805,
"step": 26
},
{
"epoch": 0.05521472392638037,
"grad_norm": 2.557173352737123,
"learning_rate": 9.981206082534287e-06,
"loss": 0.0849,
"step": 27
},
{
"epoch": 0.05725971370143149,
"grad_norm": 2.3095562915819503,
"learning_rate": 9.979789116086625e-06,
"loss": 0.0848,
"step": 28
},
{
"epoch": 0.05930470347648262,
"grad_norm": 1.652313462404793,
"learning_rate": 9.97832076508228e-06,
"loss": 0.057,
"step": 29
},
{
"epoch": 0.06134969325153374,
"grad_norm": 3.3750373556197752,
"learning_rate": 9.976801044672608e-06,
"loss": 0.1154,
"step": 30
},
{
"epoch": 0.06339468302658487,
"grad_norm": 2.7053744260152803,
"learning_rate": 9.97522997053903e-06,
"loss": 0.0841,
"step": 31
},
{
"epoch": 0.065439672801636,
"grad_norm": 2.1510005490299497,
"learning_rate": 9.973607558892864e-06,
"loss": 0.0732,
"step": 32
},
{
"epoch": 0.06748466257668712,
"grad_norm": 2.1823073488659324,
"learning_rate": 9.971933826475162e-06,
"loss": 0.0776,
"step": 33
},
{
"epoch": 0.06952965235173825,
"grad_norm": 2.0539979554320817,
"learning_rate": 9.970208790556531e-06,
"loss": 0.0688,
"step": 34
},
{
"epoch": 0.07157464212678936,
"grad_norm": 1.6876619685011311,
"learning_rate": 9.968432468936967e-06,
"loss": 0.0608,
"step": 35
},
{
"epoch": 0.0736196319018405,
"grad_norm": 3.0575087238752805,
"learning_rate": 9.966604879945659e-06,
"loss": 0.12,
"step": 36
},
{
"epoch": 0.07566462167689161,
"grad_norm": 2.414478148852492,
"learning_rate": 9.964726042440802e-06,
"loss": 0.0958,
"step": 37
},
{
"epoch": 0.07770961145194274,
"grad_norm": 2.173225061106067,
"learning_rate": 9.962795975809411e-06,
"loss": 0.084,
"step": 38
},
{
"epoch": 0.07975460122699386,
"grad_norm": 2.040698856807742,
"learning_rate": 9.960814699967112e-06,
"loss": 0.0794,
"step": 39
},
{
"epoch": 0.081799591002045,
"grad_norm": 2.249606373953477,
"learning_rate": 9.958782235357938e-06,
"loss": 0.0951,
"step": 40
},
{
"epoch": 0.08384458077709611,
"grad_norm": 2.5979000902419895,
"learning_rate": 9.956698602954124e-06,
"loss": 0.1029,
"step": 41
},
{
"epoch": 0.08588957055214724,
"grad_norm": 2.1602269446719644,
"learning_rate": 9.954563824255879e-06,
"loss": 0.0901,
"step": 42
},
{
"epoch": 0.08793456032719836,
"grad_norm": 1.8153325069101112,
"learning_rate": 9.952377921291179e-06,
"loss": 0.0623,
"step": 43
},
{
"epoch": 0.08997955010224949,
"grad_norm": 2.7967114830172615,
"learning_rate": 9.950140916615526e-06,
"loss": 0.1192,
"step": 44
},
{
"epoch": 0.09202453987730061,
"grad_norm": 2.0707153248622827,
"learning_rate": 9.947852833311725e-06,
"loss": 0.0846,
"step": 45
},
{
"epoch": 0.09406952965235174,
"grad_norm": 2.1452757583479474,
"learning_rate": 9.94551369498964e-06,
"loss": 0.0875,
"step": 46
},
{
"epoch": 0.09611451942740286,
"grad_norm": 2.3194318990073923,
"learning_rate": 9.943123525785952e-06,
"loss": 0.0921,
"step": 47
},
{
"epoch": 0.09815950920245399,
"grad_norm": 1.798820349857878,
"learning_rate": 9.940682350363913e-06,
"loss": 0.0592,
"step": 48
},
{
"epoch": 0.10020449897750511,
"grad_norm": 1.8591670519797276,
"learning_rate": 9.938190193913084e-06,
"loss": 0.0757,
"step": 49
},
{
"epoch": 0.10224948875255624,
"grad_norm": 1.8617586001231685,
"learning_rate": 9.935647082149088e-06,
"loss": 0.0677,
"step": 50
},
{
"epoch": 0.10429447852760736,
"grad_norm": 2.402863095839252,
"learning_rate": 9.933053041313325e-06,
"loss": 0.0873,
"step": 51
},
{
"epoch": 0.10633946830265849,
"grad_norm": 2.2249519855906756,
"learning_rate": 9.930408098172725e-06,
"loss": 0.0912,
"step": 52
},
{
"epoch": 0.1083844580777096,
"grad_norm": 2.1251826013803323,
"learning_rate": 9.92771228001945e-06,
"loss": 0.076,
"step": 53
},
{
"epoch": 0.11042944785276074,
"grad_norm": 1.9764253903583366,
"learning_rate": 9.924965614670629e-06,
"loss": 0.0784,
"step": 54
},
{
"epoch": 0.11247443762781185,
"grad_norm": 1.8078917942604569,
"learning_rate": 9.92216813046806e-06,
"loss": 0.0667,
"step": 55
},
{
"epoch": 0.11451942740286299,
"grad_norm": 2.5631523625105372,
"learning_rate": 9.919319856277921e-06,
"loss": 0.1003,
"step": 56
},
{
"epoch": 0.1165644171779141,
"grad_norm": 2.066653670325792,
"learning_rate": 9.916420821490474e-06,
"loss": 0.0756,
"step": 57
},
{
"epoch": 0.11860940695296524,
"grad_norm": 2.5780966602305693,
"learning_rate": 9.91347105601976e-06,
"loss": 0.0984,
"step": 58
},
{
"epoch": 0.12065439672801637,
"grad_norm": 2.219344023968354,
"learning_rate": 9.910470590303294e-06,
"loss": 0.0789,
"step": 59
},
{
"epoch": 0.12269938650306748,
"grad_norm": 2.642779106386566,
"learning_rate": 9.90741945530174e-06,
"loss": 0.078,
"step": 60
},
{
"epoch": 0.12474437627811862,
"grad_norm": 1.8439341873720778,
"learning_rate": 9.904317682498609e-06,
"loss": 0.0725,
"step": 61
},
{
"epoch": 0.12678936605316973,
"grad_norm": 2.1976218170570783,
"learning_rate": 9.901165303899916e-06,
"loss": 0.1094,
"step": 62
},
{
"epoch": 0.12883435582822086,
"grad_norm": 2.4577264483674166,
"learning_rate": 9.89796235203386e-06,
"loss": 0.0922,
"step": 63
},
{
"epoch": 0.130879345603272,
"grad_norm": 3.012519841445848,
"learning_rate": 9.89470885995049e-06,
"loss": 0.1109,
"step": 64
},
{
"epoch": 0.1329243353783231,
"grad_norm": 2.248540711936193,
"learning_rate": 9.891404861221356e-06,
"loss": 0.0892,
"step": 65
},
{
"epoch": 0.13496932515337423,
"grad_norm": 2.3347058109208825,
"learning_rate": 9.888050389939172e-06,
"loss": 0.0851,
"step": 66
},
{
"epoch": 0.13701431492842536,
"grad_norm": 2.460632130242845,
"learning_rate": 9.884645480717452e-06,
"loss": 0.0967,
"step": 67
},
{
"epoch": 0.1390593047034765,
"grad_norm": 1.8587061916271175,
"learning_rate": 9.881190168690164e-06,
"loss": 0.0661,
"step": 68
},
{
"epoch": 0.1411042944785276,
"grad_norm": 2.813362612221172,
"learning_rate": 9.877684489511367e-06,
"loss": 0.1079,
"step": 69
},
{
"epoch": 0.14314928425357873,
"grad_norm": 2.7724880857855085,
"learning_rate": 9.874128479354833e-06,
"loss": 0.0865,
"step": 70
},
{
"epoch": 0.14519427402862986,
"grad_norm": 2.0084192749000223,
"learning_rate": 9.870522174913683e-06,
"loss": 0.0811,
"step": 71
},
{
"epoch": 0.147239263803681,
"grad_norm": 1.901062419637755,
"learning_rate": 9.866865613400008e-06,
"loss": 0.0834,
"step": 72
},
{
"epoch": 0.1492842535787321,
"grad_norm": 2.143697771944517,
"learning_rate": 9.863158832544477e-06,
"loss": 0.0967,
"step": 73
},
{
"epoch": 0.15132924335378323,
"grad_norm": 1.8252931029432322,
"learning_rate": 9.859401870595959e-06,
"loss": 0.0725,
"step": 74
},
{
"epoch": 0.15337423312883436,
"grad_norm": 1.9307281956151774,
"learning_rate": 9.855594766321122e-06,
"loss": 0.077,
"step": 75
},
{
"epoch": 0.1554192229038855,
"grad_norm": 2.2429643925966993,
"learning_rate": 9.85173755900403e-06,
"loss": 0.0891,
"step": 76
},
{
"epoch": 0.1574642126789366,
"grad_norm": 1.8200761545917128,
"learning_rate": 9.847830288445745e-06,
"loss": 0.0785,
"step": 77
},
{
"epoch": 0.15950920245398773,
"grad_norm": 1.8916674815016423,
"learning_rate": 9.843872994963912e-06,
"loss": 0.0755,
"step": 78
},
{
"epoch": 0.16155419222903886,
"grad_norm": 2.0741375008009655,
"learning_rate": 9.83986571939234e-06,
"loss": 0.0744,
"step": 79
},
{
"epoch": 0.16359918200409,
"grad_norm": 1.7919605782077757,
"learning_rate": 9.835808503080586e-06,
"loss": 0.0757,
"step": 80
},
{
"epoch": 0.1656441717791411,
"grad_norm": 1.950729934719885,
"learning_rate": 9.831701387893533e-06,
"loss": 0.0815,
"step": 81
},
{
"epoch": 0.16768916155419222,
"grad_norm": 2.124785118083205,
"learning_rate": 9.82754441621094e-06,
"loss": 0.0807,
"step": 82
},
{
"epoch": 0.16973415132924335,
"grad_norm": 2.053195322602257,
"learning_rate": 9.823337630927027e-06,
"loss": 0.0902,
"step": 83
},
{
"epoch": 0.17177914110429449,
"grad_norm": 2.5090758861647826,
"learning_rate": 9.819081075450014e-06,
"loss": 0.0873,
"step": 84
},
{
"epoch": 0.1738241308793456,
"grad_norm": 2.137957503401185,
"learning_rate": 9.814774793701686e-06,
"loss": 0.092,
"step": 85
},
{
"epoch": 0.17586912065439672,
"grad_norm": 2.230490758825473,
"learning_rate": 9.810418830116933e-06,
"loss": 0.0833,
"step": 86
},
{
"epoch": 0.17791411042944785,
"grad_norm": 2.012709266353046,
"learning_rate": 9.80601322964329e-06,
"loss": 0.0877,
"step": 87
},
{
"epoch": 0.17995910020449898,
"grad_norm": 2.572752374501912,
"learning_rate": 9.80155803774048e-06,
"loss": 0.1141,
"step": 88
},
{
"epoch": 0.18200408997955012,
"grad_norm": 1.5628909847161165,
"learning_rate": 9.797053300379938e-06,
"loss": 0.0672,
"step": 89
},
{
"epoch": 0.18404907975460122,
"grad_norm": 1.8013050781356985,
"learning_rate": 9.792499064044343e-06,
"loss": 0.0804,
"step": 90
},
{
"epoch": 0.18609406952965235,
"grad_norm": 2.128417350277261,
"learning_rate": 9.787895375727137e-06,
"loss": 0.0903,
"step": 91
},
{
"epoch": 0.18813905930470348,
"grad_norm": 2.6231742831814255,
"learning_rate": 9.783242282932028e-06,
"loss": 0.0991,
"step": 92
},
{
"epoch": 0.1901840490797546,
"grad_norm": 2.14671431766684,
"learning_rate": 9.778539833672525e-06,
"loss": 0.0844,
"step": 93
},
{
"epoch": 0.19222903885480572,
"grad_norm": 1.668300942440577,
"learning_rate": 9.773788076471415e-06,
"loss": 0.0677,
"step": 94
},
{
"epoch": 0.19427402862985685,
"grad_norm": 1.6611049562639426,
"learning_rate": 9.76898706036028e-06,
"loss": 0.0815,
"step": 95
},
{
"epoch": 0.19631901840490798,
"grad_norm": 1.7467281372812702,
"learning_rate": 9.764136834878987e-06,
"loss": 0.0802,
"step": 96
},
{
"epoch": 0.1983640081799591,
"grad_norm": 2.0082876640493525,
"learning_rate": 9.759237450075174e-06,
"loss": 0.0845,
"step": 97
},
{
"epoch": 0.20040899795501022,
"grad_norm": 1.6218133242260213,
"learning_rate": 9.754288956503737e-06,
"loss": 0.0792,
"step": 98
},
{
"epoch": 0.20245398773006135,
"grad_norm": 1.8693374042253028,
"learning_rate": 9.749291405226304e-06,
"loss": 0.089,
"step": 99
},
{
"epoch": 0.20449897750511248,
"grad_norm": 2.3402858038101337,
"learning_rate": 9.744244847810716e-06,
"loss": 0.0945,
"step": 100
},
{
"epoch": 0.2065439672801636,
"grad_norm": 2.400216651654056,
"learning_rate": 9.739149336330482e-06,
"loss": 0.0994,
"step": 101
},
{
"epoch": 0.2085889570552147,
"grad_norm": 1.9932426008301034,
"learning_rate": 9.734004923364258e-06,
"loss": 0.0813,
"step": 102
},
{
"epoch": 0.21063394683026584,
"grad_norm": 1.8232352554241547,
"learning_rate": 9.728811661995287e-06,
"loss": 0.0833,
"step": 103
},
{
"epoch": 0.21267893660531698,
"grad_norm": 1.774918510432305,
"learning_rate": 9.72356960581087e-06,
"loss": 0.0853,
"step": 104
},
{
"epoch": 0.2147239263803681,
"grad_norm": 2.987329389159815,
"learning_rate": 9.718278808901797e-06,
"loss": 0.1114,
"step": 105
},
{
"epoch": 0.2167689161554192,
"grad_norm": 2.248351378515216,
"learning_rate": 9.712939325861794e-06,
"loss": 0.0826,
"step": 106
},
{
"epoch": 0.21881390593047034,
"grad_norm": 2.218767795388457,
"learning_rate": 9.707551211786966e-06,
"loss": 0.088,
"step": 107
},
{
"epoch": 0.22085889570552147,
"grad_norm": 2.3431433008509917,
"learning_rate": 9.702114522275216e-06,
"loss": 0.0897,
"step": 108
},
{
"epoch": 0.2229038854805726,
"grad_norm": 1.9166897788167856,
"learning_rate": 9.696629313425688e-06,
"loss": 0.088,
"step": 109
},
{
"epoch": 0.2249488752556237,
"grad_norm": 1.9440115291462636,
"learning_rate": 9.691095641838168e-06,
"loss": 0.0836,
"step": 110
},
{
"epoch": 0.22699386503067484,
"grad_norm": 1.813961610317634,
"learning_rate": 9.685513564612521e-06,
"loss": 0.078,
"step": 111
},
{
"epoch": 0.22903885480572597,
"grad_norm": 1.8809059426216883,
"learning_rate": 9.679883139348082e-06,
"loss": 0.0821,
"step": 112
},
{
"epoch": 0.2310838445807771,
"grad_norm": 2.2311254705001233,
"learning_rate": 9.674204424143079e-06,
"loss": 0.0883,
"step": 113
},
{
"epoch": 0.2331288343558282,
"grad_norm": 1.9295136215801372,
"learning_rate": 9.668477477594021e-06,
"loss": 0.0833,
"step": 114
},
{
"epoch": 0.23517382413087934,
"grad_norm": 1.8615614639144564,
"learning_rate": 9.662702358795098e-06,
"loss": 0.0822,
"step": 115
},
{
"epoch": 0.23721881390593047,
"grad_norm": 1.8761973618596817,
"learning_rate": 9.656879127337571e-06,
"loss": 0.0785,
"step": 116
},
{
"epoch": 0.2392638036809816,
"grad_norm": 2.017270471451727,
"learning_rate": 9.651007843309164e-06,
"loss": 0.0878,
"step": 117
},
{
"epoch": 0.24130879345603273,
"grad_norm": 2.1414773647169936,
"learning_rate": 9.645088567293426e-06,
"loss": 0.0932,
"step": 118
},
{
"epoch": 0.24335378323108384,
"grad_norm": 1.7284124634354323,
"learning_rate": 9.639121360369127e-06,
"loss": 0.0683,
"step": 119
},
{
"epoch": 0.24539877300613497,
"grad_norm": 2.3422614186852577,
"learning_rate": 9.633106284109612e-06,
"loss": 0.1061,
"step": 120
},
{
"epoch": 0.2474437627811861,
"grad_norm": 1.9680728218006462,
"learning_rate": 9.627043400582173e-06,
"loss": 0.0832,
"step": 121
},
{
"epoch": 0.24948875255623723,
"grad_norm": 1.744621659832594,
"learning_rate": 9.620932772347408e-06,
"loss": 0.0716,
"step": 122
},
{
"epoch": 0.25153374233128833,
"grad_norm": 2.003659281799268,
"learning_rate": 9.614774462458573e-06,
"loss": 0.0943,
"step": 123
},
{
"epoch": 0.25357873210633947,
"grad_norm": 1.9112829391643362,
"learning_rate": 9.608568534460938e-06,
"loss": 0.0791,
"step": 124
},
{
"epoch": 0.2556237218813906,
"grad_norm": 1.6018069748701698,
"learning_rate": 9.602315052391116e-06,
"loss": 0.0699,
"step": 125
},
{
"epoch": 0.25766871165644173,
"grad_norm": 1.9898564316497316,
"learning_rate": 9.596014080776424e-06,
"loss": 0.0868,
"step": 126
},
{
"epoch": 0.25971370143149286,
"grad_norm": 1.9062653706577775,
"learning_rate": 9.589665684634197e-06,
"loss": 0.0797,
"step": 127
},
{
"epoch": 0.261758691206544,
"grad_norm": 2.105685404483493,
"learning_rate": 9.583269929471129e-06,
"loss": 0.0802,
"step": 128
},
{
"epoch": 0.26380368098159507,
"grad_norm": 1.8889444529306618,
"learning_rate": 9.576826881282595e-06,
"loss": 0.0773,
"step": 129
},
{
"epoch": 0.2658486707566462,
"grad_norm": 1.89509366954467,
"learning_rate": 9.570336606551966e-06,
"loss": 0.0845,
"step": 130
},
{
"epoch": 0.26789366053169733,
"grad_norm": 2.5730619597875792,
"learning_rate": 9.56379917224993e-06,
"loss": 0.1218,
"step": 131
},
{
"epoch": 0.26993865030674846,
"grad_norm": 3.174335117295452,
"learning_rate": 9.557214645833792e-06,
"loss": 0.1396,
"step": 132
},
{
"epoch": 0.2719836400817996,
"grad_norm": 1.506901278245754,
"learning_rate": 9.550583095246786e-06,
"loss": 0.0631,
"step": 133
},
{
"epoch": 0.2740286298568507,
"grad_norm": 2.3300783174234887,
"learning_rate": 9.543904588917366e-06,
"loss": 0.109,
"step": 134
},
{
"epoch": 0.27607361963190186,
"grad_norm": 1.8554323699407922,
"learning_rate": 9.537179195758513e-06,
"loss": 0.0746,
"step": 135
},
{
"epoch": 0.278118609406953,
"grad_norm": 1.4907022435447066,
"learning_rate": 9.530406985167005e-06,
"loss": 0.0712,
"step": 136
},
{
"epoch": 0.28016359918200406,
"grad_norm": 1.7196544870819945,
"learning_rate": 9.523588027022721e-06,
"loss": 0.075,
"step": 137
},
{
"epoch": 0.2822085889570552,
"grad_norm": 1.7344914939658451,
"learning_rate": 9.516722391687903e-06,
"loss": 0.0856,
"step": 138
},
{
"epoch": 0.2842535787321063,
"grad_norm": 2.1773597101038087,
"learning_rate": 9.50981015000644e-06,
"loss": 0.0929,
"step": 139
},
{
"epoch": 0.28629856850715746,
"grad_norm": 2.0166181602910376,
"learning_rate": 9.502851373303137e-06,
"loss": 0.0892,
"step": 140
},
{
"epoch": 0.2883435582822086,
"grad_norm": 2.0996295005016483,
"learning_rate": 9.495846133382973e-06,
"loss": 0.085,
"step": 141
},
{
"epoch": 0.2903885480572597,
"grad_norm": 2.09058564013836,
"learning_rate": 9.488794502530361e-06,
"loss": 0.0872,
"step": 142
},
{
"epoch": 0.29243353783231085,
"grad_norm": 1.8321276625056864,
"learning_rate": 9.481696553508411e-06,
"loss": 0.0927,
"step": 143
},
{
"epoch": 0.294478527607362,
"grad_norm": 1.918438250366742,
"learning_rate": 9.474552359558167e-06,
"loss": 0.0744,
"step": 144
},
{
"epoch": 0.2965235173824131,
"grad_norm": 2.327981634380635,
"learning_rate": 9.46736199439786e-06,
"loss": 0.1025,
"step": 145
},
{
"epoch": 0.2985685071574642,
"grad_norm": 2.2135170524903995,
"learning_rate": 9.460125532222142e-06,
"loss": 0.09,
"step": 146
},
{
"epoch": 0.3006134969325153,
"grad_norm": 2.2539230814408073,
"learning_rate": 9.452843047701324e-06,
"loss": 0.1023,
"step": 147
},
{
"epoch": 0.30265848670756645,
"grad_norm": 2.104687258049424,
"learning_rate": 9.445514615980604e-06,
"loss": 0.0905,
"step": 148
},
{
"epoch": 0.3047034764826176,
"grad_norm": 1.7372025147408934,
"learning_rate": 9.438140312679292e-06,
"loss": 0.0849,
"step": 149
},
{
"epoch": 0.3067484662576687,
"grad_norm": 2.0671665965859662,
"learning_rate": 9.43072021389003e-06,
"loss": 0.0924,
"step": 150
},
{
"epoch": 0.30879345603271985,
"grad_norm": 1.6350351491282862,
"learning_rate": 9.423254396178003e-06,
"loss": 0.0769,
"step": 151
},
{
"epoch": 0.310838445807771,
"grad_norm": 2.878396608282762,
"learning_rate": 9.415742936580156e-06,
"loss": 0.1538,
"step": 152
},
{
"epoch": 0.3128834355828221,
"grad_norm": 1.4213578692087034,
"learning_rate": 9.408185912604395e-06,
"loss": 0.065,
"step": 153
},
{
"epoch": 0.3149284253578732,
"grad_norm": 2.0855996921354,
"learning_rate": 9.400583402228785e-06,
"loss": 0.0844,
"step": 154
},
{
"epoch": 0.3169734151329243,
"grad_norm": 1.7352864078553754,
"learning_rate": 9.39293548390075e-06,
"loss": 0.0853,
"step": 155
},
{
"epoch": 0.31901840490797545,
"grad_norm": 1.334038745461943,
"learning_rate": 9.385242236536259e-06,
"loss": 0.0656,
"step": 156
},
{
"epoch": 0.3210633946830266,
"grad_norm": 2.174575475791565,
"learning_rate": 9.377503739519019e-06,
"loss": 0.0991,
"step": 157
},
{
"epoch": 0.3231083844580777,
"grad_norm": 1.6357643314755432,
"learning_rate": 9.369720072699648e-06,
"loss": 0.0792,
"step": 158
},
{
"epoch": 0.32515337423312884,
"grad_norm": 2.316934261247635,
"learning_rate": 9.36189131639485e-06,
"loss": 0.1112,
"step": 159
},
{
"epoch": 0.32719836400818,
"grad_norm": 1.9234234290855614,
"learning_rate": 9.354017551386599e-06,
"loss": 0.0851,
"step": 160
},
{
"epoch": 0.3292433537832311,
"grad_norm": 2.475496525507223,
"learning_rate": 9.346098858921292e-06,
"loss": 0.1062,
"step": 161
},
{
"epoch": 0.3312883435582822,
"grad_norm": 2.3268380138649487,
"learning_rate": 9.338135320708912e-06,
"loss": 0.1035,
"step": 162
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.5336893905703746,
"learning_rate": 9.330127018922195e-06,
"loss": 0.0702,
"step": 163
},
{
"epoch": 0.33537832310838445,
"grad_norm": 2.8082604544179035,
"learning_rate": 9.32207403619577e-06,
"loss": 0.1209,
"step": 164
},
{
"epoch": 0.3374233128834356,
"grad_norm": 1.5750634984249117,
"learning_rate": 9.313976455625316e-06,
"loss": 0.0713,
"step": 165
},
{
"epoch": 0.3394683026584867,
"grad_norm": 2.2373522766525262,
"learning_rate": 9.305834360766695e-06,
"loss": 0.0969,
"step": 166
},
{
"epoch": 0.34151329243353784,
"grad_norm": 2.342451381996767,
"learning_rate": 9.297647835635102e-06,
"loss": 0.0934,
"step": 167
},
{
"epoch": 0.34355828220858897,
"grad_norm": 1.936610520437153,
"learning_rate": 9.289416964704186e-06,
"loss": 0.0883,
"step": 168
},
{
"epoch": 0.3456032719836401,
"grad_norm": 1.8338353993342575,
"learning_rate": 9.281141832905185e-06,
"loss": 0.0778,
"step": 169
},
{
"epoch": 0.3476482617586912,
"grad_norm": 1.9110066741814127,
"learning_rate": 9.272822525626047e-06,
"loss": 0.0735,
"step": 170
},
{
"epoch": 0.3496932515337423,
"grad_norm": 2.179479069452803,
"learning_rate": 9.26445912871055e-06,
"loss": 0.0843,
"step": 171
},
{
"epoch": 0.35173824130879344,
"grad_norm": 1.9177594380676963,
"learning_rate": 9.25605172845742e-06,
"loss": 0.0805,
"step": 172
},
{
"epoch": 0.3537832310838446,
"grad_norm": 2.1882619443952684,
"learning_rate": 9.247600411619434e-06,
"loss": 0.0965,
"step": 173
},
{
"epoch": 0.3558282208588957,
"grad_norm": 2.2176075779513824,
"learning_rate": 9.239105265402525e-06,
"loss": 0.0974,
"step": 174
},
{
"epoch": 0.35787321063394684,
"grad_norm": 1.5074567124767815,
"learning_rate": 9.23056637746489e-06,
"loss": 0.0735,
"step": 175
},
{
"epoch": 0.35991820040899797,
"grad_norm": 2.060069998365139,
"learning_rate": 9.221983835916074e-06,
"loss": 0.1022,
"step": 176
},
{
"epoch": 0.3619631901840491,
"grad_norm": 2.1165212064315235,
"learning_rate": 9.213357729316077e-06,
"loss": 0.0995,
"step": 177
},
{
"epoch": 0.36400817995910023,
"grad_norm": 2.1868849806726787,
"learning_rate": 9.204688146674418e-06,
"loss": 0.0939,
"step": 178
},
{
"epoch": 0.3660531697341513,
"grad_norm": 1.7544924490641574,
"learning_rate": 9.195975177449238e-06,
"loss": 0.0873,
"step": 179
},
{
"epoch": 0.36809815950920244,
"grad_norm": 1.838768964795654,
"learning_rate": 9.187218911546363e-06,
"loss": 0.0864,
"step": 180
},
{
"epoch": 0.37014314928425357,
"grad_norm": 1.9536263850909072,
"learning_rate": 9.178419439318382e-06,
"loss": 0.0828,
"step": 181
},
{
"epoch": 0.3721881390593047,
"grad_norm": 1.8125655303827894,
"learning_rate": 9.169576851563715e-06,
"loss": 0.0707,
"step": 182
},
{
"epoch": 0.37423312883435583,
"grad_norm": 1.5346489369821823,
"learning_rate": 9.160691239525675e-06,
"loss": 0.0707,
"step": 183
},
{
"epoch": 0.37627811860940696,
"grad_norm": 2.0774049712635745,
"learning_rate": 9.151762694891522e-06,
"loss": 0.0892,
"step": 184
},
{
"epoch": 0.3783231083844581,
"grad_norm": 1.6068313703103427,
"learning_rate": 9.142791309791528e-06,
"loss": 0.0737,
"step": 185
},
{
"epoch": 0.3803680981595092,
"grad_norm": 2.491559077597992,
"learning_rate": 9.133777176798013e-06,
"loss": 0.1063,
"step": 186
},
{
"epoch": 0.3824130879345603,
"grad_norm": 1.936364688582553,
"learning_rate": 9.124720388924403e-06,
"loss": 0.0879,
"step": 187
},
{
"epoch": 0.38445807770961143,
"grad_norm": 1.7501246261711056,
"learning_rate": 9.115621039624256e-06,
"loss": 0.0831,
"step": 188
},
{
"epoch": 0.38650306748466257,
"grad_norm": 1.9375047463204769,
"learning_rate": 9.106479222790312e-06,
"loss": 0.0798,
"step": 189
},
{
"epoch": 0.3885480572597137,
"grad_norm": 1.9799704235731947,
"learning_rate": 9.09729503275351e-06,
"loss": 0.0818,
"step": 190
},
{
"epoch": 0.39059304703476483,
"grad_norm": 2.1027233151637046,
"learning_rate": 9.08806856428203e-06,
"loss": 0.0737,
"step": 191
},
{
"epoch": 0.39263803680981596,
"grad_norm": 2.2130274217863377,
"learning_rate": 9.078799912580305e-06,
"loss": 0.1049,
"step": 192
},
{
"epoch": 0.3946830265848671,
"grad_norm": 1.8596492941083875,
"learning_rate": 9.069489173288037e-06,
"loss": 0.0788,
"step": 193
},
{
"epoch": 0.3967280163599182,
"grad_norm": 1.8220962906735956,
"learning_rate": 9.060136442479215e-06,
"loss": 0.0789,
"step": 194
},
{
"epoch": 0.3987730061349693,
"grad_norm": 2.1684932411419773,
"learning_rate": 9.050741816661128e-06,
"loss": 0.1101,
"step": 195
},
{
"epoch": 0.40081799591002043,
"grad_norm": 2.2585167924890674,
"learning_rate": 9.041305392773355e-06,
"loss": 0.0899,
"step": 196
},
{
"epoch": 0.40286298568507156,
"grad_norm": 2.2529963379779514,
"learning_rate": 9.03182726818678e-06,
"loss": 0.1001,
"step": 197
},
{
"epoch": 0.4049079754601227,
"grad_norm": 2.019146584665829,
"learning_rate": 9.022307540702576e-06,
"loss": 0.0889,
"step": 198
},
{
"epoch": 0.4069529652351738,
"grad_norm": 2.0147227938530214,
"learning_rate": 9.012746308551208e-06,
"loss": 0.0779,
"step": 199
},
{
"epoch": 0.40899795501022496,
"grad_norm": 1.6785890661043144,
"learning_rate": 9.003143670391403e-06,
"loss": 0.0714,
"step": 200
},
{
"epoch": 0.40899795501022496,
"eval_loss": 0.09443490207195282,
"eval_runtime": 1.6107,
"eval_samples_per_second": 24.835,
"eval_steps_per_second": 6.209,
"step": 200
},
{
"epoch": 0.4110429447852761,
"grad_norm": 1.7907653453087733,
"learning_rate": 8.993499725309148e-06,
"loss": 0.0644,
"step": 201
},
{
"epoch": 0.4130879345603272,
"grad_norm": 2.0499291659974572,
"learning_rate": 8.983814572816656e-06,
"loss": 0.0764,
"step": 202
},
{
"epoch": 0.41513292433537835,
"grad_norm": 2.027050105104232,
"learning_rate": 8.974088312851346e-06,
"loss": 0.0896,
"step": 203
},
{
"epoch": 0.4171779141104294,
"grad_norm": 1.8185300386254655,
"learning_rate": 8.964321045774808e-06,
"loss": 0.0904,
"step": 204
},
{
"epoch": 0.41922290388548056,
"grad_norm": 1.8351321980331647,
"learning_rate": 8.954512872371768e-06,
"loss": 0.0798,
"step": 205
},
{
"epoch": 0.4212678936605317,
"grad_norm": 2.2777878812250734,
"learning_rate": 8.944663893849053e-06,
"loss": 0.094,
"step": 206
},
{
"epoch": 0.4233128834355828,
"grad_norm": 2.078616561352449,
"learning_rate": 8.934774211834538e-06,
"loss": 0.097,
"step": 207
},
{
"epoch": 0.42535787321063395,
"grad_norm": 1.5026879665719408,
"learning_rate": 8.924843928376105e-06,
"loss": 0.0667,
"step": 208
},
{
"epoch": 0.4274028629856851,
"grad_norm": 2.031373760012224,
"learning_rate": 8.914873145940585e-06,
"loss": 0.0983,
"step": 209
},
{
"epoch": 0.4294478527607362,
"grad_norm": 1.7750919975425428,
"learning_rate": 8.904861967412702e-06,
"loss": 0.0832,
"step": 210
},
{
"epoch": 0.43149284253578735,
"grad_norm": 1.6859653025880537,
"learning_rate": 8.894810496094016e-06,
"loss": 0.0739,
"step": 211
},
{
"epoch": 0.4335378323108384,
"grad_norm": 2.4773597386512374,
"learning_rate": 8.88471883570185e-06,
"loss": 0.104,
"step": 212
},
{
"epoch": 0.43558282208588955,
"grad_norm": 1.7481062215506529,
"learning_rate": 8.874587090368221e-06,
"loss": 0.0685,
"step": 213
},
{
"epoch": 0.4376278118609407,
"grad_norm": 1.8687306127676215,
"learning_rate": 8.86441536463877e-06,
"loss": 0.0812,
"step": 214
},
{
"epoch": 0.4396728016359918,
"grad_norm": 2.7660751966702515,
"learning_rate": 8.85420376347168e-06,
"loss": 0.1228,
"step": 215
},
{
"epoch": 0.44171779141104295,
"grad_norm": 2.008073359861921,
"learning_rate": 8.843952392236595e-06,
"loss": 0.092,
"step": 216
},
{
"epoch": 0.4437627811860941,
"grad_norm": 1.9689667185293374,
"learning_rate": 8.833661356713528e-06,
"loss": 0.0918,
"step": 217
},
{
"epoch": 0.4458077709611452,
"grad_norm": 2.0550779883515844,
"learning_rate": 8.823330763091775e-06,
"loss": 0.0842,
"step": 218
},
{
"epoch": 0.44785276073619634,
"grad_norm": 2.1458614538975316,
"learning_rate": 8.81296071796882e-06,
"loss": 0.0955,
"step": 219
},
{
"epoch": 0.4498977505112474,
"grad_norm": 2.0801721508502173,
"learning_rate": 8.802551328349222e-06,
"loss": 0.0696,
"step": 220
},
{
"epoch": 0.45194274028629855,
"grad_norm": 1.6170897770649597,
"learning_rate": 8.792102701643532e-06,
"loss": 0.074,
"step": 221
},
{
"epoch": 0.4539877300613497,
"grad_norm": 1.6010742203809665,
"learning_rate": 8.78161494566717e-06,
"loss": 0.068,
"step": 222
},
{
"epoch": 0.4560327198364008,
"grad_norm": 1.8263013055696211,
"learning_rate": 8.771088168639312e-06,
"loss": 0.0785,
"step": 223
},
{
"epoch": 0.45807770961145194,
"grad_norm": 1.8074234496570727,
"learning_rate": 8.760522479181784e-06,
"loss": 0.0843,
"step": 224
},
{
"epoch": 0.4601226993865031,
"grad_norm": 1.9423241552319763,
"learning_rate": 8.74991798631793e-06,
"loss": 0.0902,
"step": 225
},
{
"epoch": 0.4621676891615542,
"grad_norm": 2.426636585412464,
"learning_rate": 8.739274799471492e-06,
"loss": 0.1147,
"step": 226
},
{
"epoch": 0.46421267893660534,
"grad_norm": 1.8764452830009553,
"learning_rate": 8.728593028465481e-06,
"loss": 0.088,
"step": 227
},
{
"epoch": 0.4662576687116564,
"grad_norm": 1.8742190983636138,
"learning_rate": 8.717872783521048e-06,
"loss": 0.0919,
"step": 228
},
{
"epoch": 0.46830265848670755,
"grad_norm": 1.9812429967202114,
"learning_rate": 8.707114175256335e-06,
"loss": 0.1032,
"step": 229
},
{
"epoch": 0.4703476482617587,
"grad_norm": 1.5710292326402762,
"learning_rate": 8.696317314685342e-06,
"loss": 0.0735,
"step": 230
},
{
"epoch": 0.4723926380368098,
"grad_norm": 2.135568048299338,
"learning_rate": 8.685482313216784e-06,
"loss": 0.1003,
"step": 231
},
{
"epoch": 0.47443762781186094,
"grad_norm": 1.8410190133874755,
"learning_rate": 8.674609282652936e-06,
"loss": 0.0805,
"step": 232
},
{
"epoch": 0.47648261758691207,
"grad_norm": 1.95093910503971,
"learning_rate": 8.663698335188477e-06,
"loss": 0.0799,
"step": 233
},
{
"epoch": 0.4785276073619632,
"grad_norm": 2.0656801774088582,
"learning_rate": 8.65274958340934e-06,
"loss": 0.0953,
"step": 234
},
{
"epoch": 0.48057259713701433,
"grad_norm": 1.7872037593524146,
"learning_rate": 8.641763140291546e-06,
"loss": 0.0702,
"step": 235
},
{
"epoch": 0.48261758691206547,
"grad_norm": 2.0351005102773634,
"learning_rate": 8.630739119200035e-06,
"loss": 0.0828,
"step": 236
},
{
"epoch": 0.48466257668711654,
"grad_norm": 1.966029733326491,
"learning_rate": 8.61967763388751e-06,
"loss": 0.0887,
"step": 237
},
{
"epoch": 0.4867075664621677,
"grad_norm": 2.2496225787645714,
"learning_rate": 8.608578798493237e-06,
"loss": 0.0921,
"step": 238
},
{
"epoch": 0.4887525562372188,
"grad_norm": 2.3703828414232935,
"learning_rate": 8.597442727541898e-06,
"loss": 0.1055,
"step": 239
},
{
"epoch": 0.49079754601226994,
"grad_norm": 2.072283129147399,
"learning_rate": 8.586269535942386e-06,
"loss": 0.096,
"step": 240
},
{
"epoch": 0.49284253578732107,
"grad_norm": 1.763736942283961,
"learning_rate": 8.575059338986632e-06,
"loss": 0.0851,
"step": 241
},
{
"epoch": 0.4948875255623722,
"grad_norm": 1.9418651840022931,
"learning_rate": 8.563812252348412e-06,
"loss": 0.0817,
"step": 242
},
{
"epoch": 0.49693251533742333,
"grad_norm": 1.4038177877319757,
"learning_rate": 8.552528392082147e-06,
"loss": 0.0692,
"step": 243
},
{
"epoch": 0.49897750511247446,
"grad_norm": 2.2775569689795225,
"learning_rate": 8.541207874621718e-06,
"loss": 0.1092,
"step": 244
},
{
"epoch": 0.5010224948875256,
"grad_norm": 2.5534087713100955,
"learning_rate": 8.529850816779252e-06,
"loss": 0.1033,
"step": 245
},
{
"epoch": 0.5030674846625767,
"grad_norm": 1.531811934175557,
"learning_rate": 8.518457335743927e-06,
"loss": 0.0761,
"step": 246
},
{
"epoch": 0.5051124744376279,
"grad_norm": 2.3960006081387974,
"learning_rate": 8.507027549080753e-06,
"loss": 0.0941,
"step": 247
},
{
"epoch": 0.5071574642126789,
"grad_norm": 2.245296156491926,
"learning_rate": 8.49556157472937e-06,
"loss": 0.0992,
"step": 248
},
{
"epoch": 0.50920245398773,
"grad_norm": 2.1662992544835467,
"learning_rate": 8.484059531002822e-06,
"loss": 0.1096,
"step": 249
},
{
"epoch": 0.5112474437627812,
"grad_norm": 1.9378805133589119,
"learning_rate": 8.472521536586336e-06,
"loss": 0.0884,
"step": 250
},
{
"epoch": 0.5132924335378323,
"grad_norm": 1.7472804645413123,
"learning_rate": 8.460947710536108e-06,
"loss": 0.0881,
"step": 251
},
{
"epoch": 0.5153374233128835,
"grad_norm": 1.8567960096830705,
"learning_rate": 8.44933817227806e-06,
"loss": 0.1041,
"step": 252
},
{
"epoch": 0.5173824130879345,
"grad_norm": 1.6639705835205088,
"learning_rate": 8.437693041606619e-06,
"loss": 0.0767,
"step": 253
},
{
"epoch": 0.5194274028629857,
"grad_norm": 1.7811045494491748,
"learning_rate": 8.426012438683472e-06,
"loss": 0.0795,
"step": 254
},
{
"epoch": 0.5214723926380368,
"grad_norm": 2.601937087112271,
"learning_rate": 8.41429648403634e-06,
"loss": 0.1157,
"step": 255
},
{
"epoch": 0.523517382413088,
"grad_norm": 2.2629417508652896,
"learning_rate": 8.402545298557712e-06,
"loss": 0.0965,
"step": 256
},
{
"epoch": 0.5255623721881391,
"grad_norm": 1.6219382198043681,
"learning_rate": 8.390759003503624e-06,
"loss": 0.0804,
"step": 257
},
{
"epoch": 0.5276073619631901,
"grad_norm": 1.6735037903910355,
"learning_rate": 8.378937720492384e-06,
"loss": 0.0708,
"step": 258
},
{
"epoch": 0.5296523517382413,
"grad_norm": 1.6949968905732045,
"learning_rate": 8.367081571503332e-06,
"loss": 0.0796,
"step": 259
},
{
"epoch": 0.5316973415132924,
"grad_norm": 1.5829034537038222,
"learning_rate": 8.355190678875577e-06,
"loss": 0.0685,
"step": 260
},
{
"epoch": 0.5337423312883436,
"grad_norm": 2.1474520860458814,
"learning_rate": 8.343265165306736e-06,
"loss": 0.0966,
"step": 261
},
{
"epoch": 0.5357873210633947,
"grad_norm": 2.685259620414307,
"learning_rate": 8.331305153851659e-06,
"loss": 0.1199,
"step": 262
},
{
"epoch": 0.5378323108384458,
"grad_norm": 1.5378328527936944,
"learning_rate": 8.319310767921174e-06,
"loss": 0.0746,
"step": 263
},
{
"epoch": 0.5398773006134969,
"grad_norm": 1.5728870201255574,
"learning_rate": 8.307282131280805e-06,
"loss": 0.0794,
"step": 264
},
{
"epoch": 0.5419222903885481,
"grad_norm": 1.9037474406992847,
"learning_rate": 8.295219368049494e-06,
"loss": 0.0831,
"step": 265
},
{
"epoch": 0.5439672801635992,
"grad_norm": 1.8713169547943331,
"learning_rate": 8.283122602698324e-06,
"loss": 0.0866,
"step": 266
},
{
"epoch": 0.5460122699386503,
"grad_norm": 2.0187272804624032,
"learning_rate": 8.270991960049231e-06,
"loss": 0.0953,
"step": 267
},
{
"epoch": 0.5480572597137015,
"grad_norm": 2.3890714658865857,
"learning_rate": 8.258827565273717e-06,
"loss": 0.0993,
"step": 268
},
{
"epoch": 0.5501022494887525,
"grad_norm": 1.4224265522394863,
"learning_rate": 8.24662954389157e-06,
"loss": 0.0685,
"step": 269
},
{
"epoch": 0.5521472392638037,
"grad_norm": 1.8253908241082366,
"learning_rate": 8.234398021769541e-06,
"loss": 0.0859,
"step": 270
},
{
"epoch": 0.5541922290388548,
"grad_norm": 1.8297687093456312,
"learning_rate": 8.222133125120076e-06,
"loss": 0.0842,
"step": 271
},
{
"epoch": 0.556237218813906,
"grad_norm": 1.7325614091536314,
"learning_rate": 8.209834980499995e-06,
"loss": 0.0664,
"step": 272
},
{
"epoch": 0.558282208588957,
"grad_norm": 1.8426658391443724,
"learning_rate": 8.19750371480919e-06,
"loss": 0.0823,
"step": 273
},
{
"epoch": 0.5603271983640081,
"grad_norm": 2.335513659237072,
"learning_rate": 8.185139455289322e-06,
"loss": 0.1004,
"step": 274
},
{
"epoch": 0.5623721881390593,
"grad_norm": 2.281382949923011,
"learning_rate": 8.172742329522493e-06,
"loss": 0.0923,
"step": 275
},
{
"epoch": 0.5644171779141104,
"grad_norm": 2.0875496660986586,
"learning_rate": 8.160312465429952e-06,
"loss": 0.1007,
"step": 276
},
{
"epoch": 0.5664621676891616,
"grad_norm": 1.6706016356250908,
"learning_rate": 8.147849991270753e-06,
"loss": 0.0749,
"step": 277
},
{
"epoch": 0.5685071574642127,
"grad_norm": 2.3348044470325586,
"learning_rate": 8.135355035640445e-06,
"loss": 0.1075,
"step": 278
},
{
"epoch": 0.5705521472392638,
"grad_norm": 1.9325325555725485,
"learning_rate": 8.122827727469737e-06,
"loss": 0.0847,
"step": 279
},
{
"epoch": 0.5725971370143149,
"grad_norm": 2.06473154517661,
"learning_rate": 8.110268196023179e-06,
"loss": 0.0923,
"step": 280
},
{
"epoch": 0.5746421267893661,
"grad_norm": 1.7347784233467545,
"learning_rate": 8.097676570897814e-06,
"loss": 0.0767,
"step": 281
},
{
"epoch": 0.5766871165644172,
"grad_norm": 1.7284531347044014,
"learning_rate": 8.085052982021849e-06,
"loss": 0.0822,
"step": 282
},
{
"epoch": 0.5787321063394683,
"grad_norm": 2.0234039627173863,
"learning_rate": 8.072397559653314e-06,
"loss": 0.0903,
"step": 283
},
{
"epoch": 0.5807770961145194,
"grad_norm": 1.8567076129812703,
"learning_rate": 8.059710434378717e-06,
"loss": 0.0829,
"step": 284
},
{
"epoch": 0.5828220858895705,
"grad_norm": 1.8280706428554012,
"learning_rate": 8.046991737111696e-06,
"loss": 0.0846,
"step": 285
},
{
"epoch": 0.5848670756646217,
"grad_norm": 1.6827693552674245,
"learning_rate": 8.034241599091666e-06,
"loss": 0.0744,
"step": 286
},
{
"epoch": 0.5869120654396728,
"grad_norm": 1.4276933688240632,
"learning_rate": 8.021460151882472e-06,
"loss": 0.0644,
"step": 287
},
{
"epoch": 0.588957055214724,
"grad_norm": 1.7054089254136917,
"learning_rate": 8.008647527371022e-06,
"loss": 0.0691,
"step": 288
},
{
"epoch": 0.591002044989775,
"grad_norm": 2.3943112344962616,
"learning_rate": 7.995803857765934e-06,
"loss": 0.1105,
"step": 289
},
{
"epoch": 0.5930470347648262,
"grad_norm": 2.025612566291375,
"learning_rate": 7.982929275596164e-06,
"loss": 0.0936,
"step": 290
},
{
"epoch": 0.5950920245398773,
"grad_norm": 2.0696844237753984,
"learning_rate": 7.970023913709652e-06,
"loss": 0.0916,
"step": 291
},
{
"epoch": 0.5971370143149284,
"grad_norm": 2.1125496705836184,
"learning_rate": 7.957087905271934e-06,
"loss": 0.0812,
"step": 292
},
{
"epoch": 0.5991820040899796,
"grad_norm": 1.9111826855162881,
"learning_rate": 7.944121383764775e-06,
"loss": 0.0878,
"step": 293
},
{
"epoch": 0.6012269938650306,
"grad_norm": 2.0166887475359507,
"learning_rate": 7.931124482984802e-06,
"loss": 0.088,
"step": 294
},
{
"epoch": 0.6032719836400818,
"grad_norm": 2.4597183492348145,
"learning_rate": 7.918097337042106e-06,
"loss": 0.1066,
"step": 295
},
{
"epoch": 0.6053169734151329,
"grad_norm": 1.7705184105320022,
"learning_rate": 7.905040080358869e-06,
"loss": 0.0784,
"step": 296
},
{
"epoch": 0.6073619631901841,
"grad_norm": 1.7246778829446732,
"learning_rate": 7.891952847667973e-06,
"loss": 0.0777,
"step": 297
},
{
"epoch": 0.6094069529652352,
"grad_norm": 2.1760471200028593,
"learning_rate": 7.878835774011615e-06,
"loss": 0.0983,
"step": 298
},
{
"epoch": 0.6114519427402862,
"grad_norm": 2.1592710885226327,
"learning_rate": 7.865688994739907e-06,
"loss": 0.0996,
"step": 299
},
{
"epoch": 0.6134969325153374,
"grad_norm": 1.7446253812062307,
"learning_rate": 7.85251264550948e-06,
"loss": 0.0767,
"step": 300
},
{
"epoch": 0.6155419222903885,
"grad_norm": 2.784714583612841,
"learning_rate": 7.83930686228209e-06,
"loss": 0.0871,
"step": 301
},
{
"epoch": 0.6175869120654397,
"grad_norm": 1.923087819950953,
"learning_rate": 7.826071781323208e-06,
"loss": 0.076,
"step": 302
},
{
"epoch": 0.6196319018404908,
"grad_norm": 1.78632914754461,
"learning_rate": 7.812807539200622e-06,
"loss": 0.0778,
"step": 303
},
{
"epoch": 0.621676891615542,
"grad_norm": 1.9376192118205642,
"learning_rate": 7.799514272783014e-06,
"loss": 0.0817,
"step": 304
},
{
"epoch": 0.623721881390593,
"grad_norm": 2.550158615394769,
"learning_rate": 7.786192119238568e-06,
"loss": 0.1057,
"step": 305
},
{
"epoch": 0.6257668711656442,
"grad_norm": 1.9711665467023245,
"learning_rate": 7.772841216033534e-06,
"loss": 0.0764,
"step": 306
},
{
"epoch": 0.6278118609406953,
"grad_norm": 1.5340501908307014,
"learning_rate": 7.759461700930824e-06,
"loss": 0.0637,
"step": 307
},
{
"epoch": 0.6298568507157464,
"grad_norm": 2.2338456267605005,
"learning_rate": 7.746053711988584e-06,
"loss": 0.1059,
"step": 308
},
{
"epoch": 0.6319018404907976,
"grad_norm": 1.7891397758115173,
"learning_rate": 7.732617387558769e-06,
"loss": 0.0824,
"step": 309
},
{
"epoch": 0.6339468302658486,
"grad_norm": 2.1234757737848287,
"learning_rate": 7.719152866285722e-06,
"loss": 0.0885,
"step": 310
},
{
"epoch": 0.6359918200408998,
"grad_norm": 2.4102510823654457,
"learning_rate": 7.70566028710473e-06,
"loss": 0.0996,
"step": 311
},
{
"epoch": 0.6380368098159509,
"grad_norm": 1.9375735772859437,
"learning_rate": 7.692139789240611e-06,
"loss": 0.091,
"step": 312
},
{
"epoch": 0.6400817995910021,
"grad_norm": 2.0158092912142824,
"learning_rate": 7.678591512206254e-06,
"loss": 0.088,
"step": 313
},
{
"epoch": 0.6421267893660532,
"grad_norm": 1.6480327933319945,
"learning_rate": 7.665015595801198e-06,
"loss": 0.0791,
"step": 314
},
{
"epoch": 0.6441717791411042,
"grad_norm": 1.8510030476483572,
"learning_rate": 7.651412180110176e-06,
"loss": 0.085,
"step": 315
},
{
"epoch": 0.6462167689161554,
"grad_norm": 1.592679706462086,
"learning_rate": 7.637781405501682e-06,
"loss": 0.0719,
"step": 316
},
{
"epoch": 0.6482617586912065,
"grad_norm": 1.871195454539005,
"learning_rate": 7.6241234126265115e-06,
"loss": 0.0935,
"step": 317
},
{
"epoch": 0.6503067484662577,
"grad_norm": 2.1635066751175978,
"learning_rate": 7.61043834241632e-06,
"loss": 0.0887,
"step": 318
},
{
"epoch": 0.6523517382413088,
"grad_norm": 1.7458256267250807,
"learning_rate": 7.596726336082158e-06,
"loss": 0.0784,
"step": 319
},
{
"epoch": 0.65439672801636,
"grad_norm": 1.9970410164681027,
"learning_rate": 7.5829875351130224e-06,
"loss": 0.0825,
"step": 320
},
{
"epoch": 0.656441717791411,
"grad_norm": 1.8581711995026613,
"learning_rate": 7.569222081274396e-06,
"loss": 0.074,
"step": 321
},
{
"epoch": 0.6584867075664622,
"grad_norm": 1.5023298192040886,
"learning_rate": 7.555430116606778e-06,
"loss": 0.0707,
"step": 322
},
{
"epoch": 0.6605316973415133,
"grad_norm": 1.9742828072984793,
"learning_rate": 7.5416117834242254e-06,
"loss": 0.0839,
"step": 323
},
{
"epoch": 0.6625766871165644,
"grad_norm": 1.7579407302668417,
"learning_rate": 7.527767224312883e-06,
"loss": 0.0802,
"step": 324
},
{
"epoch": 0.6646216768916156,
"grad_norm": 1.7128227508559022,
"learning_rate": 7.513896582129507e-06,
"loss": 0.0745,
"step": 325
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.9293198934120017,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0856,
"step": 326
},
{
"epoch": 0.6687116564417178,
"grad_norm": 2.0925311155648703,
"learning_rate": 7.4860776213179264e-06,
"loss": 0.0839,
"step": 327
},
{
"epoch": 0.6707566462167689,
"grad_norm": 2.082947312061181,
"learning_rate": 7.472129589743034e-06,
"loss": 0.0844,
"step": 328
},
{
"epoch": 0.6728016359918201,
"grad_norm": 2.0524639760050127,
"learning_rate": 7.458156049199775e-06,
"loss": 0.1008,
"step": 329
},
{
"epoch": 0.6748466257668712,
"grad_norm": 1.8254793507601215,
"learning_rate": 7.44415714387582e-06,
"loss": 0.0692,
"step": 330
},
{
"epoch": 0.6768916155419223,
"grad_norm": 1.9185120612100472,
"learning_rate": 7.430133018220567e-06,
"loss": 0.0902,
"step": 331
},
{
"epoch": 0.6789366053169734,
"grad_norm": 1.5528728788442376,
"learning_rate": 7.416083816943653e-06,
"loss": 0.0681,
"step": 332
},
{
"epoch": 0.6809815950920245,
"grad_norm": 1.8960655345457742,
"learning_rate": 7.4020096850134635e-06,
"loss": 0.0862,
"step": 333
},
{
"epoch": 0.6830265848670757,
"grad_norm": 1.8164525363712967,
"learning_rate": 7.38791076765563e-06,
"loss": 0.08,
"step": 334
},
{
"epoch": 0.6850715746421268,
"grad_norm": 1.8489841001332317,
"learning_rate": 7.37378721035154e-06,
"loss": 0.0863,
"step": 335
},
{
"epoch": 0.6871165644171779,
"grad_norm": 1.9227410779505356,
"learning_rate": 7.359639158836828e-06,
"loss": 0.0797,
"step": 336
},
{
"epoch": 0.689161554192229,
"grad_norm": 2.1782307041733855,
"learning_rate": 7.345466759099875e-06,
"loss": 0.0946,
"step": 337
},
{
"epoch": 0.6912065439672802,
"grad_norm": 2.1346962188887626,
"learning_rate": 7.331270157380304e-06,
"loss": 0.0953,
"step": 338
},
{
"epoch": 0.6932515337423313,
"grad_norm": 1.759960430802437,
"learning_rate": 7.317049500167466e-06,
"loss": 0.0969,
"step": 339
},
{
"epoch": 0.6952965235173824,
"grad_norm": 2.0404870097493646,
"learning_rate": 7.302804934198937e-06,
"loss": 0.0852,
"step": 340
},
{
"epoch": 0.6973415132924335,
"grad_norm": 2.3585223108650037,
"learning_rate": 7.28853660645899e-06,
"loss": 0.1054,
"step": 341
},
{
"epoch": 0.6993865030674846,
"grad_norm": 1.8518134360019116,
"learning_rate": 7.2742446641770985e-06,
"loss": 0.0942,
"step": 342
},
{
"epoch": 0.7014314928425358,
"grad_norm": 1.6802043170675642,
"learning_rate": 7.259929254826393e-06,
"loss": 0.0703,
"step": 343
},
{
"epoch": 0.7034764826175869,
"grad_norm": 2.3222003347544233,
"learning_rate": 7.2455905261221585e-06,
"loss": 0.0981,
"step": 344
},
{
"epoch": 0.7055214723926381,
"grad_norm": 1.7096656290299208,
"learning_rate": 7.231228626020303e-06,
"loss": 0.0686,
"step": 345
},
{
"epoch": 0.7075664621676891,
"grad_norm": 2.301527792978425,
"learning_rate": 7.216843702715831e-06,
"loss": 0.0806,
"step": 346
},
{
"epoch": 0.7096114519427403,
"grad_norm": 1.7573853731950437,
"learning_rate": 7.202435904641316e-06,
"loss": 0.0766,
"step": 347
},
{
"epoch": 0.7116564417177914,
"grad_norm": 1.882419627052227,
"learning_rate": 7.188005380465365e-06,
"loss": 0.0733,
"step": 348
},
{
"epoch": 0.7137014314928425,
"grad_norm": 2.470103268920824,
"learning_rate": 7.173552279091087e-06,
"loss": 0.1016,
"step": 349
},
{
"epoch": 0.7157464212678937,
"grad_norm": 1.4869158817717396,
"learning_rate": 7.159076749654559e-06,
"loss": 0.0624,
"step": 350
},
{
"epoch": 0.7177914110429447,
"grad_norm": 1.5968050085844632,
"learning_rate": 7.144578941523283e-06,
"loss": 0.0707,
"step": 351
},
{
"epoch": 0.7198364008179959,
"grad_norm": 1.6356481647041587,
"learning_rate": 7.130059004294647e-06,
"loss": 0.066,
"step": 352
},
{
"epoch": 0.721881390593047,
"grad_norm": 2.9392656768707504,
"learning_rate": 7.115517087794381e-06,
"loss": 0.1009,
"step": 353
},
{
"epoch": 0.7239263803680982,
"grad_norm": 2.2918804151158065,
"learning_rate": 7.10095334207501e-06,
"loss": 0.0962,
"step": 354
},
{
"epoch": 0.7259713701431493,
"grad_norm": 1.8475331071622312,
"learning_rate": 7.086367917414307e-06,
"loss": 0.082,
"step": 355
},
{
"epoch": 0.7280163599182005,
"grad_norm": 1.9726367085045817,
"learning_rate": 7.071760964313739e-06,
"loss": 0.0732,
"step": 356
},
{
"epoch": 0.7300613496932515,
"grad_norm": 2.1502810171764244,
"learning_rate": 7.057132633496924e-06,
"loss": 0.1049,
"step": 357
},
{
"epoch": 0.7321063394683026,
"grad_norm": 1.8592273232420053,
"learning_rate": 7.042483075908062e-06,
"loss": 0.0862,
"step": 358
},
{
"epoch": 0.7341513292433538,
"grad_norm": 2.355170511162385,
"learning_rate": 7.027812442710385e-06,
"loss": 0.0937,
"step": 359
},
{
"epoch": 0.7361963190184049,
"grad_norm": 1.6779561691380307,
"learning_rate": 7.013120885284599e-06,
"loss": 0.0675,
"step": 360
},
{
"epoch": 0.7382413087934561,
"grad_norm": 2.3918539767349762,
"learning_rate": 6.9984085552273136e-06,
"loss": 0.0964,
"step": 361
},
{
"epoch": 0.7402862985685071,
"grad_norm": 2.0029627191660087,
"learning_rate": 6.983675604349492e-06,
"loss": 0.0808,
"step": 362
},
{
"epoch": 0.7423312883435583,
"grad_norm": 2.361971189154723,
"learning_rate": 6.968922184674868e-06,
"loss": 0.0902,
"step": 363
},
{
"epoch": 0.7443762781186094,
"grad_norm": 1.7941380948957237,
"learning_rate": 6.954148448438389e-06,
"loss": 0.093,
"step": 364
},
{
"epoch": 0.7464212678936605,
"grad_norm": 1.8475776231124883,
"learning_rate": 6.9393545480846405e-06,
"loss": 0.0803,
"step": 365
},
{
"epoch": 0.7484662576687117,
"grad_norm": 1.391463463223123,
"learning_rate": 6.924540636266272e-06,
"loss": 0.0604,
"step": 366
},
{
"epoch": 0.7505112474437627,
"grad_norm": 1.4587955996368223,
"learning_rate": 6.909706865842429e-06,
"loss": 0.0707,
"step": 367
},
{
"epoch": 0.7525562372188139,
"grad_norm": 1.4497943658621633,
"learning_rate": 6.894853389877163e-06,
"loss": 0.0562,
"step": 368
},
{
"epoch": 0.754601226993865,
"grad_norm": 2.2816948101972474,
"learning_rate": 6.879980361637865e-06,
"loss": 0.0933,
"step": 369
},
{
"epoch": 0.7566462167689162,
"grad_norm": 2.2765511971102925,
"learning_rate": 6.86508793459368e-06,
"loss": 0.0799,
"step": 370
},
{
"epoch": 0.7586912065439673,
"grad_norm": 1.8489677964373195,
"learning_rate": 6.8501762624139125e-06,
"loss": 0.0828,
"step": 371
},
{
"epoch": 0.7607361963190185,
"grad_norm": 2.2599682805893244,
"learning_rate": 6.835245498966461e-06,
"loss": 0.1019,
"step": 372
},
{
"epoch": 0.7627811860940695,
"grad_norm": 1.7535048313819637,
"learning_rate": 6.820295798316214e-06,
"loss": 0.0877,
"step": 373
},
{
"epoch": 0.7648261758691206,
"grad_norm": 2.1348338096756962,
"learning_rate": 6.805327314723469e-06,
"loss": 0.0713,
"step": 374
},
{
"epoch": 0.7668711656441718,
"grad_norm": 1.471825773848477,
"learning_rate": 6.790340202642333e-06,
"loss": 0.0648,
"step": 375
},
{
"epoch": 0.7689161554192229,
"grad_norm": 1.9667987135525467,
"learning_rate": 6.775334616719136e-06,
"loss": 0.0933,
"step": 376
},
{
"epoch": 0.7709611451942741,
"grad_norm": 1.9656786527852497,
"learning_rate": 6.760310711790831e-06,
"loss": 0.0886,
"step": 377
},
{
"epoch": 0.7730061349693251,
"grad_norm": 1.7703569506269972,
"learning_rate": 6.7452686428834045e-06,
"loss": 0.0774,
"step": 378
},
{
"epoch": 0.7750511247443763,
"grad_norm": 2.247523798525931,
"learning_rate": 6.73020856521026e-06,
"loss": 0.1031,
"step": 379
},
{
"epoch": 0.7770961145194274,
"grad_norm": 1.872342874790795,
"learning_rate": 6.715130634170636e-06,
"loss": 0.0895,
"step": 380
},
{
"epoch": 0.7791411042944786,
"grad_norm": 2.070656684465323,
"learning_rate": 6.700035005347983e-06,
"loss": 0.0868,
"step": 381
},
{
"epoch": 0.7811860940695297,
"grad_norm": 2.2454799924898667,
"learning_rate": 6.6849218345083785e-06,
"loss": 0.0978,
"step": 382
},
{
"epoch": 0.7832310838445807,
"grad_norm": 1.891754000824279,
"learning_rate": 6.6697912775989045e-06,
"loss": 0.0785,
"step": 383
},
{
"epoch": 0.7852760736196319,
"grad_norm": 1.7579771296347333,
"learning_rate": 6.654643490746042e-06,
"loss": 0.0858,
"step": 384
},
{
"epoch": 0.787321063394683,
"grad_norm": 1.9641471370237964,
"learning_rate": 6.6394786302540645e-06,
"loss": 0.082,
"step": 385
},
{
"epoch": 0.7893660531697342,
"grad_norm": 1.8254808653521009,
"learning_rate": 6.624296852603419e-06,
"loss": 0.0882,
"step": 386
},
{
"epoch": 0.7914110429447853,
"grad_norm": 1.4088372814526477,
"learning_rate": 6.609098314449116e-06,
"loss": 0.0671,
"step": 387
},
{
"epoch": 0.7934560327198364,
"grad_norm": 1.9617841850743343,
"learning_rate": 6.593883172619111e-06,
"loss": 0.0933,
"step": 388
},
{
"epoch": 0.7955010224948875,
"grad_norm": 1.5767225526580313,
"learning_rate": 6.578651584112687e-06,
"loss": 0.0636,
"step": 389
},
{
"epoch": 0.7975460122699386,
"grad_norm": 2.2228834140058336,
"learning_rate": 6.563403706098833e-06,
"loss": 0.1077,
"step": 390
},
{
"epoch": 0.7995910020449898,
"grad_norm": 1.9792433955524278,
"learning_rate": 6.5481396959146225e-06,
"loss": 0.0891,
"step": 391
},
{
"epoch": 0.8016359918200409,
"grad_norm": 1.2215680463568073,
"learning_rate": 6.532859711063594e-06,
"loss": 0.0563,
"step": 392
},
{
"epoch": 0.803680981595092,
"grad_norm": 1.6824250107088006,
"learning_rate": 6.517563909214119e-06,
"loss": 0.0783,
"step": 393
},
{
"epoch": 0.8057259713701431,
"grad_norm": 1.7462647827998714,
"learning_rate": 6.502252448197782e-06,
"loss": 0.0814,
"step": 394
},
{
"epoch": 0.8077709611451943,
"grad_norm": 1.3887650073154911,
"learning_rate": 6.486925486007743e-06,
"loss": 0.0641,
"step": 395
},
{
"epoch": 0.8098159509202454,
"grad_norm": 2.0714118588443613,
"learning_rate": 6.471583180797121e-06,
"loss": 0.1055,
"step": 396
},
{
"epoch": 0.8118609406952966,
"grad_norm": 1.5588416633458682,
"learning_rate": 6.456225690877345e-06,
"loss": 0.0744,
"step": 397
},
{
"epoch": 0.8139059304703476,
"grad_norm": 1.6448175082442864,
"learning_rate": 6.440853174716535e-06,
"loss": 0.0679,
"step": 398
},
{
"epoch": 0.8159509202453987,
"grad_norm": 1.7938499571539583,
"learning_rate": 6.4254657909378615e-06,
"loss": 0.0701,
"step": 399
},
{
"epoch": 0.8179959100204499,
"grad_norm": 2.1584932014661606,
"learning_rate": 6.410063698317901e-06,
"loss": 0.0896,
"step": 400
},
{
"epoch": 0.8179959100204499,
"eval_loss": 0.08662194758653641,
"eval_runtime": 1.5943,
"eval_samples_per_second": 25.089,
"eval_steps_per_second": 6.272,
"step": 400
},
{
"epoch": 0.820040899795501,
"grad_norm": 1.6606377004284583,
"learning_rate": 6.394647055785017e-06,
"loss": 0.0699,
"step": 401
},
{
"epoch": 0.8220858895705522,
"grad_norm": 2.2914577704716113,
"learning_rate": 6.379216022417695e-06,
"loss": 0.0858,
"step": 402
},
{
"epoch": 0.8241308793456033,
"grad_norm": 1.7940636149724014,
"learning_rate": 6.363770757442927e-06,
"loss": 0.0838,
"step": 403
},
{
"epoch": 0.8261758691206544,
"grad_norm": 2.1090208330887363,
"learning_rate": 6.348311420234542e-06,
"loss": 0.0837,
"step": 404
},
{
"epoch": 0.8282208588957055,
"grad_norm": 1.761887269760676,
"learning_rate": 6.332838170311586e-06,
"loss": 0.0791,
"step": 405
},
{
"epoch": 0.8302658486707567,
"grad_norm": 2.0316688681749846,
"learning_rate": 6.31735116733666e-06,
"loss": 0.0762,
"step": 406
},
{
"epoch": 0.8323108384458078,
"grad_norm": 1.4824433767272704,
"learning_rate": 6.301850571114282e-06,
"loss": 0.0531,
"step": 407
},
{
"epoch": 0.8343558282208589,
"grad_norm": 1.9042239460112056,
"learning_rate": 6.286336541589224e-06,
"loss": 0.0685,
"step": 408
},
{
"epoch": 0.83640081799591,
"grad_norm": 1.631266470020269,
"learning_rate": 6.270809238844881e-06,
"loss": 0.0713,
"step": 409
},
{
"epoch": 0.8384458077709611,
"grad_norm": 1.8805596275114955,
"learning_rate": 6.255268823101604e-06,
"loss": 0.0751,
"step": 410
},
{
"epoch": 0.8404907975460123,
"grad_norm": 2.295370695981097,
"learning_rate": 6.239715454715054e-06,
"loss": 0.0984,
"step": 411
},
{
"epoch": 0.8425357873210634,
"grad_norm": 2.269325740615013,
"learning_rate": 6.224149294174549e-06,
"loss": 0.0966,
"step": 412
},
{
"epoch": 0.8445807770961146,
"grad_norm": 2.060132528646075,
"learning_rate": 6.208570502101393e-06,
"loss": 0.0817,
"step": 413
},
{
"epoch": 0.8466257668711656,
"grad_norm": 1.8016710838966334,
"learning_rate": 6.192979239247243e-06,
"loss": 0.0858,
"step": 414
},
{
"epoch": 0.8486707566462167,
"grad_norm": 1.9922284651178528,
"learning_rate": 6.177375666492431e-06,
"loss": 0.0735,
"step": 415
},
{
"epoch": 0.8507157464212679,
"grad_norm": 1.689681220388234,
"learning_rate": 6.161759944844308e-06,
"loss": 0.0756,
"step": 416
},
{
"epoch": 0.852760736196319,
"grad_norm": 2.618019309211191,
"learning_rate": 6.146132235435591e-06,
"loss": 0.0829,
"step": 417
},
{
"epoch": 0.8548057259713702,
"grad_norm": 2.0274624599323414,
"learning_rate": 6.1304926995226895e-06,
"loss": 0.0836,
"step": 418
},
{
"epoch": 0.8568507157464212,
"grad_norm": 2.0858291852426496,
"learning_rate": 6.114841498484049e-06,
"loss": 0.09,
"step": 419
},
{
"epoch": 0.8588957055214724,
"grad_norm": 1.656532684919004,
"learning_rate": 6.099178793818479e-06,
"loss": 0.0674,
"step": 420
},
{
"epoch": 0.8609406952965235,
"grad_norm": 1.781888769859481,
"learning_rate": 6.083504747143496e-06,
"loss": 0.0706,
"step": 421
},
{
"epoch": 0.8629856850715747,
"grad_norm": 2.2606057911008217,
"learning_rate": 6.0678195201936455e-06,
"loss": 0.0969,
"step": 422
},
{
"epoch": 0.8650306748466258,
"grad_norm": 2.3434090242083943,
"learning_rate": 6.0521232748188416e-06,
"loss": 0.1064,
"step": 423
},
{
"epoch": 0.8670756646216768,
"grad_norm": 2.064354269601007,
"learning_rate": 6.0364161729826905e-06,
"loss": 0.0896,
"step": 424
},
{
"epoch": 0.869120654396728,
"grad_norm": 1.7331387406948884,
"learning_rate": 6.020698376760824e-06,
"loss": 0.0753,
"step": 425
},
{
"epoch": 0.8711656441717791,
"grad_norm": 1.6248452960794957,
"learning_rate": 6.0049700483392256e-06,
"loss": 0.0683,
"step": 426
},
{
"epoch": 0.8732106339468303,
"grad_norm": 1.7788246413520943,
"learning_rate": 5.9892313500125545e-06,
"loss": 0.0808,
"step": 427
},
{
"epoch": 0.8752556237218814,
"grad_norm": 1.6403389067415772,
"learning_rate": 5.9734824441824745e-06,
"loss": 0.0763,
"step": 428
},
{
"epoch": 0.8773006134969326,
"grad_norm": 1.968967047123883,
"learning_rate": 5.957723493355977e-06,
"loss": 0.0946,
"step": 429
},
{
"epoch": 0.8793456032719836,
"grad_norm": 1.5050654888065231,
"learning_rate": 5.941954660143703e-06,
"loss": 0.0673,
"step": 430
},
{
"epoch": 0.8813905930470347,
"grad_norm": 1.5627708754572884,
"learning_rate": 5.926176107258265e-06,
"loss": 0.0662,
"step": 431
},
{
"epoch": 0.8834355828220859,
"grad_norm": 1.9429047212464141,
"learning_rate": 5.910387997512573e-06,
"loss": 0.0845,
"step": 432
},
{
"epoch": 0.885480572597137,
"grad_norm": 1.8862289067048144,
"learning_rate": 5.894590493818149e-06,
"loss": 0.074,
"step": 433
},
{
"epoch": 0.8875255623721882,
"grad_norm": 1.4871525287185456,
"learning_rate": 5.8787837591834415e-06,
"loss": 0.0642,
"step": 434
},
{
"epoch": 0.8895705521472392,
"grad_norm": 1.9230413221781277,
"learning_rate": 5.86296795671216e-06,
"loss": 0.0854,
"step": 435
},
{
"epoch": 0.8916155419222904,
"grad_norm": 1.8042936065902104,
"learning_rate": 5.847143249601575e-06,
"loss": 0.0733,
"step": 436
},
{
"epoch": 0.8936605316973415,
"grad_norm": 1.89659500750371,
"learning_rate": 5.831309801140841e-06,
"loss": 0.0717,
"step": 437
},
{
"epoch": 0.8957055214723927,
"grad_norm": 1.988875729296592,
"learning_rate": 5.815467774709314e-06,
"loss": 0.0901,
"step": 438
},
{
"epoch": 0.8977505112474438,
"grad_norm": 2.1651335543706365,
"learning_rate": 5.799617333774861e-06,
"loss": 0.0942,
"step": 439
},
{
"epoch": 0.8997955010224948,
"grad_norm": 1.694629036784553,
"learning_rate": 5.783758641892172e-06,
"loss": 0.0691,
"step": 440
},
{
"epoch": 0.901840490797546,
"grad_norm": 1.8724577454949232,
"learning_rate": 5.767891862701081e-06,
"loss": 0.0704,
"step": 441
},
{
"epoch": 0.9038854805725971,
"grad_norm": 2.1444156343749103,
"learning_rate": 5.7520171599248704e-06,
"loss": 0.0862,
"step": 442
},
{
"epoch": 0.9059304703476483,
"grad_norm": 1.6044981562664562,
"learning_rate": 5.73613469736858e-06,
"loss": 0.0695,
"step": 443
},
{
"epoch": 0.9079754601226994,
"grad_norm": 1.7887677604270025,
"learning_rate": 5.7202446389173225e-06,
"loss": 0.0776,
"step": 444
},
{
"epoch": 0.9100204498977505,
"grad_norm": 2.0623558286912487,
"learning_rate": 5.704347148534589e-06,
"loss": 0.0939,
"step": 445
},
{
"epoch": 0.9120654396728016,
"grad_norm": 1.7656943163705168,
"learning_rate": 5.688442390260559e-06,
"loss": 0.0699,
"step": 446
},
{
"epoch": 0.9141104294478528,
"grad_norm": 1.950808092154816,
"learning_rate": 5.672530528210405e-06,
"loss": 0.0764,
"step": 447
},
{
"epoch": 0.9161554192229039,
"grad_norm": 1.5958859437062274,
"learning_rate": 5.656611726572601e-06,
"loss": 0.0707,
"step": 448
},
{
"epoch": 0.918200408997955,
"grad_norm": 2.106375056034876,
"learning_rate": 5.640686149607228e-06,
"loss": 0.0884,
"step": 449
},
{
"epoch": 0.9202453987730062,
"grad_norm": 1.6944267542875595,
"learning_rate": 5.624753961644281e-06,
"loss": 0.0705,
"step": 450
},
{
"epoch": 0.9222903885480572,
"grad_norm": 1.7841030194649183,
"learning_rate": 5.608815327081969e-06,
"loss": 0.0765,
"step": 451
},
{
"epoch": 0.9243353783231084,
"grad_norm": 1.7712077615995716,
"learning_rate": 5.592870410385021e-06,
"loss": 0.0733,
"step": 452
},
{
"epoch": 0.9263803680981595,
"grad_norm": 2.1116933877527835,
"learning_rate": 5.57691937608299e-06,
"loss": 0.0913,
"step": 453
},
{
"epoch": 0.9284253578732107,
"grad_norm": 1.4163030121649893,
"learning_rate": 5.560962388768554e-06,
"loss": 0.0545,
"step": 454
},
{
"epoch": 0.9304703476482618,
"grad_norm": 1.810312240995325,
"learning_rate": 5.5449996130958185e-06,
"loss": 0.0754,
"step": 455
},
{
"epoch": 0.9325153374233128,
"grad_norm": 1.7804851440319986,
"learning_rate": 5.529031213778615e-06,
"loss": 0.0647,
"step": 456
},
{
"epoch": 0.934560327198364,
"grad_norm": 2.2045196131947624,
"learning_rate": 5.513057355588804e-06,
"loss": 0.0891,
"step": 457
},
{
"epoch": 0.9366053169734151,
"grad_norm": 1.9852749682627289,
"learning_rate": 5.497078203354577e-06,
"loss": 0.0775,
"step": 458
},
{
"epoch": 0.9386503067484663,
"grad_norm": 1.831470663502445,
"learning_rate": 5.481093921958749e-06,
"loss": 0.0845,
"step": 459
},
{
"epoch": 0.9406952965235174,
"grad_norm": 2.11329473791922,
"learning_rate": 5.4651046763370615e-06,
"loss": 0.0797,
"step": 460
},
{
"epoch": 0.9427402862985685,
"grad_norm": 1.936029472084334,
"learning_rate": 5.449110631476481e-06,
"loss": 0.0626,
"step": 461
},
{
"epoch": 0.9447852760736196,
"grad_norm": 2.8880649224481254,
"learning_rate": 5.433111952413496e-06,
"loss": 0.0876,
"step": 462
},
{
"epoch": 0.9468302658486708,
"grad_norm": 1.6788136591444187,
"learning_rate": 5.417108804232409e-06,
"loss": 0.0802,
"step": 463
},
{
"epoch": 0.9488752556237219,
"grad_norm": 1.7603381558531794,
"learning_rate": 5.4011013520636466e-06,
"loss": 0.0711,
"step": 464
},
{
"epoch": 0.950920245398773,
"grad_norm": 1.6546291038527539,
"learning_rate": 5.385089761082039e-06,
"loss": 0.0718,
"step": 465
},
{
"epoch": 0.9529652351738241,
"grad_norm": 1.7527461122946937,
"learning_rate": 5.3690741965051255e-06,
"loss": 0.0772,
"step": 466
},
{
"epoch": 0.9550102249488752,
"grad_norm": 2.153339872012431,
"learning_rate": 5.353054823591446e-06,
"loss": 0.0984,
"step": 467
},
{
"epoch": 0.9570552147239264,
"grad_norm": 1.663490695062259,
"learning_rate": 5.3370318076388405e-06,
"loss": 0.0719,
"step": 468
},
{
"epoch": 0.9591002044989775,
"grad_norm": 2.039791879502307,
"learning_rate": 5.3210053139827374e-06,
"loss": 0.0852,
"step": 469
},
{
"epoch": 0.9611451942740287,
"grad_norm": 1.5152660819257473,
"learning_rate": 5.304975507994453e-06,
"loss": 0.0705,
"step": 470
},
{
"epoch": 0.9631901840490797,
"grad_norm": 2.5741046076702485,
"learning_rate": 5.288942555079479e-06,
"loss": 0.0841,
"step": 471
},
{
"epoch": 0.9652351738241309,
"grad_norm": 1.9038985725819735,
"learning_rate": 5.27290662067578e-06,
"loss": 0.0852,
"step": 472
},
{
"epoch": 0.967280163599182,
"grad_norm": 2.287787910789673,
"learning_rate": 5.256867870252087e-06,
"loss": 0.0943,
"step": 473
},
{
"epoch": 0.9693251533742331,
"grad_norm": 2.001848621526479,
"learning_rate": 5.240826469306187e-06,
"loss": 0.0784,
"step": 474
},
{
"epoch": 0.9713701431492843,
"grad_norm": 2.1169709747380865,
"learning_rate": 5.224782583363215e-06,
"loss": 0.0841,
"step": 475
},
{
"epoch": 0.9734151329243353,
"grad_norm": 1.929731685506713,
"learning_rate": 5.208736377973954e-06,
"loss": 0.0749,
"step": 476
},
{
"epoch": 0.9754601226993865,
"grad_norm": 1.67776042592261,
"learning_rate": 5.1926880187131134e-06,
"loss": 0.0724,
"step": 477
},
{
"epoch": 0.9775051124744376,
"grad_norm": 2.298605193205057,
"learning_rate": 5.176637671177631e-06,
"loss": 0.1006,
"step": 478
},
{
"epoch": 0.9795501022494888,
"grad_norm": 1.6533628146778505,
"learning_rate": 5.160585500984962e-06,
"loss": 0.0646,
"step": 479
},
{
"epoch": 0.9815950920245399,
"grad_norm": 1.837841443310845,
"learning_rate": 5.144531673771364e-06,
"loss": 0.0735,
"step": 480
},
{
"epoch": 0.983640081799591,
"grad_norm": 2.0263842675819426,
"learning_rate": 5.1284763551901995e-06,
"loss": 0.0826,
"step": 481
},
{
"epoch": 0.9856850715746421,
"grad_norm": 1.7313226963602824,
"learning_rate": 5.112419710910213e-06,
"loss": 0.0672,
"step": 482
},
{
"epoch": 0.9877300613496932,
"grad_norm": 2.015542364940025,
"learning_rate": 5.096361906613836e-06,
"loss": 0.0782,
"step": 483
},
{
"epoch": 0.9897750511247444,
"grad_norm": 1.514253640731746,
"learning_rate": 5.080303107995461e-06,
"loss": 0.0737,
"step": 484
},
{
"epoch": 0.9918200408997955,
"grad_norm": 1.8395138697043611,
"learning_rate": 5.064243480759749e-06,
"loss": 0.0718,
"step": 485
},
{
"epoch": 0.9938650306748467,
"grad_norm": 1.884215459624302,
"learning_rate": 5.048183190619904e-06,
"loss": 0.0698,
"step": 486
},
{
"epoch": 0.9959100204498977,
"grad_norm": 2.146037958767005,
"learning_rate": 5.032122403295977e-06,
"loss": 0.0902,
"step": 487
},
{
"epoch": 0.9979550102249489,
"grad_norm": 2.050214345057994,
"learning_rate": 5.016061284513142e-06,
"loss": 0.0682,
"step": 488
},
{
"epoch": 1.0,
"grad_norm": 2.118455250575704,
"learning_rate": 5e-06,
"loss": 0.0774,
"step": 489
},
{
"epoch": 1.0020449897750512,
"grad_norm": 1.2462794649364155,
"learning_rate": 4.983938715486858e-06,
"loss": 0.033,
"step": 490
},
{
"epoch": 1.0040899795501022,
"grad_norm": 1.1459651928513004,
"learning_rate": 4.967877596704026e-06,
"loss": 0.0332,
"step": 491
},
{
"epoch": 1.0061349693251533,
"grad_norm": 1.2067420298473397,
"learning_rate": 4.951816809380098e-06,
"loss": 0.0286,
"step": 492
},
{
"epoch": 1.0081799591002045,
"grad_norm": 1.5512568893071932,
"learning_rate": 4.935756519240253e-06,
"loss": 0.0371,
"step": 493
},
{
"epoch": 1.0102249488752557,
"grad_norm": 1.0286360100738143,
"learning_rate": 4.919696892004539e-06,
"loss": 0.0302,
"step": 494
},
{
"epoch": 1.0122699386503067,
"grad_norm": 1.1516477911852547,
"learning_rate": 4.903638093386167e-06,
"loss": 0.0369,
"step": 495
},
{
"epoch": 1.0143149284253579,
"grad_norm": 1.245679943150789,
"learning_rate": 4.887580289089788e-06,
"loss": 0.0301,
"step": 496
},
{
"epoch": 1.016359918200409,
"grad_norm": 1.703967009754419,
"learning_rate": 4.871523644809802e-06,
"loss": 0.0466,
"step": 497
},
{
"epoch": 1.01840490797546,
"grad_norm": 1.546303848260575,
"learning_rate": 4.855468326228638e-06,
"loss": 0.0318,
"step": 498
},
{
"epoch": 1.0204498977505112,
"grad_norm": 1.251270953754372,
"learning_rate": 4.839414499015041e-06,
"loss": 0.0263,
"step": 499
},
{
"epoch": 1.0224948875255624,
"grad_norm": 1.3764180941343123,
"learning_rate": 4.82336232882237e-06,
"loss": 0.0345,
"step": 500
},
{
"epoch": 1.0245398773006136,
"grad_norm": 1.3372315078519637,
"learning_rate": 4.807311981286888e-06,
"loss": 0.0292,
"step": 501
},
{
"epoch": 1.0265848670756645,
"grad_norm": 1.544786930842698,
"learning_rate": 4.791263622026048e-06,
"loss": 0.0307,
"step": 502
},
{
"epoch": 1.0286298568507157,
"grad_norm": 1.3878895340607698,
"learning_rate": 4.775217416636786e-06,
"loss": 0.0326,
"step": 503
},
{
"epoch": 1.030674846625767,
"grad_norm": 1.6162202894342939,
"learning_rate": 4.7591735306938144e-06,
"loss": 0.0352,
"step": 504
},
{
"epoch": 1.032719836400818,
"grad_norm": 1.7805340361439255,
"learning_rate": 4.7431321297479135e-06,
"loss": 0.0372,
"step": 505
},
{
"epoch": 1.034764826175869,
"grad_norm": 1.758896549825897,
"learning_rate": 4.727093379324222e-06,
"loss": 0.0372,
"step": 506
},
{
"epoch": 1.0368098159509203,
"grad_norm": 1.431786070102582,
"learning_rate": 4.711057444920522e-06,
"loss": 0.0384,
"step": 507
},
{
"epoch": 1.0388548057259714,
"grad_norm": 1.4839513210400883,
"learning_rate": 4.6950244920055475e-06,
"loss": 0.0383,
"step": 508
},
{
"epoch": 1.0408997955010224,
"grad_norm": 1.4451001891145334,
"learning_rate": 4.678994686017263e-06,
"loss": 0.0352,
"step": 509
},
{
"epoch": 1.0429447852760736,
"grad_norm": 1.866286088536185,
"learning_rate": 4.662968192361161e-06,
"loss": 0.0395,
"step": 510
},
{
"epoch": 1.0449897750511248,
"grad_norm": 1.715211234345797,
"learning_rate": 4.646945176408555e-06,
"loss": 0.0313,
"step": 511
},
{
"epoch": 1.047034764826176,
"grad_norm": 1.466087161229122,
"learning_rate": 4.630925803494877e-06,
"loss": 0.0386,
"step": 512
},
{
"epoch": 1.049079754601227,
"grad_norm": 1.9408338143723025,
"learning_rate": 4.614910238917963e-06,
"loss": 0.042,
"step": 513
},
{
"epoch": 1.0511247443762781,
"grad_norm": 1.4266586107150059,
"learning_rate": 4.598898647936354e-06,
"loss": 0.0392,
"step": 514
},
{
"epoch": 1.0531697341513293,
"grad_norm": 1.2356672924767518,
"learning_rate": 4.582891195767591e-06,
"loss": 0.0263,
"step": 515
},
{
"epoch": 1.0552147239263803,
"grad_norm": 1.7715488824913876,
"learning_rate": 4.5668880475865074e-06,
"loss": 0.0405,
"step": 516
},
{
"epoch": 1.0572597137014315,
"grad_norm": 1.5415710251918855,
"learning_rate": 4.55088936852352e-06,
"loss": 0.0357,
"step": 517
},
{
"epoch": 1.0593047034764826,
"grad_norm": 1.4428168778927553,
"learning_rate": 4.534895323662939e-06,
"loss": 0.0317,
"step": 518
},
{
"epoch": 1.0613496932515338,
"grad_norm": 2.2250878381040953,
"learning_rate": 4.518906078041252e-06,
"loss": 0.0415,
"step": 519
},
{
"epoch": 1.0633946830265848,
"grad_norm": 1.9143713762008936,
"learning_rate": 4.502921796645424e-06,
"loss": 0.0525,
"step": 520
},
{
"epoch": 1.065439672801636,
"grad_norm": 1.3675839835879693,
"learning_rate": 4.486942644411197e-06,
"loss": 0.0308,
"step": 521
},
{
"epoch": 1.0674846625766872,
"grad_norm": 1.561173769749881,
"learning_rate": 4.4709687862213866e-06,
"loss": 0.0314,
"step": 522
},
{
"epoch": 1.0695296523517381,
"grad_norm": 1.7909630810420591,
"learning_rate": 4.455000386904185e-06,
"loss": 0.0434,
"step": 523
},
{
"epoch": 1.0715746421267893,
"grad_norm": 1.5523968118402804,
"learning_rate": 4.439037611231448e-06,
"loss": 0.0303,
"step": 524
},
{
"epoch": 1.0736196319018405,
"grad_norm": 1.4974277205131226,
"learning_rate": 4.423080623917012e-06,
"loss": 0.026,
"step": 525
},
{
"epoch": 1.0756646216768917,
"grad_norm": 1.4404941302278607,
"learning_rate": 4.40712958961498e-06,
"loss": 0.0439,
"step": 526
},
{
"epoch": 1.0777096114519427,
"grad_norm": 1.4209943648710208,
"learning_rate": 4.391184672918034e-06,
"loss": 0.033,
"step": 527
},
{
"epoch": 1.0797546012269938,
"grad_norm": 1.555014037990696,
"learning_rate": 4.3752460383557195e-06,
"loss": 0.0326,
"step": 528
},
{
"epoch": 1.081799591002045,
"grad_norm": 2.090310209970452,
"learning_rate": 4.3593138503927725e-06,
"loss": 0.0365,
"step": 529
},
{
"epoch": 1.0838445807770962,
"grad_norm": 1.4619403629973973,
"learning_rate": 4.3433882734274e-06,
"loss": 0.0317,
"step": 530
},
{
"epoch": 1.0858895705521472,
"grad_norm": 1.714317746744886,
"learning_rate": 4.327469471789597e-06,
"loss": 0.0384,
"step": 531
},
{
"epoch": 1.0879345603271984,
"grad_norm": 1.590590872486597,
"learning_rate": 4.311557609739442e-06,
"loss": 0.0259,
"step": 532
},
{
"epoch": 1.0899795501022496,
"grad_norm": 1.1141369910738055,
"learning_rate": 4.295652851465412e-06,
"loss": 0.0252,
"step": 533
},
{
"epoch": 1.0920245398773005,
"grad_norm": 1.4321694078456395,
"learning_rate": 4.27975536108268e-06,
"loss": 0.0266,
"step": 534
},
{
"epoch": 1.0940695296523517,
"grad_norm": 1.7443372340301369,
"learning_rate": 4.263865302631423e-06,
"loss": 0.04,
"step": 535
},
{
"epoch": 1.096114519427403,
"grad_norm": 1.569615321607395,
"learning_rate": 4.24798284007513e-06,
"loss": 0.0349,
"step": 536
},
{
"epoch": 1.098159509202454,
"grad_norm": 1.895924350646276,
"learning_rate": 4.2321081372989195e-06,
"loss": 0.0424,
"step": 537
},
{
"epoch": 1.100204498977505,
"grad_norm": 1.5923290036258984,
"learning_rate": 4.216241358107831e-06,
"loss": 0.0327,
"step": 538
},
{
"epoch": 1.1022494887525562,
"grad_norm": 1.6634363551936802,
"learning_rate": 4.200382666225141e-06,
"loss": 0.0486,
"step": 539
},
{
"epoch": 1.1042944785276074,
"grad_norm": 1.134137554773839,
"learning_rate": 4.184532225290687e-06,
"loss": 0.0223,
"step": 540
},
{
"epoch": 1.1063394683026584,
"grad_norm": 1.426415380744953,
"learning_rate": 4.16869019885916e-06,
"loss": 0.0312,
"step": 541
},
{
"epoch": 1.1083844580777096,
"grad_norm": 1.1710483257547355,
"learning_rate": 4.152856750398426e-06,
"loss": 0.0223,
"step": 542
},
{
"epoch": 1.1104294478527608,
"grad_norm": 1.5906880940400463,
"learning_rate": 4.137032043287841e-06,
"loss": 0.0343,
"step": 543
},
{
"epoch": 1.112474437627812,
"grad_norm": 1.4350816491421392,
"learning_rate": 4.121216240816559e-06,
"loss": 0.035,
"step": 544
},
{
"epoch": 1.114519427402863,
"grad_norm": 1.7558271509204728,
"learning_rate": 4.105409506181855e-06,
"loss": 0.0378,
"step": 545
},
{
"epoch": 1.116564417177914,
"grad_norm": 1.427323382586195,
"learning_rate": 4.089612002487428e-06,
"loss": 0.0312,
"step": 546
},
{
"epoch": 1.1186094069529653,
"grad_norm": 1.6522808692997277,
"learning_rate": 4.0738238927417354e-06,
"loss": 0.0359,
"step": 547
},
{
"epoch": 1.1206543967280163,
"grad_norm": 2.1347991414153986,
"learning_rate": 4.0580453398563005e-06,
"loss": 0.0336,
"step": 548
},
{
"epoch": 1.1226993865030674,
"grad_norm": 1.1279124304678423,
"learning_rate": 4.042276506644024e-06,
"loss": 0.0245,
"step": 549
},
{
"epoch": 1.1247443762781186,
"grad_norm": 1.1967369793368918,
"learning_rate": 4.026517555817527e-06,
"loss": 0.034,
"step": 550
},
{
"epoch": 1.1267893660531698,
"grad_norm": 1.543038828112648,
"learning_rate": 4.010768649987446e-06,
"loss": 0.0323,
"step": 551
},
{
"epoch": 1.1288343558282208,
"grad_norm": 2.005013019238116,
"learning_rate": 3.995029951660777e-06,
"loss": 0.0466,
"step": 552
},
{
"epoch": 1.130879345603272,
"grad_norm": 1.6139182636953708,
"learning_rate": 3.979301623239177e-06,
"loss": 0.0358,
"step": 553
},
{
"epoch": 1.1329243353783232,
"grad_norm": 1.3981520538108454,
"learning_rate": 3.963583827017311e-06,
"loss": 0.0377,
"step": 554
},
{
"epoch": 1.1349693251533743,
"grad_norm": 1.2494252534129648,
"learning_rate": 3.94787672518116e-06,
"loss": 0.0239,
"step": 555
},
{
"epoch": 1.1370143149284253,
"grad_norm": 2.204641878373284,
"learning_rate": 3.932180479806357e-06,
"loss": 0.0456,
"step": 556
},
{
"epoch": 1.1390593047034765,
"grad_norm": 1.565570789271408,
"learning_rate": 3.916495252856506e-06,
"loss": 0.0324,
"step": 557
},
{
"epoch": 1.1411042944785277,
"grad_norm": 1.6789711476896356,
"learning_rate": 3.900821206181521e-06,
"loss": 0.0368,
"step": 558
},
{
"epoch": 1.1431492842535786,
"grad_norm": 1.2216110777737839,
"learning_rate": 3.885158501515954e-06,
"loss": 0.0279,
"step": 559
},
{
"epoch": 1.1451942740286298,
"grad_norm": 1.3633350295226845,
"learning_rate": 3.869507300477311e-06,
"loss": 0.0328,
"step": 560
},
{
"epoch": 1.147239263803681,
"grad_norm": 1.4166924061545885,
"learning_rate": 3.853867764564409e-06,
"loss": 0.0329,
"step": 561
},
{
"epoch": 1.149284253578732,
"grad_norm": 1.6034718030946522,
"learning_rate": 3.838240055155692e-06,
"loss": 0.0334,
"step": 562
},
{
"epoch": 1.1513292433537832,
"grad_norm": 1.077423978891052,
"learning_rate": 3.8226243335075715e-06,
"loss": 0.0224,
"step": 563
},
{
"epoch": 1.1533742331288344,
"grad_norm": 1.3542581793543431,
"learning_rate": 3.8070207607527587e-06,
"loss": 0.0319,
"step": 564
},
{
"epoch": 1.1554192229038855,
"grad_norm": 1.6634879057734975,
"learning_rate": 3.7914294978986083e-06,
"loss": 0.0393,
"step": 565
},
{
"epoch": 1.1574642126789365,
"grad_norm": 1.628283419385412,
"learning_rate": 3.7758507058254547e-06,
"loss": 0.036,
"step": 566
},
{
"epoch": 1.1595092024539877,
"grad_norm": 1.5659369656664788,
"learning_rate": 3.760284545284947e-06,
"loss": 0.0277,
"step": 567
},
{
"epoch": 1.1615541922290389,
"grad_norm": 1.4921911012284916,
"learning_rate": 3.744731176898396e-06,
"loss": 0.0389,
"step": 568
},
{
"epoch": 1.16359918200409,
"grad_norm": 1.8188186488286036,
"learning_rate": 3.7291907611551197e-06,
"loss": 0.0521,
"step": 569
},
{
"epoch": 1.165644171779141,
"grad_norm": 1.7461144499706955,
"learning_rate": 3.7136634584107787e-06,
"loss": 0.0314,
"step": 570
},
{
"epoch": 1.1676891615541922,
"grad_norm": 1.5152391192451116,
"learning_rate": 3.69814942888572e-06,
"loss": 0.0395,
"step": 571
},
{
"epoch": 1.1697341513292434,
"grad_norm": 1.2278434943664795,
"learning_rate": 3.6826488326633393e-06,
"loss": 0.03,
"step": 572
},
{
"epoch": 1.1717791411042944,
"grad_norm": 1.3578782487107308,
"learning_rate": 3.6671618296884147e-06,
"loss": 0.0329,
"step": 573
},
{
"epoch": 1.1738241308793456,
"grad_norm": 1.0722909218685073,
"learning_rate": 3.6516885797654593e-06,
"loss": 0.024,
"step": 574
},
{
"epoch": 1.1758691206543967,
"grad_norm": 1.3302319522941752,
"learning_rate": 3.6362292425570754e-06,
"loss": 0.0281,
"step": 575
},
{
"epoch": 1.177914110429448,
"grad_norm": 1.574439290096253,
"learning_rate": 3.620783977582305e-06,
"loss": 0.0342,
"step": 576
},
{
"epoch": 1.179959100204499,
"grad_norm": 1.2172183727962829,
"learning_rate": 3.605352944214986e-06,
"loss": 0.026,
"step": 577
},
{
"epoch": 1.18200408997955,
"grad_norm": 1.4383648089060725,
"learning_rate": 3.5899363016821e-06,
"loss": 0.0265,
"step": 578
},
{
"epoch": 1.1840490797546013,
"grad_norm": 1.716222220393621,
"learning_rate": 3.5745342090621406e-06,
"loss": 0.0316,
"step": 579
},
{
"epoch": 1.1860940695296525,
"grad_norm": 1.3303476968967134,
"learning_rate": 3.5591468252834654e-06,
"loss": 0.0298,
"step": 580
},
{
"epoch": 1.1881390593047034,
"grad_norm": 1.226061907060063,
"learning_rate": 3.543774309122657e-06,
"loss": 0.0209,
"step": 581
},
{
"epoch": 1.1901840490797546,
"grad_norm": 1.656144699228516,
"learning_rate": 3.528416819202881e-06,
"loss": 0.0332,
"step": 582
},
{
"epoch": 1.1922290388548058,
"grad_norm": 1.5013453797233454,
"learning_rate": 3.5130745139922572e-06,
"loss": 0.0288,
"step": 583
},
{
"epoch": 1.1942740286298568,
"grad_norm": 1.9928513490408657,
"learning_rate": 3.497747551802221e-06,
"loss": 0.0521,
"step": 584
},
{
"epoch": 1.196319018404908,
"grad_norm": 1.2521586168450574,
"learning_rate": 3.4824360907858824e-06,
"loss": 0.0274,
"step": 585
},
{
"epoch": 1.1983640081799591,
"grad_norm": 1.2256691948629876,
"learning_rate": 3.467140288936407e-06,
"loss": 0.0282,
"step": 586
},
{
"epoch": 1.20040899795501,
"grad_norm": 1.6527331639972576,
"learning_rate": 3.4518603040853783e-06,
"loss": 0.0436,
"step": 587
},
{
"epoch": 1.2024539877300613,
"grad_norm": 1.368490830870422,
"learning_rate": 3.43659629390117e-06,
"loss": 0.0254,
"step": 588
},
{
"epoch": 1.2044989775051125,
"grad_norm": 1.8028951429047948,
"learning_rate": 3.421348415887315e-06,
"loss": 0.0408,
"step": 589
},
{
"epoch": 1.2065439672801637,
"grad_norm": 1.497550290975889,
"learning_rate": 3.4061168273808896e-06,
"loss": 0.0381,
"step": 590
},
{
"epoch": 1.2085889570552146,
"grad_norm": 1.6321033308692612,
"learning_rate": 3.390901685550887e-06,
"loss": 0.0383,
"step": 591
},
{
"epoch": 1.2106339468302658,
"grad_norm": 1.5159929907176852,
"learning_rate": 3.3757031473965827e-06,
"loss": 0.0304,
"step": 592
},
{
"epoch": 1.212678936605317,
"grad_norm": 2.4724108292496187,
"learning_rate": 3.360521369745937e-06,
"loss": 0.0518,
"step": 593
},
{
"epoch": 1.2147239263803682,
"grad_norm": 1.437590640293289,
"learning_rate": 3.3453565092539586e-06,
"loss": 0.0257,
"step": 594
},
{
"epoch": 1.2167689161554192,
"grad_norm": 1.169486503639217,
"learning_rate": 3.330208722401097e-06,
"loss": 0.0235,
"step": 595
},
{
"epoch": 1.2188139059304703,
"grad_norm": 1.1268592276904335,
"learning_rate": 3.315078165491622e-06,
"loss": 0.0279,
"step": 596
},
{
"epoch": 1.2208588957055215,
"grad_norm": 1.5683278793352897,
"learning_rate": 3.299964994652017e-06,
"loss": 0.0305,
"step": 597
},
{
"epoch": 1.2229038854805725,
"grad_norm": 1.9967429863243036,
"learning_rate": 3.2848693658293675e-06,
"loss": 0.0397,
"step": 598
},
{
"epoch": 1.2249488752556237,
"grad_norm": 1.4152854852084966,
"learning_rate": 3.269791434789741e-06,
"loss": 0.0256,
"step": 599
},
{
"epoch": 1.2269938650306749,
"grad_norm": 1.2653327623743533,
"learning_rate": 3.254731357116597e-06,
"loss": 0.029,
"step": 600
},
{
"epoch": 1.2269938650306749,
"eval_loss": 0.07708186656236649,
"eval_runtime": 1.5947,
"eval_samples_per_second": 25.083,
"eval_steps_per_second": 6.271,
"step": 600
},
{
"epoch": 1.229038854805726,
"grad_norm": 1.775392827404977,
"learning_rate": 3.2396892882091678e-06,
"loss": 0.0379,
"step": 601
},
{
"epoch": 1.231083844580777,
"grad_norm": 1.3830173529601073,
"learning_rate": 3.2246653832808674e-06,
"loss": 0.0288,
"step": 602
},
{
"epoch": 1.2331288343558282,
"grad_norm": 2.1155311480126264,
"learning_rate": 3.209659797357669e-06,
"loss": 0.0615,
"step": 603
},
{
"epoch": 1.2351738241308794,
"grad_norm": 2.4445073717181662,
"learning_rate": 3.1946726852765325e-06,
"loss": 0.0635,
"step": 604
},
{
"epoch": 1.2372188139059306,
"grad_norm": 2.110428549540264,
"learning_rate": 3.179704201683786e-06,
"loss": 0.0364,
"step": 605
},
{
"epoch": 1.2392638036809815,
"grad_norm": 1.7469628121389942,
"learning_rate": 3.16475450103354e-06,
"loss": 0.0372,
"step": 606
},
{
"epoch": 1.2413087934560327,
"grad_norm": 0.9152243306182197,
"learning_rate": 3.149823737586089e-06,
"loss": 0.0161,
"step": 607
},
{
"epoch": 1.243353783231084,
"grad_norm": 1.3056210800799668,
"learning_rate": 3.1349120654063224e-06,
"loss": 0.0266,
"step": 608
},
{
"epoch": 1.2453987730061349,
"grad_norm": 1.2879845659396767,
"learning_rate": 3.1200196383621363e-06,
"loss": 0.0274,
"step": 609
},
{
"epoch": 1.247443762781186,
"grad_norm": 1.382752868609299,
"learning_rate": 3.105146610122839e-06,
"loss": 0.0303,
"step": 610
},
{
"epoch": 1.2494887525562373,
"grad_norm": 1.433137069436945,
"learning_rate": 3.090293134157572e-06,
"loss": 0.0259,
"step": 611
},
{
"epoch": 1.2515337423312882,
"grad_norm": 1.2272571061501605,
"learning_rate": 3.0754593637337276e-06,
"loss": 0.0305,
"step": 612
},
{
"epoch": 1.2535787321063394,
"grad_norm": 1.8924881887997287,
"learning_rate": 3.0606454519153608e-06,
"loss": 0.0478,
"step": 613
},
{
"epoch": 1.2556237218813906,
"grad_norm": 1.5252407085003916,
"learning_rate": 3.0458515515616117e-06,
"loss": 0.0382,
"step": 614
},
{
"epoch": 1.2576687116564418,
"grad_norm": 1.2566622827157716,
"learning_rate": 3.0310778153251325e-06,
"loss": 0.0265,
"step": 615
},
{
"epoch": 1.259713701431493,
"grad_norm": 1.5092416262198107,
"learning_rate": 3.0163243956505093e-06,
"loss": 0.0313,
"step": 616
},
{
"epoch": 1.261758691206544,
"grad_norm": 2.0141840286367083,
"learning_rate": 3.001591444772687e-06,
"loss": 0.0373,
"step": 617
},
{
"epoch": 1.2638036809815951,
"grad_norm": 1.0802484572404294,
"learning_rate": 2.986879114715403e-06,
"loss": 0.0266,
"step": 618
},
{
"epoch": 1.2658486707566463,
"grad_norm": 1.359876254126494,
"learning_rate": 2.972187557289616e-06,
"loss": 0.0305,
"step": 619
},
{
"epoch": 1.2678936605316973,
"grad_norm": 1.3671926484976908,
"learning_rate": 2.95751692409194e-06,
"loss": 0.0296,
"step": 620
},
{
"epoch": 1.2699386503067485,
"grad_norm": 1.553369266205434,
"learning_rate": 2.9428673665030772e-06,
"loss": 0.0352,
"step": 621
},
{
"epoch": 1.2719836400817996,
"grad_norm": 1.9647938781064505,
"learning_rate": 2.9282390356862606e-06,
"loss": 0.0414,
"step": 622
},
{
"epoch": 1.2740286298568506,
"grad_norm": 1.7057696677799985,
"learning_rate": 2.9136320825856967e-06,
"loss": 0.0364,
"step": 623
},
{
"epoch": 1.2760736196319018,
"grad_norm": 1.6026279841764746,
"learning_rate": 2.899046657924992e-06,
"loss": 0.0411,
"step": 624
},
{
"epoch": 1.278118609406953,
"grad_norm": 1.5405330605244225,
"learning_rate": 2.884482912205621e-06,
"loss": 0.0358,
"step": 625
},
{
"epoch": 1.280163599182004,
"grad_norm": 1.4374105894350884,
"learning_rate": 2.8699409957053535e-06,
"loss": 0.0267,
"step": 626
},
{
"epoch": 1.2822085889570551,
"grad_norm": 1.4661224711168332,
"learning_rate": 2.8554210584767188e-06,
"loss": 0.0271,
"step": 627
},
{
"epoch": 1.2842535787321063,
"grad_norm": 2.5207979346685963,
"learning_rate": 2.840923250345442e-06,
"loss": 0.0481,
"step": 628
},
{
"epoch": 1.2862985685071575,
"grad_norm": 1.617876668968538,
"learning_rate": 2.8264477209089147e-06,
"loss": 0.0369,
"step": 629
},
{
"epoch": 1.2883435582822087,
"grad_norm": 1.808249674605706,
"learning_rate": 2.8119946195346375e-06,
"loss": 0.0391,
"step": 630
},
{
"epoch": 1.2903885480572597,
"grad_norm": 1.3734714216015984,
"learning_rate": 2.7975640953586846e-06,
"loss": 0.0294,
"step": 631
},
{
"epoch": 1.2924335378323109,
"grad_norm": 1.391392647849848,
"learning_rate": 2.78315629728417e-06,
"loss": 0.0304,
"step": 632
},
{
"epoch": 1.294478527607362,
"grad_norm": 1.4706567091432965,
"learning_rate": 2.7687713739796972e-06,
"loss": 0.0302,
"step": 633
},
{
"epoch": 1.296523517382413,
"grad_norm": 1.3791514521388877,
"learning_rate": 2.7544094738778436e-06,
"loss": 0.0338,
"step": 634
},
{
"epoch": 1.2985685071574642,
"grad_norm": 1.9205903741411616,
"learning_rate": 2.7400707451736103e-06,
"loss": 0.0352,
"step": 635
},
{
"epoch": 1.3006134969325154,
"grad_norm": 1.2860131091648965,
"learning_rate": 2.725755335822903e-06,
"loss": 0.0305,
"step": 636
},
{
"epoch": 1.3026584867075663,
"grad_norm": 2.0441358593719396,
"learning_rate": 2.7114633935410083e-06,
"loss": 0.0381,
"step": 637
},
{
"epoch": 1.3047034764826175,
"grad_norm": 1.606441709854988,
"learning_rate": 2.6971950658010666e-06,
"loss": 0.0343,
"step": 638
},
{
"epoch": 1.3067484662576687,
"grad_norm": 1.1232676032960067,
"learning_rate": 2.6829504998325352e-06,
"loss": 0.0223,
"step": 639
},
{
"epoch": 1.30879345603272,
"grad_norm": 2.0695929953679353,
"learning_rate": 2.6687298426196974e-06,
"loss": 0.0437,
"step": 640
},
{
"epoch": 1.310838445807771,
"grad_norm": 1.2094015200347492,
"learning_rate": 2.6545332409001267e-06,
"loss": 0.0251,
"step": 641
},
{
"epoch": 1.312883435582822,
"grad_norm": 1.3109256389089359,
"learning_rate": 2.6403608411631744e-06,
"loss": 0.0319,
"step": 642
},
{
"epoch": 1.3149284253578732,
"grad_norm": 1.3994815087983556,
"learning_rate": 2.62621278964846e-06,
"loss": 0.0281,
"step": 643
},
{
"epoch": 1.3169734151329244,
"grad_norm": 1.3379418428305758,
"learning_rate": 2.612089232344371e-06,
"loss": 0.0301,
"step": 644
},
{
"epoch": 1.3190184049079754,
"grad_norm": 1.2825425624367577,
"learning_rate": 2.5979903149865386e-06,
"loss": 0.016,
"step": 645
},
{
"epoch": 1.3210633946830266,
"grad_norm": 1.1904405038306072,
"learning_rate": 2.5839161830563475e-06,
"loss": 0.0282,
"step": 646
},
{
"epoch": 1.3231083844580778,
"grad_norm": 1.4411097446473085,
"learning_rate": 2.569866981779433e-06,
"loss": 0.0312,
"step": 647
},
{
"epoch": 1.3251533742331287,
"grad_norm": 1.5429385898328931,
"learning_rate": 2.555842856124182e-06,
"loss": 0.0288,
"step": 648
},
{
"epoch": 1.32719836400818,
"grad_norm": 1.6647644695086803,
"learning_rate": 2.541843950800226e-06,
"loss": 0.0345,
"step": 649
},
{
"epoch": 1.329243353783231,
"grad_norm": 1.6594984133499624,
"learning_rate": 2.527870410256966e-06,
"loss": 0.0355,
"step": 650
},
{
"epoch": 1.331288343558282,
"grad_norm": 1.4712049332054007,
"learning_rate": 2.513922378682075e-06,
"loss": 0.0326,
"step": 651
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2.058004614790052,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0497,
"step": 652
},
{
"epoch": 1.3353783231083844,
"grad_norm": 1.63215913934179,
"learning_rate": 2.486103417870493e-06,
"loss": 0.0407,
"step": 653
},
{
"epoch": 1.3374233128834356,
"grad_norm": 1.4235640655071966,
"learning_rate": 2.472232775687119e-06,
"loss": 0.0256,
"step": 654
},
{
"epoch": 1.3394683026584868,
"grad_norm": 1.4966832324121504,
"learning_rate": 2.4583882165757766e-06,
"loss": 0.0341,
"step": 655
},
{
"epoch": 1.3415132924335378,
"grad_norm": 1.579776697797337,
"learning_rate": 2.4445698833932236e-06,
"loss": 0.0318,
"step": 656
},
{
"epoch": 1.343558282208589,
"grad_norm": 1.5153710401247442,
"learning_rate": 2.4307779187256064e-06,
"loss": 0.041,
"step": 657
},
{
"epoch": 1.3456032719836402,
"grad_norm": 2.3410319215359343,
"learning_rate": 2.417012464886978e-06,
"loss": 0.0493,
"step": 658
},
{
"epoch": 1.3476482617586911,
"grad_norm": 1.7830705068876032,
"learning_rate": 2.4032736639178443e-06,
"loss": 0.038,
"step": 659
},
{
"epoch": 1.3496932515337423,
"grad_norm": 1.401656109282602,
"learning_rate": 2.389561657583681e-06,
"loss": 0.0314,
"step": 660
},
{
"epoch": 1.3517382413087935,
"grad_norm": 1.6005336710481401,
"learning_rate": 2.3758765873734897e-06,
"loss": 0.0339,
"step": 661
},
{
"epoch": 1.3537832310838445,
"grad_norm": 1.4717588678443954,
"learning_rate": 2.3622185944983187e-06,
"loss": 0.024,
"step": 662
},
{
"epoch": 1.3558282208588956,
"grad_norm": 1.4073986715417073,
"learning_rate": 2.3485878198898253e-06,
"loss": 0.0314,
"step": 663
},
{
"epoch": 1.3578732106339468,
"grad_norm": 1.1155919297375605,
"learning_rate": 2.3349844041988044e-06,
"loss": 0.0238,
"step": 664
},
{
"epoch": 1.359918200408998,
"grad_norm": 1.4447238413299701,
"learning_rate": 2.3214084877937464e-06,
"loss": 0.024,
"step": 665
},
{
"epoch": 1.3619631901840492,
"grad_norm": 1.4727637644326748,
"learning_rate": 2.30786021075939e-06,
"loss": 0.0352,
"step": 666
},
{
"epoch": 1.3640081799591002,
"grad_norm": 1.0917991783905898,
"learning_rate": 2.294339712895271e-06,
"loss": 0.02,
"step": 667
},
{
"epoch": 1.3660531697341514,
"grad_norm": 1.5740943427206295,
"learning_rate": 2.28084713371428e-06,
"loss": 0.0323,
"step": 668
},
{
"epoch": 1.3680981595092025,
"grad_norm": 1.4720527439260216,
"learning_rate": 2.2673826124412314e-06,
"loss": 0.0286,
"step": 669
},
{
"epoch": 1.3701431492842535,
"grad_norm": 1.4833939892417702,
"learning_rate": 2.253946288011419e-06,
"loss": 0.0342,
"step": 670
},
{
"epoch": 1.3721881390593047,
"grad_norm": 1.6876515961228076,
"learning_rate": 2.240538299069178e-06,
"loss": 0.0311,
"step": 671
},
{
"epoch": 1.3742331288343559,
"grad_norm": 2.1720167724269874,
"learning_rate": 2.2271587839664673e-06,
"loss": 0.0381,
"step": 672
},
{
"epoch": 1.3762781186094069,
"grad_norm": 1.5126928906252048,
"learning_rate": 2.213807880761434e-06,
"loss": 0.0332,
"step": 673
},
{
"epoch": 1.378323108384458,
"grad_norm": 1.6737538431685655,
"learning_rate": 2.2004857272169878e-06,
"loss": 0.0345,
"step": 674
},
{
"epoch": 1.3803680981595092,
"grad_norm": 1.426935375770983,
"learning_rate": 2.18719246079938e-06,
"loss": 0.0398,
"step": 675
},
{
"epoch": 1.3824130879345602,
"grad_norm": 1.4051149662672344,
"learning_rate": 2.173928218676792e-06,
"loss": 0.0232,
"step": 676
},
{
"epoch": 1.3844580777096114,
"grad_norm": 1.7917331547528335,
"learning_rate": 2.160693137717912e-06,
"loss": 0.0368,
"step": 677
},
{
"epoch": 1.3865030674846626,
"grad_norm": 1.8111522355910634,
"learning_rate": 2.1474873544905204e-06,
"loss": 0.0269,
"step": 678
},
{
"epoch": 1.3885480572597138,
"grad_norm": 1.6693031730647383,
"learning_rate": 2.134311005260093e-06,
"loss": 0.0362,
"step": 679
},
{
"epoch": 1.390593047034765,
"grad_norm": 1.4202013946415086,
"learning_rate": 2.121164225988387e-06,
"loss": 0.0298,
"step": 680
},
{
"epoch": 1.392638036809816,
"grad_norm": 1.3927664117864682,
"learning_rate": 2.108047152332028e-06,
"loss": 0.026,
"step": 681
},
{
"epoch": 1.394683026584867,
"grad_norm": 1.405359317118805,
"learning_rate": 2.0949599196411326e-06,
"loss": 0.0312,
"step": 682
},
{
"epoch": 1.3967280163599183,
"grad_norm": 1.2371988179013782,
"learning_rate": 2.081902662957895e-06,
"loss": 0.0214,
"step": 683
},
{
"epoch": 1.3987730061349692,
"grad_norm": 2.047236610352014,
"learning_rate": 2.0688755170152e-06,
"loss": 0.0421,
"step": 684
},
{
"epoch": 1.4008179959100204,
"grad_norm": 1.2055104899996096,
"learning_rate": 2.0558786162352245e-06,
"loss": 0.0218,
"step": 685
},
{
"epoch": 1.4028629856850716,
"grad_norm": 1.2042481090348163,
"learning_rate": 2.042912094728068e-06,
"loss": 0.0232,
"step": 686
},
{
"epoch": 1.4049079754601226,
"grad_norm": 1.965874166063246,
"learning_rate": 2.029976086290347e-06,
"loss": 0.0422,
"step": 687
},
{
"epoch": 1.4069529652351738,
"grad_norm": 1.7221753979316607,
"learning_rate": 2.017070724403835e-06,
"loss": 0.0315,
"step": 688
},
{
"epoch": 1.408997955010225,
"grad_norm": 1.319102902846999,
"learning_rate": 2.004196142234068e-06,
"loss": 0.0315,
"step": 689
},
{
"epoch": 1.4110429447852761,
"grad_norm": 0.9513064566229582,
"learning_rate": 1.9913524726289784e-06,
"loss": 0.0168,
"step": 690
},
{
"epoch": 1.4130879345603273,
"grad_norm": 1.9447952357011042,
"learning_rate": 1.9785398481175295e-06,
"loss": 0.0413,
"step": 691
},
{
"epoch": 1.4151329243353783,
"grad_norm": 1.5286895548644743,
"learning_rate": 1.965758400908334e-06,
"loss": 0.0274,
"step": 692
},
{
"epoch": 1.4171779141104295,
"grad_norm": 1.1539526277463092,
"learning_rate": 1.9530082628883058e-06,
"loss": 0.0239,
"step": 693
},
{
"epoch": 1.4192229038854807,
"grad_norm": 1.6908331934705023,
"learning_rate": 1.9402895656212834e-06,
"loss": 0.0342,
"step": 694
},
{
"epoch": 1.4212678936605316,
"grad_norm": 2.2914630227874886,
"learning_rate": 1.927602440346687e-06,
"loss": 0.0414,
"step": 695
},
{
"epoch": 1.4233128834355828,
"grad_norm": 1.4300685831945064,
"learning_rate": 1.914947017978153e-06,
"loss": 0.0272,
"step": 696
},
{
"epoch": 1.425357873210634,
"grad_norm": 1.119854466298958,
"learning_rate": 1.9023234291021875e-06,
"loss": 0.0237,
"step": 697
},
{
"epoch": 1.427402862985685,
"grad_norm": 2.157933270356651,
"learning_rate": 1.889731803976822e-06,
"loss": 0.0365,
"step": 698
},
{
"epoch": 1.4294478527607362,
"grad_norm": 2.1495827518419017,
"learning_rate": 1.8771722725302644e-06,
"loss": 0.0421,
"step": 699
},
{
"epoch": 1.4314928425357873,
"grad_norm": 1.4403502460755069,
"learning_rate": 1.8646449643595565e-06,
"loss": 0.0256,
"step": 700
},
{
"epoch": 1.4335378323108383,
"grad_norm": 1.612284239493657,
"learning_rate": 1.8521500087292466e-06,
"loss": 0.0314,
"step": 701
},
{
"epoch": 1.4355828220858895,
"grad_norm": 1.1017923126212417,
"learning_rate": 1.8396875345700498e-06,
"loss": 0.022,
"step": 702
},
{
"epoch": 1.4376278118609407,
"grad_norm": 1.6446468659290325,
"learning_rate": 1.8272576704775074e-06,
"loss": 0.0416,
"step": 703
},
{
"epoch": 1.4396728016359919,
"grad_norm": 1.3298795930204095,
"learning_rate": 1.81486054471068e-06,
"loss": 0.0269,
"step": 704
},
{
"epoch": 1.441717791411043,
"grad_norm": 1.0537263463371598,
"learning_rate": 1.8024962851908106e-06,
"loss": 0.022,
"step": 705
},
{
"epoch": 1.443762781186094,
"grad_norm": 1.569014799305421,
"learning_rate": 1.790165019500007e-06,
"loss": 0.027,
"step": 706
},
{
"epoch": 1.4458077709611452,
"grad_norm": 1.1169910353575982,
"learning_rate": 1.7778668748799244e-06,
"loss": 0.0214,
"step": 707
},
{
"epoch": 1.4478527607361964,
"grad_norm": 1.4170681283218884,
"learning_rate": 1.7656019782304602e-06,
"loss": 0.0241,
"step": 708
},
{
"epoch": 1.4498977505112474,
"grad_norm": 1.548818986804255,
"learning_rate": 1.7533704561084331e-06,
"loss": 0.0362,
"step": 709
},
{
"epoch": 1.4519427402862985,
"grad_norm": 2.0680503202924028,
"learning_rate": 1.7411724347262826e-06,
"loss": 0.0431,
"step": 710
},
{
"epoch": 1.4539877300613497,
"grad_norm": 1.4173336455080414,
"learning_rate": 1.729008039950772e-06,
"loss": 0.0279,
"step": 711
},
{
"epoch": 1.4560327198364007,
"grad_norm": 1.7820106072819453,
"learning_rate": 1.7168773973016779e-06,
"loss": 0.0353,
"step": 712
},
{
"epoch": 1.4580777096114519,
"grad_norm": 1.3988149854171141,
"learning_rate": 1.7047806319505079e-06,
"loss": 0.0271,
"step": 713
},
{
"epoch": 1.460122699386503,
"grad_norm": 1.390929649329335,
"learning_rate": 1.6927178687191953e-06,
"loss": 0.0256,
"step": 714
},
{
"epoch": 1.4621676891615543,
"grad_norm": 1.5277454496972025,
"learning_rate": 1.680689232078827e-06,
"loss": 0.0312,
"step": 715
},
{
"epoch": 1.4642126789366054,
"grad_norm": 1.9787662527459544,
"learning_rate": 1.6686948461483432e-06,
"loss": 0.0297,
"step": 716
},
{
"epoch": 1.4662576687116564,
"grad_norm": 1.3331939153009726,
"learning_rate": 1.656734834693266e-06,
"loss": 0.0269,
"step": 717
},
{
"epoch": 1.4683026584867076,
"grad_norm": 2.1133774298806784,
"learning_rate": 1.6448093211244232e-06,
"loss": 0.048,
"step": 718
},
{
"epoch": 1.4703476482617588,
"grad_norm": 1.9547321525275854,
"learning_rate": 1.6329184284966675e-06,
"loss": 0.0428,
"step": 719
},
{
"epoch": 1.4723926380368098,
"grad_norm": 1.5090987203091184,
"learning_rate": 1.621062279507617e-06,
"loss": 0.0305,
"step": 720
},
{
"epoch": 1.474437627811861,
"grad_norm": 0.9195367104860552,
"learning_rate": 1.6092409964963779e-06,
"loss": 0.0189,
"step": 721
},
{
"epoch": 1.4764826175869121,
"grad_norm": 1.9963374669225287,
"learning_rate": 1.597454701442288e-06,
"loss": 0.0385,
"step": 722
},
{
"epoch": 1.478527607361963,
"grad_norm": 1.5113737285476996,
"learning_rate": 1.5857035159636625e-06,
"loss": 0.033,
"step": 723
},
{
"epoch": 1.4805725971370143,
"grad_norm": 2.004845056871369,
"learning_rate": 1.5739875613165283e-06,
"loss": 0.0339,
"step": 724
},
{
"epoch": 1.4826175869120655,
"grad_norm": 0.9984169610067929,
"learning_rate": 1.5623069583933836e-06,
"loss": 0.02,
"step": 725
},
{
"epoch": 1.4846625766871164,
"grad_norm": 1.6259268058261294,
"learning_rate": 1.550661827721941e-06,
"loss": 0.0273,
"step": 726
},
{
"epoch": 1.4867075664621676,
"grad_norm": 1.6297643950263438,
"learning_rate": 1.5390522894638937e-06,
"loss": 0.028,
"step": 727
},
{
"epoch": 1.4887525562372188,
"grad_norm": 1.53638106009823,
"learning_rate": 1.5274784634136658e-06,
"loss": 0.0293,
"step": 728
},
{
"epoch": 1.49079754601227,
"grad_norm": 1.268974698538747,
"learning_rate": 1.5159404689971797e-06,
"loss": 0.0248,
"step": 729
},
{
"epoch": 1.4928425357873212,
"grad_norm": 1.427953166829002,
"learning_rate": 1.5044384252706312e-06,
"loss": 0.025,
"step": 730
},
{
"epoch": 1.4948875255623721,
"grad_norm": 1.0778297960602063,
"learning_rate": 1.492972450919249e-06,
"loss": 0.0196,
"step": 731
},
{
"epoch": 1.4969325153374233,
"grad_norm": 1.6048151777257864,
"learning_rate": 1.4815426642560753e-06,
"loss": 0.0254,
"step": 732
},
{
"epoch": 1.4989775051124745,
"grad_norm": 1.3837639161000226,
"learning_rate": 1.4701491832207481e-06,
"loss": 0.0234,
"step": 733
},
{
"epoch": 1.5010224948875255,
"grad_norm": 1.6210880071717662,
"learning_rate": 1.458792125378285e-06,
"loss": 0.0279,
"step": 734
},
{
"epoch": 1.5030674846625767,
"grad_norm": 1.6055051497727444,
"learning_rate": 1.4474716079178541e-06,
"loss": 0.047,
"step": 735
},
{
"epoch": 1.5051124744376279,
"grad_norm": 1.4164487131203813,
"learning_rate": 1.436187747651589e-06,
"loss": 0.0294,
"step": 736
},
{
"epoch": 1.5071574642126788,
"grad_norm": 1.404797134072682,
"learning_rate": 1.4249406610133686e-06,
"loss": 0.0333,
"step": 737
},
{
"epoch": 1.50920245398773,
"grad_norm": 1.5568137049723834,
"learning_rate": 1.4137304640576161e-06,
"loss": 0.0261,
"step": 738
},
{
"epoch": 1.5112474437627812,
"grad_norm": 1.4289478333095673,
"learning_rate": 1.4025572724581037e-06,
"loss": 0.0261,
"step": 739
},
{
"epoch": 1.5132924335378322,
"grad_norm": 2.5332634796920264,
"learning_rate": 1.3914212015067653e-06,
"loss": 0.0444,
"step": 740
},
{
"epoch": 1.5153374233128836,
"grad_norm": 1.788966871785357,
"learning_rate": 1.3803223661124938e-06,
"loss": 0.0283,
"step": 741
},
{
"epoch": 1.5173824130879345,
"grad_norm": 1.450672721983178,
"learning_rate": 1.3692608807999652e-06,
"loss": 0.0362,
"step": 742
},
{
"epoch": 1.5194274028629857,
"grad_norm": 1.2779026779976663,
"learning_rate": 1.3582368597084566e-06,
"loss": 0.0259,
"step": 743
},
{
"epoch": 1.521472392638037,
"grad_norm": 1.181583768603287,
"learning_rate": 1.3472504165906614e-06,
"loss": 0.0189,
"step": 744
},
{
"epoch": 1.5235173824130879,
"grad_norm": 0.9817943493019303,
"learning_rate": 1.3363016648115246e-06,
"loss": 0.0184,
"step": 745
},
{
"epoch": 1.525562372188139,
"grad_norm": 1.270037833596693,
"learning_rate": 1.325390717347065e-06,
"loss": 0.0268,
"step": 746
},
{
"epoch": 1.5276073619631902,
"grad_norm": 1.3472246238651557,
"learning_rate": 1.3145176867832165e-06,
"loss": 0.0262,
"step": 747
},
{
"epoch": 1.5296523517382412,
"grad_norm": 1.4783552939397928,
"learning_rate": 1.3036826853146601e-06,
"loss": 0.0256,
"step": 748
},
{
"epoch": 1.5316973415132924,
"grad_norm": 1.5785020479524052,
"learning_rate": 1.2928858247436672e-06,
"loss": 0.0303,
"step": 749
},
{
"epoch": 1.5337423312883436,
"grad_norm": 0.9545819980628849,
"learning_rate": 1.2821272164789544e-06,
"loss": 0.0154,
"step": 750
},
{
"epoch": 1.5357873210633946,
"grad_norm": 1.7853036227571542,
"learning_rate": 1.2714069715345195e-06,
"loss": 0.0366,
"step": 751
},
{
"epoch": 1.537832310838446,
"grad_norm": 1.2881320204863016,
"learning_rate": 1.2607252005285109e-06,
"loss": 0.0271,
"step": 752
},
{
"epoch": 1.539877300613497,
"grad_norm": 1.8402584593837081,
"learning_rate": 1.2500820136820735e-06,
"loss": 0.0397,
"step": 753
},
{
"epoch": 1.5419222903885481,
"grad_norm": 0.9104264280152901,
"learning_rate": 1.2394775208182175e-06,
"loss": 0.0185,
"step": 754
},
{
"epoch": 1.5439672801635993,
"grad_norm": 1.6576714713446372,
"learning_rate": 1.2289118313606895e-06,
"loss": 0.0329,
"step": 755
},
{
"epoch": 1.5460122699386503,
"grad_norm": 1.516510626114462,
"learning_rate": 1.2183850543328313e-06,
"loss": 0.029,
"step": 756
},
{
"epoch": 1.5480572597137015,
"grad_norm": 1.7170915167008158,
"learning_rate": 1.2078972983564686e-06,
"loss": 0.0281,
"step": 757
},
{
"epoch": 1.5501022494887526,
"grad_norm": 1.572147913277003,
"learning_rate": 1.1974486716507782e-06,
"loss": 0.0275,
"step": 758
},
{
"epoch": 1.5521472392638036,
"grad_norm": 1.6917430084108376,
"learning_rate": 1.187039282031182e-06,
"loss": 0.0357,
"step": 759
},
{
"epoch": 1.5541922290388548,
"grad_norm": 1.5988116947928293,
"learning_rate": 1.1766692369082255e-06,
"loss": 0.037,
"step": 760
},
{
"epoch": 1.556237218813906,
"grad_norm": 1.5739169249494382,
"learning_rate": 1.1663386432864725e-06,
"loss": 0.0323,
"step": 761
},
{
"epoch": 1.558282208588957,
"grad_norm": 0.8239355040656156,
"learning_rate": 1.156047607763407e-06,
"loss": 0.0153,
"step": 762
},
{
"epoch": 1.5603271983640081,
"grad_norm": 1.4324066370868447,
"learning_rate": 1.145796236528322e-06,
"loss": 0.0281,
"step": 763
},
{
"epoch": 1.5623721881390593,
"grad_norm": 1.167864770241578,
"learning_rate": 1.135584635361232e-06,
"loss": 0.0206,
"step": 764
},
{
"epoch": 1.5644171779141103,
"grad_norm": 1.2252633383184313,
"learning_rate": 1.1254129096317807e-06,
"loss": 0.0219,
"step": 765
},
{
"epoch": 1.5664621676891617,
"grad_norm": 1.2772246245687098,
"learning_rate": 1.115281164298153e-06,
"loss": 0.0228,
"step": 766
},
{
"epoch": 1.5685071574642127,
"grad_norm": 1.1793575214560597,
"learning_rate": 1.1051895039059851e-06,
"loss": 0.0239,
"step": 767
},
{
"epoch": 1.5705521472392638,
"grad_norm": 1.3979051592502238,
"learning_rate": 1.095138032587298e-06,
"loss": 0.0284,
"step": 768
},
{
"epoch": 1.572597137014315,
"grad_norm": 1.1554168176295245,
"learning_rate": 1.0851268540594168e-06,
"loss": 0.0233,
"step": 769
},
{
"epoch": 1.574642126789366,
"grad_norm": 1.1645512388718606,
"learning_rate": 1.0751560716238968e-06,
"loss": 0.0229,
"step": 770
},
{
"epoch": 1.5766871165644172,
"grad_norm": 1.7131522059742506,
"learning_rate": 1.0652257881654625e-06,
"loss": 0.0406,
"step": 771
},
{
"epoch": 1.5787321063394684,
"grad_norm": 1.2606812526165108,
"learning_rate": 1.0553361061509482e-06,
"loss": 0.0235,
"step": 772
},
{
"epoch": 1.5807770961145193,
"grad_norm": 1.1957626319021837,
"learning_rate": 1.0454871276282335e-06,
"loss": 0.0254,
"step": 773
},
{
"epoch": 1.5828220858895705,
"grad_norm": 1.221410722093273,
"learning_rate": 1.0356789542251939e-06,
"loss": 0.0285,
"step": 774
},
{
"epoch": 1.5848670756646217,
"grad_norm": 1.4005487946112367,
"learning_rate": 1.0259116871486557e-06,
"loss": 0.0237,
"step": 775
},
{
"epoch": 1.5869120654396727,
"grad_norm": 1.363179363127451,
"learning_rate": 1.0161854271833444e-06,
"loss": 0.023,
"step": 776
},
{
"epoch": 1.588957055214724,
"grad_norm": 1.3303699717121924,
"learning_rate": 1.0065002746908532e-06,
"loss": 0.0219,
"step": 777
},
{
"epoch": 1.591002044989775,
"grad_norm": 1.4319116309801472,
"learning_rate": 9.96856329608597e-07,
"loss": 0.031,
"step": 778
},
{
"epoch": 1.5930470347648262,
"grad_norm": 1.1984953249513992,
"learning_rate": 9.87253691448794e-07,
"loss": 0.0245,
"step": 779
},
{
"epoch": 1.5950920245398774,
"grad_norm": 1.2215565328948168,
"learning_rate": 9.776924592974257e-07,
"loss": 0.0248,
"step": 780
},
{
"epoch": 1.5971370143149284,
"grad_norm": 1.4160156872424536,
"learning_rate": 9.681727318132228e-07,
"loss": 0.0242,
"step": 781
},
{
"epoch": 1.5991820040899796,
"grad_norm": 1.1174710591294479,
"learning_rate": 9.586946072266479e-07,
"loss": 0.0191,
"step": 782
},
{
"epoch": 1.6012269938650308,
"grad_norm": 1.0676003932307012,
"learning_rate": 9.492581833388736e-07,
"loss": 0.0188,
"step": 783
},
{
"epoch": 1.6032719836400817,
"grad_norm": 1.0900550444215484,
"learning_rate": 9.398635575207854e-07,
"loss": 0.0218,
"step": 784
},
{
"epoch": 1.605316973415133,
"grad_norm": 1.2361313996180479,
"learning_rate": 9.305108267119645e-07,
"loss": 0.0207,
"step": 785
},
{
"epoch": 1.607361963190184,
"grad_norm": 1.218779379666619,
"learning_rate": 9.212000874196953e-07,
"loss": 0.0226,
"step": 786
},
{
"epoch": 1.609406952965235,
"grad_norm": 1.5316948706786864,
"learning_rate": 9.119314357179687e-07,
"loss": 0.0263,
"step": 787
},
{
"epoch": 1.6114519427402862,
"grad_norm": 1.3658846792851305,
"learning_rate": 9.027049672464916e-07,
"loss": 0.0207,
"step": 788
},
{
"epoch": 1.6134969325153374,
"grad_norm": 2.4597956315625455,
"learning_rate": 8.935207772096904e-07,
"loss": 0.0254,
"step": 789
},
{
"epoch": 1.6155419222903884,
"grad_norm": 1.3358397828434039,
"learning_rate": 8.843789603757446e-07,
"loss": 0.0265,
"step": 790
},
{
"epoch": 1.6175869120654398,
"grad_norm": 1.2481079015069951,
"learning_rate": 8.752796110755985e-07,
"loss": 0.02,
"step": 791
},
{
"epoch": 1.6196319018404908,
"grad_norm": 0.9661429436209987,
"learning_rate": 8.662228232019876e-07,
"loss": 0.0166,
"step": 792
},
{
"epoch": 1.621676891615542,
"grad_norm": 1.7556913252148523,
"learning_rate": 8.572086902084731e-07,
"loss": 0.0341,
"step": 793
},
{
"epoch": 1.6237218813905931,
"grad_norm": 1.418921330732568,
"learning_rate": 8.482373051084791e-07,
"loss": 0.0283,
"step": 794
},
{
"epoch": 1.6257668711656441,
"grad_norm": 2.369535130694504,
"learning_rate": 8.393087604743283e-07,
"loss": 0.0445,
"step": 795
},
{
"epoch": 1.6278118609406953,
"grad_norm": 1.6601126609364323,
"learning_rate": 8.304231484362868e-07,
"loss": 0.0293,
"step": 796
},
{
"epoch": 1.6298568507157465,
"grad_norm": 1.2796195343972467,
"learning_rate": 8.215805606816191e-07,
"loss": 0.0199,
"step": 797
},
{
"epoch": 1.6319018404907975,
"grad_norm": 1.207648315269951,
"learning_rate": 8.127810884536402e-07,
"loss": 0.0181,
"step": 798
},
{
"epoch": 1.6339468302658486,
"grad_norm": 2.1150186432662728,
"learning_rate": 8.040248225507641e-07,
"loss": 0.0473,
"step": 799
},
{
"epoch": 1.6359918200408998,
"grad_norm": 1.4200026666542498,
"learning_rate": 7.953118533255821e-07,
"loss": 0.0247,
"step": 800
},
{
"epoch": 1.6359918200408998,
"eval_loss": 0.07060948759317398,
"eval_runtime": 1.5943,
"eval_samples_per_second": 25.09,
"eval_steps_per_second": 6.272,
"step": 800
},
{
"epoch": 1.6380368098159508,
"grad_norm": 1.5772837122475736,
"learning_rate": 7.866422706839239e-07,
"loss": 0.0264,
"step": 801
},
{
"epoch": 1.6400817995910022,
"grad_norm": 1.1550918911272414,
"learning_rate": 7.780161640839257e-07,
"loss": 0.0224,
"step": 802
},
{
"epoch": 1.6421267893660532,
"grad_norm": 1.4676067465705516,
"learning_rate": 7.694336225351107e-07,
"loss": 0.0237,
"step": 803
},
{
"epoch": 1.6441717791411041,
"grad_norm": 1.4993385397429064,
"learning_rate": 7.60894734597476e-07,
"loss": 0.0295,
"step": 804
},
{
"epoch": 1.6462167689161555,
"grad_norm": 1.2385669586685766,
"learning_rate": 7.52399588380568e-07,
"loss": 0.0243,
"step": 805
},
{
"epoch": 1.6482617586912065,
"grad_norm": 1.4635374861697166,
"learning_rate": 7.439482715425806e-07,
"loss": 0.0252,
"step": 806
},
{
"epoch": 1.6503067484662577,
"grad_norm": 1.2402570999087212,
"learning_rate": 7.355408712894508e-07,
"loss": 0.0211,
"step": 807
},
{
"epoch": 1.6523517382413089,
"grad_norm": 1.5520153711347568,
"learning_rate": 7.271774743739546e-07,
"loss": 0.0303,
"step": 808
},
{
"epoch": 1.6543967280163598,
"grad_norm": 1.2762250260415324,
"learning_rate": 7.18858167094817e-07,
"loss": 0.0242,
"step": 809
},
{
"epoch": 1.656441717791411,
"grad_norm": 1.4244259857298884,
"learning_rate": 7.105830352958143e-07,
"loss": 0.0278,
"step": 810
},
{
"epoch": 1.6584867075664622,
"grad_norm": 1.4760993572706773,
"learning_rate": 7.023521643648984e-07,
"loss": 0.0292,
"step": 811
},
{
"epoch": 1.6605316973415132,
"grad_norm": 1.3443460519107557,
"learning_rate": 6.941656392333046e-07,
"loss": 0.0232,
"step": 812
},
{
"epoch": 1.6625766871165644,
"grad_norm": 1.3709203745792065,
"learning_rate": 6.86023544374686e-07,
"loss": 0.027,
"step": 813
},
{
"epoch": 1.6646216768916156,
"grad_norm": 1.4289920722764744,
"learning_rate": 6.779259638042318e-07,
"loss": 0.0231,
"step": 814
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.2467075238350902,
"learning_rate": 6.698729810778065e-07,
"loss": 0.0288,
"step": 815
},
{
"epoch": 1.668711656441718,
"grad_norm": 1.5823026933811752,
"learning_rate": 6.618646792910893e-07,
"loss": 0.0326,
"step": 816
},
{
"epoch": 1.670756646216769,
"grad_norm": 1.5584280269321396,
"learning_rate": 6.539011410787105e-07,
"loss": 0.0262,
"step": 817
},
{
"epoch": 1.67280163599182,
"grad_norm": 1.1208057763458479,
"learning_rate": 6.459824486134015e-07,
"loss": 0.0212,
"step": 818
},
{
"epoch": 1.6748466257668713,
"grad_norm": 1.3862339324803945,
"learning_rate": 6.381086836051498e-07,
"loss": 0.0258,
"step": 819
},
{
"epoch": 1.6768916155419222,
"grad_norm": 1.1160447785511467,
"learning_rate": 6.302799273003546e-07,
"loss": 0.0166,
"step": 820
},
{
"epoch": 1.6789366053169734,
"grad_norm": 1.3240491165501231,
"learning_rate": 6.22496260480982e-07,
"loss": 0.0248,
"step": 821
},
{
"epoch": 1.6809815950920246,
"grad_norm": 1.338838004083599,
"learning_rate": 6.147577634637413e-07,
"loss": 0.0262,
"step": 822
},
{
"epoch": 1.6830265848670756,
"grad_norm": 1.3968985445629194,
"learning_rate": 6.070645160992523e-07,
"loss": 0.0281,
"step": 823
},
{
"epoch": 1.6850715746421268,
"grad_norm": 1.171408977887829,
"learning_rate": 5.994165977712175e-07,
"loss": 0.0213,
"step": 824
},
{
"epoch": 1.687116564417178,
"grad_norm": 1.3360283784514455,
"learning_rate": 5.918140873956063e-07,
"loss": 0.0203,
"step": 825
},
{
"epoch": 1.689161554192229,
"grad_norm": 1.2733261388021238,
"learning_rate": 5.842570634198453e-07,
"loss": 0.0193,
"step": 826
},
{
"epoch": 1.6912065439672803,
"grad_norm": 1.6784098486146612,
"learning_rate": 5.767456038219987e-07,
"loss": 0.0262,
"step": 827
},
{
"epoch": 1.6932515337423313,
"grad_norm": 1.0355585556125833,
"learning_rate": 5.692797861099719e-07,
"loss": 0.0215,
"step": 828
},
{
"epoch": 1.6952965235173822,
"grad_norm": 1.4014112675195356,
"learning_rate": 5.618596873207083e-07,
"loss": 0.0225,
"step": 829
},
{
"epoch": 1.6973415132924337,
"grad_norm": 1.6204759478058526,
"learning_rate": 5.544853840193981e-07,
"loss": 0.0283,
"step": 830
},
{
"epoch": 1.6993865030674846,
"grad_norm": 1.1175326576111029,
"learning_rate": 5.471569522986775e-07,
"loss": 0.0197,
"step": 831
},
{
"epoch": 1.7014314928425358,
"grad_norm": 1.5156333961192319,
"learning_rate": 5.398744677778595e-07,
"loss": 0.0286,
"step": 832
},
{
"epoch": 1.703476482617587,
"grad_norm": 1.3492765083670422,
"learning_rate": 5.326380056021419e-07,
"loss": 0.0259,
"step": 833
},
{
"epoch": 1.705521472392638,
"grad_norm": 1.911784218966074,
"learning_rate": 5.254476404418341e-07,
"loss": 0.036,
"step": 834
},
{
"epoch": 1.7075664621676891,
"grad_norm": 1.3456317179935473,
"learning_rate": 5.183034464915898e-07,
"loss": 0.0248,
"step": 835
},
{
"epoch": 1.7096114519427403,
"grad_norm": 1.3465884976486044,
"learning_rate": 5.112054974696395e-07,
"loss": 0.0214,
"step": 836
},
{
"epoch": 1.7116564417177913,
"grad_norm": 1.2682146752514654,
"learning_rate": 5.041538666170282e-07,
"loss": 0.0245,
"step": 837
},
{
"epoch": 1.7137014314928425,
"grad_norm": 1.0732597160929007,
"learning_rate": 4.971486266968634e-07,
"loss": 0.0248,
"step": 838
},
{
"epoch": 1.7157464212678937,
"grad_norm": 1.2390245442361538,
"learning_rate": 4.901898499935609e-07,
"loss": 0.022,
"step": 839
},
{
"epoch": 1.7177914110429446,
"grad_norm": 1.1298732472922557,
"learning_rate": 4.832776083120983e-07,
"loss": 0.019,
"step": 840
},
{
"epoch": 1.719836400817996,
"grad_norm": 1.2513860400146173,
"learning_rate": 4.764119729772809e-07,
"loss": 0.0254,
"step": 841
},
{
"epoch": 1.721881390593047,
"grad_norm": 1.5812395858247674,
"learning_rate": 4.695930148329958e-07,
"loss": 0.0303,
"step": 842
},
{
"epoch": 1.7239263803680982,
"grad_norm": 1.2260179416900976,
"learning_rate": 4.628208042414889e-07,
"loss": 0.0231,
"step": 843
},
{
"epoch": 1.7259713701431494,
"grad_norm": 0.9260246632190309,
"learning_rate": 4.5609541108263377e-07,
"loss": 0.0191,
"step": 844
},
{
"epoch": 1.7280163599182004,
"grad_norm": 1.8092568351032716,
"learning_rate": 4.494169047532154e-07,
"loss": 0.0377,
"step": 845
},
{
"epoch": 1.7300613496932515,
"grad_norm": 1.4342896955808682,
"learning_rate": 4.4278535416620914e-07,
"loss": 0.0296,
"step": 846
},
{
"epoch": 1.7321063394683027,
"grad_norm": 1.411079843320368,
"learning_rate": 4.362008277500701e-07,
"loss": 0.0252,
"step": 847
},
{
"epoch": 1.7341513292433537,
"grad_norm": 1.4065270120904347,
"learning_rate": 4.2966339344803376e-07,
"loss": 0.0236,
"step": 848
},
{
"epoch": 1.7361963190184049,
"grad_norm": 2.637324684778294,
"learning_rate": 4.231731187174065e-07,
"loss": 0.0406,
"step": 849
},
{
"epoch": 1.738241308793456,
"grad_norm": 1.5036834826794743,
"learning_rate": 4.167300705288718e-07,
"loss": 0.0238,
"step": 850
},
{
"epoch": 1.740286298568507,
"grad_norm": 1.7305730073425691,
"learning_rate": 4.10334315365804e-07,
"loss": 0.03,
"step": 851
},
{
"epoch": 1.7423312883435584,
"grad_norm": 1.3670965259099597,
"learning_rate": 4.0398591922357787e-07,
"loss": 0.0244,
"step": 852
},
{
"epoch": 1.7443762781186094,
"grad_norm": 1.4873125793549382,
"learning_rate": 3.9768494760888455e-07,
"loss": 0.0281,
"step": 853
},
{
"epoch": 1.7464212678936604,
"grad_norm": 1.3256819619759466,
"learning_rate": 3.914314655390633e-07,
"loss": 0.018,
"step": 854
},
{
"epoch": 1.7484662576687118,
"grad_norm": 1.0528899986433782,
"learning_rate": 3.852255375414271e-07,
"loss": 0.0185,
"step": 855
},
{
"epoch": 1.7505112474437627,
"grad_norm": 1.5167752108851527,
"learning_rate": 3.7906722765259364e-07,
"loss": 0.0285,
"step": 856
},
{
"epoch": 1.752556237218814,
"grad_norm": 1.2661873569980087,
"learning_rate": 3.7295659941782856e-07,
"loss": 0.0229,
"step": 857
},
{
"epoch": 1.7546012269938651,
"grad_norm": 1.2713073653615368,
"learning_rate": 3.6689371589039013e-07,
"loss": 0.022,
"step": 858
},
{
"epoch": 1.756646216768916,
"grad_norm": 1.410691086480624,
"learning_rate": 3.60878639630875e-07,
"loss": 0.0296,
"step": 859
},
{
"epoch": 1.7586912065439673,
"grad_norm": 0.9920426356145646,
"learning_rate": 3.5491143270657445e-07,
"loss": 0.015,
"step": 860
},
{
"epoch": 1.7607361963190185,
"grad_norm": 1.5216849169101498,
"learning_rate": 3.489921566908372e-07,
"loss": 0.0271,
"step": 861
},
{
"epoch": 1.7627811860940694,
"grad_norm": 1.4674709021434214,
"learning_rate": 3.4312087266242964e-07,
"loss": 0.0263,
"step": 862
},
{
"epoch": 1.7648261758691206,
"grad_norm": 1.7675475614023826,
"learning_rate": 3.3729764120490447e-07,
"loss": 0.0384,
"step": 863
},
{
"epoch": 1.7668711656441718,
"grad_norm": 1.4676888698930726,
"learning_rate": 3.315225224059809e-07,
"loss": 0.0301,
"step": 864
},
{
"epoch": 1.7689161554192228,
"grad_norm": 1.4800320849283661,
"learning_rate": 3.25795575856922e-07,
"loss": 0.0283,
"step": 865
},
{
"epoch": 1.7709611451942742,
"grad_norm": 1.6806826350105444,
"learning_rate": 3.2011686065191894e-07,
"loss": 0.0391,
"step": 866
},
{
"epoch": 1.7730061349693251,
"grad_norm": 1.3249873571873563,
"learning_rate": 3.1448643538748045e-07,
"loss": 0.0203,
"step": 867
},
{
"epoch": 1.7750511247443763,
"grad_norm": 1.8551891141720298,
"learning_rate": 3.0890435816183226e-07,
"loss": 0.0393,
"step": 868
},
{
"epoch": 1.7770961145194275,
"grad_norm": 1.2327805158687992,
"learning_rate": 3.03370686574313e-07,
"loss": 0.0236,
"step": 869
},
{
"epoch": 1.7791411042944785,
"grad_norm": 1.3314203527215986,
"learning_rate": 2.9788547772478416e-07,
"loss": 0.0235,
"step": 870
},
{
"epoch": 1.7811860940695297,
"grad_norm": 1.1861648902004243,
"learning_rate": 2.9244878821303556e-07,
"loss": 0.0154,
"step": 871
},
{
"epoch": 1.7832310838445808,
"grad_norm": 1.3988331617040364,
"learning_rate": 2.870606741382059e-07,
"loss": 0.0349,
"step": 872
},
{
"epoch": 1.7852760736196318,
"grad_norm": 1.4786599382381074,
"learning_rate": 2.817211910982037e-07,
"loss": 0.0281,
"step": 873
},
{
"epoch": 1.787321063394683,
"grad_norm": 1.7984122833021066,
"learning_rate": 2.7643039418912996e-07,
"loss": 0.0291,
"step": 874
},
{
"epoch": 1.7893660531697342,
"grad_norm": 1.7454260608433505,
"learning_rate": 2.711883380047131e-07,
"loss": 0.0292,
"step": 875
},
{
"epoch": 1.7914110429447851,
"grad_norm": 1.435756459453004,
"learning_rate": 2.6599507663574387e-07,
"loss": 0.0293,
"step": 876
},
{
"epoch": 1.7934560327198366,
"grad_norm": 1.4644482699217904,
"learning_rate": 2.6085066366951907e-07,
"loss": 0.0245,
"step": 877
},
{
"epoch": 1.7955010224948875,
"grad_norm": 1.3875758357595886,
"learning_rate": 2.557551521892859e-07,
"loss": 0.0271,
"step": 878
},
{
"epoch": 1.7975460122699385,
"grad_norm": 0.9876574184926428,
"learning_rate": 2.5070859477369645e-07,
"loss": 0.0148,
"step": 879
},
{
"epoch": 1.79959100204499,
"grad_norm": 1.265878789206368,
"learning_rate": 2.457110434962645e-07,
"loss": 0.0216,
"step": 880
},
{
"epoch": 1.8016359918200409,
"grad_norm": 1.40284126411437,
"learning_rate": 2.407625499248273e-07,
"loss": 0.0249,
"step": 881
},
{
"epoch": 1.803680981595092,
"grad_norm": 1.1876694796769958,
"learning_rate": 2.3586316512101416e-07,
"loss": 0.018,
"step": 882
},
{
"epoch": 1.8057259713701432,
"grad_norm": 0.8367133769318583,
"learning_rate": 2.3101293963972094e-07,
"loss": 0.0178,
"step": 883
},
{
"epoch": 1.8077709611451942,
"grad_norm": 1.0995322120882318,
"learning_rate": 2.2621192352858702e-07,
"loss": 0.0198,
"step": 884
},
{
"epoch": 1.8098159509202454,
"grad_norm": 1.703675555853278,
"learning_rate": 2.2146016632747624e-07,
"loss": 0.0341,
"step": 885
},
{
"epoch": 1.8118609406952966,
"grad_norm": 1.6229142547220725,
"learning_rate": 2.1675771706797132e-07,
"loss": 0.0278,
"step": 886
},
{
"epoch": 1.8139059304703475,
"grad_norm": 1.6632882474057635,
"learning_rate": 2.1210462427286528e-07,
"loss": 0.0264,
"step": 887
},
{
"epoch": 1.8159509202453987,
"grad_norm": 1.696506311524546,
"learning_rate": 2.0750093595565735e-07,
"loss": 0.0315,
"step": 888
},
{
"epoch": 1.81799591002045,
"grad_norm": 1.783147077677834,
"learning_rate": 2.0294669962006352e-07,
"loss": 0.0306,
"step": 889
},
{
"epoch": 1.8200408997955009,
"grad_norm": 1.0803640055203296,
"learning_rate": 1.984419622595224e-07,
"loss": 0.0159,
"step": 890
},
{
"epoch": 1.8220858895705523,
"grad_norm": 1.549113936998901,
"learning_rate": 1.9398677035671222e-07,
"loss": 0.0356,
"step": 891
},
{
"epoch": 1.8241308793456033,
"grad_norm": 1.217663448407663,
"learning_rate": 1.8958116988306852e-07,
"loss": 0.0214,
"step": 892
},
{
"epoch": 1.8261758691206544,
"grad_norm": 1.2606236244237474,
"learning_rate": 1.8522520629831396e-07,
"loss": 0.0264,
"step": 893
},
{
"epoch": 1.8282208588957056,
"grad_norm": 1.1212441204936592,
"learning_rate": 1.8091892454998595e-07,
"loss": 0.017,
"step": 894
},
{
"epoch": 1.8302658486707566,
"grad_norm": 1.042748614877236,
"learning_rate": 1.7666236907297407e-07,
"loss": 0.0164,
"step": 895
},
{
"epoch": 1.8323108384458078,
"grad_norm": 1.3863959126170518,
"learning_rate": 1.7245558378906012e-07,
"loss": 0.0266,
"step": 896
},
{
"epoch": 1.834355828220859,
"grad_norm": 1.3029901304956657,
"learning_rate": 1.682986121064689e-07,
"loss": 0.025,
"step": 897
},
{
"epoch": 1.83640081799591,
"grad_norm": 0.8924861887554183,
"learning_rate": 1.641914969194147e-07,
"loss": 0.014,
"step": 898
},
{
"epoch": 1.8384458077709611,
"grad_norm": 1.0234983500191113,
"learning_rate": 1.6013428060766168e-07,
"loss": 0.019,
"step": 899
},
{
"epoch": 1.8404907975460123,
"grad_norm": 0.9136453201589728,
"learning_rate": 1.561270050360897e-07,
"loss": 0.0146,
"step": 900
},
{
"epoch": 1.8425357873210633,
"grad_norm": 1.8298008002925186,
"learning_rate": 1.5216971155425474e-07,
"loss": 0.0367,
"step": 901
},
{
"epoch": 1.8445807770961147,
"grad_norm": 0.9475181347283721,
"learning_rate": 1.4826244099596986e-07,
"loss": 0.0148,
"step": 902
},
{
"epoch": 1.8466257668711656,
"grad_norm": 0.9550648746556579,
"learning_rate": 1.444052336788787e-07,
"loss": 0.015,
"step": 903
},
{
"epoch": 1.8486707566462166,
"grad_norm": 1.4927311894076911,
"learning_rate": 1.4059812940404093e-07,
"loss": 0.0286,
"step": 904
},
{
"epoch": 1.850715746421268,
"grad_norm": 1.1696983318525789,
"learning_rate": 1.3684116745552423e-07,
"loss": 0.0212,
"step": 905
},
{
"epoch": 1.852760736196319,
"grad_norm": 1.2578768723641045,
"learning_rate": 1.33134386599994e-07,
"loss": 0.0244,
"step": 906
},
{
"epoch": 1.8548057259713702,
"grad_norm": 1.558622255405316,
"learning_rate": 1.2947782508631823e-07,
"loss": 0.0237,
"step": 907
},
{
"epoch": 1.8568507157464214,
"grad_norm": 1.52292583646827,
"learning_rate": 1.2587152064516828e-07,
"loss": 0.0246,
"step": 908
},
{
"epoch": 1.8588957055214723,
"grad_norm": 1.1936675481636827,
"learning_rate": 1.2231551048863421e-07,
"loss": 0.022,
"step": 909
},
{
"epoch": 1.8609406952965235,
"grad_norm": 1.4367150151701205,
"learning_rate": 1.1880983130983626e-07,
"loss": 0.0274,
"step": 910
},
{
"epoch": 1.8629856850715747,
"grad_norm": 1.4947550938995606,
"learning_rate": 1.1535451928254948e-07,
"loss": 0.0225,
"step": 911
},
{
"epoch": 1.8650306748466257,
"grad_norm": 1.6128578500137207,
"learning_rate": 1.1194961006082972e-07,
"loss": 0.0332,
"step": 912
},
{
"epoch": 1.8670756646216768,
"grad_norm": 1.1175499571067549,
"learning_rate": 1.0859513877864381e-07,
"loss": 0.0202,
"step": 913
},
{
"epoch": 1.869120654396728,
"grad_norm": 1.7323202236444266,
"learning_rate": 1.0529114004951047e-07,
"loss": 0.0423,
"step": 914
},
{
"epoch": 1.871165644171779,
"grad_norm": 1.1600018835452393,
"learning_rate": 1.0203764796614057e-07,
"loss": 0.0194,
"step": 915
},
{
"epoch": 1.8732106339468304,
"grad_norm": 1.3204191190409245,
"learning_rate": 9.883469610008578e-08,
"loss": 0.027,
"step": 916
},
{
"epoch": 1.8752556237218814,
"grad_norm": 1.5789271332032802,
"learning_rate": 9.568231750139212e-08,
"loss": 0.0381,
"step": 917
},
{
"epoch": 1.8773006134969326,
"grad_norm": 1.8636082047134532,
"learning_rate": 9.258054469825972e-08,
"loss": 0.0343,
"step": 918
},
{
"epoch": 1.8793456032719837,
"grad_norm": 1.729715689169104,
"learning_rate": 8.952940969670809e-08,
"loss": 0.0333,
"step": 919
},
{
"epoch": 1.8813905930470347,
"grad_norm": 1.075641909438574,
"learning_rate": 8.652894398024137e-08,
"loss": 0.0191,
"step": 920
},
{
"epoch": 1.883435582822086,
"grad_norm": 1.3038211172361949,
"learning_rate": 8.357917850952802e-08,
"loss": 0.0235,
"step": 921
},
{
"epoch": 1.885480572597137,
"grad_norm": 1.06035453866717,
"learning_rate": 8.06801437220811e-08,
"loss": 0.0191,
"step": 922
},
{
"epoch": 1.887525562372188,
"grad_norm": 1.2603710476757168,
"learning_rate": 7.783186953194189e-08,
"loss": 0.0246,
"step": 923
},
{
"epoch": 1.8895705521472392,
"grad_norm": 1.6288893230239516,
"learning_rate": 7.503438532937169e-08,
"loss": 0.036,
"step": 924
},
{
"epoch": 1.8916155419222904,
"grad_norm": 1.3517453504226422,
"learning_rate": 7.228771998054995e-08,
"loss": 0.0239,
"step": 925
},
{
"epoch": 1.8936605316973414,
"grad_norm": 1.1095416942390794,
"learning_rate": 6.959190182727616e-08,
"loss": 0.0165,
"step": 926
},
{
"epoch": 1.8957055214723928,
"grad_norm": 1.356854695078147,
"learning_rate": 6.694695868667556e-08,
"loss": 0.0236,
"step": 927
},
{
"epoch": 1.8977505112474438,
"grad_norm": 1.3824579801789842,
"learning_rate": 6.43529178509139e-08,
"loss": 0.0267,
"step": 928
},
{
"epoch": 1.8997955010224947,
"grad_norm": 1.5910728635319653,
"learning_rate": 6.180980608691656e-08,
"loss": 0.0269,
"step": 929
},
{
"epoch": 1.9018404907975461,
"grad_norm": 1.3467484522834312,
"learning_rate": 5.9317649636088656e-08,
"loss": 0.0272,
"step": 930
},
{
"epoch": 1.903885480572597,
"grad_norm": 1.404654837104888,
"learning_rate": 5.687647421404874e-08,
"loss": 0.0242,
"step": 931
},
{
"epoch": 1.9059304703476483,
"grad_norm": 1.2357699402907227,
"learning_rate": 5.4486305010361116e-08,
"loss": 0.0211,
"step": 932
},
{
"epoch": 1.9079754601226995,
"grad_norm": 1.1078419095507943,
"learning_rate": 5.214716668827558e-08,
"loss": 0.0174,
"step": 933
},
{
"epoch": 1.9100204498977504,
"grad_norm": 1.1455453685150343,
"learning_rate": 4.985908338447476e-08,
"loss": 0.0215,
"step": 934
},
{
"epoch": 1.9120654396728016,
"grad_norm": 1.5709389406578784,
"learning_rate": 4.7622078708822184e-08,
"loss": 0.0269,
"step": 935
},
{
"epoch": 1.9141104294478528,
"grad_norm": 1.1654623477471513,
"learning_rate": 4.543617574412185e-08,
"loss": 0.0207,
"step": 936
},
{
"epoch": 1.9161554192229038,
"grad_norm": 1.2721022886120923,
"learning_rate": 4.330139704587788e-08,
"loss": 0.0247,
"step": 937
},
{
"epoch": 1.918200408997955,
"grad_norm": 1.7353353754797876,
"learning_rate": 4.1217764642062505e-08,
"loss": 0.0325,
"step": 938
},
{
"epoch": 1.9202453987730062,
"grad_norm": 1.4523042929349372,
"learning_rate": 3.9185300032889005e-08,
"loss": 0.0245,
"step": 939
},
{
"epoch": 1.9222903885480571,
"grad_norm": 1.038008423835432,
"learning_rate": 3.720402419058966e-08,
"loss": 0.0172,
"step": 940
},
{
"epoch": 1.9243353783231085,
"grad_norm": 1.5830227670771397,
"learning_rate": 3.5273957559199265e-08,
"loss": 0.0363,
"step": 941
},
{
"epoch": 1.9263803680981595,
"grad_norm": 1.3333668595314416,
"learning_rate": 3.339512005434309e-08,
"loss": 0.0299,
"step": 942
},
{
"epoch": 1.9284253578732107,
"grad_norm": 1.013011546713111,
"learning_rate": 3.156753106303367e-08,
"loss": 0.0211,
"step": 943
},
{
"epoch": 1.9304703476482619,
"grad_norm": 1.0888245467200752,
"learning_rate": 2.979120944346936e-08,
"loss": 0.0197,
"step": 944
},
{
"epoch": 1.9325153374233128,
"grad_norm": 1.5607072199045122,
"learning_rate": 2.8066173524839978e-08,
"loss": 0.0254,
"step": 945
},
{
"epoch": 1.934560327198364,
"grad_norm": 1.299299607811048,
"learning_rate": 2.6392441107137013e-08,
"loss": 0.021,
"step": 946
},
{
"epoch": 1.9366053169734152,
"grad_norm": 1.6718512782375123,
"learning_rate": 2.4770029460970956e-08,
"loss": 0.0261,
"step": 947
},
{
"epoch": 1.9386503067484662,
"grad_norm": 1.5210112393820647,
"learning_rate": 2.319895532739369e-08,
"loss": 0.0301,
"step": 948
},
{
"epoch": 1.9406952965235174,
"grad_norm": 1.1866446525320051,
"learning_rate": 2.1679234917721946e-08,
"loss": 0.0219,
"step": 949
},
{
"epoch": 1.9427402862985685,
"grad_norm": 1.5585668068927292,
"learning_rate": 2.0210883913376334e-08,
"loss": 0.0271,
"step": 950
},
{
"epoch": 1.9447852760736195,
"grad_norm": 1.7577228374712899,
"learning_rate": 1.8793917465713686e-08,
"loss": 0.0368,
"step": 951
},
{
"epoch": 1.946830265848671,
"grad_norm": 1.122105721519514,
"learning_rate": 1.742835019587441e-08,
"loss": 0.0195,
"step": 952
},
{
"epoch": 1.9488752556237219,
"grad_norm": 1.0658375784177427,
"learning_rate": 1.6114196194628174e-08,
"loss": 0.017,
"step": 953
},
{
"epoch": 1.9509202453987728,
"grad_norm": 1.3923391468611417,
"learning_rate": 1.4851469022234e-08,
"loss": 0.0273,
"step": 954
},
{
"epoch": 1.9529652351738243,
"grad_norm": 1.3524898267742,
"learning_rate": 1.3640181708293731e-08,
"loss": 0.0259,
"step": 955
},
{
"epoch": 1.9550102249488752,
"grad_norm": 1.8660062595825002,
"learning_rate": 1.2480346751622686e-08,
"loss": 0.0324,
"step": 956
},
{
"epoch": 1.9570552147239264,
"grad_norm": 1.179664700215166,
"learning_rate": 1.137197612011809e-08,
"loss": 0.0259,
"step": 957
},
{
"epoch": 1.9591002044989776,
"grad_norm": 1.8058704230919458,
"learning_rate": 1.0315081250636405e-08,
"loss": 0.0265,
"step": 958
},
{
"epoch": 1.9611451942740286,
"grad_norm": 1.2051173078887687,
"learning_rate": 9.30967304887509e-09,
"loss": 0.0195,
"step": 959
},
{
"epoch": 1.9631901840490797,
"grad_norm": 1.4007351229371985,
"learning_rate": 8.35576188926046e-09,
"loss": 0.0304,
"step": 960
},
{
"epoch": 1.965235173824131,
"grad_norm": 1.1573585526709194,
"learning_rate": 7.453357614841116e-09,
"loss": 0.0202,
"step": 961
},
{
"epoch": 1.967280163599182,
"grad_norm": 1.3119134280852782,
"learning_rate": 6.60246953718302e-09,
"loss": 0.0284,
"step": 962
},
{
"epoch": 1.969325153374233,
"grad_norm": 2.16522611001059,
"learning_rate": 5.803106436279571e-09,
"loss": 0.039,
"step": 963
},
{
"epoch": 1.9713701431492843,
"grad_norm": 1.2979533184475112,
"learning_rate": 5.055276560454459e-09,
"loss": 0.025,
"step": 964
},
{
"epoch": 1.9734151329243352,
"grad_norm": 1.0802181068863799,
"learning_rate": 4.358987626281175e-09,
"loss": 0.0151,
"step": 965
},
{
"epoch": 1.9754601226993866,
"grad_norm": 1.7117951934036464,
"learning_rate": 3.71424681850141e-09,
"loss": 0.0355,
"step": 966
},
{
"epoch": 1.9775051124744376,
"grad_norm": 1.3376552108682394,
"learning_rate": 3.1210607899512244e-09,
"loss": 0.0251,
"step": 967
},
{
"epoch": 1.9795501022494888,
"grad_norm": 1.1725724434752407,
"learning_rate": 2.579435661492213e-09,
"loss": 0.0204,
"step": 968
},
{
"epoch": 1.98159509202454,
"grad_norm": 1.696450698444297,
"learning_rate": 2.0893770219493347e-09,
"loss": 0.0299,
"step": 969
},
{
"epoch": 1.983640081799591,
"grad_norm": 1.325923649921468,
"learning_rate": 1.6508899280515134e-09,
"loss": 0.0192,
"step": 970
},
{
"epoch": 1.9856850715746421,
"grad_norm": 1.1813786320601327,
"learning_rate": 1.2639789043805695e-09,
"loss": 0.0196,
"step": 971
},
{
"epoch": 1.9877300613496933,
"grad_norm": 0.9284331487465304,
"learning_rate": 9.286479433257e-10,
"loss": 0.0144,
"step": 972
},
{
"epoch": 1.9897750511247443,
"grad_norm": 1.7289144251248756,
"learning_rate": 6.4490050503907e-10,
"loss": 0.0365,
"step": 973
},
{
"epoch": 1.9918200408997955,
"grad_norm": 1.3982000044906164,
"learning_rate": 4.127395174036153e-10,
"loss": 0.0259,
"step": 974
},
{
"epoch": 1.9938650306748467,
"grad_norm": 1.1534216307337495,
"learning_rate": 2.321673760002918e-10,
"loss": 0.0185,
"step": 975
},
{
"epoch": 1.9959100204498976,
"grad_norm": 1.4226080527702496,
"learning_rate": 1.0318594408476045e-10,
"loss": 0.0227,
"step": 976
},
{
"epoch": 1.997955010224949,
"grad_norm": 2.2227043437975627,
"learning_rate": 2.57965525674031e-11,
"loss": 0.0355,
"step": 977
},
{
"epoch": 2.0,
"grad_norm": 1.7779216395225974,
"learning_rate": 0.0,
"loss": 0.029,
"step": 978
},
{
"epoch": 2.0,
"step": 978,
"total_flos": 4304231890944.0,
"train_loss": 0.056620436601515986,
"train_runtime": 754.4403,
"train_samples_per_second": 10.36,
"train_steps_per_second": 1.296
}
],
"logging_steps": 1,
"max_steps": 978,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4304231890944.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}