bert-1ds-domain / trainer_state.json
Vishal24's picture
Upload checkpoint-387090
d0977bd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 387090,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.038750678136867396,
"grad_norm": 7.044970512390137,
"learning_rate": 1.9922498643726265e-05,
"loss": 5.0771,
"step": 500
},
{
"epoch": 0.07750135627373479,
"grad_norm": 6.129168510437012,
"learning_rate": 1.9844997287452532e-05,
"loss": 4.5935,
"step": 1000
},
{
"epoch": 0.11625203441060218,
"grad_norm": 6.968145847320557,
"learning_rate": 1.9767495931178796e-05,
"loss": 4.3872,
"step": 1500
},
{
"epoch": 0.15500271254746958,
"grad_norm": 6.270899295806885,
"learning_rate": 1.9689994574905063e-05,
"loss": 4.2507,
"step": 2000
},
{
"epoch": 0.19375339068433697,
"grad_norm": 6.4452104568481445,
"learning_rate": 1.9612493218631326e-05,
"loss": 4.135,
"step": 2500
},
{
"epoch": 0.23250406882120436,
"grad_norm": 6.564201354980469,
"learning_rate": 1.9534991862357593e-05,
"loss": 4.0704,
"step": 3000
},
{
"epoch": 0.2712547469580718,
"grad_norm": 6.716593265533447,
"learning_rate": 1.9457490506083857e-05,
"loss": 3.9814,
"step": 3500
},
{
"epoch": 0.31000542509493917,
"grad_norm": 6.578341484069824,
"learning_rate": 1.9379989149810124e-05,
"loss": 3.9078,
"step": 4000
},
{
"epoch": 0.34875610323180656,
"grad_norm": 6.6163482666015625,
"learning_rate": 1.9302487793536387e-05,
"loss": 3.858,
"step": 4500
},
{
"epoch": 0.38750678136867395,
"grad_norm": 5.90346097946167,
"learning_rate": 1.9224986437262654e-05,
"loss": 3.8036,
"step": 5000
},
{
"epoch": 0.42625745950554134,
"grad_norm": 5.8487467765808105,
"learning_rate": 1.9147485080988918e-05,
"loss": 3.7518,
"step": 5500
},
{
"epoch": 0.4650081376424087,
"grad_norm": 6.573217391967773,
"learning_rate": 1.9069983724715185e-05,
"loss": 3.7164,
"step": 6000
},
{
"epoch": 0.5037588157792762,
"grad_norm": 5.970231056213379,
"learning_rate": 1.899248236844145e-05,
"loss": 3.673,
"step": 6500
},
{
"epoch": 0.5425094939161436,
"grad_norm": 6.683649063110352,
"learning_rate": 1.8914981012167715e-05,
"loss": 3.6405,
"step": 7000
},
{
"epoch": 0.581260172053011,
"grad_norm": 6.538488864898682,
"learning_rate": 1.883747965589398e-05,
"loss": 3.6133,
"step": 7500
},
{
"epoch": 0.6200108501898783,
"grad_norm": 6.528162479400635,
"learning_rate": 1.8759978299620246e-05,
"loss": 3.5754,
"step": 8000
},
{
"epoch": 0.6587615283267457,
"grad_norm": 6.43408203125,
"learning_rate": 1.868247694334651e-05,
"loss": 3.5548,
"step": 8500
},
{
"epoch": 0.6975122064636131,
"grad_norm": 6.131889820098877,
"learning_rate": 1.8604975587072776e-05,
"loss": 3.5228,
"step": 9000
},
{
"epoch": 0.7362628846004805,
"grad_norm": 6.320891857147217,
"learning_rate": 1.852747423079904e-05,
"loss": 3.4999,
"step": 9500
},
{
"epoch": 0.7750135627373479,
"grad_norm": 6.105418682098389,
"learning_rate": 1.8449972874525307e-05,
"loss": 3.5071,
"step": 10000
},
{
"epoch": 0.8137642408742153,
"grad_norm": 6.774458885192871,
"learning_rate": 1.837247151825157e-05,
"loss": 3.4616,
"step": 10500
},
{
"epoch": 0.8525149190110827,
"grad_norm": 6.263659477233887,
"learning_rate": 1.8294970161977838e-05,
"loss": 3.4499,
"step": 11000
},
{
"epoch": 0.8912655971479501,
"grad_norm": 6.58251428604126,
"learning_rate": 1.82174688057041e-05,
"loss": 3.4157,
"step": 11500
},
{
"epoch": 0.9300162752848175,
"grad_norm": 6.030143737792969,
"learning_rate": 1.8139967449430368e-05,
"loss": 3.3842,
"step": 12000
},
{
"epoch": 0.9687669534216848,
"grad_norm": 6.361506462097168,
"learning_rate": 1.806246609315663e-05,
"loss": 3.3707,
"step": 12500
},
{
"epoch": 1.0,
"eval_loss": 3.2508351802825928,
"eval_runtime": 267.1986,
"eval_samples_per_second": 772.702,
"eval_steps_per_second": 12.077,
"step": 12903
},
{
"epoch": 1.0075176315585523,
"grad_norm": 6.418643474578857,
"learning_rate": 1.79849647368829e-05,
"loss": 3.3673,
"step": 13000
},
{
"epoch": 1.0462683096954197,
"grad_norm": 6.310774326324463,
"learning_rate": 1.7907463380609162e-05,
"loss": 3.3276,
"step": 13500
},
{
"epoch": 1.0850189878322871,
"grad_norm": 6.517366409301758,
"learning_rate": 1.782996202433543e-05,
"loss": 3.3288,
"step": 14000
},
{
"epoch": 1.1237696659691545,
"grad_norm": 6.407958984375,
"learning_rate": 1.7752460668061693e-05,
"loss": 3.3003,
"step": 14500
},
{
"epoch": 1.162520344106022,
"grad_norm": 6.145129203796387,
"learning_rate": 1.767495931178796e-05,
"loss": 3.2694,
"step": 15000
},
{
"epoch": 1.2012710222428893,
"grad_norm": 6.586604118347168,
"learning_rate": 1.7597457955514223e-05,
"loss": 3.2627,
"step": 15500
},
{
"epoch": 1.2400217003797567,
"grad_norm": 6.122056007385254,
"learning_rate": 1.751995659924049e-05,
"loss": 3.2631,
"step": 16000
},
{
"epoch": 1.278772378516624,
"grad_norm": 6.545727252960205,
"learning_rate": 1.7442455242966754e-05,
"loss": 3.2324,
"step": 16500
},
{
"epoch": 1.3175230566534915,
"grad_norm": 6.427816390991211,
"learning_rate": 1.7364953886693017e-05,
"loss": 3.227,
"step": 17000
},
{
"epoch": 1.3562737347903588,
"grad_norm": 6.253689765930176,
"learning_rate": 1.7287452530419284e-05,
"loss": 3.2099,
"step": 17500
},
{
"epoch": 1.3950244129272262,
"grad_norm": 6.5702080726623535,
"learning_rate": 1.7209951174145548e-05,
"loss": 3.2102,
"step": 18000
},
{
"epoch": 1.4337750910640936,
"grad_norm": 6.4822564125061035,
"learning_rate": 1.7132449817871815e-05,
"loss": 3.1935,
"step": 18500
},
{
"epoch": 1.472525769200961,
"grad_norm": 6.524315357208252,
"learning_rate": 1.705494846159808e-05,
"loss": 3.1955,
"step": 19000
},
{
"epoch": 1.5112764473378284,
"grad_norm": 6.302344799041748,
"learning_rate": 1.6977447105324345e-05,
"loss": 3.1726,
"step": 19500
},
{
"epoch": 1.5500271254746958,
"grad_norm": 5.837028503417969,
"learning_rate": 1.689994574905061e-05,
"loss": 3.1277,
"step": 20000
},
{
"epoch": 1.5887778036115632,
"grad_norm": 6.489377975463867,
"learning_rate": 1.6822444392776876e-05,
"loss": 3.1414,
"step": 20500
},
{
"epoch": 1.6275284817484306,
"grad_norm": 6.543872833251953,
"learning_rate": 1.674494303650314e-05,
"loss": 3.104,
"step": 21000
},
{
"epoch": 1.666279159885298,
"grad_norm": 6.05628776550293,
"learning_rate": 1.6667441680229406e-05,
"loss": 3.1459,
"step": 21500
},
{
"epoch": 1.7050298380221653,
"grad_norm": 6.027078151702881,
"learning_rate": 1.658994032395567e-05,
"loss": 3.0963,
"step": 22000
},
{
"epoch": 1.7437805161590327,
"grad_norm": 6.577582359313965,
"learning_rate": 1.6512438967681937e-05,
"loss": 3.118,
"step": 22500
},
{
"epoch": 1.7825311942959001,
"grad_norm": 5.9164204597473145,
"learning_rate": 1.64349376114082e-05,
"loss": 3.0928,
"step": 23000
},
{
"epoch": 1.8212818724327677,
"grad_norm": 6.155348300933838,
"learning_rate": 1.6357436255134468e-05,
"loss": 3.0885,
"step": 23500
},
{
"epoch": 1.8600325505696351,
"grad_norm": 6.302849769592285,
"learning_rate": 1.627993489886073e-05,
"loss": 3.0741,
"step": 24000
},
{
"epoch": 1.8987832287065025,
"grad_norm": 6.140907287597656,
"learning_rate": 1.6202433542586998e-05,
"loss": 3.0633,
"step": 24500
},
{
"epoch": 1.93753390684337,
"grad_norm": 5.85639762878418,
"learning_rate": 1.612493218631326e-05,
"loss": 3.0401,
"step": 25000
},
{
"epoch": 1.9762845849802373,
"grad_norm": 6.558920383453369,
"learning_rate": 1.604743083003953e-05,
"loss": 3.05,
"step": 25500
},
{
"epoch": 2.0,
"eval_loss": 2.9521713256835938,
"eval_runtime": 258.4886,
"eval_samples_per_second": 798.739,
"eval_steps_per_second": 12.484,
"step": 25806
},
{
"epoch": 2.0150352631171047,
"grad_norm": 6.003655433654785,
"learning_rate": 1.5969929473765792e-05,
"loss": 3.0292,
"step": 26000
},
{
"epoch": 2.053785941253972,
"grad_norm": 6.43280029296875,
"learning_rate": 1.589242811749206e-05,
"loss": 3.0205,
"step": 26500
},
{
"epoch": 2.0925366193908395,
"grad_norm": 6.051511287689209,
"learning_rate": 1.5814926761218323e-05,
"loss": 3.0152,
"step": 27000
},
{
"epoch": 2.131287297527707,
"grad_norm": 7.381418704986572,
"learning_rate": 1.573742540494459e-05,
"loss": 3.0067,
"step": 27500
},
{
"epoch": 2.1700379756645742,
"grad_norm": 6.032004356384277,
"learning_rate": 1.5659924048670853e-05,
"loss": 2.9821,
"step": 28000
},
{
"epoch": 2.2087886538014416,
"grad_norm": 6.481622695922852,
"learning_rate": 1.558242269239712e-05,
"loss": 2.9824,
"step": 28500
},
{
"epoch": 2.247539331938309,
"grad_norm": 5.934979438781738,
"learning_rate": 1.5504921336123384e-05,
"loss": 2.9708,
"step": 29000
},
{
"epoch": 2.2862900100751764,
"grad_norm": 7.498392581939697,
"learning_rate": 1.542741997984965e-05,
"loss": 2.9836,
"step": 29500
},
{
"epoch": 2.325040688212044,
"grad_norm": 6.350077152252197,
"learning_rate": 1.5349918623575914e-05,
"loss": 2.9608,
"step": 30000
},
{
"epoch": 2.363791366348911,
"grad_norm": 5.6795783042907715,
"learning_rate": 1.527241726730218e-05,
"loss": 2.9551,
"step": 30500
},
{
"epoch": 2.4025420444857786,
"grad_norm": 6.395376682281494,
"learning_rate": 1.5194915911028445e-05,
"loss": 2.9551,
"step": 31000
},
{
"epoch": 2.441292722622646,
"grad_norm": 6.238061904907227,
"learning_rate": 1.511741455475471e-05,
"loss": 2.9527,
"step": 31500
},
{
"epoch": 2.4800434007595134,
"grad_norm": 6.641284465789795,
"learning_rate": 1.5039913198480975e-05,
"loss": 2.9444,
"step": 32000
},
{
"epoch": 2.5187940788963807,
"grad_norm": 6.30321741104126,
"learning_rate": 1.496241184220724e-05,
"loss": 2.9346,
"step": 32500
},
{
"epoch": 2.557544757033248,
"grad_norm": 8.681157112121582,
"learning_rate": 1.4884910485933506e-05,
"loss": 2.9204,
"step": 33000
},
{
"epoch": 2.5962954351701155,
"grad_norm": 6.423407077789307,
"learning_rate": 1.4807409129659771e-05,
"loss": 2.9066,
"step": 33500
},
{
"epoch": 2.635046113306983,
"grad_norm": 6.697604179382324,
"learning_rate": 1.4729907773386036e-05,
"loss": 2.9079,
"step": 34000
},
{
"epoch": 2.6737967914438503,
"grad_norm": 6.646244049072266,
"learning_rate": 1.4652406417112302e-05,
"loss": 2.9148,
"step": 34500
},
{
"epoch": 2.7125474695807177,
"grad_norm": 6.80411958694458,
"learning_rate": 1.4574905060838567e-05,
"loss": 2.9005,
"step": 35000
},
{
"epoch": 2.751298147717585,
"grad_norm": 6.345988750457764,
"learning_rate": 1.4497403704564832e-05,
"loss": 2.888,
"step": 35500
},
{
"epoch": 2.7900488258544525,
"grad_norm": 5.965686798095703,
"learning_rate": 1.4419902348291098e-05,
"loss": 2.8845,
"step": 36000
},
{
"epoch": 2.82879950399132,
"grad_norm": 6.068357944488525,
"learning_rate": 1.4342400992017363e-05,
"loss": 2.8835,
"step": 36500
},
{
"epoch": 2.8675501821281872,
"grad_norm": 5.874370098114014,
"learning_rate": 1.4264899635743628e-05,
"loss": 2.8904,
"step": 37000
},
{
"epoch": 2.9063008602650546,
"grad_norm": 6.0566935539245605,
"learning_rate": 1.4187398279469893e-05,
"loss": 2.8823,
"step": 37500
},
{
"epoch": 2.945051538401922,
"grad_norm": 6.21787691116333,
"learning_rate": 1.4109896923196159e-05,
"loss": 2.867,
"step": 38000
},
{
"epoch": 2.9838022165387894,
"grad_norm": 6.055897235870361,
"learning_rate": 1.4032395566922424e-05,
"loss": 2.867,
"step": 38500
},
{
"epoch": 3.0,
"eval_loss": 2.775702953338623,
"eval_runtime": 259.0101,
"eval_samples_per_second": 797.131,
"eval_steps_per_second": 12.459,
"step": 38709
},
{
"epoch": 3.022552894675657,
"grad_norm": 5.503760814666748,
"learning_rate": 1.3954894210648689e-05,
"loss": 2.8428,
"step": 39000
},
{
"epoch": 3.061303572812524,
"grad_norm": 6.250561714172363,
"learning_rate": 1.3877392854374954e-05,
"loss": 2.842,
"step": 39500
},
{
"epoch": 3.1000542509493916,
"grad_norm": 6.394408226013184,
"learning_rate": 1.379989149810122e-05,
"loss": 2.8368,
"step": 40000
},
{
"epoch": 3.138804929086259,
"grad_norm": 5.7096428871154785,
"learning_rate": 1.3722390141827483e-05,
"loss": 2.8253,
"step": 40500
},
{
"epoch": 3.1775556072231264,
"grad_norm": 6.807374000549316,
"learning_rate": 1.3644888785553749e-05,
"loss": 2.821,
"step": 41000
},
{
"epoch": 3.2163062853599937,
"grad_norm": 6.367000102996826,
"learning_rate": 1.3567387429280014e-05,
"loss": 2.8302,
"step": 41500
},
{
"epoch": 3.255056963496861,
"grad_norm": 6.30033540725708,
"learning_rate": 1.3489886073006279e-05,
"loss": 2.8191,
"step": 42000
},
{
"epoch": 3.2938076416337285,
"grad_norm": 7.257653713226318,
"learning_rate": 1.3412384716732544e-05,
"loss": 2.8196,
"step": 42500
},
{
"epoch": 3.332558319770596,
"grad_norm": 7.1162109375,
"learning_rate": 1.333488336045881e-05,
"loss": 2.817,
"step": 43000
},
{
"epoch": 3.3713089979074633,
"grad_norm": 6.336881160736084,
"learning_rate": 1.3257382004185075e-05,
"loss": 2.8064,
"step": 43500
},
{
"epoch": 3.4100596760443307,
"grad_norm": 6.641462326049805,
"learning_rate": 1.317988064791134e-05,
"loss": 2.8035,
"step": 44000
},
{
"epoch": 3.448810354181198,
"grad_norm": 6.033754348754883,
"learning_rate": 1.3102379291637605e-05,
"loss": 2.7976,
"step": 44500
},
{
"epoch": 3.4875610323180655,
"grad_norm": 6.544773101806641,
"learning_rate": 1.302487793536387e-05,
"loss": 2.8048,
"step": 45000
},
{
"epoch": 3.526311710454933,
"grad_norm": 6.382020950317383,
"learning_rate": 1.2947376579090136e-05,
"loss": 2.7982,
"step": 45500
},
{
"epoch": 3.5650623885918002,
"grad_norm": 6.194632053375244,
"learning_rate": 1.2869875222816401e-05,
"loss": 2.7749,
"step": 46000
},
{
"epoch": 3.6038130667286676,
"grad_norm": 6.429641246795654,
"learning_rate": 1.2792373866542665e-05,
"loss": 2.7853,
"step": 46500
},
{
"epoch": 3.642563744865535,
"grad_norm": 6.209822177886963,
"learning_rate": 1.271487251026893e-05,
"loss": 2.7841,
"step": 47000
},
{
"epoch": 3.6813144230024024,
"grad_norm": 6.935910701751709,
"learning_rate": 1.2637371153995195e-05,
"loss": 2.7681,
"step": 47500
},
{
"epoch": 3.72006510113927,
"grad_norm": 7.021639347076416,
"learning_rate": 1.255986979772146e-05,
"loss": 2.7658,
"step": 48000
},
{
"epoch": 3.758815779276137,
"grad_norm": 6.242121696472168,
"learning_rate": 1.2482368441447726e-05,
"loss": 2.7698,
"step": 48500
},
{
"epoch": 3.7975664574130046,
"grad_norm": 6.123905658721924,
"learning_rate": 1.2404867085173991e-05,
"loss": 2.7711,
"step": 49000
},
{
"epoch": 3.836317135549872,
"grad_norm": 6.735771179199219,
"learning_rate": 1.2327365728900256e-05,
"loss": 2.726,
"step": 49500
},
{
"epoch": 3.8750678136867394,
"grad_norm": 6.921602725982666,
"learning_rate": 1.2249864372626522e-05,
"loss": 2.7545,
"step": 50000
},
{
"epoch": 3.9138184918236067,
"grad_norm": 6.343456745147705,
"learning_rate": 1.2172363016352787e-05,
"loss": 2.7474,
"step": 50500
},
{
"epoch": 3.9525691699604746,
"grad_norm": 6.30169677734375,
"learning_rate": 1.2094861660079052e-05,
"loss": 2.7467,
"step": 51000
},
{
"epoch": 3.9913198480973415,
"grad_norm": 6.6629767417907715,
"learning_rate": 1.2017360303805317e-05,
"loss": 2.7475,
"step": 51500
},
{
"epoch": 4.0,
"eval_loss": 2.6640822887420654,
"eval_runtime": 260.2494,
"eval_samples_per_second": 793.335,
"eval_steps_per_second": 12.4,
"step": 51612
},
{
"epoch": 4.030070526234209,
"grad_norm": 6.397671222686768,
"learning_rate": 1.1939858947531581e-05,
"loss": 2.7311,
"step": 52000
},
{
"epoch": 4.068821204371076,
"grad_norm": 6.374961853027344,
"learning_rate": 1.1862357591257846e-05,
"loss": 2.7119,
"step": 52500
},
{
"epoch": 4.107571882507944,
"grad_norm": 5.920938968658447,
"learning_rate": 1.1784856234984112e-05,
"loss": 2.7217,
"step": 53000
},
{
"epoch": 4.146322560644811,
"grad_norm": 6.377143859863281,
"learning_rate": 1.1707354878710377e-05,
"loss": 2.7044,
"step": 53500
},
{
"epoch": 4.185073238781679,
"grad_norm": 7.047250270843506,
"learning_rate": 1.1629853522436642e-05,
"loss": 2.7213,
"step": 54000
},
{
"epoch": 4.223823916918546,
"grad_norm": 6.682352066040039,
"learning_rate": 1.1552352166162907e-05,
"loss": 2.7025,
"step": 54500
},
{
"epoch": 4.262574595055414,
"grad_norm": 6.547230243682861,
"learning_rate": 1.1474850809889173e-05,
"loss": 2.7068,
"step": 55000
},
{
"epoch": 4.301325273192281,
"grad_norm": 6.038912296295166,
"learning_rate": 1.1397349453615438e-05,
"loss": 2.7061,
"step": 55500
},
{
"epoch": 4.3400759513291485,
"grad_norm": 6.072612762451172,
"learning_rate": 1.1319848097341703e-05,
"loss": 2.7037,
"step": 56000
},
{
"epoch": 4.378826629466015,
"grad_norm": 5.6306281089782715,
"learning_rate": 1.1242346741067968e-05,
"loss": 2.6999,
"step": 56500
},
{
"epoch": 4.417577307602883,
"grad_norm": 6.18297004699707,
"learning_rate": 1.1164845384794234e-05,
"loss": 2.6974,
"step": 57000
},
{
"epoch": 4.45632798573975,
"grad_norm": 6.371115207672119,
"learning_rate": 1.1087344028520499e-05,
"loss": 2.6918,
"step": 57500
},
{
"epoch": 4.495078663876618,
"grad_norm": 6.444944381713867,
"learning_rate": 1.1009842672246764e-05,
"loss": 2.6874,
"step": 58000
},
{
"epoch": 4.533829342013485,
"grad_norm": 6.176960468292236,
"learning_rate": 1.093234131597303e-05,
"loss": 2.68,
"step": 58500
},
{
"epoch": 4.572580020150353,
"grad_norm": 6.731847763061523,
"learning_rate": 1.0854839959699295e-05,
"loss": 2.6919,
"step": 59000
},
{
"epoch": 4.61133069828722,
"grad_norm": 7.826213836669922,
"learning_rate": 1.077733860342556e-05,
"loss": 2.6824,
"step": 59500
},
{
"epoch": 4.650081376424088,
"grad_norm": 7.052020072937012,
"learning_rate": 1.0699837247151825e-05,
"loss": 2.6616,
"step": 60000
},
{
"epoch": 4.6888320545609545,
"grad_norm": 5.36915922164917,
"learning_rate": 1.062233589087809e-05,
"loss": 2.667,
"step": 60500
},
{
"epoch": 4.727582732697822,
"grad_norm": 6.491717338562012,
"learning_rate": 1.0544834534604356e-05,
"loss": 2.6896,
"step": 61000
},
{
"epoch": 4.766333410834689,
"grad_norm": 7.702902793884277,
"learning_rate": 1.0467333178330621e-05,
"loss": 2.6712,
"step": 61500
},
{
"epoch": 4.805084088971557,
"grad_norm": 6.359930992126465,
"learning_rate": 1.0389831822056886e-05,
"loss": 2.6704,
"step": 62000
},
{
"epoch": 4.843834767108424,
"grad_norm": 6.2874531745910645,
"learning_rate": 1.0312330465783152e-05,
"loss": 2.6757,
"step": 62500
},
{
"epoch": 4.882585445245292,
"grad_norm": 6.827906131744385,
"learning_rate": 1.0234829109509417e-05,
"loss": 2.6567,
"step": 63000
},
{
"epoch": 4.921336123382159,
"grad_norm": 6.620416164398193,
"learning_rate": 1.0157327753235682e-05,
"loss": 2.6615,
"step": 63500
},
{
"epoch": 4.960086801519027,
"grad_norm": 6.6219162940979,
"learning_rate": 1.0079826396961947e-05,
"loss": 2.657,
"step": 64000
},
{
"epoch": 4.998837479655894,
"grad_norm": 6.214903831481934,
"learning_rate": 1.0002325040688213e-05,
"loss": 2.6549,
"step": 64500
},
{
"epoch": 5.0,
"eval_loss": 2.578911066055298,
"eval_runtime": 265.1883,
"eval_samples_per_second": 778.56,
"eval_steps_per_second": 12.169,
"step": 64515
},
{
"epoch": 5.0375881577927615,
"grad_norm": 6.627685546875,
"learning_rate": 9.924823684414478e-06,
"loss": 2.6203,
"step": 65000
},
{
"epoch": 5.076338835929628,
"grad_norm": 6.23040771484375,
"learning_rate": 9.847322328140743e-06,
"loss": 2.6349,
"step": 65500
},
{
"epoch": 5.115089514066496,
"grad_norm": 6.667369365692139,
"learning_rate": 9.769820971867009e-06,
"loss": 2.647,
"step": 66000
},
{
"epoch": 5.153840192203363,
"grad_norm": 6.694558620452881,
"learning_rate": 9.692319615593274e-06,
"loss": 2.6214,
"step": 66500
},
{
"epoch": 5.192590870340231,
"grad_norm": 6.280242443084717,
"learning_rate": 9.614818259319539e-06,
"loss": 2.6206,
"step": 67000
},
{
"epoch": 5.231341548477098,
"grad_norm": 6.660119533538818,
"learning_rate": 9.537316903045804e-06,
"loss": 2.6307,
"step": 67500
},
{
"epoch": 5.270092226613966,
"grad_norm": 6.439652919769287,
"learning_rate": 9.45981554677207e-06,
"loss": 2.6431,
"step": 68000
},
{
"epoch": 5.308842904750833,
"grad_norm": 6.055843830108643,
"learning_rate": 9.382314190498335e-06,
"loss": 2.6144,
"step": 68500
},
{
"epoch": 5.347593582887701,
"grad_norm": 6.519714832305908,
"learning_rate": 9.3048128342246e-06,
"loss": 2.6056,
"step": 69000
},
{
"epoch": 5.3863442610245675,
"grad_norm": 6.72304630279541,
"learning_rate": 9.227311477950864e-06,
"loss": 2.623,
"step": 69500
},
{
"epoch": 5.425094939161435,
"grad_norm": 7.048790454864502,
"learning_rate": 9.149810121677129e-06,
"loss": 2.6043,
"step": 70000
},
{
"epoch": 5.463845617298302,
"grad_norm": 6.654219627380371,
"learning_rate": 9.072308765403394e-06,
"loss": 2.6135,
"step": 70500
},
{
"epoch": 5.50259629543517,
"grad_norm": 5.948112487792969,
"learning_rate": 8.99480740912966e-06,
"loss": 2.6295,
"step": 71000
},
{
"epoch": 5.541346973572038,
"grad_norm": 7.8044328689575195,
"learning_rate": 8.917306052855925e-06,
"loss": 2.6104,
"step": 71500
},
{
"epoch": 5.580097651708905,
"grad_norm": 6.743612766265869,
"learning_rate": 8.83980469658219e-06,
"loss": 2.6216,
"step": 72000
},
{
"epoch": 5.618848329845772,
"grad_norm": 6.346240043640137,
"learning_rate": 8.762303340308455e-06,
"loss": 2.6238,
"step": 72500
},
{
"epoch": 5.65759900798264,
"grad_norm": 6.496920108795166,
"learning_rate": 8.68480198403472e-06,
"loss": 2.6334,
"step": 73000
},
{
"epoch": 5.6963496861195075,
"grad_norm": 6.356810569763184,
"learning_rate": 8.607300627760986e-06,
"loss": 2.5995,
"step": 73500
},
{
"epoch": 5.7351003642563745,
"grad_norm": 6.226792812347412,
"learning_rate": 8.529799271487251e-06,
"loss": 2.5974,
"step": 74000
},
{
"epoch": 5.773851042393241,
"grad_norm": 6.6555962562561035,
"learning_rate": 8.452297915213516e-06,
"loss": 2.6285,
"step": 74500
},
{
"epoch": 5.812601720530109,
"grad_norm": 6.32110595703125,
"learning_rate": 8.374796558939782e-06,
"loss": 2.6035,
"step": 75000
},
{
"epoch": 5.851352398666977,
"grad_norm": 6.651345252990723,
"learning_rate": 8.297295202666047e-06,
"loss": 2.5886,
"step": 75500
},
{
"epoch": 5.890103076803844,
"grad_norm": 6.736583232879639,
"learning_rate": 8.219793846392312e-06,
"loss": 2.5903,
"step": 76000
},
{
"epoch": 5.928853754940711,
"grad_norm": 6.635737895965576,
"learning_rate": 8.142292490118577e-06,
"loss": 2.597,
"step": 76500
},
{
"epoch": 5.967604433077579,
"grad_norm": 6.3186492919921875,
"learning_rate": 8.064791133844843e-06,
"loss": 2.5732,
"step": 77000
},
{
"epoch": 6.0,
"eval_loss": 2.5146169662475586,
"eval_runtime": 259.2569,
"eval_samples_per_second": 796.372,
"eval_steps_per_second": 12.447,
"step": 77418
},
{
"epoch": 6.006355111214447,
"grad_norm": 6.408041000366211,
"learning_rate": 7.987289777571108e-06,
"loss": 2.5742,
"step": 77500
},
{
"epoch": 6.045105789351314,
"grad_norm": 6.398166656494141,
"learning_rate": 7.909788421297373e-06,
"loss": 2.5829,
"step": 78000
},
{
"epoch": 6.083856467488181,
"grad_norm": 6.89434289932251,
"learning_rate": 7.832287065023639e-06,
"loss": 2.58,
"step": 78500
},
{
"epoch": 6.122607145625048,
"grad_norm": 5.935701847076416,
"learning_rate": 7.754785708749904e-06,
"loss": 2.5853,
"step": 79000
},
{
"epoch": 6.161357823761916,
"grad_norm": 7.224461555480957,
"learning_rate": 7.677284352476169e-06,
"loss": 2.5597,
"step": 79500
},
{
"epoch": 6.200108501898783,
"grad_norm": 6.59751033782959,
"learning_rate": 7.5997829962024335e-06,
"loss": 2.5821,
"step": 80000
},
{
"epoch": 6.238859180035651,
"grad_norm": 6.414103031158447,
"learning_rate": 7.522281639928699e-06,
"loss": 2.5542,
"step": 80500
},
{
"epoch": 6.277609858172518,
"grad_norm": 6.270075798034668,
"learning_rate": 7.444780283654964e-06,
"loss": 2.5735,
"step": 81000
},
{
"epoch": 6.316360536309386,
"grad_norm": 6.3846306800842285,
"learning_rate": 7.367278927381229e-06,
"loss": 2.5563,
"step": 81500
},
{
"epoch": 6.355111214446253,
"grad_norm": 6.725887298583984,
"learning_rate": 7.2897775711074945e-06,
"loss": 2.5582,
"step": 82000
},
{
"epoch": 6.3938618925831205,
"grad_norm": 6.913090229034424,
"learning_rate": 7.21227621483376e-06,
"loss": 2.5681,
"step": 82500
},
{
"epoch": 6.4326125707199875,
"grad_norm": 6.630814075469971,
"learning_rate": 7.134774858560025e-06,
"loss": 2.5493,
"step": 83000
},
{
"epoch": 6.471363248856855,
"grad_norm": 7.482264518737793,
"learning_rate": 7.05727350228629e-06,
"loss": 2.5672,
"step": 83500
},
{
"epoch": 6.510113926993722,
"grad_norm": 5.896800518035889,
"learning_rate": 6.979772146012556e-06,
"loss": 2.5563,
"step": 84000
},
{
"epoch": 6.54886460513059,
"grad_norm": 6.603734016418457,
"learning_rate": 6.902270789738821e-06,
"loss": 2.5358,
"step": 84500
},
{
"epoch": 6.587615283267457,
"grad_norm": 6.386889457702637,
"learning_rate": 6.824769433465086e-06,
"loss": 2.5449,
"step": 85000
},
{
"epoch": 6.626365961404325,
"grad_norm": 6.661931037902832,
"learning_rate": 6.747268077191351e-06,
"loss": 2.5405,
"step": 85500
},
{
"epoch": 6.665116639541192,
"grad_norm": 6.331045627593994,
"learning_rate": 6.669766720917617e-06,
"loss": 2.5419,
"step": 86000
},
{
"epoch": 6.70386731767806,
"grad_norm": 7.050119400024414,
"learning_rate": 6.592265364643882e-06,
"loss": 2.5196,
"step": 86500
},
{
"epoch": 6.742617995814927,
"grad_norm": 6.065616130828857,
"learning_rate": 6.514764008370147e-06,
"loss": 2.539,
"step": 87000
},
{
"epoch": 6.781368673951794,
"grad_norm": 5.768097877502441,
"learning_rate": 6.4372626520964125e-06,
"loss": 2.5245,
"step": 87500
},
{
"epoch": 6.820119352088661,
"grad_norm": 6.785781383514404,
"learning_rate": 6.359761295822677e-06,
"loss": 2.5473,
"step": 88000
},
{
"epoch": 6.858870030225529,
"grad_norm": 6.658846855163574,
"learning_rate": 6.282259939548942e-06,
"loss": 2.5385,
"step": 88500
},
{
"epoch": 6.897620708362396,
"grad_norm": 5.932773590087891,
"learning_rate": 6.2047585832752074e-06,
"loss": 2.528,
"step": 89000
},
{
"epoch": 6.936371386499264,
"grad_norm": 6.457767963409424,
"learning_rate": 6.127257227001473e-06,
"loss": 2.5327,
"step": 89500
},
{
"epoch": 6.975122064636131,
"grad_norm": 6.143023490905762,
"learning_rate": 6.049755870727738e-06,
"loss": 2.5352,
"step": 90000
},
{
"epoch": 7.0,
"eval_loss": 2.4585013389587402,
"eval_runtime": 258.9573,
"eval_samples_per_second": 797.294,
"eval_steps_per_second": 12.462,
"step": 90321
},
{
"epoch": 7.013872742772999,
"grad_norm": 6.153046607971191,
"learning_rate": 5.972254514454003e-06,
"loss": 2.5315,
"step": 90500
},
{
"epoch": 7.052623420909866,
"grad_norm": 7.131119728088379,
"learning_rate": 5.8947531581802685e-06,
"loss": 2.5431,
"step": 91000
},
{
"epoch": 7.0913740990467335,
"grad_norm": 6.677100658416748,
"learning_rate": 5.817251801906534e-06,
"loss": 2.5204,
"step": 91500
},
{
"epoch": 7.1301247771836005,
"grad_norm": 6.799976348876953,
"learning_rate": 5.739750445632799e-06,
"loss": 2.5221,
"step": 92000
},
{
"epoch": 7.168875455320468,
"grad_norm": 6.515171051025391,
"learning_rate": 5.662249089359064e-06,
"loss": 2.5222,
"step": 92500
},
{
"epoch": 7.207626133457335,
"grad_norm": 7.057505130767822,
"learning_rate": 5.58474773308533e-06,
"loss": 2.5262,
"step": 93000
},
{
"epoch": 7.246376811594203,
"grad_norm": 5.927343368530273,
"learning_rate": 5.507246376811595e-06,
"loss": 2.5272,
"step": 93500
},
{
"epoch": 7.28512748973107,
"grad_norm": 6.7214155197143555,
"learning_rate": 5.42974502053786e-06,
"loss": 2.5195,
"step": 94000
},
{
"epoch": 7.323878167867938,
"grad_norm": 6.162799835205078,
"learning_rate": 5.352243664264125e-06,
"loss": 2.5117,
"step": 94500
},
{
"epoch": 7.362628846004805,
"grad_norm": 6.725783824920654,
"learning_rate": 5.274742307990391e-06,
"loss": 2.522,
"step": 95000
},
{
"epoch": 7.401379524141673,
"grad_norm": 5.721879959106445,
"learning_rate": 5.197240951716656e-06,
"loss": 2.5047,
"step": 95500
},
{
"epoch": 7.44013020227854,
"grad_norm": 7.531757354736328,
"learning_rate": 5.11973959544292e-06,
"loss": 2.4981,
"step": 96000
},
{
"epoch": 7.478880880415407,
"grad_norm": 6.200819492340088,
"learning_rate": 5.042238239169186e-06,
"loss": 2.5016,
"step": 96500
},
{
"epoch": 7.517631558552274,
"grad_norm": 6.8695597648620605,
"learning_rate": 4.964736882895451e-06,
"loss": 2.5085,
"step": 97000
},
{
"epoch": 7.556382236689142,
"grad_norm": 6.3883843421936035,
"learning_rate": 4.887235526621716e-06,
"loss": 2.5092,
"step": 97500
},
{
"epoch": 7.595132914826009,
"grad_norm": 6.085172653198242,
"learning_rate": 4.809734170347981e-06,
"loss": 2.4957,
"step": 98000
},
{
"epoch": 7.633883592962877,
"grad_norm": 6.23600435256958,
"learning_rate": 4.732232814074247e-06,
"loss": 2.4876,
"step": 98500
},
{
"epoch": 7.672634271099744,
"grad_norm": 6.483453750610352,
"learning_rate": 4.654731457800512e-06,
"loss": 2.5029,
"step": 99000
},
{
"epoch": 7.711384949236612,
"grad_norm": 6.627302646636963,
"learning_rate": 4.577230101526777e-06,
"loss": 2.4989,
"step": 99500
},
{
"epoch": 7.750135627373479,
"grad_norm": 7.044070243835449,
"learning_rate": 4.4997287452530425e-06,
"loss": 2.5085,
"step": 100000
},
{
"epoch": 7.7888863055103466,
"grad_norm": 5.986552715301514,
"learning_rate": 4.422227388979308e-06,
"loss": 2.4842,
"step": 100500
},
{
"epoch": 7.8276369836472135,
"grad_norm": 6.3408708572387695,
"learning_rate": 4.344726032705573e-06,
"loss": 2.4973,
"step": 101000
},
{
"epoch": 7.866387661784081,
"grad_norm": 6.100359916687012,
"learning_rate": 4.267224676431838e-06,
"loss": 2.5111,
"step": 101500
},
{
"epoch": 7.905138339920948,
"grad_norm": 6.7454833984375,
"learning_rate": 4.1897233201581036e-06,
"loss": 2.4766,
"step": 102000
},
{
"epoch": 7.943889018057816,
"grad_norm": 6.790141582489014,
"learning_rate": 4.112221963884369e-06,
"loss": 2.4788,
"step": 102500
},
{
"epoch": 7.982639696194683,
"grad_norm": 6.926203727722168,
"learning_rate": 4.034720607610634e-06,
"loss": 2.4875,
"step": 103000
},
{
"epoch": 8.0,
"eval_loss": 2.435317277908325,
"eval_runtime": 258.5225,
"eval_samples_per_second": 798.634,
"eval_steps_per_second": 12.482,
"step": 103224
},
{
"epoch": 8.02139037433155,
"grad_norm": 6.832672119140625,
"learning_rate": 3.957219251336899e-06,
"loss": 2.4812,
"step": 103500
},
{
"epoch": 8.060141052468419,
"grad_norm": 6.771292209625244,
"learning_rate": 3.879717895063164e-06,
"loss": 2.4945,
"step": 104000
},
{
"epoch": 8.098891730605285,
"grad_norm": 6.624267101287842,
"learning_rate": 3.802216538789429e-06,
"loss": 2.4813,
"step": 104500
},
{
"epoch": 8.137642408742153,
"grad_norm": 6.566524028778076,
"learning_rate": 3.724715182515694e-06,
"loss": 2.5087,
"step": 105000
},
{
"epoch": 8.17639308687902,
"grad_norm": 6.612277507781982,
"learning_rate": 3.647213826241959e-06,
"loss": 2.481,
"step": 105500
},
{
"epoch": 8.215143765015888,
"grad_norm": 6.12284517288208,
"learning_rate": 3.5697124699682244e-06,
"loss": 2.4825,
"step": 106000
},
{
"epoch": 8.253894443152754,
"grad_norm": 6.495052814483643,
"learning_rate": 3.4922111136944897e-06,
"loss": 2.4883,
"step": 106500
},
{
"epoch": 8.292645121289622,
"grad_norm": 7.689423561096191,
"learning_rate": 3.414709757420755e-06,
"loss": 2.4857,
"step": 107000
},
{
"epoch": 8.33139579942649,
"grad_norm": 6.188397407531738,
"learning_rate": 3.3372084011470202e-06,
"loss": 2.4788,
"step": 107500
},
{
"epoch": 8.370146477563358,
"grad_norm": 6.282194137573242,
"learning_rate": 3.2597070448732855e-06,
"loss": 2.4856,
"step": 108000
},
{
"epoch": 8.408897155700224,
"grad_norm": 6.457098007202148,
"learning_rate": 3.1822056885995508e-06,
"loss": 2.4623,
"step": 108500
},
{
"epoch": 8.447647833837092,
"grad_norm": 7.726540565490723,
"learning_rate": 3.1047043323258156e-06,
"loss": 2.4671,
"step": 109000
},
{
"epoch": 8.48639851197396,
"grad_norm": 6.308920383453369,
"learning_rate": 3.027202976052081e-06,
"loss": 2.4808,
"step": 109500
},
{
"epoch": 8.525149190110827,
"grad_norm": 6.501667499542236,
"learning_rate": 2.949701619778346e-06,
"loss": 2.4736,
"step": 110000
},
{
"epoch": 8.563899868247695,
"grad_norm": 7.358393669128418,
"learning_rate": 2.8722002635046114e-06,
"loss": 2.4697,
"step": 110500
},
{
"epoch": 8.602650546384561,
"grad_norm": 6.261012554168701,
"learning_rate": 2.7946989072308767e-06,
"loss": 2.4631,
"step": 111000
},
{
"epoch": 8.64140122452143,
"grad_norm": 6.515717029571533,
"learning_rate": 2.717197550957142e-06,
"loss": 2.4915,
"step": 111500
},
{
"epoch": 8.680151902658297,
"grad_norm": 6.8307600021362305,
"learning_rate": 2.6396961946834072e-06,
"loss": 2.48,
"step": 112000
},
{
"epoch": 8.718902580795163,
"grad_norm": 6.784819602966309,
"learning_rate": 2.5621948384096725e-06,
"loss": 2.4748,
"step": 112500
},
{
"epoch": 8.75765325893203,
"grad_norm": 7.1304473876953125,
"learning_rate": 2.4846934821359373e-06,
"loss": 2.4723,
"step": 113000
},
{
"epoch": 8.796403937068899,
"grad_norm": 6.297511100769043,
"learning_rate": 2.4071921258622026e-06,
"loss": 2.463,
"step": 113500
},
{
"epoch": 8.835154615205767,
"grad_norm": 6.689960479736328,
"learning_rate": 2.329690769588468e-06,
"loss": 2.4621,
"step": 114000
},
{
"epoch": 8.873905293342634,
"grad_norm": 6.450560569763184,
"learning_rate": 2.252189413314733e-06,
"loss": 2.4559,
"step": 114500
},
{
"epoch": 8.9126559714795,
"grad_norm": 6.459935665130615,
"learning_rate": 2.1746880570409984e-06,
"loss": 2.4646,
"step": 115000
},
{
"epoch": 8.951406649616368,
"grad_norm": 6.182426452636719,
"learning_rate": 2.0971867007672637e-06,
"loss": 2.4665,
"step": 115500
},
{
"epoch": 8.990157327753236,
"grad_norm": 7.122648239135742,
"learning_rate": 2.019685344493529e-06,
"loss": 2.475,
"step": 116000
},
{
"epoch": 9.0,
"eval_loss": 2.406507968902588,
"eval_runtime": 258.9009,
"eval_samples_per_second": 797.467,
"eval_steps_per_second": 12.464,
"step": 116127
},
{
"epoch": 9.028908005890104,
"grad_norm": 7.267585754394531,
"learning_rate": 1.942183988219794e-06,
"loss": 2.447,
"step": 116500
},
{
"epoch": 9.06765868402697,
"grad_norm": 6.2447991371154785,
"learning_rate": 1.8646826319460593e-06,
"loss": 2.4609,
"step": 117000
},
{
"epoch": 9.106409362163838,
"grad_norm": 6.521481037139893,
"learning_rate": 1.7871812756723245e-06,
"loss": 2.4418,
"step": 117500
},
{
"epoch": 9.145160040300706,
"grad_norm": 6.647397041320801,
"learning_rate": 1.7096799193985896e-06,
"loss": 2.4665,
"step": 118000
},
{
"epoch": 9.183910718437573,
"grad_norm": 6.247033596038818,
"learning_rate": 1.6321785631248548e-06,
"loss": 2.4647,
"step": 118500
},
{
"epoch": 9.22266139657444,
"grad_norm": 6.595357894897461,
"learning_rate": 1.5546772068511201e-06,
"loss": 2.4705,
"step": 119000
},
{
"epoch": 9.261412074711307,
"grad_norm": 8.117677688598633,
"learning_rate": 1.4771758505773854e-06,
"loss": 2.4629,
"step": 119500
},
{
"epoch": 9.300162752848175,
"grad_norm": 6.991618633270264,
"learning_rate": 1.3996744943036504e-06,
"loss": 2.4498,
"step": 120000
},
{
"epoch": 9.338913430985043,
"grad_norm": 6.236393451690674,
"learning_rate": 1.3221731380299157e-06,
"loss": 2.467,
"step": 120500
},
{
"epoch": 9.377664109121909,
"grad_norm": 6.595478534698486,
"learning_rate": 1.2446717817561808e-06,
"loss": 2.4547,
"step": 121000
},
{
"epoch": 9.416414787258777,
"grad_norm": 7.194475173950195,
"learning_rate": 1.167170425482446e-06,
"loss": 2.4669,
"step": 121500
},
{
"epoch": 9.455165465395645,
"grad_norm": 6.341099262237549,
"learning_rate": 1.0896690692087113e-06,
"loss": 2.4661,
"step": 122000
},
{
"epoch": 9.493916143532513,
"grad_norm": 7.257521629333496,
"learning_rate": 1.0121677129349766e-06,
"loss": 2.4629,
"step": 122500
},
{
"epoch": 9.532666821669379,
"grad_norm": 6.399875164031982,
"learning_rate": 9.346663566612417e-07,
"loss": 2.4555,
"step": 123000
},
{
"epoch": 9.571417499806246,
"grad_norm": 7.292248249053955,
"learning_rate": 8.571650003875069e-07,
"loss": 2.4646,
"step": 123500
},
{
"epoch": 9.610168177943114,
"grad_norm": 6.8132548332214355,
"learning_rate": 7.79663644113772e-07,
"loss": 2.4521,
"step": 124000
},
{
"epoch": 9.648918856079982,
"grad_norm": 6.302210330963135,
"learning_rate": 7.021622878400372e-07,
"loss": 2.451,
"step": 124500
},
{
"epoch": 9.687669534216848,
"grad_norm": 6.902337551116943,
"learning_rate": 6.246609315663025e-07,
"loss": 2.4515,
"step": 125000
},
{
"epoch": 9.726420212353716,
"grad_norm": 6.4049296379089355,
"learning_rate": 5.471595752925676e-07,
"loss": 2.454,
"step": 125500
},
{
"epoch": 9.765170890490584,
"grad_norm": 7.109240531921387,
"learning_rate": 4.6965821901883286e-07,
"loss": 2.4379,
"step": 126000
},
{
"epoch": 9.803921568627452,
"grad_norm": 6.1289873123168945,
"learning_rate": 3.921568627450981e-07,
"loss": 2.4438,
"step": 126500
},
{
"epoch": 9.842672246764318,
"grad_norm": 6.873955726623535,
"learning_rate": 3.146555064713633e-07,
"loss": 2.4526,
"step": 127000
},
{
"epoch": 9.881422924901186,
"grad_norm": 6.842904090881348,
"learning_rate": 2.3715415019762845e-07,
"loss": 2.4471,
"step": 127500
},
{
"epoch": 9.920173603038053,
"grad_norm": 9.636740684509277,
"learning_rate": 1.5965279392389367e-07,
"loss": 2.4469,
"step": 128000
},
{
"epoch": 9.958924281174921,
"grad_norm": 6.161515235900879,
"learning_rate": 8.21514376501589e-08,
"loss": 2.4608,
"step": 128500
},
{
"epoch": 9.997674959311787,
"grad_norm": 6.582516193389893,
"learning_rate": 4.6500813764240875e-09,
"loss": 2.4411,
"step": 129000
},
{
"epoch": 10.0,
"eval_loss": 2.3977291584014893,
"eval_runtime": 258.9982,
"eval_samples_per_second": 797.168,
"eval_steps_per_second": 12.46,
"step": 129030
},
{
"epoch": 10.036425637448655,
"grad_norm": 6.090233325958252,
"learning_rate": 9.963574362551346e-06,
"loss": 2.4784,
"step": 129500
},
{
"epoch": 10.075176315585523,
"grad_norm": 6.285606384277344,
"learning_rate": 9.924823684414478e-06,
"loss": 2.4657,
"step": 130000
},
{
"epoch": 10.11392699372239,
"grad_norm": 5.937399864196777,
"learning_rate": 9.886073006277611e-06,
"loss": 2.4869,
"step": 130500
},
{
"epoch": 10.152677671859257,
"grad_norm": 7.235742568969727,
"learning_rate": 9.847322328140743e-06,
"loss": 2.4726,
"step": 131000
},
{
"epoch": 10.191428349996125,
"grad_norm": 6.6334028244018555,
"learning_rate": 9.808571650003877e-06,
"loss": 2.472,
"step": 131500
},
{
"epoch": 10.230179028132993,
"grad_norm": 7.366402626037598,
"learning_rate": 9.769820971867009e-06,
"loss": 2.4887,
"step": 132000
},
{
"epoch": 10.26892970626986,
"grad_norm": 6.17592716217041,
"learning_rate": 9.731070293730142e-06,
"loss": 2.4854,
"step": 132500
},
{
"epoch": 10.307680384406726,
"grad_norm": 6.376716613769531,
"learning_rate": 9.692319615593274e-06,
"loss": 2.486,
"step": 133000
},
{
"epoch": 10.346431062543594,
"grad_norm": 6.293849945068359,
"learning_rate": 9.653568937456407e-06,
"loss": 2.4707,
"step": 133500
},
{
"epoch": 10.385181740680462,
"grad_norm": 6.606166839599609,
"learning_rate": 9.614818259319539e-06,
"loss": 2.4704,
"step": 134000
},
{
"epoch": 10.42393241881733,
"grad_norm": 6.805929660797119,
"learning_rate": 9.576067581182673e-06,
"loss": 2.4727,
"step": 134500
},
{
"epoch": 10.462683096954196,
"grad_norm": 6.598349571228027,
"learning_rate": 9.537316903045804e-06,
"loss": 2.4825,
"step": 135000
},
{
"epoch": 10.501433775091064,
"grad_norm": 5.807904243469238,
"learning_rate": 9.498566224908938e-06,
"loss": 2.4721,
"step": 135500
},
{
"epoch": 10.540184453227932,
"grad_norm": 6.681980609893799,
"learning_rate": 9.45981554677207e-06,
"loss": 2.4764,
"step": 136000
},
{
"epoch": 10.5789351313648,
"grad_norm": 6.540719032287598,
"learning_rate": 9.421064868635203e-06,
"loss": 2.4545,
"step": 136500
},
{
"epoch": 10.617685809501666,
"grad_norm": 6.627035140991211,
"learning_rate": 9.382314190498335e-06,
"loss": 2.4778,
"step": 137000
},
{
"epoch": 10.656436487638533,
"grad_norm": 6.348284721374512,
"learning_rate": 9.343563512361468e-06,
"loss": 2.4597,
"step": 137500
},
{
"epoch": 10.695187165775401,
"grad_norm": 6.790314197540283,
"learning_rate": 9.3048128342246e-06,
"loss": 2.471,
"step": 138000
},
{
"epoch": 10.733937843912269,
"grad_norm": 6.8181233406066895,
"learning_rate": 9.266062156087732e-06,
"loss": 2.4571,
"step": 138500
},
{
"epoch": 10.772688522049135,
"grad_norm": 6.593683242797852,
"learning_rate": 9.227311477950864e-06,
"loss": 2.4843,
"step": 139000
},
{
"epoch": 10.811439200186003,
"grad_norm": 6.600128650665283,
"learning_rate": 9.188560799813997e-06,
"loss": 2.464,
"step": 139500
},
{
"epoch": 10.85018987832287,
"grad_norm": 6.368162631988525,
"learning_rate": 9.149810121677129e-06,
"loss": 2.4598,
"step": 140000
},
{
"epoch": 10.888940556459739,
"grad_norm": 6.5435943603515625,
"learning_rate": 9.111059443540262e-06,
"loss": 2.4704,
"step": 140500
},
{
"epoch": 10.927691234596605,
"grad_norm": 6.06011962890625,
"learning_rate": 9.072308765403394e-06,
"loss": 2.4514,
"step": 141000
},
{
"epoch": 10.966441912733472,
"grad_norm": 7.2288689613342285,
"learning_rate": 9.033558087266528e-06,
"loss": 2.4521,
"step": 141500
},
{
"epoch": 11.0,
"eval_loss": 2.3912322521209717,
"eval_runtime": 258.9953,
"eval_samples_per_second": 797.176,
"eval_steps_per_second": 12.46,
"step": 141933
},
{
"epoch": 11.00519259087034,
"grad_norm": 6.698403358459473,
"learning_rate": 8.99480740912966e-06,
"loss": 2.4457,
"step": 142000
},
{
"epoch": 11.043943269007208,
"grad_norm": 6.455236911773682,
"learning_rate": 8.956056730992793e-06,
"loss": 2.4507,
"step": 142500
},
{
"epoch": 11.082693947144074,
"grad_norm": 6.590576648712158,
"learning_rate": 8.917306052855925e-06,
"loss": 2.4256,
"step": 143000
},
{
"epoch": 11.121444625280942,
"grad_norm": 6.957404136657715,
"learning_rate": 8.878555374719058e-06,
"loss": 2.4549,
"step": 143500
},
{
"epoch": 11.16019530341781,
"grad_norm": 6.926699161529541,
"learning_rate": 8.83980469658219e-06,
"loss": 2.4499,
"step": 144000
},
{
"epoch": 11.198945981554678,
"grad_norm": 6.484086036682129,
"learning_rate": 8.801054018445324e-06,
"loss": 2.4443,
"step": 144500
},
{
"epoch": 11.237696659691544,
"grad_norm": 6.107706069946289,
"learning_rate": 8.762303340308455e-06,
"loss": 2.4459,
"step": 145000
},
{
"epoch": 11.276447337828412,
"grad_norm": 7.301278591156006,
"learning_rate": 8.723552662171589e-06,
"loss": 2.4463,
"step": 145500
},
{
"epoch": 11.31519801596528,
"grad_norm": 6.378045082092285,
"learning_rate": 8.68480198403472e-06,
"loss": 2.4494,
"step": 146000
},
{
"epoch": 11.353948694102147,
"grad_norm": 6.803300857543945,
"learning_rate": 8.646051305897854e-06,
"loss": 2.4235,
"step": 146500
},
{
"epoch": 11.392699372239015,
"grad_norm": 6.401794910430908,
"learning_rate": 8.607300627760986e-06,
"loss": 2.4353,
"step": 147000
},
{
"epoch": 11.431450050375881,
"grad_norm": 6.455550193786621,
"learning_rate": 8.56854994962412e-06,
"loss": 2.4306,
"step": 147500
},
{
"epoch": 11.470200728512749,
"grad_norm": 6.416442394256592,
"learning_rate": 8.529799271487251e-06,
"loss": 2.4143,
"step": 148000
},
{
"epoch": 11.508951406649617,
"grad_norm": 6.768812656402588,
"learning_rate": 8.491048593350385e-06,
"loss": 2.4184,
"step": 148500
},
{
"epoch": 11.547702084786483,
"grad_norm": 6.085323810577393,
"learning_rate": 8.452297915213516e-06,
"loss": 2.4318,
"step": 149000
},
{
"epoch": 11.58645276292335,
"grad_norm": 6.181857585906982,
"learning_rate": 8.41354723707665e-06,
"loss": 2.4348,
"step": 149500
},
{
"epoch": 11.625203441060219,
"grad_norm": 6.558756351470947,
"learning_rate": 8.374796558939782e-06,
"loss": 2.413,
"step": 150000
},
{
"epoch": 11.663954119197086,
"grad_norm": 6.249685287475586,
"learning_rate": 8.336045880802915e-06,
"loss": 2.4271,
"step": 150500
},
{
"epoch": 11.702704797333954,
"grad_norm": 6.789103984832764,
"learning_rate": 8.297295202666047e-06,
"loss": 2.4226,
"step": 151000
},
{
"epoch": 11.74145547547082,
"grad_norm": 6.4289140701293945,
"learning_rate": 8.25854452452918e-06,
"loss": 2.4184,
"step": 151500
},
{
"epoch": 11.780206153607688,
"grad_norm": 6.098612308502197,
"learning_rate": 8.219793846392312e-06,
"loss": 2.4132,
"step": 152000
},
{
"epoch": 11.818956831744556,
"grad_norm": 6.500378608703613,
"learning_rate": 8.181043168255444e-06,
"loss": 2.4184,
"step": 152500
},
{
"epoch": 11.857707509881424,
"grad_norm": 6.583259105682373,
"learning_rate": 8.142292490118577e-06,
"loss": 2.4259,
"step": 153000
},
{
"epoch": 11.89645818801829,
"grad_norm": 6.7018303871154785,
"learning_rate": 8.10354181198171e-06,
"loss": 2.4185,
"step": 153500
},
{
"epoch": 11.935208866155158,
"grad_norm": 6.679374694824219,
"learning_rate": 8.064791133844843e-06,
"loss": 2.4078,
"step": 154000
},
{
"epoch": 11.973959544292025,
"grad_norm": 6.576003551483154,
"learning_rate": 8.026040455707974e-06,
"loss": 2.4212,
"step": 154500
},
{
"epoch": 12.0,
"eval_loss": 2.3491039276123047,
"eval_runtime": 260.4232,
"eval_samples_per_second": 792.806,
"eval_steps_per_second": 12.391,
"step": 154836
},
{
"epoch": 12.012710222428893,
"grad_norm": 6.768045902252197,
"learning_rate": 7.987289777571108e-06,
"loss": 2.399,
"step": 155000
},
{
"epoch": 12.05146090056576,
"grad_norm": 6.445169925689697,
"learning_rate": 7.94853909943424e-06,
"loss": 2.4055,
"step": 155500
},
{
"epoch": 12.090211578702627,
"grad_norm": 6.684764385223389,
"learning_rate": 7.909788421297373e-06,
"loss": 2.3979,
"step": 156000
},
{
"epoch": 12.128962256839495,
"grad_norm": 7.150822162628174,
"learning_rate": 7.871037743160505e-06,
"loss": 2.4091,
"step": 156500
},
{
"epoch": 12.167712934976363,
"grad_norm": 6.7067131996154785,
"learning_rate": 7.832287065023639e-06,
"loss": 2.4057,
"step": 157000
},
{
"epoch": 12.206463613113229,
"grad_norm": 6.288236141204834,
"learning_rate": 7.79353638688677e-06,
"loss": 2.4024,
"step": 157500
},
{
"epoch": 12.245214291250097,
"grad_norm": 6.532754898071289,
"learning_rate": 7.754785708749904e-06,
"loss": 2.4119,
"step": 158000
},
{
"epoch": 12.283964969386965,
"grad_norm": 6.437507629394531,
"learning_rate": 7.716035030613036e-06,
"loss": 2.4048,
"step": 158500
},
{
"epoch": 12.322715647523832,
"grad_norm": 6.648064136505127,
"learning_rate": 7.677284352476169e-06,
"loss": 2.3954,
"step": 159000
},
{
"epoch": 12.361466325660698,
"grad_norm": 6.406070232391357,
"learning_rate": 7.6385336743393e-06,
"loss": 2.4069,
"step": 159500
},
{
"epoch": 12.400217003797566,
"grad_norm": 6.75925350189209,
"learning_rate": 7.5997829962024335e-06,
"loss": 2.3803,
"step": 160000
},
{
"epoch": 12.438967681934434,
"grad_norm": 7.390876770019531,
"learning_rate": 7.561032318065566e-06,
"loss": 2.3952,
"step": 160500
},
{
"epoch": 12.477718360071302,
"grad_norm": 6.584438800811768,
"learning_rate": 7.522281639928699e-06,
"loss": 2.3921,
"step": 161000
},
{
"epoch": 12.516469038208168,
"grad_norm": 6.7814040184021,
"learning_rate": 7.483530961791831e-06,
"loss": 2.4035,
"step": 161500
},
{
"epoch": 12.555219716345036,
"grad_norm": 6.544926166534424,
"learning_rate": 7.444780283654964e-06,
"loss": 2.3855,
"step": 162000
},
{
"epoch": 12.593970394481904,
"grad_norm": 6.649155139923096,
"learning_rate": 7.406029605518097e-06,
"loss": 2.3884,
"step": 162500
},
{
"epoch": 12.632721072618772,
"grad_norm": 6.128752708435059,
"learning_rate": 7.367278927381229e-06,
"loss": 2.3915,
"step": 163000
},
{
"epoch": 12.671471750755638,
"grad_norm": 6.694360733032227,
"learning_rate": 7.328528249244362e-06,
"loss": 2.4065,
"step": 163500
},
{
"epoch": 12.710222428892505,
"grad_norm": 6.9979963302612305,
"learning_rate": 7.2897775711074945e-06,
"loss": 2.3816,
"step": 164000
},
{
"epoch": 12.748973107029373,
"grad_norm": 6.7657294273376465,
"learning_rate": 7.251026892970627e-06,
"loss": 2.385,
"step": 164500
},
{
"epoch": 12.787723785166241,
"grad_norm": 7.142265796661377,
"learning_rate": 7.21227621483376e-06,
"loss": 2.3809,
"step": 165000
},
{
"epoch": 12.826474463303107,
"grad_norm": 6.2213134765625,
"learning_rate": 7.1735255366968924e-06,
"loss": 2.3883,
"step": 165500
},
{
"epoch": 12.865225141439975,
"grad_norm": 6.274342060089111,
"learning_rate": 7.134774858560025e-06,
"loss": 2.3838,
"step": 166000
},
{
"epoch": 12.903975819576843,
"grad_norm": 6.5893049240112305,
"learning_rate": 7.096024180423158e-06,
"loss": 2.3832,
"step": 166500
},
{
"epoch": 12.94272649771371,
"grad_norm": 6.229060173034668,
"learning_rate": 7.05727350228629e-06,
"loss": 2.3839,
"step": 167000
},
{
"epoch": 12.981477175850577,
"grad_norm": 7.251420497894287,
"learning_rate": 7.018522824149423e-06,
"loss": 2.3838,
"step": 167500
},
{
"epoch": 13.0,
"eval_loss": 2.3215689659118652,
"eval_runtime": 259.7568,
"eval_samples_per_second": 794.84,
"eval_steps_per_second": 12.423,
"step": 167739
},
{
"epoch": 13.020227853987445,
"grad_norm": 5.944735050201416,
"learning_rate": 6.979772146012556e-06,
"loss": 2.3687,
"step": 168000
},
{
"epoch": 13.058978532124312,
"grad_norm": 6.25685977935791,
"learning_rate": 6.941021467875688e-06,
"loss": 2.3761,
"step": 168500
},
{
"epoch": 13.09772921026118,
"grad_norm": 6.244680881500244,
"learning_rate": 6.902270789738821e-06,
"loss": 2.3463,
"step": 169000
},
{
"epoch": 13.136479888398046,
"grad_norm": 6.370804309844971,
"learning_rate": 6.8635201116019535e-06,
"loss": 2.3597,
"step": 169500
},
{
"epoch": 13.175230566534914,
"grad_norm": 6.249234676361084,
"learning_rate": 6.824769433465086e-06,
"loss": 2.3679,
"step": 170000
},
{
"epoch": 13.213981244671782,
"grad_norm": 6.973300933837891,
"learning_rate": 6.786018755328219e-06,
"loss": 2.3669,
"step": 170500
},
{
"epoch": 13.25273192280865,
"grad_norm": 7.319492816925049,
"learning_rate": 6.747268077191351e-06,
"loss": 2.3528,
"step": 171000
},
{
"epoch": 13.291482600945516,
"grad_norm": 6.924526214599609,
"learning_rate": 6.708517399054484e-06,
"loss": 2.3662,
"step": 171500
},
{
"epoch": 13.330233279082384,
"grad_norm": 6.761091709136963,
"learning_rate": 6.669766720917617e-06,
"loss": 2.3608,
"step": 172000
},
{
"epoch": 13.368983957219251,
"grad_norm": 6.105197429656982,
"learning_rate": 6.631016042780749e-06,
"loss": 2.3536,
"step": 172500
},
{
"epoch": 13.40773463535612,
"grad_norm": 6.724457740783691,
"learning_rate": 6.592265364643882e-06,
"loss": 2.3682,
"step": 173000
},
{
"epoch": 13.446485313492985,
"grad_norm": 6.62090539932251,
"learning_rate": 6.553514686507015e-06,
"loss": 2.3549,
"step": 173500
},
{
"epoch": 13.485235991629853,
"grad_norm": 6.862425327301025,
"learning_rate": 6.514764008370147e-06,
"loss": 2.3475,
"step": 174000
},
{
"epoch": 13.523986669766721,
"grad_norm": 6.164032936096191,
"learning_rate": 6.47601333023328e-06,
"loss": 2.3625,
"step": 174500
},
{
"epoch": 13.562737347903589,
"grad_norm": 7.522220134735107,
"learning_rate": 6.4372626520964125e-06,
"loss": 2.3676,
"step": 175000
},
{
"epoch": 13.601488026040455,
"grad_norm": 6.564206600189209,
"learning_rate": 6.398511973959545e-06,
"loss": 2.3606,
"step": 175500
},
{
"epoch": 13.640238704177323,
"grad_norm": 6.069074630737305,
"learning_rate": 6.359761295822677e-06,
"loss": 2.3644,
"step": 176000
},
{
"epoch": 13.67898938231419,
"grad_norm": 6.570771217346191,
"learning_rate": 6.3210106176858095e-06,
"loss": 2.3711,
"step": 176500
},
{
"epoch": 13.717740060451058,
"grad_norm": 6.1281609535217285,
"learning_rate": 6.282259939548942e-06,
"loss": 2.348,
"step": 177000
},
{
"epoch": 13.756490738587924,
"grad_norm": 6.176905632019043,
"learning_rate": 6.243509261412075e-06,
"loss": 2.379,
"step": 177500
},
{
"epoch": 13.795241416724792,
"grad_norm": 7.890781402587891,
"learning_rate": 6.2047585832752074e-06,
"loss": 2.365,
"step": 178000
},
{
"epoch": 13.83399209486166,
"grad_norm": 6.160940647125244,
"learning_rate": 6.16600790513834e-06,
"loss": 2.3391,
"step": 178500
},
{
"epoch": 13.872742772998528,
"grad_norm": 6.732828617095947,
"learning_rate": 6.127257227001473e-06,
"loss": 2.355,
"step": 179000
},
{
"epoch": 13.911493451135394,
"grad_norm": 6.500529766082764,
"learning_rate": 6.088506548864605e-06,
"loss": 2.3512,
"step": 179500
},
{
"epoch": 13.950244129272262,
"grad_norm": 7.362790584564209,
"learning_rate": 6.049755870727738e-06,
"loss": 2.3654,
"step": 180000
},
{
"epoch": 13.98899480740913,
"grad_norm": 7.070291519165039,
"learning_rate": 6.011005192590871e-06,
"loss": 2.3444,
"step": 180500
},
{
"epoch": 14.0,
"eval_loss": 2.2924630641937256,
"eval_runtime": 259.3076,
"eval_samples_per_second": 796.217,
"eval_steps_per_second": 12.445,
"step": 180642
},
{
"epoch": 14.027745485545998,
"grad_norm": 7.284486293792725,
"learning_rate": 5.972254514454003e-06,
"loss": 2.3296,
"step": 181000
},
{
"epoch": 14.066496163682864,
"grad_norm": 7.636621952056885,
"learning_rate": 5.933503836317136e-06,
"loss": 2.3314,
"step": 181500
},
{
"epoch": 14.105246841819731,
"grad_norm": 6.692602634429932,
"learning_rate": 5.8947531581802685e-06,
"loss": 2.3363,
"step": 182000
},
{
"epoch": 14.1439975199566,
"grad_norm": 6.751750469207764,
"learning_rate": 5.856002480043401e-06,
"loss": 2.3174,
"step": 182500
},
{
"epoch": 14.182748198093467,
"grad_norm": 7.041817665100098,
"learning_rate": 5.817251801906534e-06,
"loss": 2.3295,
"step": 183000
},
{
"epoch": 14.221498876230335,
"grad_norm": 7.414912700653076,
"learning_rate": 5.778501123769666e-06,
"loss": 2.3386,
"step": 183500
},
{
"epoch": 14.260249554367201,
"grad_norm": 7.009491920471191,
"learning_rate": 5.739750445632799e-06,
"loss": 2.3282,
"step": 184000
},
{
"epoch": 14.299000232504069,
"grad_norm": 6.77699089050293,
"learning_rate": 5.700999767495932e-06,
"loss": 2.3323,
"step": 184500
},
{
"epoch": 14.337750910640937,
"grad_norm": 6.922458171844482,
"learning_rate": 5.662249089359064e-06,
"loss": 2.3545,
"step": 185000
},
{
"epoch": 14.376501588777803,
"grad_norm": 7.635495185852051,
"learning_rate": 5.623498411222197e-06,
"loss": 2.3429,
"step": 185500
},
{
"epoch": 14.41525226691467,
"grad_norm": 6.657200813293457,
"learning_rate": 5.58474773308533e-06,
"loss": 2.3371,
"step": 186000
},
{
"epoch": 14.454002945051538,
"grad_norm": 6.328368663787842,
"learning_rate": 5.545997054948462e-06,
"loss": 2.3225,
"step": 186500
},
{
"epoch": 14.492753623188406,
"grad_norm": 6.7084503173828125,
"learning_rate": 5.507246376811595e-06,
"loss": 2.3141,
"step": 187000
},
{
"epoch": 14.531504301325274,
"grad_norm": 6.23046875,
"learning_rate": 5.4684956986747275e-06,
"loss": 2.3387,
"step": 187500
},
{
"epoch": 14.57025497946214,
"grad_norm": 6.53918981552124,
"learning_rate": 5.42974502053786e-06,
"loss": 2.3355,
"step": 188000
},
{
"epoch": 14.609005657599008,
"grad_norm": 6.816432952880859,
"learning_rate": 5.390994342400993e-06,
"loss": 2.3409,
"step": 188500
},
{
"epoch": 14.647756335735876,
"grad_norm": 6.9504475593566895,
"learning_rate": 5.352243664264125e-06,
"loss": 2.3274,
"step": 189000
},
{
"epoch": 14.686507013872744,
"grad_norm": 7.058226585388184,
"learning_rate": 5.313492986127258e-06,
"loss": 2.3295,
"step": 189500
},
{
"epoch": 14.72525769200961,
"grad_norm": 6.337547302246094,
"learning_rate": 5.274742307990391e-06,
"loss": 2.316,
"step": 190000
},
{
"epoch": 14.764008370146477,
"grad_norm": 7.420670032501221,
"learning_rate": 5.235991629853523e-06,
"loss": 2.3313,
"step": 190500
},
{
"epoch": 14.802759048283345,
"grad_norm": 6.559388637542725,
"learning_rate": 5.197240951716656e-06,
"loss": 2.3368,
"step": 191000
},
{
"epoch": 14.841509726420213,
"grad_norm": 6.416265487670898,
"learning_rate": 5.1584902735797886e-06,
"loss": 2.3139,
"step": 191500
},
{
"epoch": 14.88026040455708,
"grad_norm": 6.204991817474365,
"learning_rate": 5.11973959544292e-06,
"loss": 2.3209,
"step": 192000
},
{
"epoch": 14.919011082693947,
"grad_norm": 7.657558441162109,
"learning_rate": 5.080988917306053e-06,
"loss": 2.3346,
"step": 192500
},
{
"epoch": 14.957761760830815,
"grad_norm": 6.812448024749756,
"learning_rate": 5.042238239169186e-06,
"loss": 2.3226,
"step": 193000
},
{
"epoch": 14.996512438967683,
"grad_norm": 5.866453170776367,
"learning_rate": 5.003487561032318e-06,
"loss": 2.3034,
"step": 193500
},
{
"epoch": 15.0,
"eval_loss": 2.2758021354675293,
"eval_runtime": 268.9287,
"eval_samples_per_second": 767.731,
"eval_steps_per_second": 11.999,
"step": 193545
},
{
"epoch": 15.035263117104549,
"grad_norm": 6.998913288116455,
"learning_rate": 4.964736882895451e-06,
"loss": 2.3103,
"step": 194000
},
{
"epoch": 15.074013795241417,
"grad_norm": 7.022980213165283,
"learning_rate": 4.9259862047585835e-06,
"loss": 2.3121,
"step": 194500
},
{
"epoch": 15.112764473378284,
"grad_norm": 6.3553056716918945,
"learning_rate": 4.887235526621716e-06,
"loss": 2.325,
"step": 195000
},
{
"epoch": 15.151515151515152,
"grad_norm": 7.574887752532959,
"learning_rate": 4.848484848484849e-06,
"loss": 2.3128,
"step": 195500
},
{
"epoch": 15.190265829652018,
"grad_norm": 6.3977556228637695,
"learning_rate": 4.809734170347981e-06,
"loss": 2.3058,
"step": 196000
},
{
"epoch": 15.229016507788886,
"grad_norm": 6.198862552642822,
"learning_rate": 4.770983492211114e-06,
"loss": 2.3111,
"step": 196500
},
{
"epoch": 15.267767185925754,
"grad_norm": 7.1892499923706055,
"learning_rate": 4.732232814074247e-06,
"loss": 2.3181,
"step": 197000
},
{
"epoch": 15.306517864062622,
"grad_norm": 6.773824214935303,
"learning_rate": 4.693482135937379e-06,
"loss": 2.3158,
"step": 197500
},
{
"epoch": 15.345268542199488,
"grad_norm": 6.595972537994385,
"learning_rate": 4.654731457800512e-06,
"loss": 2.2989,
"step": 198000
},
{
"epoch": 15.384019220336356,
"grad_norm": 7.397641658782959,
"learning_rate": 4.615980779663645e-06,
"loss": 2.3143,
"step": 198500
},
{
"epoch": 15.422769898473224,
"grad_norm": 7.2511820793151855,
"learning_rate": 4.577230101526777e-06,
"loss": 2.3077,
"step": 199000
},
{
"epoch": 15.461520576610091,
"grad_norm": 6.52310848236084,
"learning_rate": 4.53847942338991e-06,
"loss": 2.3062,
"step": 199500
},
{
"epoch": 15.500271254746957,
"grad_norm": 6.681788921356201,
"learning_rate": 4.4997287452530425e-06,
"loss": 2.3078,
"step": 200000
},
{
"epoch": 15.539021932883825,
"grad_norm": 7.010565280914307,
"learning_rate": 4.460978067116175e-06,
"loss": 2.3031,
"step": 200500
},
{
"epoch": 15.577772611020693,
"grad_norm": 7.412187576293945,
"learning_rate": 4.422227388979308e-06,
"loss": 2.3029,
"step": 201000
},
{
"epoch": 15.616523289157561,
"grad_norm": 6.586581707000732,
"learning_rate": 4.38347671084244e-06,
"loss": 2.3092,
"step": 201500
},
{
"epoch": 15.655273967294427,
"grad_norm": 6.430338382720947,
"learning_rate": 4.344726032705573e-06,
"loss": 2.2972,
"step": 202000
},
{
"epoch": 15.694024645431295,
"grad_norm": 6.151809215545654,
"learning_rate": 4.305975354568706e-06,
"loss": 2.2972,
"step": 202500
},
{
"epoch": 15.732775323568163,
"grad_norm": 7.195096492767334,
"learning_rate": 4.267224676431838e-06,
"loss": 2.3045,
"step": 203000
},
{
"epoch": 15.77152600170503,
"grad_norm": 6.76158332824707,
"learning_rate": 4.228473998294971e-06,
"loss": 2.2995,
"step": 203500
},
{
"epoch": 15.810276679841897,
"grad_norm": 6.710601329803467,
"learning_rate": 4.1897233201581036e-06,
"loss": 2.3045,
"step": 204000
},
{
"epoch": 15.849027357978764,
"grad_norm": 6.813743591308594,
"learning_rate": 4.150972642021236e-06,
"loss": 2.3114,
"step": 204500
},
{
"epoch": 15.887778036115632,
"grad_norm": 7.168315410614014,
"learning_rate": 4.112221963884369e-06,
"loss": 2.2995,
"step": 205000
},
{
"epoch": 15.9265287142525,
"grad_norm": 6.606774806976318,
"learning_rate": 4.0734712857475015e-06,
"loss": 2.3023,
"step": 205500
},
{
"epoch": 15.965279392389366,
"grad_norm": 6.852230548858643,
"learning_rate": 4.034720607610634e-06,
"loss": 2.311,
"step": 206000
},
{
"epoch": 16.0,
"eval_loss": 2.252058982849121,
"eval_runtime": 272.2097,
"eval_samples_per_second": 758.478,
"eval_steps_per_second": 11.855,
"step": 206448
},
{
"epoch": 16.004030070526234,
"grad_norm": 7.245954990386963,
"learning_rate": 3.995969929473767e-06,
"loss": 2.2896,
"step": 206500
},
{
"epoch": 16.0427807486631,
"grad_norm": 6.094116687774658,
"learning_rate": 3.957219251336899e-06,
"loss": 2.2999,
"step": 207000
},
{
"epoch": 16.08153142679997,
"grad_norm": 6.302695274353027,
"learning_rate": 3.918468573200031e-06,
"loss": 2.3017,
"step": 207500
},
{
"epoch": 16.120282104936837,
"grad_norm": 6.800222873687744,
"learning_rate": 3.879717895063164e-06,
"loss": 2.2876,
"step": 208000
},
{
"epoch": 16.159032783073705,
"grad_norm": 7.139950752258301,
"learning_rate": 3.840967216926296e-06,
"loss": 2.2975,
"step": 208500
},
{
"epoch": 16.19778346121057,
"grad_norm": 6.805322170257568,
"learning_rate": 3.802216538789429e-06,
"loss": 2.2994,
"step": 209000
},
{
"epoch": 16.236534139347437,
"grad_norm": 6.6877336502075195,
"learning_rate": 3.7634658606525617e-06,
"loss": 2.277,
"step": 209500
},
{
"epoch": 16.275284817484305,
"grad_norm": 6.0831193923950195,
"learning_rate": 3.724715182515694e-06,
"loss": 2.3029,
"step": 210000
},
{
"epoch": 16.314035495621173,
"grad_norm": 6.021631240844727,
"learning_rate": 3.6859645043788265e-06,
"loss": 2.2959,
"step": 210500
},
{
"epoch": 16.35278617375804,
"grad_norm": 7.072383403778076,
"learning_rate": 3.647213826241959e-06,
"loss": 2.2709,
"step": 211000
},
{
"epoch": 16.39153685189491,
"grad_norm": 6.0719404220581055,
"learning_rate": 3.608463148105092e-06,
"loss": 2.2952,
"step": 211500
},
{
"epoch": 16.430287530031777,
"grad_norm": 6.733717441558838,
"learning_rate": 3.5697124699682244e-06,
"loss": 2.2984,
"step": 212000
},
{
"epoch": 16.469038208168644,
"grad_norm": 7.269532203674316,
"learning_rate": 3.530961791831357e-06,
"loss": 2.2855,
"step": 212500
},
{
"epoch": 16.50778888630551,
"grad_norm": 7.440357208251953,
"learning_rate": 3.4922111136944897e-06,
"loss": 2.2747,
"step": 213000
},
{
"epoch": 16.546539564442377,
"grad_norm": 7.448116302490234,
"learning_rate": 3.4534604355576223e-06,
"loss": 2.2933,
"step": 213500
},
{
"epoch": 16.585290242579244,
"grad_norm": 6.202878475189209,
"learning_rate": 3.414709757420755e-06,
"loss": 2.2963,
"step": 214000
},
{
"epoch": 16.624040920716112,
"grad_norm": 7.019168376922607,
"learning_rate": 3.3759590792838876e-06,
"loss": 2.2667,
"step": 214500
},
{
"epoch": 16.66279159885298,
"grad_norm": 6.448665142059326,
"learning_rate": 3.3372084011470202e-06,
"loss": 2.2905,
"step": 215000
},
{
"epoch": 16.701542276989848,
"grad_norm": 6.160965442657471,
"learning_rate": 3.298457723010153e-06,
"loss": 2.2854,
"step": 215500
},
{
"epoch": 16.740292955126716,
"grad_norm": 6.956637859344482,
"learning_rate": 3.2597070448732855e-06,
"loss": 2.2944,
"step": 216000
},
{
"epoch": 16.779043633263584,
"grad_norm": 6.935131549835205,
"learning_rate": 3.220956366736418e-06,
"loss": 2.2795,
"step": 216500
},
{
"epoch": 16.817794311400448,
"grad_norm": 6.656859397888184,
"learning_rate": 3.1822056885995508e-06,
"loss": 2.2872,
"step": 217000
},
{
"epoch": 16.856544989537316,
"grad_norm": 6.204549312591553,
"learning_rate": 3.1434550104626834e-06,
"loss": 2.2832,
"step": 217500
},
{
"epoch": 16.895295667674183,
"grad_norm": 6.77413272857666,
"learning_rate": 3.1047043323258156e-06,
"loss": 2.2719,
"step": 218000
},
{
"epoch": 16.93404634581105,
"grad_norm": 6.447382926940918,
"learning_rate": 3.0659536541889482e-06,
"loss": 2.2702,
"step": 218500
},
{
"epoch": 16.97279702394792,
"grad_norm": 7.396761894226074,
"learning_rate": 3.027202976052081e-06,
"loss": 2.2813,
"step": 219000
},
{
"epoch": 17.0,
"eval_loss": 2.2362165451049805,
"eval_runtime": 266.8391,
"eval_samples_per_second": 773.743,
"eval_steps_per_second": 12.093,
"step": 219351
},
{
"epoch": 17.011547702084787,
"grad_norm": 6.575385093688965,
"learning_rate": 2.9884522979152135e-06,
"loss": 2.2747,
"step": 219500
},
{
"epoch": 17.050298380221655,
"grad_norm": 7.168444633483887,
"learning_rate": 2.949701619778346e-06,
"loss": 2.2868,
"step": 220000
},
{
"epoch": 17.089049058358523,
"grad_norm": 7.069167613983154,
"learning_rate": 2.9109509416414788e-06,
"loss": 2.2838,
"step": 220500
},
{
"epoch": 17.12779973649539,
"grad_norm": 6.792834758758545,
"learning_rate": 2.8722002635046114e-06,
"loss": 2.2836,
"step": 221000
},
{
"epoch": 17.166550414632255,
"grad_norm": 6.546488285064697,
"learning_rate": 2.833449585367744e-06,
"loss": 2.2733,
"step": 221500
},
{
"epoch": 17.205301092769123,
"grad_norm": 6.293231010437012,
"learning_rate": 2.7946989072308767e-06,
"loss": 2.2688,
"step": 222000
},
{
"epoch": 17.24405177090599,
"grad_norm": 6.560914039611816,
"learning_rate": 2.7559482290940093e-06,
"loss": 2.2787,
"step": 222500
},
{
"epoch": 17.28280244904286,
"grad_norm": 6.571765422821045,
"learning_rate": 2.717197550957142e-06,
"loss": 2.2801,
"step": 223000
},
{
"epoch": 17.321553127179726,
"grad_norm": 7.396661281585693,
"learning_rate": 2.6784468728202746e-06,
"loss": 2.28,
"step": 223500
},
{
"epoch": 17.360303805316594,
"grad_norm": 6.239862442016602,
"learning_rate": 2.6396961946834072e-06,
"loss": 2.2743,
"step": 224000
},
{
"epoch": 17.39905448345346,
"grad_norm": 6.766594886779785,
"learning_rate": 2.60094551654654e-06,
"loss": 2.2456,
"step": 224500
},
{
"epoch": 17.43780516159033,
"grad_norm": 6.488914966583252,
"learning_rate": 2.5621948384096725e-06,
"loss": 2.2666,
"step": 225000
},
{
"epoch": 17.476555839727194,
"grad_norm": 6.036900043487549,
"learning_rate": 2.523444160272805e-06,
"loss": 2.2577,
"step": 225500
},
{
"epoch": 17.51530651786406,
"grad_norm": 6.977652549743652,
"learning_rate": 2.4846934821359373e-06,
"loss": 2.2657,
"step": 226000
},
{
"epoch": 17.55405719600093,
"grad_norm": 6.468418121337891,
"learning_rate": 2.44594280399907e-06,
"loss": 2.2737,
"step": 226500
},
{
"epoch": 17.592807874137797,
"grad_norm": 6.7042646408081055,
"learning_rate": 2.4071921258622026e-06,
"loss": 2.2685,
"step": 227000
},
{
"epoch": 17.631558552274665,
"grad_norm": 6.591056823730469,
"learning_rate": 2.3684414477253352e-06,
"loss": 2.2836,
"step": 227500
},
{
"epoch": 17.670309230411533,
"grad_norm": 7.078721523284912,
"learning_rate": 2.329690769588468e-06,
"loss": 2.2754,
"step": 228000
},
{
"epoch": 17.7090599085484,
"grad_norm": 6.701901435852051,
"learning_rate": 2.2909400914516005e-06,
"loss": 2.2494,
"step": 228500
},
{
"epoch": 17.74781058668527,
"grad_norm": 6.622567176818848,
"learning_rate": 2.252189413314733e-06,
"loss": 2.2689,
"step": 229000
},
{
"epoch": 17.786561264822133,
"grad_norm": 6.573280334472656,
"learning_rate": 2.2134387351778658e-06,
"loss": 2.271,
"step": 229500
},
{
"epoch": 17.825311942959,
"grad_norm": 6.9067206382751465,
"learning_rate": 2.1746880570409984e-06,
"loss": 2.2573,
"step": 230000
},
{
"epoch": 17.86406262109587,
"grad_norm": 6.601592063903809,
"learning_rate": 2.135937378904131e-06,
"loss": 2.2743,
"step": 230500
},
{
"epoch": 17.902813299232736,
"grad_norm": 6.949497699737549,
"learning_rate": 2.0971867007672637e-06,
"loss": 2.2644,
"step": 231000
},
{
"epoch": 17.941563977369604,
"grad_norm": 5.614126205444336,
"learning_rate": 2.0584360226303963e-06,
"loss": 2.2608,
"step": 231500
},
{
"epoch": 17.980314655506472,
"grad_norm": 6.880855560302734,
"learning_rate": 2.019685344493529e-06,
"loss": 2.2862,
"step": 232000
},
{
"epoch": 18.0,
"eval_loss": 2.229489326477051,
"eval_runtime": 270.0811,
"eval_samples_per_second": 764.456,
"eval_steps_per_second": 11.948,
"step": 232254
},
{
"epoch": 18.01906533364334,
"grad_norm": 6.630836486816406,
"learning_rate": 1.9809346663566616e-06,
"loss": 2.2632,
"step": 232500
},
{
"epoch": 18.057816011780208,
"grad_norm": 6.50869607925415,
"learning_rate": 1.942183988219794e-06,
"loss": 2.2584,
"step": 233000
},
{
"epoch": 18.096566689917072,
"grad_norm": 6.81369161605835,
"learning_rate": 1.9034333100829266e-06,
"loss": 2.2599,
"step": 233500
},
{
"epoch": 18.13531736805394,
"grad_norm": 6.202197074890137,
"learning_rate": 1.8646826319460593e-06,
"loss": 2.2532,
"step": 234000
},
{
"epoch": 18.174068046190808,
"grad_norm": 6.907183647155762,
"learning_rate": 1.8259319538091919e-06,
"loss": 2.2553,
"step": 234500
},
{
"epoch": 18.212818724327676,
"grad_norm": 7.445714473724365,
"learning_rate": 1.7871812756723245e-06,
"loss": 2.2586,
"step": 235000
},
{
"epoch": 18.251569402464543,
"grad_norm": 6.844184398651123,
"learning_rate": 1.7484305975354572e-06,
"loss": 2.2502,
"step": 235500
},
{
"epoch": 18.29032008060141,
"grad_norm": 6.495091438293457,
"learning_rate": 1.7096799193985896e-06,
"loss": 2.2703,
"step": 236000
},
{
"epoch": 18.32907075873828,
"grad_norm": 6.848631858825684,
"learning_rate": 1.6709292412617222e-06,
"loss": 2.2494,
"step": 236500
},
{
"epoch": 18.367821436875147,
"grad_norm": 6.527080535888672,
"learning_rate": 1.6321785631248548e-06,
"loss": 2.2676,
"step": 237000
},
{
"epoch": 18.40657211501201,
"grad_norm": 6.402927875518799,
"learning_rate": 1.5934278849879875e-06,
"loss": 2.256,
"step": 237500
},
{
"epoch": 18.44532279314888,
"grad_norm": 6.720060348510742,
"learning_rate": 1.5546772068511201e-06,
"loss": 2.256,
"step": 238000
},
{
"epoch": 18.484073471285747,
"grad_norm": 6.392049312591553,
"learning_rate": 1.5159265287142528e-06,
"loss": 2.272,
"step": 238500
},
{
"epoch": 18.522824149422615,
"grad_norm": 6.625200271606445,
"learning_rate": 1.4771758505773854e-06,
"loss": 2.2561,
"step": 239000
},
{
"epoch": 18.561574827559483,
"grad_norm": 6.451653003692627,
"learning_rate": 1.438425172440518e-06,
"loss": 2.2518,
"step": 239500
},
{
"epoch": 18.60032550569635,
"grad_norm": 6.246822357177734,
"learning_rate": 1.3996744943036504e-06,
"loss": 2.2541,
"step": 240000
},
{
"epoch": 18.639076183833218,
"grad_norm": 6.265354156494141,
"learning_rate": 1.360923816166783e-06,
"loss": 2.2546,
"step": 240500
},
{
"epoch": 18.677826861970086,
"grad_norm": 6.439133644104004,
"learning_rate": 1.3221731380299157e-06,
"loss": 2.2583,
"step": 241000
},
{
"epoch": 18.71657754010695,
"grad_norm": 6.528525352478027,
"learning_rate": 1.2834224598930483e-06,
"loss": 2.2467,
"step": 241500
},
{
"epoch": 18.755328218243818,
"grad_norm": 7.4315900802612305,
"learning_rate": 1.2446717817561808e-06,
"loss": 2.2585,
"step": 242000
},
{
"epoch": 18.794078896380686,
"grad_norm": 7.4202141761779785,
"learning_rate": 1.2059211036193134e-06,
"loss": 2.2637,
"step": 242500
},
{
"epoch": 18.832829574517554,
"grad_norm": 6.3204145431518555,
"learning_rate": 1.167170425482446e-06,
"loss": 2.264,
"step": 243000
},
{
"epoch": 18.87158025265442,
"grad_norm": 6.220766067504883,
"learning_rate": 1.1284197473455787e-06,
"loss": 2.2705,
"step": 243500
},
{
"epoch": 18.91033093079129,
"grad_norm": 6.558001518249512,
"learning_rate": 1.0896690692087113e-06,
"loss": 2.2632,
"step": 244000
},
{
"epoch": 18.949081608928157,
"grad_norm": 6.786870956420898,
"learning_rate": 1.050918391071844e-06,
"loss": 2.2441,
"step": 244500
},
{
"epoch": 18.987832287065025,
"grad_norm": 6.955057621002197,
"learning_rate": 1.0121677129349766e-06,
"loss": 2.2503,
"step": 245000
},
{
"epoch": 19.0,
"eval_loss": 2.2231059074401855,
"eval_runtime": 272.187,
"eval_samples_per_second": 758.541,
"eval_steps_per_second": 11.856,
"step": 245157
},
{
"epoch": 19.02658296520189,
"grad_norm": 6.136529922485352,
"learning_rate": 9.73417034798109e-07,
"loss": 2.2528,
"step": 245500
},
{
"epoch": 19.065333643338757,
"grad_norm": 7.144802093505859,
"learning_rate": 9.346663566612417e-07,
"loss": 2.248,
"step": 246000
},
{
"epoch": 19.104084321475625,
"grad_norm": 5.582034111022949,
"learning_rate": 8.959156785243743e-07,
"loss": 2.2513,
"step": 246500
},
{
"epoch": 19.142834999612493,
"grad_norm": 6.747804164886475,
"learning_rate": 8.571650003875069e-07,
"loss": 2.2647,
"step": 247000
},
{
"epoch": 19.18158567774936,
"grad_norm": 6.1470417976379395,
"learning_rate": 8.184143222506395e-07,
"loss": 2.2548,
"step": 247500
},
{
"epoch": 19.22033635588623,
"grad_norm": 6.574125289916992,
"learning_rate": 7.79663644113772e-07,
"loss": 2.2714,
"step": 248000
},
{
"epoch": 19.259087034023096,
"grad_norm": 6.6587982177734375,
"learning_rate": 7.409129659769046e-07,
"loss": 2.2491,
"step": 248500
},
{
"epoch": 19.297837712159964,
"grad_norm": 6.578282356262207,
"learning_rate": 7.021622878400372e-07,
"loss": 2.2483,
"step": 249000
},
{
"epoch": 19.33658839029683,
"grad_norm": 6.449355602264404,
"learning_rate": 6.634116097031699e-07,
"loss": 2.2558,
"step": 249500
},
{
"epoch": 19.375339068433696,
"grad_norm": 5.921240329742432,
"learning_rate": 6.246609315663025e-07,
"loss": 2.2428,
"step": 250000
},
{
"epoch": 19.414089746570564,
"grad_norm": 6.655218124389648,
"learning_rate": 5.859102534294351e-07,
"loss": 2.2616,
"step": 250500
},
{
"epoch": 19.452840424707432,
"grad_norm": 6.733659744262695,
"learning_rate": 5.471595752925676e-07,
"loss": 2.2481,
"step": 251000
},
{
"epoch": 19.4915911028443,
"grad_norm": 6.9586968421936035,
"learning_rate": 5.084088971557003e-07,
"loss": 2.2495,
"step": 251500
},
{
"epoch": 19.530341780981168,
"grad_norm": 6.441699028015137,
"learning_rate": 4.6965821901883286e-07,
"loss": 2.2456,
"step": 252000
},
{
"epoch": 19.569092459118036,
"grad_norm": 6.126708984375,
"learning_rate": 4.3090754088196544e-07,
"loss": 2.2561,
"step": 252500
},
{
"epoch": 19.607843137254903,
"grad_norm": 6.69553279876709,
"learning_rate": 3.921568627450981e-07,
"loss": 2.2435,
"step": 253000
},
{
"epoch": 19.646593815391768,
"grad_norm": 7.468321800231934,
"learning_rate": 3.5340618460823066e-07,
"loss": 2.2517,
"step": 253500
},
{
"epoch": 19.685344493528635,
"grad_norm": 6.724938869476318,
"learning_rate": 3.146555064713633e-07,
"loss": 2.2307,
"step": 254000
},
{
"epoch": 19.724095171665503,
"grad_norm": 6.407966613769531,
"learning_rate": 2.7590482833449587e-07,
"loss": 2.2469,
"step": 254500
},
{
"epoch": 19.76284584980237,
"grad_norm": 6.521556377410889,
"learning_rate": 2.3715415019762845e-07,
"loss": 2.2266,
"step": 255000
},
{
"epoch": 19.80159652793924,
"grad_norm": 6.066943645477295,
"learning_rate": 1.9840347206076106e-07,
"loss": 2.2448,
"step": 255500
},
{
"epoch": 19.840347206076107,
"grad_norm": 6.806808948516846,
"learning_rate": 1.5965279392389367e-07,
"loss": 2.2355,
"step": 256000
},
{
"epoch": 19.879097884212975,
"grad_norm": 6.567816734313965,
"learning_rate": 1.2090211578702627e-07,
"loss": 2.2581,
"step": 256500
},
{
"epoch": 19.917848562349842,
"grad_norm": 7.037693977355957,
"learning_rate": 8.21514376501589e-08,
"loss": 2.2558,
"step": 257000
},
{
"epoch": 19.95659924048671,
"grad_norm": 7.10353422164917,
"learning_rate": 4.340075951329148e-08,
"loss": 2.2513,
"step": 257500
},
{
"epoch": 19.995349918623575,
"grad_norm": 7.407341003417969,
"learning_rate": 4.6500813764240875e-09,
"loss": 2.2448,
"step": 258000
},
{
"epoch": 20.0,
"eval_loss": 2.211674213409424,
"eval_runtime": 270.2294,
"eval_samples_per_second": 764.036,
"eval_steps_per_second": 11.942,
"step": 258060
},
{
"epoch": 20.034100596760442,
"grad_norm": 6.319613456726074,
"learning_rate": 6.643932935493038e-06,
"loss": 2.2464,
"step": 258500
},
{
"epoch": 20.07285127489731,
"grad_norm": 10.947772026062012,
"learning_rate": 6.61809915006846e-06,
"loss": 2.2717,
"step": 259000
},
{
"epoch": 20.111601953034178,
"grad_norm": 6.688451290130615,
"learning_rate": 6.592265364643882e-06,
"loss": 2.246,
"step": 259500
},
{
"epoch": 20.150352631171046,
"grad_norm": 7.084783554077148,
"learning_rate": 6.566431579219303e-06,
"loss": 2.2547,
"step": 260000
},
{
"epoch": 20.189103309307914,
"grad_norm": 7.182523250579834,
"learning_rate": 6.540597793794725e-06,
"loss": 2.2673,
"step": 260500
},
{
"epoch": 20.22785398744478,
"grad_norm": 6.572226524353027,
"learning_rate": 6.514764008370147e-06,
"loss": 2.2696,
"step": 261000
},
{
"epoch": 20.26660466558165,
"grad_norm": 6.861509323120117,
"learning_rate": 6.488930222945569e-06,
"loss": 2.2602,
"step": 261500
},
{
"epoch": 20.305355343718514,
"grad_norm": 7.068969249725342,
"learning_rate": 6.46309643752099e-06,
"loss": 2.2736,
"step": 262000
},
{
"epoch": 20.34410602185538,
"grad_norm": 6.5293660163879395,
"learning_rate": 6.4372626520964125e-06,
"loss": 2.2698,
"step": 262500
},
{
"epoch": 20.38285669999225,
"grad_norm": 6.285311698913574,
"learning_rate": 6.411428866671834e-06,
"loss": 2.2671,
"step": 263000
},
{
"epoch": 20.421607378129117,
"grad_norm": 6.466723918914795,
"learning_rate": 6.3855950812472554e-06,
"loss": 2.267,
"step": 263500
},
{
"epoch": 20.460358056265985,
"grad_norm": 7.045479774475098,
"learning_rate": 6.359761295822677e-06,
"loss": 2.2499,
"step": 264000
},
{
"epoch": 20.499108734402853,
"grad_norm": 7.05580472946167,
"learning_rate": 6.333927510398099e-06,
"loss": 2.2793,
"step": 264500
},
{
"epoch": 20.53785941253972,
"grad_norm": 7.213685035705566,
"learning_rate": 6.308093724973521e-06,
"loss": 2.2519,
"step": 265000
},
{
"epoch": 20.57661009067659,
"grad_norm": 6.6378984451293945,
"learning_rate": 6.282259939548942e-06,
"loss": 2.2699,
"step": 265500
},
{
"epoch": 20.615360768813453,
"grad_norm": 6.8442463874816895,
"learning_rate": 6.2564261541243645e-06,
"loss": 2.2697,
"step": 266000
},
{
"epoch": 20.65411144695032,
"grad_norm": 7.099138259887695,
"learning_rate": 6.230592368699786e-06,
"loss": 2.2622,
"step": 266500
},
{
"epoch": 20.69286212508719,
"grad_norm": 6.572378635406494,
"learning_rate": 6.2047585832752074e-06,
"loss": 2.2709,
"step": 267000
},
{
"epoch": 20.731612803224056,
"grad_norm": 6.410079479217529,
"learning_rate": 6.17892479785063e-06,
"loss": 2.2599,
"step": 267500
},
{
"epoch": 20.770363481360924,
"grad_norm": 7.154236316680908,
"learning_rate": 6.153091012426051e-06,
"loss": 2.2654,
"step": 268000
},
{
"epoch": 20.809114159497792,
"grad_norm": 7.05757999420166,
"learning_rate": 6.127257227001473e-06,
"loss": 2.2673,
"step": 268500
},
{
"epoch": 20.84786483763466,
"grad_norm": 7.457660675048828,
"learning_rate": 6.101423441576895e-06,
"loss": 2.2534,
"step": 269000
},
{
"epoch": 20.886615515771528,
"grad_norm": 6.697342872619629,
"learning_rate": 6.0755896561523165e-06,
"loss": 2.2721,
"step": 269500
},
{
"epoch": 20.925366193908392,
"grad_norm": 6.83280611038208,
"learning_rate": 6.049755870727738e-06,
"loss": 2.276,
"step": 270000
},
{
"epoch": 20.96411687204526,
"grad_norm": 6.298649311065674,
"learning_rate": 6.02392208530316e-06,
"loss": 2.265,
"step": 270500
},
{
"epoch": 21.0,
"eval_loss": 2.2148919105529785,
"eval_runtime": 275.6952,
"eval_samples_per_second": 748.889,
"eval_steps_per_second": 11.705,
"step": 270963
},
{
"epoch": 21.002867550182128,
"grad_norm": 6.698548316955566,
"learning_rate": 5.998088299878582e-06,
"loss": 2.2576,
"step": 271000
},
{
"epoch": 21.041618228318995,
"grad_norm": 6.784346103668213,
"learning_rate": 5.972254514454003e-06,
"loss": 2.2398,
"step": 271500
},
{
"epoch": 21.080368906455863,
"grad_norm": 7.072300910949707,
"learning_rate": 5.946420729029425e-06,
"loss": 2.2557,
"step": 272000
},
{
"epoch": 21.11911958459273,
"grad_norm": 6.624369144439697,
"learning_rate": 5.920586943604847e-06,
"loss": 2.2337,
"step": 272500
},
{
"epoch": 21.1578702627296,
"grad_norm": 6.317164897918701,
"learning_rate": 5.8947531581802685e-06,
"loss": 2.2534,
"step": 273000
},
{
"epoch": 21.196620940866467,
"grad_norm": 6.728669166564941,
"learning_rate": 5.86891937275569e-06,
"loss": 2.2505,
"step": 273500
},
{
"epoch": 21.23537161900333,
"grad_norm": 6.596154689788818,
"learning_rate": 5.843085587331112e-06,
"loss": 2.253,
"step": 274000
},
{
"epoch": 21.2741222971402,
"grad_norm": 6.471163749694824,
"learning_rate": 5.817251801906534e-06,
"loss": 2.2556,
"step": 274500
},
{
"epoch": 21.312872975277067,
"grad_norm": 6.29288911819458,
"learning_rate": 5.791418016481955e-06,
"loss": 2.2567,
"step": 275000
},
{
"epoch": 21.351623653413935,
"grad_norm": 7.078927040100098,
"learning_rate": 5.7655842310573776e-06,
"loss": 2.2294,
"step": 275500
},
{
"epoch": 21.390374331550802,
"grad_norm": 6.867557525634766,
"learning_rate": 5.739750445632799e-06,
"loss": 2.2574,
"step": 276000
},
{
"epoch": 21.42912500968767,
"grad_norm": 6.830238342285156,
"learning_rate": 5.7139166602082205e-06,
"loss": 2.2794,
"step": 276500
},
{
"epoch": 21.467875687824538,
"grad_norm": 6.694831371307373,
"learning_rate": 5.688082874783643e-06,
"loss": 2.253,
"step": 277000
},
{
"epoch": 21.506626365961406,
"grad_norm": 7.064994812011719,
"learning_rate": 5.662249089359064e-06,
"loss": 2.2435,
"step": 277500
},
{
"epoch": 21.54537704409827,
"grad_norm": 6.832572937011719,
"learning_rate": 5.636415303934486e-06,
"loss": 2.2478,
"step": 278000
},
{
"epoch": 21.584127722235138,
"grad_norm": 7.045238494873047,
"learning_rate": 5.610581518509908e-06,
"loss": 2.2434,
"step": 278500
},
{
"epoch": 21.622878400372006,
"grad_norm": 6.720279216766357,
"learning_rate": 5.58474773308533e-06,
"loss": 2.238,
"step": 279000
},
{
"epoch": 21.661629078508874,
"grad_norm": 7.401440143585205,
"learning_rate": 5.558913947660751e-06,
"loss": 2.2461,
"step": 279500
},
{
"epoch": 21.70037975664574,
"grad_norm": 6.497147560119629,
"learning_rate": 5.5330801622361725e-06,
"loss": 2.2339,
"step": 280000
},
{
"epoch": 21.73913043478261,
"grad_norm": 6.529776096343994,
"learning_rate": 5.507246376811595e-06,
"loss": 2.2501,
"step": 280500
},
{
"epoch": 21.777881112919477,
"grad_norm": 6.42600679397583,
"learning_rate": 5.481412591387016e-06,
"loss": 2.235,
"step": 281000
},
{
"epoch": 21.816631791056345,
"grad_norm": 6.715229034423828,
"learning_rate": 5.455578805962438e-06,
"loss": 2.2401,
"step": 281500
},
{
"epoch": 21.85538246919321,
"grad_norm": 6.575899124145508,
"learning_rate": 5.42974502053786e-06,
"loss": 2.2576,
"step": 282000
},
{
"epoch": 21.894133147330077,
"grad_norm": 5.999971866607666,
"learning_rate": 5.403911235113282e-06,
"loss": 2.2379,
"step": 282500
},
{
"epoch": 21.932883825466945,
"grad_norm": 6.936278343200684,
"learning_rate": 5.378077449688703e-06,
"loss": 2.2534,
"step": 283000
},
{
"epoch": 21.971634503603813,
"grad_norm": 6.040930271148682,
"learning_rate": 5.352243664264125e-06,
"loss": 2.2391,
"step": 283500
},
{
"epoch": 22.0,
"eval_loss": 2.1943371295928955,
"eval_runtime": 268.1318,
"eval_samples_per_second": 770.013,
"eval_steps_per_second": 12.035,
"step": 283866
},
{
"epoch": 22.01038518174068,
"grad_norm": 6.7548747062683105,
"learning_rate": 5.326409878839547e-06,
"loss": 2.2428,
"step": 284000
},
{
"epoch": 22.04913585987755,
"grad_norm": 7.0850749015808105,
"learning_rate": 5.300576093414968e-06,
"loss": 2.2273,
"step": 284500
},
{
"epoch": 22.087886538014416,
"grad_norm": 6.658077239990234,
"learning_rate": 5.274742307990391e-06,
"loss": 2.2214,
"step": 285000
},
{
"epoch": 22.126637216151284,
"grad_norm": 7.19653844833374,
"learning_rate": 5.248908522565812e-06,
"loss": 2.2273,
"step": 285500
},
{
"epoch": 22.16538789428815,
"grad_norm": 7.094461441040039,
"learning_rate": 5.223074737141234e-06,
"loss": 2.2359,
"step": 286000
},
{
"epoch": 22.204138572425016,
"grad_norm": 7.156402587890625,
"learning_rate": 5.197240951716656e-06,
"loss": 2.1969,
"step": 286500
},
{
"epoch": 22.242889250561884,
"grad_norm": 6.595995903015137,
"learning_rate": 5.171407166292077e-06,
"loss": 2.2223,
"step": 287000
},
{
"epoch": 22.281639928698752,
"grad_norm": 7.04496955871582,
"learning_rate": 5.145573380867499e-06,
"loss": 2.2343,
"step": 287500
},
{
"epoch": 22.32039060683562,
"grad_norm": 7.146208763122559,
"learning_rate": 5.11973959544292e-06,
"loss": 2.2338,
"step": 288000
},
{
"epoch": 22.359141284972488,
"grad_norm": 6.4659576416015625,
"learning_rate": 5.093905810018343e-06,
"loss": 2.2273,
"step": 288500
},
{
"epoch": 22.397891963109355,
"grad_norm": 6.372287750244141,
"learning_rate": 5.068072024593764e-06,
"loss": 2.2247,
"step": 289000
},
{
"epoch": 22.436642641246223,
"grad_norm": 7.088085174560547,
"learning_rate": 5.042238239169186e-06,
"loss": 2.2474,
"step": 289500
},
{
"epoch": 22.475393319383087,
"grad_norm": 6.911520004272461,
"learning_rate": 5.016404453744608e-06,
"loss": 2.2356,
"step": 290000
},
{
"epoch": 22.514143997519955,
"grad_norm": 7.5756611824035645,
"learning_rate": 4.990570668320029e-06,
"loss": 2.2297,
"step": 290500
},
{
"epoch": 22.552894675656823,
"grad_norm": 6.587701320648193,
"learning_rate": 4.964736882895451e-06,
"loss": 2.2245,
"step": 291000
},
{
"epoch": 22.59164535379369,
"grad_norm": 5.8870849609375,
"learning_rate": 4.938903097470873e-06,
"loss": 2.229,
"step": 291500
},
{
"epoch": 22.63039603193056,
"grad_norm": 6.882173538208008,
"learning_rate": 4.913069312046295e-06,
"loss": 2.2254,
"step": 292000
},
{
"epoch": 22.669146710067427,
"grad_norm": 6.710127830505371,
"learning_rate": 4.887235526621716e-06,
"loss": 2.223,
"step": 292500
},
{
"epoch": 22.707897388204294,
"grad_norm": 6.753304481506348,
"learning_rate": 4.8614017411971385e-06,
"loss": 2.2311,
"step": 293000
},
{
"epoch": 22.746648066341162,
"grad_norm": 6.02184534072876,
"learning_rate": 4.83556795577256e-06,
"loss": 2.2198,
"step": 293500
},
{
"epoch": 22.78539874447803,
"grad_norm": 7.022054195404053,
"learning_rate": 4.809734170347981e-06,
"loss": 2.2275,
"step": 294000
},
{
"epoch": 22.824149422614894,
"grad_norm": 7.951735019683838,
"learning_rate": 4.783900384923404e-06,
"loss": 2.2314,
"step": 294500
},
{
"epoch": 22.862900100751762,
"grad_norm": 5.854333877563477,
"learning_rate": 4.758066599498825e-06,
"loss": 2.2172,
"step": 295000
},
{
"epoch": 22.90165077888863,
"grad_norm": 6.547132968902588,
"learning_rate": 4.732232814074247e-06,
"loss": 2.226,
"step": 295500
},
{
"epoch": 22.940401457025498,
"grad_norm": 6.535789966583252,
"learning_rate": 4.706399028649668e-06,
"loss": 2.2299,
"step": 296000
},
{
"epoch": 22.979152135162366,
"grad_norm": 7.285912036895752,
"learning_rate": 4.6805652432250905e-06,
"loss": 2.2239,
"step": 296500
},
{
"epoch": 23.0,
"eval_loss": 2.185373067855835,
"eval_runtime": 265.4806,
"eval_samples_per_second": 777.703,
"eval_steps_per_second": 12.155,
"step": 296769
},
{
"epoch": 23.017902813299234,
"grad_norm": 6.972716808319092,
"learning_rate": 4.654731457800512e-06,
"loss": 2.2193,
"step": 297000
},
{
"epoch": 23.0566534914361,
"grad_norm": 6.841848373413086,
"learning_rate": 4.628897672375933e-06,
"loss": 2.2139,
"step": 297500
},
{
"epoch": 23.09540416957297,
"grad_norm": 6.285813331604004,
"learning_rate": 4.603063886951356e-06,
"loss": 2.2092,
"step": 298000
},
{
"epoch": 23.134154847709834,
"grad_norm": 6.615530967712402,
"learning_rate": 4.577230101526777e-06,
"loss": 2.2141,
"step": 298500
},
{
"epoch": 23.1729055258467,
"grad_norm": 6.762087821960449,
"learning_rate": 4.551396316102199e-06,
"loss": 2.1944,
"step": 299000
},
{
"epoch": 23.21165620398357,
"grad_norm": 7.053805351257324,
"learning_rate": 4.525562530677621e-06,
"loss": 2.2129,
"step": 299500
},
{
"epoch": 23.250406882120437,
"grad_norm": 7.14516544342041,
"learning_rate": 4.4997287452530425e-06,
"loss": 2.2038,
"step": 300000
},
{
"epoch": 23.289157560257305,
"grad_norm": 6.8478803634643555,
"learning_rate": 4.473894959828464e-06,
"loss": 2.2166,
"step": 300500
},
{
"epoch": 23.327908238394173,
"grad_norm": 6.808053970336914,
"learning_rate": 4.448061174403886e-06,
"loss": 2.224,
"step": 301000
},
{
"epoch": 23.36665891653104,
"grad_norm": 7.149857521057129,
"learning_rate": 4.422227388979308e-06,
"loss": 2.2081,
"step": 301500
},
{
"epoch": 23.40540959466791,
"grad_norm": 6.334920406341553,
"learning_rate": 4.396393603554729e-06,
"loss": 2.2217,
"step": 302000
},
{
"epoch": 23.444160272804773,
"grad_norm": 7.154323577880859,
"learning_rate": 4.3705598181301515e-06,
"loss": 2.2129,
"step": 302500
},
{
"epoch": 23.48291095094164,
"grad_norm": 7.202456474304199,
"learning_rate": 4.344726032705573e-06,
"loss": 2.2019,
"step": 303000
},
{
"epoch": 23.52166162907851,
"grad_norm": 6.832441806793213,
"learning_rate": 4.3188922472809945e-06,
"loss": 2.214,
"step": 303500
},
{
"epoch": 23.560412307215376,
"grad_norm": 6.258272647857666,
"learning_rate": 4.293058461856417e-06,
"loss": 2.21,
"step": 304000
},
{
"epoch": 23.599162985352244,
"grad_norm": 6.8391194343566895,
"learning_rate": 4.267224676431838e-06,
"loss": 2.2106,
"step": 304500
},
{
"epoch": 23.637913663489112,
"grad_norm": 6.621433734893799,
"learning_rate": 4.24139089100726e-06,
"loss": 2.2219,
"step": 305000
},
{
"epoch": 23.67666434162598,
"grad_norm": 6.718801498413086,
"learning_rate": 4.215557105582681e-06,
"loss": 2.2215,
"step": 305500
},
{
"epoch": 23.715415019762847,
"grad_norm": 7.0543622970581055,
"learning_rate": 4.1897233201581036e-06,
"loss": 2.2182,
"step": 306000
},
{
"epoch": 23.75416569789971,
"grad_norm": 7.598169326782227,
"learning_rate": 4.163889534733525e-06,
"loss": 2.2218,
"step": 306500
},
{
"epoch": 23.79291637603658,
"grad_norm": 6.874271392822266,
"learning_rate": 4.1380557493089465e-06,
"loss": 2.2061,
"step": 307000
},
{
"epoch": 23.831667054173447,
"grad_norm": 6.820863723754883,
"learning_rate": 4.112221963884369e-06,
"loss": 2.2166,
"step": 307500
},
{
"epoch": 23.870417732310315,
"grad_norm": 7.149729251861572,
"learning_rate": 4.08638817845979e-06,
"loss": 2.2089,
"step": 308000
},
{
"epoch": 23.909168410447183,
"grad_norm": 6.278995990753174,
"learning_rate": 4.060554393035212e-06,
"loss": 2.2163,
"step": 308500
},
{
"epoch": 23.94791908858405,
"grad_norm": 7.162642002105713,
"learning_rate": 4.034720607610634e-06,
"loss": 2.2222,
"step": 309000
},
{
"epoch": 23.98666976672092,
"grad_norm": 6.67965841293335,
"learning_rate": 4.0088868221860556e-06,
"loss": 2.1965,
"step": 309500
},
{
"epoch": 24.0,
"eval_loss": 2.1702771186828613,
"eval_runtime": 264.5615,
"eval_samples_per_second": 780.405,
"eval_steps_per_second": 12.198,
"step": 309672
},
{
"epoch": 24.025420444857787,
"grad_norm": 6.355005264282227,
"learning_rate": 3.983053036761477e-06,
"loss": 2.1935,
"step": 310000
},
{
"epoch": 24.06417112299465,
"grad_norm": 6.339087963104248,
"learning_rate": 3.957219251336899e-06,
"loss": 2.2022,
"step": 310500
},
{
"epoch": 24.10292180113152,
"grad_norm": 6.386953353881836,
"learning_rate": 3.931385465912321e-06,
"loss": 2.1941,
"step": 311000
},
{
"epoch": 24.141672479268387,
"grad_norm": 6.9508376121521,
"learning_rate": 3.905551680487742e-06,
"loss": 2.1991,
"step": 311500
},
{
"epoch": 24.180423157405254,
"grad_norm": 7.1515727043151855,
"learning_rate": 3.879717895063164e-06,
"loss": 2.2118,
"step": 312000
},
{
"epoch": 24.219173835542122,
"grad_norm": 6.807953357696533,
"learning_rate": 3.853884109638585e-06,
"loss": 2.2158,
"step": 312500
},
{
"epoch": 24.25792451367899,
"grad_norm": 7.41762638092041,
"learning_rate": 3.828050324214007e-06,
"loss": 2.1948,
"step": 313000
},
{
"epoch": 24.296675191815858,
"grad_norm": 7.462344646453857,
"learning_rate": 3.802216538789429e-06,
"loss": 2.2061,
"step": 313500
},
{
"epoch": 24.335425869952726,
"grad_norm": 6.6912384033203125,
"learning_rate": 3.7763827533648505e-06,
"loss": 2.1932,
"step": 314000
},
{
"epoch": 24.37417654808959,
"grad_norm": 6.79492712020874,
"learning_rate": 3.7505489679402724e-06,
"loss": 2.193,
"step": 314500
},
{
"epoch": 24.412927226226458,
"grad_norm": 6.873208522796631,
"learning_rate": 3.724715182515694e-06,
"loss": 2.1756,
"step": 315000
},
{
"epoch": 24.451677904363326,
"grad_norm": 6.520395278930664,
"learning_rate": 3.6988813970911158e-06,
"loss": 2.2019,
"step": 315500
},
{
"epoch": 24.490428582500193,
"grad_norm": 7.425100326538086,
"learning_rate": 3.6730476116665377e-06,
"loss": 2.1933,
"step": 316000
},
{
"epoch": 24.52917926063706,
"grad_norm": 6.990531921386719,
"learning_rate": 3.647213826241959e-06,
"loss": 2.1953,
"step": 316500
},
{
"epoch": 24.56792993877393,
"grad_norm": 6.99529504776001,
"learning_rate": 3.621380040817381e-06,
"loss": 2.1668,
"step": 317000
},
{
"epoch": 24.606680616910797,
"grad_norm": 7.046565532684326,
"learning_rate": 3.595546255392803e-06,
"loss": 2.2185,
"step": 317500
},
{
"epoch": 24.645431295047665,
"grad_norm": 7.261152744293213,
"learning_rate": 3.5697124699682244e-06,
"loss": 2.1776,
"step": 318000
},
{
"epoch": 24.68418197318453,
"grad_norm": 7.088150978088379,
"learning_rate": 3.5438786845436463e-06,
"loss": 2.1939,
"step": 318500
},
{
"epoch": 24.722932651321397,
"grad_norm": 7.677366733551025,
"learning_rate": 3.518044899119068e-06,
"loss": 2.1916,
"step": 319000
},
{
"epoch": 24.761683329458265,
"grad_norm": 7.108632564544678,
"learning_rate": 3.4922111136944897e-06,
"loss": 2.1851,
"step": 319500
},
{
"epoch": 24.800434007595133,
"grad_norm": 7.283915996551514,
"learning_rate": 3.4663773282699116e-06,
"loss": 2.2015,
"step": 320000
},
{
"epoch": 24.839184685732,
"grad_norm": 7.392533779144287,
"learning_rate": 3.440543542845333e-06,
"loss": 2.1915,
"step": 320500
},
{
"epoch": 24.87793536386887,
"grad_norm": 6.849175453186035,
"learning_rate": 3.414709757420755e-06,
"loss": 2.1931,
"step": 321000
},
{
"epoch": 24.916686042005736,
"grad_norm": 6.42083740234375,
"learning_rate": 3.388875971996177e-06,
"loss": 2.198,
"step": 321500
},
{
"epoch": 24.955436720142604,
"grad_norm": 6.040030002593994,
"learning_rate": 3.3630421865715983e-06,
"loss": 2.1802,
"step": 322000
},
{
"epoch": 24.99418739827947,
"grad_norm": 7.585995674133301,
"learning_rate": 3.3372084011470202e-06,
"loss": 2.1946,
"step": 322500
},
{
"epoch": 25.0,
"eval_loss": 2.159193992614746,
"eval_runtime": 270.695,
"eval_samples_per_second": 762.722,
"eval_steps_per_second": 11.921,
"step": 322575
},
{
"epoch": 25.032938076416336,
"grad_norm": 7.309504985809326,
"learning_rate": 3.3113746157224417e-06,
"loss": 2.1854,
"step": 323000
},
{
"epoch": 25.071688754553204,
"grad_norm": 6.450008869171143,
"learning_rate": 3.2855408302978636e-06,
"loss": 2.1827,
"step": 323500
},
{
"epoch": 25.11043943269007,
"grad_norm": 6.82379674911499,
"learning_rate": 3.2597070448732855e-06,
"loss": 2.1838,
"step": 324000
},
{
"epoch": 25.14919011082694,
"grad_norm": 7.034087657928467,
"learning_rate": 3.233873259448707e-06,
"loss": 2.1618,
"step": 324500
},
{
"epoch": 25.187940788963807,
"grad_norm": 7.005911827087402,
"learning_rate": 3.208039474024129e-06,
"loss": 2.1912,
"step": 325000
},
{
"epoch": 25.226691467100675,
"grad_norm": 6.7085394859313965,
"learning_rate": 3.1822056885995508e-06,
"loss": 2.1795,
"step": 325500
},
{
"epoch": 25.265442145237543,
"grad_norm": 6.773245334625244,
"learning_rate": 3.1563719031749722e-06,
"loss": 2.1965,
"step": 326000
},
{
"epoch": 25.304192823374407,
"grad_norm": 6.718632698059082,
"learning_rate": 3.130538117750394e-06,
"loss": 2.1976,
"step": 326500
},
{
"epoch": 25.342943501511275,
"grad_norm": 8.191710472106934,
"learning_rate": 3.1047043323258156e-06,
"loss": 2.1762,
"step": 327000
},
{
"epoch": 25.381694179648143,
"grad_norm": 7.172983169555664,
"learning_rate": 3.0788705469012375e-06,
"loss": 2.1703,
"step": 327500
},
{
"epoch": 25.42044485778501,
"grad_norm": 6.283721446990967,
"learning_rate": 3.0530367614766594e-06,
"loss": 2.1692,
"step": 328000
},
{
"epoch": 25.45919553592188,
"grad_norm": 6.850103855133057,
"learning_rate": 3.027202976052081e-06,
"loss": 2.1914,
"step": 328500
},
{
"epoch": 25.497946214058747,
"grad_norm": 6.31437873840332,
"learning_rate": 3.0013691906275028e-06,
"loss": 2.1692,
"step": 329000
},
{
"epoch": 25.536696892195614,
"grad_norm": 6.947432994842529,
"learning_rate": 2.9755354052029247e-06,
"loss": 2.1848,
"step": 329500
},
{
"epoch": 25.575447570332482,
"grad_norm": 6.133412837982178,
"learning_rate": 2.949701619778346e-06,
"loss": 2.1827,
"step": 330000
},
{
"epoch": 25.61419824846935,
"grad_norm": 7.019827365875244,
"learning_rate": 2.923867834353768e-06,
"loss": 2.1654,
"step": 330500
},
{
"epoch": 25.652948926606214,
"grad_norm": 7.326742172241211,
"learning_rate": 2.8980340489291895e-06,
"loss": 2.1929,
"step": 331000
},
{
"epoch": 25.691699604743082,
"grad_norm": 7.231571674346924,
"learning_rate": 2.8722002635046114e-06,
"loss": 2.1913,
"step": 331500
},
{
"epoch": 25.73045028287995,
"grad_norm": 7.050189971923828,
"learning_rate": 2.8463664780800333e-06,
"loss": 2.191,
"step": 332000
},
{
"epoch": 25.769200961016818,
"grad_norm": 6.654092311859131,
"learning_rate": 2.8205326926554548e-06,
"loss": 2.1871,
"step": 332500
},
{
"epoch": 25.807951639153686,
"grad_norm": 7.114500522613525,
"learning_rate": 2.7946989072308767e-06,
"loss": 2.1842,
"step": 333000
},
{
"epoch": 25.846702317290553,
"grad_norm": 6.987917900085449,
"learning_rate": 2.7688651218062986e-06,
"loss": 2.1782,
"step": 333500
},
{
"epoch": 25.88545299542742,
"grad_norm": 6.479386806488037,
"learning_rate": 2.74303133638172e-06,
"loss": 2.1904,
"step": 334000
},
{
"epoch": 25.924203673564286,
"grad_norm": 6.597611904144287,
"learning_rate": 2.717197550957142e-06,
"loss": 2.1782,
"step": 334500
},
{
"epoch": 25.962954351701153,
"grad_norm": 7.492031097412109,
"learning_rate": 2.6913637655325634e-06,
"loss": 2.1976,
"step": 335000
},
{
"epoch": 26.0,
"eval_loss": 2.144183874130249,
"eval_runtime": 268.6578,
"eval_samples_per_second": 768.505,
"eval_steps_per_second": 12.012,
"step": 335478
},
{
"epoch": 26.00170502983802,
"grad_norm": 7.5874552726745605,
"learning_rate": 2.6655299801079853e-06,
"loss": 2.1755,
"step": 335500
},
{
"epoch": 26.04045570797489,
"grad_norm": 7.499856948852539,
"learning_rate": 2.6396961946834072e-06,
"loss": 2.1885,
"step": 336000
},
{
"epoch": 26.079206386111757,
"grad_norm": 7.2821946144104,
"learning_rate": 2.6138624092588287e-06,
"loss": 2.1782,
"step": 336500
},
{
"epoch": 26.117957064248625,
"grad_norm": 7.0137834548950195,
"learning_rate": 2.5880286238342506e-06,
"loss": 2.1688,
"step": 337000
},
{
"epoch": 26.156707742385493,
"grad_norm": 6.468008518218994,
"learning_rate": 2.5621948384096725e-06,
"loss": 2.1735,
"step": 337500
},
{
"epoch": 26.19545842052236,
"grad_norm": 6.922983169555664,
"learning_rate": 2.536361052985094e-06,
"loss": 2.1643,
"step": 338000
},
{
"epoch": 26.23420909865923,
"grad_norm": 6.963326454162598,
"learning_rate": 2.510527267560516e-06,
"loss": 2.1569,
"step": 338500
},
{
"epoch": 26.272959776796093,
"grad_norm": 6.4791579246521,
"learning_rate": 2.4846934821359373e-06,
"loss": 2.1816,
"step": 339000
},
{
"epoch": 26.31171045493296,
"grad_norm": 7.289137840270996,
"learning_rate": 2.4588596967113592e-06,
"loss": 2.1628,
"step": 339500
},
{
"epoch": 26.350461133069828,
"grad_norm": 7.020922660827637,
"learning_rate": 2.433025911286781e-06,
"loss": 2.1608,
"step": 340000
},
{
"epoch": 26.389211811206696,
"grad_norm": 6.522220134735107,
"learning_rate": 2.4071921258622026e-06,
"loss": 2.1736,
"step": 340500
},
{
"epoch": 26.427962489343564,
"grad_norm": 7.149320602416992,
"learning_rate": 2.3813583404376245e-06,
"loss": 2.1761,
"step": 341000
},
{
"epoch": 26.46671316748043,
"grad_norm": 7.04742431640625,
"learning_rate": 2.3555245550130464e-06,
"loss": 2.168,
"step": 341500
},
{
"epoch": 26.5054638456173,
"grad_norm": 7.135145664215088,
"learning_rate": 2.329690769588468e-06,
"loss": 2.1928,
"step": 342000
},
{
"epoch": 26.544214523754167,
"grad_norm": 7.492802619934082,
"learning_rate": 2.3038569841638898e-06,
"loss": 2.1764,
"step": 342500
},
{
"epoch": 26.58296520189103,
"grad_norm": 6.618491172790527,
"learning_rate": 2.2780231987393112e-06,
"loss": 2.1768,
"step": 343000
},
{
"epoch": 26.6217158800279,
"grad_norm": 6.808167457580566,
"learning_rate": 2.252189413314733e-06,
"loss": 2.1623,
"step": 343500
},
{
"epoch": 26.660466558164767,
"grad_norm": 6.65431022644043,
"learning_rate": 2.226355627890155e-06,
"loss": 2.1658,
"step": 344000
},
{
"epoch": 26.699217236301635,
"grad_norm": 7.762594699859619,
"learning_rate": 2.2005218424655765e-06,
"loss": 2.1794,
"step": 344500
},
{
"epoch": 26.737967914438503,
"grad_norm": 6.6927056312561035,
"learning_rate": 2.1746880570409984e-06,
"loss": 2.1624,
"step": 345000
},
{
"epoch": 26.77671859257537,
"grad_norm": 6.606927394866943,
"learning_rate": 2.1488542716164203e-06,
"loss": 2.1741,
"step": 345500
},
{
"epoch": 26.81546927071224,
"grad_norm": 6.104671955108643,
"learning_rate": 2.1230204861918418e-06,
"loss": 2.1716,
"step": 346000
},
{
"epoch": 26.854219948849106,
"grad_norm": 5.965663433074951,
"learning_rate": 2.0971867007672637e-06,
"loss": 2.1674,
"step": 346500
},
{
"epoch": 26.89297062698597,
"grad_norm": 6.041355133056641,
"learning_rate": 2.071352915342685e-06,
"loss": 2.181,
"step": 347000
},
{
"epoch": 26.93172130512284,
"grad_norm": 7.279519557952881,
"learning_rate": 2.045519129918107e-06,
"loss": 2.1661,
"step": 347500
},
{
"epoch": 26.970471983259706,
"grad_norm": 6.790727615356445,
"learning_rate": 2.019685344493529e-06,
"loss": 2.1658,
"step": 348000
},
{
"epoch": 27.0,
"eval_loss": 2.137254238128662,
"eval_runtime": 268.8992,
"eval_samples_per_second": 767.815,
"eval_steps_per_second": 12.001,
"step": 348381
},
{
"epoch": 27.009222661396574,
"grad_norm": 6.905515193939209,
"learning_rate": 1.9938515590689504e-06,
"loss": 2.1569,
"step": 348500
},
{
"epoch": 27.047973339533442,
"grad_norm": 6.515853404998779,
"learning_rate": 1.9680177736443723e-06,
"loss": 2.1713,
"step": 349000
},
{
"epoch": 27.08672401767031,
"grad_norm": 6.981870651245117,
"learning_rate": 1.942183988219794e-06,
"loss": 2.1745,
"step": 349500
},
{
"epoch": 27.125474695807178,
"grad_norm": 6.35358190536499,
"learning_rate": 1.9163502027952157e-06,
"loss": 2.1644,
"step": 350000
},
{
"epoch": 27.164225373944046,
"grad_norm": 7.149428844451904,
"learning_rate": 1.8905164173706376e-06,
"loss": 2.1816,
"step": 350500
},
{
"epoch": 27.20297605208091,
"grad_norm": 7.136536121368408,
"learning_rate": 1.8646826319460593e-06,
"loss": 2.1562,
"step": 351000
},
{
"epoch": 27.241726730217778,
"grad_norm": 6.473196506500244,
"learning_rate": 1.838848846521481e-06,
"loss": 2.167,
"step": 351500
},
{
"epoch": 27.280477408354646,
"grad_norm": 6.8429694175720215,
"learning_rate": 1.8130150610969026e-06,
"loss": 2.1587,
"step": 352000
},
{
"epoch": 27.319228086491513,
"grad_norm": 6.667392253875732,
"learning_rate": 1.7871812756723245e-06,
"loss": 2.1575,
"step": 352500
},
{
"epoch": 27.35797876462838,
"grad_norm": 7.551825046539307,
"learning_rate": 1.7613474902477462e-06,
"loss": 2.1567,
"step": 353000
},
{
"epoch": 27.39672944276525,
"grad_norm": 7.393056392669678,
"learning_rate": 1.735513704823168e-06,
"loss": 2.163,
"step": 353500
},
{
"epoch": 27.435480120902117,
"grad_norm": 6.7227678298950195,
"learning_rate": 1.7096799193985896e-06,
"loss": 2.1678,
"step": 354000
},
{
"epoch": 27.474230799038985,
"grad_norm": 6.587380409240723,
"learning_rate": 1.6838461339740115e-06,
"loss": 2.1611,
"step": 354500
},
{
"epoch": 27.51298147717585,
"grad_norm": 7.290678977966309,
"learning_rate": 1.6580123485494332e-06,
"loss": 2.1555,
"step": 355000
},
{
"epoch": 27.551732155312717,
"grad_norm": 6.52154016494751,
"learning_rate": 1.6321785631248548e-06,
"loss": 2.1487,
"step": 355500
},
{
"epoch": 27.590482833449585,
"grad_norm": 6.613160610198975,
"learning_rate": 1.6063447777002765e-06,
"loss": 2.1599,
"step": 356000
},
{
"epoch": 27.629233511586452,
"grad_norm": 7.148532390594482,
"learning_rate": 1.5805109922756984e-06,
"loss": 2.1731,
"step": 356500
},
{
"epoch": 27.66798418972332,
"grad_norm": 6.29647159576416,
"learning_rate": 1.5546772068511201e-06,
"loss": 2.1641,
"step": 357000
},
{
"epoch": 27.706734867860188,
"grad_norm": 6.647765636444092,
"learning_rate": 1.5288434214265418e-06,
"loss": 2.1756,
"step": 357500
},
{
"epoch": 27.745485545997056,
"grad_norm": 6.541094779968262,
"learning_rate": 1.5030096360019635e-06,
"loss": 2.1584,
"step": 358000
},
{
"epoch": 27.784236224133924,
"grad_norm": 7.08396053314209,
"learning_rate": 1.4771758505773854e-06,
"loss": 2.1551,
"step": 358500
},
{
"epoch": 27.822986902270788,
"grad_norm": 6.8339643478393555,
"learning_rate": 1.451342065152807e-06,
"loss": 2.1575,
"step": 359000
},
{
"epoch": 27.861737580407656,
"grad_norm": 6.175314903259277,
"learning_rate": 1.4255082797282288e-06,
"loss": 2.1312,
"step": 359500
},
{
"epoch": 27.900488258544524,
"grad_norm": 6.25184965133667,
"learning_rate": 1.3996744943036504e-06,
"loss": 2.1509,
"step": 360000
},
{
"epoch": 27.93923893668139,
"grad_norm": 7.08027982711792,
"learning_rate": 1.3738407088790723e-06,
"loss": 2.159,
"step": 360500
},
{
"epoch": 27.97798961481826,
"grad_norm": 6.8008880615234375,
"learning_rate": 1.348006923454494e-06,
"loss": 2.1634,
"step": 361000
},
{
"epoch": 28.0,
"eval_loss": 2.130622386932373,
"eval_runtime": 269.6844,
"eval_samples_per_second": 765.58,
"eval_steps_per_second": 11.966,
"step": 361284
},
{
"epoch": 28.016740292955127,
"grad_norm": 7.46795654296875,
"learning_rate": 1.3221731380299157e-06,
"loss": 2.1363,
"step": 361500
},
{
"epoch": 28.055490971091995,
"grad_norm": 7.271740436553955,
"learning_rate": 1.2963393526053374e-06,
"loss": 2.1604,
"step": 362000
},
{
"epoch": 28.094241649228863,
"grad_norm": 6.692265510559082,
"learning_rate": 1.2705055671807593e-06,
"loss": 2.1596,
"step": 362500
},
{
"epoch": 28.132992327365727,
"grad_norm": 6.122591018676758,
"learning_rate": 1.2446717817561808e-06,
"loss": 2.1524,
"step": 363000
},
{
"epoch": 28.171743005502595,
"grad_norm": 6.683858394622803,
"learning_rate": 1.2188379963316027e-06,
"loss": 2.1598,
"step": 363500
},
{
"epoch": 28.210493683639463,
"grad_norm": 6.768929958343506,
"learning_rate": 1.1930042109070243e-06,
"loss": 2.1515,
"step": 364000
},
{
"epoch": 28.24924436177633,
"grad_norm": 6.956704139709473,
"learning_rate": 1.167170425482446e-06,
"loss": 2.1552,
"step": 364500
},
{
"epoch": 28.2879950399132,
"grad_norm": 6.655780792236328,
"learning_rate": 1.1413366400578677e-06,
"loss": 2.1551,
"step": 365000
},
{
"epoch": 28.326745718050066,
"grad_norm": 7.394413471221924,
"learning_rate": 1.1155028546332896e-06,
"loss": 2.1465,
"step": 365500
},
{
"epoch": 28.365496396186934,
"grad_norm": 7.250267505645752,
"learning_rate": 1.0896690692087113e-06,
"loss": 2.1729,
"step": 366000
},
{
"epoch": 28.404247074323802,
"grad_norm": 6.102252960205078,
"learning_rate": 1.063835283784133e-06,
"loss": 2.1556,
"step": 366500
},
{
"epoch": 28.44299775246067,
"grad_norm": 6.5598297119140625,
"learning_rate": 1.0380014983595547e-06,
"loss": 2.1473,
"step": 367000
},
{
"epoch": 28.481748430597534,
"grad_norm": 7.368846416473389,
"learning_rate": 1.0121677129349766e-06,
"loss": 2.1552,
"step": 367500
},
{
"epoch": 28.520499108734402,
"grad_norm": 6.635545253753662,
"learning_rate": 9.863339275103983e-07,
"loss": 2.1584,
"step": 368000
},
{
"epoch": 28.55924978687127,
"grad_norm": 6.502518177032471,
"learning_rate": 9.6050014208582e-07,
"loss": 2.1669,
"step": 368500
},
{
"epoch": 28.598000465008138,
"grad_norm": 7.150147914886475,
"learning_rate": 9.346663566612417e-07,
"loss": 2.158,
"step": 369000
},
{
"epoch": 28.636751143145005,
"grad_norm": 6.391610622406006,
"learning_rate": 9.088325712366634e-07,
"loss": 2.1464,
"step": 369500
},
{
"epoch": 28.675501821281873,
"grad_norm": 6.436591625213623,
"learning_rate": 8.829987858120852e-07,
"loss": 2.1438,
"step": 370000
},
{
"epoch": 28.71425249941874,
"grad_norm": 6.646981716156006,
"learning_rate": 8.571650003875069e-07,
"loss": 2.1507,
"step": 370500
},
{
"epoch": 28.753003177555605,
"grad_norm": 6.943175792694092,
"learning_rate": 8.313312149629287e-07,
"loss": 2.1483,
"step": 371000
},
{
"epoch": 28.791753855692473,
"grad_norm": 6.345837116241455,
"learning_rate": 8.054974295383504e-07,
"loss": 2.1662,
"step": 371500
},
{
"epoch": 28.83050453382934,
"grad_norm": 6.562370300292969,
"learning_rate": 7.79663644113772e-07,
"loss": 2.1554,
"step": 372000
},
{
"epoch": 28.86925521196621,
"grad_norm": 6.556326866149902,
"learning_rate": 7.538298586891937e-07,
"loss": 2.1448,
"step": 372500
},
{
"epoch": 28.908005890103077,
"grad_norm": 6.487407684326172,
"learning_rate": 7.279960732646154e-07,
"loss": 2.1425,
"step": 373000
},
{
"epoch": 28.946756568239945,
"grad_norm": 7.614674091339111,
"learning_rate": 7.021622878400372e-07,
"loss": 2.1565,
"step": 373500
},
{
"epoch": 28.985507246376812,
"grad_norm": 6.897189140319824,
"learning_rate": 6.763285024154589e-07,
"loss": 2.1438,
"step": 374000
},
{
"epoch": 29.0,
"eval_loss": 2.1226651668548584,
"eval_runtime": 266.9511,
"eval_samples_per_second": 773.419,
"eval_steps_per_second": 12.088,
"step": 374187
},
{
"epoch": 29.02425792451368,
"grad_norm": 6.869750499725342,
"learning_rate": 6.504947169908807e-07,
"loss": 2.138,
"step": 374500
},
{
"epoch": 29.063008602650548,
"grad_norm": 7.1249589920043945,
"learning_rate": 6.246609315663025e-07,
"loss": 2.1527,
"step": 375000
},
{
"epoch": 29.101759280787412,
"grad_norm": 7.201192378997803,
"learning_rate": 5.988271461417243e-07,
"loss": 2.1517,
"step": 375500
},
{
"epoch": 29.14050995892428,
"grad_norm": 6.720222473144531,
"learning_rate": 5.729933607171459e-07,
"loss": 2.1526,
"step": 376000
},
{
"epoch": 29.179260637061148,
"grad_norm": 6.9030866622924805,
"learning_rate": 5.471595752925676e-07,
"loss": 2.1532,
"step": 376500
},
{
"epoch": 29.218011315198016,
"grad_norm": 5.900801181793213,
"learning_rate": 5.213257898679893e-07,
"loss": 2.1534,
"step": 377000
},
{
"epoch": 29.256761993334884,
"grad_norm": 6.259501934051514,
"learning_rate": 4.954920044434111e-07,
"loss": 2.1621,
"step": 377500
},
{
"epoch": 29.29551267147175,
"grad_norm": 6.566405296325684,
"learning_rate": 4.6965821901883286e-07,
"loss": 2.1621,
"step": 378000
},
{
"epoch": 29.33426334960862,
"grad_norm": 6.553793430328369,
"learning_rate": 4.438244335942546e-07,
"loss": 2.1631,
"step": 378500
},
{
"epoch": 29.373014027745487,
"grad_norm": 6.773620128631592,
"learning_rate": 4.1799064816967634e-07,
"loss": 2.1556,
"step": 379000
},
{
"epoch": 29.41176470588235,
"grad_norm": 6.494615077972412,
"learning_rate": 3.921568627450981e-07,
"loss": 2.1554,
"step": 379500
},
{
"epoch": 29.45051538401922,
"grad_norm": 7.172949314117432,
"learning_rate": 3.663230773205198e-07,
"loss": 2.1494,
"step": 380000
},
{
"epoch": 29.489266062156087,
"grad_norm": 6.8991241455078125,
"learning_rate": 3.4048929189594155e-07,
"loss": 2.1406,
"step": 380500
},
{
"epoch": 29.528016740292955,
"grad_norm": 7.046799182891846,
"learning_rate": 3.146555064713633e-07,
"loss": 2.1493,
"step": 381000
},
{
"epoch": 29.566767418429823,
"grad_norm": 6.826701641082764,
"learning_rate": 2.8882172104678503e-07,
"loss": 2.1459,
"step": 381500
},
{
"epoch": 29.60551809656669,
"grad_norm": 6.649389743804932,
"learning_rate": 2.6298793562220677e-07,
"loss": 2.1593,
"step": 382000
},
{
"epoch": 29.64426877470356,
"grad_norm": 6.10260009765625,
"learning_rate": 2.3715415019762845e-07,
"loss": 2.151,
"step": 382500
},
{
"epoch": 29.683019452840426,
"grad_norm": 6.9101128578186035,
"learning_rate": 2.113203647730502e-07,
"loss": 2.1625,
"step": 383000
},
{
"epoch": 29.72177013097729,
"grad_norm": 6.671387672424316,
"learning_rate": 1.8548657934847193e-07,
"loss": 2.1496,
"step": 383500
},
{
"epoch": 29.76052080911416,
"grad_norm": 7.705864429473877,
"learning_rate": 1.5965279392389367e-07,
"loss": 2.1496,
"step": 384000
},
{
"epoch": 29.799271487251026,
"grad_norm": 6.319827079772949,
"learning_rate": 1.338190084993154e-07,
"loss": 2.149,
"step": 384500
},
{
"epoch": 29.838022165387894,
"grad_norm": 6.850592613220215,
"learning_rate": 1.0798522307473716e-07,
"loss": 2.1511,
"step": 385000
},
{
"epoch": 29.876772843524762,
"grad_norm": 7.166690826416016,
"learning_rate": 8.21514376501589e-08,
"loss": 2.1492,
"step": 385500
},
{
"epoch": 29.91552352166163,
"grad_norm": 6.324465274810791,
"learning_rate": 5.631765222558062e-08,
"loss": 2.1523,
"step": 386000
},
{
"epoch": 29.954274199798498,
"grad_norm": 7.214087009429932,
"learning_rate": 3.048386680100235e-08,
"loss": 2.1628,
"step": 386500
},
{
"epoch": 29.993024877935365,
"grad_norm": 6.88369607925415,
"learning_rate": 4.6500813764240875e-09,
"loss": 2.144,
"step": 387000
},
{
"epoch": 30.0,
"eval_loss": 2.125218629837036,
"eval_runtime": 267.0237,
"eval_samples_per_second": 773.209,
"eval_steps_per_second": 12.085,
"step": 387090
}
],
"logging_steps": 500,
"max_steps": 387090,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.706577784666885e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}