{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 1376,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014534883720930232,
"grad_norm": 4.90753978197305,
"learning_rate": 9.999986968251269e-06,
"loss": 0.2756,
"step": 1
},
{
"epoch": 0.0029069767441860465,
"grad_norm": 2.0826321480239645,
"learning_rate": 9.99994787307301e-06,
"loss": 0.2055,
"step": 2
},
{
"epoch": 0.00436046511627907,
"grad_norm": 2.0071189367521165,
"learning_rate": 9.999882714669009e-06,
"loss": 0.2428,
"step": 3
},
{
"epoch": 0.005813953488372093,
"grad_norm": 1.8308409567606185,
"learning_rate": 9.99979149337892e-06,
"loss": 0.1988,
"step": 4
},
{
"epoch": 0.007267441860465116,
"grad_norm": 2.1523095681813,
"learning_rate": 9.999674209678253e-06,
"loss": 0.2308,
"step": 5
},
{
"epoch": 0.00872093023255814,
"grad_norm": 2.042843363698041,
"learning_rate": 9.999530864178371e-06,
"loss": 0.1847,
"step": 6
},
{
"epoch": 0.010174418604651164,
"grad_norm": 1.7333272867252485,
"learning_rate": 9.999361457626493e-06,
"loss": 0.2195,
"step": 7
},
{
"epoch": 0.011627906976744186,
"grad_norm": 1.68765684619863,
"learning_rate": 9.999165990905684e-06,
"loss": 0.1968,
"step": 8
},
{
"epoch": 0.01308139534883721,
"grad_norm": 2.3397968827666062,
"learning_rate": 9.99894446503485e-06,
"loss": 0.1655,
"step": 9
},
{
"epoch": 0.014534883720930232,
"grad_norm": 1.8740327980184355,
"learning_rate": 9.998696881168743e-06,
"loss": 0.2018,
"step": 10
},
{
"epoch": 0.015988372093023256,
"grad_norm": 1.8903939988289815,
"learning_rate": 9.998423240597942e-06,
"loss": 0.1755,
"step": 11
},
{
"epoch": 0.01744186046511628,
"grad_norm": 2.536766624700735,
"learning_rate": 9.998123544748852e-06,
"loss": 0.2422,
"step": 12
},
{
"epoch": 0.0188953488372093,
"grad_norm": 2.019676282269962,
"learning_rate": 9.997797795183699e-06,
"loss": 0.1937,
"step": 13
},
{
"epoch": 0.020348837209302327,
"grad_norm": 1.717985207230675,
"learning_rate": 9.997445993600516e-06,
"loss": 0.1596,
"step": 14
},
{
"epoch": 0.02180232558139535,
"grad_norm": 1.7865449356650134,
"learning_rate": 9.99706814183314e-06,
"loss": 0.1943,
"step": 15
},
{
"epoch": 0.023255813953488372,
"grad_norm": 1.9620864278100416,
"learning_rate": 9.996664241851197e-06,
"loss": 0.1623,
"step": 16
},
{
"epoch": 0.024709302325581394,
"grad_norm": 1.8373307354674375,
"learning_rate": 9.996234295760099e-06,
"loss": 0.2007,
"step": 17
},
{
"epoch": 0.02616279069767442,
"grad_norm": 2.227861617466709,
"learning_rate": 9.995778305801025e-06,
"loss": 0.18,
"step": 18
},
{
"epoch": 0.027616279069767442,
"grad_norm": 1.6895392880355353,
"learning_rate": 9.995296274350912e-06,
"loss": 0.1483,
"step": 19
},
{
"epoch": 0.029069767441860465,
"grad_norm": 1.7640692153593953,
"learning_rate": 9.994788203922447e-06,
"loss": 0.149,
"step": 20
},
{
"epoch": 0.030523255813953487,
"grad_norm": 1.455602398611072,
"learning_rate": 9.994254097164047e-06,
"loss": 0.1534,
"step": 21
},
{
"epoch": 0.03197674418604651,
"grad_norm": 1.932687065520904,
"learning_rate": 9.993693956859849e-06,
"loss": 0.1628,
"step": 22
},
{
"epoch": 0.03343023255813953,
"grad_norm": 1.86140975395309,
"learning_rate": 9.9931077859297e-06,
"loss": 0.189,
"step": 23
},
{
"epoch": 0.03488372093023256,
"grad_norm": 1.6940825298691309,
"learning_rate": 9.99249558742913e-06,
"loss": 0.1636,
"step": 24
},
{
"epoch": 0.036337209302325583,
"grad_norm": 1.9970017291615039,
"learning_rate": 9.991857364549347e-06,
"loss": 0.1354,
"step": 25
},
{
"epoch": 0.0377906976744186,
"grad_norm": 1.611614767401661,
"learning_rate": 9.991193120617213e-06,
"loss": 0.1271,
"step": 26
},
{
"epoch": 0.03924418604651163,
"grad_norm": 1.6668640058803244,
"learning_rate": 9.990502859095234e-06,
"loss": 0.1471,
"step": 27
},
{
"epoch": 0.040697674418604654,
"grad_norm": 1.6279188049671565,
"learning_rate": 9.989786583581535e-06,
"loss": 0.1672,
"step": 28
},
{
"epoch": 0.04215116279069767,
"grad_norm": 1.7050930977268255,
"learning_rate": 9.989044297809846e-06,
"loss": 0.1621,
"step": 29
},
{
"epoch": 0.0436046511627907,
"grad_norm": 1.6059761856688315,
"learning_rate": 9.98827600564948e-06,
"loss": 0.1413,
"step": 30
},
{
"epoch": 0.04505813953488372,
"grad_norm": 1.657144516453862,
"learning_rate": 9.987481711105312e-06,
"loss": 0.1747,
"step": 31
},
{
"epoch": 0.046511627906976744,
"grad_norm": 1.5640565805511477,
"learning_rate": 9.986661418317759e-06,
"loss": 0.1553,
"step": 32
},
{
"epoch": 0.04796511627906977,
"grad_norm": 1.3799874670130874,
"learning_rate": 9.985815131562765e-06,
"loss": 0.1185,
"step": 33
},
{
"epoch": 0.04941860465116279,
"grad_norm": 1.6674345854485941,
"learning_rate": 9.984942855251765e-06,
"loss": 0.1666,
"step": 34
},
{
"epoch": 0.050872093023255814,
"grad_norm": 1.6727808497465833,
"learning_rate": 9.984044593931674e-06,
"loss": 0.1918,
"step": 35
},
{
"epoch": 0.05232558139534884,
"grad_norm": 1.5015093467589995,
"learning_rate": 9.983120352284861e-06,
"loss": 0.1464,
"step": 36
},
{
"epoch": 0.05377906976744186,
"grad_norm": 1.505665041651918,
"learning_rate": 9.982170135129116e-06,
"loss": 0.1259,
"step": 37
},
{
"epoch": 0.055232558139534885,
"grad_norm": 1.5003368863482498,
"learning_rate": 9.981193947417638e-06,
"loss": 0.1187,
"step": 38
},
{
"epoch": 0.056686046511627904,
"grad_norm": 1.900977624703376,
"learning_rate": 9.980191794239e-06,
"loss": 0.1584,
"step": 39
},
{
"epoch": 0.05813953488372093,
"grad_norm": 2.1817255633060855,
"learning_rate": 9.979163680817124e-06,
"loss": 0.1711,
"step": 40
},
{
"epoch": 0.059593023255813955,
"grad_norm": 1.9211415894532136,
"learning_rate": 9.978109612511257e-06,
"loss": 0.1723,
"step": 41
},
{
"epoch": 0.061046511627906974,
"grad_norm": 3.230526894351387,
"learning_rate": 9.977029594815942e-06,
"loss": 0.2277,
"step": 42
},
{
"epoch": 0.0625,
"grad_norm": 1.6529179532438223,
"learning_rate": 9.975923633360985e-06,
"loss": 0.1547,
"step": 43
},
{
"epoch": 0.06395348837209303,
"grad_norm": 1.7575964357251361,
"learning_rate": 9.974791733911431e-06,
"loss": 0.2112,
"step": 44
},
{
"epoch": 0.06540697674418605,
"grad_norm": 1.7248448102436156,
"learning_rate": 9.973633902367532e-06,
"loss": 0.1583,
"step": 45
},
{
"epoch": 0.06686046511627906,
"grad_norm": 1.747621900146478,
"learning_rate": 9.972450144764713e-06,
"loss": 0.1747,
"step": 46
},
{
"epoch": 0.06831395348837209,
"grad_norm": 1.8290141742667902,
"learning_rate": 9.971240467273552e-06,
"loss": 0.1524,
"step": 47
},
{
"epoch": 0.06976744186046512,
"grad_norm": 1.8430702236241727,
"learning_rate": 9.970004876199731e-06,
"loss": 0.1496,
"step": 48
},
{
"epoch": 0.07122093023255814,
"grad_norm": 2.2055666937974614,
"learning_rate": 9.968743377984013e-06,
"loss": 0.17,
"step": 49
},
{
"epoch": 0.07267441860465117,
"grad_norm": 1.538052515526653,
"learning_rate": 9.967455979202214e-06,
"loss": 0.154,
"step": 50
},
{
"epoch": 0.07412790697674419,
"grad_norm": 1.633366596107583,
"learning_rate": 9.966142686565155e-06,
"loss": 0.1578,
"step": 51
},
{
"epoch": 0.0755813953488372,
"grad_norm": 1.6546648500780583,
"learning_rate": 9.964803506918634e-06,
"loss": 0.16,
"step": 52
},
{
"epoch": 0.07703488372093023,
"grad_norm": 1.6111923238734047,
"learning_rate": 9.963438447243394e-06,
"loss": 0.1182,
"step": 53
},
{
"epoch": 0.07848837209302326,
"grad_norm": 1.5663545251276552,
"learning_rate": 9.96204751465508e-06,
"loss": 0.1348,
"step": 54
},
{
"epoch": 0.07994186046511628,
"grad_norm": 1.5491604249911004,
"learning_rate": 9.960630716404205e-06,
"loss": 0.1305,
"step": 55
},
{
"epoch": 0.08139534883720931,
"grad_norm": 1.5904201245888179,
"learning_rate": 9.959188059876115e-06,
"loss": 0.1485,
"step": 56
},
{
"epoch": 0.08284883720930232,
"grad_norm": 1.8216437492726838,
"learning_rate": 9.957719552590944e-06,
"loss": 0.1787,
"step": 57
},
{
"epoch": 0.08430232558139535,
"grad_norm": 1.423635987100293,
"learning_rate": 9.956225202203576e-06,
"loss": 0.1335,
"step": 58
},
{
"epoch": 0.08575581395348837,
"grad_norm": 1.651615225486003,
"learning_rate": 9.954705016503614e-06,
"loss": 0.1339,
"step": 59
},
{
"epoch": 0.0872093023255814,
"grad_norm": 1.658410900976534,
"learning_rate": 9.95315900341533e-06,
"loss": 0.1618,
"step": 60
},
{
"epoch": 0.08866279069767442,
"grad_norm": 1.7421372069577428,
"learning_rate": 9.951587170997621e-06,
"loss": 0.1559,
"step": 61
},
{
"epoch": 0.09011627906976744,
"grad_norm": 1.6410793659457708,
"learning_rate": 9.949989527443982e-06,
"loss": 0.1187,
"step": 62
},
{
"epoch": 0.09156976744186046,
"grad_norm": 1.3084758751505552,
"learning_rate": 9.948366081082446e-06,
"loss": 0.1078,
"step": 63
},
{
"epoch": 0.09302325581395349,
"grad_norm": 1.8532352795685818,
"learning_rate": 9.946716840375552e-06,
"loss": 0.1787,
"step": 64
},
{
"epoch": 0.09447674418604651,
"grad_norm": 1.7832244807577888,
"learning_rate": 9.945041813920296e-06,
"loss": 0.1327,
"step": 65
},
{
"epoch": 0.09593023255813954,
"grad_norm": 1.5586347200013577,
"learning_rate": 9.943341010448086e-06,
"loss": 0.1254,
"step": 66
},
{
"epoch": 0.09738372093023256,
"grad_norm": 1.3472131423550082,
"learning_rate": 9.941614438824703e-06,
"loss": 0.1473,
"step": 67
},
{
"epoch": 0.09883720930232558,
"grad_norm": 1.6722596945098753,
"learning_rate": 9.939862108050244e-06,
"loss": 0.1816,
"step": 68
},
{
"epoch": 0.1002906976744186,
"grad_norm": 1.5162995990299708,
"learning_rate": 9.93808402725908e-06,
"loss": 0.1418,
"step": 69
},
{
"epoch": 0.10174418604651163,
"grad_norm": 1.7264251629725236,
"learning_rate": 9.936280205719817e-06,
"loss": 0.1845,
"step": 70
},
{
"epoch": 0.10319767441860465,
"grad_norm": 1.8868123011708366,
"learning_rate": 9.934450652835233e-06,
"loss": 0.1454,
"step": 71
},
{
"epoch": 0.10465116279069768,
"grad_norm": 1.6996113668126296,
"learning_rate": 9.932595378142233e-06,
"loss": 0.1611,
"step": 72
},
{
"epoch": 0.10610465116279069,
"grad_norm": 1.3926857562718415,
"learning_rate": 9.930714391311813e-06,
"loss": 0.13,
"step": 73
},
{
"epoch": 0.10755813953488372,
"grad_norm": 1.3730967025306435,
"learning_rate": 9.928807702148986e-06,
"loss": 0.14,
"step": 74
},
{
"epoch": 0.10901162790697674,
"grad_norm": 1.6148257033625841,
"learning_rate": 9.926875320592756e-06,
"loss": 0.2184,
"step": 75
},
{
"epoch": 0.11046511627906977,
"grad_norm": 1.6770555686431594,
"learning_rate": 9.924917256716042e-06,
"loss": 0.1343,
"step": 76
},
{
"epoch": 0.1119186046511628,
"grad_norm": 1.5552428567558088,
"learning_rate": 9.922933520725645e-06,
"loss": 0.1228,
"step": 77
},
{
"epoch": 0.11337209302325581,
"grad_norm": 1.6238590358211495,
"learning_rate": 9.920924122962185e-06,
"loss": 0.1883,
"step": 78
},
{
"epoch": 0.11482558139534883,
"grad_norm": 1.3715125415573524,
"learning_rate": 9.918889073900046e-06,
"loss": 0.1204,
"step": 79
},
{
"epoch": 0.11627906976744186,
"grad_norm": 1.5437673536268413,
"learning_rate": 9.91682838414733e-06,
"loss": 0.1443,
"step": 80
},
{
"epoch": 0.11773255813953488,
"grad_norm": 1.9238885575325282,
"learning_rate": 9.914742064445795e-06,
"loss": 0.151,
"step": 81
},
{
"epoch": 0.11918604651162791,
"grad_norm": 1.5000041779719893,
"learning_rate": 9.912630125670793e-06,
"loss": 0.1478,
"step": 82
},
{
"epoch": 0.12063953488372094,
"grad_norm": 1.4161989982638625,
"learning_rate": 9.910492578831231e-06,
"loss": 0.1205,
"step": 83
},
{
"epoch": 0.12209302325581395,
"grad_norm": 1.887456331836179,
"learning_rate": 9.908329435069495e-06,
"loss": 0.1892,
"step": 84
},
{
"epoch": 0.12354651162790697,
"grad_norm": 1.4188322937106472,
"learning_rate": 9.906140705661406e-06,
"loss": 0.1354,
"step": 85
},
{
"epoch": 0.125,
"grad_norm": 1.4674049503326925,
"learning_rate": 9.903926402016153e-06,
"loss": 0.1188,
"step": 86
},
{
"epoch": 0.12645348837209303,
"grad_norm": 1.7991239782209092,
"learning_rate": 9.901686535676233e-06,
"loss": 0.1758,
"step": 87
},
{
"epoch": 0.12790697674418605,
"grad_norm": 1.755012463932795,
"learning_rate": 9.899421118317399e-06,
"loss": 0.1811,
"step": 88
},
{
"epoch": 0.12936046511627908,
"grad_norm": 1.6961857675852638,
"learning_rate": 9.897130161748588e-06,
"loss": 0.1617,
"step": 89
},
{
"epoch": 0.1308139534883721,
"grad_norm": 1.8355331445231158,
"learning_rate": 9.894813677911868e-06,
"loss": 0.1709,
"step": 90
},
{
"epoch": 0.13226744186046513,
"grad_norm": 1.637508412196323,
"learning_rate": 9.892471678882377e-06,
"loss": 0.1467,
"step": 91
},
{
"epoch": 0.13372093023255813,
"grad_norm": 2.2746840658968677,
"learning_rate": 9.890104176868246e-06,
"loss": 0.1416,
"step": 92
},
{
"epoch": 0.13517441860465115,
"grad_norm": 1.7646971278296149,
"learning_rate": 9.887711184210559e-06,
"loss": 0.1597,
"step": 93
},
{
"epoch": 0.13662790697674418,
"grad_norm": 1.8116059351522782,
"learning_rate": 9.885292713383264e-06,
"loss": 0.1599,
"step": 94
},
{
"epoch": 0.1380813953488372,
"grad_norm": 1.8626165104101766,
"learning_rate": 9.882848776993119e-06,
"loss": 0.1377,
"step": 95
},
{
"epoch": 0.13953488372093023,
"grad_norm": 1.5910215156996428,
"learning_rate": 9.880379387779637e-06,
"loss": 0.1596,
"step": 96
},
{
"epoch": 0.14098837209302326,
"grad_norm": 1.3431975319409974,
"learning_rate": 9.877884558614997e-06,
"loss": 0.1346,
"step": 97
},
{
"epoch": 0.14244186046511628,
"grad_norm": 1.3362805614728352,
"learning_rate": 9.875364302503995e-06,
"loss": 0.1127,
"step": 98
},
{
"epoch": 0.1438953488372093,
"grad_norm": 1.8199783165196086,
"learning_rate": 9.872818632583969e-06,
"loss": 0.1604,
"step": 99
},
{
"epoch": 0.14534883720930233,
"grad_norm": 1.6827249026814992,
"learning_rate": 9.870247562124731e-06,
"loss": 0.1346,
"step": 100
},
{
"epoch": 0.14680232558139536,
"grad_norm": 1.4674119242601242,
"learning_rate": 9.8676511045285e-06,
"loss": 0.1168,
"step": 101
},
{
"epoch": 0.14825581395348839,
"grad_norm": 1.6729272753403375,
"learning_rate": 9.865029273329826e-06,
"loss": 0.1478,
"step": 102
},
{
"epoch": 0.14970930232558138,
"grad_norm": 1.716840091927574,
"learning_rate": 9.862382082195531e-06,
"loss": 0.1494,
"step": 103
},
{
"epoch": 0.1511627906976744,
"grad_norm": 1.8089594266192164,
"learning_rate": 9.859709544924624e-06,
"loss": 0.1362,
"step": 104
},
{
"epoch": 0.15261627906976744,
"grad_norm": 1.5150282280782363,
"learning_rate": 9.85701167544824e-06,
"loss": 0.1684,
"step": 105
},
{
"epoch": 0.15406976744186046,
"grad_norm": 1.523111761533529,
"learning_rate": 9.854288487829561e-06,
"loss": 0.145,
"step": 106
},
{
"epoch": 0.1555232558139535,
"grad_norm": 1.5473976095842301,
"learning_rate": 9.851539996263748e-06,
"loss": 0.1349,
"step": 107
},
{
"epoch": 0.1569767441860465,
"grad_norm": 1.9497016837218222,
"learning_rate": 9.848766215077859e-06,
"loss": 0.1751,
"step": 108
},
{
"epoch": 0.15843023255813954,
"grad_norm": 1.8085748930229228,
"learning_rate": 9.845967158730783e-06,
"loss": 0.1401,
"step": 109
},
{
"epoch": 0.15988372093023256,
"grad_norm": 1.7391260062163252,
"learning_rate": 9.843142841813158e-06,
"loss": 0.1599,
"step": 110
},
{
"epoch": 0.1613372093023256,
"grad_norm": 1.6195797255004982,
"learning_rate": 9.840293279047302e-06,
"loss": 0.1632,
"step": 111
},
{
"epoch": 0.16279069767441862,
"grad_norm": 1.82089731812025,
"learning_rate": 9.837418485287126e-06,
"loss": 0.1355,
"step": 112
},
{
"epoch": 0.16424418604651161,
"grad_norm": 1.7953380194093425,
"learning_rate": 9.83451847551807e-06,
"loss": 0.1635,
"step": 113
},
{
"epoch": 0.16569767441860464,
"grad_norm": 1.6497571195888396,
"learning_rate": 9.831593264857011e-06,
"loss": 0.1563,
"step": 114
},
{
"epoch": 0.16715116279069767,
"grad_norm": 1.4404520426344958,
"learning_rate": 9.828642868552195e-06,
"loss": 0.142,
"step": 115
},
{
"epoch": 0.1686046511627907,
"grad_norm": 1.5292175625418722,
"learning_rate": 9.825667301983149e-06,
"loss": 0.1322,
"step": 116
},
{
"epoch": 0.17005813953488372,
"grad_norm": 2.0789140113841236,
"learning_rate": 9.822666580660606e-06,
"loss": 0.1272,
"step": 117
},
{
"epoch": 0.17151162790697674,
"grad_norm": 1.5437780941386026,
"learning_rate": 9.819640720226429e-06,
"loss": 0.1699,
"step": 118
},
{
"epoch": 0.17296511627906977,
"grad_norm": 1.5471057200671505,
"learning_rate": 9.816589736453516e-06,
"loss": 0.1233,
"step": 119
},
{
"epoch": 0.1744186046511628,
"grad_norm": 1.5206963754435927,
"learning_rate": 9.81351364524573e-06,
"loss": 0.1265,
"step": 120
},
{
"epoch": 0.17587209302325582,
"grad_norm": 2.2824687888973947,
"learning_rate": 9.81041246263781e-06,
"loss": 0.1806,
"step": 121
},
{
"epoch": 0.17732558139534885,
"grad_norm": 1.490348186092453,
"learning_rate": 9.807286204795287e-06,
"loss": 0.1254,
"step": 122
},
{
"epoch": 0.17877906976744187,
"grad_norm": 1.4547407899623217,
"learning_rate": 9.804134888014407e-06,
"loss": 0.1669,
"step": 123
},
{
"epoch": 0.18023255813953487,
"grad_norm": 1.3955416003002072,
"learning_rate": 9.800958528722035e-06,
"loss": 0.1227,
"step": 124
},
{
"epoch": 0.1816860465116279,
"grad_norm": 1.366189834603044,
"learning_rate": 9.797757143475577e-06,
"loss": 0.1141,
"step": 125
},
{
"epoch": 0.18313953488372092,
"grad_norm": 1.784363750973548,
"learning_rate": 9.794530748962894e-06,
"loss": 0.1545,
"step": 126
},
{
"epoch": 0.18459302325581395,
"grad_norm": 1.694786135391199,
"learning_rate": 9.791279362002212e-06,
"loss": 0.1441,
"step": 127
},
{
"epoch": 0.18604651162790697,
"grad_norm": 1.5799996656623245,
"learning_rate": 9.78800299954203e-06,
"loss": 0.1495,
"step": 128
},
{
"epoch": 0.1875,
"grad_norm": 1.810120051088718,
"learning_rate": 9.784701678661045e-06,
"loss": 0.1489,
"step": 129
},
{
"epoch": 0.18895348837209303,
"grad_norm": 1.6065641177622767,
"learning_rate": 9.781375416568048e-06,
"loss": 0.1498,
"step": 130
},
{
"epoch": 0.19040697674418605,
"grad_norm": 1.6146187321984817,
"learning_rate": 9.778024230601846e-06,
"loss": 0.1616,
"step": 131
},
{
"epoch": 0.19186046511627908,
"grad_norm": 1.5540139065991918,
"learning_rate": 9.774648138231163e-06,
"loss": 0.15,
"step": 132
},
{
"epoch": 0.1933139534883721,
"grad_norm": 1.692543845359626,
"learning_rate": 9.771247157054554e-06,
"loss": 0.1459,
"step": 133
},
{
"epoch": 0.19476744186046513,
"grad_norm": 1.9872344731203477,
"learning_rate": 9.767821304800312e-06,
"loss": 0.153,
"step": 134
},
{
"epoch": 0.19622093023255813,
"grad_norm": 2.1160080220932027,
"learning_rate": 9.764370599326375e-06,
"loss": 0.179,
"step": 135
},
{
"epoch": 0.19767441860465115,
"grad_norm": 1.6129189257388363,
"learning_rate": 9.760895058620236e-06,
"loss": 0.1689,
"step": 136
},
{
"epoch": 0.19912790697674418,
"grad_norm": 1.650799570186251,
"learning_rate": 9.75739470079884e-06,
"loss": 0.1671,
"step": 137
},
{
"epoch": 0.2005813953488372,
"grad_norm": 2.030672631597455,
"learning_rate": 9.753869544108504e-06,
"loss": 0.18,
"step": 138
},
{
"epoch": 0.20203488372093023,
"grad_norm": 1.319224461569878,
"learning_rate": 9.75031960692481e-06,
"loss": 0.1483,
"step": 139
},
{
"epoch": 0.20348837209302326,
"grad_norm": 1.407548702023548,
"learning_rate": 9.74674490775251e-06,
"loss": 0.1333,
"step": 140
},
{
"epoch": 0.20494186046511628,
"grad_norm": 1.4802679481612602,
"learning_rate": 9.743145465225443e-06,
"loss": 0.1117,
"step": 141
},
{
"epoch": 0.2063953488372093,
"grad_norm": 1.4078486946857556,
"learning_rate": 9.739521298106417e-06,
"loss": 0.1307,
"step": 142
},
{
"epoch": 0.20784883720930233,
"grad_norm": 1.9988611788754047,
"learning_rate": 9.735872425287124e-06,
"loss": 0.1269,
"step": 143
},
{
"epoch": 0.20930232558139536,
"grad_norm": 1.4797178696285465,
"learning_rate": 9.732198865788047e-06,
"loss": 0.1559,
"step": 144
},
{
"epoch": 0.21075581395348839,
"grad_norm": 1.6049649292469328,
"learning_rate": 9.728500638758345e-06,
"loss": 0.1381,
"step": 145
},
{
"epoch": 0.21220930232558138,
"grad_norm": 1.811631968427576,
"learning_rate": 9.724777763475765e-06,
"loss": 0.1637,
"step": 146
},
{
"epoch": 0.2136627906976744,
"grad_norm": 1.4755303711009382,
"learning_rate": 9.721030259346536e-06,
"loss": 0.1054,
"step": 147
},
{
"epoch": 0.21511627906976744,
"grad_norm": 1.5259450627037843,
"learning_rate": 9.71725814590527e-06,
"loss": 0.1422,
"step": 148
},
{
"epoch": 0.21656976744186046,
"grad_norm": 1.6229825202968946,
"learning_rate": 9.713461442814862e-06,
"loss": 0.1298,
"step": 149
},
{
"epoch": 0.2180232558139535,
"grad_norm": 1.4725135801149893,
"learning_rate": 9.709640169866385e-06,
"loss": 0.1361,
"step": 150
},
{
"epoch": 0.2194767441860465,
"grad_norm": 1.5398373955451008,
"learning_rate": 9.705794346978988e-06,
"loss": 0.1531,
"step": 151
},
{
"epoch": 0.22093023255813954,
"grad_norm": 1.676675600727517,
"learning_rate": 9.701923994199784e-06,
"loss": 0.1579,
"step": 152
},
{
"epoch": 0.22238372093023256,
"grad_norm": 1.6397792048445756,
"learning_rate": 9.698029131703766e-06,
"loss": 0.1446,
"step": 153
},
{
"epoch": 0.2238372093023256,
"grad_norm": 1.5894721901549633,
"learning_rate": 9.694109779793677e-06,
"loss": 0.1533,
"step": 154
},
{
"epoch": 0.22529069767441862,
"grad_norm": 1.6862648872718184,
"learning_rate": 9.690165958899923e-06,
"loss": 0.1483,
"step": 155
},
{
"epoch": 0.22674418604651161,
"grad_norm": 1.465800326444994,
"learning_rate": 9.686197689580457e-06,
"loss": 0.1338,
"step": 156
},
{
"epoch": 0.22819767441860464,
"grad_norm": 1.4653415401590757,
"learning_rate": 9.682204992520674e-06,
"loss": 0.1045,
"step": 157
},
{
"epoch": 0.22965116279069767,
"grad_norm": 1.8737300808849864,
"learning_rate": 9.678187888533302e-06,
"loss": 0.1457,
"step": 158
},
{
"epoch": 0.2311046511627907,
"grad_norm": 1.9112821169487715,
"learning_rate": 9.674146398558303e-06,
"loss": 0.157,
"step": 159
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.6172822347717775,
"learning_rate": 9.670080543662742e-06,
"loss": 0.1708,
"step": 160
},
{
"epoch": 0.23401162790697674,
"grad_norm": 1.5162309678255983,
"learning_rate": 9.665990345040702e-06,
"loss": 0.1407,
"step": 161
},
{
"epoch": 0.23546511627906977,
"grad_norm": 1.5939441326156882,
"learning_rate": 9.66187582401316e-06,
"loss": 0.155,
"step": 162
},
{
"epoch": 0.2369186046511628,
"grad_norm": 1.6355525068645504,
"learning_rate": 9.657737002027878e-06,
"loss": 0.1847,
"step": 163
},
{
"epoch": 0.23837209302325582,
"grad_norm": 1.3046079540322713,
"learning_rate": 9.653573900659292e-06,
"loss": 0.1282,
"step": 164
},
{
"epoch": 0.23982558139534885,
"grad_norm": 1.5490255723693547,
"learning_rate": 9.649386541608395e-06,
"loss": 0.1477,
"step": 165
},
{
"epoch": 0.24127906976744187,
"grad_norm": 1.595779809647306,
"learning_rate": 9.645174946702634e-06,
"loss": 0.1678,
"step": 166
},
{
"epoch": 0.24273255813953487,
"grad_norm": 1.2880223015165764,
"learning_rate": 9.640939137895788e-06,
"loss": 0.1463,
"step": 167
},
{
"epoch": 0.2441860465116279,
"grad_norm": 1.5824936872282962,
"learning_rate": 9.636679137267852e-06,
"loss": 0.1709,
"step": 168
},
{
"epoch": 0.24563953488372092,
"grad_norm": 1.5803584865776295,
"learning_rate": 9.632394967024934e-06,
"loss": 0.1744,
"step": 169
},
{
"epoch": 0.24709302325581395,
"grad_norm": 1.4525033434391654,
"learning_rate": 9.628086649499121e-06,
"loss": 0.1441,
"step": 170
},
{
"epoch": 0.24854651162790697,
"grad_norm": 1.3887804855032542,
"learning_rate": 9.623754207148382e-06,
"loss": 0.1482,
"step": 171
},
{
"epoch": 0.25,
"grad_norm": 1.4357196619635686,
"learning_rate": 9.619397662556434e-06,
"loss": 0.1281,
"step": 172
},
{
"epoch": 0.251453488372093,
"grad_norm": 1.7625555681963379,
"learning_rate": 9.615017038432636e-06,
"loss": 0.1772,
"step": 173
},
{
"epoch": 0.25290697674418605,
"grad_norm": 1.493762339052703,
"learning_rate": 9.610612357611868e-06,
"loss": 0.1158,
"step": 174
},
{
"epoch": 0.2543604651162791,
"grad_norm": 1.429067156820555,
"learning_rate": 9.606183643054401e-06,
"loss": 0.1427,
"step": 175
},
{
"epoch": 0.2558139534883721,
"grad_norm": 1.6273610891658072,
"learning_rate": 9.601730917845798e-06,
"loss": 0.1567,
"step": 176
},
{
"epoch": 0.25726744186046513,
"grad_norm": 1.7484274417059458,
"learning_rate": 9.597254205196775e-06,
"loss": 0.1352,
"step": 177
},
{
"epoch": 0.25872093023255816,
"grad_norm": 1.4768497051427878,
"learning_rate": 9.592753528443092e-06,
"loss": 0.1554,
"step": 178
},
{
"epoch": 0.2601744186046512,
"grad_norm": 1.792017449974729,
"learning_rate": 9.588228911045423e-06,
"loss": 0.1611,
"step": 179
},
{
"epoch": 0.2616279069767442,
"grad_norm": 1.8929106049989604,
"learning_rate": 9.58368037658924e-06,
"loss": 0.1599,
"step": 180
},
{
"epoch": 0.26308139534883723,
"grad_norm": 1.6939699226935878,
"learning_rate": 9.579107948784684e-06,
"loss": 0.1625,
"step": 181
},
{
"epoch": 0.26453488372093026,
"grad_norm": 1.564868234763488,
"learning_rate": 9.57451165146645e-06,
"loss": 0.1533,
"step": 182
},
{
"epoch": 0.26598837209302323,
"grad_norm": 1.5170878496402254,
"learning_rate": 9.569891508593654e-06,
"loss": 0.1435,
"step": 183
},
{
"epoch": 0.26744186046511625,
"grad_norm": 1.3597169726380491,
"learning_rate": 9.565247544249709e-06,
"loss": 0.1419,
"step": 184
},
{
"epoch": 0.2688953488372093,
"grad_norm": 1.3285568618373091,
"learning_rate": 9.56057978264221e-06,
"loss": 0.1199,
"step": 185
},
{
"epoch": 0.2703488372093023,
"grad_norm": 1.610052139476346,
"learning_rate": 9.55588824810279e-06,
"loss": 0.1329,
"step": 186
},
{
"epoch": 0.27180232558139533,
"grad_norm": 1.8398199886190283,
"learning_rate": 9.551172965087017e-06,
"loss": 0.161,
"step": 187
},
{
"epoch": 0.27325581395348836,
"grad_norm": 1.7116096397748537,
"learning_rate": 9.54643395817424e-06,
"loss": 0.1918,
"step": 188
},
{
"epoch": 0.2747093023255814,
"grad_norm": 1.8785142775891301,
"learning_rate": 9.541671252067475e-06,
"loss": 0.2001,
"step": 189
},
{
"epoch": 0.2761627906976744,
"grad_norm": 1.9515749354758625,
"learning_rate": 9.53688487159328e-06,
"loss": 0.195,
"step": 190
},
{
"epoch": 0.27761627906976744,
"grad_norm": 1.539979313310192,
"learning_rate": 9.532074841701619e-06,
"loss": 0.1586,
"step": 191
},
{
"epoch": 0.27906976744186046,
"grad_norm": 1.6618019039238547,
"learning_rate": 9.527241187465735e-06,
"loss": 0.1625,
"step": 192
},
{
"epoch": 0.2805232558139535,
"grad_norm": 1.534997769425568,
"learning_rate": 9.522383934082009e-06,
"loss": 0.1421,
"step": 193
},
{
"epoch": 0.2819767441860465,
"grad_norm": 1.6350100832134988,
"learning_rate": 9.517503106869845e-06,
"loss": 0.1254,
"step": 194
},
{
"epoch": 0.28343023255813954,
"grad_norm": 1.4730057580868088,
"learning_rate": 9.512598731271532e-06,
"loss": 0.1575,
"step": 195
},
{
"epoch": 0.28488372093023256,
"grad_norm": 2.071129803071774,
"learning_rate": 9.507670832852103e-06,
"loss": 0.1589,
"step": 196
},
{
"epoch": 0.2863372093023256,
"grad_norm": 2.002706015726789,
"learning_rate": 9.502719437299212e-06,
"loss": 0.1739,
"step": 197
},
{
"epoch": 0.2877906976744186,
"grad_norm": 1.0738206830624688,
"learning_rate": 9.497744570422997e-06,
"loss": 0.1045,
"step": 198
},
{
"epoch": 0.28924418604651164,
"grad_norm": 1.6809668906114832,
"learning_rate": 9.492746258155944e-06,
"loss": 0.1688,
"step": 199
},
{
"epoch": 0.29069767441860467,
"grad_norm": 1.568763317573355,
"learning_rate": 9.487724526552753e-06,
"loss": 0.154,
"step": 200
},
{
"epoch": 0.29069767441860467,
"eval_loss": 0.14336518943309784,
"eval_runtime": 2.2549,
"eval_samples_per_second": 24.835,
"eval_steps_per_second": 6.209,
"step": 200
},
{
"epoch": 0.2921511627906977,
"grad_norm": 1.4702550707766895,
"learning_rate": 9.4826794017902e-06,
"loss": 0.1549,
"step": 201
},
{
"epoch": 0.2936046511627907,
"grad_norm": 1.5525125309981525,
"learning_rate": 9.477610910167005e-06,
"loss": 0.138,
"step": 202
},
{
"epoch": 0.29505813953488375,
"grad_norm": 1.9512544957864502,
"learning_rate": 9.472519078103693e-06,
"loss": 0.191,
"step": 203
},
{
"epoch": 0.29651162790697677,
"grad_norm": 1.6914763992393016,
"learning_rate": 9.467403932142452e-06,
"loss": 0.1415,
"step": 204
},
{
"epoch": 0.29796511627906974,
"grad_norm": 1.4472857294705104,
"learning_rate": 9.462265498947002e-06,
"loss": 0.1429,
"step": 205
},
{
"epoch": 0.29941860465116277,
"grad_norm": 1.5823405481416377,
"learning_rate": 9.457103805302454e-06,
"loss": 0.1326,
"step": 206
},
{
"epoch": 0.3008720930232558,
"grad_norm": 1.420341202696158,
"learning_rate": 9.451918878115163e-06,
"loss": 0.128,
"step": 207
},
{
"epoch": 0.3023255813953488,
"grad_norm": 1.3234585094233786,
"learning_rate": 9.446710744412595e-06,
"loss": 0.1409,
"step": 208
},
{
"epoch": 0.30377906976744184,
"grad_norm": 1.4054727085825658,
"learning_rate": 9.441479431343189e-06,
"loss": 0.1295,
"step": 209
},
{
"epoch": 0.30523255813953487,
"grad_norm": 1.762998737448682,
"learning_rate": 9.436224966176205e-06,
"loss": 0.1832,
"step": 210
},
{
"epoch": 0.3066860465116279,
"grad_norm": 1.6833002091383082,
"learning_rate": 9.430947376301593e-06,
"loss": 0.1393,
"step": 211
},
{
"epoch": 0.3081395348837209,
"grad_norm": 1.4120192730188195,
"learning_rate": 9.425646689229843e-06,
"loss": 0.1295,
"step": 212
},
{
"epoch": 0.30959302325581395,
"grad_norm": 1.1626004083245838,
"learning_rate": 9.420322932591842e-06,
"loss": 0.1202,
"step": 213
},
{
"epoch": 0.311046511627907,
"grad_norm": 1.5259387861927707,
"learning_rate": 9.414976134138736e-06,
"loss": 0.1193,
"step": 214
},
{
"epoch": 0.3125,
"grad_norm": 1.619894973779481,
"learning_rate": 9.409606321741776e-06,
"loss": 0.1698,
"step": 215
},
{
"epoch": 0.313953488372093,
"grad_norm": 1.7822094110301971,
"learning_rate": 9.404213523392183e-06,
"loss": 0.1393,
"step": 216
},
{
"epoch": 0.31540697674418605,
"grad_norm": 1.5931388190429225,
"learning_rate": 9.39879776720099e-06,
"loss": 0.1383,
"step": 217
},
{
"epoch": 0.3168604651162791,
"grad_norm": 2.072625803623837,
"learning_rate": 9.393359081398914e-06,
"loss": 0.1834,
"step": 218
},
{
"epoch": 0.3183139534883721,
"grad_norm": 1.7269561451971633,
"learning_rate": 9.387897494336182e-06,
"loss": 0.2005,
"step": 219
},
{
"epoch": 0.31976744186046513,
"grad_norm": 1.9777378749032901,
"learning_rate": 9.38241303448241e-06,
"loss": 0.1648,
"step": 220
},
{
"epoch": 0.32122093023255816,
"grad_norm": 1.7036324366034152,
"learning_rate": 9.376905730426438e-06,
"loss": 0.1661,
"step": 221
},
{
"epoch": 0.3226744186046512,
"grad_norm": 1.6114418788776652,
"learning_rate": 9.371375610876189e-06,
"loss": 0.1871,
"step": 222
},
{
"epoch": 0.3241279069767442,
"grad_norm": 1.6496270048093367,
"learning_rate": 9.365822704658511e-06,
"loss": 0.1683,
"step": 223
},
{
"epoch": 0.32558139534883723,
"grad_norm": 1.9563162812494876,
"learning_rate": 9.36024704071904e-06,
"loss": 0.1941,
"step": 224
},
{
"epoch": 0.32703488372093026,
"grad_norm": 2.0820637027256086,
"learning_rate": 9.354648648122032e-06,
"loss": 0.1951,
"step": 225
},
{
"epoch": 0.32848837209302323,
"grad_norm": 1.5149036427355425,
"learning_rate": 9.349027556050225e-06,
"loss": 0.1985,
"step": 226
},
{
"epoch": 0.32994186046511625,
"grad_norm": 1.3330622481681265,
"learning_rate": 9.343383793804688e-06,
"loss": 0.0971,
"step": 227
},
{
"epoch": 0.3313953488372093,
"grad_norm": 1.917698823268353,
"learning_rate": 9.337717390804653e-06,
"loss": 0.1743,
"step": 228
},
{
"epoch": 0.3328488372093023,
"grad_norm": 1.477889283501478,
"learning_rate": 9.332028376587377e-06,
"loss": 0.1367,
"step": 229
},
{
"epoch": 0.33430232558139533,
"grad_norm": 1.35154982485599,
"learning_rate": 9.326316780807982e-06,
"loss": 0.1498,
"step": 230
},
{
"epoch": 0.33575581395348836,
"grad_norm": 1.9401902636815154,
"learning_rate": 9.320582633239303e-06,
"loss": 0.1633,
"step": 231
},
{
"epoch": 0.3372093023255814,
"grad_norm": 1.5805620021792157,
"learning_rate": 9.314825963771724e-06,
"loss": 0.172,
"step": 232
},
{
"epoch": 0.3386627906976744,
"grad_norm": 1.2624039358903756,
"learning_rate": 9.309046802413033e-06,
"loss": 0.1445,
"step": 233
},
{
"epoch": 0.34011627906976744,
"grad_norm": 1.6329242629844094,
"learning_rate": 9.303245179288265e-06,
"loss": 0.1617,
"step": 234
},
{
"epoch": 0.34156976744186046,
"grad_norm": 1.572417591830697,
"learning_rate": 9.297421124639534e-06,
"loss": 0.1901,
"step": 235
},
{
"epoch": 0.3430232558139535,
"grad_norm": 1.4594857772846002,
"learning_rate": 9.29157466882589e-06,
"loss": 0.1408,
"step": 236
},
{
"epoch": 0.3444767441860465,
"grad_norm": 1.341145951847404,
"learning_rate": 9.28570584232315e-06,
"loss": 0.1468,
"step": 237
},
{
"epoch": 0.34593023255813954,
"grad_norm": 1.3186091526025143,
"learning_rate": 9.27981467572374e-06,
"loss": 0.1269,
"step": 238
},
{
"epoch": 0.34738372093023256,
"grad_norm": 1.2632212032396861,
"learning_rate": 9.273901199736544e-06,
"loss": 0.1329,
"step": 239
},
{
"epoch": 0.3488372093023256,
"grad_norm": 1.2397475494586065,
"learning_rate": 9.267965445186733e-06,
"loss": 0.1188,
"step": 240
},
{
"epoch": 0.3502906976744186,
"grad_norm": 1.4040827076553737,
"learning_rate": 9.262007443015614e-06,
"loss": 0.1217,
"step": 241
},
{
"epoch": 0.35174418604651164,
"grad_norm": 1.5619651302909152,
"learning_rate": 9.25602722428046e-06,
"loss": 0.1372,
"step": 242
},
{
"epoch": 0.35319767441860467,
"grad_norm": 1.759985209874878,
"learning_rate": 9.250024820154356e-06,
"loss": 0.1545,
"step": 243
},
{
"epoch": 0.3546511627906977,
"grad_norm": 1.426120234598387,
"learning_rate": 9.24400026192603e-06,
"loss": 0.1254,
"step": 244
},
{
"epoch": 0.3561046511627907,
"grad_norm": 2.1333005251482238,
"learning_rate": 9.237953580999694e-06,
"loss": 0.1715,
"step": 245
},
{
"epoch": 0.35755813953488375,
"grad_norm": 1.4669057043638045,
"learning_rate": 9.231884808894877e-06,
"loss": 0.1589,
"step": 246
},
{
"epoch": 0.35901162790697677,
"grad_norm": 1.70352194581075,
"learning_rate": 9.225793977246267e-06,
"loss": 0.1714,
"step": 247
},
{
"epoch": 0.36046511627906974,
"grad_norm": 1.6341050240990393,
"learning_rate": 9.219681117803537e-06,
"loss": 0.1715,
"step": 248
},
{
"epoch": 0.36191860465116277,
"grad_norm": 1.5094144987016003,
"learning_rate": 9.213546262431185e-06,
"loss": 0.1195,
"step": 249
},
{
"epoch": 0.3633720930232558,
"grad_norm": 1.4662745386322011,
"learning_rate": 9.207389443108372e-06,
"loss": 0.1502,
"step": 250
},
{
"epoch": 0.3648255813953488,
"grad_norm": 1.2772800586235213,
"learning_rate": 9.201210691928745e-06,
"loss": 0.1211,
"step": 251
},
{
"epoch": 0.36627906976744184,
"grad_norm": 1.4892966499782139,
"learning_rate": 9.195010041100276e-06,
"loss": 0.1168,
"step": 252
},
{
"epoch": 0.36773255813953487,
"grad_norm": 1.423847678645897,
"learning_rate": 9.188787522945098e-06,
"loss": 0.1338,
"step": 253
},
{
"epoch": 0.3691860465116279,
"grad_norm": 1.5018029337938783,
"learning_rate": 9.182543169899325e-06,
"loss": 0.1324,
"step": 254
},
{
"epoch": 0.3706395348837209,
"grad_norm": 1.3714052238212613,
"learning_rate": 9.176277014512894e-06,
"loss": 0.1568,
"step": 255
},
{
"epoch": 0.37209302325581395,
"grad_norm": 1.409010922605016,
"learning_rate": 9.16998908944939e-06,
"loss": 0.1604,
"step": 256
},
{
"epoch": 0.373546511627907,
"grad_norm": 1.6529698075548698,
"learning_rate": 9.163679427485878e-06,
"loss": 0.1567,
"step": 257
},
{
"epoch": 0.375,
"grad_norm": 1.3541902426493249,
"learning_rate": 9.157348061512728e-06,
"loss": 0.1557,
"step": 258
},
{
"epoch": 0.376453488372093,
"grad_norm": 1.4327109786235201,
"learning_rate": 9.150995024533446e-06,
"loss": 0.1578,
"step": 259
},
{
"epoch": 0.37790697674418605,
"grad_norm": 1.3363028855734624,
"learning_rate": 9.14462034966451e-06,
"loss": 0.1404,
"step": 260
},
{
"epoch": 0.3793604651162791,
"grad_norm": 1.8919021627256019,
"learning_rate": 9.138224070135183e-06,
"loss": 0.1841,
"step": 261
},
{
"epoch": 0.3808139534883721,
"grad_norm": 1.48877635650518,
"learning_rate": 9.131806219287344e-06,
"loss": 0.1292,
"step": 262
},
{
"epoch": 0.38226744186046513,
"grad_norm": 1.4141106832003385,
"learning_rate": 9.125366830575325e-06,
"loss": 0.1355,
"step": 263
},
{
"epoch": 0.38372093023255816,
"grad_norm": 1.3866461149312288,
"learning_rate": 9.118905937565723e-06,
"loss": 0.1493,
"step": 264
},
{
"epoch": 0.3851744186046512,
"grad_norm": 1.3927609057507213,
"learning_rate": 9.112423573937232e-06,
"loss": 0.1187,
"step": 265
},
{
"epoch": 0.3866279069767442,
"grad_norm": 1.63044313572066,
"learning_rate": 9.105919773480464e-06,
"loss": 0.1604,
"step": 266
},
{
"epoch": 0.38808139534883723,
"grad_norm": 1.6214111304578023,
"learning_rate": 9.09939457009778e-06,
"loss": 0.1917,
"step": 267
},
{
"epoch": 0.38953488372093026,
"grad_norm": 1.4173048655266718,
"learning_rate": 9.092847997803098e-06,
"loss": 0.17,
"step": 268
},
{
"epoch": 0.39098837209302323,
"grad_norm": 1.6513577218808286,
"learning_rate": 9.08628009072174e-06,
"loss": 0.1725,
"step": 269
},
{
"epoch": 0.39244186046511625,
"grad_norm": 1.5628095311374188,
"learning_rate": 9.079690883090227e-06,
"loss": 0.1608,
"step": 270
},
{
"epoch": 0.3938953488372093,
"grad_norm": 1.9629889905768871,
"learning_rate": 9.073080409256118e-06,
"loss": 0.1587,
"step": 271
},
{
"epoch": 0.3953488372093023,
"grad_norm": 1.947239430105324,
"learning_rate": 9.066448703677828e-06,
"loss": 0.2092,
"step": 272
},
{
"epoch": 0.39680232558139533,
"grad_norm": 1.6096007627861755,
"learning_rate": 9.059795800924445e-06,
"loss": 0.2076,
"step": 273
},
{
"epoch": 0.39825581395348836,
"grad_norm": 1.4682775876201015,
"learning_rate": 9.053121735675552e-06,
"loss": 0.1338,
"step": 274
},
{
"epoch": 0.3997093023255814,
"grad_norm": 1.541753647412304,
"learning_rate": 9.046426542721046e-06,
"loss": 0.1714,
"step": 275
},
{
"epoch": 0.4011627906976744,
"grad_norm": 1.413689581379683,
"learning_rate": 9.039710256960956e-06,
"loss": 0.1346,
"step": 276
},
{
"epoch": 0.40261627906976744,
"grad_norm": 1.6313953237696512,
"learning_rate": 9.03297291340526e-06,
"loss": 0.1596,
"step": 277
},
{
"epoch": 0.40406976744186046,
"grad_norm": 1.2910181632753897,
"learning_rate": 9.026214547173706e-06,
"loss": 0.1553,
"step": 278
},
{
"epoch": 0.4055232558139535,
"grad_norm": 1.4228459831893367,
"learning_rate": 9.019435193495627e-06,
"loss": 0.1377,
"step": 279
},
{
"epoch": 0.4069767441860465,
"grad_norm": 1.459679839660881,
"learning_rate": 9.012634887709755e-06,
"loss": 0.1404,
"step": 280
},
{
"epoch": 0.40843023255813954,
"grad_norm": 1.6321159835021386,
"learning_rate": 9.005813665264042e-06,
"loss": 0.1393,
"step": 281
},
{
"epoch": 0.40988372093023256,
"grad_norm": 1.9115197748274007,
"learning_rate": 8.998971561715468e-06,
"loss": 0.164,
"step": 282
},
{
"epoch": 0.4113372093023256,
"grad_norm": 1.5888048449790153,
"learning_rate": 8.992108612729868e-06,
"loss": 0.1422,
"step": 283
},
{
"epoch": 0.4127906976744186,
"grad_norm": 2.3019049975561585,
"learning_rate": 8.985224854081727e-06,
"loss": 0.1863,
"step": 284
},
{
"epoch": 0.41424418604651164,
"grad_norm": 1.5757906262469172,
"learning_rate": 8.978320321654014e-06,
"loss": 0.1531,
"step": 285
},
{
"epoch": 0.41569767441860467,
"grad_norm": 1.3184585148618004,
"learning_rate": 8.97139505143798e-06,
"loss": 0.1035,
"step": 286
},
{
"epoch": 0.4171511627906977,
"grad_norm": 1.9152332078193026,
"learning_rate": 8.964449079532978e-06,
"loss": 0.1982,
"step": 287
},
{
"epoch": 0.4186046511627907,
"grad_norm": 1.3318524817371034,
"learning_rate": 8.957482442146271e-06,
"loss": 0.1422,
"step": 288
},
{
"epoch": 0.42005813953488375,
"grad_norm": 1.2994676654595296,
"learning_rate": 8.950495175592849e-06,
"loss": 0.0954,
"step": 289
},
{
"epoch": 0.42151162790697677,
"grad_norm": 1.3863595317355537,
"learning_rate": 8.94348731629523e-06,
"loss": 0.1481,
"step": 290
},
{
"epoch": 0.42296511627906974,
"grad_norm": 1.4930615065996538,
"learning_rate": 8.93645890078328e-06,
"loss": 0.1474,
"step": 291
},
{
"epoch": 0.42441860465116277,
"grad_norm": 1.1744911565495457,
"learning_rate": 8.929409965694016e-06,
"loss": 0.1228,
"step": 292
},
{
"epoch": 0.4258720930232558,
"grad_norm": 1.3229206282158534,
"learning_rate": 8.92234054777142e-06,
"loss": 0.1165,
"step": 293
},
{
"epoch": 0.4273255813953488,
"grad_norm": 1.500633285810525,
"learning_rate": 8.915250683866242e-06,
"loss": 0.1561,
"step": 294
},
{
"epoch": 0.42877906976744184,
"grad_norm": 1.4979998345117733,
"learning_rate": 8.908140410935813e-06,
"loss": 0.1466,
"step": 295
},
{
"epoch": 0.43023255813953487,
"grad_norm": 1.2712924829276973,
"learning_rate": 8.901009766043846e-06,
"loss": 0.1026,
"step": 296
},
{
"epoch": 0.4316860465116279,
"grad_norm": 1.6863134441377277,
"learning_rate": 8.893858786360255e-06,
"loss": 0.1687,
"step": 297
},
{
"epoch": 0.4331395348837209,
"grad_norm": 1.5887492952986007,
"learning_rate": 8.886687509160944e-06,
"loss": 0.1973,
"step": 298
},
{
"epoch": 0.43459302325581395,
"grad_norm": 1.6354886094555063,
"learning_rate": 8.879495971827628e-06,
"loss": 0.1881,
"step": 299
},
{
"epoch": 0.436046511627907,
"grad_norm": 1.5874859267729486,
"learning_rate": 8.872284211847629e-06,
"loss": 0.1105,
"step": 300
},
{
"epoch": 0.4375,
"grad_norm": 1.7021255827480994,
"learning_rate": 8.865052266813686e-06,
"loss": 0.1592,
"step": 301
},
{
"epoch": 0.438953488372093,
"grad_norm": 1.1905429804396237,
"learning_rate": 8.857800174423754e-06,
"loss": 0.1569,
"step": 302
},
{
"epoch": 0.44040697674418605,
"grad_norm": 1.7925350240691666,
"learning_rate": 8.850527972480812e-06,
"loss": 0.1823,
"step": 303
},
{
"epoch": 0.4418604651162791,
"grad_norm": 1.682150104412601,
"learning_rate": 8.843235698892661e-06,
"loss": 0.1725,
"step": 304
},
{
"epoch": 0.4433139534883721,
"grad_norm": 1.4842418533887083,
"learning_rate": 8.835923391671735e-06,
"loss": 0.1095,
"step": 305
},
{
"epoch": 0.44476744186046513,
"grad_norm": 1.4624470494811705,
"learning_rate": 8.828591088934894e-06,
"loss": 0.1286,
"step": 306
},
{
"epoch": 0.44622093023255816,
"grad_norm": 1.3843653917366865,
"learning_rate": 8.821238828903227e-06,
"loss": 0.1423,
"step": 307
},
{
"epoch": 0.4476744186046512,
"grad_norm": 1.4949237211694288,
"learning_rate": 8.813866649901857e-06,
"loss": 0.1426,
"step": 308
},
{
"epoch": 0.4491279069767442,
"grad_norm": 1.338762423782917,
"learning_rate": 8.806474590359736e-06,
"loss": 0.1609,
"step": 309
},
{
"epoch": 0.45058139534883723,
"grad_norm": 1.3119459351922982,
"learning_rate": 8.799062688809452e-06,
"loss": 0.1508,
"step": 310
},
{
"epoch": 0.45203488372093026,
"grad_norm": 1.4964356641654488,
"learning_rate": 8.79163098388702e-06,
"loss": 0.1536,
"step": 311
},
{
"epoch": 0.45348837209302323,
"grad_norm": 1.8958424478348768,
"learning_rate": 8.784179514331683e-06,
"loss": 0.2154,
"step": 312
},
{
"epoch": 0.45494186046511625,
"grad_norm": 1.616943621767028,
"learning_rate": 8.776708318985712e-06,
"loss": 0.1338,
"step": 313
},
{
"epoch": 0.4563953488372093,
"grad_norm": 1.3369024538015155,
"learning_rate": 8.769217436794205e-06,
"loss": 0.1481,
"step": 314
},
{
"epoch": 0.4578488372093023,
"grad_norm": 1.5002304811430793,
"learning_rate": 8.761706906804878e-06,
"loss": 0.1484,
"step": 315
},
{
"epoch": 0.45930232558139533,
"grad_norm": 1.5317354250290076,
"learning_rate": 8.75417676816787e-06,
"loss": 0.1388,
"step": 316
},
{
"epoch": 0.46075581395348836,
"grad_norm": 1.5881770732342904,
"learning_rate": 8.746627060135528e-06,
"loss": 0.1607,
"step": 317
},
{
"epoch": 0.4622093023255814,
"grad_norm": 1.2933179184781483,
"learning_rate": 8.73905782206221e-06,
"loss": 0.1425,
"step": 318
},
{
"epoch": 0.4636627906976744,
"grad_norm": 1.4900348199011015,
"learning_rate": 8.731469093404086e-06,
"loss": 0.1686,
"step": 319
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.2187036161656735,
"learning_rate": 8.72386091371891e-06,
"loss": 0.156,
"step": 320
},
{
"epoch": 0.46656976744186046,
"grad_norm": 1.5361392026631566,
"learning_rate": 8.71623332266584e-06,
"loss": 0.1697,
"step": 321
},
{
"epoch": 0.4680232558139535,
"grad_norm": 1.5624029046245063,
"learning_rate": 8.708586360005218e-06,
"loss": 0.1565,
"step": 322
},
{
"epoch": 0.4694767441860465,
"grad_norm": 1.2313528832746294,
"learning_rate": 8.700920065598358e-06,
"loss": 0.1319,
"step": 323
},
{
"epoch": 0.47093023255813954,
"grad_norm": 1.4770385920550462,
"learning_rate": 8.693234479407353e-06,
"loss": 0.1212,
"step": 324
},
{
"epoch": 0.47238372093023256,
"grad_norm": 2.1148845935494913,
"learning_rate": 8.685529641494852e-06,
"loss": 0.1714,
"step": 325
},
{
"epoch": 0.4738372093023256,
"grad_norm": 1.3818082146663953,
"learning_rate": 8.677805592023858e-06,
"loss": 0.1328,
"step": 326
},
{
"epoch": 0.4752906976744186,
"grad_norm": 2.0939762202676935,
"learning_rate": 8.670062371257525e-06,
"loss": 0.2428,
"step": 327
},
{
"epoch": 0.47674418604651164,
"grad_norm": 1.5066771176481124,
"learning_rate": 8.662300019558931e-06,
"loss": 0.1397,
"step": 328
},
{
"epoch": 0.47819767441860467,
"grad_norm": 1.3186753628942027,
"learning_rate": 8.654518577390885e-06,
"loss": 0.1513,
"step": 329
},
{
"epoch": 0.4796511627906977,
"grad_norm": 1.6619235949225295,
"learning_rate": 8.646718085315707e-06,
"loss": 0.1637,
"step": 330
},
{
"epoch": 0.4811046511627907,
"grad_norm": 1.5149079703407748,
"learning_rate": 8.638898583995016e-06,
"loss": 0.1614,
"step": 331
},
{
"epoch": 0.48255813953488375,
"grad_norm": 1.6949649519406504,
"learning_rate": 8.631060114189526e-06,
"loss": 0.1613,
"step": 332
},
{
"epoch": 0.48401162790697677,
"grad_norm": 1.6341150247952272,
"learning_rate": 8.62320271675882e-06,
"loss": 0.139,
"step": 333
},
{
"epoch": 0.48546511627906974,
"grad_norm": 1.5722940293537715,
"learning_rate": 8.615326432661155e-06,
"loss": 0.1389,
"step": 334
},
{
"epoch": 0.48691860465116277,
"grad_norm": 1.6784313452366777,
"learning_rate": 8.607431302953229e-06,
"loss": 0.1586,
"step": 335
},
{
"epoch": 0.4883720930232558,
"grad_norm": 1.579470915019441,
"learning_rate": 8.599517368789981e-06,
"loss": 0.1628,
"step": 336
},
{
"epoch": 0.4898255813953488,
"grad_norm": 1.8175899396499493,
"learning_rate": 8.591584671424371e-06,
"loss": 0.1778,
"step": 337
},
{
"epoch": 0.49127906976744184,
"grad_norm": 1.3679144266460825,
"learning_rate": 8.583633252207171e-06,
"loss": 0.145,
"step": 338
},
{
"epoch": 0.49273255813953487,
"grad_norm": 1.5973543371186025,
"learning_rate": 8.575663152586735e-06,
"loss": 0.1371,
"step": 339
},
{
"epoch": 0.4941860465116279,
"grad_norm": 1.4223743951319792,
"learning_rate": 8.5676744141088e-06,
"loss": 0.1349,
"step": 340
},
{
"epoch": 0.4956395348837209,
"grad_norm": 1.7867654597125795,
"learning_rate": 8.559667078416257e-06,
"loss": 0.1652,
"step": 341
},
{
"epoch": 0.49709302325581395,
"grad_norm": 1.6721772672683666,
"learning_rate": 8.551641187248942e-06,
"loss": 0.1827,
"step": 342
},
{
"epoch": 0.498546511627907,
"grad_norm": 1.5334190767352909,
"learning_rate": 8.543596782443415e-06,
"loss": 0.1367,
"step": 343
},
{
"epoch": 0.5,
"grad_norm": 1.4823817376465154,
"learning_rate": 8.535533905932739e-06,
"loss": 0.1427,
"step": 344
},
{
"epoch": 0.501453488372093,
"grad_norm": 1.5846228452591529,
"learning_rate": 8.527452599746265e-06,
"loss": 0.1705,
"step": 345
},
{
"epoch": 0.502906976744186,
"grad_norm": 1.3811944073529825,
"learning_rate": 8.519352906009417e-06,
"loss": 0.135,
"step": 346
},
{
"epoch": 0.5043604651162791,
"grad_norm": 1.4875376615322078,
"learning_rate": 8.511234866943463e-06,
"loss": 0.1388,
"step": 347
},
{
"epoch": 0.5058139534883721,
"grad_norm": 1.402324894315079,
"learning_rate": 8.5030985248653e-06,
"loss": 0.1474,
"step": 348
},
{
"epoch": 0.5072674418604651,
"grad_norm": 1.463484821168073,
"learning_rate": 8.494943922187236e-06,
"loss": 0.1234,
"step": 349
},
{
"epoch": 0.5087209302325582,
"grad_norm": 1.6398780382743268,
"learning_rate": 8.486771101416765e-06,
"loss": 0.1335,
"step": 350
},
{
"epoch": 0.5101744186046512,
"grad_norm": 1.244410077865948,
"learning_rate": 8.47858010515634e-06,
"loss": 0.1499,
"step": 351
},
{
"epoch": 0.5116279069767442,
"grad_norm": 1.7990830264360216,
"learning_rate": 8.470370976103171e-06,
"loss": 0.1662,
"step": 352
},
{
"epoch": 0.5130813953488372,
"grad_norm": 1.7219602924486799,
"learning_rate": 8.462143757048976e-06,
"loss": 0.1294,
"step": 353
},
{
"epoch": 0.5145348837209303,
"grad_norm": 1.1737595986370917,
"learning_rate": 8.453898490879776e-06,
"loss": 0.1391,
"step": 354
},
{
"epoch": 0.5159883720930233,
"grad_norm": 1.5033196146866203,
"learning_rate": 8.445635220575663e-06,
"loss": 0.1207,
"step": 355
},
{
"epoch": 0.5174418604651163,
"grad_norm": 1.526089460093771,
"learning_rate": 8.43735398921059e-06,
"loss": 0.1575,
"step": 356
},
{
"epoch": 0.5188953488372093,
"grad_norm": 1.0613154614063016,
"learning_rate": 8.429054839952122e-06,
"loss": 0.107,
"step": 357
},
{
"epoch": 0.5203488372093024,
"grad_norm": 1.5465351587584268,
"learning_rate": 8.42073781606123e-06,
"loss": 0.1412,
"step": 358
},
{
"epoch": 0.5218023255813954,
"grad_norm": 1.6263257643507738,
"learning_rate": 8.412402960892061e-06,
"loss": 0.1528,
"step": 359
},
{
"epoch": 0.5232558139534884,
"grad_norm": 1.4428369608603286,
"learning_rate": 8.40405031789171e-06,
"loss": 0.1229,
"step": 360
},
{
"epoch": 0.5247093023255814,
"grad_norm": 1.5887049579267734,
"learning_rate": 8.395679930599997e-06,
"loss": 0.1768,
"step": 361
},
{
"epoch": 0.5261627906976745,
"grad_norm": 1.6132505441168516,
"learning_rate": 8.387291842649234e-06,
"loss": 0.147,
"step": 362
},
{
"epoch": 0.5276162790697675,
"grad_norm": 1.6221175313012306,
"learning_rate": 8.378886097764001e-06,
"loss": 0.1393,
"step": 363
},
{
"epoch": 0.5290697674418605,
"grad_norm": 1.3815088653312464,
"learning_rate": 8.370462739760922e-06,
"loss": 0.1346,
"step": 364
},
{
"epoch": 0.5305232558139535,
"grad_norm": 1.613218374432377,
"learning_rate": 8.362021812548433e-06,
"loss": 0.1296,
"step": 365
},
{
"epoch": 0.5319767441860465,
"grad_norm": 1.5239164352989918,
"learning_rate": 8.353563360126548e-06,
"loss": 0.2012,
"step": 366
},
{
"epoch": 0.5334302325581395,
"grad_norm": 1.4465320300081779,
"learning_rate": 8.345087426586638e-06,
"loss": 0.1436,
"step": 367
},
{
"epoch": 0.5348837209302325,
"grad_norm": 1.6179330922890798,
"learning_rate": 8.336594056111197e-06,
"loss": 0.1699,
"step": 368
},
{
"epoch": 0.5363372093023255,
"grad_norm": 1.2917432875508248,
"learning_rate": 8.328083292973617e-06,
"loss": 0.1294,
"step": 369
},
{
"epoch": 0.5377906976744186,
"grad_norm": 1.823166535138271,
"learning_rate": 8.319555181537942e-06,
"loss": 0.1693,
"step": 370
},
{
"epoch": 0.5392441860465116,
"grad_norm": 1.5964311134632778,
"learning_rate": 8.311009766258659e-06,
"loss": 0.1838,
"step": 371
},
{
"epoch": 0.5406976744186046,
"grad_norm": 1.5287488881908324,
"learning_rate": 8.30244709168045e-06,
"loss": 0.164,
"step": 372
},
{
"epoch": 0.5421511627906976,
"grad_norm": 1.617466008500704,
"learning_rate": 8.293867202437962e-06,
"loss": 0.1675,
"step": 373
},
{
"epoch": 0.5436046511627907,
"grad_norm": 1.81589698419327,
"learning_rate": 8.285270143255579e-06,
"loss": 0.158,
"step": 374
},
{
"epoch": 0.5450581395348837,
"grad_norm": 1.496211076073237,
"learning_rate": 8.27665595894719e-06,
"loss": 0.1332,
"step": 375
},
{
"epoch": 0.5465116279069767,
"grad_norm": 1.528765465197372,
"learning_rate": 8.268024694415949e-06,
"loss": 0.1313,
"step": 376
},
{
"epoch": 0.5479651162790697,
"grad_norm": 1.7252370903269931,
"learning_rate": 8.25937639465404e-06,
"loss": 0.1635,
"step": 377
},
{
"epoch": 0.5494186046511628,
"grad_norm": 1.3299953218632383,
"learning_rate": 8.250711104742453e-06,
"loss": 0.1198,
"step": 378
},
{
"epoch": 0.5508720930232558,
"grad_norm": 1.6165817454268103,
"learning_rate": 8.242028869850743e-06,
"loss": 0.142,
"step": 379
},
{
"epoch": 0.5523255813953488,
"grad_norm": 1.4748568202518297,
"learning_rate": 8.23332973523679e-06,
"loss": 0.1134,
"step": 380
},
{
"epoch": 0.5537790697674418,
"grad_norm": 1.327698827166681,
"learning_rate": 8.224613746246565e-06,
"loss": 0.1465,
"step": 381
},
{
"epoch": 0.5552325581395349,
"grad_norm": 1.3793313249754087,
"learning_rate": 8.215880948313904e-06,
"loss": 0.1304,
"step": 382
},
{
"epoch": 0.5566860465116279,
"grad_norm": 1.7504804510682308,
"learning_rate": 8.207131386960256e-06,
"loss": 0.1673,
"step": 383
},
{
"epoch": 0.5581395348837209,
"grad_norm": 1.595850264406846,
"learning_rate": 8.198365107794457e-06,
"loss": 0.1444,
"step": 384
},
{
"epoch": 0.559593023255814,
"grad_norm": 1.5020289813450294,
"learning_rate": 8.189582156512484e-06,
"loss": 0.1403,
"step": 385
},
{
"epoch": 0.561046511627907,
"grad_norm": 1.4326207754311362,
"learning_rate": 8.180782578897225e-06,
"loss": 0.0998,
"step": 386
},
{
"epoch": 0.5625,
"grad_norm": 1.7057272776604826,
"learning_rate": 8.171966420818227e-06,
"loss": 0.1388,
"step": 387
},
{
"epoch": 0.563953488372093,
"grad_norm": 1.7222325633827718,
"learning_rate": 8.163133728231482e-06,
"loss": 0.1897,
"step": 388
},
{
"epoch": 0.565406976744186,
"grad_norm": 1.473100288550089,
"learning_rate": 8.154284547179158e-06,
"loss": 0.1419,
"step": 389
},
{
"epoch": 0.5668604651162791,
"grad_norm": 1.9904385322120357,
"learning_rate": 8.145418923789375e-06,
"loss": 0.1935,
"step": 390
},
{
"epoch": 0.5683139534883721,
"grad_norm": 1.8647312855206553,
"learning_rate": 8.136536904275965e-06,
"loss": 0.2022,
"step": 391
},
{
"epoch": 0.5697674418604651,
"grad_norm": 1.7630172852357997,
"learning_rate": 8.127638534938227e-06,
"loss": 0.1924,
"step": 392
},
{
"epoch": 0.5712209302325582,
"grad_norm": 1.3019122719624554,
"learning_rate": 8.118723862160687e-06,
"loss": 0.1469,
"step": 393
},
{
"epoch": 0.5726744186046512,
"grad_norm": 1.860210908405091,
"learning_rate": 8.109792932412853e-06,
"loss": 0.1508,
"step": 394
},
{
"epoch": 0.5741279069767442,
"grad_norm": 1.345826572185921,
"learning_rate": 8.10084579224898e-06,
"loss": 0.155,
"step": 395
},
{
"epoch": 0.5755813953488372,
"grad_norm": 1.9559560979112873,
"learning_rate": 8.09188248830782e-06,
"loss": 0.183,
"step": 396
},
{
"epoch": 0.5770348837209303,
"grad_norm": 1.7929107775519544,
"learning_rate": 8.082903067312384e-06,
"loss": 0.1219,
"step": 397
},
{
"epoch": 0.5784883720930233,
"grad_norm": 1.4217667098877558,
"learning_rate": 8.073907576069692e-06,
"loss": 0.1615,
"step": 398
},
{
"epoch": 0.5799418604651163,
"grad_norm": 1.74509835604015,
"learning_rate": 8.064896061470542e-06,
"loss": 0.1638,
"step": 399
},
{
"epoch": 0.5813953488372093,
"grad_norm": 1.412746062615197,
"learning_rate": 8.055868570489247e-06,
"loss": 0.1497,
"step": 400
},
{
"epoch": 0.5813953488372093,
"eval_loss": 0.1344260424375534,
"eval_runtime": 2.2004,
"eval_samples_per_second": 25.449,
"eval_steps_per_second": 6.362,
"step": 400
},
{
"epoch": 0.5828488372093024,
"grad_norm": 1.2068327544738824,
"learning_rate": 8.046825150183406e-06,
"loss": 0.1201,
"step": 401
},
{
"epoch": 0.5843023255813954,
"grad_norm": 1.6435893838564368,
"learning_rate": 8.037765847693652e-06,
"loss": 0.1145,
"step": 402
},
{
"epoch": 0.5857558139534884,
"grad_norm": 1.6608170672171527,
"learning_rate": 8.028690710243407e-06,
"loss": 0.1279,
"step": 403
},
{
"epoch": 0.5872093023255814,
"grad_norm": 1.4328859838303152,
"learning_rate": 8.019599785138635e-06,
"loss": 0.1373,
"step": 404
},
{
"epoch": 0.5886627906976745,
"grad_norm": 1.4343862675807864,
"learning_rate": 8.010493119767596e-06,
"loss": 0.1498,
"step": 405
},
{
"epoch": 0.5901162790697675,
"grad_norm": 1.6086081681876832,
"learning_rate": 8.001370761600598e-06,
"loss": 0.141,
"step": 406
},
{
"epoch": 0.5915697674418605,
"grad_norm": 2.1039051172285,
"learning_rate": 7.992232758189756e-06,
"loss": 0.1973,
"step": 407
},
{
"epoch": 0.5930232558139535,
"grad_norm": 1.8140948838773037,
"learning_rate": 7.983079157168736e-06,
"loss": 0.1748,
"step": 408
},
{
"epoch": 0.5944767441860465,
"grad_norm": 1.3362665282466653,
"learning_rate": 7.973910006252508e-06,
"loss": 0.1397,
"step": 409
},
{
"epoch": 0.5959302325581395,
"grad_norm": 1.5133021367871378,
"learning_rate": 7.9647253532371e-06,
"loss": 0.1227,
"step": 410
},
{
"epoch": 0.5973837209302325,
"grad_norm": 1.3742813696710579,
"learning_rate": 7.955525245999348e-06,
"loss": 0.1292,
"step": 411
},
{
"epoch": 0.5988372093023255,
"grad_norm": 1.1612211072347856,
"learning_rate": 7.946309732496646e-06,
"loss": 0.1167,
"step": 412
},
{
"epoch": 0.6002906976744186,
"grad_norm": 1.2766729466996067,
"learning_rate": 7.9370788607667e-06,
"loss": 0.1229,
"step": 413
},
{
"epoch": 0.6017441860465116,
"grad_norm": 1.5577927339728042,
"learning_rate": 7.927832678927265e-06,
"loss": 0.1267,
"step": 414
},
{
"epoch": 0.6031976744186046,
"grad_norm": 1.252636212314083,
"learning_rate": 7.918571235175914e-06,
"loss": 0.1487,
"step": 415
},
{
"epoch": 0.6046511627906976,
"grad_norm": 1.4878931355701959,
"learning_rate": 7.909294577789765e-06,
"loss": 0.1456,
"step": 416
},
{
"epoch": 0.6061046511627907,
"grad_norm": 1.6750463601592944,
"learning_rate": 7.900002755125249e-06,
"loss": 0.1539,
"step": 417
},
{
"epoch": 0.6075581395348837,
"grad_norm": 1.670161301173932,
"learning_rate": 7.890695815617844e-06,
"loss": 0.1588,
"step": 418
},
{
"epoch": 0.6090116279069767,
"grad_norm": 1.5574686810516989,
"learning_rate": 7.881373807781827e-06,
"loss": 0.1598,
"step": 419
},
{
"epoch": 0.6104651162790697,
"grad_norm": 1.4407087883766663,
"learning_rate": 7.872036780210025e-06,
"loss": 0.1292,
"step": 420
},
{
"epoch": 0.6119186046511628,
"grad_norm": 1.532394578818048,
"learning_rate": 7.86268478157356e-06,
"loss": 0.1417,
"step": 421
},
{
"epoch": 0.6133720930232558,
"grad_norm": 1.558475109143411,
"learning_rate": 7.853317860621586e-06,
"loss": 0.1243,
"step": 422
},
{
"epoch": 0.6148255813953488,
"grad_norm": 1.2251102819390556,
"learning_rate": 7.843936066181049e-06,
"loss": 0.1218,
"step": 423
},
{
"epoch": 0.6162790697674418,
"grad_norm": 1.3841095970618222,
"learning_rate": 7.834539447156424e-06,
"loss": 0.1085,
"step": 424
},
{
"epoch": 0.6177325581395349,
"grad_norm": 1.3356825728851078,
"learning_rate": 7.825128052529462e-06,
"loss": 0.1116,
"step": 425
},
{
"epoch": 0.6191860465116279,
"grad_norm": 1.5573547752820238,
"learning_rate": 7.815701931358934e-06,
"loss": 0.1388,
"step": 426
},
{
"epoch": 0.6206395348837209,
"grad_norm": 1.7038054695327371,
"learning_rate": 7.80626113278038e-06,
"loss": 0.1519,
"step": 427
},
{
"epoch": 0.622093023255814,
"grad_norm": 1.5094414416286532,
"learning_rate": 7.796805706005843e-06,
"loss": 0.1149,
"step": 428
},
{
"epoch": 0.623546511627907,
"grad_norm": 1.4273226525726925,
"learning_rate": 7.787335700323622e-06,
"loss": 0.1254,
"step": 429
},
{
"epoch": 0.625,
"grad_norm": 1.4966730289367747,
"learning_rate": 7.777851165098012e-06,
"loss": 0.156,
"step": 430
},
{
"epoch": 0.626453488372093,
"grad_norm": 1.6315240876401846,
"learning_rate": 7.768352149769044e-06,
"loss": 0.1621,
"step": 431
},
{
"epoch": 0.627906976744186,
"grad_norm": 1.7333824682174062,
"learning_rate": 7.75883870385223e-06,
"loss": 0.1376,
"step": 432
},
{
"epoch": 0.6293604651162791,
"grad_norm": 1.602227708449946,
"learning_rate": 7.749310876938306e-06,
"loss": 0.1735,
"step": 433
},
{
"epoch": 0.6308139534883721,
"grad_norm": 1.3977532588731882,
"learning_rate": 7.739768718692969e-06,
"loss": 0.1289,
"step": 434
},
{
"epoch": 0.6322674418604651,
"grad_norm": 1.268259604423468,
"learning_rate": 7.730212278856625e-06,
"loss": 0.1017,
"step": 435
},
{
"epoch": 0.6337209302325582,
"grad_norm": 1.3411672369388048,
"learning_rate": 7.72064160724412e-06,
"loss": 0.1362,
"step": 436
},
{
"epoch": 0.6351744186046512,
"grad_norm": 1.2808388499819041,
"learning_rate": 7.71105675374449e-06,
"loss": 0.1394,
"step": 437
},
{
"epoch": 0.6366279069767442,
"grad_norm": 1.295520262094418,
"learning_rate": 7.701457768320696e-06,
"loss": 0.1442,
"step": 438
},
{
"epoch": 0.6380813953488372,
"grad_norm": 2.109775997490956,
"learning_rate": 7.691844701009365e-06,
"loss": 0.1762,
"step": 439
},
{
"epoch": 0.6395348837209303,
"grad_norm": 1.8500489048534585,
"learning_rate": 7.682217601920529e-06,
"loss": 0.1796,
"step": 440
},
{
"epoch": 0.6409883720930233,
"grad_norm": 1.5939072801392646,
"learning_rate": 7.672576521237361e-06,
"loss": 0.1516,
"step": 441
},
{
"epoch": 0.6424418604651163,
"grad_norm": 1.6138091025138628,
"learning_rate": 7.662921509215916e-06,
"loss": 0.1829,
"step": 442
},
{
"epoch": 0.6438953488372093,
"grad_norm": 1.676099177115984,
"learning_rate": 7.653252616184875e-06,
"loss": 0.1237,
"step": 443
},
{
"epoch": 0.6453488372093024,
"grad_norm": 1.2244457381485303,
"learning_rate": 7.643569892545267e-06,
"loss": 0.1306,
"step": 444
},
{
"epoch": 0.6468023255813954,
"grad_norm": 1.5503434065388142,
"learning_rate": 7.633873388770223e-06,
"loss": 0.1432,
"step": 445
},
{
"epoch": 0.6482558139534884,
"grad_norm": 1.3861212995292134,
"learning_rate": 7.624163155404702e-06,
"loss": 0.1246,
"step": 446
},
{
"epoch": 0.6497093023255814,
"grad_norm": 1.4732536785794919,
"learning_rate": 7.614439243065235e-06,
"loss": 0.171,
"step": 447
},
{
"epoch": 0.6511627906976745,
"grad_norm": 1.9155294406879655,
"learning_rate": 7.604701702439652e-06,
"loss": 0.159,
"step": 448
},
{
"epoch": 0.6526162790697675,
"grad_norm": 1.866142590003918,
"learning_rate": 7.594950584286826e-06,
"loss": 0.1705,
"step": 449
},
{
"epoch": 0.6540697674418605,
"grad_norm": 1.3523787659587572,
"learning_rate": 7.585185939436409e-06,
"loss": 0.139,
"step": 450
},
{
"epoch": 0.6555232558139535,
"grad_norm": 1.314104202140968,
"learning_rate": 7.5754078187885586e-06,
"loss": 0.1222,
"step": 451
},
{
"epoch": 0.6569767441860465,
"grad_norm": 1.3649972766280627,
"learning_rate": 7.5656162733136776e-06,
"loss": 0.1429,
"step": 452
},
{
"epoch": 0.6584302325581395,
"grad_norm": 1.3599029511484766,
"learning_rate": 7.555811354052152e-06,
"loss": 0.1483,
"step": 453
},
{
"epoch": 0.6598837209302325,
"grad_norm": 1.249236779922474,
"learning_rate": 7.545993112114078e-06,
"loss": 0.135,
"step": 454
},
{
"epoch": 0.6613372093023255,
"grad_norm": 1.4068263051558898,
"learning_rate": 7.536161598679002e-06,
"loss": 0.14,
"step": 455
},
{
"epoch": 0.6627906976744186,
"grad_norm": 1.873871839563441,
"learning_rate": 7.526316864995648e-06,
"loss": 0.1585,
"step": 456
},
{
"epoch": 0.6642441860465116,
"grad_norm": 1.506919292093952,
"learning_rate": 7.516458962381654e-06,
"loss": 0.1308,
"step": 457
},
{
"epoch": 0.6656976744186046,
"grad_norm": 1.561352936275751,
"learning_rate": 7.506587942223305e-06,
"loss": 0.1374,
"step": 458
},
{
"epoch": 0.6671511627906976,
"grad_norm": 1.520294700430976,
"learning_rate": 7.4967038559752626e-06,
"loss": 0.1181,
"step": 459
},
{
"epoch": 0.6686046511627907,
"grad_norm": 1.6731627584388915,
"learning_rate": 7.486806755160298e-06,
"loss": 0.1595,
"step": 460
},
{
"epoch": 0.6700581395348837,
"grad_norm": 1.3704388693068905,
"learning_rate": 7.476896691369023e-06,
"loss": 0.1188,
"step": 461
},
{
"epoch": 0.6715116279069767,
"grad_norm": 1.4337091782306097,
"learning_rate": 7.466973716259622e-06,
"loss": 0.1132,
"step": 462
},
{
"epoch": 0.6729651162790697,
"grad_norm": 1.3716017344315858,
"learning_rate": 7.457037881557585e-06,
"loss": 0.1334,
"step": 463
},
{
"epoch": 0.6744186046511628,
"grad_norm": 1.5766750432421248,
"learning_rate": 7.447089239055428e-06,
"loss": 0.1143,
"step": 464
},
{
"epoch": 0.6758720930232558,
"grad_norm": 1.5864606252067934,
"learning_rate": 7.437127840612438e-06,
"loss": 0.1309,
"step": 465
},
{
"epoch": 0.6773255813953488,
"grad_norm": 1.4567542884868516,
"learning_rate": 7.4271537381543916e-06,
"loss": 0.147,
"step": 466
},
{
"epoch": 0.6787790697674418,
"grad_norm": 1.4514082699066,
"learning_rate": 7.417166983673286e-06,
"loss": 0.1551,
"step": 467
},
{
"epoch": 0.6802325581395349,
"grad_norm": 1.2445772110855982,
"learning_rate": 7.407167629227072e-06,
"loss": 0.1066,
"step": 468
},
{
"epoch": 0.6816860465116279,
"grad_norm": 1.370255809316678,
"learning_rate": 7.3971557269393805e-06,
"loss": 0.1377,
"step": 469
},
{
"epoch": 0.6831395348837209,
"grad_norm": 1.4592622873191892,
"learning_rate": 7.3871313289992466e-06,
"loss": 0.1314,
"step": 470
},
{
"epoch": 0.684593023255814,
"grad_norm": 1.6311413610361725,
"learning_rate": 7.377094487660847e-06,
"loss": 0.1628,
"step": 471
},
{
"epoch": 0.686046511627907,
"grad_norm": 1.5762097334368965,
"learning_rate": 7.367045255243217e-06,
"loss": 0.156,
"step": 472
},
{
"epoch": 0.6875,
"grad_norm": 1.7627902314989499,
"learning_rate": 7.3569836841299905e-06,
"loss": 0.1315,
"step": 473
},
{
"epoch": 0.688953488372093,
"grad_norm": 2.222043724851836,
"learning_rate": 7.346909826769107e-06,
"loss": 0.1731,
"step": 474
},
{
"epoch": 0.690406976744186,
"grad_norm": 1.5406069442857668,
"learning_rate": 7.336823735672563e-06,
"loss": 0.1386,
"step": 475
},
{
"epoch": 0.6918604651162791,
"grad_norm": 1.5796855692443843,
"learning_rate": 7.326725463416118e-06,
"loss": 0.15,
"step": 476
},
{
"epoch": 0.6933139534883721,
"grad_norm": 1.4662737935999774,
"learning_rate": 7.316615062639031e-06,
"loss": 0.1435,
"step": 477
},
{
"epoch": 0.6947674418604651,
"grad_norm": 1.2313632203692446,
"learning_rate": 7.306492586043783e-06,
"loss": 0.0919,
"step": 478
},
{
"epoch": 0.6962209302325582,
"grad_norm": 1.7509061840145403,
"learning_rate": 7.296358086395803e-06,
"loss": 0.1372,
"step": 479
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.6433660385495745,
"learning_rate": 7.286211616523193e-06,
"loss": 0.131,
"step": 480
},
{
"epoch": 0.6991279069767442,
"grad_norm": 1.8160423455057901,
"learning_rate": 7.276053229316451e-06,
"loss": 0.1945,
"step": 481
},
{
"epoch": 0.7005813953488372,
"grad_norm": 1.4727605332371438,
"learning_rate": 7.265882977728195e-06,
"loss": 0.1382,
"step": 482
},
{
"epoch": 0.7020348837209303,
"grad_norm": 1.4064135407009675,
"learning_rate": 7.255700914772891e-06,
"loss": 0.1266,
"step": 483
},
{
"epoch": 0.7034883720930233,
"grad_norm": 1.6003335768968785,
"learning_rate": 7.245507093526575e-06,
"loss": 0.169,
"step": 484
},
{
"epoch": 0.7049418604651163,
"grad_norm": 1.3092194579961982,
"learning_rate": 7.2353015671265716e-06,
"loss": 0.1184,
"step": 485
},
{
"epoch": 0.7063953488372093,
"grad_norm": 1.5489983894775146,
"learning_rate": 7.225084388771226e-06,
"loss": 0.1524,
"step": 486
},
{
"epoch": 0.7078488372093024,
"grad_norm": 1.4796324887099654,
"learning_rate": 7.214855611719616e-06,
"loss": 0.1592,
"step": 487
},
{
"epoch": 0.7093023255813954,
"grad_norm": 1.4347850977601093,
"learning_rate": 7.204615289291283e-06,
"loss": 0.1618,
"step": 488
},
{
"epoch": 0.7107558139534884,
"grad_norm": 1.5275682896148945,
"learning_rate": 7.194363474865951e-06,
"loss": 0.1281,
"step": 489
},
{
"epoch": 0.7122093023255814,
"grad_norm": 1.785937536200274,
"learning_rate": 7.184100221883248e-06,
"loss": 0.1454,
"step": 490
},
{
"epoch": 0.7136627906976745,
"grad_norm": 1.2976847404347416,
"learning_rate": 7.173825583842427e-06,
"loss": 0.1317,
"step": 491
},
{
"epoch": 0.7151162790697675,
"grad_norm": 1.166513744560416,
"learning_rate": 7.163539614302088e-06,
"loss": 0.13,
"step": 492
},
{
"epoch": 0.7165697674418605,
"grad_norm": 1.3524452986795923,
"learning_rate": 7.153242366879903e-06,
"loss": 0.1514,
"step": 493
},
{
"epoch": 0.7180232558139535,
"grad_norm": 1.1994423678946693,
"learning_rate": 7.142933895252324e-06,
"loss": 0.1194,
"step": 494
},
{
"epoch": 0.7194767441860465,
"grad_norm": 1.2798330620724652,
"learning_rate": 7.1326142531543184e-06,
"loss": 0.1322,
"step": 495
},
{
"epoch": 0.7209302325581395,
"grad_norm": 1.2830117846056146,
"learning_rate": 7.122283494379076e-06,
"loss": 0.1368,
"step": 496
},
{
"epoch": 0.7223837209302325,
"grad_norm": 1.6773615252194154,
"learning_rate": 7.1119416727777414e-06,
"loss": 0.152,
"step": 497
},
{
"epoch": 0.7238372093023255,
"grad_norm": 1.4919923732243248,
"learning_rate": 7.101588842259122e-06,
"loss": 0.1712,
"step": 498
},
{
"epoch": 0.7252906976744186,
"grad_norm": 1.4335685593529308,
"learning_rate": 7.09122505678941e-06,
"loss": 0.1476,
"step": 499
},
{
"epoch": 0.7267441860465116,
"grad_norm": 1.6645358118933977,
"learning_rate": 7.080850370391907e-06,
"loss": 0.1275,
"step": 500
},
{
"epoch": 0.7281976744186046,
"grad_norm": 1.4570433505689948,
"learning_rate": 7.0704648371467355e-06,
"loss": 0.1173,
"step": 501
},
{
"epoch": 0.7296511627906976,
"grad_norm": 1.4500084348795894,
"learning_rate": 7.060068511190559e-06,
"loss": 0.1343,
"step": 502
},
{
"epoch": 0.7311046511627907,
"grad_norm": 1.3583549655605316,
"learning_rate": 7.049661446716298e-06,
"loss": 0.1392,
"step": 503
},
{
"epoch": 0.7325581395348837,
"grad_norm": 1.678903155981633,
"learning_rate": 7.039243697972856e-06,
"loss": 0.1237,
"step": 504
},
{
"epoch": 0.7340116279069767,
"grad_norm": 1.1813005908965784,
"learning_rate": 7.028815319264825e-06,
"loss": 0.1239,
"step": 505
},
{
"epoch": 0.7354651162790697,
"grad_norm": 1.6621537638966262,
"learning_rate": 7.01837636495221e-06,
"loss": 0.1373,
"step": 506
},
{
"epoch": 0.7369186046511628,
"grad_norm": 1.4624579516595546,
"learning_rate": 7.007926889450142e-06,
"loss": 0.1074,
"step": 507
},
{
"epoch": 0.7383720930232558,
"grad_norm": 1.6615046264229711,
"learning_rate": 6.997466947228596e-06,
"loss": 0.149,
"step": 508
},
{
"epoch": 0.7398255813953488,
"grad_norm": 1.4750586970164907,
"learning_rate": 6.9869965928121095e-06,
"loss": 0.11,
"step": 509
},
{
"epoch": 0.7412790697674418,
"grad_norm": 1.2507783508104329,
"learning_rate": 6.976515880779492e-06,
"loss": 0.1141,
"step": 510
},
{
"epoch": 0.7427325581395349,
"grad_norm": 1.545121618824877,
"learning_rate": 6.966024865763546e-06,
"loss": 0.1291,
"step": 511
},
{
"epoch": 0.7441860465116279,
"grad_norm": 1.438312213107679,
"learning_rate": 6.95552360245078e-06,
"loss": 0.1502,
"step": 512
},
{
"epoch": 0.7456395348837209,
"grad_norm": 1.6592935733066172,
"learning_rate": 6.945012145581127e-06,
"loss": 0.175,
"step": 513
},
{
"epoch": 0.747093023255814,
"grad_norm": 1.514707717226784,
"learning_rate": 6.9344905499476475e-06,
"loss": 0.15,
"step": 514
},
{
"epoch": 0.748546511627907,
"grad_norm": 1.604276122246373,
"learning_rate": 6.9239588703962625e-06,
"loss": 0.1572,
"step": 515
},
{
"epoch": 0.75,
"grad_norm": 1.3281897884109624,
"learning_rate": 6.913417161825449e-06,
"loss": 0.1519,
"step": 516
},
{
"epoch": 0.751453488372093,
"grad_norm": 1.4267995539927456,
"learning_rate": 6.90286547918597e-06,
"loss": 0.1335,
"step": 517
},
{
"epoch": 0.752906976744186,
"grad_norm": 1.5170579839155427,
"learning_rate": 6.8923038774805705e-06,
"loss": 0.1519,
"step": 518
},
{
"epoch": 0.7543604651162791,
"grad_norm": 1.3797508342143487,
"learning_rate": 6.881732411763712e-06,
"loss": 0.1343,
"step": 519
},
{
"epoch": 0.7558139534883721,
"grad_norm": 1.5104127395592026,
"learning_rate": 6.871151137141266e-06,
"loss": 0.138,
"step": 520
},
{
"epoch": 0.7572674418604651,
"grad_norm": 1.3568718550092838,
"learning_rate": 6.860560108770238e-06,
"loss": 0.1367,
"step": 521
},
{
"epoch": 0.7587209302325582,
"grad_norm": 1.5091260876222339,
"learning_rate": 6.849959381858475e-06,
"loss": 0.1186,
"step": 522
},
{
"epoch": 0.7601744186046512,
"grad_norm": 1.318107897408335,
"learning_rate": 6.839349011664381e-06,
"loss": 0.111,
"step": 523
},
{
"epoch": 0.7616279069767442,
"grad_norm": 1.4214672012040783,
"learning_rate": 6.828729053496629e-06,
"loss": 0.1174,
"step": 524
},
{
"epoch": 0.7630813953488372,
"grad_norm": 1.4659206181489879,
"learning_rate": 6.8180995627138665e-06,
"loss": 0.1492,
"step": 525
},
{
"epoch": 0.7645348837209303,
"grad_norm": 1.7334326757131346,
"learning_rate": 6.80746059472444e-06,
"loss": 0.1363,
"step": 526
},
{
"epoch": 0.7659883720930233,
"grad_norm": 1.4971209402674073,
"learning_rate": 6.796812204986087e-06,
"loss": 0.1465,
"step": 527
},
{
"epoch": 0.7674418604651163,
"grad_norm": 1.4222274425753152,
"learning_rate": 6.786154449005664e-06,
"loss": 0.1356,
"step": 528
},
{
"epoch": 0.7688953488372093,
"grad_norm": 1.1331281504639361,
"learning_rate": 6.775487382338854e-06,
"loss": 0.0901,
"step": 529
},
{
"epoch": 0.7703488372093024,
"grad_norm": 1.4096278465211138,
"learning_rate": 6.764811060589867e-06,
"loss": 0.1083,
"step": 530
},
{
"epoch": 0.7718023255813954,
"grad_norm": 1.4098544395095702,
"learning_rate": 6.754125539411159e-06,
"loss": 0.1217,
"step": 531
},
{
"epoch": 0.7732558139534884,
"grad_norm": 1.131188089335205,
"learning_rate": 6.743430874503143e-06,
"loss": 0.1337,
"step": 532
},
{
"epoch": 0.7747093023255814,
"grad_norm": 1.2696620283091575,
"learning_rate": 6.732727121613894e-06,
"loss": 0.097,
"step": 533
},
{
"epoch": 0.7761627906976745,
"grad_norm": 1.5173238831421139,
"learning_rate": 6.722014336538858e-06,
"loss": 0.1178,
"step": 534
},
{
"epoch": 0.7776162790697675,
"grad_norm": 1.393380969750145,
"learning_rate": 6.7112925751205636e-06,
"loss": 0.1409,
"step": 535
},
{
"epoch": 0.7790697674418605,
"grad_norm": 1.5020699777762692,
"learning_rate": 6.700561893248332e-06,
"loss": 0.1635,
"step": 536
},
{
"epoch": 0.7805232558139535,
"grad_norm": 1.319982727969388,
"learning_rate": 6.689822346857983e-06,
"loss": 0.1047,
"step": 537
},
{
"epoch": 0.7819767441860465,
"grad_norm": 1.6061685140083173,
"learning_rate": 6.679073991931544e-06,
"loss": 0.1513,
"step": 538
},
{
"epoch": 0.7834302325581395,
"grad_norm": 1.451224509588678,
"learning_rate": 6.66831688449696e-06,
"loss": 0.1223,
"step": 539
},
{
"epoch": 0.7848837209302325,
"grad_norm": 1.3257073380514315,
"learning_rate": 6.657551080627801e-06,
"loss": 0.147,
"step": 540
},
{
"epoch": 0.7863372093023255,
"grad_norm": 1.412854606695532,
"learning_rate": 6.646776636442964e-06,
"loss": 0.1427,
"step": 541
},
{
"epoch": 0.7877906976744186,
"grad_norm": 1.2307963654566814,
"learning_rate": 6.63599360810639e-06,
"loss": 0.1149,
"step": 542
},
{
"epoch": 0.7892441860465116,
"grad_norm": 1.2378486319546478,
"learning_rate": 6.6252020518267664e-06,
"loss": 0.145,
"step": 543
},
{
"epoch": 0.7906976744186046,
"grad_norm": 1.4375410232495538,
"learning_rate": 6.614402023857231e-06,
"loss": 0.1458,
"step": 544
},
{
"epoch": 0.7921511627906976,
"grad_norm": 1.7682907013920814,
"learning_rate": 6.603593580495088e-06,
"loss": 0.1492,
"step": 545
},
{
"epoch": 0.7936046511627907,
"grad_norm": 1.5849164073586606,
"learning_rate": 6.5927767780815e-06,
"loss": 0.1244,
"step": 546
},
{
"epoch": 0.7950581395348837,
"grad_norm": 1.4013284382226423,
"learning_rate": 6.581951673001212e-06,
"loss": 0.1575,
"step": 547
},
{
"epoch": 0.7965116279069767,
"grad_norm": 1.6703536565707453,
"learning_rate": 6.5711183216822405e-06,
"loss": 0.1345,
"step": 548
},
{
"epoch": 0.7979651162790697,
"grad_norm": 1.5575391293614544,
"learning_rate": 6.56027678059559e-06,
"loss": 0.1424,
"step": 549
},
{
"epoch": 0.7994186046511628,
"grad_norm": 1.1506378616299937,
"learning_rate": 6.549427106254959e-06,
"loss": 0.1204,
"step": 550
},
{
"epoch": 0.8008720930232558,
"grad_norm": 1.6831420345046888,
"learning_rate": 6.5385693552164375e-06,
"loss": 0.1533,
"step": 551
},
{
"epoch": 0.8023255813953488,
"grad_norm": 1.5433557869849786,
"learning_rate": 6.527703584078219e-06,
"loss": 0.1287,
"step": 552
},
{
"epoch": 0.8037790697674418,
"grad_norm": 1.37938065902107,
"learning_rate": 6.516829849480304e-06,
"loss": 0.1466,
"step": 553
},
{
"epoch": 0.8052325581395349,
"grad_norm": 1.4138166190724506,
"learning_rate": 6.505948208104202e-06,
"loss": 0.1336,
"step": 554
},
{
"epoch": 0.8066860465116279,
"grad_norm": 1.2958346006809633,
"learning_rate": 6.495058716672641e-06,
"loss": 0.1155,
"step": 555
},
{
"epoch": 0.8081395348837209,
"grad_norm": 1.536894893444166,
"learning_rate": 6.4841614319492665e-06,
"loss": 0.1467,
"step": 556
},
{
"epoch": 0.809593023255814,
"grad_norm": 1.42823984914434,
"learning_rate": 6.473256410738349e-06,
"loss": 0.1245,
"step": 557
},
{
"epoch": 0.811046511627907,
"grad_norm": 1.738051363991152,
"learning_rate": 6.462343709884488e-06,
"loss": 0.1431,
"step": 558
},
{
"epoch": 0.8125,
"grad_norm": 1.199922342897142,
"learning_rate": 6.451423386272312e-06,
"loss": 0.1328,
"step": 559
},
{
"epoch": 0.813953488372093,
"grad_norm": 1.8691018081903936,
"learning_rate": 6.440495496826189e-06,
"loss": 0.1695,
"step": 560
},
{
"epoch": 0.815406976744186,
"grad_norm": 1.431507205382704,
"learning_rate": 6.429560098509919e-06,
"loss": 0.1519,
"step": 561
},
{
"epoch": 0.8168604651162791,
"grad_norm": 1.764161344903358,
"learning_rate": 6.4186172483264505e-06,
"loss": 0.1503,
"step": 562
},
{
"epoch": 0.8183139534883721,
"grad_norm": 1.3488150562790815,
"learning_rate": 6.4076670033175725e-06,
"loss": 0.1419,
"step": 563
},
{
"epoch": 0.8197674418604651,
"grad_norm": 1.6253830612966447,
"learning_rate": 6.396709420563621e-06,
"loss": 0.1779,
"step": 564
},
{
"epoch": 0.8212209302325582,
"grad_norm": 1.628949856290011,
"learning_rate": 6.385744557183181e-06,
"loss": 0.1325,
"step": 565
},
{
"epoch": 0.8226744186046512,
"grad_norm": 2.1235072067385237,
"learning_rate": 6.374772470332793e-06,
"loss": 0.1833,
"step": 566
},
{
"epoch": 0.8241279069767442,
"grad_norm": 1.9172827402856294,
"learning_rate": 6.363793217206645e-06,
"loss": 0.1903,
"step": 567
},
{
"epoch": 0.8255813953488372,
"grad_norm": 1.4322727783843214,
"learning_rate": 6.352806855036287e-06,
"loss": 0.1271,
"step": 568
},
{
"epoch": 0.8270348837209303,
"grad_norm": 1.2966960086456427,
"learning_rate": 6.341813441090323e-06,
"loss": 0.146,
"step": 569
},
{
"epoch": 0.8284883720930233,
"grad_norm": 1.550482403879787,
"learning_rate": 6.330813032674116e-06,
"loss": 0.1432,
"step": 570
},
{
"epoch": 0.8299418604651163,
"grad_norm": 1.3655414999390942,
"learning_rate": 6.3198056871294885e-06,
"loss": 0.109,
"step": 571
},
{
"epoch": 0.8313953488372093,
"grad_norm": 1.4217140051202517,
"learning_rate": 6.308791461834427e-06,
"loss": 0.1318,
"step": 572
},
{
"epoch": 0.8328488372093024,
"grad_norm": 1.7714886028240058,
"learning_rate": 6.297770414202778e-06,
"loss": 0.1777,
"step": 573
},
{
"epoch": 0.8343023255813954,
"grad_norm": 1.4570998125079995,
"learning_rate": 6.286742601683947e-06,
"loss": 0.1237,
"step": 574
},
{
"epoch": 0.8357558139534884,
"grad_norm": 1.4949515901539128,
"learning_rate": 6.275708081762611e-06,
"loss": 0.131,
"step": 575
},
{
"epoch": 0.8372093023255814,
"grad_norm": 1.3854089542275734,
"learning_rate": 6.264666911958404e-06,
"loss": 0.1125,
"step": 576
},
{
"epoch": 0.8386627906976745,
"grad_norm": 1.5479190064330035,
"learning_rate": 6.253619149825627e-06,
"loss": 0.1402,
"step": 577
},
{
"epoch": 0.8401162790697675,
"grad_norm": 1.4026025728759723,
"learning_rate": 6.242564852952946e-06,
"loss": 0.1209,
"step": 578
},
{
"epoch": 0.8415697674418605,
"grad_norm": 1.4121319145908862,
"learning_rate": 6.231504078963087e-06,
"loss": 0.138,
"step": 579
},
{
"epoch": 0.8430232558139535,
"grad_norm": 2.209836743441391,
"learning_rate": 6.220436885512539e-06,
"loss": 0.1362,
"step": 580
},
{
"epoch": 0.8444767441860465,
"grad_norm": 1.585787336115818,
"learning_rate": 6.209363330291261e-06,
"loss": 0.201,
"step": 581
},
{
"epoch": 0.8459302325581395,
"grad_norm": 1.9740958700357052,
"learning_rate": 6.198283471022362e-06,
"loss": 0.1469,
"step": 582
},
{
"epoch": 0.8473837209302325,
"grad_norm": 1.4412498217025638,
"learning_rate": 6.187197365461822e-06,
"loss": 0.1708,
"step": 583
},
{
"epoch": 0.8488372093023255,
"grad_norm": 1.4582305731121776,
"learning_rate": 6.1761050713981795e-06,
"loss": 0.1266,
"step": 584
},
{
"epoch": 0.8502906976744186,
"grad_norm": 1.450107224126935,
"learning_rate": 6.165006646652227e-06,
"loss": 0.1561,
"step": 585
},
{
"epoch": 0.8517441860465116,
"grad_norm": 2.004083462986721,
"learning_rate": 6.1539021490767206e-06,
"loss": 0.2133,
"step": 586
},
{
"epoch": 0.8531976744186046,
"grad_norm": 1.6247079616074993,
"learning_rate": 6.1427916365560666e-06,
"loss": 0.1742,
"step": 587
},
{
"epoch": 0.8546511627906976,
"grad_norm": 1.552175859788209,
"learning_rate": 6.1316751670060295e-06,
"loss": 0.136,
"step": 588
},
{
"epoch": 0.8561046511627907,
"grad_norm": 1.276124479823587,
"learning_rate": 6.120552798373423e-06,
"loss": 0.1694,
"step": 589
},
{
"epoch": 0.8575581395348837,
"grad_norm": 1.5410147115702781,
"learning_rate": 6.109424588635814e-06,
"loss": 0.105,
"step": 590
},
{
"epoch": 0.8590116279069767,
"grad_norm": 1.2952469635559278,
"learning_rate": 6.098290595801215e-06,
"loss": 0.1179,
"step": 591
},
{
"epoch": 0.8604651162790697,
"grad_norm": 1.1306177176882681,
"learning_rate": 6.087150877907786e-06,
"loss": 0.081,
"step": 592
},
{
"epoch": 0.8619186046511628,
"grad_norm": 1.7371127866399059,
"learning_rate": 6.076005493023527e-06,
"loss": 0.1893,
"step": 593
},
{
"epoch": 0.8633720930232558,
"grad_norm": 1.4638570381751588,
"learning_rate": 6.0648544992459804e-06,
"loss": 0.1572,
"step": 594
},
{
"epoch": 0.8648255813953488,
"grad_norm": 1.6634088466354064,
"learning_rate": 6.053697954701927e-06,
"loss": 0.1517,
"step": 595
},
{
"epoch": 0.8662790697674418,
"grad_norm": 1.620767793264197,
"learning_rate": 6.04253591754708e-06,
"loss": 0.1678,
"step": 596
},
{
"epoch": 0.8677325581395349,
"grad_norm": 1.6119437429007382,
"learning_rate": 6.031368445965784e-06,
"loss": 0.159,
"step": 597
},
{
"epoch": 0.8691860465116279,
"grad_norm": 1.78819833988465,
"learning_rate": 6.0201955981707135e-06,
"loss": 0.1993,
"step": 598
},
{
"epoch": 0.8706395348837209,
"grad_norm": 1.95544535687204,
"learning_rate": 6.009017432402569e-06,
"loss": 0.1759,
"step": 599
},
{
"epoch": 0.872093023255814,
"grad_norm": 1.5590305502883546,
"learning_rate": 5.997834006929765e-06,
"loss": 0.1469,
"step": 600
},
{
"epoch": 0.872093023255814,
"eval_loss": 0.1277909129858017,
"eval_runtime": 2.208,
"eval_samples_per_second": 25.362,
"eval_steps_per_second": 6.341,
"step": 600
},
{
"epoch": 0.873546511627907,
"grad_norm": 1.3164705679840971,
"learning_rate": 5.98664538004814e-06,
"loss": 0.1003,
"step": 601
},
{
"epoch": 0.875,
"grad_norm": 1.2239480234296696,
"learning_rate": 5.975451610080643e-06,
"loss": 0.1121,
"step": 602
},
{
"epoch": 0.876453488372093,
"grad_norm": 1.894849529945871,
"learning_rate": 5.964252755377033e-06,
"loss": 0.1654,
"step": 603
},
{
"epoch": 0.877906976744186,
"grad_norm": 1.2492208085247638,
"learning_rate": 5.953048874313575e-06,
"loss": 0.1122,
"step": 604
},
{
"epoch": 0.8793604651162791,
"grad_norm": 1.8022455407233433,
"learning_rate": 5.941840025292733e-06,
"loss": 0.1257,
"step": 605
},
{
"epoch": 0.8808139534883721,
"grad_norm": 1.3177318316240192,
"learning_rate": 5.930626266742871e-06,
"loss": 0.1383,
"step": 606
},
{
"epoch": 0.8822674418604651,
"grad_norm": 1.140730369983588,
"learning_rate": 5.9194076571179415e-06,
"loss": 0.116,
"step": 607
},
{
"epoch": 0.8837209302325582,
"grad_norm": 1.3723451817269945,
"learning_rate": 5.908184254897183e-06,
"loss": 0.1549,
"step": 608
},
{
"epoch": 0.8851744186046512,
"grad_norm": 1.5626353100716068,
"learning_rate": 5.89695611858482e-06,
"loss": 0.1442,
"step": 609
},
{
"epoch": 0.8866279069767442,
"grad_norm": 1.8839870797491676,
"learning_rate": 5.885723306709754e-06,
"loss": 0.1286,
"step": 610
},
{
"epoch": 0.8880813953488372,
"grad_norm": 1.082745017222708,
"learning_rate": 5.8744858778252555e-06,
"loss": 0.1223,
"step": 611
},
{
"epoch": 0.8895348837209303,
"grad_norm": 1.4674626548779213,
"learning_rate": 5.8632438905086685e-06,
"loss": 0.1456,
"step": 612
},
{
"epoch": 0.8909883720930233,
"grad_norm": 1.358671032886957,
"learning_rate": 5.851997403361089e-06,
"loss": 0.1299,
"step": 613
},
{
"epoch": 0.8924418604651163,
"grad_norm": 1.296241900835058,
"learning_rate": 5.840746475007079e-06,
"loss": 0.1419,
"step": 614
},
{
"epoch": 0.8938953488372093,
"grad_norm": 1.3842657938075646,
"learning_rate": 5.8294911640943455e-06,
"loss": 0.1215,
"step": 615
},
{
"epoch": 0.8953488372093024,
"grad_norm": 1.3467341878188124,
"learning_rate": 5.818231529293441e-06,
"loss": 0.1123,
"step": 616
},
{
"epoch": 0.8968023255813954,
"grad_norm": 1.4251072382018615,
"learning_rate": 5.80696762929746e-06,
"loss": 0.1423,
"step": 617
},
{
"epoch": 0.8982558139534884,
"grad_norm": 1.5077208770452775,
"learning_rate": 5.795699522821727e-06,
"loss": 0.1728,
"step": 618
},
{
"epoch": 0.8997093023255814,
"grad_norm": 1.248864820858094,
"learning_rate": 5.784427268603498e-06,
"loss": 0.1195,
"step": 619
},
{
"epoch": 0.9011627906976745,
"grad_norm": 1.4092688361917134,
"learning_rate": 5.773150925401642e-06,
"loss": 0.1266,
"step": 620
},
{
"epoch": 0.9026162790697675,
"grad_norm": 1.688504971336848,
"learning_rate": 5.761870551996349e-06,
"loss": 0.1379,
"step": 621
},
{
"epoch": 0.9040697674418605,
"grad_norm": 1.2541754034602337,
"learning_rate": 5.750586207188817e-06,
"loss": 0.0917,
"step": 622
},
{
"epoch": 0.9055232558139535,
"grad_norm": 1.2487120647276626,
"learning_rate": 5.7392979498009445e-06,
"loss": 0.1241,
"step": 623
},
{
"epoch": 0.9069767441860465,
"grad_norm": 1.445654790451785,
"learning_rate": 5.728005838675026e-06,
"loss": 0.1456,
"step": 624
},
{
"epoch": 0.9084302325581395,
"grad_norm": 1.7499610557683432,
"learning_rate": 5.7167099326734385e-06,
"loss": 0.1405,
"step": 625
},
{
"epoch": 0.9098837209302325,
"grad_norm": 1.648183080801681,
"learning_rate": 5.7054102906783526e-06,
"loss": 0.1822,
"step": 626
},
{
"epoch": 0.9113372093023255,
"grad_norm": 1.6571510451841074,
"learning_rate": 5.6941069715914e-06,
"loss": 0.1181,
"step": 627
},
{
"epoch": 0.9127906976744186,
"grad_norm": 1.944131860744343,
"learning_rate": 5.6828000343333904e-06,
"loss": 0.1719,
"step": 628
},
{
"epoch": 0.9142441860465116,
"grad_norm": 1.7309656879851627,
"learning_rate": 5.671489537843987e-06,
"loss": 0.1689,
"step": 629
},
{
"epoch": 0.9156976744186046,
"grad_norm": 1.7575196308441996,
"learning_rate": 5.660175541081411e-06,
"loss": 0.182,
"step": 630
},
{
"epoch": 0.9171511627906976,
"grad_norm": 1.277009648528385,
"learning_rate": 5.648858103022128e-06,
"loss": 0.1516,
"step": 631
},
{
"epoch": 0.9186046511627907,
"grad_norm": 1.340132227949514,
"learning_rate": 5.63753728266054e-06,
"loss": 0.1176,
"step": 632
},
{
"epoch": 0.9200581395348837,
"grad_norm": 1.7630027193552753,
"learning_rate": 5.626213139008684e-06,
"loss": 0.1677,
"step": 633
},
{
"epoch": 0.9215116279069767,
"grad_norm": 1.5111942011086947,
"learning_rate": 5.614885731095915e-06,
"loss": 0.1763,
"step": 634
},
{
"epoch": 0.9229651162790697,
"grad_norm": 1.3355309378554394,
"learning_rate": 5.603555117968607e-06,
"loss": 0.1073,
"step": 635
},
{
"epoch": 0.9244186046511628,
"grad_norm": 1.404078498056489,
"learning_rate": 5.592221358689843e-06,
"loss": 0.1249,
"step": 636
},
{
"epoch": 0.9258720930232558,
"grad_norm": 1.4270282877283114,
"learning_rate": 5.580884512339103e-06,
"loss": 0.1035,
"step": 637
},
{
"epoch": 0.9273255813953488,
"grad_norm": 1.5176928637053217,
"learning_rate": 5.56954463801196e-06,
"loss": 0.1373,
"step": 638
},
{
"epoch": 0.9287790697674418,
"grad_norm": 1.3456243259305802,
"learning_rate": 5.558201794819773e-06,
"loss": 0.1868,
"step": 639
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.5593018977581026,
"learning_rate": 5.546856041889374e-06,
"loss": 0.1429,
"step": 640
},
{
"epoch": 0.9316860465116279,
"grad_norm": 1.6976998517010211,
"learning_rate": 5.53550743836276e-06,
"loss": 0.1523,
"step": 641
},
{
"epoch": 0.9331395348837209,
"grad_norm": 1.3798256912079117,
"learning_rate": 5.524156043396796e-06,
"loss": 0.1179,
"step": 642
},
{
"epoch": 0.934593023255814,
"grad_norm": 1.4294475115030438,
"learning_rate": 5.512801916162891e-06,
"loss": 0.1309,
"step": 643
},
{
"epoch": 0.936046511627907,
"grad_norm": 2.7198043975539137,
"learning_rate": 5.501445115846697e-06,
"loss": 0.207,
"step": 644
},
{
"epoch": 0.9375,
"grad_norm": 1.4833934130748365,
"learning_rate": 5.490085701647805e-06,
"loss": 0.1355,
"step": 645
},
{
"epoch": 0.938953488372093,
"grad_norm": 1.3355964013808177,
"learning_rate": 5.478723732779422e-06,
"loss": 0.1374,
"step": 646
},
{
"epoch": 0.940406976744186,
"grad_norm": 1.7207500799533988,
"learning_rate": 5.467359268468081e-06,
"loss": 0.1605,
"step": 647
},
{
"epoch": 0.9418604651162791,
"grad_norm": 1.3090162322795527,
"learning_rate": 5.455992367953318e-06,
"loss": 0.1445,
"step": 648
},
{
"epoch": 0.9433139534883721,
"grad_norm": 1.2788214859695954,
"learning_rate": 5.444623090487371e-06,
"loss": 0.1047,
"step": 649
},
{
"epoch": 0.9447674418604651,
"grad_norm": 2.0071938164433365,
"learning_rate": 5.433251495334864e-06,
"loss": 0.1546,
"step": 650
},
{
"epoch": 0.9462209302325582,
"grad_norm": 1.3782058820605825,
"learning_rate": 5.4218776417725095e-06,
"loss": 0.136,
"step": 651
},
{
"epoch": 0.9476744186046512,
"grad_norm": 1.281462748132549,
"learning_rate": 5.410501589088786e-06,
"loss": 0.1035,
"step": 652
},
{
"epoch": 0.9491279069767442,
"grad_norm": 1.9048686906108325,
"learning_rate": 5.3991233965836365e-06,
"loss": 0.1683,
"step": 653
},
{
"epoch": 0.9505813953488372,
"grad_norm": 1.6316273841656876,
"learning_rate": 5.387743123568161e-06,
"loss": 0.1446,
"step": 654
},
{
"epoch": 0.9520348837209303,
"grad_norm": 1.4861937009380384,
"learning_rate": 5.376360829364301e-06,
"loss": 0.1527,
"step": 655
},
{
"epoch": 0.9534883720930233,
"grad_norm": 1.8814574763895897,
"learning_rate": 5.364976573304538e-06,
"loss": 0.1614,
"step": 656
},
{
"epoch": 0.9549418604651163,
"grad_norm": 1.4646624685889762,
"learning_rate": 5.3535904147315765e-06,
"loss": 0.1543,
"step": 657
},
{
"epoch": 0.9563953488372093,
"grad_norm": 1.287864663906961,
"learning_rate": 5.34220241299804e-06,
"loss": 0.1279,
"step": 658
},
{
"epoch": 0.9578488372093024,
"grad_norm": 1.7385545460984706,
"learning_rate": 5.330812627466159e-06,
"loss": 0.1639,
"step": 659
},
{
"epoch": 0.9593023255813954,
"grad_norm": 1.7081543020679955,
"learning_rate": 5.319421117507461e-06,
"loss": 0.1597,
"step": 660
},
{
"epoch": 0.9607558139534884,
"grad_norm": 1.3995536288669932,
"learning_rate": 5.308027942502467e-06,
"loss": 0.1103,
"step": 661
},
{
"epoch": 0.9622093023255814,
"grad_norm": 1.2943413923835272,
"learning_rate": 5.296633161840374e-06,
"loss": 0.159,
"step": 662
},
{
"epoch": 0.9636627906976745,
"grad_norm": 1.5010261384648311,
"learning_rate": 5.285236834918749e-06,
"loss": 0.1101,
"step": 663
},
{
"epoch": 0.9651162790697675,
"grad_norm": 1.59839024793064,
"learning_rate": 5.273839021143217e-06,
"loss": 0.1438,
"step": 664
},
{
"epoch": 0.9665697674418605,
"grad_norm": 1.3887266849885156,
"learning_rate": 5.262439779927163e-06,
"loss": 0.1336,
"step": 665
},
{
"epoch": 0.9680232558139535,
"grad_norm": 1.2605940576749957,
"learning_rate": 5.251039170691399e-06,
"loss": 0.1168,
"step": 666
},
{
"epoch": 0.9694767441860465,
"grad_norm": 1.4056700350437759,
"learning_rate": 5.2396372528638785e-06,
"loss": 0.1501,
"step": 667
},
{
"epoch": 0.9709302325581395,
"grad_norm": 1.4534488094786742,
"learning_rate": 5.22823408587937e-06,
"loss": 0.1336,
"step": 668
},
{
"epoch": 0.9723837209302325,
"grad_norm": 1.336814955827563,
"learning_rate": 5.216829729179158e-06,
"loss": 0.1126,
"step": 669
},
{
"epoch": 0.9738372093023255,
"grad_norm": 1.5142125626046417,
"learning_rate": 5.205424242210727e-06,
"loss": 0.1096,
"step": 670
},
{
"epoch": 0.9752906976744186,
"grad_norm": 1.4755522779468442,
"learning_rate": 5.194017684427453e-06,
"loss": 0.1087,
"step": 671
},
{
"epoch": 0.9767441860465116,
"grad_norm": 1.5661924416341968,
"learning_rate": 5.182610115288296e-06,
"loss": 0.1274,
"step": 672
},
{
"epoch": 0.9781976744186046,
"grad_norm": 1.9124058904385859,
"learning_rate": 5.171201594257481e-06,
"loss": 0.1314,
"step": 673
},
{
"epoch": 0.9796511627906976,
"grad_norm": 1.504304863764823,
"learning_rate": 5.159792180804204e-06,
"loss": 0.1759,
"step": 674
},
{
"epoch": 0.9811046511627907,
"grad_norm": 1.6754651562035756,
"learning_rate": 5.148381934402306e-06,
"loss": 0.1911,
"step": 675
},
{
"epoch": 0.9825581395348837,
"grad_norm": 1.4176711979057277,
"learning_rate": 5.136970914529975e-06,
"loss": 0.1305,
"step": 676
},
{
"epoch": 0.9840116279069767,
"grad_norm": 1.5962091642432297,
"learning_rate": 5.125559180669427e-06,
"loss": 0.1385,
"step": 677
},
{
"epoch": 0.9854651162790697,
"grad_norm": 1.223911365889511,
"learning_rate": 5.1141467923066016e-06,
"loss": 0.1169,
"step": 678
},
{
"epoch": 0.9869186046511628,
"grad_norm": 1.5617758354090914,
"learning_rate": 5.102733808930851e-06,
"loss": 0.1172,
"step": 679
},
{
"epoch": 0.9883720930232558,
"grad_norm": 1.6974851747495288,
"learning_rate": 5.0913202900346246e-06,
"loss": 0.13,
"step": 680
},
{
"epoch": 0.9898255813953488,
"grad_norm": 1.631411370867565,
"learning_rate": 5.07990629511317e-06,
"loss": 0.1454,
"step": 681
},
{
"epoch": 0.9912790697674418,
"grad_norm": 1.5300412665655438,
"learning_rate": 5.068491883664212e-06,
"loss": 0.109,
"step": 682
},
{
"epoch": 0.9927325581395349,
"grad_norm": 1.442218618017608,
"learning_rate": 5.057077115187645e-06,
"loss": 0.1429,
"step": 683
},
{
"epoch": 0.9941860465116279,
"grad_norm": 1.347588478220433,
"learning_rate": 5.04566204918523e-06,
"loss": 0.1255,
"step": 684
},
{
"epoch": 0.9956395348837209,
"grad_norm": 1.605701847815844,
"learning_rate": 5.034246745160275e-06,
"loss": 0.1533,
"step": 685
},
{
"epoch": 0.997093023255814,
"grad_norm": 1.4017801022069876,
"learning_rate": 5.022831262617328e-06,
"loss": 0.1291,
"step": 686
},
{
"epoch": 0.998546511627907,
"grad_norm": 1.5723087938578602,
"learning_rate": 5.011415661061869e-06,
"loss": 0.106,
"step": 687
},
{
"epoch": 1.0,
"grad_norm": 1.160944506207306,
"learning_rate": 5e-06,
"loss": 0.0753,
"step": 688
},
{
"epoch": 1.001453488372093,
"grad_norm": 0.9804794119294494,
"learning_rate": 4.988584338938133e-06,
"loss": 0.0865,
"step": 689
},
{
"epoch": 1.002906976744186,
"grad_norm": 0.8710757605721173,
"learning_rate": 4.977168737382674e-06,
"loss": 0.0666,
"step": 690
},
{
"epoch": 1.004360465116279,
"grad_norm": 1.195545295432579,
"learning_rate": 4.965753254839727e-06,
"loss": 0.0836,
"step": 691
},
{
"epoch": 1.005813953488372,
"grad_norm": 1.2127630890866723,
"learning_rate": 4.954337950814771e-06,
"loss": 0.0551,
"step": 692
},
{
"epoch": 1.007267441860465,
"grad_norm": 1.5404241417212168,
"learning_rate": 4.942922884812357e-06,
"loss": 0.1009,
"step": 693
},
{
"epoch": 1.0087209302325582,
"grad_norm": 1.1556879357489478,
"learning_rate": 4.9315081163357905e-06,
"loss": 0.0675,
"step": 694
},
{
"epoch": 1.010174418604651,
"grad_norm": 1.1774111310837216,
"learning_rate": 4.920093704886832e-06,
"loss": 0.0639,
"step": 695
},
{
"epoch": 1.0116279069767442,
"grad_norm": 1.1528720659312301,
"learning_rate": 4.908679709965376e-06,
"loss": 0.0548,
"step": 696
},
{
"epoch": 1.0130813953488371,
"grad_norm": 1.1786682354719682,
"learning_rate": 4.897266191069152e-06,
"loss": 0.0793,
"step": 697
},
{
"epoch": 1.0145348837209303,
"grad_norm": 1.1681775175087292,
"learning_rate": 4.8858532076934e-06,
"loss": 0.0621,
"step": 698
},
{
"epoch": 1.0159883720930232,
"grad_norm": 1.396149038821934,
"learning_rate": 4.874440819330576e-06,
"loss": 0.0595,
"step": 699
},
{
"epoch": 1.0174418604651163,
"grad_norm": 1.0444832188691113,
"learning_rate": 4.8630290854700264e-06,
"loss": 0.06,
"step": 700
},
{
"epoch": 1.0188953488372092,
"grad_norm": 1.3215192237780464,
"learning_rate": 4.851618065597696e-06,
"loss": 0.0594,
"step": 701
},
{
"epoch": 1.0203488372093024,
"grad_norm": 1.3022953280724392,
"learning_rate": 4.840207819195797e-06,
"loss": 0.0582,
"step": 702
},
{
"epoch": 1.0218023255813953,
"grad_norm": 1.2305716639772417,
"learning_rate": 4.82879840574252e-06,
"loss": 0.0626,
"step": 703
},
{
"epoch": 1.0232558139534884,
"grad_norm": 1.612550300365213,
"learning_rate": 4.817389884711706e-06,
"loss": 0.0739,
"step": 704
},
{
"epoch": 1.0247093023255813,
"grad_norm": 1.2261189219475501,
"learning_rate": 4.805982315572547e-06,
"loss": 0.0842,
"step": 705
},
{
"epoch": 1.0261627906976745,
"grad_norm": 1.5497272626438672,
"learning_rate": 4.794575757789274e-06,
"loss": 0.0808,
"step": 706
},
{
"epoch": 1.0276162790697674,
"grad_norm": 1.5930387295456183,
"learning_rate": 4.7831702708208445e-06,
"loss": 0.0617,
"step": 707
},
{
"epoch": 1.0290697674418605,
"grad_norm": 1.3103541839768624,
"learning_rate": 4.7717659141206315e-06,
"loss": 0.0728,
"step": 708
},
{
"epoch": 1.0305232558139534,
"grad_norm": 1.2251435492506793,
"learning_rate": 4.760362747136125e-06,
"loss": 0.0707,
"step": 709
},
{
"epoch": 1.0319767441860466,
"grad_norm": 1.3796327304677727,
"learning_rate": 4.748960829308601e-06,
"loss": 0.0681,
"step": 710
},
{
"epoch": 1.0334302325581395,
"grad_norm": 1.1725032499441765,
"learning_rate": 4.737560220072839e-06,
"loss": 0.0565,
"step": 711
},
{
"epoch": 1.0348837209302326,
"grad_norm": 1.245077983095157,
"learning_rate": 4.726160978856782e-06,
"loss": 0.0528,
"step": 712
},
{
"epoch": 1.0363372093023255,
"grad_norm": 1.1988572535104782,
"learning_rate": 4.714763165081253e-06,
"loss": 0.0677,
"step": 713
},
{
"epoch": 1.0377906976744187,
"grad_norm": 1.4960786476723407,
"learning_rate": 4.703366838159627e-06,
"loss": 0.0639,
"step": 714
},
{
"epoch": 1.0392441860465116,
"grad_norm": 1.4091361813904233,
"learning_rate": 4.691972057497534e-06,
"loss": 0.0602,
"step": 715
},
{
"epoch": 1.0406976744186047,
"grad_norm": 1.5037404883411951,
"learning_rate": 4.6805788824925395e-06,
"loss": 0.0705,
"step": 716
},
{
"epoch": 1.0421511627906976,
"grad_norm": 1.2607654648800852,
"learning_rate": 4.669187372533843e-06,
"loss": 0.053,
"step": 717
},
{
"epoch": 1.0436046511627908,
"grad_norm": 1.6239843045120586,
"learning_rate": 4.657797587001961e-06,
"loss": 0.0704,
"step": 718
},
{
"epoch": 1.0450581395348837,
"grad_norm": 1.5045378575805983,
"learning_rate": 4.646409585268425e-06,
"loss": 0.0606,
"step": 719
},
{
"epoch": 1.0465116279069768,
"grad_norm": 1.291286419079524,
"learning_rate": 4.635023426695462e-06,
"loss": 0.0644,
"step": 720
},
{
"epoch": 1.0479651162790697,
"grad_norm": 1.370959757593235,
"learning_rate": 4.6236391706357e-06,
"loss": 0.0704,
"step": 721
},
{
"epoch": 1.0494186046511629,
"grad_norm": 1.5664627647333818,
"learning_rate": 4.612256876431839e-06,
"loss": 0.0498,
"step": 722
},
{
"epoch": 1.0508720930232558,
"grad_norm": 1.2766329620982255,
"learning_rate": 4.600876603416364e-06,
"loss": 0.0473,
"step": 723
},
{
"epoch": 1.052325581395349,
"grad_norm": 1.5959190709847693,
"learning_rate": 4.589498410911215e-06,
"loss": 0.083,
"step": 724
},
{
"epoch": 1.0537790697674418,
"grad_norm": 1.2654889143492782,
"learning_rate": 4.578122358227492e-06,
"loss": 0.061,
"step": 725
},
{
"epoch": 1.055232558139535,
"grad_norm": 1.232952014205962,
"learning_rate": 4.566748504665136e-06,
"loss": 0.0601,
"step": 726
},
{
"epoch": 1.056686046511628,
"grad_norm": 1.3155701382257572,
"learning_rate": 4.555376909512631e-06,
"loss": 0.0537,
"step": 727
},
{
"epoch": 1.058139534883721,
"grad_norm": 1.6520885025976517,
"learning_rate": 4.544007632046682e-06,
"loss": 0.0798,
"step": 728
},
{
"epoch": 1.059593023255814,
"grad_norm": 1.2269767633790387,
"learning_rate": 4.532640731531921e-06,
"loss": 0.0454,
"step": 729
},
{
"epoch": 1.0610465116279069,
"grad_norm": 1.6055062515206473,
"learning_rate": 4.52127626722058e-06,
"loss": 0.1011,
"step": 730
},
{
"epoch": 1.0625,
"grad_norm": 1.5283525172896468,
"learning_rate": 4.509914298352197e-06,
"loss": 0.0507,
"step": 731
},
{
"epoch": 1.0639534883720931,
"grad_norm": 1.4633436434507294,
"learning_rate": 4.4985548841533035e-06,
"loss": 0.0948,
"step": 732
},
{
"epoch": 1.065406976744186,
"grad_norm": 1.3019061971094648,
"learning_rate": 4.487198083837111e-06,
"loss": 0.0631,
"step": 733
},
{
"epoch": 1.066860465116279,
"grad_norm": 1.3266926997772812,
"learning_rate": 4.475843956603205e-06,
"loss": 0.0604,
"step": 734
},
{
"epoch": 1.068313953488372,
"grad_norm": 1.3235367351231275,
"learning_rate": 4.4644925616372405e-06,
"loss": 0.0605,
"step": 735
},
{
"epoch": 1.069767441860465,
"grad_norm": 1.295361077164503,
"learning_rate": 4.4531439581106295e-06,
"loss": 0.06,
"step": 736
},
{
"epoch": 1.0712209302325582,
"grad_norm": 1.6139195727409428,
"learning_rate": 4.441798205180228e-06,
"loss": 0.0797,
"step": 737
},
{
"epoch": 1.072674418604651,
"grad_norm": 1.2563569911360324,
"learning_rate": 4.430455361988041e-06,
"loss": 0.0705,
"step": 738
},
{
"epoch": 1.0741279069767442,
"grad_norm": 1.2037437189059568,
"learning_rate": 4.419115487660899e-06,
"loss": 0.06,
"step": 739
},
{
"epoch": 1.0755813953488371,
"grad_norm": 1.2888284666571121,
"learning_rate": 4.40777864131016e-06,
"loss": 0.0574,
"step": 740
},
{
"epoch": 1.0770348837209303,
"grad_norm": 1.4666943653511182,
"learning_rate": 4.396444882031394e-06,
"loss": 0.0748,
"step": 741
},
{
"epoch": 1.0784883720930232,
"grad_norm": 1.1651868897402284,
"learning_rate": 4.3851142689040885e-06,
"loss": 0.0624,
"step": 742
},
{
"epoch": 1.0799418604651163,
"grad_norm": 1.6627210680829636,
"learning_rate": 4.373786860991318e-06,
"loss": 0.0521,
"step": 743
},
{
"epoch": 1.0813953488372092,
"grad_norm": 1.7854261575212669,
"learning_rate": 4.3624627173394615e-06,
"loss": 0.0956,
"step": 744
},
{
"epoch": 1.0828488372093024,
"grad_norm": 1.281448055061643,
"learning_rate": 4.351141896977874e-06,
"loss": 0.0618,
"step": 745
},
{
"epoch": 1.0843023255813953,
"grad_norm": 1.9115172580935775,
"learning_rate": 4.339824458918592e-06,
"loss": 0.0778,
"step": 746
},
{
"epoch": 1.0857558139534884,
"grad_norm": 1.1325100660387584,
"learning_rate": 4.328510462156015e-06,
"loss": 0.0447,
"step": 747
},
{
"epoch": 1.0872093023255813,
"grad_norm": 1.2777657933057078,
"learning_rate": 4.317199965666613e-06,
"loss": 0.0757,
"step": 748
},
{
"epoch": 1.0886627906976745,
"grad_norm": 1.3785018962846736,
"learning_rate": 4.305893028408601e-06,
"loss": 0.0654,
"step": 749
},
{
"epoch": 1.0901162790697674,
"grad_norm": 1.2206761011457805,
"learning_rate": 4.294589709321651e-06,
"loss": 0.0724,
"step": 750
},
{
"epoch": 1.0915697674418605,
"grad_norm": 1.4018327959477974,
"learning_rate": 4.283290067326562e-06,
"loss": 0.0904,
"step": 751
},
{
"epoch": 1.0930232558139534,
"grad_norm": 1.5155226786120202,
"learning_rate": 4.271994161324977e-06,
"loss": 0.0475,
"step": 752
},
{
"epoch": 1.0944767441860466,
"grad_norm": 1.5020821031236455,
"learning_rate": 4.260702050199056e-06,
"loss": 0.0627,
"step": 753
},
{
"epoch": 1.0959302325581395,
"grad_norm": 1.1544970293337866,
"learning_rate": 4.2494137928111835e-06,
"loss": 0.0731,
"step": 754
},
{
"epoch": 1.0973837209302326,
"grad_norm": 1.156709990172585,
"learning_rate": 4.238129448003651e-06,
"loss": 0.0496,
"step": 755
},
{
"epoch": 1.0988372093023255,
"grad_norm": 1.0821603719730972,
"learning_rate": 4.22684907459836e-06,
"loss": 0.0619,
"step": 756
},
{
"epoch": 1.1002906976744187,
"grad_norm": 1.1819763770544474,
"learning_rate": 4.215572731396504e-06,
"loss": 0.0582,
"step": 757
},
{
"epoch": 1.1017441860465116,
"grad_norm": 1.3810737668324913,
"learning_rate": 4.204300477178274e-06,
"loss": 0.0484,
"step": 758
},
{
"epoch": 1.1031976744186047,
"grad_norm": 1.6953078691365866,
"learning_rate": 4.19303237070254e-06,
"loss": 0.0856,
"step": 759
},
{
"epoch": 1.1046511627906976,
"grad_norm": 1.2505887912927889,
"learning_rate": 4.181768470706561e-06,
"loss": 0.0498,
"step": 760
},
{
"epoch": 1.1061046511627908,
"grad_norm": 1.3867145509235537,
"learning_rate": 4.170508835905655e-06,
"loss": 0.0529,
"step": 761
},
{
"epoch": 1.1075581395348837,
"grad_norm": 1.731884519992483,
"learning_rate": 4.159253524992922e-06,
"loss": 0.0649,
"step": 762
},
{
"epoch": 1.1090116279069768,
"grad_norm": 1.6560829967990272,
"learning_rate": 4.148002596638911e-06,
"loss": 0.0575,
"step": 763
},
{
"epoch": 1.1104651162790697,
"grad_norm": 1.95162595456697,
"learning_rate": 4.136756109491333e-06,
"loss": 0.081,
"step": 764
},
{
"epoch": 1.1119186046511629,
"grad_norm": 1.1885528527482156,
"learning_rate": 4.1255141221747445e-06,
"loss": 0.0531,
"step": 765
},
{
"epoch": 1.1133720930232558,
"grad_norm": 1.5782587513465858,
"learning_rate": 4.1142766932902475e-06,
"loss": 0.0698,
"step": 766
},
{
"epoch": 1.114825581395349,
"grad_norm": 1.7115336320491816,
"learning_rate": 4.103043881415181e-06,
"loss": 0.0743,
"step": 767
},
{
"epoch": 1.1162790697674418,
"grad_norm": 1.3705963951525217,
"learning_rate": 4.091815745102818e-06,
"loss": 0.0634,
"step": 768
},
{
"epoch": 1.117732558139535,
"grad_norm": 1.2666345308767393,
"learning_rate": 4.080592342882059e-06,
"loss": 0.0557,
"step": 769
},
{
"epoch": 1.119186046511628,
"grad_norm": 1.466660608860313,
"learning_rate": 4.069373733257129e-06,
"loss": 0.0532,
"step": 770
},
{
"epoch": 1.120639534883721,
"grad_norm": 1.9380585631227165,
"learning_rate": 4.058159974707267e-06,
"loss": 0.0838,
"step": 771
},
{
"epoch": 1.122093023255814,
"grad_norm": 1.3123759677425353,
"learning_rate": 4.046951125686427e-06,
"loss": 0.0661,
"step": 772
},
{
"epoch": 1.1235465116279069,
"grad_norm": 1.3861730341247065,
"learning_rate": 4.035747244622968e-06,
"loss": 0.0732,
"step": 773
},
{
"epoch": 1.125,
"grad_norm": 1.5843091362943043,
"learning_rate": 4.02454838991936e-06,
"loss": 0.0601,
"step": 774
},
{
"epoch": 1.1264534883720931,
"grad_norm": 1.1871559950997301,
"learning_rate": 4.013354619951864e-06,
"loss": 0.0542,
"step": 775
},
{
"epoch": 1.127906976744186,
"grad_norm": 1.4833725641864457,
"learning_rate": 4.002165993070237e-06,
"loss": 0.0421,
"step": 776
},
{
"epoch": 1.129360465116279,
"grad_norm": 1.2607191005902103,
"learning_rate": 3.990982567597434e-06,
"loss": 0.0675,
"step": 777
},
{
"epoch": 1.130813953488372,
"grad_norm": 1.302371033029548,
"learning_rate": 3.979804401829287e-06,
"loss": 0.0552,
"step": 778
},
{
"epoch": 1.1322674418604652,
"grad_norm": 1.0982776342344591,
"learning_rate": 3.968631554034219e-06,
"loss": 0.0417,
"step": 779
},
{
"epoch": 1.1337209302325582,
"grad_norm": 1.0186685111120157,
"learning_rate": 3.957464082452922e-06,
"loss": 0.0433,
"step": 780
},
{
"epoch": 1.135174418604651,
"grad_norm": 1.405227738570767,
"learning_rate": 3.946302045298076e-06,
"loss": 0.0589,
"step": 781
},
{
"epoch": 1.1366279069767442,
"grad_norm": 1.7360986285508477,
"learning_rate": 3.93514550075402e-06,
"loss": 0.0831,
"step": 782
},
{
"epoch": 1.1380813953488371,
"grad_norm": 1.3982460147182094,
"learning_rate": 3.923994506976475e-06,
"loss": 0.0717,
"step": 783
},
{
"epoch": 1.1395348837209303,
"grad_norm": 1.67771687763503,
"learning_rate": 3.912849122092216e-06,
"loss": 0.0623,
"step": 784
},
{
"epoch": 1.1409883720930232,
"grad_norm": 1.36066170200081,
"learning_rate": 3.901709404198787e-06,
"loss": 0.0632,
"step": 785
},
{
"epoch": 1.1424418604651163,
"grad_norm": 1.724665327424766,
"learning_rate": 3.890575411364187e-06,
"loss": 0.0904,
"step": 786
},
{
"epoch": 1.1438953488372092,
"grad_norm": 1.292310436918187,
"learning_rate": 3.879447201626579e-06,
"loss": 0.0486,
"step": 787
},
{
"epoch": 1.1453488372093024,
"grad_norm": 1.3385498197221257,
"learning_rate": 3.868324832993972e-06,
"loss": 0.0543,
"step": 788
},
{
"epoch": 1.1468023255813953,
"grad_norm": 1.3062276729009943,
"learning_rate": 3.857208363443936e-06,
"loss": 0.0607,
"step": 789
},
{
"epoch": 1.1482558139534884,
"grad_norm": 1.6713853170193331,
"learning_rate": 3.84609785092328e-06,
"loss": 0.0541,
"step": 790
},
{
"epoch": 1.1497093023255813,
"grad_norm": 1.4621166936575898,
"learning_rate": 3.834993353347774e-06,
"loss": 0.0822,
"step": 791
},
{
"epoch": 1.1511627906976745,
"grad_norm": 1.3920933935968336,
"learning_rate": 3.823894928601822e-06,
"loss": 0.0502,
"step": 792
},
{
"epoch": 1.1526162790697674,
"grad_norm": 1.3690857531456042,
"learning_rate": 3.8128026345381804e-06,
"loss": 0.0726,
"step": 793
},
{
"epoch": 1.1540697674418605,
"grad_norm": 1.435007037574927,
"learning_rate": 3.8017165289776397e-06,
"loss": 0.0733,
"step": 794
},
{
"epoch": 1.1555232558139534,
"grad_norm": 1.3209050161532427,
"learning_rate": 3.7906366697087426e-06,
"loss": 0.0625,
"step": 795
},
{
"epoch": 1.1569767441860466,
"grad_norm": 1.3609715123113075,
"learning_rate": 3.7795631144874607e-06,
"loss": 0.064,
"step": 796
},
{
"epoch": 1.1584302325581395,
"grad_norm": 1.35359644663218,
"learning_rate": 3.768495921036915e-06,
"loss": 0.0786,
"step": 797
},
{
"epoch": 1.1598837209302326,
"grad_norm": 1.4861713191531716,
"learning_rate": 3.7574351470470547e-06,
"loss": 0.0777,
"step": 798
},
{
"epoch": 1.1613372093023255,
"grad_norm": 1.256830182158305,
"learning_rate": 3.7463808501743736e-06,
"loss": 0.0649,
"step": 799
},
{
"epoch": 1.1627906976744187,
"grad_norm": 1.18227664622794,
"learning_rate": 3.7353330880415963e-06,
"loss": 0.0614,
"step": 800
},
{
"epoch": 1.1627906976744187,
"eval_loss": 0.13292963802814484,
"eval_runtime": 2.205,
"eval_samples_per_second": 25.396,
"eval_steps_per_second": 6.349,
"step": 800
},
{
"epoch": 1.1642441860465116,
"grad_norm": 1.3697674011754686,
"learning_rate": 3.724291918237391e-06,
"loss": 0.0802,
"step": 801
},
{
"epoch": 1.1656976744186047,
"grad_norm": 1.5758218959573673,
"learning_rate": 3.7132573983160538e-06,
"loss": 0.0555,
"step": 802
},
{
"epoch": 1.1671511627906976,
"grad_norm": 1.4752518250909372,
"learning_rate": 3.7022295857972244e-06,
"loss": 0.0598,
"step": 803
},
{
"epoch": 1.1686046511627908,
"grad_norm": 1.1107721704788946,
"learning_rate": 3.691208538165574e-06,
"loss": 0.0526,
"step": 804
},
{
"epoch": 1.1700581395348837,
"grad_norm": 1.5316239040252175,
"learning_rate": 3.6801943128705128e-06,
"loss": 0.0642,
"step": 805
},
{
"epoch": 1.1715116279069768,
"grad_norm": 1.1009808525064773,
"learning_rate": 3.6691869673258847e-06,
"loss": 0.0579,
"step": 806
},
{
"epoch": 1.1729651162790697,
"grad_norm": 1.2132447629779688,
"learning_rate": 3.6581865589096784e-06,
"loss": 0.0534,
"step": 807
},
{
"epoch": 1.1744186046511629,
"grad_norm": 1.21822644945824,
"learning_rate": 3.6471931449637127e-06,
"loss": 0.0478,
"step": 808
},
{
"epoch": 1.1758720930232558,
"grad_norm": 2.1071653837605484,
"learning_rate": 3.6362067827933555e-06,
"loss": 0.0591,
"step": 809
},
{
"epoch": 1.177325581395349,
"grad_norm": 1.2907948357744514,
"learning_rate": 3.625227529667209e-06,
"loss": 0.0492,
"step": 810
},
{
"epoch": 1.1787790697674418,
"grad_norm": 1.3081083590425786,
"learning_rate": 3.6142554428168208e-06,
"loss": 0.0641,
"step": 811
},
{
"epoch": 1.1802325581395348,
"grad_norm": 1.497246846467104,
"learning_rate": 3.6032905794363805e-06,
"loss": 0.0657,
"step": 812
},
{
"epoch": 1.181686046511628,
"grad_norm": 1.3989638492911172,
"learning_rate": 3.5923329966824288e-06,
"loss": 0.0677,
"step": 813
},
{
"epoch": 1.183139534883721,
"grad_norm": 1.4702223684938749,
"learning_rate": 3.5813827516735487e-06,
"loss": 0.0858,
"step": 814
},
{
"epoch": 1.184593023255814,
"grad_norm": 1.2980406228178047,
"learning_rate": 3.5704399014900814e-06,
"loss": 0.0547,
"step": 815
},
{
"epoch": 1.1860465116279069,
"grad_norm": 1.2387292846340658,
"learning_rate": 3.5595045031738123e-06,
"loss": 0.0477,
"step": 816
},
{
"epoch": 1.1875,
"grad_norm": 1.2267059696407838,
"learning_rate": 3.5485766137276894e-06,
"loss": 0.0499,
"step": 817
},
{
"epoch": 1.1889534883720931,
"grad_norm": 1.5850681514152112,
"learning_rate": 3.5376562901155138e-06,
"loss": 0.0608,
"step": 818
},
{
"epoch": 1.190406976744186,
"grad_norm": 1.4591340811318565,
"learning_rate": 3.526743589261652e-06,
"loss": 0.0553,
"step": 819
},
{
"epoch": 1.191860465116279,
"grad_norm": 1.3466268312861094,
"learning_rate": 3.5158385680507356e-06,
"loss": 0.0601,
"step": 820
},
{
"epoch": 1.193313953488372,
"grad_norm": 1.28302549307778,
"learning_rate": 3.50494128332736e-06,
"loss": 0.0614,
"step": 821
},
{
"epoch": 1.1947674418604652,
"grad_norm": 1.5064832595806237,
"learning_rate": 3.4940517918958e-06,
"loss": 0.053,
"step": 822
},
{
"epoch": 1.1962209302325582,
"grad_norm": 1.241785767259479,
"learning_rate": 3.483170150519697e-06,
"loss": 0.0648,
"step": 823
},
{
"epoch": 1.197674418604651,
"grad_norm": 1.9169832841939474,
"learning_rate": 3.472296415921783e-06,
"loss": 0.0653,
"step": 824
},
{
"epoch": 1.1991279069767442,
"grad_norm": 1.1075005753142289,
"learning_rate": 3.4614306447835646e-06,
"loss": 0.0623,
"step": 825
},
{
"epoch": 1.2005813953488371,
"grad_norm": 1.4624159678780777,
"learning_rate": 3.4505728937450437e-06,
"loss": 0.0502,
"step": 826
},
{
"epoch": 1.2020348837209303,
"grad_norm": 1.4873573941031213,
"learning_rate": 3.439723219404411e-06,
"loss": 0.0589,
"step": 827
},
{
"epoch": 1.2034883720930232,
"grad_norm": 1.3071195675919425,
"learning_rate": 3.4288816783177624e-06,
"loss": 0.059,
"step": 828
},
{
"epoch": 1.2049418604651163,
"grad_norm": 1.3306075906173456,
"learning_rate": 3.41804832699879e-06,
"loss": 0.0557,
"step": 829
},
{
"epoch": 1.2063953488372092,
"grad_norm": 1.2154802175356552,
"learning_rate": 3.407223221918501e-06,
"loss": 0.0593,
"step": 830
},
{
"epoch": 1.2078488372093024,
"grad_norm": 1.368819793191012,
"learning_rate": 3.396406419504914e-06,
"loss": 0.0756,
"step": 831
},
{
"epoch": 1.2093023255813953,
"grad_norm": 1.6765723354799054,
"learning_rate": 3.3855979761427705e-06,
"loss": 0.0546,
"step": 832
},
{
"epoch": 1.2107558139534884,
"grad_norm": 1.4117226452497533,
"learning_rate": 3.3747979481732352e-06,
"loss": 0.0665,
"step": 833
},
{
"epoch": 1.2122093023255813,
"grad_norm": 1.2207966641443189,
"learning_rate": 3.364006391893612e-06,
"loss": 0.0481,
"step": 834
},
{
"epoch": 1.2136627906976745,
"grad_norm": 1.4940343341477946,
"learning_rate": 3.3532233635570377e-06,
"loss": 0.0578,
"step": 835
},
{
"epoch": 1.2151162790697674,
"grad_norm": 1.3800156284226655,
"learning_rate": 3.3424489193722016e-06,
"loss": 0.0642,
"step": 836
},
{
"epoch": 1.2165697674418605,
"grad_norm": 1.5526090076765855,
"learning_rate": 3.331683115503041e-06,
"loss": 0.0639,
"step": 837
},
{
"epoch": 1.2180232558139534,
"grad_norm": 1.2911489672133427,
"learning_rate": 3.320926008068458e-06,
"loss": 0.0577,
"step": 838
},
{
"epoch": 1.2194767441860466,
"grad_norm": 1.546598681848458,
"learning_rate": 3.310177653142018e-06,
"loss": 0.0593,
"step": 839
},
{
"epoch": 1.2209302325581395,
"grad_norm": 1.188966656927106,
"learning_rate": 3.2994381067516702e-06,
"loss": 0.067,
"step": 840
},
{
"epoch": 1.2223837209302326,
"grad_norm": 1.3683724251027738,
"learning_rate": 3.2887074248794372e-06,
"loss": 0.0661,
"step": 841
},
{
"epoch": 1.2238372093023255,
"grad_norm": 1.4058614221882173,
"learning_rate": 3.2779856634611433e-06,
"loss": 0.0519,
"step": 842
},
{
"epoch": 1.2252906976744187,
"grad_norm": 1.1579274317787078,
"learning_rate": 3.267272878386106e-06,
"loss": 0.0699,
"step": 843
},
{
"epoch": 1.2267441860465116,
"grad_norm": 1.2336765642615921,
"learning_rate": 3.256569125496858e-06,
"loss": 0.0526,
"step": 844
},
{
"epoch": 1.2281976744186047,
"grad_norm": 1.590986386650674,
"learning_rate": 3.2458744605888414e-06,
"loss": 0.0653,
"step": 845
},
{
"epoch": 1.2296511627906976,
"grad_norm": 1.2675149654017217,
"learning_rate": 3.2351889394101356e-06,
"loss": 0.0517,
"step": 846
},
{
"epoch": 1.2311046511627908,
"grad_norm": 1.4746783771397893,
"learning_rate": 3.224512617661147e-06,
"loss": 0.0597,
"step": 847
},
{
"epoch": 1.2325581395348837,
"grad_norm": 1.3951470526945027,
"learning_rate": 3.2138455509943365e-06,
"loss": 0.06,
"step": 848
},
{
"epoch": 1.2340116279069768,
"grad_norm": 1.6377226127179718,
"learning_rate": 3.2031877950139138e-06,
"loss": 0.0634,
"step": 849
},
{
"epoch": 1.2354651162790697,
"grad_norm": 1.3590191499897781,
"learning_rate": 3.192539405275562e-06,
"loss": 0.0585,
"step": 850
},
{
"epoch": 1.2369186046511629,
"grad_norm": 1.197442084738658,
"learning_rate": 3.181900437286133e-06,
"loss": 0.0522,
"step": 851
},
{
"epoch": 1.2383720930232558,
"grad_norm": 1.457106341008041,
"learning_rate": 3.171270946503373e-06,
"loss": 0.0757,
"step": 852
},
{
"epoch": 1.239825581395349,
"grad_norm": 1.1462139163166636,
"learning_rate": 3.160650988335619e-06,
"loss": 0.066,
"step": 853
},
{
"epoch": 1.2412790697674418,
"grad_norm": 1.422733421409013,
"learning_rate": 3.1500406181415266e-06,
"loss": 0.0574,
"step": 854
},
{
"epoch": 1.2427325581395348,
"grad_norm": 1.3764162372997557,
"learning_rate": 3.1394398912297623e-06,
"loss": 0.0569,
"step": 855
},
{
"epoch": 1.244186046511628,
"grad_norm": 1.3748066305788302,
"learning_rate": 3.1288488628587343e-06,
"loss": 0.079,
"step": 856
},
{
"epoch": 1.245639534883721,
"grad_norm": 1.5587103993805567,
"learning_rate": 3.118267588236288e-06,
"loss": 0.048,
"step": 857
},
{
"epoch": 1.247093023255814,
"grad_norm": 1.0821878347641152,
"learning_rate": 3.1076961225194303e-06,
"loss": 0.0373,
"step": 858
},
{
"epoch": 1.2485465116279069,
"grad_norm": 1.3196026626843278,
"learning_rate": 3.0971345208140315e-06,
"loss": 0.0762,
"step": 859
},
{
"epoch": 1.25,
"grad_norm": 1.2188350465078943,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.0603,
"step": 860
},
{
"epoch": 1.2514534883720931,
"grad_norm": 1.2251652362284926,
"learning_rate": 3.07604112960374e-06,
"loss": 0.0622,
"step": 861
},
{
"epoch": 1.252906976744186,
"grad_norm": 1.3179349347598328,
"learning_rate": 3.065509450052353e-06,
"loss": 0.0834,
"step": 862
},
{
"epoch": 1.254360465116279,
"grad_norm": 1.5906225203629438,
"learning_rate": 3.054987854418876e-06,
"loss": 0.0809,
"step": 863
},
{
"epoch": 1.255813953488372,
"grad_norm": 1.4130566616345097,
"learning_rate": 3.044476397549221e-06,
"loss": 0.0654,
"step": 864
},
{
"epoch": 1.2572674418604652,
"grad_norm": 1.743946208757902,
"learning_rate": 3.0339751342364563e-06,
"loss": 0.0625,
"step": 865
},
{
"epoch": 1.2587209302325582,
"grad_norm": 1.7612769821987542,
"learning_rate": 3.02348411922051e-06,
"loss": 0.0642,
"step": 866
},
{
"epoch": 1.260174418604651,
"grad_norm": 3.522383016452111,
"learning_rate": 3.0130034071878935e-06,
"loss": 0.0452,
"step": 867
},
{
"epoch": 1.2616279069767442,
"grad_norm": 1.8108815533825797,
"learning_rate": 3.002533052771405e-06,
"loss": 0.069,
"step": 868
},
{
"epoch": 1.2630813953488373,
"grad_norm": 1.4705889091500777,
"learning_rate": 2.99207311054986e-06,
"loss": 0.0665,
"step": 869
},
{
"epoch": 1.2645348837209303,
"grad_norm": 1.2345293570590237,
"learning_rate": 2.9816236350477924e-06,
"loss": 0.0471,
"step": 870
},
{
"epoch": 1.2659883720930232,
"grad_norm": 1.3814272608337272,
"learning_rate": 2.9711846807351775e-06,
"loss": 0.0497,
"step": 871
},
{
"epoch": 1.2674418604651163,
"grad_norm": 1.4737560562363339,
"learning_rate": 2.9607563020271446e-06,
"loss": 0.0754,
"step": 872
},
{
"epoch": 1.2688953488372092,
"grad_norm": 1.2213469734360816,
"learning_rate": 2.950338553283704e-06,
"loss": 0.0439,
"step": 873
},
{
"epoch": 1.2703488372093024,
"grad_norm": 1.0228539211087246,
"learning_rate": 2.939931488809443e-06,
"loss": 0.0453,
"step": 874
},
{
"epoch": 1.2718023255813953,
"grad_norm": 1.3148270871243914,
"learning_rate": 2.9295351628532666e-06,
"loss": 0.0691,
"step": 875
},
{
"epoch": 1.2732558139534884,
"grad_norm": 1.6172892866219821,
"learning_rate": 2.9191496296080935e-06,
"loss": 0.0716,
"step": 876
},
{
"epoch": 1.2747093023255813,
"grad_norm": 1.5950616252422385,
"learning_rate": 2.9087749432105917e-06,
"loss": 0.0691,
"step": 877
},
{
"epoch": 1.2761627906976745,
"grad_norm": 1.465303805672259,
"learning_rate": 2.898411157740879e-06,
"loss": 0.0642,
"step": 878
},
{
"epoch": 1.2776162790697674,
"grad_norm": 1.234388737205486,
"learning_rate": 2.8880583272222594e-06,
"loss": 0.0485,
"step": 879
},
{
"epoch": 1.2790697674418605,
"grad_norm": 1.088371039773677,
"learning_rate": 2.8777165056209256e-06,
"loss": 0.0435,
"step": 880
},
{
"epoch": 1.2805232558139534,
"grad_norm": 1.2648066111949126,
"learning_rate": 2.867385746845685e-06,
"loss": 0.0707,
"step": 881
},
{
"epoch": 1.2819767441860466,
"grad_norm": 1.429685475726185,
"learning_rate": 2.8570661047476773e-06,
"loss": 0.0561,
"step": 882
},
{
"epoch": 1.2834302325581395,
"grad_norm": 1.9989759755669896,
"learning_rate": 2.8467576331200986e-06,
"loss": 0.0794,
"step": 883
},
{
"epoch": 1.2848837209302326,
"grad_norm": 1.2470700343219876,
"learning_rate": 2.836460385697911e-06,
"loss": 0.046,
"step": 884
},
{
"epoch": 1.2863372093023255,
"grad_norm": 1.3164750906741658,
"learning_rate": 2.8261744161575745e-06,
"loss": 0.0658,
"step": 885
},
{
"epoch": 1.2877906976744187,
"grad_norm": 1.1137226528892454,
"learning_rate": 2.815899778116753e-06,
"loss": 0.0438,
"step": 886
},
{
"epoch": 1.2892441860465116,
"grad_norm": 1.5963496710705627,
"learning_rate": 2.80563652513405e-06,
"loss": 0.0768,
"step": 887
},
{
"epoch": 1.2906976744186047,
"grad_norm": 1.2418886512188267,
"learning_rate": 2.7953847107087173e-06,
"loss": 0.0476,
"step": 888
},
{
"epoch": 1.2921511627906976,
"grad_norm": 1.4960278840419248,
"learning_rate": 2.785144388280385e-06,
"loss": 0.0737,
"step": 889
},
{
"epoch": 1.2936046511627908,
"grad_norm": 1.078019987993341,
"learning_rate": 2.7749156112287746e-06,
"loss": 0.0686,
"step": 890
},
{
"epoch": 1.2950581395348837,
"grad_norm": 1.3385018973715532,
"learning_rate": 2.7646984328734284e-06,
"loss": 0.0422,
"step": 891
},
{
"epoch": 1.2965116279069768,
"grad_norm": 1.5906816734841456,
"learning_rate": 2.754492906473425e-06,
"loss": 0.0555,
"step": 892
},
{
"epoch": 1.2979651162790697,
"grad_norm": 1.2925137226713737,
"learning_rate": 2.744299085227109e-06,
"loss": 0.0472,
"step": 893
},
{
"epoch": 1.2994186046511627,
"grad_norm": 1.259018228463516,
"learning_rate": 2.7341170222718073e-06,
"loss": 0.0517,
"step": 894
},
{
"epoch": 1.3008720930232558,
"grad_norm": 1.350283378016509,
"learning_rate": 2.723946770683552e-06,
"loss": 0.0496,
"step": 895
},
{
"epoch": 1.302325581395349,
"grad_norm": 1.3897000129677248,
"learning_rate": 2.7137883834768076e-06,
"loss": 0.0555,
"step": 896
},
{
"epoch": 1.3037790697674418,
"grad_norm": 1.4501986699541858,
"learning_rate": 2.703641913604198e-06,
"loss": 0.0734,
"step": 897
},
{
"epoch": 1.3052325581395348,
"grad_norm": 1.316272770181074,
"learning_rate": 2.6935074139562174e-06,
"loss": 0.0598,
"step": 898
},
{
"epoch": 1.306686046511628,
"grad_norm": 1.1543542777127582,
"learning_rate": 2.683384937360971e-06,
"loss": 0.0635,
"step": 899
},
{
"epoch": 1.308139534883721,
"grad_norm": 1.2569738112859057,
"learning_rate": 2.673274536583883e-06,
"loss": 0.0493,
"step": 900
},
{
"epoch": 1.309593023255814,
"grad_norm": 1.5911605897872676,
"learning_rate": 2.663176264327439e-06,
"loss": 0.064,
"step": 901
},
{
"epoch": 1.3110465116279069,
"grad_norm": 1.398413502674623,
"learning_rate": 2.6530901732308934e-06,
"loss": 0.0542,
"step": 902
},
{
"epoch": 1.3125,
"grad_norm": 1.481895453387704,
"learning_rate": 2.6430163158700116e-06,
"loss": 0.0423,
"step": 903
},
{
"epoch": 1.3139534883720931,
"grad_norm": 1.5404341782721638,
"learning_rate": 2.632954744756784e-06,
"loss": 0.0743,
"step": 904
},
{
"epoch": 1.315406976744186,
"grad_norm": 1.6089449641602316,
"learning_rate": 2.6229055123391545e-06,
"loss": 0.0762,
"step": 905
},
{
"epoch": 1.316860465116279,
"grad_norm": 1.3087109651209734,
"learning_rate": 2.612868671000755e-06,
"loss": 0.063,
"step": 906
},
{
"epoch": 1.318313953488372,
"grad_norm": 1.3502162014519732,
"learning_rate": 2.602844273060623e-06,
"loss": 0.0613,
"step": 907
},
{
"epoch": 1.3197674418604652,
"grad_norm": 1.757116221919893,
"learning_rate": 2.592832370772931e-06,
"loss": 0.0598,
"step": 908
},
{
"epoch": 1.3212209302325582,
"grad_norm": 1.440632285315959,
"learning_rate": 2.582833016326716e-06,
"loss": 0.0603,
"step": 909
},
{
"epoch": 1.322674418604651,
"grad_norm": 1.3695411985419055,
"learning_rate": 2.5728462618456114e-06,
"loss": 0.0603,
"step": 910
},
{
"epoch": 1.3241279069767442,
"grad_norm": 1.3717727081965483,
"learning_rate": 2.562872159387563e-06,
"loss": 0.0489,
"step": 911
},
{
"epoch": 1.3255813953488373,
"grad_norm": 1.3965140784846173,
"learning_rate": 2.5529107609445737e-06,
"loss": 0.053,
"step": 912
},
{
"epoch": 1.3270348837209303,
"grad_norm": 1.3605550236382897,
"learning_rate": 2.542962118442417e-06,
"loss": 0.0652,
"step": 913
},
{
"epoch": 1.3284883720930232,
"grad_norm": 1.5017426449957705,
"learning_rate": 2.5330262837403795e-06,
"loss": 0.059,
"step": 914
},
{
"epoch": 1.3299418604651163,
"grad_norm": 1.2686790614066732,
"learning_rate": 2.523103308630978e-06,
"loss": 0.059,
"step": 915
},
{
"epoch": 1.3313953488372092,
"grad_norm": 1.4932765351503499,
"learning_rate": 2.513193244839704e-06,
"loss": 0.0801,
"step": 916
},
{
"epoch": 1.3328488372093024,
"grad_norm": 1.277822451358384,
"learning_rate": 2.5032961440247382e-06,
"loss": 0.0563,
"step": 917
},
{
"epoch": 1.3343023255813953,
"grad_norm": 1.1466090066744306,
"learning_rate": 2.4934120577766963e-06,
"loss": 0.0482,
"step": 918
},
{
"epoch": 1.3357558139534884,
"grad_norm": 1.3716592416122075,
"learning_rate": 2.483541037618346e-06,
"loss": 0.0618,
"step": 919
},
{
"epoch": 1.3372093023255813,
"grad_norm": 1.365751544441765,
"learning_rate": 2.473683135004354e-06,
"loss": 0.062,
"step": 920
},
{
"epoch": 1.3386627906976745,
"grad_norm": 2.264876190842683,
"learning_rate": 2.4638384013210004e-06,
"loss": 0.1066,
"step": 921
},
{
"epoch": 1.3401162790697674,
"grad_norm": 1.6250492792457274,
"learning_rate": 2.4540068878859247e-06,
"loss": 0.0857,
"step": 922
},
{
"epoch": 1.3415697674418605,
"grad_norm": 1.821123714104965,
"learning_rate": 2.4441886459478502e-06,
"loss": 0.0484,
"step": 923
},
{
"epoch": 1.3430232558139534,
"grad_norm": 1.3628978122194464,
"learning_rate": 2.4343837266863245e-06,
"loss": 0.0716,
"step": 924
},
{
"epoch": 1.3444767441860466,
"grad_norm": 1.2104456979218408,
"learning_rate": 2.4245921812114427e-06,
"loss": 0.0413,
"step": 925
},
{
"epoch": 1.3459302325581395,
"grad_norm": 1.4006337692831987,
"learning_rate": 2.4148140605635923e-06,
"loss": 0.0884,
"step": 926
},
{
"epoch": 1.3473837209302326,
"grad_norm": 1.5775325655062982,
"learning_rate": 2.405049415713173e-06,
"loss": 0.0826,
"step": 927
},
{
"epoch": 1.3488372093023255,
"grad_norm": 1.3469035877818512,
"learning_rate": 2.3952982975603494e-06,
"loss": 0.0528,
"step": 928
},
{
"epoch": 1.3502906976744187,
"grad_norm": 1.3912044037862688,
"learning_rate": 2.385560756934765e-06,
"loss": 0.0683,
"step": 929
},
{
"epoch": 1.3517441860465116,
"grad_norm": 1.4330392863334833,
"learning_rate": 2.3758368445952977e-06,
"loss": 0.0615,
"step": 930
},
{
"epoch": 1.3531976744186047,
"grad_norm": 1.353177391111986,
"learning_rate": 2.3661266112297765e-06,
"loss": 0.0441,
"step": 931
},
{
"epoch": 1.3546511627906976,
"grad_norm": 1.4685441680460056,
"learning_rate": 2.356430107454733e-06,
"loss": 0.0719,
"step": 932
},
{
"epoch": 1.3561046511627908,
"grad_norm": 1.3373925778152866,
"learning_rate": 2.346747383815126e-06,
"loss": 0.0581,
"step": 933
},
{
"epoch": 1.3575581395348837,
"grad_norm": 1.6807992341418785,
"learning_rate": 2.337078490784084e-06,
"loss": 0.083,
"step": 934
},
{
"epoch": 1.3590116279069768,
"grad_norm": 1.3439853412810583,
"learning_rate": 2.32742347876264e-06,
"loss": 0.0601,
"step": 935
},
{
"epoch": 1.3604651162790697,
"grad_norm": 1.3060456175816213,
"learning_rate": 2.317782398079473e-06,
"loss": 0.055,
"step": 936
},
{
"epoch": 1.3619186046511627,
"grad_norm": 1.5527718046211736,
"learning_rate": 2.3081552989906347e-06,
"loss": 0.0734,
"step": 937
},
{
"epoch": 1.3633720930232558,
"grad_norm": 1.5997787553142,
"learning_rate": 2.298542231679305e-06,
"loss": 0.0727,
"step": 938
},
{
"epoch": 1.364825581395349,
"grad_norm": 1.647243597942959,
"learning_rate": 2.2889432462555106e-06,
"loss": 0.069,
"step": 939
},
{
"epoch": 1.3662790697674418,
"grad_norm": 1.4131937959777625,
"learning_rate": 2.279358392755882e-06,
"loss": 0.0572,
"step": 940
},
{
"epoch": 1.3677325581395348,
"grad_norm": 1.2921469692871526,
"learning_rate": 2.269787721143376e-06,
"loss": 0.0558,
"step": 941
},
{
"epoch": 1.369186046511628,
"grad_norm": 1.5778632142343019,
"learning_rate": 2.2602312813070315e-06,
"loss": 0.0653,
"step": 942
},
{
"epoch": 1.370639534883721,
"grad_norm": 1.1650103416898503,
"learning_rate": 2.250689123061694e-06,
"loss": 0.0583,
"step": 943
},
{
"epoch": 1.372093023255814,
"grad_norm": 1.638353937587653,
"learning_rate": 2.2411612961477704e-06,
"loss": 0.0954,
"step": 944
},
{
"epoch": 1.3735465116279069,
"grad_norm": 1.3332240689990287,
"learning_rate": 2.2316478502309576e-06,
"loss": 0.0686,
"step": 945
},
{
"epoch": 1.375,
"grad_norm": 1.4897450673269428,
"learning_rate": 2.2221488349019903e-06,
"loss": 0.0637,
"step": 946
},
{
"epoch": 1.3764534883720931,
"grad_norm": 1.2433687422683675,
"learning_rate": 2.2126642996763793e-06,
"loss": 0.0636,
"step": 947
},
{
"epoch": 1.377906976744186,
"grad_norm": 1.5438951823738334,
"learning_rate": 2.203194293994159e-06,
"loss": 0.063,
"step": 948
},
{
"epoch": 1.379360465116279,
"grad_norm": 1.353307173474805,
"learning_rate": 2.193738867219623e-06,
"loss": 0.0551,
"step": 949
},
{
"epoch": 1.380813953488372,
"grad_norm": 1.3926599213141382,
"learning_rate": 2.184298068641067e-06,
"loss": 0.0658,
"step": 950
},
{
"epoch": 1.3822674418604652,
"grad_norm": 1.5876307826339826,
"learning_rate": 2.174871947470541e-06,
"loss": 0.0714,
"step": 951
},
{
"epoch": 1.3837209302325582,
"grad_norm": 1.5940606667679074,
"learning_rate": 2.1654605528435774e-06,
"loss": 0.0984,
"step": 952
},
{
"epoch": 1.385174418604651,
"grad_norm": 1.6476409988363916,
"learning_rate": 2.1560639338189533e-06,
"loss": 0.0618,
"step": 953
},
{
"epoch": 1.3866279069767442,
"grad_norm": 1.3123274183423979,
"learning_rate": 2.1466821393784148e-06,
"loss": 0.0526,
"step": 954
},
{
"epoch": 1.3880813953488373,
"grad_norm": 1.1675619599323548,
"learning_rate": 2.137315218426442e-06,
"loss": 0.0465,
"step": 955
},
{
"epoch": 1.3895348837209303,
"grad_norm": 1.6048177965652641,
"learning_rate": 2.127963219789974e-06,
"loss": 0.1021,
"step": 956
},
{
"epoch": 1.3909883720930232,
"grad_norm": 0.8988094880823243,
"learning_rate": 2.1186261922181746e-06,
"loss": 0.0387,
"step": 957
},
{
"epoch": 1.3924418604651163,
"grad_norm": 1.287189982666728,
"learning_rate": 2.109304184382157e-06,
"loss": 0.065,
"step": 958
},
{
"epoch": 1.3938953488372092,
"grad_norm": 1.1802701284704842,
"learning_rate": 2.0999972448747525e-06,
"loss": 0.0453,
"step": 959
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.3499018387202095,
"learning_rate": 2.0907054222102367e-06,
"loss": 0.0439,
"step": 960
},
{
"epoch": 1.3968023255813953,
"grad_norm": 1.620339483385674,
"learning_rate": 2.081428764824089e-06,
"loss": 0.075,
"step": 961
},
{
"epoch": 1.3982558139534884,
"grad_norm": 1.2075011912654863,
"learning_rate": 2.072167321072736e-06,
"loss": 0.0653,
"step": 962
},
{
"epoch": 1.3997093023255813,
"grad_norm": 1.4261627536661536,
"learning_rate": 2.0629211392333033e-06,
"loss": 0.0605,
"step": 963
},
{
"epoch": 1.4011627906976745,
"grad_norm": 1.5799723516279018,
"learning_rate": 2.0536902675033547e-06,
"loss": 0.0667,
"step": 964
},
{
"epoch": 1.4026162790697674,
"grad_norm": 1.4908299612274243,
"learning_rate": 2.044474754000655e-06,
"loss": 0.058,
"step": 965
},
{
"epoch": 1.4040697674418605,
"grad_norm": 1.4424465604879053,
"learning_rate": 2.0352746467629018e-06,
"loss": 0.0749,
"step": 966
},
{
"epoch": 1.4055232558139534,
"grad_norm": 1.3986123725352064,
"learning_rate": 2.0260899937474943e-06,
"loss": 0.0587,
"step": 967
},
{
"epoch": 1.4069767441860466,
"grad_norm": 1.5870575696903628,
"learning_rate": 2.0169208428312647e-06,
"loss": 0.0825,
"step": 968
},
{
"epoch": 1.4084302325581395,
"grad_norm": 1.4921609261163944,
"learning_rate": 2.0077672418102443e-06,
"loss": 0.0796,
"step": 969
},
{
"epoch": 1.4098837209302326,
"grad_norm": 1.5995313772665938,
"learning_rate": 1.998629238399402e-06,
"loss": 0.0659,
"step": 970
},
{
"epoch": 1.4113372093023255,
"grad_norm": 1.2823207017129816,
"learning_rate": 1.9895068802324065e-06,
"loss": 0.0484,
"step": 971
},
{
"epoch": 1.4127906976744187,
"grad_norm": 1.3944837875822929,
"learning_rate": 1.980400214861367e-06,
"loss": 0.0507,
"step": 972
},
{
"epoch": 1.4142441860465116,
"grad_norm": 1.3570706618634196,
"learning_rate": 1.971309289756595e-06,
"loss": 0.0553,
"step": 973
},
{
"epoch": 1.4156976744186047,
"grad_norm": 1.3823236957095917,
"learning_rate": 1.9622341523063484e-06,
"loss": 0.0604,
"step": 974
},
{
"epoch": 1.4171511627906976,
"grad_norm": 1.5503182481651412,
"learning_rate": 1.953174849816595e-06,
"loss": 0.1027,
"step": 975
},
{
"epoch": 1.4186046511627908,
"grad_norm": 1.25459622511254,
"learning_rate": 1.944131429510754e-06,
"loss": 0.0646,
"step": 976
},
{
"epoch": 1.4200581395348837,
"grad_norm": 1.6351810496565538,
"learning_rate": 1.93510393852946e-06,
"loss": 0.0659,
"step": 977
},
{
"epoch": 1.4215116279069768,
"grad_norm": 1.2975727336300502,
"learning_rate": 1.9260924239303075e-06,
"loss": 0.0672,
"step": 978
},
{
"epoch": 1.4229651162790697,
"grad_norm": 1.3534696019811543,
"learning_rate": 1.9170969326876177e-06,
"loss": 0.0516,
"step": 979
},
{
"epoch": 1.4244186046511627,
"grad_norm": 1.5117325218714435,
"learning_rate": 1.90811751169218e-06,
"loss": 0.0512,
"step": 980
},
{
"epoch": 1.4258720930232558,
"grad_norm": 1.2260873105062584,
"learning_rate": 1.8991542077510205e-06,
"loss": 0.0535,
"step": 981
},
{
"epoch": 1.427325581395349,
"grad_norm": 1.4430073129022352,
"learning_rate": 1.8902070675871465e-06,
"loss": 0.0623,
"step": 982
},
{
"epoch": 1.4287790697674418,
"grad_norm": 1.4117041417091303,
"learning_rate": 1.881276137839314e-06,
"loss": 0.0367,
"step": 983
},
{
"epoch": 1.4302325581395348,
"grad_norm": 1.3665830676976485,
"learning_rate": 1.8723614650617721e-06,
"loss": 0.0658,
"step": 984
},
{
"epoch": 1.431686046511628,
"grad_norm": 1.4391996547793735,
"learning_rate": 1.8634630957240352e-06,
"loss": 0.0762,
"step": 985
},
{
"epoch": 1.433139534883721,
"grad_norm": 1.5377368090806955,
"learning_rate": 1.8545810762106263e-06,
"loss": 0.0709,
"step": 986
},
{
"epoch": 1.434593023255814,
"grad_norm": 1.3753400620423242,
"learning_rate": 1.845715452820845e-06,
"loss": 0.0501,
"step": 987
},
{
"epoch": 1.4360465116279069,
"grad_norm": 1.593242899745401,
"learning_rate": 1.8368662717685188e-06,
"loss": 0.0799,
"step": 988
},
{
"epoch": 1.4375,
"grad_norm": 1.6328684919856264,
"learning_rate": 1.8280335791817733e-06,
"loss": 0.053,
"step": 989
},
{
"epoch": 1.4389534883720931,
"grad_norm": 1.432266744388245,
"learning_rate": 1.819217421102779e-06,
"loss": 0.0617,
"step": 990
},
{
"epoch": 1.440406976744186,
"grad_norm": 1.2883150490212059,
"learning_rate": 1.8104178434875175e-06,
"loss": 0.0646,
"step": 991
},
{
"epoch": 1.441860465116279,
"grad_norm": 1.364249538543468,
"learning_rate": 1.8016348922055448e-06,
"loss": 0.0465,
"step": 992
},
{
"epoch": 1.443313953488372,
"grad_norm": 1.3168346576696832,
"learning_rate": 1.7928686130397443e-06,
"loss": 0.0792,
"step": 993
},
{
"epoch": 1.4447674418604652,
"grad_norm": 1.8107406359055818,
"learning_rate": 1.7841190516860973e-06,
"loss": 0.0732,
"step": 994
},
{
"epoch": 1.4462209302325582,
"grad_norm": 1.2063299992756713,
"learning_rate": 1.7753862537534356e-06,
"loss": 0.0762,
"step": 995
},
{
"epoch": 1.447674418604651,
"grad_norm": 1.8674037040685265,
"learning_rate": 1.7666702647632128e-06,
"loss": 0.0695,
"step": 996
},
{
"epoch": 1.4491279069767442,
"grad_norm": 1.1951714079592195,
"learning_rate": 1.7579711301492574e-06,
"loss": 0.0597,
"step": 997
},
{
"epoch": 1.4505813953488373,
"grad_norm": 2.0661364210414725,
"learning_rate": 1.7492888952575475e-06,
"loss": 0.0786,
"step": 998
},
{
"epoch": 1.4520348837209303,
"grad_norm": 1.3804303251215748,
"learning_rate": 1.740623605345963e-06,
"loss": 0.0632,
"step": 999
},
{
"epoch": 1.4534883720930232,
"grad_norm": 1.3877478679606865,
"learning_rate": 1.7319753055840555e-06,
"loss": 0.0467,
"step": 1000
},
{
"epoch": 1.4534883720930232,
"eval_loss": 0.1322409063577652,
"eval_runtime": 2.2065,
"eval_samples_per_second": 25.379,
"eval_steps_per_second": 6.345,
"step": 1000
},
{
"epoch": 1.4549418604651163,
"grad_norm": 1.2437202205338127,
"learning_rate": 1.7233440410528117e-06,
"loss": 0.0504,
"step": 1001
},
{
"epoch": 1.4563953488372092,
"grad_norm": 1.6237467009740467,
"learning_rate": 1.7147298567444231e-06,
"loss": 0.0938,
"step": 1002
},
{
"epoch": 1.4578488372093024,
"grad_norm": 1.5164187899961117,
"learning_rate": 1.7061327975620402e-06,
"loss": 0.0772,
"step": 1003
},
{
"epoch": 1.4593023255813953,
"grad_norm": 1.5187557348768597,
"learning_rate": 1.697552908319553e-06,
"loss": 0.0624,
"step": 1004
},
{
"epoch": 1.4607558139534884,
"grad_norm": 1.178181594838555,
"learning_rate": 1.6889902337413415e-06,
"loss": 0.0655,
"step": 1005
},
{
"epoch": 1.4622093023255813,
"grad_norm": 1.4575686492434048,
"learning_rate": 1.6804448184620598e-06,
"loss": 0.0631,
"step": 1006
},
{
"epoch": 1.4636627906976745,
"grad_norm": 2.0108235684971327,
"learning_rate": 1.6719167070263848e-06,
"loss": 0.093,
"step": 1007
},
{
"epoch": 1.4651162790697674,
"grad_norm": 1.3503757474217617,
"learning_rate": 1.6634059438888034e-06,
"loss": 0.0498,
"step": 1008
},
{
"epoch": 1.4665697674418605,
"grad_norm": 1.4598338826538206,
"learning_rate": 1.6549125734133625e-06,
"loss": 0.0543,
"step": 1009
},
{
"epoch": 1.4680232558139534,
"grad_norm": 1.4195207368199612,
"learning_rate": 1.6464366398734532e-06,
"loss": 0.0598,
"step": 1010
},
{
"epoch": 1.4694767441860466,
"grad_norm": 1.357120211132288,
"learning_rate": 1.6379781874515666e-06,
"loss": 0.0511,
"step": 1011
},
{
"epoch": 1.4709302325581395,
"grad_norm": 1.8474857055851184,
"learning_rate": 1.6295372602390768e-06,
"loss": 0.0676,
"step": 1012
},
{
"epoch": 1.4723837209302326,
"grad_norm": 1.4685941935514055,
"learning_rate": 1.6211139022359995e-06,
"loss": 0.0616,
"step": 1013
},
{
"epoch": 1.4738372093023255,
"grad_norm": 1.379812903529308,
"learning_rate": 1.6127081573507685e-06,
"loss": 0.0589,
"step": 1014
},
{
"epoch": 1.4752906976744187,
"grad_norm": 1.2647342925759666,
"learning_rate": 1.6043200694000038e-06,
"loss": 0.0754,
"step": 1015
},
{
"epoch": 1.4767441860465116,
"grad_norm": 1.13404075461665,
"learning_rate": 1.5959496821082905e-06,
"loss": 0.0544,
"step": 1016
},
{
"epoch": 1.4781976744186047,
"grad_norm": 1.4701585201110707,
"learning_rate": 1.5875970391079393e-06,
"loss": 0.0625,
"step": 1017
},
{
"epoch": 1.4796511627906976,
"grad_norm": 1.2418075850932098,
"learning_rate": 1.5792621839387717e-06,
"loss": 0.0489,
"step": 1018
},
{
"epoch": 1.4811046511627908,
"grad_norm": 1.191695849684784,
"learning_rate": 1.5709451600478787e-06,
"loss": 0.0439,
"step": 1019
},
{
"epoch": 1.4825581395348837,
"grad_norm": 1.2753380078997356,
"learning_rate": 1.562646010789411e-06,
"loss": 0.0616,
"step": 1020
},
{
"epoch": 1.4840116279069768,
"grad_norm": 1.5221939760552072,
"learning_rate": 1.5543647794243355e-06,
"loss": 0.0827,
"step": 1021
},
{
"epoch": 1.4854651162790697,
"grad_norm": 1.7991818130452302,
"learning_rate": 1.5461015091202263e-06,
"loss": 0.0717,
"step": 1022
},
{
"epoch": 1.4869186046511627,
"grad_norm": 1.3744111180193537,
"learning_rate": 1.5378562429510257e-06,
"loss": 0.0639,
"step": 1023
},
{
"epoch": 1.4883720930232558,
"grad_norm": 1.6295469366842923,
"learning_rate": 1.5296290238968303e-06,
"loss": 0.0615,
"step": 1024
},
{
"epoch": 1.489825581395349,
"grad_norm": 1.4986295841575483,
"learning_rate": 1.5214198948436604e-06,
"loss": 0.0512,
"step": 1025
},
{
"epoch": 1.4912790697674418,
"grad_norm": 1.5258968983298233,
"learning_rate": 1.5132288985832383e-06,
"loss": 0.0567,
"step": 1026
},
{
"epoch": 1.4927325581395348,
"grad_norm": 1.3320359313785561,
"learning_rate": 1.5050560778127648e-06,
"loss": 0.0475,
"step": 1027
},
{
"epoch": 1.494186046511628,
"grad_norm": 1.4535765609363385,
"learning_rate": 1.496901475134701e-06,
"loss": 0.0491,
"step": 1028
},
{
"epoch": 1.495639534883721,
"grad_norm": 1.7901069273417733,
"learning_rate": 1.4887651330565378e-06,
"loss": 0.066,
"step": 1029
},
{
"epoch": 1.497093023255814,
"grad_norm": 1.7585562014185228,
"learning_rate": 1.4806470939905842e-06,
"loss": 0.054,
"step": 1030
},
{
"epoch": 1.4985465116279069,
"grad_norm": 1.6626210530787378,
"learning_rate": 1.472547400253735e-06,
"loss": 0.0757,
"step": 1031
},
{
"epoch": 1.5,
"grad_norm": 1.4156229167835312,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.063,
"step": 1032
},
{
"epoch": 1.5014534883720931,
"grad_norm": 1.4973808057494127,
"learning_rate": 1.4564032175565873e-06,
"loss": 0.0539,
"step": 1033
},
{
"epoch": 1.502906976744186,
"grad_norm": 1.462014108284378,
"learning_rate": 1.4483588127510585e-06,
"loss": 0.047,
"step": 1034
},
{
"epoch": 1.504360465116279,
"grad_norm": 1.511766423540138,
"learning_rate": 1.440332921583744e-06,
"loss": 0.0576,
"step": 1035
},
{
"epoch": 1.505813953488372,
"grad_norm": 1.4551371173160883,
"learning_rate": 1.432325585891201e-06,
"loss": 0.0611,
"step": 1036
},
{
"epoch": 1.5072674418604652,
"grad_norm": 1.6008143073565722,
"learning_rate": 1.4243368474132663e-06,
"loss": 0.0612,
"step": 1037
},
{
"epoch": 1.5087209302325582,
"grad_norm": 1.5177354364722053,
"learning_rate": 1.41636674779283e-06,
"loss": 0.0667,
"step": 1038
},
{
"epoch": 1.510174418604651,
"grad_norm": 1.4839839114250544,
"learning_rate": 1.408415328575629e-06,
"loss": 0.0749,
"step": 1039
},
{
"epoch": 1.5116279069767442,
"grad_norm": 1.4529745164922419,
"learning_rate": 1.4004826312100218e-06,
"loss": 0.0411,
"step": 1040
},
{
"epoch": 1.5130813953488373,
"grad_norm": 1.1970103229686213,
"learning_rate": 1.3925686970467745e-06,
"loss": 0.0527,
"step": 1041
},
{
"epoch": 1.5145348837209303,
"grad_norm": 1.3891332095713593,
"learning_rate": 1.3846735673388473e-06,
"loss": 0.059,
"step": 1042
},
{
"epoch": 1.5159883720930232,
"grad_norm": 1.191612023245594,
"learning_rate": 1.3767972832411813e-06,
"loss": 0.0644,
"step": 1043
},
{
"epoch": 1.5174418604651163,
"grad_norm": 1.2633634289004876,
"learning_rate": 1.3689398858104753e-06,
"loss": 0.0518,
"step": 1044
},
{
"epoch": 1.5188953488372094,
"grad_norm": 1.259697007622928,
"learning_rate": 1.3611014160049846e-06,
"loss": 0.0399,
"step": 1045
},
{
"epoch": 1.5203488372093024,
"grad_norm": 1.2500835906399796,
"learning_rate": 1.3532819146842934e-06,
"loss": 0.0555,
"step": 1046
},
{
"epoch": 1.5218023255813953,
"grad_norm": 1.2215856052507712,
"learning_rate": 1.3454814226091156e-06,
"loss": 0.0541,
"step": 1047
},
{
"epoch": 1.5232558139534884,
"grad_norm": 1.0651853757254646,
"learning_rate": 1.337699980441069e-06,
"loss": 0.0597,
"step": 1048
},
{
"epoch": 1.5247093023255816,
"grad_norm": 1.4449454027688438,
"learning_rate": 1.3299376287424763e-06,
"loss": 0.0528,
"step": 1049
},
{
"epoch": 1.5261627906976745,
"grad_norm": 1.3052426229372127,
"learning_rate": 1.3221944079761413e-06,
"loss": 0.0575,
"step": 1050
},
{
"epoch": 1.5276162790697674,
"grad_norm": 1.3323801979825336,
"learning_rate": 1.3144703585051498e-06,
"loss": 0.0471,
"step": 1051
},
{
"epoch": 1.5290697674418605,
"grad_norm": 1.3097362291071397,
"learning_rate": 1.3067655205926488e-06,
"loss": 0.0686,
"step": 1052
},
{
"epoch": 1.5305232558139537,
"grad_norm": 1.556833363521575,
"learning_rate": 1.2990799344016436e-06,
"loss": 0.0567,
"step": 1053
},
{
"epoch": 1.5319767441860463,
"grad_norm": 1.2414839248874836,
"learning_rate": 1.2914136399947841e-06,
"loss": 0.0461,
"step": 1054
},
{
"epoch": 1.5334302325581395,
"grad_norm": 1.4598506706462977,
"learning_rate": 1.283766677334161e-06,
"loss": 0.0661,
"step": 1055
},
{
"epoch": 1.5348837209302326,
"grad_norm": 1.5053648322674895,
"learning_rate": 1.2761390862810907e-06,
"loss": 0.0684,
"step": 1056
},
{
"epoch": 1.5363372093023255,
"grad_norm": 1.533086553131052,
"learning_rate": 1.2685309065959168e-06,
"loss": 0.0623,
"step": 1057
},
{
"epoch": 1.5377906976744184,
"grad_norm": 1.376493396330026,
"learning_rate": 1.260942177937789e-06,
"loss": 0.0523,
"step": 1058
},
{
"epoch": 1.5392441860465116,
"grad_norm": 1.4332881734009741,
"learning_rate": 1.2533729398644735e-06,
"loss": 0.042,
"step": 1059
},
{
"epoch": 1.5406976744186047,
"grad_norm": 0.9999597570352821,
"learning_rate": 1.2458232318321306e-06,
"loss": 0.0336,
"step": 1060
},
{
"epoch": 1.5421511627906976,
"grad_norm": 1.211215207564261,
"learning_rate": 1.238293093195122e-06,
"loss": 0.052,
"step": 1061
},
{
"epoch": 1.5436046511627906,
"grad_norm": 1.394715555164229,
"learning_rate": 1.2307825632057952e-06,
"loss": 0.0753,
"step": 1062
},
{
"epoch": 1.5450581395348837,
"grad_norm": 1.3381942952433081,
"learning_rate": 1.2232916810142886e-06,
"loss": 0.058,
"step": 1063
},
{
"epoch": 1.5465116279069768,
"grad_norm": 1.7578672319163982,
"learning_rate": 1.2158204856683176e-06,
"loss": 0.0657,
"step": 1064
},
{
"epoch": 1.5479651162790697,
"grad_norm": 1.5353885598622956,
"learning_rate": 1.2083690161129808e-06,
"loss": 0.059,
"step": 1065
},
{
"epoch": 1.5494186046511627,
"grad_norm": 1.405316093009901,
"learning_rate": 1.2009373111905487e-06,
"loss": 0.0579,
"step": 1066
},
{
"epoch": 1.5508720930232558,
"grad_norm": 1.607692815378601,
"learning_rate": 1.1935254096402655e-06,
"loss": 0.0653,
"step": 1067
},
{
"epoch": 1.552325581395349,
"grad_norm": 1.497479074300615,
"learning_rate": 1.1861333500981449e-06,
"loss": 0.0523,
"step": 1068
},
{
"epoch": 1.5537790697674418,
"grad_norm": 1.1203473988887975,
"learning_rate": 1.1787611710967751e-06,
"loss": 0.0452,
"step": 1069
},
{
"epoch": 1.5552325581395348,
"grad_norm": 1.4222136382752362,
"learning_rate": 1.1714089110651071e-06,
"loss": 0.0635,
"step": 1070
},
{
"epoch": 1.556686046511628,
"grad_norm": 0.9658885164743157,
"learning_rate": 1.1640766083282662e-06,
"loss": 0.0771,
"step": 1071
},
{
"epoch": 1.558139534883721,
"grad_norm": 1.3063613380960122,
"learning_rate": 1.1567643011073393e-06,
"loss": 0.0573,
"step": 1072
},
{
"epoch": 1.559593023255814,
"grad_norm": 1.6416890877169144,
"learning_rate": 1.1494720275191901e-06,
"loss": 0.0616,
"step": 1073
},
{
"epoch": 1.5610465116279069,
"grad_norm": 1.3817386853715168,
"learning_rate": 1.1421998255762468e-06,
"loss": 0.0503,
"step": 1074
},
{
"epoch": 1.5625,
"grad_norm": 1.3346841117708024,
"learning_rate": 1.134947733186315e-06,
"loss": 0.047,
"step": 1075
},
{
"epoch": 1.5639534883720931,
"grad_norm": 1.3343322414067098,
"learning_rate": 1.127715788152372e-06,
"loss": 0.0636,
"step": 1076
},
{
"epoch": 1.565406976744186,
"grad_norm": 1.7154646419432018,
"learning_rate": 1.1205040281723728e-06,
"loss": 0.0645,
"step": 1077
},
{
"epoch": 1.566860465116279,
"grad_norm": 1.4443691543089068,
"learning_rate": 1.1133124908390575e-06,
"loss": 0.0569,
"step": 1078
},
{
"epoch": 1.568313953488372,
"grad_norm": 1.9549846970535834,
"learning_rate": 1.106141213639747e-06,
"loss": 0.0843,
"step": 1079
},
{
"epoch": 1.5697674418604652,
"grad_norm": 1.4036582458578477,
"learning_rate": 1.0989902339561554e-06,
"loss": 0.0541,
"step": 1080
},
{
"epoch": 1.5712209302325582,
"grad_norm": 1.7620858308016654,
"learning_rate": 1.0918595890641891e-06,
"loss": 0.0571,
"step": 1081
},
{
"epoch": 1.572674418604651,
"grad_norm": 2.0116431611899657,
"learning_rate": 1.0847493161337602e-06,
"loss": 0.0509,
"step": 1082
},
{
"epoch": 1.5741279069767442,
"grad_norm": 1.3745830759736906,
"learning_rate": 1.077659452228581e-06,
"loss": 0.0506,
"step": 1083
},
{
"epoch": 1.5755813953488373,
"grad_norm": 1.491007758939216,
"learning_rate": 1.0705900343059856e-06,
"loss": 0.0615,
"step": 1084
},
{
"epoch": 1.5770348837209303,
"grad_norm": 1.369265137859038,
"learning_rate": 1.0635410992167212e-06,
"loss": 0.0674,
"step": 1085
},
{
"epoch": 1.5784883720930232,
"grad_norm": 1.0623500340337344,
"learning_rate": 1.0565126837047718e-06,
"loss": 0.0467,
"step": 1086
},
{
"epoch": 1.5799418604651163,
"grad_norm": 1.3868730182724978,
"learning_rate": 1.049504824407152e-06,
"loss": 0.0583,
"step": 1087
},
{
"epoch": 1.5813953488372094,
"grad_norm": 1.5500696399009597,
"learning_rate": 1.04251755785373e-06,
"loss": 0.054,
"step": 1088
},
{
"epoch": 1.5828488372093024,
"grad_norm": 1.416069042767884,
"learning_rate": 1.0355509204670234e-06,
"loss": 0.0462,
"step": 1089
},
{
"epoch": 1.5843023255813953,
"grad_norm": 1.2069842940803852,
"learning_rate": 1.0286049485620213e-06,
"loss": 0.0483,
"step": 1090
},
{
"epoch": 1.5857558139534884,
"grad_norm": 1.6409720581547536,
"learning_rate": 1.0216796783459866e-06,
"loss": 0.0489,
"step": 1091
},
{
"epoch": 1.5872093023255816,
"grad_norm": 1.7346156581015553,
"learning_rate": 1.0147751459182737e-06,
"loss": 0.0452,
"step": 1092
},
{
"epoch": 1.5886627906976745,
"grad_norm": 1.304713086631584,
"learning_rate": 1.007891387270134e-06,
"loss": 0.0652,
"step": 1093
},
{
"epoch": 1.5901162790697674,
"grad_norm": 1.169058686538856,
"learning_rate": 1.001028438284533e-06,
"loss": 0.0395,
"step": 1094
},
{
"epoch": 1.5915697674418605,
"grad_norm": 1.581843571962872,
"learning_rate": 9.941863347359597e-07,
"loss": 0.0593,
"step": 1095
},
{
"epoch": 1.5930232558139537,
"grad_norm": 1.492544567921207,
"learning_rate": 9.873651122902472e-07,
"loss": 0.0834,
"step": 1096
},
{
"epoch": 1.5944767441860463,
"grad_norm": 1.459822032594516,
"learning_rate": 9.805648065043745e-07,
"loss": 0.0511,
"step": 1097
},
{
"epoch": 1.5959302325581395,
"grad_norm": 1.3911943572138405,
"learning_rate": 9.737854528262953e-07,
"loss": 0.0531,
"step": 1098
},
{
"epoch": 1.5973837209302326,
"grad_norm": 1.0428194925897607,
"learning_rate": 9.670270865947406e-07,
"loss": 0.0646,
"step": 1099
},
{
"epoch": 1.5988372093023255,
"grad_norm": 1.3576620238676198,
"learning_rate": 9.602897430390456e-07,
"loss": 0.0471,
"step": 1100
},
{
"epoch": 1.6002906976744184,
"grad_norm": 1.5766868187890286,
"learning_rate": 9.53573457278954e-07,
"loss": 0.0637,
"step": 1101
},
{
"epoch": 1.6017441860465116,
"grad_norm": 1.4800012500421134,
"learning_rate": 9.468782643244484e-07,
"loss": 0.0695,
"step": 1102
},
{
"epoch": 1.6031976744186047,
"grad_norm": 1.7029559921767845,
"learning_rate": 9.40204199075555e-07,
"loss": 0.0608,
"step": 1103
},
{
"epoch": 1.6046511627906976,
"grad_norm": 1.1192989686723849,
"learning_rate": 9.335512963221732e-07,
"loss": 0.0501,
"step": 1104
},
{
"epoch": 1.6061046511627906,
"grad_norm": 1.392049569057171,
"learning_rate": 9.269195907438843e-07,
"loss": 0.0956,
"step": 1105
},
{
"epoch": 1.6075581395348837,
"grad_norm": 1.4377153631105137,
"learning_rate": 9.203091169097761e-07,
"loss": 0.0639,
"step": 1106
},
{
"epoch": 1.6090116279069768,
"grad_norm": 1.4194367382834814,
"learning_rate": 9.137199092782617e-07,
"loss": 0.0504,
"step": 1107
},
{
"epoch": 1.6104651162790697,
"grad_norm": 1.0856317537640523,
"learning_rate": 9.071520021969027e-07,
"loss": 0.0385,
"step": 1108
},
{
"epoch": 1.6119186046511627,
"grad_norm": 1.3095443403571647,
"learning_rate": 9.006054299022227e-07,
"loss": 0.058,
"step": 1109
},
{
"epoch": 1.6133720930232558,
"grad_norm": 1.377541034515128,
"learning_rate": 8.940802265195375e-07,
"loss": 0.0688,
"step": 1110
},
{
"epoch": 1.614825581395349,
"grad_norm": 1.1095826182633532,
"learning_rate": 8.875764260627695e-07,
"loss": 0.0473,
"step": 1111
},
{
"epoch": 1.6162790697674418,
"grad_norm": 1.5224215809822206,
"learning_rate": 8.810940624342784e-07,
"loss": 0.0825,
"step": 1112
},
{
"epoch": 1.6177325581395348,
"grad_norm": 1.8482839553995425,
"learning_rate": 8.746331694246756e-07,
"loss": 0.0744,
"step": 1113
},
{
"epoch": 1.619186046511628,
"grad_norm": 1.6017104682800916,
"learning_rate": 8.681937807126567e-07,
"loss": 0.069,
"step": 1114
},
{
"epoch": 1.620639534883721,
"grad_norm": 1.2376828031612417,
"learning_rate": 8.617759298648182e-07,
"loss": 0.0495,
"step": 1115
},
{
"epoch": 1.622093023255814,
"grad_norm": 1.284668072013395,
"learning_rate": 8.553796503354899e-07,
"loss": 0.0771,
"step": 1116
},
{
"epoch": 1.6235465116279069,
"grad_norm": 1.3931203949754822,
"learning_rate": 8.490049754665541e-07,
"loss": 0.0574,
"step": 1117
},
{
"epoch": 1.625,
"grad_norm": 1.4765649037778599,
"learning_rate": 8.426519384872733e-07,
"loss": 0.0682,
"step": 1118
},
{
"epoch": 1.6264534883720931,
"grad_norm": 1.181845082363929,
"learning_rate": 8.363205725141238e-07,
"loss": 0.0494,
"step": 1119
},
{
"epoch": 1.627906976744186,
"grad_norm": 1.3928319550967374,
"learning_rate": 8.30010910550611e-07,
"loss": 0.059,
"step": 1120
},
{
"epoch": 1.629360465116279,
"grad_norm": 1.1141954284915958,
"learning_rate": 8.237229854871076e-07,
"loss": 0.0471,
"step": 1121
},
{
"epoch": 1.630813953488372,
"grad_norm": 1.4876650888680079,
"learning_rate": 8.174568301006763e-07,
"loss": 0.0805,
"step": 1122
},
{
"epoch": 1.6322674418604652,
"grad_norm": 1.4355426352193548,
"learning_rate": 8.11212477054904e-07,
"loss": 0.0412,
"step": 1123
},
{
"epoch": 1.6337209302325582,
"grad_norm": 1.3219907781653484,
"learning_rate": 8.049899588997246e-07,
"loss": 0.0644,
"step": 1124
},
{
"epoch": 1.635174418604651,
"grad_norm": 1.2661461720313956,
"learning_rate": 7.987893080712572e-07,
"loss": 0.0647,
"step": 1125
},
{
"epoch": 1.6366279069767442,
"grad_norm": 1.4112271603442867,
"learning_rate": 7.926105568916292e-07,
"loss": 0.0559,
"step": 1126
},
{
"epoch": 1.6380813953488373,
"grad_norm": 1.4885958147808351,
"learning_rate": 7.864537375688164e-07,
"loss": 0.0665,
"step": 1127
},
{
"epoch": 1.6395348837209303,
"grad_norm": 1.2255901712760742,
"learning_rate": 7.803188821964652e-07,
"loss": 0.0447,
"step": 1128
},
{
"epoch": 1.6409883720930232,
"grad_norm": 1.5452975728529494,
"learning_rate": 7.742060227537351e-07,
"loss": 0.067,
"step": 1129
},
{
"epoch": 1.6424418604651163,
"grad_norm": 1.3595014210127743,
"learning_rate": 7.681151911051232e-07,
"loss": 0.0613,
"step": 1130
},
{
"epoch": 1.6438953488372094,
"grad_norm": 1.4222742042571062,
"learning_rate": 7.620464190003074e-07,
"loss": 0.0725,
"step": 1131
},
{
"epoch": 1.6453488372093024,
"grad_norm": 1.2218037474493977,
"learning_rate": 7.559997380739714e-07,
"loss": 0.0518,
"step": 1132
},
{
"epoch": 1.6468023255813953,
"grad_norm": 1.7245104586600535,
"learning_rate": 7.499751798456456e-07,
"loss": 0.072,
"step": 1133
},
{
"epoch": 1.6482558139534884,
"grad_norm": 1.3548373797393403,
"learning_rate": 7.439727757195408e-07,
"loss": 0.0655,
"step": 1134
},
{
"epoch": 1.6497093023255816,
"grad_norm": 1.4843313971694068,
"learning_rate": 7.379925569843877e-07,
"loss": 0.0701,
"step": 1135
},
{
"epoch": 1.6511627906976745,
"grad_norm": 1.2524145522085295,
"learning_rate": 7.320345548132679e-07,
"loss": 0.0429,
"step": 1136
},
{
"epoch": 1.6526162790697674,
"grad_norm": 1.3051416609601543,
"learning_rate": 7.260988002634584e-07,
"loss": 0.0709,
"step": 1137
},
{
"epoch": 1.6540697674418605,
"grad_norm": 1.1280777447038597,
"learning_rate": 7.201853242762613e-07,
"loss": 0.0653,
"step": 1138
},
{
"epoch": 1.6555232558139537,
"grad_norm": 1.524319939704524,
"learning_rate": 7.142941576768526e-07,
"loss": 0.0671,
"step": 1139
},
{
"epoch": 1.6569767441860463,
"grad_norm": 1.1315251169649219,
"learning_rate": 7.084253311741101e-07,
"loss": 0.0415,
"step": 1140
},
{
"epoch": 1.6584302325581395,
"grad_norm": 1.5134458313250108,
"learning_rate": 7.025788753604668e-07,
"loss": 0.0507,
"step": 1141
},
{
"epoch": 1.6598837209302326,
"grad_norm": 1.3932510174316692,
"learning_rate": 6.967548207117364e-07,
"loss": 0.0653,
"step": 1142
},
{
"epoch": 1.6613372093023255,
"grad_norm": 1.369278789308021,
"learning_rate": 6.909531975869682e-07,
"loss": 0.0602,
"step": 1143
},
{
"epoch": 1.6627906976744184,
"grad_norm": 1.335104392777719,
"learning_rate": 6.851740362282788e-07,
"loss": 0.0505,
"step": 1144
},
{
"epoch": 1.6642441860465116,
"grad_norm": 1.7252990354329707,
"learning_rate": 6.794173667606995e-07,
"loss": 0.0679,
"step": 1145
},
{
"epoch": 1.6656976744186047,
"grad_norm": 1.3931797125192535,
"learning_rate": 6.736832191920184e-07,
"loss": 0.0689,
"step": 1146
},
{
"epoch": 1.6671511627906976,
"grad_norm": 1.270578497522883,
"learning_rate": 6.679716234126243e-07,
"loss": 0.0663,
"step": 1147
},
{
"epoch": 1.6686046511627906,
"grad_norm": 1.1833811909595593,
"learning_rate": 6.622826091953483e-07,
"loss": 0.043,
"step": 1148
},
{
"epoch": 1.6700581395348837,
"grad_norm": 1.2856539604876163,
"learning_rate": 6.566162061953141e-07,
"loss": 0.0689,
"step": 1149
},
{
"epoch": 1.6715116279069768,
"grad_norm": 1.2026108031855738,
"learning_rate": 6.50972443949775e-07,
"loss": 0.0436,
"step": 1150
},
{
"epoch": 1.6729651162790697,
"grad_norm": 1.4050042046526028,
"learning_rate": 6.453513518779708e-07,
"loss": 0.0683,
"step": 1151
},
{
"epoch": 1.6744186046511627,
"grad_norm": 1.172885792221737,
"learning_rate": 6.397529592809615e-07,
"loss": 0.0457,
"step": 1152
},
{
"epoch": 1.6758720930232558,
"grad_norm": 1.6645132359630175,
"learning_rate": 6.341772953414893e-07,
"loss": 0.0656,
"step": 1153
},
{
"epoch": 1.677325581395349,
"grad_norm": 1.203947351057658,
"learning_rate": 6.286243891238114e-07,
"loss": 0.0508,
"step": 1154
},
{
"epoch": 1.6787790697674418,
"grad_norm": 1.5653427701253289,
"learning_rate": 6.23094269573562e-07,
"loss": 0.0581,
"step": 1155
},
{
"epoch": 1.6802325581395348,
"grad_norm": 1.3692678493335273,
"learning_rate": 6.175869655175898e-07,
"loss": 0.0574,
"step": 1156
},
{
"epoch": 1.681686046511628,
"grad_norm": 1.223293248941118,
"learning_rate": 6.121025056638186e-07,
"loss": 0.0645,
"step": 1157
},
{
"epoch": 1.683139534883721,
"grad_norm": 1.1984686741863968,
"learning_rate": 6.06640918601088e-07,
"loss": 0.0492,
"step": 1158
},
{
"epoch": 1.684593023255814,
"grad_norm": 1.8607181067232295,
"learning_rate": 6.012022327990097e-07,
"loss": 0.0717,
"step": 1159
},
{
"epoch": 1.6860465116279069,
"grad_norm": 1.3129459081678303,
"learning_rate": 5.957864766078186e-07,
"loss": 0.0527,
"step": 1160
},
{
"epoch": 1.6875,
"grad_norm": 1.2419623459260931,
"learning_rate": 5.903936782582253e-07,
"loss": 0.0381,
"step": 1161
},
{
"epoch": 1.6889534883720931,
"grad_norm": 1.5778350174178633,
"learning_rate": 5.850238658612667e-07,
"loss": 0.0657,
"step": 1162
},
{
"epoch": 1.690406976744186,
"grad_norm": 1.227703263910834,
"learning_rate": 5.796770674081592e-07,
"loss": 0.0578,
"step": 1163
},
{
"epoch": 1.691860465116279,
"grad_norm": 1.2163654872782266,
"learning_rate": 5.743533107701593e-07,
"loss": 0.0622,
"step": 1164
},
{
"epoch": 1.693313953488372,
"grad_norm": 1.7561357257514982,
"learning_rate": 5.690526236984079e-07,
"loss": 0.0706,
"step": 1165
},
{
"epoch": 1.6947674418604652,
"grad_norm": 1.236485378770411,
"learning_rate": 5.637750338237963e-07,
"loss": 0.0476,
"step": 1166
},
{
"epoch": 1.6962209302325582,
"grad_norm": 1.273190952518188,
"learning_rate": 5.585205686568123e-07,
"loss": 0.0572,
"step": 1167
},
{
"epoch": 1.697674418604651,
"grad_norm": 1.3840145983571588,
"learning_rate": 5.532892555874059e-07,
"loss": 0.0498,
"step": 1168
},
{
"epoch": 1.6991279069767442,
"grad_norm": 1.6682935105457355,
"learning_rate": 5.48081121884838e-07,
"loss": 0.0501,
"step": 1169
},
{
"epoch": 1.7005813953488373,
"grad_norm": 1.7215714569023008,
"learning_rate": 5.428961946975464e-07,
"loss": 0.0621,
"step": 1170
},
{
"epoch": 1.7020348837209303,
"grad_norm": 1.2802300699738327,
"learning_rate": 5.377345010529977e-07,
"loss": 0.0507,
"step": 1171
},
{
"epoch": 1.7034883720930232,
"grad_norm": 1.179288667096262,
"learning_rate": 5.325960678575498e-07,
"loss": 0.0622,
"step": 1172
},
{
"epoch": 1.7049418604651163,
"grad_norm": 1.397312032777205,
"learning_rate": 5.274809218963089e-07,
"loss": 0.048,
"step": 1173
},
{
"epoch": 1.7063953488372094,
"grad_norm": 1.0705846916376756,
"learning_rate": 5.22389089832997e-07,
"loss": 0.0462,
"step": 1174
},
{
"epoch": 1.7078488372093024,
"grad_norm": 1.5708429842240574,
"learning_rate": 5.173205982098018e-07,
"loss": 0.0777,
"step": 1175
},
{
"epoch": 1.7093023255813953,
"grad_norm": 1.6720126005052158,
"learning_rate": 5.122754734472496e-07,
"loss": 0.0751,
"step": 1176
},
{
"epoch": 1.7107558139534884,
"grad_norm": 1.4286721495638015,
"learning_rate": 5.072537418440565e-07,
"loss": 0.0509,
"step": 1177
},
{
"epoch": 1.7122093023255816,
"grad_norm": 1.594631174541215,
"learning_rate": 5.022554295770038e-07,
"loss": 0.0578,
"step": 1178
},
{
"epoch": 1.7136627906976745,
"grad_norm": 1.2897023400093752,
"learning_rate": 4.972805627007881e-07,
"loss": 0.059,
"step": 1179
},
{
"epoch": 1.7151162790697674,
"grad_norm": 1.0706258806400843,
"learning_rate": 4.92329167147898e-07,
"loss": 0.0403,
"step": 1180
},
{
"epoch": 1.7165697674418605,
"grad_norm": 1.247773190344945,
"learning_rate": 4.874012687284685e-07,
"loss": 0.0588,
"step": 1181
},
{
"epoch": 1.7180232558139537,
"grad_norm": 1.1610372428474574,
"learning_rate": 4.824968931301549e-07,
"loss": 0.0493,
"step": 1182
},
{
"epoch": 1.7194767441860463,
"grad_norm": 0.9925566701240426,
"learning_rate": 4.776160659179918e-07,
"loss": 0.0423,
"step": 1183
},
{
"epoch": 1.7209302325581395,
"grad_norm": 1.2198238537851194,
"learning_rate": 4.727588125342669e-07,
"loss": 0.0434,
"step": 1184
},
{
"epoch": 1.7223837209302326,
"grad_norm": 1.0814209449015801,
"learning_rate": 4.679251582983807e-07,
"loss": 0.0515,
"step": 1185
},
{
"epoch": 1.7238372093023255,
"grad_norm": 1.253442945461474,
"learning_rate": 4.631151284067209e-07,
"loss": 0.0401,
"step": 1186
},
{
"epoch": 1.7252906976744184,
"grad_norm": 1.6572342387258825,
"learning_rate": 4.583287479325266e-07,
"loss": 0.0851,
"step": 1187
},
{
"epoch": 1.7267441860465116,
"grad_norm": 1.5674512288501314,
"learning_rate": 4.5356604182576315e-07,
"loss": 0.0748,
"step": 1188
},
{
"epoch": 1.7281976744186047,
"grad_norm": 1.4246277140745789,
"learning_rate": 4.4882703491298364e-07,
"loss": 0.0507,
"step": 1189
},
{
"epoch": 1.7296511627906976,
"grad_norm": 1.4692955325669175,
"learning_rate": 4.4411175189720935e-07,
"loss": 0.0599,
"step": 1190
},
{
"epoch": 1.7311046511627906,
"grad_norm": 1.3880435931686506,
"learning_rate": 4.3942021735779163e-07,
"loss": 0.0801,
"step": 1191
},
{
"epoch": 1.7325581395348837,
"grad_norm": 1.2441163102111796,
"learning_rate": 4.347524557502919e-07,
"loss": 0.0656,
"step": 1192
},
{
"epoch": 1.7340116279069768,
"grad_norm": 1.333242236616478,
"learning_rate": 4.301084914063475e-07,
"loss": 0.051,
"step": 1193
},
{
"epoch": 1.7354651162790697,
"grad_norm": 1.48014650555478,
"learning_rate": 4.2548834853355036e-07,
"loss": 0.0678,
"step": 1194
},
{
"epoch": 1.7369186046511627,
"grad_norm": 1.4136502655814862,
"learning_rate": 4.2089205121531475e-07,
"loss": 0.0423,
"step": 1195
},
{
"epoch": 1.7383720930232558,
"grad_norm": 1.466619588107673,
"learning_rate": 4.163196234107603e-07,
"loss": 0.0422,
"step": 1196
},
{
"epoch": 1.739825581395349,
"grad_norm": 1.0867167600455685,
"learning_rate": 4.117710889545767e-07,
"loss": 0.0465,
"step": 1197
},
{
"epoch": 1.7412790697674418,
"grad_norm": 1.1097337994586927,
"learning_rate": 4.0724647155690855e-07,
"loss": 0.049,
"step": 1198
},
{
"epoch": 1.7427325581395348,
"grad_norm": 1.4162752653560682,
"learning_rate": 4.0274579480322485e-07,
"loss": 0.0706,
"step": 1199
},
{
"epoch": 1.744186046511628,
"grad_norm": 1.6314238706254214,
"learning_rate": 3.9826908215420344e-07,
"loss": 0.0648,
"step": 1200
},
{
"epoch": 1.744186046511628,
"eval_loss": 0.13038784265518188,
"eval_runtime": 2.2036,
"eval_samples_per_second": 25.414,
"eval_steps_per_second": 6.353,
"step": 1200
},
{
"epoch": 1.745639534883721,
"grad_norm": 1.4924894866086462,
"learning_rate": 3.938163569455999e-07,
"loss": 0.065,
"step": 1201
},
{
"epoch": 1.747093023255814,
"grad_norm": 1.2812945325918792,
"learning_rate": 3.893876423881343e-07,
"loss": 0.056,
"step": 1202
},
{
"epoch": 1.7485465116279069,
"grad_norm": 1.4847911329450767,
"learning_rate": 3.8498296156736336e-07,
"loss": 0.0609,
"step": 1203
},
{
"epoch": 1.75,
"grad_norm": 1.1933603478148918,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.0528,
"step": 1204
},
{
"epoch": 1.7514534883720931,
"grad_norm": 1.2151612672851098,
"learning_rate": 3.7624579285161945e-07,
"loss": 0.0448,
"step": 1205
},
{
"epoch": 1.752906976744186,
"grad_norm": 1.2721240144688912,
"learning_rate": 3.719133505008793e-07,
"loss": 0.0638,
"step": 1206
},
{
"epoch": 1.754360465116279,
"grad_norm": 1.5815876693660877,
"learning_rate": 3.67605032975068e-07,
"loss": 0.0611,
"step": 1207
},
{
"epoch": 1.755813953488372,
"grad_norm": 1.4791508048535231,
"learning_rate": 3.633208627321483e-07,
"loss": 0.0565,
"step": 1208
},
{
"epoch": 1.7572674418604652,
"grad_norm": 1.9027980020443158,
"learning_rate": 3.590608621042141e-07,
"loss": 0.0835,
"step": 1209
},
{
"epoch": 1.7587209302325582,
"grad_norm": 1.2720247109280667,
"learning_rate": 3.548250532973663e-07,
"loss": 0.041,
"step": 1210
},
{
"epoch": 1.760174418604651,
"grad_norm": 1.3313592730700925,
"learning_rate": 3.50613458391606e-07,
"loss": 0.0662,
"step": 1211
},
{
"epoch": 1.7616279069767442,
"grad_norm": 1.742038899061449,
"learning_rate": 3.464260993407098e-07,
"loss": 0.0625,
"step": 1212
},
{
"epoch": 1.7630813953488373,
"grad_norm": 1.313671785304789,
"learning_rate": 3.422629979721226e-07,
"loss": 0.058,
"step": 1213
},
{
"epoch": 1.7645348837209303,
"grad_norm": 1.4381256471907784,
"learning_rate": 3.381241759868403e-07,
"loss": 0.0577,
"step": 1214
},
{
"epoch": 1.7659883720930232,
"grad_norm": 1.1247184686299345,
"learning_rate": 3.340096549592997e-07,
"loss": 0.0381,
"step": 1215
},
{
"epoch": 1.7674418604651163,
"grad_norm": 1.3749991805427006,
"learning_rate": 3.299194563372604e-07,
"loss": 0.05,
"step": 1216
},
{
"epoch": 1.7688953488372094,
"grad_norm": 1.2620255884743692,
"learning_rate": 3.258536014417002e-07,
"loss": 0.0404,
"step": 1217
},
{
"epoch": 1.7703488372093024,
"grad_norm": 1.4310474211303066,
"learning_rate": 3.2181211146669835e-07,
"loss": 0.0637,
"step": 1218
},
{
"epoch": 1.7718023255813953,
"grad_norm": 1.372997486141924,
"learning_rate": 3.177950074793279e-07,
"loss": 0.059,
"step": 1219
},
{
"epoch": 1.7732558139534884,
"grad_norm": 1.4514336664056386,
"learning_rate": 3.1380231041954366e-07,
"loss": 0.0802,
"step": 1220
},
{
"epoch": 1.7747093023255816,
"grad_norm": 1.294363423127495,
"learning_rate": 3.0983404110007775e-07,
"loss": 0.0463,
"step": 1221
},
{
"epoch": 1.7761627906976745,
"grad_norm": 1.2489012530557877,
"learning_rate": 3.05890220206323e-07,
"loss": 0.0758,
"step": 1222
},
{
"epoch": 1.7776162790697674,
"grad_norm": 1.3733868017342423,
"learning_rate": 3.0197086829623524e-07,
"loss": 0.0433,
"step": 1223
},
{
"epoch": 1.7790697674418605,
"grad_norm": 1.2696169610865513,
"learning_rate": 2.980760058002163e-07,
"loss": 0.0695,
"step": 1224
},
{
"epoch": 1.7805232558139537,
"grad_norm": 1.1885642085760084,
"learning_rate": 2.9420565302101467e-07,
"loss": 0.0689,
"step": 1225
},
{
"epoch": 1.7819767441860463,
"grad_norm": 1.7975663880717683,
"learning_rate": 2.9035983013361524e-07,
"loss": 0.0832,
"step": 1226
},
{
"epoch": 1.7834302325581395,
"grad_norm": 1.4668495811562468,
"learning_rate": 2.8653855718513867e-07,
"loss": 0.0839,
"step": 1227
},
{
"epoch": 1.7848837209302326,
"grad_norm": 1.7943430675185803,
"learning_rate": 2.827418540947313e-07,
"loss": 0.0734,
"step": 1228
},
{
"epoch": 1.7863372093023255,
"grad_norm": 1.414532030794273,
"learning_rate": 2.7896974065346636e-07,
"loss": 0.0591,
"step": 1229
},
{
"epoch": 1.7877906976744184,
"grad_norm": 1.3282221407021015,
"learning_rate": 2.7522223652423627e-07,
"loss": 0.0574,
"step": 1230
},
{
"epoch": 1.7892441860465116,
"grad_norm": 1.4616173540064281,
"learning_rate": 2.7149936124165556e-07,
"loss": 0.088,
"step": 1231
},
{
"epoch": 1.7906976744186047,
"grad_norm": 1.62055262401236,
"learning_rate": 2.67801134211953e-07,
"loss": 0.0555,
"step": 1232
},
{
"epoch": 1.7921511627906976,
"grad_norm": 1.5054710143448626,
"learning_rate": 2.6412757471287633e-07,
"loss": 0.0355,
"step": 1233
},
{
"epoch": 1.7936046511627906,
"grad_norm": 1.2551896585231108,
"learning_rate": 2.6047870189358504e-07,
"loss": 0.0619,
"step": 1234
},
{
"epoch": 1.7950581395348837,
"grad_norm": 1.4827616308813487,
"learning_rate": 2.568545347745582e-07,
"loss": 0.0699,
"step": 1235
},
{
"epoch": 1.7965116279069768,
"grad_norm": 1.1634489682279106,
"learning_rate": 2.5325509224748965e-07,
"loss": 0.0424,
"step": 1236
},
{
"epoch": 1.7979651162790697,
"grad_norm": 1.2805820993346737,
"learning_rate": 2.4968039307519174e-07,
"loss": 0.0738,
"step": 1237
},
{
"epoch": 1.7994186046511627,
"grad_norm": 1.2970028495965267,
"learning_rate": 2.461304558914973e-07,
"loss": 0.0483,
"step": 1238
},
{
"epoch": 1.8008720930232558,
"grad_norm": 1.3774251049609605,
"learning_rate": 2.426052992011613e-07,
"loss": 0.0594,
"step": 1239
},
{
"epoch": 1.802325581395349,
"grad_norm": 1.2741543577811263,
"learning_rate": 2.3910494137976526e-07,
"loss": 0.061,
"step": 1240
},
{
"epoch": 1.8037790697674418,
"grad_norm": 1.4946869686916726,
"learning_rate": 2.356294006736254e-07,
"loss": 0.0422,
"step": 1241
},
{
"epoch": 1.8052325581395348,
"grad_norm": 1.1976454792671984,
"learning_rate": 2.321786951996885e-07,
"loss": 0.049,
"step": 1242
},
{
"epoch": 1.806686046511628,
"grad_norm": 1.7640200270984723,
"learning_rate": 2.2875284294544663e-07,
"loss": 0.0654,
"step": 1243
},
{
"epoch": 1.808139534883721,
"grad_norm": 1.4131166766489878,
"learning_rate": 2.2535186176883771e-07,
"loss": 0.0615,
"step": 1244
},
{
"epoch": 1.809593023255814,
"grad_norm": 1.3382405156149042,
"learning_rate": 2.2197576939815447e-07,
"loss": 0.0596,
"step": 1245
},
{
"epoch": 1.8110465116279069,
"grad_norm": 1.0375341190167515,
"learning_rate": 2.186245834319517e-07,
"loss": 0.0378,
"step": 1246
},
{
"epoch": 1.8125,
"grad_norm": 1.26799708959962,
"learning_rate": 2.152983213389559e-07,
"loss": 0.0612,
"step": 1247
},
{
"epoch": 1.8139534883720931,
"grad_norm": 0.9680143896503044,
"learning_rate": 2.1199700045797077e-07,
"loss": 0.0391,
"step": 1248
},
{
"epoch": 1.815406976744186,
"grad_norm": 1.4351941135226602,
"learning_rate": 2.0872063799778908e-07,
"loss": 0.0465,
"step": 1249
},
{
"epoch": 1.816860465116279,
"grad_norm": 1.2212499026345323,
"learning_rate": 2.054692510371059e-07,
"loss": 0.0596,
"step": 1250
},
{
"epoch": 1.818313953488372,
"grad_norm": 1.4124067831447158,
"learning_rate": 2.0224285652442332e-07,
"loss": 0.0434,
"step": 1251
},
{
"epoch": 1.8197674418604652,
"grad_norm": 1.6900102388116847,
"learning_rate": 1.9904147127796646e-07,
"loss": 0.0633,
"step": 1252
},
{
"epoch": 1.8212209302325582,
"grad_norm": 1.1245930914451603,
"learning_rate": 1.9586511198559422e-07,
"loss": 0.0518,
"step": 1253
},
{
"epoch": 1.822674418604651,
"grad_norm": 1.4535155723070285,
"learning_rate": 1.9271379520471366e-07,
"loss": 0.0611,
"step": 1254
},
{
"epoch": 1.8241279069767442,
"grad_norm": 1.1651708233377704,
"learning_rate": 1.8958753736219137e-07,
"loss": 0.0437,
"step": 1255
},
{
"epoch": 1.8255813953488373,
"grad_norm": 1.698268844667189,
"learning_rate": 1.8648635475427112e-07,
"loss": 0.0661,
"step": 1256
},
{
"epoch": 1.8270348837209303,
"grad_norm": 1.2897615950696786,
"learning_rate": 1.8341026354648461e-07,
"loss": 0.0653,
"step": 1257
},
{
"epoch": 1.8284883720930232,
"grad_norm": 1.454033537605278,
"learning_rate": 1.8035927977357204e-07,
"loss": 0.0824,
"step": 1258
},
{
"epoch": 1.8299418604651163,
"grad_norm": 1.4782681121621621,
"learning_rate": 1.773334193393944e-07,
"loss": 0.0695,
"step": 1259
},
{
"epoch": 1.8313953488372094,
"grad_norm": 1.2472901471200917,
"learning_rate": 1.7433269801685304e-07,
"loss": 0.0535,
"step": 1260
},
{
"epoch": 1.8328488372093024,
"grad_norm": 1.411827026048705,
"learning_rate": 1.713571314478063e-07,
"loss": 0.0558,
"step": 1261
},
{
"epoch": 1.8343023255813953,
"grad_norm": 1.3101061668799474,
"learning_rate": 1.684067351429891e-07,
"loss": 0.0847,
"step": 1262
},
{
"epoch": 1.8357558139534884,
"grad_norm": 1.3283465609162761,
"learning_rate": 1.6548152448193021e-07,
"loss": 0.0517,
"step": 1263
},
{
"epoch": 1.8372093023255816,
"grad_norm": 1.073779839976608,
"learning_rate": 1.6258151471287397e-07,
"loss": 0.059,
"step": 1264
},
{
"epoch": 1.8386627906976745,
"grad_norm": 1.5768683051386514,
"learning_rate": 1.5970672095269978e-07,
"loss": 0.0715,
"step": 1265
},
{
"epoch": 1.8401162790697674,
"grad_norm": 1.6425224461963919,
"learning_rate": 1.5685715818684332e-07,
"loss": 0.0742,
"step": 1266
},
{
"epoch": 1.8415697674418605,
"grad_norm": 1.2460670529438014,
"learning_rate": 1.540328412692188e-07,
"loss": 0.0509,
"step": 1267
},
{
"epoch": 1.8430232558139537,
"grad_norm": 1.1780779637487353,
"learning_rate": 1.512337849221429e-07,
"loss": 0.0558,
"step": 1268
},
{
"epoch": 1.8444767441860463,
"grad_norm": 1.3379396790966653,
"learning_rate": 1.4846000373625325e-07,
"loss": 0.039,
"step": 1269
},
{
"epoch": 1.8459302325581395,
"grad_norm": 1.6926159045939793,
"learning_rate": 1.4571151217043944e-07,
"loss": 0.0581,
"step": 1270
},
{
"epoch": 1.8473837209302326,
"grad_norm": 1.358687319927915,
"learning_rate": 1.4298832455176104e-07,
"loss": 0.0442,
"step": 1271
},
{
"epoch": 1.8488372093023255,
"grad_norm": 1.1372613393653885,
"learning_rate": 1.4029045507537696e-07,
"loss": 0.0619,
"step": 1272
},
{
"epoch": 1.8502906976744184,
"grad_norm": 1.761528640147089,
"learning_rate": 1.376179178044701e-07,
"loss": 0.1053,
"step": 1273
},
{
"epoch": 1.8517441860465116,
"grad_norm": 1.103400969846263,
"learning_rate": 1.3497072667017497e-07,
"loss": 0.0444,
"step": 1274
},
{
"epoch": 1.8531976744186047,
"grad_norm": 1.6546592470023223,
"learning_rate": 1.3234889547150132e-07,
"loss": 0.0705,
"step": 1275
},
{
"epoch": 1.8546511627906976,
"grad_norm": 1.4348034154756075,
"learning_rate": 1.297524378752696e-07,
"loss": 0.0479,
"step": 1276
},
{
"epoch": 1.8561046511627906,
"grad_norm": 1.571516172687651,
"learning_rate": 1.2718136741603216e-07,
"loss": 0.0694,
"step": 1277
},
{
"epoch": 1.8575581395348837,
"grad_norm": 1.3840050658107048,
"learning_rate": 1.2463569749600613e-07,
"loss": 0.035,
"step": 1278
},
{
"epoch": 1.8590116279069768,
"grad_norm": 1.4947152523030978,
"learning_rate": 1.2211544138500452e-07,
"loss": 0.0561,
"step": 1279
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.1233071008724007,
"learning_rate": 1.196206122203647e-07,
"loss": 0.036,
"step": 1280
},
{
"epoch": 1.8619186046511627,
"grad_norm": 1.314094801957361,
"learning_rate": 1.1715122300688109e-07,
"loss": 0.0535,
"step": 1281
},
{
"epoch": 1.8633720930232558,
"grad_norm": 1.2805535837964788,
"learning_rate": 1.1470728661673814e-07,
"loss": 0.0556,
"step": 1282
},
{
"epoch": 1.864825581395349,
"grad_norm": 1.6182345359056034,
"learning_rate": 1.122888157894414e-07,
"loss": 0.0446,
"step": 1283
},
{
"epoch": 1.8662790697674418,
"grad_norm": 1.5370590795498948,
"learning_rate": 1.0989582313175373e-07,
"loss": 0.0648,
"step": 1284
},
{
"epoch": 1.8677325581395348,
"grad_norm": 1.2775753759686193,
"learning_rate": 1.0752832111762479e-07,
"loss": 0.0485,
"step": 1285
},
{
"epoch": 1.869186046511628,
"grad_norm": 1.3721105986516784,
"learning_rate": 1.0518632208813274e-07,
"loss": 0.0461,
"step": 1286
},
{
"epoch": 1.870639534883721,
"grad_norm": 1.3509315346274664,
"learning_rate": 1.0286983825141373e-07,
"loss": 0.0591,
"step": 1287
},
{
"epoch": 1.872093023255814,
"grad_norm": 1.3166068602170886,
"learning_rate": 1.0057888168260311e-07,
"loss": 0.0667,
"step": 1288
},
{
"epoch": 1.8735465116279069,
"grad_norm": 1.0841449480013747,
"learning_rate": 9.831346432376765e-08,
"loss": 0.0296,
"step": 1289
},
{
"epoch": 1.875,
"grad_norm": 1.4139829336218244,
"learning_rate": 9.607359798384785e-08,
"loss": 0.0899,
"step": 1290
},
{
"epoch": 1.8764534883720931,
"grad_norm": 1.3681050302392366,
"learning_rate": 9.385929433859353e-08,
"loss": 0.04,
"step": 1291
},
{
"epoch": 1.877906976744186,
"grad_norm": 1.4537407830939912,
"learning_rate": 9.167056493050497e-08,
"loss": 0.0582,
"step": 1292
},
{
"epoch": 1.879360465116279,
"grad_norm": 1.437948138954234,
"learning_rate": 8.95074211687702e-08,
"loss": 0.0558,
"step": 1293
},
{
"epoch": 1.880813953488372,
"grad_norm": 1.2655853734528568,
"learning_rate": 8.736987432920785e-08,
"loss": 0.0562,
"step": 1294
},
{
"epoch": 1.8822674418604652,
"grad_norm": 1.2886785360003277,
"learning_rate": 8.525793555420714e-08,
"loss": 0.0642,
"step": 1295
},
{
"epoch": 1.8837209302325582,
"grad_norm": 1.5650728826423228,
"learning_rate": 8.317161585266964e-08,
"loss": 0.048,
"step": 1296
},
{
"epoch": 1.885174418604651,
"grad_norm": 1.75943043500276,
"learning_rate": 8.111092609995375e-08,
"loss": 0.0516,
"step": 1297
},
{
"epoch": 1.8866279069767442,
"grad_norm": 2.137820572547727,
"learning_rate": 7.907587703781583e-08,
"loss": 0.1197,
"step": 1298
},
{
"epoch": 1.8880813953488373,
"grad_norm": 1.4094974542843408,
"learning_rate": 7.706647927435528e-08,
"loss": 0.0446,
"step": 1299
},
{
"epoch": 1.8895348837209303,
"grad_norm": 1.5046480179949109,
"learning_rate": 7.508274328395848e-08,
"loss": 0.0717,
"step": 1300
},
{
"epoch": 1.8909883720930232,
"grad_norm": 0.9467909119425738,
"learning_rate": 7.312467940724488e-08,
"loss": 0.0391,
"step": 1301
},
{
"epoch": 1.8924418604651163,
"grad_norm": 1.159518695387788,
"learning_rate": 7.119229785101322e-08,
"loss": 0.0499,
"step": 1302
},
{
"epoch": 1.8938953488372094,
"grad_norm": 1.1359551931047827,
"learning_rate": 6.928560868818823e-08,
"loss": 0.0528,
"step": 1303
},
{
"epoch": 1.8953488372093024,
"grad_norm": 1.3754138749662996,
"learning_rate": 6.74046218577673e-08,
"loss": 0.0901,
"step": 1304
},
{
"epoch": 1.8968023255813953,
"grad_norm": 1.1173372045284649,
"learning_rate": 6.554934716476946e-08,
"loss": 0.0416,
"step": 1305
},
{
"epoch": 1.8982558139534884,
"grad_norm": 1.6715320242870557,
"learning_rate": 6.371979428018371e-08,
"loss": 0.051,
"step": 1306
},
{
"epoch": 1.8997093023255816,
"grad_norm": 1.6108507509250944,
"learning_rate": 6.191597274091965e-08,
"loss": 0.048,
"step": 1307
},
{
"epoch": 1.9011627906976745,
"grad_norm": 1.1025808474832854,
"learning_rate": 6.01378919497575e-08,
"loss": 0.0473,
"step": 1308
},
{
"epoch": 1.9026162790697674,
"grad_norm": 1.1565768061022645,
"learning_rate": 5.838556117529759e-08,
"loss": 0.0532,
"step": 1309
},
{
"epoch": 1.9040697674418605,
"grad_norm": 1.3641230663168908,
"learning_rate": 5.6658989551913736e-08,
"loss": 0.059,
"step": 1310
},
{
"epoch": 1.9055232558139537,
"grad_norm": 1.3356709104324593,
"learning_rate": 5.495818607970549e-08,
"loss": 0.0612,
"step": 1311
},
{
"epoch": 1.9069767441860463,
"grad_norm": 1.660638299272094,
"learning_rate": 5.3283159624448745e-08,
"loss": 0.0496,
"step": 1312
},
{
"epoch": 1.9084302325581395,
"grad_norm": 1.7498825229557364,
"learning_rate": 5.16339189175552e-08,
"loss": 0.0577,
"step": 1313
},
{
"epoch": 1.9098837209302326,
"grad_norm": 1.5655598568640097,
"learning_rate": 5.0010472556019096e-08,
"loss": 0.0553,
"step": 1314
},
{
"epoch": 1.9113372093023255,
"grad_norm": 1.3860084050687909,
"learning_rate": 4.841282900237942e-08,
"loss": 0.0631,
"step": 1315
},
{
"epoch": 1.9127906976744184,
"grad_norm": 1.3887503495723006,
"learning_rate": 4.684099658467223e-08,
"loss": 0.0699,
"step": 1316
},
{
"epoch": 1.9142441860465116,
"grad_norm": 1.2678955663748521,
"learning_rate": 4.529498349638728e-08,
"loss": 0.0402,
"step": 1317
},
{
"epoch": 1.9156976744186047,
"grad_norm": 1.284564731699548,
"learning_rate": 4.377479779642535e-08,
"loss": 0.0502,
"step": 1318
},
{
"epoch": 1.9171511627906976,
"grad_norm": 1.2529895690190205,
"learning_rate": 4.228044740905879e-08,
"loss": 0.0626,
"step": 1319
},
{
"epoch": 1.9186046511627906,
"grad_norm": 1.5762385143793198,
"learning_rate": 4.081194012388601e-08,
"loss": 0.0627,
"step": 1320
},
{
"epoch": 1.9200581395348837,
"grad_norm": 1.6231285336791623,
"learning_rate": 3.936928359579539e-08,
"loss": 0.0667,
"step": 1321
},
{
"epoch": 1.9215116279069768,
"grad_norm": 1.5262840772998052,
"learning_rate": 3.7952485344921465e-08,
"loss": 0.0425,
"step": 1322
},
{
"epoch": 1.9229651162790697,
"grad_norm": 1.145845214252006,
"learning_rate": 3.656155275660711e-08,
"loss": 0.0455,
"step": 1323
},
{
"epoch": 1.9244186046511627,
"grad_norm": 2.0089578761429916,
"learning_rate": 3.5196493081366966e-08,
"loss": 0.0779,
"step": 1324
},
{
"epoch": 1.9258720930232558,
"grad_norm": 1.613161432653411,
"learning_rate": 3.385731343484633e-08,
"loss": 0.0759,
"step": 1325
},
{
"epoch": 1.927325581395349,
"grad_norm": 1.1432087881946933,
"learning_rate": 3.254402079778618e-08,
"loss": 0.0547,
"step": 1326
},
{
"epoch": 1.9287790697674418,
"grad_norm": 1.4249087522542705,
"learning_rate": 3.125662201598656e-08,
"loss": 0.0663,
"step": 1327
},
{
"epoch": 1.9302325581395348,
"grad_norm": 1.2132853467810891,
"learning_rate": 2.9995123800270476e-08,
"loss": 0.0499,
"step": 1328
},
{
"epoch": 1.931686046511628,
"grad_norm": 1.193012210008745,
"learning_rate": 2.8759532726448937e-08,
"loss": 0.0523,
"step": 1329
},
{
"epoch": 1.933139534883721,
"grad_norm": 1.358877307198463,
"learning_rate": 2.754985523528708e-08,
"loss": 0.0695,
"step": 1330
},
{
"epoch": 1.934593023255814,
"grad_norm": 1.6866286956608185,
"learning_rate": 2.6366097632469778e-08,
"loss": 0.0625,
"step": 1331
},
{
"epoch": 1.9360465116279069,
"grad_norm": 1.585941621289744,
"learning_rate": 2.5208266088569966e-08,
"loss": 0.0511,
"step": 1332
},
{
"epoch": 1.9375,
"grad_norm": 1.3872165917004133,
"learning_rate": 2.4076366639015914e-08,
"loss": 0.0568,
"step": 1333
},
{
"epoch": 1.9389534883720931,
"grad_norm": 1.457747420185785,
"learning_rate": 2.2970405184058463e-08,
"loss": 0.0547,
"step": 1334
},
{
"epoch": 1.940406976744186,
"grad_norm": 1.1949184875255823,
"learning_rate": 2.1890387488742726e-08,
"loss": 0.0399,
"step": 1335
},
{
"epoch": 1.941860465116279,
"grad_norm": 1.5940091425661185,
"learning_rate": 2.083631918287643e-08,
"loss": 0.0518,
"step": 1336
},
{
"epoch": 1.943313953488372,
"grad_norm": 1.382910110487527,
"learning_rate": 1.9808205761001065e-08,
"loss": 0.0532,
"step": 1337
},
{
"epoch": 1.9447674418604652,
"grad_norm": 1.4736697924051638,
"learning_rate": 1.880605258236301e-08,
"loss": 0.0588,
"step": 1338
},
{
"epoch": 1.9462209302325582,
"grad_norm": 1.295565534954337,
"learning_rate": 1.782986487088467e-08,
"loss": 0.0724,
"step": 1339
},
{
"epoch": 1.947674418604651,
"grad_norm": 1.3488063432605657,
"learning_rate": 1.6879647715140613e-08,
"loss": 0.0923,
"step": 1340
},
{
"epoch": 1.9491279069767442,
"grad_norm": 1.3676570070169922,
"learning_rate": 1.5955406068326462e-08,
"loss": 0.0515,
"step": 1341
},
{
"epoch": 1.9505813953488373,
"grad_norm": 1.2884297596799819,
"learning_rate": 1.5057144748236162e-08,
"loss": 0.0462,
"step": 1342
},
{
"epoch": 1.9520348837209303,
"grad_norm": 1.2443705721719838,
"learning_rate": 1.4184868437236987e-08,
"loss": 0.0576,
"step": 1343
},
{
"epoch": 1.9534883720930232,
"grad_norm": 1.3053643973067475,
"learning_rate": 1.333858168224178e-08,
"loss": 0.0537,
"step": 1344
},
{
"epoch": 1.9549418604651163,
"grad_norm": 1.546574318041773,
"learning_rate": 1.2518288894690089e-08,
"loss": 0.0372,
"step": 1345
},
{
"epoch": 1.9563953488372094,
"grad_norm": 1.2490005145711904,
"learning_rate": 1.1723994350521518e-08,
"loss": 0.0541,
"step": 1346
},
{
"epoch": 1.9578488372093024,
"grad_norm": 1.5464557013202604,
"learning_rate": 1.0955702190154072e-08,
"loss": 0.0558,
"step": 1347
},
{
"epoch": 1.9593023255813953,
"grad_norm": 1.2582934072656768,
"learning_rate": 1.0213416418465294e-08,
"loss": 0.0567,
"step": 1348
},
{
"epoch": 1.9607558139534884,
"grad_norm": 1.9089698225345129,
"learning_rate": 9.497140904766722e-09,
"loss": 0.0556,
"step": 1349
},
{
"epoch": 1.9622093023255816,
"grad_norm": 1.509899723394668,
"learning_rate": 8.806879382788347e-09,
"loss": 0.0603,
"step": 1350
},
{
"epoch": 1.9636627906976745,
"grad_norm": 1.4511032534590458,
"learning_rate": 8.142635450654746e-09,
"loss": 0.0762,
"step": 1351
},
{
"epoch": 1.9651162790697674,
"grad_norm": 1.6080149924866183,
"learning_rate": 7.5044125708712e-09,
"loss": 0.0609,
"step": 1352
},
{
"epoch": 1.9665697674418605,
"grad_norm": 1.1922215779000969,
"learning_rate": 6.89221407030094e-09,
"loss": 0.0444,
"step": 1353
},
{
"epoch": 1.9680232558139537,
"grad_norm": 1.3256995449929614,
"learning_rate": 6.3060431401512634e-09,
"loss": 0.0474,
"step": 1354
},
{
"epoch": 1.9694767441860463,
"grad_norm": 1.1949382933968422,
"learning_rate": 5.7459028359546645e-09,
"loss": 0.0467,
"step": 1355
},
{
"epoch": 1.9709302325581395,
"grad_norm": 1.693812650124421,
"learning_rate": 5.211796077554399e-09,
"loss": 0.0645,
"step": 1356
},
{
"epoch": 1.9723837209302326,
"grad_norm": 1.454630218156985,
"learning_rate": 4.703725649088941e-09,
"loss": 0.0514,
"step": 1357
},
{
"epoch": 1.9738372093023255,
"grad_norm": 1.4686563612295154,
"learning_rate": 4.221694198976445e-09,
"loss": 0.0654,
"step": 1358
},
{
"epoch": 1.9752906976744184,
"grad_norm": 1.7602700723605103,
"learning_rate": 3.765704239901413e-09,
"loss": 0.0678,
"step": 1359
},
{
"epoch": 1.9767441860465116,
"grad_norm": 1.2613024941113347,
"learning_rate": 3.3357581488030476e-09,
"loss": 0.0627,
"step": 1360
},
{
"epoch": 1.9781976744186047,
"grad_norm": 1.7048864018499121,
"learning_rate": 2.9318581668613676e-09,
"loss": 0.0691,
"step": 1361
},
{
"epoch": 1.9796511627906976,
"grad_norm": 1.1475541829943066,
"learning_rate": 2.5540063994849982e-09,
"loss": 0.0512,
"step": 1362
},
{
"epoch": 1.9811046511627906,
"grad_norm": 1.6476817749953838,
"learning_rate": 2.202204816302289e-09,
"loss": 0.05,
"step": 1363
},
{
"epoch": 1.9825581395348837,
"grad_norm": 1.0248136833336827,
"learning_rate": 1.8764552511485457e-09,
"loss": 0.0374,
"step": 1364
},
{
"epoch": 1.9840116279069768,
"grad_norm": 1.248745978654899,
"learning_rate": 1.576759402058814e-09,
"loss": 0.0581,
"step": 1365
},
{
"epoch": 1.9854651162790697,
"grad_norm": 1.4353092543675106,
"learning_rate": 1.3031188312573328e-09,
"loss": 0.0512,
"step": 1366
},
{
"epoch": 1.9869186046511627,
"grad_norm": 1.6080768674831933,
"learning_rate": 1.0555349651503178e-09,
"loss": 0.063,
"step": 1367
},
{
"epoch": 1.9883720930232558,
"grad_norm": 1.426444153430007,
"learning_rate": 8.340090943176338e-10,
"loss": 0.0526,
"step": 1368
},
{
"epoch": 1.989825581395349,
"grad_norm": 1.3214408477997244,
"learning_rate": 6.385423735078e-10,
"loss": 0.0463,
"step": 1369
},
{
"epoch": 1.9912790697674418,
"grad_norm": 1.7983231171481742,
"learning_rate": 4.691358216291075e-10,
"loss": 0.058,
"step": 1370
},
{
"epoch": 1.9927325581395348,
"grad_norm": 1.4390939528927378,
"learning_rate": 3.257903217479541e-10,
"loss": 0.0639,
"step": 1371
},
{
"epoch": 1.994186046511628,
"grad_norm": 1.1021839312652135,
"learning_rate": 2.0850662108051755e-10,
"loss": 0.0402,
"step": 1372
},
{
"epoch": 1.995639534883721,
"grad_norm": 1.3590522618876162,
"learning_rate": 1.1728533099220063e-10,
"loss": 0.0509,
"step": 1373
},
{
"epoch": 1.997093023255814,
"grad_norm": 1.4976019289330444,
"learning_rate": 5.2126926991524774e-11,
"loss": 0.0671,
"step": 1374
},
{
"epoch": 1.9985465116279069,
"grad_norm": 1.3973669217539937,
"learning_rate": 1.3031748730685246e-11,
"loss": 0.0565,
"step": 1375
},
{
"epoch": 2.0,
"grad_norm": 0.8399439880889858,
"learning_rate": 0.0,
"loss": 0.0345,
"step": 1376
},
{
"epoch": 2.0,
"step": 1376,
"total_flos": 7660524011520.0,
"train_loss": 0.10411996046909629,
"train_runtime": 1065.2694,
"train_samples_per_second": 10.32,
"train_steps_per_second": 1.292
}
],
"logging_steps": 1,
"max_steps": 1376,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7660524011520.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}