{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 754,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002652519893899204,
"grad_norm": 3.143817743419433,
"learning_rate": 9.999956599329999e-06,
"loss": 0.4703,
"step": 1
},
{
"epoch": 0.005305039787798408,
"grad_norm": 2.9153346228246715,
"learning_rate": 9.999826398073438e-06,
"loss": 0.4676,
"step": 2
},
{
"epoch": 0.007957559681697613,
"grad_norm": 2.1837013669916683,
"learning_rate": 9.999609398490651e-06,
"loss": 0.3752,
"step": 3
},
{
"epoch": 0.010610079575596816,
"grad_norm": 2.7009872655116145,
"learning_rate": 9.999305604348804e-06,
"loss": 0.494,
"step": 4
},
{
"epoch": 0.013262599469496022,
"grad_norm": 2.71509007103024,
"learning_rate": 9.998915020921847e-06,
"loss": 0.4101,
"step": 5
},
{
"epoch": 0.015915119363395226,
"grad_norm": 2.046773566123141,
"learning_rate": 9.998437654990412e-06,
"loss": 0.3118,
"step": 6
},
{
"epoch": 0.01856763925729443,
"grad_norm": 1.6805054009773157,
"learning_rate": 9.997873514841703e-06,
"loss": 0.2808,
"step": 7
},
{
"epoch": 0.021220159151193633,
"grad_norm": 1.9031301046910736,
"learning_rate": 9.997222610269339e-06,
"loss": 0.288,
"step": 8
},
{
"epoch": 0.023872679045092837,
"grad_norm": 2.2790419571248,
"learning_rate": 9.996484952573203e-06,
"loss": 0.3187,
"step": 9
},
{
"epoch": 0.026525198938992044,
"grad_norm": 2.005476307323896,
"learning_rate": 9.995660554559225e-06,
"loss": 0.2684,
"step": 10
},
{
"epoch": 0.029177718832891247,
"grad_norm": 2.1118955586623542,
"learning_rate": 9.99474943053918e-06,
"loss": 0.2861,
"step": 11
},
{
"epoch": 0.03183023872679045,
"grad_norm": 1.9084863447700302,
"learning_rate": 9.993751596330421e-06,
"loss": 0.2447,
"step": 12
},
{
"epoch": 0.034482758620689655,
"grad_norm": 2.443033048528926,
"learning_rate": 9.99266706925562e-06,
"loss": 0.295,
"step": 13
},
{
"epoch": 0.03713527851458886,
"grad_norm": 2.252694252488703,
"learning_rate": 9.991495868142457e-06,
"loss": 0.2895,
"step": 14
},
{
"epoch": 0.03978779840848806,
"grad_norm": 2.134403724246919,
"learning_rate": 9.990238013323298e-06,
"loss": 0.2704,
"step": 15
},
{
"epoch": 0.042440318302387266,
"grad_norm": 2.28401373031769,
"learning_rate": 9.98889352663484e-06,
"loss": 0.2648,
"step": 16
},
{
"epoch": 0.04509283819628647,
"grad_norm": 2.1064551964073286,
"learning_rate": 9.987462431417732e-06,
"loss": 0.2033,
"step": 17
},
{
"epoch": 0.04774535809018567,
"grad_norm": 2.1590960874250102,
"learning_rate": 9.985944752516168e-06,
"loss": 0.206,
"step": 18
},
{
"epoch": 0.050397877984084884,
"grad_norm": 1.787435615555535,
"learning_rate": 9.984340516277464e-06,
"loss": 0.2446,
"step": 19
},
{
"epoch": 0.05305039787798409,
"grad_norm": 2.154695176620596,
"learning_rate": 9.982649750551589e-06,
"loss": 0.2428,
"step": 20
},
{
"epoch": 0.05570291777188329,
"grad_norm": 2.013092950603686,
"learning_rate": 9.980872484690689e-06,
"loss": 0.2095,
"step": 21
},
{
"epoch": 0.058355437665782495,
"grad_norm": 1.871221689867333,
"learning_rate": 9.979008749548575e-06,
"loss": 0.2119,
"step": 22
},
{
"epoch": 0.0610079575596817,
"grad_norm": 2.1428411294867975,
"learning_rate": 9.977058577480192e-06,
"loss": 0.2031,
"step": 23
},
{
"epoch": 0.0636604774535809,
"grad_norm": 2.029431401764423,
"learning_rate": 9.975022002341045e-06,
"loss": 0.2279,
"step": 24
},
{
"epoch": 0.06631299734748011,
"grad_norm": 1.9037807432106393,
"learning_rate": 9.972899059486629e-06,
"loss": 0.2236,
"step": 25
},
{
"epoch": 0.06896551724137931,
"grad_norm": 1.5412274357921503,
"learning_rate": 9.970689785771798e-06,
"loss": 0.1816,
"step": 26
},
{
"epoch": 0.07161803713527852,
"grad_norm": 2.230329801114519,
"learning_rate": 9.968394219550136e-06,
"loss": 0.2428,
"step": 27
},
{
"epoch": 0.07427055702917772,
"grad_norm": 1.902794098881427,
"learning_rate": 9.966012400673291e-06,
"loss": 0.2154,
"step": 28
},
{
"epoch": 0.07692307692307693,
"grad_norm": 1.8780742288675212,
"learning_rate": 9.96354437049027e-06,
"loss": 0.1972,
"step": 29
},
{
"epoch": 0.07957559681697612,
"grad_norm": 1.8047024624739505,
"learning_rate": 9.960990171846745e-06,
"loss": 0.2003,
"step": 30
},
{
"epoch": 0.08222811671087533,
"grad_norm": 1.99420196004025,
"learning_rate": 9.958349849084286e-06,
"loss": 0.2229,
"step": 31
},
{
"epoch": 0.08488063660477453,
"grad_norm": 1.9295211213650523,
"learning_rate": 9.955623448039605e-06,
"loss": 0.2077,
"step": 32
},
{
"epoch": 0.08753315649867374,
"grad_norm": 1.9560816295756256,
"learning_rate": 9.952811016043753e-06,
"loss": 0.2035,
"step": 33
},
{
"epoch": 0.09018567639257294,
"grad_norm": 1.8994575592237317,
"learning_rate": 9.949912601921306e-06,
"loss": 0.2365,
"step": 34
},
{
"epoch": 0.09283819628647215,
"grad_norm": 1.7820353508663918,
"learning_rate": 9.946928255989507e-06,
"loss": 0.1889,
"step": 35
},
{
"epoch": 0.09549071618037135,
"grad_norm": 1.9972563802629155,
"learning_rate": 9.943858030057404e-06,
"loss": 0.1992,
"step": 36
},
{
"epoch": 0.09814323607427056,
"grad_norm": 1.9858847962279158,
"learning_rate": 9.94070197742494e-06,
"loss": 0.2126,
"step": 37
},
{
"epoch": 0.10079575596816977,
"grad_norm": 2.2468613321703375,
"learning_rate": 9.937460152882035e-06,
"loss": 0.1966,
"step": 38
},
{
"epoch": 0.10344827586206896,
"grad_norm": 1.9047848108957046,
"learning_rate": 9.934132612707631e-06,
"loss": 0.1929,
"step": 39
},
{
"epoch": 0.10610079575596817,
"grad_norm": 1.5198039037792352,
"learning_rate": 9.93071941466872e-06,
"loss": 0.1454,
"step": 40
},
{
"epoch": 0.10875331564986737,
"grad_norm": 1.942256631038327,
"learning_rate": 9.927220618019331e-06,
"loss": 0.1927,
"step": 41
},
{
"epoch": 0.11140583554376658,
"grad_norm": 1.9039836584761076,
"learning_rate": 9.923636283499513e-06,
"loss": 0.1683,
"step": 42
},
{
"epoch": 0.11405835543766578,
"grad_norm": 2.3882871093584015,
"learning_rate": 9.919966473334278e-06,
"loss": 0.1925,
"step": 43
},
{
"epoch": 0.11671087533156499,
"grad_norm": 1.9482729522397495,
"learning_rate": 9.916211251232507e-06,
"loss": 0.204,
"step": 44
},
{
"epoch": 0.11936339522546419,
"grad_norm": 1.9541832414635472,
"learning_rate": 9.912370682385866e-06,
"loss": 0.1718,
"step": 45
},
{
"epoch": 0.1220159151193634,
"grad_norm": 2.2469271806449744,
"learning_rate": 9.908444833467659e-06,
"loss": 0.2099,
"step": 46
},
{
"epoch": 0.1246684350132626,
"grad_norm": 1.9273893068530785,
"learning_rate": 9.904433772631674e-06,
"loss": 0.1896,
"step": 47
},
{
"epoch": 0.1273209549071618,
"grad_norm": 2.020005570778388,
"learning_rate": 9.900337569511003e-06,
"loss": 0.2135,
"step": 48
},
{
"epoch": 0.129973474801061,
"grad_norm": 1.787168676896887,
"learning_rate": 9.896156295216832e-06,
"loss": 0.1895,
"step": 49
},
{
"epoch": 0.13262599469496023,
"grad_norm": 1.9492709359169609,
"learning_rate": 9.891890022337201e-06,
"loss": 0.1948,
"step": 50
},
{
"epoch": 0.13527851458885942,
"grad_norm": 1.4707392860636022,
"learning_rate": 9.88753882493575e-06,
"loss": 0.1494,
"step": 51
},
{
"epoch": 0.13793103448275862,
"grad_norm": 1.8405786005165676,
"learning_rate": 9.883102778550434e-06,
"loss": 0.1752,
"step": 52
},
{
"epoch": 0.14058355437665782,
"grad_norm": 1.8735255704087945,
"learning_rate": 9.878581960192206e-06,
"loss": 0.1779,
"step": 53
},
{
"epoch": 0.14323607427055704,
"grad_norm": 2.0141493172675746,
"learning_rate": 9.873976448343685e-06,
"loss": 0.2168,
"step": 54
},
{
"epoch": 0.14588859416445624,
"grad_norm": 1.7017677167638687,
"learning_rate": 9.86928632295779e-06,
"loss": 0.1589,
"step": 55
},
{
"epoch": 0.14854111405835543,
"grad_norm": 1.6283124591177214,
"learning_rate": 9.864511665456355e-06,
"loss": 0.1399,
"step": 56
},
{
"epoch": 0.15119363395225463,
"grad_norm": 2.2298913035183845,
"learning_rate": 9.859652558728714e-06,
"loss": 0.2046,
"step": 57
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.12248040462289,
"learning_rate": 9.854709087130261e-06,
"loss": 0.1859,
"step": 58
},
{
"epoch": 0.15649867374005305,
"grad_norm": 2.4614522882556247,
"learning_rate": 9.84968133648099e-06,
"loss": 0.2284,
"step": 59
},
{
"epoch": 0.15915119363395225,
"grad_norm": 2.0728088827577578,
"learning_rate": 9.844569394063997e-06,
"loss": 0.1826,
"step": 60
},
{
"epoch": 0.16180371352785147,
"grad_norm": 1.626832715567298,
"learning_rate": 9.839373348623976e-06,
"loss": 0.144,
"step": 61
},
{
"epoch": 0.16445623342175067,
"grad_norm": 2.095258455090102,
"learning_rate": 9.834093290365665e-06,
"loss": 0.1762,
"step": 62
},
{
"epoch": 0.16710875331564987,
"grad_norm": 2.1180496536142495,
"learning_rate": 9.828729310952292e-06,
"loss": 0.1754,
"step": 63
},
{
"epoch": 0.16976127320954906,
"grad_norm": 2.523281048591522,
"learning_rate": 9.823281503503976e-06,
"loss": 0.2063,
"step": 64
},
{
"epoch": 0.1724137931034483,
"grad_norm": 1.7605117934687937,
"learning_rate": 9.817749962596115e-06,
"loss": 0.1513,
"step": 65
},
{
"epoch": 0.17506631299734748,
"grad_norm": 1.8068568519736308,
"learning_rate": 9.812134784257743e-06,
"loss": 0.1695,
"step": 66
},
{
"epoch": 0.17771883289124668,
"grad_norm": 2.194003391788825,
"learning_rate": 9.80643606596986e-06,
"loss": 0.1873,
"step": 67
},
{
"epoch": 0.18037135278514588,
"grad_norm": 1.8221102873639325,
"learning_rate": 9.80065390666374e-06,
"loss": 0.1709,
"step": 68
},
{
"epoch": 0.1830238726790451,
"grad_norm": 1.81681516720243,
"learning_rate": 9.794788406719223e-06,
"loss": 0.1611,
"step": 69
},
{
"epoch": 0.1856763925729443,
"grad_norm": 1.6215357337836858,
"learning_rate": 9.788839667962956e-06,
"loss": 0.1491,
"step": 70
},
{
"epoch": 0.1883289124668435,
"grad_norm": 1.9544453657582719,
"learning_rate": 9.78280779366664e-06,
"loss": 0.1666,
"step": 71
},
{
"epoch": 0.1909814323607427,
"grad_norm": 1.9891058976186013,
"learning_rate": 9.77669288854523e-06,
"loss": 0.1652,
"step": 72
},
{
"epoch": 0.19363395225464192,
"grad_norm": 1.7657716439562443,
"learning_rate": 9.770495058755113e-06,
"loss": 0.1665,
"step": 73
},
{
"epoch": 0.1962864721485411,
"grad_norm": 2.0702179893057657,
"learning_rate": 9.764214411892283e-06,
"loss": 0.1839,
"step": 74
},
{
"epoch": 0.1989389920424403,
"grad_norm": 1.94174040080256,
"learning_rate": 9.757851056990446e-06,
"loss": 0.1514,
"step": 75
},
{
"epoch": 0.20159151193633953,
"grad_norm": 2.0359509600829355,
"learning_rate": 9.751405104519151e-06,
"loss": 0.168,
"step": 76
},
{
"epoch": 0.20424403183023873,
"grad_norm": 1.8488897921580416,
"learning_rate": 9.744876666381861e-06,
"loss": 0.1642,
"step": 77
},
{
"epoch": 0.20689655172413793,
"grad_norm": 1.9493633096974006,
"learning_rate": 9.738265855914014e-06,
"loss": 0.1451,
"step": 78
},
{
"epoch": 0.20954907161803712,
"grad_norm": 1.969779554700488,
"learning_rate": 9.731572787881045e-06,
"loss": 0.1738,
"step": 79
},
{
"epoch": 0.21220159151193635,
"grad_norm": 1.6792825100474953,
"learning_rate": 9.724797578476414e-06,
"loss": 0.1243,
"step": 80
},
{
"epoch": 0.21485411140583555,
"grad_norm": 1.9140356369318925,
"learning_rate": 9.71794034531957e-06,
"loss": 0.1591,
"step": 81
},
{
"epoch": 0.21750663129973474,
"grad_norm": 1.7940577658445638,
"learning_rate": 9.711001207453919e-06,
"loss": 0.1411,
"step": 82
},
{
"epoch": 0.22015915119363394,
"grad_norm": 2.383028621766121,
"learning_rate": 9.703980285344752e-06,
"loss": 0.1752,
"step": 83
},
{
"epoch": 0.22281167108753316,
"grad_norm": 2.051261568492118,
"learning_rate": 9.696877700877162e-06,
"loss": 0.1974,
"step": 84
},
{
"epoch": 0.22546419098143236,
"grad_norm": 1.694394298477383,
"learning_rate": 9.689693577353917e-06,
"loss": 0.1395,
"step": 85
},
{
"epoch": 0.22811671087533156,
"grad_norm": 1.8051445315092405,
"learning_rate": 9.682428039493325e-06,
"loss": 0.1576,
"step": 86
},
{
"epoch": 0.23076923076923078,
"grad_norm": 1.988060978591615,
"learning_rate": 9.675081213427076e-06,
"loss": 0.152,
"step": 87
},
{
"epoch": 0.23342175066312998,
"grad_norm": 1.6413056627568852,
"learning_rate": 9.667653226698033e-06,
"loss": 0.157,
"step": 88
},
{
"epoch": 0.23607427055702918,
"grad_norm": 1.7068088087742874,
"learning_rate": 9.660144208258039e-06,
"loss": 0.1518,
"step": 89
},
{
"epoch": 0.23872679045092837,
"grad_norm": 2.0128746338247856,
"learning_rate": 9.652554288465668e-06,
"loss": 0.1792,
"step": 90
},
{
"epoch": 0.2413793103448276,
"grad_norm": 1.7874268938466609,
"learning_rate": 9.644883599083959e-06,
"loss": 0.1459,
"step": 91
},
{
"epoch": 0.2440318302387268,
"grad_norm": 2.15165919348902,
"learning_rate": 9.637132273278135e-06,
"loss": 0.1745,
"step": 92
},
{
"epoch": 0.246684350132626,
"grad_norm": 2.3979359764341415,
"learning_rate": 9.629300445613294e-06,
"loss": 0.2075,
"step": 93
},
{
"epoch": 0.2493368700265252,
"grad_norm": 2.250801983136737,
"learning_rate": 9.62138825205206e-06,
"loss": 0.1958,
"step": 94
},
{
"epoch": 0.2519893899204244,
"grad_norm": 2.021920489405669,
"learning_rate": 9.613395829952233e-06,
"loss": 0.1608,
"step": 95
},
{
"epoch": 0.2546419098143236,
"grad_norm": 2.2291104046239076,
"learning_rate": 9.605323318064403e-06,
"loss": 0.196,
"step": 96
},
{
"epoch": 0.2572944297082228,
"grad_norm": 1.7681014524697132,
"learning_rate": 9.59717085652954e-06,
"loss": 0.1367,
"step": 97
},
{
"epoch": 0.259946949602122,
"grad_norm": 2.186660089739903,
"learning_rate": 9.588938586876564e-06,
"loss": 0.1485,
"step": 98
},
{
"epoch": 0.2625994694960212,
"grad_norm": 2.2507510523601044,
"learning_rate": 9.580626652019878e-06,
"loss": 0.1795,
"step": 99
},
{
"epoch": 0.26525198938992045,
"grad_norm": 1.9046066926852447,
"learning_rate": 9.5722351962569e-06,
"loss": 0.1676,
"step": 100
},
{
"epoch": 0.26790450928381965,
"grad_norm": 1.8505248856341252,
"learning_rate": 9.563764365265553e-06,
"loss": 0.1539,
"step": 101
},
{
"epoch": 0.27055702917771884,
"grad_norm": 2.0270020661057755,
"learning_rate": 9.555214306101732e-06,
"loss": 0.1702,
"step": 102
},
{
"epoch": 0.27320954907161804,
"grad_norm": 1.6795319874110077,
"learning_rate": 9.546585167196755e-06,
"loss": 0.1503,
"step": 103
},
{
"epoch": 0.27586206896551724,
"grad_norm": 2.341752387924688,
"learning_rate": 9.537877098354787e-06,
"loss": 0.1766,
"step": 104
},
{
"epoch": 0.27851458885941643,
"grad_norm": 2.5362314705747093,
"learning_rate": 9.529090250750234e-06,
"loss": 0.1854,
"step": 105
},
{
"epoch": 0.28116710875331563,
"grad_norm": 1.6421242469941448,
"learning_rate": 9.52022477692513e-06,
"loss": 0.12,
"step": 106
},
{
"epoch": 0.2838196286472148,
"grad_norm": 1.7705962516319114,
"learning_rate": 9.511280830786471e-06,
"loss": 0.15,
"step": 107
},
{
"epoch": 0.2864721485411141,
"grad_norm": 1.8018966800140481,
"learning_rate": 9.502258567603563e-06,
"loss": 0.1446,
"step": 108
},
{
"epoch": 0.2891246684350133,
"grad_norm": 2.154026465895243,
"learning_rate": 9.493158144005314e-06,
"loss": 0.175,
"step": 109
},
{
"epoch": 0.2917771883289125,
"grad_norm": 2.091201257414692,
"learning_rate": 9.483979717977513e-06,
"loss": 0.1533,
"step": 110
},
{
"epoch": 0.29442970822281167,
"grad_norm": 2.049226329317327,
"learning_rate": 9.474723448860096e-06,
"loss": 0.1582,
"step": 111
},
{
"epoch": 0.29708222811671087,
"grad_norm": 1.6937916325037254,
"learning_rate": 9.46538949734438e-06,
"loss": 0.1319,
"step": 112
},
{
"epoch": 0.29973474801061006,
"grad_norm": 2.2804043446780193,
"learning_rate": 9.455978025470257e-06,
"loss": 0.1744,
"step": 113
},
{
"epoch": 0.30238726790450926,
"grad_norm": 2.427110301571042,
"learning_rate": 9.44648919662341e-06,
"loss": 0.1972,
"step": 114
},
{
"epoch": 0.3050397877984085,
"grad_norm": 2.4719255725338307,
"learning_rate": 9.436923175532442e-06,
"loss": 0.1863,
"step": 115
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.9198368860593722,
"learning_rate": 9.427280128266049e-06,
"loss": 0.1428,
"step": 116
},
{
"epoch": 0.3103448275862069,
"grad_norm": 1.9531949152663541,
"learning_rate": 9.417560222230115e-06,
"loss": 0.1723,
"step": 117
},
{
"epoch": 0.3129973474801061,
"grad_norm": 2.1363324241184456,
"learning_rate": 9.407763626164812e-06,
"loss": 0.1758,
"step": 118
},
{
"epoch": 0.3156498673740053,
"grad_norm": 1.8882375069693482,
"learning_rate": 9.397890510141674e-06,
"loss": 0.1248,
"step": 119
},
{
"epoch": 0.3183023872679045,
"grad_norm": 2.1194646896664584,
"learning_rate": 9.387941045560641e-06,
"loss": 0.1589,
"step": 120
},
{
"epoch": 0.3209549071618037,
"grad_norm": 1.5127624036717937,
"learning_rate": 9.377915405147085e-06,
"loss": 0.1446,
"step": 121
},
{
"epoch": 0.32360742705570295,
"grad_norm": 2.158233386717242,
"learning_rate": 9.367813762948809e-06,
"loss": 0.1584,
"step": 122
},
{
"epoch": 0.32625994694960214,
"grad_norm": 1.941105678166514,
"learning_rate": 9.357636294333031e-06,
"loss": 0.1272,
"step": 123
},
{
"epoch": 0.32891246684350134,
"grad_norm": 1.8205147183171333,
"learning_rate": 9.347383175983333e-06,
"loss": 0.1545,
"step": 124
},
{
"epoch": 0.33156498673740054,
"grad_norm": 1.6927779015644462,
"learning_rate": 9.337054585896596e-06,
"loss": 0.1202,
"step": 125
},
{
"epoch": 0.33421750663129973,
"grad_norm": 1.9276160632994672,
"learning_rate": 9.326650703379913e-06,
"loss": 0.1432,
"step": 126
},
{
"epoch": 0.33687002652519893,
"grad_norm": 1.9253639897291404,
"learning_rate": 9.316171709047475e-06,
"loss": 0.148,
"step": 127
},
{
"epoch": 0.3395225464190981,
"grad_norm": 1.8807065660047437,
"learning_rate": 9.305617784817426e-06,
"loss": 0.1401,
"step": 128
},
{
"epoch": 0.3421750663129973,
"grad_norm": 1.95250475420529,
"learning_rate": 9.294989113908726e-06,
"loss": 0.1548,
"step": 129
},
{
"epoch": 0.3448275862068966,
"grad_norm": 2.3715397805842255,
"learning_rate": 9.284285880837947e-06,
"loss": 0.1756,
"step": 130
},
{
"epoch": 0.34748010610079577,
"grad_norm": 1.93914417403847,
"learning_rate": 9.273508271416082e-06,
"loss": 0.1399,
"step": 131
},
{
"epoch": 0.35013262599469497,
"grad_norm": 1.9560254067556455,
"learning_rate": 9.262656472745324e-06,
"loss": 0.1389,
"step": 132
},
{
"epoch": 0.35278514588859416,
"grad_norm": 1.9150433290760882,
"learning_rate": 9.251730673215802e-06,
"loss": 0.1407,
"step": 133
},
{
"epoch": 0.35543766578249336,
"grad_norm": 1.8135844757186297,
"learning_rate": 9.240731062502323e-06,
"loss": 0.1306,
"step": 134
},
{
"epoch": 0.35809018567639256,
"grad_norm": 2.162238929625922,
"learning_rate": 9.229657831561082e-06,
"loss": 0.1496,
"step": 135
},
{
"epoch": 0.36074270557029176,
"grad_norm": 2.055713876885785,
"learning_rate": 9.218511172626333e-06,
"loss": 0.1753,
"step": 136
},
{
"epoch": 0.363395225464191,
"grad_norm": 1.6946475181680336,
"learning_rate": 9.207291279207058e-06,
"loss": 0.1335,
"step": 137
},
{
"epoch": 0.3660477453580902,
"grad_norm": 2.101107547619466,
"learning_rate": 9.195998346083621e-06,
"loss": 0.1493,
"step": 138
},
{
"epoch": 0.3687002652519894,
"grad_norm": 1.8055607972774566,
"learning_rate": 9.184632569304365e-06,
"loss": 0.1262,
"step": 139
},
{
"epoch": 0.3713527851458886,
"grad_norm": 1.690201779069974,
"learning_rate": 9.173194146182219e-06,
"loss": 0.1499,
"step": 140
},
{
"epoch": 0.3740053050397878,
"grad_norm": 2.1397994585808955,
"learning_rate": 9.161683275291275e-06,
"loss": 0.157,
"step": 141
},
{
"epoch": 0.376657824933687,
"grad_norm": 1.9870860707292484,
"learning_rate": 9.150100156463337e-06,
"loss": 0.1529,
"step": 142
},
{
"epoch": 0.3793103448275862,
"grad_norm": 2.207156912182982,
"learning_rate": 9.138444990784455e-06,
"loss": 0.1859,
"step": 143
},
{
"epoch": 0.3819628647214854,
"grad_norm": 1.9713520675081648,
"learning_rate": 9.126717980591422e-06,
"loss": 0.1459,
"step": 144
},
{
"epoch": 0.38461538461538464,
"grad_norm": 1.7065232206005256,
"learning_rate": 9.114919329468283e-06,
"loss": 0.1461,
"step": 145
},
{
"epoch": 0.38726790450928383,
"grad_norm": 2.07601306564582,
"learning_rate": 9.103049242242781e-06,
"loss": 0.1785,
"step": 146
},
{
"epoch": 0.38992042440318303,
"grad_norm": 1.776835787519842,
"learning_rate": 9.091107924982814e-06,
"loss": 0.1525,
"step": 147
},
{
"epoch": 0.3925729442970822,
"grad_norm": 1.8663700533344434,
"learning_rate": 9.079095584992848e-06,
"loss": 0.1409,
"step": 148
},
{
"epoch": 0.3952254641909814,
"grad_norm": 2.2784042310709407,
"learning_rate": 9.067012430810326e-06,
"loss": 0.1785,
"step": 149
},
{
"epoch": 0.3978779840848806,
"grad_norm": 1.7721799125937119,
"learning_rate": 9.05485867220204e-06,
"loss": 0.1346,
"step": 150
},
{
"epoch": 0.4005305039787798,
"grad_norm": 2.13950597469927,
"learning_rate": 9.0426345201605e-06,
"loss": 0.1652,
"step": 151
},
{
"epoch": 0.40318302387267907,
"grad_norm": 1.8810097271179547,
"learning_rate": 9.03034018690026e-06,
"loss": 0.1449,
"step": 152
},
{
"epoch": 0.40583554376657827,
"grad_norm": 2.4159078987026863,
"learning_rate": 9.01797588585424e-06,
"loss": 0.1683,
"step": 153
},
{
"epoch": 0.40848806366047746,
"grad_norm": 2.0980888650490668,
"learning_rate": 9.00554183167002e-06,
"loss": 0.1424,
"step": 154
},
{
"epoch": 0.41114058355437666,
"grad_norm": 2.062288135177168,
"learning_rate": 8.993038240206114e-06,
"loss": 0.1392,
"step": 155
},
{
"epoch": 0.41379310344827586,
"grad_norm": 1.9204203170622036,
"learning_rate": 8.98046532852822e-06,
"loss": 0.1418,
"step": 156
},
{
"epoch": 0.41644562334217505,
"grad_norm": 2.08299140855318,
"learning_rate": 8.967823314905452e-06,
"loss": 0.1486,
"step": 157
},
{
"epoch": 0.41909814323607425,
"grad_norm": 1.844754808254566,
"learning_rate": 8.95511241880656e-06,
"loss": 0.1263,
"step": 158
},
{
"epoch": 0.4217506631299735,
"grad_norm": 1.902160167609469,
"learning_rate": 8.942332860896102e-06,
"loss": 0.1435,
"step": 159
},
{
"epoch": 0.4244031830238727,
"grad_norm": 1.4047023346474345,
"learning_rate": 8.929484863030631e-06,
"loss": 0.1204,
"step": 160
},
{
"epoch": 0.4270557029177719,
"grad_norm": 1.5383340677271746,
"learning_rate": 8.91656864825483e-06,
"loss": 0.1307,
"step": 161
},
{
"epoch": 0.4297082228116711,
"grad_norm": 1.8646076049063893,
"learning_rate": 8.903584440797652e-06,
"loss": 0.1403,
"step": 162
},
{
"epoch": 0.4323607427055703,
"grad_norm": 1.6149573488861944,
"learning_rate": 8.890532466068417e-06,
"loss": 0.1381,
"step": 163
},
{
"epoch": 0.4350132625994695,
"grad_norm": 2.1580924076315715,
"learning_rate": 8.877412950652907e-06,
"loss": 0.1577,
"step": 164
},
{
"epoch": 0.4376657824933687,
"grad_norm": 2.2128873070419375,
"learning_rate": 8.864226122309423e-06,
"loss": 0.1526,
"step": 165
},
{
"epoch": 0.4403183023872679,
"grad_norm": 2.174869482049831,
"learning_rate": 8.850972209964837e-06,
"loss": 0.1473,
"step": 166
},
{
"epoch": 0.44297082228116713,
"grad_norm": 1.6128773258978748,
"learning_rate": 8.837651443710623e-06,
"loss": 0.1217,
"step": 167
},
{
"epoch": 0.44562334217506633,
"grad_norm": 1.8324959002361592,
"learning_rate": 8.824264054798852e-06,
"loss": 0.1509,
"step": 168
},
{
"epoch": 0.4482758620689655,
"grad_norm": 1.708365841100358,
"learning_rate": 8.810810275638183e-06,
"loss": 0.124,
"step": 169
},
{
"epoch": 0.4509283819628647,
"grad_norm": 1.7502467085637508,
"learning_rate": 8.797290339789827e-06,
"loss": 0.1299,
"step": 170
},
{
"epoch": 0.4535809018567639,
"grad_norm": 2.060960880635761,
"learning_rate": 8.783704481963498e-06,
"loss": 0.1428,
"step": 171
},
{
"epoch": 0.4562334217506631,
"grad_norm": 1.8464905297693734,
"learning_rate": 8.770052938013323e-06,
"loss": 0.1502,
"step": 172
},
{
"epoch": 0.4588859416445623,
"grad_norm": 1.8087559826720645,
"learning_rate": 8.756335944933768e-06,
"loss": 0.1351,
"step": 173
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.7162671809778718,
"learning_rate": 8.742553740855507e-06,
"loss": 0.1244,
"step": 174
},
{
"epoch": 0.46419098143236076,
"grad_norm": 2.25916831884875,
"learning_rate": 8.728706565041296e-06,
"loss": 0.176,
"step": 175
},
{
"epoch": 0.46684350132625996,
"grad_norm": 1.9927732799007924,
"learning_rate": 8.714794657881818e-06,
"loss": 0.1329,
"step": 176
},
{
"epoch": 0.46949602122015915,
"grad_norm": 1.9150506644030154,
"learning_rate": 8.700818260891512e-06,
"loss": 0.1455,
"step": 177
},
{
"epoch": 0.47214854111405835,
"grad_norm": 1.918021311795205,
"learning_rate": 8.686777616704375e-06,
"loss": 0.14,
"step": 178
},
{
"epoch": 0.47480106100795755,
"grad_norm": 2.0258194425535203,
"learning_rate": 8.67267296906975e-06,
"loss": 0.1676,
"step": 179
},
{
"epoch": 0.47745358090185674,
"grad_norm": 1.9171122906217821,
"learning_rate": 8.658504562848104e-06,
"loss": 0.1369,
"step": 180
},
{
"epoch": 0.48010610079575594,
"grad_norm": 2.218531055729975,
"learning_rate": 8.644272644006764e-06,
"loss": 0.152,
"step": 181
},
{
"epoch": 0.4827586206896552,
"grad_norm": 2.0038883017886375,
"learning_rate": 8.629977459615655e-06,
"loss": 0.1426,
"step": 182
},
{
"epoch": 0.4854111405835544,
"grad_norm": 1.7175445641951717,
"learning_rate": 8.61561925784301e-06,
"loss": 0.1249,
"step": 183
},
{
"epoch": 0.4880636604774536,
"grad_norm": 1.815071151822735,
"learning_rate": 8.601198287951059e-06,
"loss": 0.1301,
"step": 184
},
{
"epoch": 0.4907161803713528,
"grad_norm": 2.1561780015246463,
"learning_rate": 8.586714800291704e-06,
"loss": 0.1397,
"step": 185
},
{
"epoch": 0.493368700265252,
"grad_norm": 1.9294815320965064,
"learning_rate": 8.572169046302174e-06,
"loss": 0.141,
"step": 186
},
{
"epoch": 0.4960212201591512,
"grad_norm": 1.659333790717064,
"learning_rate": 8.557561278500656e-06,
"loss": 0.1214,
"step": 187
},
{
"epoch": 0.4986737400530504,
"grad_norm": 1.6753928986149604,
"learning_rate": 8.542891750481913e-06,
"loss": 0.1238,
"step": 188
},
{
"epoch": 0.5013262599469496,
"grad_norm": 1.909985436763465,
"learning_rate": 8.528160716912882e-06,
"loss": 0.1189,
"step": 189
},
{
"epoch": 0.5039787798408488,
"grad_norm": 1.956919058433968,
"learning_rate": 8.513368433528255e-06,
"loss": 0.1328,
"step": 190
},
{
"epoch": 0.506631299734748,
"grad_norm": 1.7825907922117281,
"learning_rate": 8.498515157126038e-06,
"loss": 0.1362,
"step": 191
},
{
"epoch": 0.5092838196286472,
"grad_norm": 2.1745948430153232,
"learning_rate": 8.483601145563087e-06,
"loss": 0.1403,
"step": 192
},
{
"epoch": 0.5119363395225465,
"grad_norm": 1.5767065409411627,
"learning_rate": 8.46862665775064e-06,
"loss": 0.121,
"step": 193
},
{
"epoch": 0.5145888594164456,
"grad_norm": 2.1206690113650737,
"learning_rate": 8.45359195364982e-06,
"loss": 0.1435,
"step": 194
},
{
"epoch": 0.5172413793103449,
"grad_norm": 2.0215180716784453,
"learning_rate": 8.438497294267117e-06,
"loss": 0.1362,
"step": 195
},
{
"epoch": 0.519893899204244,
"grad_norm": 2.2299825948556697,
"learning_rate": 8.423342941649866e-06,
"loss": 0.1337,
"step": 196
},
{
"epoch": 0.5225464190981433,
"grad_norm": 2.262534468463096,
"learning_rate": 8.40812915888169e-06,
"loss": 0.1538,
"step": 197
},
{
"epoch": 0.5251989389920424,
"grad_norm": 2.306393950737034,
"learning_rate": 8.392856210077932e-06,
"loss": 0.1696,
"step": 198
},
{
"epoch": 0.5278514588859416,
"grad_norm": 1.9803460856435193,
"learning_rate": 8.37752436038108e-06,
"loss": 0.1307,
"step": 199
},
{
"epoch": 0.5305039787798409,
"grad_norm": 1.7912221503357104,
"learning_rate": 8.36213387595615e-06,
"loss": 0.1292,
"step": 200
},
{
"epoch": 0.5305039787798409,
"eval_loss": 0.14994381368160248,
"eval_runtime": 1.3023,
"eval_samples_per_second": 23.804,
"eval_steps_per_second": 6.143,
"step": 200
},
{
"epoch": 0.53315649867374,
"grad_norm": 1.6059764600949016,
"learning_rate": 8.34668502398608e-06,
"loss": 0.106,
"step": 201
},
{
"epoch": 0.5358090185676393,
"grad_norm": 1.706895090532193,
"learning_rate": 8.331178072667079e-06,
"loss": 0.1261,
"step": 202
},
{
"epoch": 0.5384615384615384,
"grad_norm": 1.7309972720553148,
"learning_rate": 8.315613291203977e-06,
"loss": 0.1177,
"step": 203
},
{
"epoch": 0.5411140583554377,
"grad_norm": 2.119393668480105,
"learning_rate": 8.299990949805551e-06,
"loss": 0.1512,
"step": 204
},
{
"epoch": 0.5437665782493368,
"grad_norm": 2.1446534939795194,
"learning_rate": 8.28431131967984e-06,
"loss": 0.1451,
"step": 205
},
{
"epoch": 0.5464190981432361,
"grad_norm": 2.3040675508103057,
"learning_rate": 8.268574673029415e-06,
"loss": 0.1528,
"step": 206
},
{
"epoch": 0.5490716180371353,
"grad_norm": 1.828325753916526,
"learning_rate": 8.252781283046688e-06,
"loss": 0.1346,
"step": 207
},
{
"epoch": 0.5517241379310345,
"grad_norm": 1.8710705926707558,
"learning_rate": 8.23693142390914e-06,
"loss": 0.1463,
"step": 208
},
{
"epoch": 0.5543766578249337,
"grad_norm": 1.8384322195422618,
"learning_rate": 8.22102537077457e-06,
"loss": 0.1383,
"step": 209
},
{
"epoch": 0.5570291777188329,
"grad_norm": 1.9467404806792865,
"learning_rate": 8.205063399776326e-06,
"loss": 0.1415,
"step": 210
},
{
"epoch": 0.5596816976127321,
"grad_norm": 1.7702552779136205,
"learning_rate": 8.189045788018502e-06,
"loss": 0.141,
"step": 211
},
{
"epoch": 0.5623342175066313,
"grad_norm": 2.0515294131254525,
"learning_rate": 8.172972813571132e-06,
"loss": 0.1527,
"step": 212
},
{
"epoch": 0.5649867374005305,
"grad_norm": 2.0940031140217865,
"learning_rate": 8.156844755465357e-06,
"loss": 0.15,
"step": 213
},
{
"epoch": 0.5676392572944297,
"grad_norm": 1.9660531752973018,
"learning_rate": 8.14066189368859e-06,
"loss": 0.1408,
"step": 214
},
{
"epoch": 0.5702917771883289,
"grad_norm": 2.1910231506350186,
"learning_rate": 8.124424509179648e-06,
"loss": 0.1463,
"step": 215
},
{
"epoch": 0.5729442970822282,
"grad_norm": 1.5430710287754699,
"learning_rate": 8.108132883823878e-06,
"loss": 0.1097,
"step": 216
},
{
"epoch": 0.5755968169761273,
"grad_norm": 1.8265329192409332,
"learning_rate": 8.091787300448264e-06,
"loss": 0.1395,
"step": 217
},
{
"epoch": 0.5782493368700266,
"grad_norm": 1.8822040988497644,
"learning_rate": 8.07538804281651e-06,
"loss": 0.1298,
"step": 218
},
{
"epoch": 0.5809018567639257,
"grad_norm": 2.0002279443157445,
"learning_rate": 8.058935395624128e-06,
"loss": 0.1421,
"step": 219
},
{
"epoch": 0.583554376657825,
"grad_norm": 2.1521750298282964,
"learning_rate": 8.042429644493479e-06,
"loss": 0.1379,
"step": 220
},
{
"epoch": 0.5862068965517241,
"grad_norm": 1.5640533332285234,
"learning_rate": 8.025871075968828e-06,
"loss": 0.1228,
"step": 221
},
{
"epoch": 0.5888594164456233,
"grad_norm": 2.0211033096735864,
"learning_rate": 8.00925997751136e-06,
"loss": 0.1402,
"step": 222
},
{
"epoch": 0.5915119363395226,
"grad_norm": 1.7401492380650114,
"learning_rate": 7.992596637494199e-06,
"loss": 0.1223,
"step": 223
},
{
"epoch": 0.5941644562334217,
"grad_norm": 1.5482263543431414,
"learning_rate": 7.975881345197394e-06,
"loss": 0.1278,
"step": 224
},
{
"epoch": 0.596816976127321,
"grad_norm": 2.012786559192079,
"learning_rate": 7.959114390802894e-06,
"loss": 0.1322,
"step": 225
},
{
"epoch": 0.5994694960212201,
"grad_norm": 1.9101656055747547,
"learning_rate": 7.942296065389528e-06,
"loss": 0.1411,
"step": 226
},
{
"epoch": 0.6021220159151194,
"grad_norm": 1.8476462070875777,
"learning_rate": 7.925426660927926e-06,
"loss": 0.1325,
"step": 227
},
{
"epoch": 0.6047745358090185,
"grad_norm": 1.9254949919992999,
"learning_rate": 7.908506470275474e-06,
"loss": 0.1226,
"step": 228
},
{
"epoch": 0.6074270557029178,
"grad_norm": 2.0758812690239052,
"learning_rate": 7.891535787171216e-06,
"loss": 0.1263,
"step": 229
},
{
"epoch": 0.610079575596817,
"grad_norm": 1.9368560259395753,
"learning_rate": 7.874514906230757e-06,
"loss": 0.1308,
"step": 230
},
{
"epoch": 0.6127320954907162,
"grad_norm": 1.7832950240034287,
"learning_rate": 7.857444122941155e-06,
"loss": 0.1229,
"step": 231
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.9284054045996681,
"learning_rate": 7.84032373365578e-06,
"loss": 0.1315,
"step": 232
},
{
"epoch": 0.6180371352785146,
"grad_norm": 1.8237229630031635,
"learning_rate": 7.82315403558918e-06,
"loss": 0.1263,
"step": 233
},
{
"epoch": 0.6206896551724138,
"grad_norm": 2.3980431877447552,
"learning_rate": 7.805935326811913e-06,
"loss": 0.1594,
"step": 234
},
{
"epoch": 0.623342175066313,
"grad_norm": 1.908328991517894,
"learning_rate": 7.78866790624538e-06,
"loss": 0.1221,
"step": 235
},
{
"epoch": 0.6259946949602122,
"grad_norm": 1.9070140557762951,
"learning_rate": 7.771352073656628e-06,
"loss": 0.1099,
"step": 236
},
{
"epoch": 0.6286472148541115,
"grad_norm": 1.9086850445705497,
"learning_rate": 7.753988129653152e-06,
"loss": 0.1217,
"step": 237
},
{
"epoch": 0.6312997347480106,
"grad_norm": 2.298658640053533,
"learning_rate": 7.736576375677676e-06,
"loss": 0.1534,
"step": 238
},
{
"epoch": 0.6339522546419099,
"grad_norm": 2.012272147602231,
"learning_rate": 7.719117114002912e-06,
"loss": 0.1367,
"step": 239
},
{
"epoch": 0.636604774535809,
"grad_norm": 2.054870475879218,
"learning_rate": 7.701610647726323e-06,
"loss": 0.1528,
"step": 240
},
{
"epoch": 0.6392572944297082,
"grad_norm": 1.931987531988978,
"learning_rate": 7.684057280764855e-06,
"loss": 0.1359,
"step": 241
},
{
"epoch": 0.6419098143236074,
"grad_norm": 1.6653702816540303,
"learning_rate": 7.666457317849663e-06,
"loss": 0.1271,
"step": 242
},
{
"epoch": 0.6445623342175066,
"grad_norm": 1.8059464643897476,
"learning_rate": 7.648811064520821e-06,
"loss": 0.1355,
"step": 243
},
{
"epoch": 0.6472148541114059,
"grad_norm": 1.8078224433971741,
"learning_rate": 7.631118827122013e-06,
"loss": 0.1202,
"step": 244
},
{
"epoch": 0.649867374005305,
"grad_norm": 1.948459636065191,
"learning_rate": 7.613380912795225e-06,
"loss": 0.1429,
"step": 245
},
{
"epoch": 0.6525198938992043,
"grad_norm": 2.2076773590525316,
"learning_rate": 7.595597629475402e-06,
"loss": 0.1516,
"step": 246
},
{
"epoch": 0.6551724137931034,
"grad_norm": 1.8887891109191988,
"learning_rate": 7.57776928588511e-06,
"loss": 0.1334,
"step": 247
},
{
"epoch": 0.6578249336870027,
"grad_norm": 2.3599876519640297,
"learning_rate": 7.559896191529169e-06,
"loss": 0.1523,
"step": 248
},
{
"epoch": 0.6604774535809018,
"grad_norm": 1.6073366184090068,
"learning_rate": 7.54197865668929e-06,
"loss": 0.1094,
"step": 249
},
{
"epoch": 0.6631299734748011,
"grad_norm": 1.6066208266739346,
"learning_rate": 7.524016992418676e-06,
"loss": 0.1133,
"step": 250
},
{
"epoch": 0.6657824933687002,
"grad_norm": 2.402024874117795,
"learning_rate": 7.506011510536635e-06,
"loss": 0.1683,
"step": 251
},
{
"epoch": 0.6684350132625995,
"grad_norm": 2.4880666218264955,
"learning_rate": 7.487962523623159e-06,
"loss": 0.1604,
"step": 252
},
{
"epoch": 0.6710875331564987,
"grad_norm": 2.1970208087255076,
"learning_rate": 7.469870345013495e-06,
"loss": 0.1261,
"step": 253
},
{
"epoch": 0.6737400530503979,
"grad_norm": 1.949833364046787,
"learning_rate": 7.451735288792716e-06,
"loss": 0.1283,
"step": 254
},
{
"epoch": 0.6763925729442971,
"grad_norm": 2.043764786717034,
"learning_rate": 7.4335576697902546e-06,
"loss": 0.1217,
"step": 255
},
{
"epoch": 0.6790450928381963,
"grad_norm": 1.7800715009896282,
"learning_rate": 7.415337803574449e-06,
"loss": 0.1134,
"step": 256
},
{
"epoch": 0.6816976127320955,
"grad_norm": 2.036030495974864,
"learning_rate": 7.3970760064470634e-06,
"loss": 0.1346,
"step": 257
},
{
"epoch": 0.6843501326259946,
"grad_norm": 1.7953348831678584,
"learning_rate": 7.378772595437785e-06,
"loss": 0.1068,
"step": 258
},
{
"epoch": 0.6870026525198939,
"grad_norm": 2.151685228258153,
"learning_rate": 7.360427888298737e-06,
"loss": 0.1374,
"step": 259
},
{
"epoch": 0.6896551724137931,
"grad_norm": 1.6830864009549993,
"learning_rate": 7.342042203498952e-06,
"loss": 0.0968,
"step": 260
},
{
"epoch": 0.6923076923076923,
"grad_norm": 2.0738297934683545,
"learning_rate": 7.323615860218844e-06,
"loss": 0.1274,
"step": 261
},
{
"epoch": 0.6949602122015915,
"grad_norm": 2.416943322684093,
"learning_rate": 7.3051491783446705e-06,
"loss": 0.1395,
"step": 262
},
{
"epoch": 0.6976127320954907,
"grad_norm": 1.8473933982642847,
"learning_rate": 7.2866424784629806e-06,
"loss": 0.1189,
"step": 263
},
{
"epoch": 0.7002652519893899,
"grad_norm": 1.990601440702459,
"learning_rate": 7.26809608185504e-06,
"loss": 0.1202,
"step": 264
},
{
"epoch": 0.7029177718832891,
"grad_norm": 1.904814403012209,
"learning_rate": 7.249510310491268e-06,
"loss": 0.1208,
"step": 265
},
{
"epoch": 0.7055702917771883,
"grad_norm": 1.8589441949655492,
"learning_rate": 7.230885487025635e-06,
"loss": 0.1181,
"step": 266
},
{
"epoch": 0.7082228116710876,
"grad_norm": 1.6795096831283485,
"learning_rate": 7.212221934790067e-06,
"loss": 0.1192,
"step": 267
},
{
"epoch": 0.7108753315649867,
"grad_norm": 1.723252840070737,
"learning_rate": 7.193519977788834e-06,
"loss": 0.1236,
"step": 268
},
{
"epoch": 0.713527851458886,
"grad_norm": 2.418801413723791,
"learning_rate": 7.174779940692922e-06,
"loss": 0.1544,
"step": 269
},
{
"epoch": 0.7161803713527851,
"grad_norm": 1.770338034548602,
"learning_rate": 7.1560021488343956e-06,
"loss": 0.1286,
"step": 270
},
{
"epoch": 0.7188328912466844,
"grad_norm": 1.6417602409918115,
"learning_rate": 7.1371869282007545e-06,
"loss": 0.1142,
"step": 271
},
{
"epoch": 0.7214854111405835,
"grad_norm": 2.2143362328692207,
"learning_rate": 7.118334605429272e-06,
"loss": 0.1542,
"step": 272
},
{
"epoch": 0.7241379310344828,
"grad_norm": 2.316061054037307,
"learning_rate": 7.099445507801324e-06,
"loss": 0.1376,
"step": 273
},
{
"epoch": 0.726790450928382,
"grad_norm": 2.184752996333078,
"learning_rate": 7.080519963236706e-06,
"loss": 0.1268,
"step": 274
},
{
"epoch": 0.7294429708222812,
"grad_norm": 1.840884510423046,
"learning_rate": 7.0615583002879465e-06,
"loss": 0.1346,
"step": 275
},
{
"epoch": 0.7320954907161804,
"grad_norm": 1.811306716138802,
"learning_rate": 7.042560848134592e-06,
"loss": 0.1149,
"step": 276
},
{
"epoch": 0.7347480106100795,
"grad_norm": 2.0428053121611476,
"learning_rate": 7.023527936577507e-06,
"loss": 0.1407,
"step": 277
},
{
"epoch": 0.7374005305039788,
"grad_norm": 1.6116834909252478,
"learning_rate": 7.004459896033137e-06,
"loss": 0.1193,
"step": 278
},
{
"epoch": 0.7400530503978779,
"grad_norm": 2.1213770604701976,
"learning_rate": 6.985357057527774e-06,
"loss": 0.1434,
"step": 279
},
{
"epoch": 0.7427055702917772,
"grad_norm": 1.91726573137005,
"learning_rate": 6.966219752691814e-06,
"loss": 0.125,
"step": 280
},
{
"epoch": 0.7453580901856764,
"grad_norm": 1.67695736254988,
"learning_rate": 6.947048313753998e-06,
"loss": 0.1338,
"step": 281
},
{
"epoch": 0.7480106100795756,
"grad_norm": 2.489354393443295,
"learning_rate": 6.927843073535645e-06,
"loss": 0.1473,
"step": 282
},
{
"epoch": 0.7506631299734748,
"grad_norm": 1.7295592511699343,
"learning_rate": 6.9086043654448734e-06,
"loss": 0.1256,
"step": 283
},
{
"epoch": 0.753315649867374,
"grad_norm": 1.9395856232269375,
"learning_rate": 6.889332523470808e-06,
"loss": 0.1195,
"step": 284
},
{
"epoch": 0.7559681697612732,
"grad_norm": 2.066706586377597,
"learning_rate": 6.870027882177791e-06,
"loss": 0.1358,
"step": 285
},
{
"epoch": 0.7586206896551724,
"grad_norm": 2.0770046146432395,
"learning_rate": 6.850690776699574e-06,
"loss": 0.1429,
"step": 286
},
{
"epoch": 0.7612732095490716,
"grad_norm": 1.881483786265112,
"learning_rate": 6.831321542733482e-06,
"loss": 0.1266,
"step": 287
},
{
"epoch": 0.7639257294429708,
"grad_norm": 1.695062472576317,
"learning_rate": 6.811920516534616e-06,
"loss": 0.1202,
"step": 288
},
{
"epoch": 0.76657824933687,
"grad_norm": 2.1054959851607506,
"learning_rate": 6.7924880349099855e-06,
"loss": 0.137,
"step": 289
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.559652722034954,
"learning_rate": 6.773024435212678e-06,
"loss": 0.0985,
"step": 290
},
{
"epoch": 0.7718832891246684,
"grad_norm": 2.2208427457963342,
"learning_rate": 6.753530055336006e-06,
"loss": 0.1466,
"step": 291
},
{
"epoch": 0.7745358090185677,
"grad_norm": 2.1893216240483753,
"learning_rate": 6.734005233707624e-06,
"loss": 0.1537,
"step": 292
},
{
"epoch": 0.7771883289124668,
"grad_norm": 2.3583536185079703,
"learning_rate": 6.714450309283671e-06,
"loss": 0.1384,
"step": 293
},
{
"epoch": 0.7798408488063661,
"grad_norm": 2.116909607447533,
"learning_rate": 6.694865621542873e-06,
"loss": 0.1525,
"step": 294
},
{
"epoch": 0.7824933687002652,
"grad_norm": 1.553790399379545,
"learning_rate": 6.675251510480662e-06,
"loss": 0.1074,
"step": 295
},
{
"epoch": 0.7851458885941645,
"grad_norm": 1.702247544650182,
"learning_rate": 6.655608316603257e-06,
"loss": 0.1037,
"step": 296
},
{
"epoch": 0.7877984084880637,
"grad_norm": 1.4027176458517787,
"learning_rate": 6.635936380921774e-06,
"loss": 0.0842,
"step": 297
},
{
"epoch": 0.7904509283819628,
"grad_norm": 1.8615502107309188,
"learning_rate": 6.616236044946283e-06,
"loss": 0.1335,
"step": 298
},
{
"epoch": 0.7931034482758621,
"grad_norm": 1.6648222471112546,
"learning_rate": 6.5965076506799e-06,
"loss": 0.1181,
"step": 299
},
{
"epoch": 0.7957559681697612,
"grad_norm": 1.9881029740135223,
"learning_rate": 6.576751540612835e-06,
"loss": 0.1112,
"step": 300
},
{
"epoch": 0.7984084880636605,
"grad_norm": 2.2013679784949667,
"learning_rate": 6.556968057716457e-06,
"loss": 0.1237,
"step": 301
},
{
"epoch": 0.8010610079575596,
"grad_norm": 2.1049495467849697,
"learning_rate": 6.537157545437326e-06,
"loss": 0.14,
"step": 302
},
{
"epoch": 0.8037135278514589,
"grad_norm": 1.9391684954075283,
"learning_rate": 6.517320347691245e-06,
"loss": 0.1432,
"step": 303
},
{
"epoch": 0.8063660477453581,
"grad_norm": 1.897903152535271,
"learning_rate": 6.497456808857286e-06,
"loss": 0.1242,
"step": 304
},
{
"epoch": 0.8090185676392573,
"grad_norm": 2.400907859514575,
"learning_rate": 6.477567273771807e-06,
"loss": 0.1363,
"step": 305
},
{
"epoch": 0.8116710875331565,
"grad_norm": 2.0519874786243095,
"learning_rate": 6.4576520877224644e-06,
"loss": 0.149,
"step": 306
},
{
"epoch": 0.8143236074270557,
"grad_norm": 1.613637672772973,
"learning_rate": 6.437711596442228e-06,
"loss": 0.1078,
"step": 307
},
{
"epoch": 0.8169761273209549,
"grad_norm": 2.0653398136129675,
"learning_rate": 6.4177461461033675e-06,
"loss": 0.1416,
"step": 308
},
{
"epoch": 0.8196286472148541,
"grad_norm": 2.2371819869165073,
"learning_rate": 6.397756083311454e-06,
"loss": 0.1395,
"step": 309
},
{
"epoch": 0.8222811671087533,
"grad_norm": 1.7682314840535782,
"learning_rate": 6.377741755099334e-06,
"loss": 0.1248,
"step": 310
},
{
"epoch": 0.8249336870026526,
"grad_norm": 1.7180875056368419,
"learning_rate": 6.357703508921109e-06,
"loss": 0.1146,
"step": 311
},
{
"epoch": 0.8275862068965517,
"grad_norm": 1.837539966421522,
"learning_rate": 6.337641692646106e-06,
"loss": 0.1063,
"step": 312
},
{
"epoch": 0.830238726790451,
"grad_norm": 1.9061111225956069,
"learning_rate": 6.317556654552825e-06,
"loss": 0.1261,
"step": 313
},
{
"epoch": 0.8328912466843501,
"grad_norm": 1.7785361235235786,
"learning_rate": 6.297448743322918e-06,
"loss": 0.1213,
"step": 314
},
{
"epoch": 0.8355437665782494,
"grad_norm": 2.285655601737043,
"learning_rate": 6.277318308035109e-06,
"loss": 0.136,
"step": 315
},
{
"epoch": 0.8381962864721485,
"grad_norm": 2.292822876847262,
"learning_rate": 6.257165698159149e-06,
"loss": 0.1437,
"step": 316
},
{
"epoch": 0.8408488063660478,
"grad_norm": 1.925635900709222,
"learning_rate": 6.236991263549748e-06,
"loss": 0.1244,
"step": 317
},
{
"epoch": 0.843501326259947,
"grad_norm": 1.8764634867307084,
"learning_rate": 6.2167953544404955e-06,
"loss": 0.1269,
"step": 318
},
{
"epoch": 0.8461538461538461,
"grad_norm": 1.9227813803757845,
"learning_rate": 6.1965783214377895e-06,
"loss": 0.1052,
"step": 319
},
{
"epoch": 0.8488063660477454,
"grad_norm": 1.6243389871471685,
"learning_rate": 6.176340515514738e-06,
"loss": 0.0946,
"step": 320
},
{
"epoch": 0.8514588859416445,
"grad_norm": 1.9721230457586518,
"learning_rate": 6.156082288005078e-06,
"loss": 0.1242,
"step": 321
},
{
"epoch": 0.8541114058355438,
"grad_norm": 1.939736016873224,
"learning_rate": 6.135803990597066e-06,
"loss": 0.1107,
"step": 322
},
{
"epoch": 0.8567639257294429,
"grad_norm": 1.910794512321765,
"learning_rate": 6.115505975327382e-06,
"loss": 0.1157,
"step": 323
},
{
"epoch": 0.8594164456233422,
"grad_norm": 2.3310014277848006,
"learning_rate": 6.095188594575008e-06,
"loss": 0.1473,
"step": 324
},
{
"epoch": 0.8620689655172413,
"grad_norm": 2.021424856899511,
"learning_rate": 6.074852201055121e-06,
"loss": 0.1412,
"step": 325
},
{
"epoch": 0.8647214854111406,
"grad_norm": 1.8755075770524758,
"learning_rate": 6.054497147812962e-06,
"loss": 0.1374,
"step": 326
},
{
"epoch": 0.8673740053050398,
"grad_norm": 2.0283809132037947,
"learning_rate": 6.034123788217712e-06,
"loss": 0.1225,
"step": 327
},
{
"epoch": 0.870026525198939,
"grad_norm": 1.77815086557222,
"learning_rate": 6.013732475956352e-06,
"loss": 0.1105,
"step": 328
},
{
"epoch": 0.8726790450928382,
"grad_norm": 1.698228853578335,
"learning_rate": 5.993323565027528e-06,
"loss": 0.0964,
"step": 329
},
{
"epoch": 0.8753315649867374,
"grad_norm": 1.8428502419687727,
"learning_rate": 5.972897409735403e-06,
"loss": 0.1227,
"step": 330
},
{
"epoch": 0.8779840848806366,
"grad_norm": 1.7797504117579037,
"learning_rate": 5.952454364683507e-06,
"loss": 0.1129,
"step": 331
},
{
"epoch": 0.8806366047745358,
"grad_norm": 2.5588750953515143,
"learning_rate": 5.931994784768582e-06,
"loss": 0.1512,
"step": 332
},
{
"epoch": 0.883289124668435,
"grad_norm": 2.4302985612966697,
"learning_rate": 5.911519025174419e-06,
"loss": 0.1419,
"step": 333
},
{
"epoch": 0.8859416445623343,
"grad_norm": 2.116715417993045,
"learning_rate": 5.891027441365689e-06,
"loss": 0.1248,
"step": 334
},
{
"epoch": 0.8885941644562334,
"grad_norm": 1.8450184290705178,
"learning_rate": 5.870520389081782e-06,
"loss": 0.1093,
"step": 335
},
{
"epoch": 0.8912466843501327,
"grad_norm": 1.8378061965275745,
"learning_rate": 5.849998224330621e-06,
"loss": 0.1103,
"step": 336
},
{
"epoch": 0.8938992042440318,
"grad_norm": 2.0712714287437963,
"learning_rate": 5.829461303382484e-06,
"loss": 0.1318,
"step": 337
},
{
"epoch": 0.896551724137931,
"grad_norm": 2.1172154065660034,
"learning_rate": 5.808909982763825e-06,
"loss": 0.1185,
"step": 338
},
{
"epoch": 0.8992042440318302,
"grad_norm": 2.600554195545317,
"learning_rate": 5.788344619251076e-06,
"loss": 0.1462,
"step": 339
},
{
"epoch": 0.9018567639257294,
"grad_norm": 2.08199632146431,
"learning_rate": 5.767765569864459e-06,
"loss": 0.1383,
"step": 340
},
{
"epoch": 0.9045092838196287,
"grad_norm": 1.806422329448522,
"learning_rate": 5.747173191861788e-06,
"loss": 0.0905,
"step": 341
},
{
"epoch": 0.9071618037135278,
"grad_norm": 1.8050690831990468,
"learning_rate": 5.726567842732262e-06,
"loss": 0.1156,
"step": 342
},
{
"epoch": 0.9098143236074271,
"grad_norm": 2.091128781060801,
"learning_rate": 5.705949880190266e-06,
"loss": 0.1195,
"step": 343
},
{
"epoch": 0.9124668435013262,
"grad_norm": 1.8670275726747676,
"learning_rate": 5.685319662169157e-06,
"loss": 0.1029,
"step": 344
},
{
"epoch": 0.9151193633952255,
"grad_norm": 2.039989303735285,
"learning_rate": 5.664677546815043e-06,
"loss": 0.1312,
"step": 345
},
{
"epoch": 0.9177718832891246,
"grad_norm": 1.859016569528762,
"learning_rate": 5.644023892480583e-06,
"loss": 0.1055,
"step": 346
},
{
"epoch": 0.9204244031830239,
"grad_norm": 1.5903154857571018,
"learning_rate": 5.623359057718752e-06,
"loss": 0.0973,
"step": 347
},
{
"epoch": 0.9230769230769231,
"grad_norm": 1.9024912783079182,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.1147,
"step": 348
},
{
"epoch": 0.9257294429708223,
"grad_norm": 2.4600571478007645,
"learning_rate": 5.581997282089114e-06,
"loss": 0.1274,
"step": 349
},
{
"epoch": 0.9283819628647215,
"grad_norm": 2.1252658277426493,
"learning_rate": 5.561301059272821e-06,
"loss": 0.1192,
"step": 350
},
{
"epoch": 0.9310344827586207,
"grad_norm": 2.2663754831446528,
"learning_rate": 5.540595092119709e-06,
"loss": 0.1368,
"step": 351
},
{
"epoch": 0.9336870026525199,
"grad_norm": 2.1656127015955637,
"learning_rate": 5.519879740090918e-06,
"loss": 0.1072,
"step": 352
},
{
"epoch": 0.9363395225464191,
"grad_norm": 2.1805472380714552,
"learning_rate": 5.499155362810512e-06,
"loss": 0.1475,
"step": 353
},
{
"epoch": 0.9389920424403183,
"grad_norm": 1.9313395207402062,
"learning_rate": 5.478422320059231e-06,
"loss": 0.1138,
"step": 354
},
{
"epoch": 0.9416445623342176,
"grad_norm": 1.9593835652421479,
"learning_rate": 5.457680971768258e-06,
"loss": 0.1236,
"step": 355
},
{
"epoch": 0.9442970822281167,
"grad_norm": 1.9523449371463568,
"learning_rate": 5.436931678012956e-06,
"loss": 0.1191,
"step": 356
},
{
"epoch": 0.946949602122016,
"grad_norm": 2.213437974106433,
"learning_rate": 5.4161747990066235e-06,
"loss": 0.157,
"step": 357
},
{
"epoch": 0.9496021220159151,
"grad_norm": 1.7830925038957812,
"learning_rate": 5.395410695094246e-06,
"loss": 0.1048,
"step": 358
},
{
"epoch": 0.9522546419098143,
"grad_norm": 1.9526782766464377,
"learning_rate": 5.374639726746232e-06,
"loss": 0.1298,
"step": 359
},
{
"epoch": 0.9549071618037135,
"grad_norm": 2.225209885017297,
"learning_rate": 5.353862254552159e-06,
"loss": 0.1114,
"step": 360
},
{
"epoch": 0.9575596816976127,
"grad_norm": 1.8717932182864292,
"learning_rate": 5.333078639214511e-06,
"loss": 0.114,
"step": 361
},
{
"epoch": 0.9602122015915119,
"grad_norm": 2.037766974038598,
"learning_rate": 5.31228924154242e-06,
"loss": 0.1061,
"step": 362
},
{
"epoch": 0.9628647214854111,
"grad_norm": 1.7551149566575617,
"learning_rate": 5.2914944224454e-06,
"loss": 0.1105,
"step": 363
},
{
"epoch": 0.9655172413793104,
"grad_norm": 1.6746920582972595,
"learning_rate": 5.270694542927089e-06,
"loss": 0.1129,
"step": 364
},
{
"epoch": 0.9681697612732095,
"grad_norm": 1.7421649434309925,
"learning_rate": 5.249889964078965e-06,
"loss": 0.1095,
"step": 365
},
{
"epoch": 0.9708222811671088,
"grad_norm": 1.705953446601723,
"learning_rate": 5.2290810470740925e-06,
"loss": 0.1009,
"step": 366
},
{
"epoch": 0.9734748010610079,
"grad_norm": 1.9937643181748503,
"learning_rate": 5.2082681531608505e-06,
"loss": 0.1374,
"step": 367
},
{
"epoch": 0.9761273209549072,
"grad_norm": 2.173137907211324,
"learning_rate": 5.187451643656654e-06,
"loss": 0.1279,
"step": 368
},
{
"epoch": 0.9787798408488063,
"grad_norm": 1.8358057318165615,
"learning_rate": 5.166631879941686e-06,
"loss": 0.1032,
"step": 369
},
{
"epoch": 0.9814323607427056,
"grad_norm": 2.097800711750244,
"learning_rate": 5.145809223452625e-06,
"loss": 0.119,
"step": 370
},
{
"epoch": 0.9840848806366048,
"grad_norm": 1.7975003014708764,
"learning_rate": 5.124984035676366e-06,
"loss": 0.0984,
"step": 371
},
{
"epoch": 0.986737400530504,
"grad_norm": 1.6452454858458356,
"learning_rate": 5.1041566781437525e-06,
"loss": 0.0995,
"step": 372
},
{
"epoch": 0.9893899204244032,
"grad_norm": 1.6148448625331262,
"learning_rate": 5.083327512423294e-06,
"loss": 0.087,
"step": 373
},
{
"epoch": 0.9920424403183024,
"grad_norm": 1.8548088873299826,
"learning_rate": 5.062496900114887e-06,
"loss": 0.1028,
"step": 374
},
{
"epoch": 0.9946949602122016,
"grad_norm": 2.286016586242463,
"learning_rate": 5.041665202843543e-06,
"loss": 0.1394,
"step": 375
},
{
"epoch": 0.9973474801061007,
"grad_norm": 1.9248524614602884,
"learning_rate": 5.020832782253115e-06,
"loss": 0.1206,
"step": 376
},
{
"epoch": 1.0,
"grad_norm": 1.7947903575090804,
"learning_rate": 5e-06,
"loss": 0.108,
"step": 377
},
{
"epoch": 1.0026525198938991,
"grad_norm": 1.1448965437517742,
"learning_rate": 4.979167217746888e-06,
"loss": 0.0474,
"step": 378
},
{
"epoch": 1.0053050397877985,
"grad_norm": 1.4782795600036578,
"learning_rate": 4.9583347971564575e-06,
"loss": 0.0546,
"step": 379
},
{
"epoch": 1.0079575596816976,
"grad_norm": 1.4977052145475058,
"learning_rate": 4.937503099885115e-06,
"loss": 0.0527,
"step": 380
},
{
"epoch": 1.0106100795755968,
"grad_norm": 1.3509330137945688,
"learning_rate": 4.916672487576708e-06,
"loss": 0.0581,
"step": 381
},
{
"epoch": 1.013262599469496,
"grad_norm": 1.3766430452266287,
"learning_rate": 4.895843321856249e-06,
"loss": 0.0543,
"step": 382
},
{
"epoch": 1.0159151193633953,
"grad_norm": 1.1838930001612804,
"learning_rate": 4.875015964323635e-06,
"loss": 0.0501,
"step": 383
},
{
"epoch": 1.0185676392572944,
"grad_norm": 1.1327638132836573,
"learning_rate": 4.854190776547377e-06,
"loss": 0.0488,
"step": 384
},
{
"epoch": 1.0212201591511936,
"grad_norm": 1.3242917574913802,
"learning_rate": 4.833368120058317e-06,
"loss": 0.0528,
"step": 385
},
{
"epoch": 1.023872679045093,
"grad_norm": 1.3957886304337215,
"learning_rate": 4.812548356343347e-06,
"loss": 0.0435,
"step": 386
},
{
"epoch": 1.026525198938992,
"grad_norm": 1.3976646283902743,
"learning_rate": 4.79173184683915e-06,
"loss": 0.0504,
"step": 387
},
{
"epoch": 1.0291777188328912,
"grad_norm": 1.594064921848588,
"learning_rate": 4.770918952925908e-06,
"loss": 0.0467,
"step": 388
},
{
"epoch": 1.0318302387267904,
"grad_norm": 1.8424058890187611,
"learning_rate": 4.750110035921038e-06,
"loss": 0.0592,
"step": 389
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.5198759678201303,
"learning_rate": 4.729305457072913e-06,
"loss": 0.0493,
"step": 390
},
{
"epoch": 1.0371352785145889,
"grad_norm": 1.63737016645813,
"learning_rate": 4.708505577554601e-06,
"loss": 0.0637,
"step": 391
},
{
"epoch": 1.039787798408488,
"grad_norm": 1.513812328618622,
"learning_rate": 4.687710758457583e-06,
"loss": 0.0452,
"step": 392
},
{
"epoch": 1.0424403183023874,
"grad_norm": 1.6651422093934103,
"learning_rate": 4.6669213607854915e-06,
"loss": 0.0498,
"step": 393
},
{
"epoch": 1.0450928381962865,
"grad_norm": 1.5964261962470236,
"learning_rate": 4.646137745447843e-06,
"loss": 0.0623,
"step": 394
},
{
"epoch": 1.0477453580901857,
"grad_norm": 1.7129301697378825,
"learning_rate": 4.6253602732537685e-06,
"loss": 0.0455,
"step": 395
},
{
"epoch": 1.0503978779840848,
"grad_norm": 1.5300658689026623,
"learning_rate": 4.6045893049057544e-06,
"loss": 0.0555,
"step": 396
},
{
"epoch": 1.0530503978779842,
"grad_norm": 1.3274779219248138,
"learning_rate": 4.583825200993377e-06,
"loss": 0.0461,
"step": 397
},
{
"epoch": 1.0557029177718833,
"grad_norm": 1.6225406314000637,
"learning_rate": 4.563068321987047e-06,
"loss": 0.0531,
"step": 398
},
{
"epoch": 1.0583554376657824,
"grad_norm": 1.5953486175793383,
"learning_rate": 4.542319028231744e-06,
"loss": 0.0477,
"step": 399
},
{
"epoch": 1.0610079575596818,
"grad_norm": 1.5805509793613008,
"learning_rate": 4.521577679940769e-06,
"loss": 0.0429,
"step": 400
},
{
"epoch": 1.0610079575596818,
"eval_loss": 0.1300889253616333,
"eval_runtime": 1.2993,
"eval_samples_per_second": 23.859,
"eval_steps_per_second": 6.157,
"step": 400
},
{
"epoch": 1.063660477453581,
"grad_norm": 1.6980952582384556,
"learning_rate": 4.50084463718949e-06,
"loss": 0.06,
"step": 401
},
{
"epoch": 1.06631299734748,
"grad_norm": 1.5737277597483659,
"learning_rate": 4.480120259909084e-06,
"loss": 0.0488,
"step": 402
},
{
"epoch": 1.0689655172413792,
"grad_norm": 1.6771874133081281,
"learning_rate": 4.459404907880293e-06,
"loss": 0.0539,
"step": 403
},
{
"epoch": 1.0716180371352786,
"grad_norm": 1.6529014957836754,
"learning_rate": 4.438698940727179e-06,
"loss": 0.0572,
"step": 404
},
{
"epoch": 1.0742705570291777,
"grad_norm": 1.5970118458627822,
"learning_rate": 4.418002717910887e-06,
"loss": 0.06,
"step": 405
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.4471958707171573,
"learning_rate": 4.397316598723385e-06,
"loss": 0.0538,
"step": 406
},
{
"epoch": 1.079575596816976,
"grad_norm": 1.0348681849940542,
"learning_rate": 4.37664094228125e-06,
"loss": 0.035,
"step": 407
},
{
"epoch": 1.0822281167108754,
"grad_norm": 1.3544554476316786,
"learning_rate": 4.3559761075194185e-06,
"loss": 0.0417,
"step": 408
},
{
"epoch": 1.0848806366047745,
"grad_norm": 1.9097492769272106,
"learning_rate": 4.335322453184959e-06,
"loss": 0.0564,
"step": 409
},
{
"epoch": 1.0875331564986737,
"grad_norm": 1.337428576441879,
"learning_rate": 4.314680337830847e-06,
"loss": 0.0471,
"step": 410
},
{
"epoch": 1.090185676392573,
"grad_norm": 1.6610870125693307,
"learning_rate": 4.294050119809735e-06,
"loss": 0.0495,
"step": 411
},
{
"epoch": 1.0928381962864722,
"grad_norm": 1.648323552864328,
"learning_rate": 4.273432157267739e-06,
"loss": 0.0541,
"step": 412
},
{
"epoch": 1.0954907161803713,
"grad_norm": 1.5486084688969948,
"learning_rate": 4.252826808138214e-06,
"loss": 0.0559,
"step": 413
},
{
"epoch": 1.0981432360742707,
"grad_norm": 1.7853234719100994,
"learning_rate": 4.232234430135542e-06,
"loss": 0.0688,
"step": 414
},
{
"epoch": 1.1007957559681698,
"grad_norm": 1.5356971600614069,
"learning_rate": 4.2116553807489255e-06,
"loss": 0.0516,
"step": 415
},
{
"epoch": 1.103448275862069,
"grad_norm": 1.6247663576609628,
"learning_rate": 4.191090017236177e-06,
"loss": 0.0514,
"step": 416
},
{
"epoch": 1.106100795755968,
"grad_norm": 1.534189806530592,
"learning_rate": 4.170538696617518e-06,
"loss": 0.0524,
"step": 417
},
{
"epoch": 1.1087533156498675,
"grad_norm": 1.3158821951034483,
"learning_rate": 4.15000177566938e-06,
"loss": 0.0403,
"step": 418
},
{
"epoch": 1.1114058355437666,
"grad_norm": 1.5138948007814246,
"learning_rate": 4.129479610918219e-06,
"loss": 0.049,
"step": 419
},
{
"epoch": 1.1140583554376657,
"grad_norm": 1.2320891847094047,
"learning_rate": 4.108972558634312e-06,
"loss": 0.042,
"step": 420
},
{
"epoch": 1.1167108753315649,
"grad_norm": 1.6729860721769905,
"learning_rate": 4.088480974825584e-06,
"loss": 0.0527,
"step": 421
},
{
"epoch": 1.1193633952254642,
"grad_norm": 2.128191222629532,
"learning_rate": 4.0680052152314185e-06,
"loss": 0.064,
"step": 422
},
{
"epoch": 1.1220159151193634,
"grad_norm": 1.5415156129118721,
"learning_rate": 4.047545635316494e-06,
"loss": 0.0479,
"step": 423
},
{
"epoch": 1.1246684350132625,
"grad_norm": 1.4478769280496202,
"learning_rate": 4.0271025902646e-06,
"loss": 0.0461,
"step": 424
},
{
"epoch": 1.1273209549071619,
"grad_norm": 1.4168078919989058,
"learning_rate": 4.006676434972474e-06,
"loss": 0.0461,
"step": 425
},
{
"epoch": 1.129973474801061,
"grad_norm": 1.7163680394879697,
"learning_rate": 3.98626752404365e-06,
"loss": 0.0563,
"step": 426
},
{
"epoch": 1.1326259946949602,
"grad_norm": 1.4561458702273795,
"learning_rate": 3.96587621178229e-06,
"loss": 0.0451,
"step": 427
},
{
"epoch": 1.1352785145888595,
"grad_norm": 1.3054592699914147,
"learning_rate": 3.94550285218704e-06,
"loss": 0.0413,
"step": 428
},
{
"epoch": 1.1379310344827587,
"grad_norm": 1.410529232372516,
"learning_rate": 3.92514779894488e-06,
"loss": 0.0475,
"step": 429
},
{
"epoch": 1.1405835543766578,
"grad_norm": 1.1401984741886695,
"learning_rate": 3.904811405424993e-06,
"loss": 0.034,
"step": 430
},
{
"epoch": 1.143236074270557,
"grad_norm": 1.677403926891428,
"learning_rate": 3.8844940246726206e-06,
"loss": 0.0537,
"step": 431
},
{
"epoch": 1.1458885941644563,
"grad_norm": 2.0010687043565856,
"learning_rate": 3.864196009402935e-06,
"loss": 0.0571,
"step": 432
},
{
"epoch": 1.1485411140583555,
"grad_norm": 1.4276836740813328,
"learning_rate": 3.843917711994923e-06,
"loss": 0.039,
"step": 433
},
{
"epoch": 1.1511936339522546,
"grad_norm": 1.7186301264192605,
"learning_rate": 3.823659484485264e-06,
"loss": 0.0566,
"step": 434
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.6127857163652946,
"learning_rate": 3.803421678562213e-06,
"loss": 0.0392,
"step": 435
},
{
"epoch": 1.156498673740053,
"grad_norm": 1.6060058035757456,
"learning_rate": 3.783204645559504e-06,
"loss": 0.058,
"step": 436
},
{
"epoch": 1.1591511936339522,
"grad_norm": 1.8031413344908938,
"learning_rate": 3.7630087364502545e-06,
"loss": 0.0565,
"step": 437
},
{
"epoch": 1.1618037135278514,
"grad_norm": 1.3528132025697588,
"learning_rate": 3.742834301840853e-06,
"loss": 0.0488,
"step": 438
},
{
"epoch": 1.1644562334217508,
"grad_norm": 1.4452161790201725,
"learning_rate": 3.722681691964892e-06,
"loss": 0.052,
"step": 439
},
{
"epoch": 1.16710875331565,
"grad_norm": 1.506462400743829,
"learning_rate": 3.702551256677083e-06,
"loss": 0.0517,
"step": 440
},
{
"epoch": 1.169761273209549,
"grad_norm": 1.4482316284963157,
"learning_rate": 3.6824433454471755e-06,
"loss": 0.0486,
"step": 441
},
{
"epoch": 1.1724137931034484,
"grad_norm": 1.4651862565906717,
"learning_rate": 3.662358307353897e-06,
"loss": 0.0528,
"step": 442
},
{
"epoch": 1.1750663129973475,
"grad_norm": 1.7861455263109471,
"learning_rate": 3.6422964910788917e-06,
"loss": 0.068,
"step": 443
},
{
"epoch": 1.1777188328912467,
"grad_norm": 1.8641910946820022,
"learning_rate": 3.6222582449006673e-06,
"loss": 0.0557,
"step": 444
},
{
"epoch": 1.1803713527851458,
"grad_norm": 1.493182090383967,
"learning_rate": 3.602243916688548e-06,
"loss": 0.0468,
"step": 445
},
{
"epoch": 1.1830238726790452,
"grad_norm": 1.458973918132782,
"learning_rate": 3.5822538538966333e-06,
"loss": 0.0546,
"step": 446
},
{
"epoch": 1.1856763925729443,
"grad_norm": 1.1386325113824567,
"learning_rate": 3.5622884035577743e-06,
"loss": 0.044,
"step": 447
},
{
"epoch": 1.1883289124668435,
"grad_norm": 1.8301480285009966,
"learning_rate": 3.542347912277537e-06,
"loss": 0.0532,
"step": 448
},
{
"epoch": 1.1909814323607426,
"grad_norm": 1.4956585692430135,
"learning_rate": 3.5224327262281956e-06,
"loss": 0.0583,
"step": 449
},
{
"epoch": 1.193633952254642,
"grad_norm": 1.5435072195981274,
"learning_rate": 3.502543191142713e-06,
"loss": 0.0527,
"step": 450
},
{
"epoch": 1.1962864721485411,
"grad_norm": 1.2942065638898985,
"learning_rate": 3.482679652308756e-06,
"loss": 0.0458,
"step": 451
},
{
"epoch": 1.1989389920424403,
"grad_norm": 1.5335416817545946,
"learning_rate": 3.462842454562677e-06,
"loss": 0.0435,
"step": 452
},
{
"epoch": 1.2015915119363396,
"grad_norm": 1.4166517541759323,
"learning_rate": 3.443031942283544e-06,
"loss": 0.0423,
"step": 453
},
{
"epoch": 1.2042440318302388,
"grad_norm": 1.8664420121487215,
"learning_rate": 3.423248459387165e-06,
"loss": 0.0546,
"step": 454
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.3815730200823022,
"learning_rate": 3.403492349320101e-06,
"loss": 0.043,
"step": 455
},
{
"epoch": 1.209549071618037,
"grad_norm": 1.581543542046171,
"learning_rate": 3.3837639550537183e-06,
"loss": 0.0563,
"step": 456
},
{
"epoch": 1.2122015915119364,
"grad_norm": 1.420673121312355,
"learning_rate": 3.364063619078228e-06,
"loss": 0.0364,
"step": 457
},
{
"epoch": 1.2148541114058355,
"grad_norm": 1.4971596799850846,
"learning_rate": 3.344391683396744e-06,
"loss": 0.0446,
"step": 458
},
{
"epoch": 1.2175066312997347,
"grad_norm": 1.701718210938796,
"learning_rate": 3.3247484895193406e-06,
"loss": 0.052,
"step": 459
},
{
"epoch": 1.2201591511936338,
"grad_norm": 1.4921935610407966,
"learning_rate": 3.305134378457127e-06,
"loss": 0.046,
"step": 460
},
{
"epoch": 1.2228116710875332,
"grad_norm": 1.6030117015462066,
"learning_rate": 3.2855496907163296e-06,
"loss": 0.0472,
"step": 461
},
{
"epoch": 1.2254641909814323,
"grad_norm": 1.2527893295170924,
"learning_rate": 3.2659947662923767e-06,
"loss": 0.0443,
"step": 462
},
{
"epoch": 1.2281167108753315,
"grad_norm": 1.9885749044696137,
"learning_rate": 3.246469944663996e-06,
"loss": 0.0619,
"step": 463
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.4623016319017155,
"learning_rate": 3.226975564787322e-06,
"loss": 0.042,
"step": 464
},
{
"epoch": 1.23342175066313,
"grad_norm": 1.372530389209216,
"learning_rate": 3.2075119650900166e-06,
"loss": 0.0406,
"step": 465
},
{
"epoch": 1.2360742705570291,
"grad_norm": 1.3245791205337774,
"learning_rate": 3.1880794834653872e-06,
"loss": 0.0471,
"step": 466
},
{
"epoch": 1.2387267904509285,
"grad_norm": 1.765042349753763,
"learning_rate": 3.1686784572665176e-06,
"loss": 0.0545,
"step": 467
},
{
"epoch": 1.2413793103448276,
"grad_norm": 1.6880726450034362,
"learning_rate": 3.149309223300428e-06,
"loss": 0.0417,
"step": 468
},
{
"epoch": 1.2440318302387268,
"grad_norm": 1.6867768845931341,
"learning_rate": 3.12997211782221e-06,
"loss": 0.064,
"step": 469
},
{
"epoch": 1.246684350132626,
"grad_norm": 1.6829202873944922,
"learning_rate": 3.1106674765291943e-06,
"loss": 0.0487,
"step": 470
},
{
"epoch": 1.2493368700265253,
"grad_norm": 1.482635437530892,
"learning_rate": 3.0913956345551287e-06,
"loss": 0.0437,
"step": 471
},
{
"epoch": 1.2519893899204244,
"grad_norm": 1.0243355165306798,
"learning_rate": 3.072156926464356e-06,
"loss": 0.0286,
"step": 472
},
{
"epoch": 1.2546419098143236,
"grad_norm": 1.7086250346584169,
"learning_rate": 3.052951686246003e-06,
"loss": 0.0524,
"step": 473
},
{
"epoch": 1.2572944297082227,
"grad_norm": 1.5669584106420342,
"learning_rate": 3.033780247308187e-06,
"loss": 0.0416,
"step": 474
},
{
"epoch": 1.259946949602122,
"grad_norm": 1.5314026636512095,
"learning_rate": 3.0146429424722277e-06,
"loss": 0.0459,
"step": 475
},
{
"epoch": 1.2625994694960212,
"grad_norm": 1.6510097722189716,
"learning_rate": 2.9955401039668642e-06,
"loss": 0.0451,
"step": 476
},
{
"epoch": 1.2652519893899203,
"grad_norm": 1.1998735054502896,
"learning_rate": 2.976472063422493e-06,
"loss": 0.0421,
"step": 477
},
{
"epoch": 1.2679045092838197,
"grad_norm": 1.8230740287638594,
"learning_rate": 2.9574391518654077e-06,
"loss": 0.0483,
"step": 478
},
{
"epoch": 1.2705570291777188,
"grad_norm": 1.8679306651778758,
"learning_rate": 2.938441699712055e-06,
"loss": 0.0541,
"step": 479
},
{
"epoch": 1.273209549071618,
"grad_norm": 1.5955196355028194,
"learning_rate": 2.9194800367632946e-06,
"loss": 0.0558,
"step": 480
},
{
"epoch": 1.2758620689655173,
"grad_norm": 1.8476586232402528,
"learning_rate": 2.9005544921986774e-06,
"loss": 0.0564,
"step": 481
},
{
"epoch": 1.2785145888594165,
"grad_norm": 1.4298742037812726,
"learning_rate": 2.8816653945707286e-06,
"loss": 0.0442,
"step": 482
},
{
"epoch": 1.2811671087533156,
"grad_norm": 1.287640370036359,
"learning_rate": 2.8628130717992463e-06,
"loss": 0.0373,
"step": 483
},
{
"epoch": 1.2838196286472148,
"grad_norm": 1.4067433199098056,
"learning_rate": 2.8439978511656057e-06,
"loss": 0.045,
"step": 484
},
{
"epoch": 1.2864721485411141,
"grad_norm": 1.4224319432151542,
"learning_rate": 2.82522005930708e-06,
"loss": 0.0385,
"step": 485
},
{
"epoch": 1.2891246684350133,
"grad_norm": 1.5527398960311367,
"learning_rate": 2.8064800222111673e-06,
"loss": 0.0413,
"step": 486
},
{
"epoch": 1.2917771883289124,
"grad_norm": 1.4424002142583947,
"learning_rate": 2.787778065209934e-06,
"loss": 0.0398,
"step": 487
},
{
"epoch": 1.2944297082228116,
"grad_norm": 1.3110803703822922,
"learning_rate": 2.7691145129743645e-06,
"loss": 0.048,
"step": 488
},
{
"epoch": 1.297082228116711,
"grad_norm": 1.5121856468389003,
"learning_rate": 2.7504896895087317e-06,
"loss": 0.0414,
"step": 489
},
{
"epoch": 1.29973474801061,
"grad_norm": 1.7902702451543397,
"learning_rate": 2.7319039181449604e-06,
"loss": 0.0616,
"step": 490
},
{
"epoch": 1.3023872679045092,
"grad_norm": 1.834255575531623,
"learning_rate": 2.713357521537023e-06,
"loss": 0.0479,
"step": 491
},
{
"epoch": 1.3050397877984086,
"grad_norm": 1.3600003850482532,
"learning_rate": 2.6948508216553304e-06,
"loss": 0.042,
"step": 492
},
{
"epoch": 1.3076923076923077,
"grad_norm": 1.9237485999672874,
"learning_rate": 2.6763841397811576e-06,
"loss": 0.0462,
"step": 493
},
{
"epoch": 1.3103448275862069,
"grad_norm": 1.2574491259586364,
"learning_rate": 2.65795779650105e-06,
"loss": 0.0382,
"step": 494
},
{
"epoch": 1.3129973474801062,
"grad_norm": 1.8546812652508733,
"learning_rate": 2.6395721117012648e-06,
"loss": 0.0582,
"step": 495
},
{
"epoch": 1.3156498673740054,
"grad_norm": 1.4303414853405272,
"learning_rate": 2.6212274045622167e-06,
"loss": 0.0437,
"step": 496
},
{
"epoch": 1.3183023872679045,
"grad_norm": 1.3475863489270175,
"learning_rate": 2.6029239935529395e-06,
"loss": 0.0374,
"step": 497
},
{
"epoch": 1.3209549071618036,
"grad_norm": 1.3885729140928034,
"learning_rate": 2.5846621964255524e-06,
"loss": 0.0378,
"step": 498
},
{
"epoch": 1.323607427055703,
"grad_norm": 1.6004829128070286,
"learning_rate": 2.5664423302097462e-06,
"loss": 0.0456,
"step": 499
},
{
"epoch": 1.3262599469496021,
"grad_norm": 1.4483756915235027,
"learning_rate": 2.5482647112072857e-06,
"loss": 0.0483,
"step": 500
},
{
"epoch": 1.3289124668435013,
"grad_norm": 1.5083194734193153,
"learning_rate": 2.530129654986505e-06,
"loss": 0.0403,
"step": 501
},
{
"epoch": 1.3315649867374004,
"grad_norm": 1.0241725585916064,
"learning_rate": 2.5120374763768422e-06,
"loss": 0.0309,
"step": 502
},
{
"epoch": 1.3342175066312998,
"grad_norm": 1.4775342162666805,
"learning_rate": 2.493988489463366e-06,
"loss": 0.0449,
"step": 503
},
{
"epoch": 1.336870026525199,
"grad_norm": 1.453802729887559,
"learning_rate": 2.475983007581326e-06,
"loss": 0.0552,
"step": 504
},
{
"epoch": 1.339522546419098,
"grad_norm": 1.4761120060128954,
"learning_rate": 2.458021343310713e-06,
"loss": 0.0538,
"step": 505
},
{
"epoch": 1.3421750663129974,
"grad_norm": 1.4602044460085675,
"learning_rate": 2.4401038084708313e-06,
"loss": 0.0367,
"step": 506
},
{
"epoch": 1.3448275862068966,
"grad_norm": 1.1064714894292946,
"learning_rate": 2.422230714114891e-06,
"loss": 0.0318,
"step": 507
},
{
"epoch": 1.3474801061007957,
"grad_norm": 1.5984242375021522,
"learning_rate": 2.4044023705246e-06,
"loss": 0.0451,
"step": 508
},
{
"epoch": 1.350132625994695,
"grad_norm": 1.8076623418841897,
"learning_rate": 2.3866190872047775e-06,
"loss": 0.055,
"step": 509
},
{
"epoch": 1.3527851458885942,
"grad_norm": 1.53849591572484,
"learning_rate": 2.3688811728779875e-06,
"loss": 0.0413,
"step": 510
},
{
"epoch": 1.3554376657824934,
"grad_norm": 1.3925465600995954,
"learning_rate": 2.351188935479181e-06,
"loss": 0.0405,
"step": 511
},
{
"epoch": 1.3580901856763925,
"grad_norm": 1.4430777345183743,
"learning_rate": 2.333542682150339e-06,
"loss": 0.0418,
"step": 512
},
{
"epoch": 1.3607427055702916,
"grad_norm": 1.7058397273368744,
"learning_rate": 2.3159427192351467e-06,
"loss": 0.0417,
"step": 513
},
{
"epoch": 1.363395225464191,
"grad_norm": 1.6101298515299973,
"learning_rate": 2.2983893522736795e-06,
"loss": 0.0436,
"step": 514
},
{
"epoch": 1.3660477453580901,
"grad_norm": 1.5663612607810036,
"learning_rate": 2.2808828859970905e-06,
"loss": 0.0415,
"step": 515
},
{
"epoch": 1.3687002652519893,
"grad_norm": 1.3127711478830895,
"learning_rate": 2.263423624322326e-06,
"loss": 0.0429,
"step": 516
},
{
"epoch": 1.3713527851458887,
"grad_norm": 1.4631985318610559,
"learning_rate": 2.2460118703468475e-06,
"loss": 0.0426,
"step": 517
},
{
"epoch": 1.3740053050397878,
"grad_norm": 2.056857172754129,
"learning_rate": 2.228647926343373e-06,
"loss": 0.0549,
"step": 518
},
{
"epoch": 1.376657824933687,
"grad_norm": 1.4596328383955115,
"learning_rate": 2.211332093754622e-06,
"loss": 0.0463,
"step": 519
},
{
"epoch": 1.3793103448275863,
"grad_norm": 1.4795909661114393,
"learning_rate": 2.1940646731880887e-06,
"loss": 0.0382,
"step": 520
},
{
"epoch": 1.3819628647214854,
"grad_norm": 1.79070132377592,
"learning_rate": 2.1768459644108223e-06,
"loss": 0.0493,
"step": 521
},
{
"epoch": 1.3846153846153846,
"grad_norm": 1.6598245589918261,
"learning_rate": 2.159676266344222e-06,
"loss": 0.0504,
"step": 522
},
{
"epoch": 1.387267904509284,
"grad_norm": 1.5697697079805137,
"learning_rate": 2.142555877058847e-06,
"loss": 0.056,
"step": 523
},
{
"epoch": 1.389920424403183,
"grad_norm": 1.5250399923513906,
"learning_rate": 2.125485093769242e-06,
"loss": 0.0451,
"step": 524
},
{
"epoch": 1.3925729442970822,
"grad_norm": 1.488259185423536,
"learning_rate": 2.108464212828786e-06,
"loss": 0.0403,
"step": 525
},
{
"epoch": 1.3952254641909814,
"grad_norm": 1.7336708194270882,
"learning_rate": 2.091493529724528e-06,
"loss": 0.0545,
"step": 526
},
{
"epoch": 1.3978779840848805,
"grad_norm": 1.457566134032916,
"learning_rate": 2.0745733390720744e-06,
"loss": 0.0401,
"step": 527
},
{
"epoch": 1.4005305039787799,
"grad_norm": 2.0446606459323173,
"learning_rate": 2.057703934610474e-06,
"loss": 0.0511,
"step": 528
},
{
"epoch": 1.403183023872679,
"grad_norm": 1.8179579055212491,
"learning_rate": 2.0408856091971063e-06,
"loss": 0.0415,
"step": 529
},
{
"epoch": 1.4058355437665782,
"grad_norm": 1.4476314828089778,
"learning_rate": 2.024118654802608e-06,
"loss": 0.043,
"step": 530
},
{
"epoch": 1.4084880636604775,
"grad_norm": 1.4148575078926533,
"learning_rate": 2.007403362505802e-06,
"loss": 0.0408,
"step": 531
},
{
"epoch": 1.4111405835543767,
"grad_norm": 1.6321368893473254,
"learning_rate": 1.990740022488642e-06,
"loss": 0.0381,
"step": 532
},
{
"epoch": 1.4137931034482758,
"grad_norm": 1.5202544820877246,
"learning_rate": 1.9741289240311757e-06,
"loss": 0.045,
"step": 533
},
{
"epoch": 1.4164456233421752,
"grad_norm": 1.5924780405260186,
"learning_rate": 1.957570355506522e-06,
"loss": 0.0453,
"step": 534
},
{
"epoch": 1.4190981432360743,
"grad_norm": 1.7962035777442131,
"learning_rate": 1.9410646043758737e-06,
"loss": 0.0514,
"step": 535
},
{
"epoch": 1.4217506631299734,
"grad_norm": 1.3406272487252804,
"learning_rate": 1.9246119571834904e-06,
"loss": 0.0333,
"step": 536
},
{
"epoch": 1.4244031830238728,
"grad_norm": 1.730688912537288,
"learning_rate": 1.9082126995517376e-06,
"loss": 0.0475,
"step": 537
},
{
"epoch": 1.427055702917772,
"grad_norm": 1.5376121127489066,
"learning_rate": 1.8918671161761227e-06,
"loss": 0.0343,
"step": 538
},
{
"epoch": 1.429708222811671,
"grad_norm": 1.4633947145114203,
"learning_rate": 1.8755754908203528e-06,
"loss": 0.0409,
"step": 539
},
{
"epoch": 1.4323607427055702,
"grad_norm": 1.6079257325417993,
"learning_rate": 1.8593381063114113e-06,
"loss": 0.0418,
"step": 540
},
{
"epoch": 1.4350132625994694,
"grad_norm": 1.36008605859255,
"learning_rate": 1.8431552445346434e-06,
"loss": 0.04,
"step": 541
},
{
"epoch": 1.4376657824933687,
"grad_norm": 1.7947332913494305,
"learning_rate": 1.827027186428869e-06,
"loss": 0.0601,
"step": 542
},
{
"epoch": 1.4403183023872679,
"grad_norm": 1.3927853248296074,
"learning_rate": 1.8109542119815e-06,
"loss": 0.0425,
"step": 543
},
{
"epoch": 1.442970822281167,
"grad_norm": 1.415485606603946,
"learning_rate": 1.7949366002236762e-06,
"loss": 0.0408,
"step": 544
},
{
"epoch": 1.4456233421750664,
"grad_norm": 1.3417207244235902,
"learning_rate": 1.7789746292254313e-06,
"loss": 0.0415,
"step": 545
},
{
"epoch": 1.4482758620689655,
"grad_norm": 1.5705396456844423,
"learning_rate": 1.7630685760908623e-06,
"loss": 0.0414,
"step": 546
},
{
"epoch": 1.4509283819628647,
"grad_norm": 1.4346574390804177,
"learning_rate": 1.7472187169533128e-06,
"loss": 0.0399,
"step": 547
},
{
"epoch": 1.453580901856764,
"grad_norm": 1.5831645649543884,
"learning_rate": 1.7314253269705854e-06,
"loss": 0.0456,
"step": 548
},
{
"epoch": 1.4562334217506632,
"grad_norm": 1.5188840072828362,
"learning_rate": 1.7156886803201638e-06,
"loss": 0.0508,
"step": 549
},
{
"epoch": 1.4588859416445623,
"grad_norm": 1.739652082715737,
"learning_rate": 1.70000905019445e-06,
"loss": 0.0495,
"step": 550
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.826264975830559,
"learning_rate": 1.6843867087960252e-06,
"loss": 0.0437,
"step": 551
},
{
"epoch": 1.4641909814323608,
"grad_norm": 1.30588237840256,
"learning_rate": 1.6688219273329215e-06,
"loss": 0.0383,
"step": 552
},
{
"epoch": 1.46684350132626,
"grad_norm": 1.4986591185883977,
"learning_rate": 1.6533149760139206e-06,
"loss": 0.0389,
"step": 553
},
{
"epoch": 1.469496021220159,
"grad_norm": 1.4584576788521406,
"learning_rate": 1.6378661240438498e-06,
"loss": 0.0398,
"step": 554
},
{
"epoch": 1.4721485411140582,
"grad_norm": 1.738259374230387,
"learning_rate": 1.6224756396189216e-06,
"loss": 0.0489,
"step": 555
},
{
"epoch": 1.4748010610079576,
"grad_norm": 1.4204392194550715,
"learning_rate": 1.6071437899220688e-06,
"loss": 0.0414,
"step": 556
},
{
"epoch": 1.4774535809018567,
"grad_norm": 1.5679568822862189,
"learning_rate": 1.591870841118312e-06,
"loss": 0.0527,
"step": 557
},
{
"epoch": 1.4801061007957559,
"grad_norm": 1.595033631355667,
"learning_rate": 1.576657058350135e-06,
"loss": 0.043,
"step": 558
},
{
"epoch": 1.4827586206896552,
"grad_norm": 1.738257177021954,
"learning_rate": 1.561502705732883e-06,
"loss": 0.0479,
"step": 559
},
{
"epoch": 1.4854111405835544,
"grad_norm": 1.365927651900338,
"learning_rate": 1.546408046350183e-06,
"loss": 0.0408,
"step": 560
},
{
"epoch": 1.4880636604774535,
"grad_norm": 1.5461936213104102,
"learning_rate": 1.5313733422493626e-06,
"loss": 0.0432,
"step": 561
},
{
"epoch": 1.490716180371353,
"grad_norm": 1.4928548013692324,
"learning_rate": 1.516398854436914e-06,
"loss": 0.04,
"step": 562
},
{
"epoch": 1.493368700265252,
"grad_norm": 1.640223366433278,
"learning_rate": 1.501484842873963e-06,
"loss": 0.0546,
"step": 563
},
{
"epoch": 1.4960212201591512,
"grad_norm": 1.7973098844829862,
"learning_rate": 1.486631566471745e-06,
"loss": 0.046,
"step": 564
},
{
"epoch": 1.4986737400530503,
"grad_norm": 1.6511015774830353,
"learning_rate": 1.4718392830871192e-06,
"loss": 0.0445,
"step": 565
},
{
"epoch": 1.5013262599469495,
"grad_norm": 2.1489351880145637,
"learning_rate": 1.457108249518089e-06,
"loss": 0.0626,
"step": 566
},
{
"epoch": 1.5039787798408488,
"grad_norm": 1.4283531671752674,
"learning_rate": 1.4424387214993457e-06,
"loss": 0.0388,
"step": 567
},
{
"epoch": 1.506631299734748,
"grad_norm": 1.9589195614651327,
"learning_rate": 1.4278309536978275e-06,
"loss": 0.0403,
"step": 568
},
{
"epoch": 1.509283819628647,
"grad_norm": 1.6259539938234473,
"learning_rate": 1.4132851997082969e-06,
"loss": 0.0453,
"step": 569
},
{
"epoch": 1.5119363395225465,
"grad_norm": 1.52635618170666,
"learning_rate": 1.3988017120489417e-06,
"loss": 0.0487,
"step": 570
},
{
"epoch": 1.5145888594164456,
"grad_norm": 1.166944034701237,
"learning_rate": 1.384380742156991e-06,
"loss": 0.0345,
"step": 571
},
{
"epoch": 1.5172413793103448,
"grad_norm": 1.4683349538414516,
"learning_rate": 1.370022540384347e-06,
"loss": 0.0456,
"step": 572
},
{
"epoch": 1.5198938992042441,
"grad_norm": 1.2722220574404348,
"learning_rate": 1.3557273559932372e-06,
"loss": 0.0321,
"step": 573
},
{
"epoch": 1.5225464190981433,
"grad_norm": 1.420343043420184,
"learning_rate": 1.3414954371518968e-06,
"loss": 0.0486,
"step": 574
},
{
"epoch": 1.5251989389920424,
"grad_norm": 1.613818956433735,
"learning_rate": 1.32732703093025e-06,
"loss": 0.0462,
"step": 575
},
{
"epoch": 1.5278514588859418,
"grad_norm": 1.5717020920513154,
"learning_rate": 1.3132223832956265e-06,
"loss": 0.0393,
"step": 576
},
{
"epoch": 1.530503978779841,
"grad_norm": 1.899833644607642,
"learning_rate": 1.2991817391084887e-06,
"loss": 0.0571,
"step": 577
},
{
"epoch": 1.53315649867374,
"grad_norm": 1.3467128729397928,
"learning_rate": 1.2852053421181826e-06,
"loss": 0.0436,
"step": 578
},
{
"epoch": 1.5358090185676394,
"grad_norm": 1.4684591070731103,
"learning_rate": 1.2712934349587063e-06,
"loss": 0.0409,
"step": 579
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.4659121730561506,
"learning_rate": 1.257446259144494e-06,
"loss": 0.0343,
"step": 580
},
{
"epoch": 1.5411140583554377,
"grad_norm": 1.2296857151247282,
"learning_rate": 1.2436640550662333e-06,
"loss": 0.0315,
"step": 581
},
{
"epoch": 1.5437665782493368,
"grad_norm": 1.5658367180498545,
"learning_rate": 1.2299470619866778e-06,
"loss": 0.0468,
"step": 582
},
{
"epoch": 1.546419098143236,
"grad_norm": 1.5447401683597295,
"learning_rate": 1.2162955180365033e-06,
"loss": 0.0406,
"step": 583
},
{
"epoch": 1.5490716180371353,
"grad_norm": 1.563584941111247,
"learning_rate": 1.2027096602101728e-06,
"loss": 0.0385,
"step": 584
},
{
"epoch": 1.5517241379310345,
"grad_norm": 1.6434334673624074,
"learning_rate": 1.1891897243618184e-06,
"loss": 0.0452,
"step": 585
},
{
"epoch": 1.5543766578249336,
"grad_norm": 1.4760334609742514,
"learning_rate": 1.1757359452011497e-06,
"loss": 0.0366,
"step": 586
},
{
"epoch": 1.557029177718833,
"grad_norm": 1.232360571349415,
"learning_rate": 1.1623485562893772e-06,
"loss": 0.0373,
"step": 587
},
{
"epoch": 1.5596816976127321,
"grad_norm": 1.2603274281429182,
"learning_rate": 1.1490277900351637e-06,
"loss": 0.0336,
"step": 588
},
{
"epoch": 1.5623342175066313,
"grad_norm": 1.4672611395225155,
"learning_rate": 1.1357738776905802e-06,
"loss": 0.0499,
"step": 589
},
{
"epoch": 1.5649867374005306,
"grad_norm": 1.3938518014710124,
"learning_rate": 1.1225870493470952e-06,
"loss": 0.0407,
"step": 590
},
{
"epoch": 1.5676392572944295,
"grad_norm": 1.5266979684533877,
"learning_rate": 1.1094675339315825e-06,
"loss": 0.0389,
"step": 591
},
{
"epoch": 1.570291777188329,
"grad_norm": 1.8368858591777315,
"learning_rate": 1.0964155592023483e-06,
"loss": 0.0353,
"step": 592
},
{
"epoch": 1.5729442970822283,
"grad_norm": 1.6847631793792286,
"learning_rate": 1.083431351745171e-06,
"loss": 0.0492,
"step": 593
},
{
"epoch": 1.5755968169761272,
"grad_norm": 1.457143006494126,
"learning_rate": 1.0705151369693712e-06,
"loss": 0.0373,
"step": 594
},
{
"epoch": 1.5782493368700266,
"grad_norm": 1.4537669375386875,
"learning_rate": 1.0576671391038996e-06,
"loss": 0.0404,
"step": 595
},
{
"epoch": 1.5809018567639257,
"grad_norm": 1.570259411391911,
"learning_rate": 1.0448875811934417e-06,
"loss": 0.0374,
"step": 596
},
{
"epoch": 1.5835543766578248,
"grad_norm": 1.4551622624887812,
"learning_rate": 1.0321766850945486e-06,
"loss": 0.0349,
"step": 597
},
{
"epoch": 1.5862068965517242,
"grad_norm": 1.320645141600928,
"learning_rate": 1.0195346714717813e-06,
"loss": 0.0365,
"step": 598
},
{
"epoch": 1.5888594164456233,
"grad_norm": 1.7279741174166037,
"learning_rate": 1.0069617597938869e-06,
"loss": 0.048,
"step": 599
},
{
"epoch": 1.5915119363395225,
"grad_norm": 1.3238694043973904,
"learning_rate": 9.944581683299804e-07,
"loss": 0.0402,
"step": 600
},
{
"epoch": 1.5915119363395225,
"eval_loss": 0.1134517639875412,
"eval_runtime": 1.2987,
"eval_samples_per_second": 23.87,
"eval_steps_per_second": 6.16,
"step": 600
},
{
"epoch": 1.5941644562334218,
"grad_norm": 1.3051093843669757,
"learning_rate": 9.82024114145761e-07,
"loss": 0.0382,
"step": 601
},
{
"epoch": 1.596816976127321,
"grad_norm": 1.162659102103195,
"learning_rate": 9.696598130997415e-07,
"loss": 0.036,
"step": 602
},
{
"epoch": 1.5994694960212201,
"grad_norm": 1.7546331982712047,
"learning_rate": 9.57365479839501e-07,
"loss": 0.0494,
"step": 603
},
{
"epoch": 1.6021220159151195,
"grad_norm": 1.3672793279747577,
"learning_rate": 9.45141327797961e-07,
"loss": 0.0322,
"step": 604
},
{
"epoch": 1.6047745358090184,
"grad_norm": 1.3661103077040575,
"learning_rate": 9.32987569189675e-07,
"loss": 0.0391,
"step": 605
},
{
"epoch": 1.6074270557029178,
"grad_norm": 1.5456527347090927,
"learning_rate": 9.209044150071522e-07,
"loss": 0.042,
"step": 606
},
{
"epoch": 1.6100795755968171,
"grad_norm": 1.6924497065293256,
"learning_rate": 9.088920750171876e-07,
"loss": 0.0415,
"step": 607
},
{
"epoch": 1.612732095490716,
"grad_norm": 1.74031693520605,
"learning_rate": 8.969507577572189e-07,
"loss": 0.0431,
"step": 608
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.551498689243845,
"learning_rate": 8.850806705317183e-07,
"loss": 0.0361,
"step": 609
},
{
"epoch": 1.6180371352785146,
"grad_norm": 1.542261921192749,
"learning_rate": 8.732820194085794e-07,
"loss": 0.0428,
"step": 610
},
{
"epoch": 1.6206896551724137,
"grad_norm": 1.78892779608987,
"learning_rate": 8.615550092155478e-07,
"loss": 0.0451,
"step": 611
},
{
"epoch": 1.623342175066313,
"grad_norm": 1.555805233678888,
"learning_rate": 8.498998435366634e-07,
"loss": 0.0365,
"step": 612
},
{
"epoch": 1.6259946949602122,
"grad_norm": 1.5060342455389297,
"learning_rate": 8.383167247087259e-07,
"loss": 0.04,
"step": 613
},
{
"epoch": 1.6286472148541113,
"grad_norm": 1.611093579148822,
"learning_rate": 8.268058538177826e-07,
"loss": 0.0375,
"step": 614
},
{
"epoch": 1.6312997347480107,
"grad_norm": 1.588047950817851,
"learning_rate": 8.15367430695636e-07,
"loss": 0.0456,
"step": 615
},
{
"epoch": 1.6339522546419099,
"grad_norm": 1.3816401812608983,
"learning_rate": 8.040016539163792e-07,
"loss": 0.0312,
"step": 616
},
{
"epoch": 1.636604774535809,
"grad_norm": 1.4853951274849326,
"learning_rate": 7.927087207929418e-07,
"loss": 0.0371,
"step": 617
},
{
"epoch": 1.6392572944297084,
"grad_norm": 1.4635552701452224,
"learning_rate": 7.814888273736698e-07,
"loss": 0.0398,
"step": 618
},
{
"epoch": 1.6419098143236073,
"grad_norm": 1.4395599081772887,
"learning_rate": 7.70342168438919e-07,
"loss": 0.0377,
"step": 619
},
{
"epoch": 1.6445623342175066,
"grad_norm": 1.4904004060637046,
"learning_rate": 7.592689374976769e-07,
"loss": 0.0422,
"step": 620
},
{
"epoch": 1.647214854111406,
"grad_norm": 1.3555033892449118,
"learning_rate": 7.482693267842e-07,
"loss": 0.0298,
"step": 621
},
{
"epoch": 1.649867374005305,
"grad_norm": 1.6737297709368,
"learning_rate": 7.373435272546764e-07,
"loss": 0.0426,
"step": 622
},
{
"epoch": 1.6525198938992043,
"grad_norm": 1.4564908618465116,
"learning_rate": 7.264917285839168e-07,
"loss": 0.0329,
"step": 623
},
{
"epoch": 1.6551724137931034,
"grad_norm": 1.2263290141969596,
"learning_rate": 7.157141191620548e-07,
"loss": 0.0405,
"step": 624
},
{
"epoch": 1.6578249336870026,
"grad_norm": 1.5061155245041684,
"learning_rate": 7.050108860912752e-07,
"loss": 0.0423,
"step": 625
},
{
"epoch": 1.660477453580902,
"grad_norm": 1.342531472718274,
"learning_rate": 6.943822151825735e-07,
"loss": 0.0381,
"step": 626
},
{
"epoch": 1.663129973474801,
"grad_norm": 1.529969425604032,
"learning_rate": 6.838282909525268e-07,
"loss": 0.0378,
"step": 627
},
{
"epoch": 1.6657824933687002,
"grad_norm": 1.4693891043473597,
"learning_rate": 6.733492966200872e-07,
"loss": 0.0409,
"step": 628
},
{
"epoch": 1.6684350132625996,
"grad_norm": 1.2856071654517978,
"learning_rate": 6.629454141034053e-07,
"loss": 0.0359,
"step": 629
},
{
"epoch": 1.6710875331564987,
"grad_norm": 1.261936660004472,
"learning_rate": 6.526168240166686e-07,
"loss": 0.0375,
"step": 630
},
{
"epoch": 1.6737400530503979,
"grad_norm": 1.9177656406918595,
"learning_rate": 6.423637056669702e-07,
"loss": 0.0528,
"step": 631
},
{
"epoch": 1.6763925729442972,
"grad_norm": 1.1739232814066218,
"learning_rate": 6.321862370511922e-07,
"loss": 0.0339,
"step": 632
},
{
"epoch": 1.6790450928381961,
"grad_norm": 1.5997908198741042,
"learning_rate": 6.220845948529159e-07,
"loss": 0.0365,
"step": 633
},
{
"epoch": 1.6816976127320955,
"grad_norm": 1.9896095612620952,
"learning_rate": 6.120589544393596e-07,
"loss": 0.0442,
"step": 634
},
{
"epoch": 1.6843501326259946,
"grad_norm": 1.4447133472783895,
"learning_rate": 6.021094898583269e-07,
"loss": 0.0417,
"step": 635
},
{
"epoch": 1.6870026525198938,
"grad_norm": 1.423453466191924,
"learning_rate": 5.922363738351888e-07,
"loss": 0.0416,
"step": 636
},
{
"epoch": 1.6896551724137931,
"grad_norm": 1.6412520906518056,
"learning_rate": 5.824397777698859e-07,
"loss": 0.0376,
"step": 637
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.3500786343856588,
"learning_rate": 5.727198717339511e-07,
"loss": 0.0365,
"step": 638
},
{
"epoch": 1.6949602122015914,
"grad_norm": 1.3281753235494516,
"learning_rate": 5.630768244675583e-07,
"loss": 0.0355,
"step": 639
},
{
"epoch": 1.6976127320954908,
"grad_norm": 1.5686317332318083,
"learning_rate": 5.535108033765913e-07,
"loss": 0.047,
"step": 640
},
{
"epoch": 1.70026525198939,
"grad_norm": 1.8508472340714566,
"learning_rate": 5.440219745297432e-07,
"loss": 0.049,
"step": 641
},
{
"epoch": 1.702917771883289,
"grad_norm": 1.6565736316956416,
"learning_rate": 5.346105026556226e-07,
"loss": 0.0397,
"step": 642
},
{
"epoch": 1.7055702917771884,
"grad_norm": 1.5711209136989794,
"learning_rate": 5.252765511399044e-07,
"loss": 0.0436,
"step": 643
},
{
"epoch": 1.7082228116710876,
"grad_norm": 1.5673409139078343,
"learning_rate": 5.160202820224875e-07,
"loss": 0.0441,
"step": 644
},
{
"epoch": 1.7108753315649867,
"grad_norm": 1.545089411507962,
"learning_rate": 5.068418559946864e-07,
"loss": 0.0348,
"step": 645
},
{
"epoch": 1.713527851458886,
"grad_norm": 1.432485291304248,
"learning_rate": 4.977414323964364e-07,
"loss": 0.0375,
"step": 646
},
{
"epoch": 1.716180371352785,
"grad_norm": 1.6399078665274962,
"learning_rate": 4.88719169213529e-07,
"loss": 0.039,
"step": 647
},
{
"epoch": 1.7188328912466844,
"grad_norm": 1.987599098491381,
"learning_rate": 4.797752230748721e-07,
"loss": 0.0515,
"step": 648
},
{
"epoch": 1.7214854111405835,
"grad_norm": 1.1831605370816431,
"learning_rate": 4.7090974924976716e-07,
"loss": 0.0313,
"step": 649
},
{
"epoch": 1.7241379310344827,
"grad_norm": 1.462425034427707,
"learning_rate": 4.6212290164521554e-07,
"loss": 0.0384,
"step": 650
},
{
"epoch": 1.726790450928382,
"grad_norm": 1.2550042615331936,
"learning_rate": 4.534148328032456e-07,
"loss": 0.0333,
"step": 651
},
{
"epoch": 1.7294429708222812,
"grad_norm": 2.006128939436255,
"learning_rate": 4.4478569389826864e-07,
"loss": 0.0362,
"step": 652
},
{
"epoch": 1.7320954907161803,
"grad_norm": 1.0400629902188836,
"learning_rate": 4.3623563473444817e-07,
"loss": 0.0389,
"step": 653
},
{
"epoch": 1.7347480106100797,
"grad_norm": 1.40192347825387,
"learning_rate": 4.277648037430998e-07,
"loss": 0.039,
"step": 654
},
{
"epoch": 1.7374005305039788,
"grad_norm": 1.183366553658279,
"learning_rate": 4.193733479801232e-07,
"loss": 0.0288,
"step": 655
},
{
"epoch": 1.740053050397878,
"grad_norm": 1.3787704383311228,
"learning_rate": 4.110614131234375e-07,
"loss": 0.0391,
"step": 656
},
{
"epoch": 1.7427055702917773,
"grad_norm": 1.5234821215536667,
"learning_rate": 4.028291434704601e-07,
"loss": 0.0402,
"step": 657
},
{
"epoch": 1.7453580901856764,
"grad_norm": 1.7063010359505684,
"learning_rate": 3.946766819355985e-07,
"loss": 0.0477,
"step": 658
},
{
"epoch": 1.7480106100795756,
"grad_norm": 1.6813393539550572,
"learning_rate": 3.866041700477691e-07,
"loss": 0.0451,
"step": 659
},
{
"epoch": 1.750663129973475,
"grad_norm": 1.7013026951961887,
"learning_rate": 3.786117479479423e-07,
"loss": 0.0375,
"step": 660
},
{
"epoch": 1.7533156498673739,
"grad_norm": 1.335932666043299,
"learning_rate": 3.7069955438670704e-07,
"loss": 0.0365,
"step": 661
},
{
"epoch": 1.7559681697612732,
"grad_norm": 1.3147540821216455,
"learning_rate": 3.62867726721865e-07,
"loss": 0.033,
"step": 662
},
{
"epoch": 1.7586206896551724,
"grad_norm": 1.6551386695113186,
"learning_rate": 3.5511640091604293e-07,
"loss": 0.0443,
"step": 663
},
{
"epoch": 1.7612732095490715,
"grad_norm": 1.2796382269681643,
"learning_rate": 3.474457115343344e-07,
"loss": 0.0341,
"step": 664
},
{
"epoch": 1.7639257294429709,
"grad_norm": 1.4215576260782057,
"learning_rate": 3.398557917419626e-07,
"loss": 0.0353,
"step": 665
},
{
"epoch": 1.76657824933687,
"grad_norm": 1.701821067856708,
"learning_rate": 3.3234677330196865e-07,
"loss": 0.0451,
"step": 666
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.182220074597278,
"learning_rate": 3.2491878657292643e-07,
"loss": 0.0366,
"step": 667
},
{
"epoch": 1.7718832891246685,
"grad_norm": 1.6450299221574771,
"learning_rate": 3.175719605066746e-07,
"loss": 0.0405,
"step": 668
},
{
"epoch": 1.7745358090185677,
"grad_norm": 1.9308466623075535,
"learning_rate": 3.1030642264608393e-07,
"loss": 0.0425,
"step": 669
},
{
"epoch": 1.7771883289124668,
"grad_norm": 2.0326661438505202,
"learning_rate": 3.0312229912283884e-07,
"loss": 0.048,
"step": 670
},
{
"epoch": 1.7798408488063662,
"grad_norm": 1.576985336913738,
"learning_rate": 2.96019714655249e-07,
"loss": 0.0385,
"step": 671
},
{
"epoch": 1.782493368700265,
"grad_norm": 1.4496388975425247,
"learning_rate": 2.88998792546083e-07,
"loss": 0.0366,
"step": 672
},
{
"epoch": 1.7851458885941645,
"grad_norm": 1.3474637091777282,
"learning_rate": 2.820596546804316e-07,
"loss": 0.0445,
"step": 673
},
{
"epoch": 1.7877984084880638,
"grad_norm": 1.6312145174464268,
"learning_rate": 2.7520242152358767e-07,
"loss": 0.047,
"step": 674
},
{
"epoch": 1.7904509283819627,
"grad_norm": 1.1102230652695808,
"learning_rate": 2.6842721211895516e-07,
"loss": 0.0309,
"step": 675
},
{
"epoch": 1.793103448275862,
"grad_norm": 1.6712723138079337,
"learning_rate": 2.617341440859883e-07,
"loss": 0.0391,
"step": 676
},
{
"epoch": 1.7957559681697612,
"grad_norm": 1.4567044509094966,
"learning_rate": 2.551233336181386e-07,
"loss": 0.0368,
"step": 677
},
{
"epoch": 1.7984084880636604,
"grad_norm": 1.4754171869798691,
"learning_rate": 2.485948954808493e-07,
"loss": 0.0386,
"step": 678
},
{
"epoch": 1.8010610079575597,
"grad_norm": 1.2965369200067902,
"learning_rate": 2.421489430095547e-07,
"loss": 0.0272,
"step": 679
},
{
"epoch": 1.8037135278514589,
"grad_norm": 1.4954201899471793,
"learning_rate": 2.357855881077181e-07,
"loss": 0.0436,
"step": 680
},
{
"epoch": 1.806366047745358,
"grad_norm": 1.081445215280861,
"learning_rate": 2.2950494124488687e-07,
"loss": 0.0355,
"step": 681
},
{
"epoch": 1.8090185676392574,
"grad_norm": 1.319457308590872,
"learning_rate": 2.2330711145477247e-07,
"loss": 0.0301,
"step": 682
},
{
"epoch": 1.8116710875331565,
"grad_norm": 1.5878396866106896,
"learning_rate": 2.1719220633336147e-07,
"loss": 0.0335,
"step": 683
},
{
"epoch": 1.8143236074270557,
"grad_norm": 1.4109191318255065,
"learning_rate": 2.1116033203704534e-07,
"loss": 0.0321,
"step": 684
},
{
"epoch": 1.816976127320955,
"grad_norm": 1.7498102959049076,
"learning_rate": 2.0521159328077856e-07,
"loss": 0.037,
"step": 685
},
{
"epoch": 1.819628647214854,
"grad_norm": 1.4473167506873907,
"learning_rate": 1.993460933362601e-07,
"loss": 0.0364,
"step": 686
},
{
"epoch": 1.8222811671087533,
"grad_norm": 1.2905934999941218,
"learning_rate": 1.935639340301415e-07,
"loss": 0.0304,
"step": 687
},
{
"epoch": 1.8249336870026527,
"grad_norm": 1.6396632866378924,
"learning_rate": 1.8786521574225837e-07,
"loss": 0.0492,
"step": 688
},
{
"epoch": 1.8275862068965516,
"grad_norm": 1.4628611231715085,
"learning_rate": 1.8225003740388546e-07,
"loss": 0.0384,
"step": 689
},
{
"epoch": 1.830238726790451,
"grad_norm": 1.4318855930579866,
"learning_rate": 1.7671849649602502e-07,
"loss": 0.0363,
"step": 690
},
{
"epoch": 1.83289124668435,
"grad_norm": 1.1555536281682646,
"learning_rate": 1.7127068904770948e-07,
"loss": 0.0316,
"step": 691
},
{
"epoch": 1.8355437665782492,
"grad_norm": 1.276197266535223,
"learning_rate": 1.6590670963433642e-07,
"loss": 0.0322,
"step": 692
},
{
"epoch": 1.8381962864721486,
"grad_norm": 1.540221628307776,
"learning_rate": 1.6062665137602572e-07,
"loss": 0.039,
"step": 693
},
{
"epoch": 1.8408488063660478,
"grad_norm": 1.488141776444817,
"learning_rate": 1.5543060593600334e-07,
"loss": 0.0497,
"step": 694
},
{
"epoch": 1.843501326259947,
"grad_norm": 1.4064075758646781,
"learning_rate": 1.5031866351901182e-07,
"loss": 0.0354,
"step": 695
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.7289030418499938,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.047,
"step": 696
},
{
"epoch": 1.8488063660477454,
"grad_norm": 1.4042197094833593,
"learning_rate": 1.403474412712874e-07,
"loss": 0.0352,
"step": 697
},
{
"epoch": 1.8514588859416445,
"grad_norm": 1.0019472105701877,
"learning_rate": 1.3548833454364641e-07,
"loss": 0.024,
"step": 698
},
{
"epoch": 1.854111405835544,
"grad_norm": 1.773377229970334,
"learning_rate": 1.3071367704221129e-07,
"loss": 0.0444,
"step": 699
},
{
"epoch": 1.8567639257294428,
"grad_norm": 1.1177532519030482,
"learning_rate": 1.260235516563163e-07,
"loss": 0.0363,
"step": 700
},
{
"epoch": 1.8594164456233422,
"grad_norm": 1.2900623934609892,
"learning_rate": 1.2141803980779464e-07,
"loss": 0.0317,
"step": 701
},
{
"epoch": 1.8620689655172413,
"grad_norm": 1.8304735271621675,
"learning_rate": 1.1689722144956672e-07,
"loss": 0.0432,
"step": 702
},
{
"epoch": 1.8647214854111405,
"grad_norm": 1.4188940996465451,
"learning_rate": 1.1246117506425014e-07,
"loss": 0.0392,
"step": 703
},
{
"epoch": 1.8673740053050398,
"grad_norm": 1.4825923835504944,
"learning_rate": 1.0810997766279974e-07,
"loss": 0.0382,
"step": 704
},
{
"epoch": 1.870026525198939,
"grad_norm": 1.1014643278241296,
"learning_rate": 1.0384370478316919e-07,
"loss": 0.0311,
"step": 705
},
{
"epoch": 1.8726790450928381,
"grad_norm": 1.3678036067136727,
"learning_rate": 9.966243048899704e-08,
"loss": 0.0331,
"step": 706
},
{
"epoch": 1.8753315649867375,
"grad_norm": 1.673390021703709,
"learning_rate": 9.556622736832665e-08,
"loss": 0.0481,
"step": 707
},
{
"epoch": 1.8779840848806366,
"grad_norm": 1.5656233630923257,
"learning_rate": 9.155516653234276e-08,
"loss": 0.0442,
"step": 708
},
{
"epoch": 1.8806366047745358,
"grad_norm": 1.320780746379846,
"learning_rate": 8.762931761413573e-08,
"loss": 0.0382,
"step": 709
},
{
"epoch": 1.8832891246684351,
"grad_norm": 1.8359864667941603,
"learning_rate": 8.378874876749433e-08,
"loss": 0.0369,
"step": 710
},
{
"epoch": 1.8859416445623343,
"grad_norm": 1.6235154801266736,
"learning_rate": 8.003352666572428e-08,
"loss": 0.0404,
"step": 711
},
{
"epoch": 1.8885941644562334,
"grad_norm": 1.8357177886604772,
"learning_rate": 7.636371650048658e-08,
"loss": 0.0458,
"step": 712
},
{
"epoch": 1.8912466843501328,
"grad_norm": 1.4570512705105305,
"learning_rate": 7.277938198066992e-08,
"loss": 0.0418,
"step": 713
},
{
"epoch": 1.8938992042440317,
"grad_norm": 1.2604162586852787,
"learning_rate": 6.928058533128112e-08,
"loss": 0.0323,
"step": 714
},
{
"epoch": 1.896551724137931,
"grad_norm": 1.6009023727823846,
"learning_rate": 6.58673872923693e-08,
"loss": 0.0359,
"step": 715
},
{
"epoch": 1.8992042440318302,
"grad_norm": 1.2976436667625482,
"learning_rate": 6.253984711796612e-08,
"loss": 0.0371,
"step": 716
},
{
"epoch": 1.9018567639257293,
"grad_norm": 1.2870991478173943,
"learning_rate": 5.929802257506112e-08,
"loss": 0.045,
"step": 717
},
{
"epoch": 1.9045092838196287,
"grad_norm": 1.6902715191719975,
"learning_rate": 5.6141969942596906e-08,
"loss": 0.0389,
"step": 718
},
{
"epoch": 1.9071618037135278,
"grad_norm": 1.7050325681735479,
"learning_rate": 5.307174401049275e-08,
"loss": 0.0334,
"step": 719
},
{
"epoch": 1.909814323607427,
"grad_norm": 1.5808736002449604,
"learning_rate": 5.0087398078694785e-08,
"loss": 0.0417,
"step": 720
},
{
"epoch": 1.9124668435013263,
"grad_norm": 1.3430964938486658,
"learning_rate": 4.718898395624671e-08,
"loss": 0.0339,
"step": 721
},
{
"epoch": 1.9151193633952255,
"grad_norm": 1.489257458379661,
"learning_rate": 4.437655196039559e-08,
"loss": 0.0429,
"step": 722
},
{
"epoch": 1.9177718832891246,
"grad_norm": 1.4210307447912756,
"learning_rate": 4.1650150915714674e-08,
"loss": 0.0421,
"step": 723
},
{
"epoch": 1.920424403183024,
"grad_norm": 1.2959156290024842,
"learning_rate": 3.900982815325582e-08,
"loss": 0.0355,
"step": 724
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.4827766464308814,
"learning_rate": 3.645562950973014e-08,
"loss": 0.0369,
"step": 725
},
{
"epoch": 1.9257294429708223,
"grad_norm": 1.572959594152032,
"learning_rate": 3.3987599326710806e-08,
"loss": 0.0523,
"step": 726
},
{
"epoch": 1.9283819628647216,
"grad_norm": 1.519959930918152,
"learning_rate": 3.160578044986373e-08,
"loss": 0.0354,
"step": 727
},
{
"epoch": 1.9310344827586206,
"grad_norm": 1.2687605705293235,
"learning_rate": 2.9310214228202016e-08,
"loss": 0.0367,
"step": 728
},
{
"epoch": 1.93368700265252,
"grad_norm": 1.2589757041102903,
"learning_rate": 2.7100940513370976e-08,
"loss": 0.037,
"step": 729
},
{
"epoch": 1.936339522546419,
"grad_norm": 1.3802142564158149,
"learning_rate": 2.4977997658954257e-08,
"loss": 0.0384,
"step": 730
},
{
"epoch": 1.9389920424403182,
"grad_norm": 1.1728085570996578,
"learning_rate": 2.29414225198088e-08,
"loss": 0.0332,
"step": 731
},
{
"epoch": 1.9416445623342176,
"grad_norm": 1.4829028934918602,
"learning_rate": 2.0991250451424806e-08,
"loss": 0.0382,
"step": 732
},
{
"epoch": 1.9442970822281167,
"grad_norm": 1.0525766488908528,
"learning_rate": 1.912751530931234e-08,
"loss": 0.0307,
"step": 733
},
{
"epoch": 1.9469496021220158,
"grad_norm": 1.7747299163019683,
"learning_rate": 1.735024944841235e-08,
"loss": 0.0412,
"step": 734
},
{
"epoch": 1.9496021220159152,
"grad_norm": 1.3308814219146978,
"learning_rate": 1.5659483722537117e-08,
"loss": 0.0332,
"step": 735
},
{
"epoch": 1.9522546419098143,
"grad_norm": 1.6064157720427565,
"learning_rate": 1.4055247483832356e-08,
"loss": 0.0434,
"step": 736
},
{
"epoch": 1.9549071618037135,
"grad_norm": 1.334355441410993,
"learning_rate": 1.2537568582269289e-08,
"loss": 0.0407,
"step": 737
},
{
"epoch": 1.9575596816976129,
"grad_norm": 1.8576856489215392,
"learning_rate": 1.110647336516002e-08,
"loss": 0.041,
"step": 738
},
{
"epoch": 1.9602122015915118,
"grad_norm": 1.3915797639255931,
"learning_rate": 9.761986676701251e-09,
"loss": 0.0338,
"step": 739
},
{
"epoch": 1.9628647214854111,
"grad_norm": 1.696616311407856,
"learning_rate": 8.504131857542952e-09,
"loss": 0.0438,
"step": 740
},
{
"epoch": 1.9655172413793105,
"grad_norm": 1.2509363269884906,
"learning_rate": 7.332930744380906e-09,
"loss": 0.0359,
"step": 741
},
{
"epoch": 1.9681697612732094,
"grad_norm": 1.5867898086364787,
"learning_rate": 6.24840366958035e-09,
"loss": 0.0337,
"step": 742
},
{
"epoch": 1.9708222811671088,
"grad_norm": 1.0944506086655037,
"learning_rate": 5.250569460822363e-09,
"loss": 0.0304,
"step": 743
},
{
"epoch": 1.973474801061008,
"grad_norm": 1.6628593056793242,
"learning_rate": 4.339445440776358e-09,
"loss": 0.0377,
"step": 744
},
{
"epoch": 1.976127320954907,
"grad_norm": 1.7268640463935314,
"learning_rate": 3.5150474267992007e-09,
"loss": 0.0435,
"step": 745
},
{
"epoch": 1.9787798408488064,
"grad_norm": 1.469911347318018,
"learning_rate": 2.7773897306615504e-09,
"loss": 0.0377,
"step": 746
},
{
"epoch": 1.9814323607427056,
"grad_norm": 1.420365368396037,
"learning_rate": 2.126485158298608e-09,
"loss": 0.0384,
"step": 747
},
{
"epoch": 1.9840848806366047,
"grad_norm": 1.1547923580578376,
"learning_rate": 1.5623450095880731e-09,
"loss": 0.0289,
"step": 748
},
{
"epoch": 1.986737400530504,
"grad_norm": 1.7325651456934643,
"learning_rate": 1.0849790781541913e-09,
"loss": 0.0369,
"step": 749
},
{
"epoch": 1.9893899204244032,
"grad_norm": 1.6534729465614353,
"learning_rate": 6.943956511973326e-10,
"loss": 0.0513,
"step": 750
},
{
"epoch": 1.9920424403183024,
"grad_norm": 1.2257336512995223,
"learning_rate": 3.9060150935077425e-10,
"loss": 0.0341,
"step": 751
},
{
"epoch": 1.9946949602122017,
"grad_norm": 1.5013042404201287,
"learning_rate": 1.7360192656246112e-10,
"loss": 0.0428,
"step": 752
},
{
"epoch": 1.9973474801061006,
"grad_norm": 1.5234604261794584,
"learning_rate": 4.340067000230264e-11,
"loss": 0.0356,
"step": 753
},
{
"epoch": 2.0,
"grad_norm": 1.1204706677910448,
"learning_rate": 0.0,
"loss": 0.0257,
"step": 754
},
{
"epoch": 2.0,
"step": 754,
"total_flos": 7513536872448.0,
"train_loss": 0.09835794550769367,
"train_runtime": 628.4293,
"train_samples_per_second": 9.589,
"train_steps_per_second": 1.2
}
],
"logging_steps": 1,
"max_steps": 754,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7513536872448.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}