{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 1004,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00199203187250996,
"grad_norm": 5.302925716458761,
"learning_rate": 9.99997552222299e-06,
"loss": 0.4962,
"step": 1
},
{
"epoch": 0.00398406374501992,
"grad_norm": 4.0068584615292755,
"learning_rate": 9.999902089131626e-06,
"loss": 0.5131,
"step": 2
},
{
"epoch": 0.00597609561752988,
"grad_norm": 3.2685057084225533,
"learning_rate": 9.999779701444897e-06,
"loss": 0.3642,
"step": 3
},
{
"epoch": 0.00796812749003984,
"grad_norm": 2.969388810564799,
"learning_rate": 9.999608360361114e-06,
"loss": 0.4366,
"step": 4
},
{
"epoch": 0.0099601593625498,
"grad_norm": 2.3257277817224877,
"learning_rate": 9.9993880675579e-06,
"loss": 0.2805,
"step": 5
},
{
"epoch": 0.01195219123505976,
"grad_norm": 2.316100445863212,
"learning_rate": 9.999118825192162e-06,
"loss": 0.3663,
"step": 6
},
{
"epoch": 0.013944223107569721,
"grad_norm": 2.0546227682793825,
"learning_rate": 9.998800635900085e-06,
"loss": 0.2761,
"step": 7
},
{
"epoch": 0.01593625498007968,
"grad_norm": 2.6542621051493294,
"learning_rate": 9.998433502797097e-06,
"loss": 0.3528,
"step": 8
},
{
"epoch": 0.017928286852589643,
"grad_norm": 2.184358255514989,
"learning_rate": 9.998017429477834e-06,
"loss": 0.2891,
"step": 9
},
{
"epoch": 0.0199203187250996,
"grad_norm": 2.210497121507057,
"learning_rate": 9.99755242001612e-06,
"loss": 0.3052,
"step": 10
},
{
"epoch": 0.021912350597609563,
"grad_norm": 2.391291694670009,
"learning_rate": 9.99703847896491e-06,
"loss": 0.3088,
"step": 11
},
{
"epoch": 0.02390438247011952,
"grad_norm": 1.8867370891248774,
"learning_rate": 9.996475611356265e-06,
"loss": 0.2799,
"step": 12
},
{
"epoch": 0.025896414342629483,
"grad_norm": 1.472434411377998,
"learning_rate": 9.995863822701278e-06,
"loss": 0.1877,
"step": 13
},
{
"epoch": 0.027888446215139442,
"grad_norm": 1.550344203694589,
"learning_rate": 9.99520311899004e-06,
"loss": 0.1703,
"step": 14
},
{
"epoch": 0.029880478087649404,
"grad_norm": 2.6379971141320264,
"learning_rate": 9.994493506691577e-06,
"loss": 0.2986,
"step": 15
},
{
"epoch": 0.03187250996015936,
"grad_norm": 2.1294276467465734,
"learning_rate": 9.993734992753777e-06,
"loss": 0.2666,
"step": 16
},
{
"epoch": 0.03386454183266932,
"grad_norm": 1.9769257581715476,
"learning_rate": 9.992927584603339e-06,
"loss": 0.2376,
"step": 17
},
{
"epoch": 0.035856573705179286,
"grad_norm": 2.262018155817254,
"learning_rate": 9.992071290145684e-06,
"loss": 0.2436,
"step": 18
},
{
"epoch": 0.037848605577689244,
"grad_norm": 2.1357744572584605,
"learning_rate": 9.991166117764885e-06,
"loss": 0.2337,
"step": 19
},
{
"epoch": 0.0398406374501992,
"grad_norm": 2.065609976313799,
"learning_rate": 9.990212076323587e-06,
"loss": 0.2614,
"step": 20
},
{
"epoch": 0.04183266932270916,
"grad_norm": 2.0110212882357894,
"learning_rate": 9.989209175162912e-06,
"loss": 0.2282,
"step": 21
},
{
"epoch": 0.043824701195219126,
"grad_norm": 2.0333703620396433,
"learning_rate": 9.988157424102381e-06,
"loss": 0.2166,
"step": 22
},
{
"epoch": 0.045816733067729085,
"grad_norm": 1.7747686699151117,
"learning_rate": 9.9870568334398e-06,
"loss": 0.1755,
"step": 23
},
{
"epoch": 0.04780876494023904,
"grad_norm": 1.610758994875989,
"learning_rate": 9.98590741395118e-06,
"loss": 0.1979,
"step": 24
},
{
"epoch": 0.049800796812749,
"grad_norm": 1.8977306151271889,
"learning_rate": 9.98470917689061e-06,
"loss": 0.2244,
"step": 25
},
{
"epoch": 0.05179282868525897,
"grad_norm": 1.7649852344717443,
"learning_rate": 9.983462133990163e-06,
"loss": 0.1947,
"step": 26
},
{
"epoch": 0.053784860557768925,
"grad_norm": 2.1142885695917877,
"learning_rate": 9.982166297459775e-06,
"loss": 0.2156,
"step": 27
},
{
"epoch": 0.055776892430278883,
"grad_norm": 2.278915052614827,
"learning_rate": 9.980821679987125e-06,
"loss": 0.1901,
"step": 28
},
{
"epoch": 0.05776892430278884,
"grad_norm": 2.038139109307458,
"learning_rate": 9.979428294737509e-06,
"loss": 0.2118,
"step": 29
},
{
"epoch": 0.05976095617529881,
"grad_norm": 2.133948012877493,
"learning_rate": 9.97798615535372e-06,
"loss": 0.2146,
"step": 30
},
{
"epoch": 0.061752988047808766,
"grad_norm": 2.176892858037584,
"learning_rate": 9.976495275955904e-06,
"loss": 0.2097,
"step": 31
},
{
"epoch": 0.06374501992031872,
"grad_norm": 1.9943827766271132,
"learning_rate": 9.974955671141425e-06,
"loss": 0.1968,
"step": 32
},
{
"epoch": 0.06573705179282868,
"grad_norm": 1.9835967650496324,
"learning_rate": 9.973367355984724e-06,
"loss": 0.2116,
"step": 33
},
{
"epoch": 0.06772908366533864,
"grad_norm": 2.0204571026763434,
"learning_rate": 9.971730346037172e-06,
"loss": 0.2026,
"step": 34
},
{
"epoch": 0.0697211155378486,
"grad_norm": 1.9322301469201182,
"learning_rate": 9.970044657326913e-06,
"loss": 0.2053,
"step": 35
},
{
"epoch": 0.07171314741035857,
"grad_norm": 1.947308039962251,
"learning_rate": 9.968310306358715e-06,
"loss": 0.1917,
"step": 36
},
{
"epoch": 0.07370517928286853,
"grad_norm": 2.046915884396975,
"learning_rate": 9.966527310113798e-06,
"loss": 0.2365,
"step": 37
},
{
"epoch": 0.07569721115537849,
"grad_norm": 2.09697417316823,
"learning_rate": 9.964695686049676e-06,
"loss": 0.2343,
"step": 38
},
{
"epoch": 0.07768924302788845,
"grad_norm": 2.6849483307841076,
"learning_rate": 9.962815452099985e-06,
"loss": 0.2169,
"step": 39
},
{
"epoch": 0.0796812749003984,
"grad_norm": 1.8234840582625904,
"learning_rate": 9.960886626674302e-06,
"loss": 0.2119,
"step": 40
},
{
"epoch": 0.08167330677290836,
"grad_norm": 1.7256360371957784,
"learning_rate": 9.95890922865797e-06,
"loss": 0.1769,
"step": 41
},
{
"epoch": 0.08366533864541832,
"grad_norm": 1.682991846433481,
"learning_rate": 9.956883277411914e-06,
"loss": 0.1724,
"step": 42
},
{
"epoch": 0.08565737051792828,
"grad_norm": 2.167281430513665,
"learning_rate": 9.954808792772447e-06,
"loss": 0.2185,
"step": 43
},
{
"epoch": 0.08764940239043825,
"grad_norm": 1.7035143668249393,
"learning_rate": 9.952685795051078e-06,
"loss": 0.1701,
"step": 44
},
{
"epoch": 0.08964143426294821,
"grad_norm": 1.9522204858955863,
"learning_rate": 9.95051430503431e-06,
"loss": 0.1977,
"step": 45
},
{
"epoch": 0.09163346613545817,
"grad_norm": 1.9293284789264147,
"learning_rate": 9.948294343983446e-06,
"loss": 0.1995,
"step": 46
},
{
"epoch": 0.09362549800796813,
"grad_norm": 2.221989822189503,
"learning_rate": 9.94602593363437e-06,
"loss": 0.2135,
"step": 47
},
{
"epoch": 0.09561752988047809,
"grad_norm": 2.5630931672539123,
"learning_rate": 9.943709096197334e-06,
"loss": 0.1935,
"step": 48
},
{
"epoch": 0.09760956175298804,
"grad_norm": 2.2472057427928256,
"learning_rate": 9.941343854356757e-06,
"loss": 0.1932,
"step": 49
},
{
"epoch": 0.099601593625498,
"grad_norm": 2.2907511035454076,
"learning_rate": 9.938930231270982e-06,
"loss": 0.2136,
"step": 50
},
{
"epoch": 0.10159362549800798,
"grad_norm": 2.1818748427349854,
"learning_rate": 9.93646825057206e-06,
"loss": 0.2028,
"step": 51
},
{
"epoch": 0.10358565737051793,
"grad_norm": 1.9532878719288238,
"learning_rate": 9.933957936365515e-06,
"loss": 0.1867,
"step": 52
},
{
"epoch": 0.10557768924302789,
"grad_norm": 2.2638694550211667,
"learning_rate": 9.931399313230112e-06,
"loss": 0.2124,
"step": 53
},
{
"epoch": 0.10756972111553785,
"grad_norm": 2.0453202854269597,
"learning_rate": 9.928792406217615e-06,
"loss": 0.2013,
"step": 54
},
{
"epoch": 0.10956175298804781,
"grad_norm": 2.092666084016267,
"learning_rate": 9.926137240852539e-06,
"loss": 0.1815,
"step": 55
},
{
"epoch": 0.11155378486055777,
"grad_norm": 1.7643433151250012,
"learning_rate": 9.9234338431319e-06,
"loss": 0.1758,
"step": 56
},
{
"epoch": 0.11354581673306773,
"grad_norm": 1.7711425457667331,
"learning_rate": 9.920682239524968e-06,
"loss": 0.1732,
"step": 57
},
{
"epoch": 0.11553784860557768,
"grad_norm": 1.97738896833117,
"learning_rate": 9.917882456972999e-06,
"loss": 0.1995,
"step": 58
},
{
"epoch": 0.11752988047808766,
"grad_norm": 1.4375108861286383,
"learning_rate": 9.915034522888972e-06,
"loss": 0.1485,
"step": 59
},
{
"epoch": 0.11952191235059761,
"grad_norm": 2.0375847131853293,
"learning_rate": 9.912138465157325e-06,
"loss": 0.2021,
"step": 60
},
{
"epoch": 0.12151394422310757,
"grad_norm": 2.168757999506768,
"learning_rate": 9.909194312133681e-06,
"loss": 0.172,
"step": 61
},
{
"epoch": 0.12350597609561753,
"grad_norm": 1.859742755561109,
"learning_rate": 9.90620209264457e-06,
"loss": 0.1538,
"step": 62
},
{
"epoch": 0.1254980079681275,
"grad_norm": 1.800175258878161,
"learning_rate": 9.90316183598714e-06,
"loss": 0.1643,
"step": 63
},
{
"epoch": 0.12749003984063745,
"grad_norm": 1.780289250472305,
"learning_rate": 9.900073571928887e-06,
"loss": 0.1448,
"step": 64
},
{
"epoch": 0.1294820717131474,
"grad_norm": 1.8989241084218624,
"learning_rate": 9.896937330707341e-06,
"loss": 0.1661,
"step": 65
},
{
"epoch": 0.13147410358565736,
"grad_norm": 1.7382723265927287,
"learning_rate": 9.893753143029792e-06,
"loss": 0.1794,
"step": 66
},
{
"epoch": 0.13346613545816732,
"grad_norm": 1.6940222793391266,
"learning_rate": 9.89052104007297e-06,
"loss": 0.154,
"step": 67
},
{
"epoch": 0.13545816733067728,
"grad_norm": 2.091556995024388,
"learning_rate": 9.887241053482756e-06,
"loss": 0.1795,
"step": 68
},
{
"epoch": 0.13745019920318724,
"grad_norm": 2.378952761035815,
"learning_rate": 9.883913215373862e-06,
"loss": 0.172,
"step": 69
},
{
"epoch": 0.1394422310756972,
"grad_norm": 2.0726149359464143,
"learning_rate": 9.880537558329518e-06,
"loss": 0.16,
"step": 70
},
{
"epoch": 0.14143426294820718,
"grad_norm": 1.8509199481149878,
"learning_rate": 9.877114115401159e-06,
"loss": 0.151,
"step": 71
},
{
"epoch": 0.14342629482071714,
"grad_norm": 1.9383364231106874,
"learning_rate": 9.87364292010809e-06,
"loss": 0.161,
"step": 72
},
{
"epoch": 0.1454183266932271,
"grad_norm": 1.9835120874739343,
"learning_rate": 9.870124006437172e-06,
"loss": 0.1885,
"step": 73
},
{
"epoch": 0.14741035856573706,
"grad_norm": 1.7057694570966788,
"learning_rate": 9.866557408842479e-06,
"loss": 0.1619,
"step": 74
},
{
"epoch": 0.14940239043824702,
"grad_norm": 1.7963768846634989,
"learning_rate": 9.86294316224496e-06,
"loss": 0.1438,
"step": 75
},
{
"epoch": 0.15139442231075698,
"grad_norm": 1.8886735096348262,
"learning_rate": 9.859281302032107e-06,
"loss": 0.1753,
"step": 76
},
{
"epoch": 0.15338645418326693,
"grad_norm": 2.099235241932376,
"learning_rate": 9.855571864057598e-06,
"loss": 0.1837,
"step": 77
},
{
"epoch": 0.1553784860557769,
"grad_norm": 2.1695503922620683,
"learning_rate": 9.85181488464095e-06,
"loss": 0.1776,
"step": 78
},
{
"epoch": 0.15737051792828685,
"grad_norm": 2.2241373204323884,
"learning_rate": 9.848010400567167e-06,
"loss": 0.1762,
"step": 79
},
{
"epoch": 0.1593625498007968,
"grad_norm": 2.0073012916773556,
"learning_rate": 9.844158449086372e-06,
"loss": 0.1643,
"step": 80
},
{
"epoch": 0.16135458167330677,
"grad_norm": 1.900677106194213,
"learning_rate": 9.84025906791345e-06,
"loss": 0.1647,
"step": 81
},
{
"epoch": 0.16334661354581673,
"grad_norm": 2.2420248448336646,
"learning_rate": 9.836312295227674e-06,
"loss": 0.1904,
"step": 82
},
{
"epoch": 0.16533864541832669,
"grad_norm": 2.0352049260820966,
"learning_rate": 9.832318169672334e-06,
"loss": 0.1652,
"step": 83
},
{
"epoch": 0.16733067729083664,
"grad_norm": 2.1040895543260834,
"learning_rate": 9.828276730354353e-06,
"loss": 0.1681,
"step": 84
},
{
"epoch": 0.1693227091633466,
"grad_norm": 2.0323467551141543,
"learning_rate": 9.824188016843915e-06,
"loss": 0.1646,
"step": 85
},
{
"epoch": 0.17131474103585656,
"grad_norm": 1.7648807229029235,
"learning_rate": 9.820052069174062e-06,
"loss": 0.1581,
"step": 86
},
{
"epoch": 0.17330677290836655,
"grad_norm": 2.256460995644407,
"learning_rate": 9.81586892784032e-06,
"loss": 0.1937,
"step": 87
},
{
"epoch": 0.1752988047808765,
"grad_norm": 2.1166732483303745,
"learning_rate": 9.811638633800287e-06,
"loss": 0.189,
"step": 88
},
{
"epoch": 0.17729083665338646,
"grad_norm": 2.1883580991082527,
"learning_rate": 9.807361228473241e-06,
"loss": 0.2019,
"step": 89
},
{
"epoch": 0.17928286852589642,
"grad_norm": 1.8188619314670527,
"learning_rate": 9.803036753739733e-06,
"loss": 0.1633,
"step": 90
},
{
"epoch": 0.18127490039840638,
"grad_norm": 2.130272928076839,
"learning_rate": 9.798665251941172e-06,
"loss": 0.1858,
"step": 91
},
{
"epoch": 0.18326693227091634,
"grad_norm": 1.9280374177349535,
"learning_rate": 9.794246765879421e-06,
"loss": 0.1367,
"step": 92
},
{
"epoch": 0.1852589641434263,
"grad_norm": 2.0068283142732564,
"learning_rate": 9.789781338816362e-06,
"loss": 0.1552,
"step": 93
},
{
"epoch": 0.18725099601593626,
"grad_norm": 2.1363237967962387,
"learning_rate": 9.785269014473487e-06,
"loss": 0.1635,
"step": 94
},
{
"epoch": 0.1892430278884462,
"grad_norm": 2.2218212298896978,
"learning_rate": 9.780709837031464e-06,
"loss": 0.1625,
"step": 95
},
{
"epoch": 0.19123505976095617,
"grad_norm": 1.9767577152960487,
"learning_rate": 9.776103851129706e-06,
"loss": 0.1632,
"step": 96
},
{
"epoch": 0.19322709163346613,
"grad_norm": 2.3121968917141094,
"learning_rate": 9.77145110186593e-06,
"loss": 0.1621,
"step": 97
},
{
"epoch": 0.1952191235059761,
"grad_norm": 1.7755655818003055,
"learning_rate": 9.766751634795719e-06,
"loss": 0.1669,
"step": 98
},
{
"epoch": 0.19721115537848605,
"grad_norm": 2.240571790906682,
"learning_rate": 9.762005495932076e-06,
"loss": 0.1637,
"step": 99
},
{
"epoch": 0.199203187250996,
"grad_norm": 1.6446940187832602,
"learning_rate": 9.757212731744973e-06,
"loss": 0.1302,
"step": 100
},
{
"epoch": 0.20119521912350596,
"grad_norm": 2.0617960985029504,
"learning_rate": 9.752373389160896e-06,
"loss": 0.1707,
"step": 101
},
{
"epoch": 0.20318725099601595,
"grad_norm": 2.1835252538751413,
"learning_rate": 9.747487515562384e-06,
"loss": 0.1649,
"step": 102
},
{
"epoch": 0.2051792828685259,
"grad_norm": 2.0129569799572407,
"learning_rate": 9.742555158787567e-06,
"loss": 0.1637,
"step": 103
},
{
"epoch": 0.20717131474103587,
"grad_norm": 2.524863713870793,
"learning_rate": 9.737576367129694e-06,
"loss": 0.1816,
"step": 104
},
{
"epoch": 0.20916334661354583,
"grad_norm": 1.9649026368356415,
"learning_rate": 9.73255118933667e-06,
"loss": 0.1216,
"step": 105
},
{
"epoch": 0.21115537848605578,
"grad_norm": 1.6210946660994265,
"learning_rate": 9.727479674610565e-06,
"loss": 0.1424,
"step": 106
},
{
"epoch": 0.21314741035856574,
"grad_norm": 2.371276035809818,
"learning_rate": 9.722361872607142e-06,
"loss": 0.1943,
"step": 107
},
{
"epoch": 0.2151394422310757,
"grad_norm": 1.760306649155103,
"learning_rate": 9.717197833435367e-06,
"loss": 0.1588,
"step": 108
},
{
"epoch": 0.21713147410358566,
"grad_norm": 1.850921085250863,
"learning_rate": 9.71198760765692e-06,
"loss": 0.135,
"step": 109
},
{
"epoch": 0.21912350597609562,
"grad_norm": 1.91926012170198,
"learning_rate": 9.706731246285701e-06,
"loss": 0.146,
"step": 110
},
{
"epoch": 0.22111553784860558,
"grad_norm": 1.9074743120890272,
"learning_rate": 9.701428800787325e-06,
"loss": 0.1546,
"step": 111
},
{
"epoch": 0.22310756972111553,
"grad_norm": 1.8574536892339737,
"learning_rate": 9.696080323078621e-06,
"loss": 0.15,
"step": 112
},
{
"epoch": 0.2250996015936255,
"grad_norm": 1.9674896794532148,
"learning_rate": 9.690685865527132e-06,
"loss": 0.1605,
"step": 113
},
{
"epoch": 0.22709163346613545,
"grad_norm": 1.683408319737727,
"learning_rate": 9.685245480950584e-06,
"loss": 0.1483,
"step": 114
},
{
"epoch": 0.2290836653386454,
"grad_norm": 1.8357325701673104,
"learning_rate": 9.679759222616389e-06,
"loss": 0.1342,
"step": 115
},
{
"epoch": 0.23107569721115537,
"grad_norm": 1.7645234238909926,
"learning_rate": 9.67422714424111e-06,
"loss": 0.1496,
"step": 116
},
{
"epoch": 0.23306772908366533,
"grad_norm": 2.3580150329396097,
"learning_rate": 9.668649299989939e-06,
"loss": 0.1602,
"step": 117
},
{
"epoch": 0.2350597609561753,
"grad_norm": 1.9878477683225815,
"learning_rate": 9.663025744476167e-06,
"loss": 0.1592,
"step": 118
},
{
"epoch": 0.23705179282868527,
"grad_norm": 2.1066728943604685,
"learning_rate": 9.657356532760647e-06,
"loss": 0.1586,
"step": 119
},
{
"epoch": 0.23904382470119523,
"grad_norm": 2.4857773499345814,
"learning_rate": 9.651641720351262e-06,
"loss": 0.1706,
"step": 120
},
{
"epoch": 0.2410358565737052,
"grad_norm": 1.9669140495950204,
"learning_rate": 9.645881363202371e-06,
"loss": 0.135,
"step": 121
},
{
"epoch": 0.24302788844621515,
"grad_norm": 2.129283733459999,
"learning_rate": 9.640075517714272e-06,
"loss": 0.1785,
"step": 122
},
{
"epoch": 0.2450199203187251,
"grad_norm": 1.9937349989457298,
"learning_rate": 9.634224240732641e-06,
"loss": 0.1453,
"step": 123
},
{
"epoch": 0.24701195219123506,
"grad_norm": 2.1385569770971298,
"learning_rate": 9.628327589547977e-06,
"loss": 0.1475,
"step": 124
},
{
"epoch": 0.24900398406374502,
"grad_norm": 2.28108046376594,
"learning_rate": 9.622385621895046e-06,
"loss": 0.1813,
"step": 125
},
{
"epoch": 0.250996015936255,
"grad_norm": 2.148801989847443,
"learning_rate": 9.616398395952313e-06,
"loss": 0.1683,
"step": 126
},
{
"epoch": 0.25298804780876494,
"grad_norm": 1.8652803065183108,
"learning_rate": 9.610365970341369e-06,
"loss": 0.1494,
"step": 127
},
{
"epoch": 0.2549800796812749,
"grad_norm": 1.8989160712067141,
"learning_rate": 9.604288404126362e-06,
"loss": 0.146,
"step": 128
},
{
"epoch": 0.25697211155378485,
"grad_norm": 1.7384157916220273,
"learning_rate": 9.598165756813418e-06,
"loss": 0.1359,
"step": 129
},
{
"epoch": 0.2589641434262948,
"grad_norm": 2.2085278004769324,
"learning_rate": 9.591998088350055e-06,
"loss": 0.1784,
"step": 130
},
{
"epoch": 0.26095617529880477,
"grad_norm": 1.6343923035740249,
"learning_rate": 9.585785459124595e-06,
"loss": 0.1376,
"step": 131
},
{
"epoch": 0.26294820717131473,
"grad_norm": 2.1234396006125396,
"learning_rate": 9.579527929965581e-06,
"loss": 0.1702,
"step": 132
},
{
"epoch": 0.2649402390438247,
"grad_norm": 1.7801773475854494,
"learning_rate": 9.573225562141174e-06,
"loss": 0.1474,
"step": 133
},
{
"epoch": 0.26693227091633465,
"grad_norm": 1.4560862756744588,
"learning_rate": 9.566878417358559e-06,
"loss": 0.1246,
"step": 134
},
{
"epoch": 0.2689243027888446,
"grad_norm": 2.025058996733023,
"learning_rate": 9.56048655776333e-06,
"loss": 0.1639,
"step": 135
},
{
"epoch": 0.27091633466135456,
"grad_norm": 1.8323049535503093,
"learning_rate": 9.554050045938893e-06,
"loss": 0.1452,
"step": 136
},
{
"epoch": 0.2729083665338645,
"grad_norm": 1.8874907449560725,
"learning_rate": 9.54756894490585e-06,
"loss": 0.1502,
"step": 137
},
{
"epoch": 0.2749003984063745,
"grad_norm": 2.5711934437137316,
"learning_rate": 9.541043318121379e-06,
"loss": 0.1767,
"step": 138
},
{
"epoch": 0.27689243027888444,
"grad_norm": 2.6684694094439223,
"learning_rate": 9.534473229478613e-06,
"loss": 0.1809,
"step": 139
},
{
"epoch": 0.2788844621513944,
"grad_norm": 1.8024037893437692,
"learning_rate": 9.52785874330602e-06,
"loss": 0.139,
"step": 140
},
{
"epoch": 0.28087649402390436,
"grad_norm": 2.0746711127055155,
"learning_rate": 9.521199924366766e-06,
"loss": 0.1645,
"step": 141
},
{
"epoch": 0.28286852589641437,
"grad_norm": 1.8375760908575898,
"learning_rate": 9.514496837858085e-06,
"loss": 0.1409,
"step": 142
},
{
"epoch": 0.2848605577689243,
"grad_norm": 2.155790335928457,
"learning_rate": 9.507749549410641e-06,
"loss": 0.1527,
"step": 143
},
{
"epoch": 0.2868525896414343,
"grad_norm": 1.9735656021269132,
"learning_rate": 9.500958125087882e-06,
"loss": 0.1454,
"step": 144
},
{
"epoch": 0.28884462151394424,
"grad_norm": 1.9447279614136819,
"learning_rate": 9.494122631385397e-06,
"loss": 0.147,
"step": 145
},
{
"epoch": 0.2908366533864542,
"grad_norm": 1.8420179664244611,
"learning_rate": 9.487243135230259e-06,
"loss": 0.1266,
"step": 146
},
{
"epoch": 0.29282868525896416,
"grad_norm": 2.0895817258939613,
"learning_rate": 9.480319703980382e-06,
"loss": 0.1752,
"step": 147
},
{
"epoch": 0.2948207171314741,
"grad_norm": 2.368984572890111,
"learning_rate": 9.473352405423845e-06,
"loss": 0.1716,
"step": 148
},
{
"epoch": 0.2968127490039841,
"grad_norm": 2.3614919475438336,
"learning_rate": 9.466341307778239e-06,
"loss": 0.1853,
"step": 149
},
{
"epoch": 0.29880478087649404,
"grad_norm": 1.7497898226508322,
"learning_rate": 9.459286479690002e-06,
"loss": 0.1487,
"step": 150
},
{
"epoch": 0.300796812749004,
"grad_norm": 2.319602310333304,
"learning_rate": 9.452187990233737e-06,
"loss": 0.1564,
"step": 151
},
{
"epoch": 0.30278884462151395,
"grad_norm": 1.9314144164174814,
"learning_rate": 9.445045908911536e-06,
"loss": 0.1574,
"step": 152
},
{
"epoch": 0.3047808764940239,
"grad_norm": 2.0498544875744726,
"learning_rate": 9.437860305652314e-06,
"loss": 0.1609,
"step": 153
},
{
"epoch": 0.30677290836653387,
"grad_norm": 2.1271214704509895,
"learning_rate": 9.430631250811107e-06,
"loss": 0.1453,
"step": 154
},
{
"epoch": 0.30876494023904383,
"grad_norm": 1.5971982417533808,
"learning_rate": 9.42335881516839e-06,
"loss": 0.1293,
"step": 155
},
{
"epoch": 0.3107569721115538,
"grad_norm": 1.7060046369161204,
"learning_rate": 9.416043069929389e-06,
"loss": 0.1308,
"step": 156
},
{
"epoch": 0.31274900398406374,
"grad_norm": 2.2558369441074744,
"learning_rate": 9.408684086723375e-06,
"loss": 0.1894,
"step": 157
},
{
"epoch": 0.3147410358565737,
"grad_norm": 2.040622936675236,
"learning_rate": 9.401281937602966e-06,
"loss": 0.1536,
"step": 158
},
{
"epoch": 0.31673306772908366,
"grad_norm": 2.021178408176851,
"learning_rate": 9.393836695043429e-06,
"loss": 0.1316,
"step": 159
},
{
"epoch": 0.3187250996015936,
"grad_norm": 1.9053653174415077,
"learning_rate": 9.386348431941953e-06,
"loss": 0.1445,
"step": 160
},
{
"epoch": 0.3207171314741036,
"grad_norm": 2.3821069319286576,
"learning_rate": 9.378817221616955e-06,
"loss": 0.1498,
"step": 161
},
{
"epoch": 0.32270916334661354,
"grad_norm": 1.987196571819016,
"learning_rate": 9.371243137807353e-06,
"loss": 0.1351,
"step": 162
},
{
"epoch": 0.3247011952191235,
"grad_norm": 2.012726967199412,
"learning_rate": 9.363626254671835e-06,
"loss": 0.1263,
"step": 163
},
{
"epoch": 0.32669322709163345,
"grad_norm": 1.8276010793351352,
"learning_rate": 9.355966646788152e-06,
"loss": 0.1357,
"step": 164
},
{
"epoch": 0.3286852589641434,
"grad_norm": 1.8020618013754388,
"learning_rate": 9.34826438915237e-06,
"loss": 0.1235,
"step": 165
},
{
"epoch": 0.33067729083665337,
"grad_norm": 1.854178246940745,
"learning_rate": 9.340519557178149e-06,
"loss": 0.1481,
"step": 166
},
{
"epoch": 0.33266932270916333,
"grad_norm": 1.9660957079874257,
"learning_rate": 9.332732226695997e-06,
"loss": 0.1354,
"step": 167
},
{
"epoch": 0.3346613545816733,
"grad_norm": 1.903678628134176,
"learning_rate": 9.324902473952529e-06,
"loss": 0.154,
"step": 168
},
{
"epoch": 0.33665338645418325,
"grad_norm": 1.8793502875666162,
"learning_rate": 9.317030375609721e-06,
"loss": 0.1415,
"step": 169
},
{
"epoch": 0.3386454183266932,
"grad_norm": 2.2655417937058355,
"learning_rate": 9.309116008744164e-06,
"loss": 0.1805,
"step": 170
},
{
"epoch": 0.34063745019920316,
"grad_norm": 2.224518864361946,
"learning_rate": 9.301159450846296e-06,
"loss": 0.1555,
"step": 171
},
{
"epoch": 0.3426294820717131,
"grad_norm": 2.594561985506128,
"learning_rate": 9.293160779819658e-06,
"loss": 0.1695,
"step": 172
},
{
"epoch": 0.34462151394422313,
"grad_norm": 2.0293944680745257,
"learning_rate": 9.285120073980127e-06,
"loss": 0.147,
"step": 173
},
{
"epoch": 0.3466135458167331,
"grad_norm": 1.8842194158472625,
"learning_rate": 9.277037412055143e-06,
"loss": 0.1397,
"step": 174
},
{
"epoch": 0.34860557768924305,
"grad_norm": 2.1876611599257294,
"learning_rate": 9.268912873182945e-06,
"loss": 0.1646,
"step": 175
},
{
"epoch": 0.350597609561753,
"grad_norm": 2.0439657178756083,
"learning_rate": 9.260746536911792e-06,
"loss": 0.1583,
"step": 176
},
{
"epoch": 0.35258964143426297,
"grad_norm": 2.3673755824284095,
"learning_rate": 9.25253848319919e-06,
"loss": 0.1779,
"step": 177
},
{
"epoch": 0.3545816733067729,
"grad_norm": 1.849959819313811,
"learning_rate": 9.244288792411099e-06,
"loss": 0.145,
"step": 178
},
{
"epoch": 0.3565737051792829,
"grad_norm": 1.8494333009850685,
"learning_rate": 9.235997545321156e-06,
"loss": 0.1402,
"step": 179
},
{
"epoch": 0.35856573705179284,
"grad_norm": 1.8409139333031814,
"learning_rate": 9.227664823109884e-06,
"loss": 0.1472,
"step": 180
},
{
"epoch": 0.3605577689243028,
"grad_norm": 2.198400559444892,
"learning_rate": 9.219290707363885e-06,
"loss": 0.1683,
"step": 181
},
{
"epoch": 0.36254980079681276,
"grad_norm": 1.847947075455182,
"learning_rate": 9.210875280075056e-06,
"loss": 0.1472,
"step": 182
},
{
"epoch": 0.3645418326693227,
"grad_norm": 1.9180035039553798,
"learning_rate": 9.202418623639779e-06,
"loss": 0.1473,
"step": 183
},
{
"epoch": 0.3665338645418327,
"grad_norm": 2.1866401096601358,
"learning_rate": 9.193920820858113e-06,
"loss": 0.1744,
"step": 184
},
{
"epoch": 0.36852589641434264,
"grad_norm": 1.8712622384696984,
"learning_rate": 9.185381954932984e-06,
"loss": 0.1566,
"step": 185
},
{
"epoch": 0.3705179282868526,
"grad_norm": 1.7798834216487947,
"learning_rate": 9.17680210946938e-06,
"loss": 0.1236,
"step": 186
},
{
"epoch": 0.37250996015936255,
"grad_norm": 1.82049432105803,
"learning_rate": 9.168181368473514e-06,
"loss": 0.146,
"step": 187
},
{
"epoch": 0.3745019920318725,
"grad_norm": 2.0877726702358923,
"learning_rate": 9.159519816352021e-06,
"loss": 0.1389,
"step": 188
},
{
"epoch": 0.37649402390438247,
"grad_norm": 2.051030898440553,
"learning_rate": 9.150817537911111e-06,
"loss": 0.1448,
"step": 189
},
{
"epoch": 0.3784860557768924,
"grad_norm": 1.7874327218426942,
"learning_rate": 9.142074618355763e-06,
"loss": 0.1214,
"step": 190
},
{
"epoch": 0.3804780876494024,
"grad_norm": 2.347061822643037,
"learning_rate": 9.133291143288865e-06,
"loss": 0.1751,
"step": 191
},
{
"epoch": 0.38247011952191234,
"grad_norm": 1.8742517441369677,
"learning_rate": 9.124467198710401e-06,
"loss": 0.1326,
"step": 192
},
{
"epoch": 0.3844621513944223,
"grad_norm": 1.8857929156382558,
"learning_rate": 9.115602871016585e-06,
"loss": 0.1261,
"step": 193
},
{
"epoch": 0.38645418326693226,
"grad_norm": 1.9824443306668558,
"learning_rate": 9.106698246999036e-06,
"loss": 0.1461,
"step": 194
},
{
"epoch": 0.3884462151394422,
"grad_norm": 2.0467435985376348,
"learning_rate": 9.097753413843909e-06,
"loss": 0.1384,
"step": 195
},
{
"epoch": 0.3904382470119522,
"grad_norm": 2.141133625722717,
"learning_rate": 9.08876845913106e-06,
"loss": 0.1497,
"step": 196
},
{
"epoch": 0.39243027888446214,
"grad_norm": 2.234122705708421,
"learning_rate": 9.079743470833177e-06,
"loss": 0.145,
"step": 197
},
{
"epoch": 0.3944223107569721,
"grad_norm": 2.682514937204839,
"learning_rate": 9.070678537314919e-06,
"loss": 0.1762,
"step": 198
},
{
"epoch": 0.39641434262948205,
"grad_norm": 2.2720011535768885,
"learning_rate": 9.061573747332053e-06,
"loss": 0.154,
"step": 199
},
{
"epoch": 0.398406374501992,
"grad_norm": 1.9823400380634495,
"learning_rate": 9.052429190030589e-06,
"loss": 0.1573,
"step": 200
},
{
"epoch": 0.398406374501992,
"eval_loss": 0.14404042065143585,
"eval_runtime": 3.1342,
"eval_samples_per_second": 13.082,
"eval_steps_per_second": 3.51,
"step": 200
},
{
"epoch": 0.40039840637450197,
"grad_norm": 2.4201576423705538,
"learning_rate": 9.0432449549459e-06,
"loss": 0.1734,
"step": 201
},
{
"epoch": 0.40239043824701193,
"grad_norm": 1.8987324574568356,
"learning_rate": 9.03402113200185e-06,
"loss": 0.1357,
"step": 202
},
{
"epoch": 0.4043824701195219,
"grad_norm": 1.8455215459039969,
"learning_rate": 9.02475781150991e-06,
"loss": 0.1412,
"step": 203
},
{
"epoch": 0.4063745019920319,
"grad_norm": 2.1617514944433367,
"learning_rate": 9.015455084168279e-06,
"loss": 0.1658,
"step": 204
},
{
"epoch": 0.40836653386454186,
"grad_norm": 2.1331463500913665,
"learning_rate": 9.00611304106099e-06,
"loss": 0.1647,
"step": 205
},
{
"epoch": 0.4103585657370518,
"grad_norm": 1.8588463567752136,
"learning_rate": 8.996731773657022e-06,
"loss": 0.1615,
"step": 206
},
{
"epoch": 0.4123505976095618,
"grad_norm": 1.5786356112405115,
"learning_rate": 8.987311373809405e-06,
"loss": 0.1381,
"step": 207
},
{
"epoch": 0.41434262948207173,
"grad_norm": 2.069308545418217,
"learning_rate": 8.977851933754317e-06,
"loss": 0.1485,
"step": 208
},
{
"epoch": 0.4163346613545817,
"grad_norm": 1.8545162202812966,
"learning_rate": 8.968353546110181e-06,
"loss": 0.1311,
"step": 209
},
{
"epoch": 0.41832669322709165,
"grad_norm": 1.7202119799614204,
"learning_rate": 8.95881630387677e-06,
"loss": 0.1265,
"step": 210
},
{
"epoch": 0.4203187250996016,
"grad_norm": 2.1311229993111276,
"learning_rate": 8.949240300434272e-06,
"loss": 0.1544,
"step": 211
},
{
"epoch": 0.42231075697211157,
"grad_norm": 1.932887530350482,
"learning_rate": 8.939625629542401e-06,
"loss": 0.1358,
"step": 212
},
{
"epoch": 0.4243027888446215,
"grad_norm": 1.996095976241167,
"learning_rate": 8.929972385339466e-06,
"loss": 0.1395,
"step": 213
},
{
"epoch": 0.4262948207171315,
"grad_norm": 1.7960261511185895,
"learning_rate": 8.92028066234145e-06,
"loss": 0.1436,
"step": 214
},
{
"epoch": 0.42828685258964144,
"grad_norm": 1.6930133554882265,
"learning_rate": 8.910550555441085e-06,
"loss": 0.1246,
"step": 215
},
{
"epoch": 0.4302788844621514,
"grad_norm": 2.225395808052067,
"learning_rate": 8.900782159906927e-06,
"loss": 0.165,
"step": 216
},
{
"epoch": 0.43227091633466136,
"grad_norm": 1.942615677966355,
"learning_rate": 8.890975571382419e-06,
"loss": 0.1418,
"step": 217
},
{
"epoch": 0.4342629482071713,
"grad_norm": 2.151401945627201,
"learning_rate": 8.881130885884955e-06,
"loss": 0.1405,
"step": 218
},
{
"epoch": 0.4362549800796813,
"grad_norm": 2.101742983984682,
"learning_rate": 8.871248199804944e-06,
"loss": 0.1415,
"step": 219
},
{
"epoch": 0.43824701195219123,
"grad_norm": 2.0803368131274973,
"learning_rate": 8.861327609904859e-06,
"loss": 0.1401,
"step": 220
},
{
"epoch": 0.4402390438247012,
"grad_norm": 1.9987975497294423,
"learning_rate": 8.851369213318293e-06,
"loss": 0.1582,
"step": 221
},
{
"epoch": 0.44223107569721115,
"grad_norm": 1.882395487923144,
"learning_rate": 8.841373107549014e-06,
"loss": 0.1363,
"step": 222
},
{
"epoch": 0.4442231075697211,
"grad_norm": 1.7141338375852764,
"learning_rate": 8.831339390469998e-06,
"loss": 0.1304,
"step": 223
},
{
"epoch": 0.44621513944223107,
"grad_norm": 1.8845992727826126,
"learning_rate": 8.821268160322482e-06,
"loss": 0.1251,
"step": 224
},
{
"epoch": 0.448207171314741,
"grad_norm": 1.807320122447534,
"learning_rate": 8.811159515714998e-06,
"loss": 0.1202,
"step": 225
},
{
"epoch": 0.450199203187251,
"grad_norm": 1.993372692147205,
"learning_rate": 8.801013555622403e-06,
"loss": 0.1624,
"step": 226
},
{
"epoch": 0.45219123505976094,
"grad_norm": 1.8931180135396861,
"learning_rate": 8.790830379384918e-06,
"loss": 0.1408,
"step": 227
},
{
"epoch": 0.4541832669322709,
"grad_norm": 1.9138251654922296,
"learning_rate": 8.780610086707149e-06,
"loss": 0.1365,
"step": 228
},
{
"epoch": 0.45617529880478086,
"grad_norm": 1.9534031291438365,
"learning_rate": 8.770352777657112e-06,
"loss": 0.1363,
"step": 229
},
{
"epoch": 0.4581673306772908,
"grad_norm": 1.7954406324400658,
"learning_rate": 8.760058552665262e-06,
"loss": 0.1201,
"step": 230
},
{
"epoch": 0.4601593625498008,
"grad_norm": 2.0677561411750163,
"learning_rate": 8.749727512523491e-06,
"loss": 0.1479,
"step": 231
},
{
"epoch": 0.46215139442231074,
"grad_norm": 2.27753550624284,
"learning_rate": 8.739359758384162e-06,
"loss": 0.1417,
"step": 232
},
{
"epoch": 0.4641434262948207,
"grad_norm": 2.2517352339516523,
"learning_rate": 8.728955391759102e-06,
"loss": 0.1535,
"step": 233
},
{
"epoch": 0.46613545816733065,
"grad_norm": 2.070124167431417,
"learning_rate": 8.718514514518617e-06,
"loss": 0.1553,
"step": 234
},
{
"epoch": 0.4681274900398406,
"grad_norm": 2.064421588096535,
"learning_rate": 8.708037228890494e-06,
"loss": 0.1459,
"step": 235
},
{
"epoch": 0.4701195219123506,
"grad_norm": 1.8120445831055432,
"learning_rate": 8.697523637458997e-06,
"loss": 0.1391,
"step": 236
},
{
"epoch": 0.4721115537848606,
"grad_norm": 1.8340032254229388,
"learning_rate": 8.686973843163868e-06,
"loss": 0.1445,
"step": 237
},
{
"epoch": 0.47410358565737054,
"grad_norm": 2.139642376885461,
"learning_rate": 8.676387949299307e-06,
"loss": 0.1343,
"step": 238
},
{
"epoch": 0.4760956175298805,
"grad_norm": 2.0856211998803973,
"learning_rate": 8.665766059512977e-06,
"loss": 0.1422,
"step": 239
},
{
"epoch": 0.47808764940239046,
"grad_norm": 2.0329687486696906,
"learning_rate": 8.655108277804975e-06,
"loss": 0.124,
"step": 240
},
{
"epoch": 0.4800796812749004,
"grad_norm": 1.8725092328941733,
"learning_rate": 8.644414708526824e-06,
"loss": 0.1319,
"step": 241
},
{
"epoch": 0.4820717131474104,
"grad_norm": 2.2032287252466585,
"learning_rate": 8.63368545638045e-06,
"loss": 0.1499,
"step": 242
},
{
"epoch": 0.48406374501992033,
"grad_norm": 2.04185058829803,
"learning_rate": 8.622920626417141e-06,
"loss": 0.1501,
"step": 243
},
{
"epoch": 0.4860557768924303,
"grad_norm": 2.003978218913384,
"learning_rate": 8.612120324036548e-06,
"loss": 0.1382,
"step": 244
},
{
"epoch": 0.48804780876494025,
"grad_norm": 1.8579294263600015,
"learning_rate": 8.601284654985623e-06,
"loss": 0.1342,
"step": 245
},
{
"epoch": 0.4900398406374502,
"grad_norm": 2.072837174686408,
"learning_rate": 8.590413725357605e-06,
"loss": 0.142,
"step": 246
},
{
"epoch": 0.49203187250996017,
"grad_norm": 1.9780709854474574,
"learning_rate": 8.57950764159097e-06,
"loss": 0.1395,
"step": 247
},
{
"epoch": 0.4940239043824701,
"grad_norm": 2.5022268847929054,
"learning_rate": 8.568566510468392e-06,
"loss": 0.1649,
"step": 248
},
{
"epoch": 0.4960159362549801,
"grad_norm": 1.7767948533765374,
"learning_rate": 8.557590439115697e-06,
"loss": 0.129,
"step": 249
},
{
"epoch": 0.49800796812749004,
"grad_norm": 1.8237813486682513,
"learning_rate": 8.546579535000819e-06,
"loss": 0.1429,
"step": 250
},
{
"epoch": 0.5,
"grad_norm": 2.108585214242547,
"learning_rate": 8.535533905932739e-06,
"loss": 0.151,
"step": 251
},
{
"epoch": 0.50199203187251,
"grad_norm": 1.8936980275001982,
"learning_rate": 8.524453660060434e-06,
"loss": 0.135,
"step": 252
},
{
"epoch": 0.5039840637450199,
"grad_norm": 1.953069313184316,
"learning_rate": 8.513338905871819e-06,
"loss": 0.1377,
"step": 253
},
{
"epoch": 0.5059760956175299,
"grad_norm": 1.6215779669447803,
"learning_rate": 8.502189752192685e-06,
"loss": 0.1269,
"step": 254
},
{
"epoch": 0.5079681274900398,
"grad_norm": 1.9115865609618454,
"learning_rate": 8.491006308185632e-06,
"loss": 0.1329,
"step": 255
},
{
"epoch": 0.5099601593625498,
"grad_norm": 1.5921580761982146,
"learning_rate": 8.479788683348996e-06,
"loss": 0.1022,
"step": 256
},
{
"epoch": 0.5119521912350598,
"grad_norm": 2.0973700556988786,
"learning_rate": 8.468536987515788e-06,
"loss": 0.134,
"step": 257
},
{
"epoch": 0.5139442231075697,
"grad_norm": 2.045549212493722,
"learning_rate": 8.457251330852608e-06,
"loss": 0.1385,
"step": 258
},
{
"epoch": 0.5159362549800797,
"grad_norm": 1.9007334882981535,
"learning_rate": 8.445931823858568e-06,
"loss": 0.1566,
"step": 259
},
{
"epoch": 0.5179282868525896,
"grad_norm": 1.5993440106427084,
"learning_rate": 8.434578577364218e-06,
"loss": 0.1066,
"step": 260
},
{
"epoch": 0.5199203187250996,
"grad_norm": 2.2603377167523973,
"learning_rate": 8.423191702530453e-06,
"loss": 0.1367,
"step": 261
},
{
"epoch": 0.5219123505976095,
"grad_norm": 2.096603538865956,
"learning_rate": 8.411771310847426e-06,
"loss": 0.1558,
"step": 262
},
{
"epoch": 0.5239043824701195,
"grad_norm": 2.104670890811567,
"learning_rate": 8.400317514133454e-06,
"loss": 0.1245,
"step": 263
},
{
"epoch": 0.5258964143426295,
"grad_norm": 2.1577997074465483,
"learning_rate": 8.388830424533935e-06,
"loss": 0.1587,
"step": 264
},
{
"epoch": 0.5278884462151394,
"grad_norm": 1.9717314334953373,
"learning_rate": 8.377310154520232e-06,
"loss": 0.1445,
"step": 265
},
{
"epoch": 0.5298804780876494,
"grad_norm": 2.18540940693717,
"learning_rate": 8.365756816888586e-06,
"loss": 0.1369,
"step": 266
},
{
"epoch": 0.5318725099601593,
"grad_norm": 1.8217822120680458,
"learning_rate": 8.354170524759008e-06,
"loss": 0.1407,
"step": 267
},
{
"epoch": 0.5338645418326693,
"grad_norm": 1.4966007977041007,
"learning_rate": 8.342551391574165e-06,
"loss": 0.1101,
"step": 268
},
{
"epoch": 0.5358565737051793,
"grad_norm": 1.5671763413455242,
"learning_rate": 8.33089953109828e-06,
"loss": 0.1185,
"step": 269
},
{
"epoch": 0.5378486055776892,
"grad_norm": 1.8560918618066358,
"learning_rate": 8.319215057416007e-06,
"loss": 0.1246,
"step": 270
},
{
"epoch": 0.5398406374501992,
"grad_norm": 1.8485098665344575,
"learning_rate": 8.307498084931327e-06,
"loss": 0.1324,
"step": 271
},
{
"epoch": 0.5418326693227091,
"grad_norm": 2.2839051894864237,
"learning_rate": 8.295748728366414e-06,
"loss": 0.1342,
"step": 272
},
{
"epoch": 0.5438247011952191,
"grad_norm": 1.6783837554851726,
"learning_rate": 8.283967102760518e-06,
"loss": 0.1192,
"step": 273
},
{
"epoch": 0.545816733067729,
"grad_norm": 2.0329474847075097,
"learning_rate": 8.272153323468842e-06,
"loss": 0.139,
"step": 274
},
{
"epoch": 0.547808764940239,
"grad_norm": 1.7759478728678548,
"learning_rate": 8.260307506161407e-06,
"loss": 0.1396,
"step": 275
},
{
"epoch": 0.549800796812749,
"grad_norm": 1.8489007641542803,
"learning_rate": 8.248429766821925e-06,
"loss": 0.1035,
"step": 276
},
{
"epoch": 0.5517928286852589,
"grad_norm": 1.3042776152655502,
"learning_rate": 8.236520221746657e-06,
"loss": 0.0979,
"step": 277
},
{
"epoch": 0.5537848605577689,
"grad_norm": 1.938450288353747,
"learning_rate": 8.22457898754328e-06,
"loss": 0.1355,
"step": 278
},
{
"epoch": 0.5557768924302788,
"grad_norm": 1.9473763473782129,
"learning_rate": 8.212606181129737e-06,
"loss": 0.1354,
"step": 279
},
{
"epoch": 0.5577689243027888,
"grad_norm": 2.3529581735987684,
"learning_rate": 8.200601919733106e-06,
"loss": 0.1478,
"step": 280
},
{
"epoch": 0.5597609561752988,
"grad_norm": 2.2045125976242277,
"learning_rate": 8.18856632088844e-06,
"loss": 0.1403,
"step": 281
},
{
"epoch": 0.5617529880478087,
"grad_norm": 1.8741947740872564,
"learning_rate": 8.176499502437621e-06,
"loss": 0.1224,
"step": 282
},
{
"epoch": 0.5637450199203188,
"grad_norm": 1.6785392877999905,
"learning_rate": 8.164401582528202e-06,
"loss": 0.1114,
"step": 283
},
{
"epoch": 0.5657370517928287,
"grad_norm": 1.805219996685288,
"learning_rate": 8.15227267961226e-06,
"loss": 0.1281,
"step": 284
},
{
"epoch": 0.5677290836653387,
"grad_norm": 2.262380478016552,
"learning_rate": 8.14011291244523e-06,
"loss": 0.1495,
"step": 285
},
{
"epoch": 0.5697211155378487,
"grad_norm": 1.6184059179144714,
"learning_rate": 8.127922400084736e-06,
"loss": 0.1077,
"step": 286
},
{
"epoch": 0.5717131474103586,
"grad_norm": 1.7035754466739166,
"learning_rate": 8.115701261889437e-06,
"loss": 0.1081,
"step": 287
},
{
"epoch": 0.5737051792828686,
"grad_norm": 2.260155985593039,
"learning_rate": 8.10344961751785e-06,
"loss": 0.1367,
"step": 288
},
{
"epoch": 0.5756972111553785,
"grad_norm": 1.985868190263633,
"learning_rate": 8.091167586927184e-06,
"loss": 0.1184,
"step": 289
},
{
"epoch": 0.5776892430278885,
"grad_norm": 2.2887926342626144,
"learning_rate": 8.078855290372161e-06,
"loss": 0.1721,
"step": 290
},
{
"epoch": 0.5796812749003984,
"grad_norm": 2.027120648554562,
"learning_rate": 8.066512848403837e-06,
"loss": 0.1342,
"step": 291
},
{
"epoch": 0.5816733067729084,
"grad_norm": 1.9925359431101095,
"learning_rate": 8.054140381868435e-06,
"loss": 0.1353,
"step": 292
},
{
"epoch": 0.5836653386454184,
"grad_norm": 2.0044583230657027,
"learning_rate": 8.041738011906144e-06,
"loss": 0.1321,
"step": 293
},
{
"epoch": 0.5856573705179283,
"grad_norm": 2.1494533777158993,
"learning_rate": 8.02930585994994e-06,
"loss": 0.132,
"step": 294
},
{
"epoch": 0.5876494023904383,
"grad_norm": 2.1658323964218322,
"learning_rate": 8.016844047724404e-06,
"loss": 0.156,
"step": 295
},
{
"epoch": 0.5896414342629482,
"grad_norm": 1.928648624837369,
"learning_rate": 8.004352697244516e-06,
"loss": 0.142,
"step": 296
},
{
"epoch": 0.5916334661354582,
"grad_norm": 1.612340205944657,
"learning_rate": 7.991831930814475e-06,
"loss": 0.1101,
"step": 297
},
{
"epoch": 0.5936254980079682,
"grad_norm": 1.6508868869854736,
"learning_rate": 7.979281871026493e-06,
"loss": 0.1281,
"step": 298
},
{
"epoch": 0.5956175298804781,
"grad_norm": 1.8982891507623567,
"learning_rate": 7.966702640759598e-06,
"loss": 0.1321,
"step": 299
},
{
"epoch": 0.5976095617529881,
"grad_norm": 1.8987961958988515,
"learning_rate": 7.954094363178421e-06,
"loss": 0.1229,
"step": 300
},
{
"epoch": 0.599601593625498,
"grad_norm": 2.1488376153773725,
"learning_rate": 7.941457161732011e-06,
"loss": 0.1544,
"step": 301
},
{
"epoch": 0.601593625498008,
"grad_norm": 1.563913063819495,
"learning_rate": 7.928791160152603e-06,
"loss": 0.0999,
"step": 302
},
{
"epoch": 0.603585657370518,
"grad_norm": 2.113124277431984,
"learning_rate": 7.916096482454425e-06,
"loss": 0.1452,
"step": 303
},
{
"epoch": 0.6055776892430279,
"grad_norm": 1.4930277954467297,
"learning_rate": 7.903373252932474e-06,
"loss": 0.1182,
"step": 304
},
{
"epoch": 0.6075697211155379,
"grad_norm": 1.9567309939982196,
"learning_rate": 7.890621596161295e-06,
"loss": 0.1485,
"step": 305
},
{
"epoch": 0.6095617529880478,
"grad_norm": 2.115801942619243,
"learning_rate": 7.877841636993777e-06,
"loss": 0.1343,
"step": 306
},
{
"epoch": 0.6115537848605578,
"grad_norm": 1.6597909053499225,
"learning_rate": 7.865033500559916e-06,
"loss": 0.1221,
"step": 307
},
{
"epoch": 0.6135458167330677,
"grad_norm": 2.268140954664932,
"learning_rate": 7.852197312265592e-06,
"loss": 0.1548,
"step": 308
},
{
"epoch": 0.6155378486055777,
"grad_norm": 1.7438803215243461,
"learning_rate": 7.83933319779135e-06,
"loss": 0.1197,
"step": 309
},
{
"epoch": 0.6175298804780877,
"grad_norm": 1.7475479034468657,
"learning_rate": 7.826441283091158e-06,
"loss": 0.1205,
"step": 310
},
{
"epoch": 0.6195219123505976,
"grad_norm": 1.917559079622195,
"learning_rate": 7.813521694391183e-06,
"loss": 0.1466,
"step": 311
},
{
"epoch": 0.6215139442231076,
"grad_norm": 1.9007500389944925,
"learning_rate": 7.800574558188548e-06,
"loss": 0.1297,
"step": 312
},
{
"epoch": 0.6235059760956175,
"grad_norm": 2.3138697129523624,
"learning_rate": 7.787600001250098e-06,
"loss": 0.1435,
"step": 313
},
{
"epoch": 0.6254980079681275,
"grad_norm": 1.803331449338951,
"learning_rate": 7.77459815061116e-06,
"loss": 0.1265,
"step": 314
},
{
"epoch": 0.6274900398406374,
"grad_norm": 1.729785988240638,
"learning_rate": 7.761569133574291e-06,
"loss": 0.1208,
"step": 315
},
{
"epoch": 0.6294820717131474,
"grad_norm": 1.5580779702404448,
"learning_rate": 7.748513077708044e-06,
"loss": 0.1279,
"step": 316
},
{
"epoch": 0.6314741035856574,
"grad_norm": 1.5557075595562477,
"learning_rate": 7.735430110845707e-06,
"loss": 0.1218,
"step": 317
},
{
"epoch": 0.6334661354581673,
"grad_norm": 2.190732094534515,
"learning_rate": 7.722320361084057e-06,
"loss": 0.1435,
"step": 318
},
{
"epoch": 0.6354581673306773,
"grad_norm": 2.066394679381035,
"learning_rate": 7.70918395678211e-06,
"loss": 0.1552,
"step": 319
},
{
"epoch": 0.6374501992031872,
"grad_norm": 2.386874848230383,
"learning_rate": 7.69602102655985e-06,
"loss": 0.1511,
"step": 320
},
{
"epoch": 0.6394422310756972,
"grad_norm": 1.9135694055276182,
"learning_rate": 7.682831699296991e-06,
"loss": 0.1494,
"step": 321
},
{
"epoch": 0.6414342629482072,
"grad_norm": 1.9705232701476278,
"learning_rate": 7.669616104131697e-06,
"loss": 0.1452,
"step": 322
},
{
"epoch": 0.6434262948207171,
"grad_norm": 1.853209913025916,
"learning_rate": 7.656374370459321e-06,
"loss": 0.1336,
"step": 323
},
{
"epoch": 0.6454183266932271,
"grad_norm": 1.772429733282671,
"learning_rate": 7.643106627931148e-06,
"loss": 0.1131,
"step": 324
},
{
"epoch": 0.647410358565737,
"grad_norm": 2.331769433940343,
"learning_rate": 7.629813006453114e-06,
"loss": 0.1417,
"step": 325
},
{
"epoch": 0.649402390438247,
"grad_norm": 2.1257162988182037,
"learning_rate": 7.616493636184538e-06,
"loss": 0.1537,
"step": 326
},
{
"epoch": 0.651394422310757,
"grad_norm": 1.6032004847610595,
"learning_rate": 7.603148647536853e-06,
"loss": 0.1208,
"step": 327
},
{
"epoch": 0.6533864541832669,
"grad_norm": 1.9414518467604416,
"learning_rate": 7.5897781711723215e-06,
"loss": 0.1189,
"step": 328
},
{
"epoch": 0.6553784860557769,
"grad_norm": 1.8035099668212278,
"learning_rate": 7.576382338002759e-06,
"loss": 0.1289,
"step": 329
},
{
"epoch": 0.6573705179282868,
"grad_norm": 2.5935545240432347,
"learning_rate": 7.56296127918825e-06,
"loss": 0.1589,
"step": 330
},
{
"epoch": 0.6593625498007968,
"grad_norm": 2.0001976484989887,
"learning_rate": 7.549515126135871e-06,
"loss": 0.1329,
"step": 331
},
{
"epoch": 0.6613545816733067,
"grad_norm": 2.2072498495488135,
"learning_rate": 7.536044010498396e-06,
"loss": 0.1562,
"step": 332
},
{
"epoch": 0.6633466135458167,
"grad_norm": 1.942689121900593,
"learning_rate": 7.5225480641730084e-06,
"loss": 0.138,
"step": 333
},
{
"epoch": 0.6653386454183267,
"grad_norm": 2.025836699652673,
"learning_rate": 7.509027419300017e-06,
"loss": 0.1442,
"step": 334
},
{
"epoch": 0.6673306772908366,
"grad_norm": 1.75497256540315,
"learning_rate": 7.495482208261554e-06,
"loss": 0.1324,
"step": 335
},
{
"epoch": 0.6693227091633466,
"grad_norm": 1.730691502621294,
"learning_rate": 7.48191256368028e-06,
"loss": 0.1037,
"step": 336
},
{
"epoch": 0.6713147410358565,
"grad_norm": 2.1818943000426825,
"learning_rate": 7.468318618418089e-06,
"loss": 0.1197,
"step": 337
},
{
"epoch": 0.6733067729083665,
"grad_norm": 2.282870874976595,
"learning_rate": 7.454700505574805e-06,
"loss": 0.1651,
"step": 338
},
{
"epoch": 0.6752988047808764,
"grad_norm": 1.9598715085355076,
"learning_rate": 7.44105835848688e-06,
"loss": 0.13,
"step": 339
},
{
"epoch": 0.6772908366533864,
"grad_norm": 1.32121304959289,
"learning_rate": 7.427392310726088e-06,
"loss": 0.088,
"step": 340
},
{
"epoch": 0.6792828685258964,
"grad_norm": 2.0337415726445296,
"learning_rate": 7.413702496098218e-06,
"loss": 0.1425,
"step": 341
},
{
"epoch": 0.6812749003984063,
"grad_norm": 2.1827633175836008,
"learning_rate": 7.39998904864176e-06,
"loss": 0.1507,
"step": 342
},
{
"epoch": 0.6832669322709163,
"grad_norm": 2.10204580287355,
"learning_rate": 7.3862521026265986e-06,
"loss": 0.1411,
"step": 343
},
{
"epoch": 0.6852589641434262,
"grad_norm": 2.1566650379770675,
"learning_rate": 7.372491792552694e-06,
"loss": 0.1427,
"step": 344
},
{
"epoch": 0.6872509960159362,
"grad_norm": 1.9617078978276397,
"learning_rate": 7.3587082531487675e-06,
"loss": 0.1167,
"step": 345
},
{
"epoch": 0.6892430278884463,
"grad_norm": 1.912009857403253,
"learning_rate": 7.344901619370977e-06,
"loss": 0.1234,
"step": 346
},
{
"epoch": 0.6912350597609562,
"grad_norm": 2.127034247138793,
"learning_rate": 7.331072026401611e-06,
"loss": 0.1176,
"step": 347
},
{
"epoch": 0.6932270916334662,
"grad_norm": 2.2559426016327078,
"learning_rate": 7.31721960964774e-06,
"loss": 0.1343,
"step": 348
},
{
"epoch": 0.6952191235059761,
"grad_norm": 1.9327539929872006,
"learning_rate": 7.303344504739914e-06,
"loss": 0.1277,
"step": 349
},
{
"epoch": 0.6972111553784861,
"grad_norm": 1.6413171465166834,
"learning_rate": 7.289446847530822e-06,
"loss": 0.0954,
"step": 350
},
{
"epoch": 0.6992031872509961,
"grad_norm": 1.7965552697782228,
"learning_rate": 7.2755267740939664e-06,
"loss": 0.1182,
"step": 351
},
{
"epoch": 0.701195219123506,
"grad_norm": 2.264078406151102,
"learning_rate": 7.261584420722328e-06,
"loss": 0.1239,
"step": 352
},
{
"epoch": 0.703187250996016,
"grad_norm": 2.1024709135826423,
"learning_rate": 7.2476199239270354e-06,
"loss": 0.1252,
"step": 353
},
{
"epoch": 0.7051792828685259,
"grad_norm": 1.850775717751215,
"learning_rate": 7.2336334204360206e-06,
"loss": 0.096,
"step": 354
},
{
"epoch": 0.7071713147410359,
"grad_norm": 1.9294667982978444,
"learning_rate": 7.21962504719269e-06,
"loss": 0.1288,
"step": 355
},
{
"epoch": 0.7091633466135459,
"grad_norm": 2.0263901644198645,
"learning_rate": 7.20559494135458e-06,
"loss": 0.1273,
"step": 356
},
{
"epoch": 0.7111553784860558,
"grad_norm": 2.519289066543413,
"learning_rate": 7.19154324029201e-06,
"loss": 0.1442,
"step": 357
},
{
"epoch": 0.7131474103585658,
"grad_norm": 2.3400623940562886,
"learning_rate": 7.177470081586743e-06,
"loss": 0.1531,
"step": 358
},
{
"epoch": 0.7151394422310757,
"grad_norm": 1.913600852089038,
"learning_rate": 7.163375603030634e-06,
"loss": 0.1142,
"step": 359
},
{
"epoch": 0.7171314741035857,
"grad_norm": 2.4654502175383657,
"learning_rate": 7.149259942624287e-06,
"loss": 0.1498,
"step": 360
},
{
"epoch": 0.7191235059760956,
"grad_norm": 1.7809542538273848,
"learning_rate": 7.135123238575693e-06,
"loss": 0.0992,
"step": 361
},
{
"epoch": 0.7211155378486056,
"grad_norm": 2.236498970023622,
"learning_rate": 7.120965629298891e-06,
"loss": 0.1231,
"step": 362
},
{
"epoch": 0.7231075697211156,
"grad_norm": 1.5579483159438388,
"learning_rate": 7.1067872534126004e-06,
"loss": 0.1208,
"step": 363
},
{
"epoch": 0.7250996015936255,
"grad_norm": 2.623828745113357,
"learning_rate": 7.092588249738871e-06,
"loss": 0.1127,
"step": 364
},
{
"epoch": 0.7270916334661355,
"grad_norm": 1.926947918523208,
"learning_rate": 7.0783687573017215e-06,
"loss": 0.1114,
"step": 365
},
{
"epoch": 0.7290836653386454,
"grad_norm": 1.5604394877235856,
"learning_rate": 7.064128915325777e-06,
"loss": 0.1116,
"step": 366
},
{
"epoch": 0.7310756972111554,
"grad_norm": 1.9819642302802614,
"learning_rate": 7.049868863234911e-06,
"loss": 0.1231,
"step": 367
},
{
"epoch": 0.7330677290836654,
"grad_norm": 2.04013986237716,
"learning_rate": 7.03558874065087e-06,
"loss": 0.1304,
"step": 368
},
{
"epoch": 0.7350597609561753,
"grad_norm": 1.8528858096630048,
"learning_rate": 7.021288687391917e-06,
"loss": 0.1232,
"step": 369
},
{
"epoch": 0.7370517928286853,
"grad_norm": 1.931351631379841,
"learning_rate": 7.00696884347146e-06,
"loss": 0.1202,
"step": 370
},
{
"epoch": 0.7390438247011952,
"grad_norm": 2.3674440717388068,
"learning_rate": 6.9926293490966755e-06,
"loss": 0.1578,
"step": 371
},
{
"epoch": 0.7410358565737052,
"grad_norm": 1.6376485083203747,
"learning_rate": 6.978270344667143e-06,
"loss": 0.0929,
"step": 372
},
{
"epoch": 0.7430278884462151,
"grad_norm": 1.705119215989148,
"learning_rate": 6.963891970773465e-06,
"loss": 0.1146,
"step": 373
},
{
"epoch": 0.7450199203187251,
"grad_norm": 1.7904536633328327,
"learning_rate": 6.949494368195896e-06,
"loss": 0.0929,
"step": 374
},
{
"epoch": 0.7470119521912351,
"grad_norm": 1.7463821286816135,
"learning_rate": 6.935077677902955e-06,
"loss": 0.1146,
"step": 375
},
{
"epoch": 0.749003984063745,
"grad_norm": 1.8343889129423383,
"learning_rate": 6.920642041050055e-06,
"loss": 0.1231,
"step": 376
},
{
"epoch": 0.750996015936255,
"grad_norm": 2.112401907355922,
"learning_rate": 6.9061875989781165e-06,
"loss": 0.1323,
"step": 377
},
{
"epoch": 0.7529880478087649,
"grad_norm": 2.3269817857970456,
"learning_rate": 6.891714493212183e-06,
"loss": 0.1461,
"step": 378
},
{
"epoch": 0.7549800796812749,
"grad_norm": 2.164617790401808,
"learning_rate": 6.877222865460037e-06,
"loss": 0.1465,
"step": 379
},
{
"epoch": 0.7569721115537849,
"grad_norm": 1.8394467625888513,
"learning_rate": 6.862712857610812e-06,
"loss": 0.1078,
"step": 380
},
{
"epoch": 0.7589641434262948,
"grad_norm": 1.7772473532916493,
"learning_rate": 6.848184611733602e-06,
"loss": 0.1173,
"step": 381
},
{
"epoch": 0.7609561752988048,
"grad_norm": 1.90451509004974,
"learning_rate": 6.833638270076071e-06,
"loss": 0.1164,
"step": 382
},
{
"epoch": 0.7629482071713147,
"grad_norm": 1.3933704850807993,
"learning_rate": 6.819073975063064e-06,
"loss": 0.0916,
"step": 383
},
{
"epoch": 0.7649402390438247,
"grad_norm": 2.1083722503136073,
"learning_rate": 6.804491869295207e-06,
"loss": 0.1482,
"step": 384
},
{
"epoch": 0.7669322709163346,
"grad_norm": 2.1302855766956466,
"learning_rate": 6.789892095547511e-06,
"loss": 0.1255,
"step": 385
},
{
"epoch": 0.7689243027888446,
"grad_norm": 2.0218204203878747,
"learning_rate": 6.7752747967679825e-06,
"loss": 0.1414,
"step": 386
},
{
"epoch": 0.7709163346613546,
"grad_norm": 1.6484036328649272,
"learning_rate": 6.7606401160762105e-06,
"loss": 0.1006,
"step": 387
},
{
"epoch": 0.7729083665338645,
"grad_norm": 1.9790346630572022,
"learning_rate": 6.745988196761976e-06,
"loss": 0.1253,
"step": 388
},
{
"epoch": 0.7749003984063745,
"grad_norm": 2.4027541617777923,
"learning_rate": 6.731319182283844e-06,
"loss": 0.1516,
"step": 389
},
{
"epoch": 0.7768924302788844,
"grad_norm": 2.076146090352596,
"learning_rate": 6.71663321626776e-06,
"loss": 0.1324,
"step": 390
},
{
"epoch": 0.7788844621513944,
"grad_norm": 2.176712178015417,
"learning_rate": 6.7019304425056484e-06,
"loss": 0.1133,
"step": 391
},
{
"epoch": 0.7808764940239044,
"grad_norm": 2.270343647007193,
"learning_rate": 6.687211004953992e-06,
"loss": 0.1543,
"step": 392
},
{
"epoch": 0.7828685258964143,
"grad_norm": 2.2157431300656114,
"learning_rate": 6.672475047732436e-06,
"loss": 0.1419,
"step": 393
},
{
"epoch": 0.7848605577689243,
"grad_norm": 1.8472598515997243,
"learning_rate": 6.657722715122372e-06,
"loss": 0.1171,
"step": 394
},
{
"epoch": 0.7868525896414342,
"grad_norm": 1.9746467863711765,
"learning_rate": 6.6429541515655215e-06,
"loss": 0.1214,
"step": 395
},
{
"epoch": 0.7888446215139442,
"grad_norm": 1.838974412736928,
"learning_rate": 6.628169501662527e-06,
"loss": 0.1288,
"step": 396
},
{
"epoch": 0.7908366533864541,
"grad_norm": 2.315243867151159,
"learning_rate": 6.613368910171533e-06,
"loss": 0.1328,
"step": 397
},
{
"epoch": 0.7928286852589641,
"grad_norm": 2.081415971304626,
"learning_rate": 6.598552522006772e-06,
"loss": 0.1259,
"step": 398
},
{
"epoch": 0.7948207171314741,
"grad_norm": 2.3461895895379414,
"learning_rate": 6.583720482237143e-06,
"loss": 0.1449,
"step": 399
},
{
"epoch": 0.796812749003984,
"grad_norm": 2.2278972284384233,
"learning_rate": 6.568872936084789e-06,
"loss": 0.153,
"step": 400
},
{
"epoch": 0.796812749003984,
"eval_loss": 0.12241560965776443,
"eval_runtime": 3.1403,
"eval_samples_per_second": 13.056,
"eval_steps_per_second": 3.503,
"step": 400
},
{
"epoch": 0.798804780876494,
"grad_norm": 1.5055565131366404,
"learning_rate": 6.554010028923682e-06,
"loss": 0.0904,
"step": 401
},
{
"epoch": 0.8007968127490039,
"grad_norm": 1.9134714393025904,
"learning_rate": 6.539131906278189e-06,
"loss": 0.1221,
"step": 402
},
{
"epoch": 0.8027888446215139,
"grad_norm": 1.676345510524098,
"learning_rate": 6.524238713821661e-06,
"loss": 0.1049,
"step": 403
},
{
"epoch": 0.8047808764940239,
"grad_norm": 1.8483113242007039,
"learning_rate": 6.509330597374993e-06,
"loss": 0.1209,
"step": 404
},
{
"epoch": 0.8067729083665338,
"grad_norm": 2.1209942031252096,
"learning_rate": 6.494407702905207e-06,
"loss": 0.1274,
"step": 405
},
{
"epoch": 0.8087649402390438,
"grad_norm": 2.090229556416619,
"learning_rate": 6.479470176524015e-06,
"loss": 0.1378,
"step": 406
},
{
"epoch": 0.8107569721115537,
"grad_norm": 1.5536560670439392,
"learning_rate": 6.464518164486395e-06,
"loss": 0.1042,
"step": 407
},
{
"epoch": 0.8127490039840638,
"grad_norm": 2.136864457352028,
"learning_rate": 6.44955181318915e-06,
"loss": 0.1356,
"step": 408
},
{
"epoch": 0.8147410358565738,
"grad_norm": 1.6345271519484204,
"learning_rate": 6.434571269169487e-06,
"loss": 0.1103,
"step": 409
},
{
"epoch": 0.8167330677290837,
"grad_norm": 2.2602024940481877,
"learning_rate": 6.419576679103571e-06,
"loss": 0.1412,
"step": 410
},
{
"epoch": 0.8187250996015937,
"grad_norm": 1.950297407773567,
"learning_rate": 6.404568189805095e-06,
"loss": 0.1177,
"step": 411
},
{
"epoch": 0.8207171314741036,
"grad_norm": 1.7261505783927313,
"learning_rate": 6.389545948223841e-06,
"loss": 0.1061,
"step": 412
},
{
"epoch": 0.8227091633466136,
"grad_norm": 1.7921074892509365,
"learning_rate": 6.374510101444242e-06,
"loss": 0.1149,
"step": 413
},
{
"epoch": 0.8247011952191236,
"grad_norm": 1.693484859639379,
"learning_rate": 6.359460796683937e-06,
"loss": 0.1013,
"step": 414
},
{
"epoch": 0.8266932270916335,
"grad_norm": 1.8929838681555156,
"learning_rate": 6.344398181292338e-06,
"loss": 0.1117,
"step": 415
},
{
"epoch": 0.8286852589641435,
"grad_norm": 1.9178716343933822,
"learning_rate": 6.329322402749181e-06,
"loss": 0.1133,
"step": 416
},
{
"epoch": 0.8306772908366534,
"grad_norm": 2.06968861690023,
"learning_rate": 6.314233608663085e-06,
"loss": 0.1347,
"step": 417
},
{
"epoch": 0.8326693227091634,
"grad_norm": 2.150136253809127,
"learning_rate": 6.299131946770104e-06,
"loss": 0.1102,
"step": 418
},
{
"epoch": 0.8346613545816733,
"grad_norm": 2.0515375853571105,
"learning_rate": 6.284017564932284e-06,
"loss": 0.1321,
"step": 419
},
{
"epoch": 0.8366533864541833,
"grad_norm": 1.972608566259217,
"learning_rate": 6.2688906111362115e-06,
"loss": 0.1317,
"step": 420
},
{
"epoch": 0.8386454183266933,
"grad_norm": 2.1835844876053176,
"learning_rate": 6.253751233491565e-06,
"loss": 0.1368,
"step": 421
},
{
"epoch": 0.8406374501992032,
"grad_norm": 1.9358655395330886,
"learning_rate": 6.238599580229673e-06,
"loss": 0.1242,
"step": 422
},
{
"epoch": 0.8426294820717132,
"grad_norm": 1.9756294879268275,
"learning_rate": 6.2234357997020475e-06,
"loss": 0.1228,
"step": 423
},
{
"epoch": 0.8446215139442231,
"grad_norm": 2.0407229239665616,
"learning_rate": 6.208260040378946e-06,
"loss": 0.1069,
"step": 424
},
{
"epoch": 0.8466135458167331,
"grad_norm": 2.469118035586134,
"learning_rate": 6.193072450847909e-06,
"loss": 0.1353,
"step": 425
},
{
"epoch": 0.848605577689243,
"grad_norm": 1.3150969469878047,
"learning_rate": 6.1778731798123105e-06,
"loss": 0.0766,
"step": 426
},
{
"epoch": 0.850597609561753,
"grad_norm": 1.7880637008772997,
"learning_rate": 6.162662376089894e-06,
"loss": 0.1111,
"step": 427
},
{
"epoch": 0.852589641434263,
"grad_norm": 1.7519177689555412,
"learning_rate": 6.147440188611324e-06,
"loss": 0.1172,
"step": 428
},
{
"epoch": 0.8545816733067729,
"grad_norm": 1.6790695470314452,
"learning_rate": 6.132206766418728e-06,
"loss": 0.1307,
"step": 429
},
{
"epoch": 0.8565737051792829,
"grad_norm": 2.329181770786997,
"learning_rate": 6.116962258664228e-06,
"loss": 0.1398,
"step": 430
},
{
"epoch": 0.8585657370517928,
"grad_norm": 2.1360997891269298,
"learning_rate": 6.10170681460849e-06,
"loss": 0.1348,
"step": 431
},
{
"epoch": 0.8605577689243028,
"grad_norm": 2.0074048591988745,
"learning_rate": 6.0864405836192575e-06,
"loss": 0.1257,
"step": 432
},
{
"epoch": 0.8625498007968128,
"grad_norm": 1.9966192443343993,
"learning_rate": 6.071163715169889e-06,
"loss": 0.1233,
"step": 433
},
{
"epoch": 0.8645418326693227,
"grad_norm": 1.6688986826934915,
"learning_rate": 6.055876358837894e-06,
"loss": 0.1034,
"step": 434
},
{
"epoch": 0.8665338645418327,
"grad_norm": 2.1720920201705574,
"learning_rate": 6.040578664303476e-06,
"loss": 0.122,
"step": 435
},
{
"epoch": 0.8685258964143426,
"grad_norm": 2.2349377112258884,
"learning_rate": 6.025270781348055e-06,
"loss": 0.1305,
"step": 436
},
{
"epoch": 0.8705179282868526,
"grad_norm": 1.9735977301953636,
"learning_rate": 6.009952859852809e-06,
"loss": 0.1209,
"step": 437
},
{
"epoch": 0.8725099601593626,
"grad_norm": 2.1227111042570446,
"learning_rate": 5.994625049797206e-06,
"loss": 0.1313,
"step": 438
},
{
"epoch": 0.8745019920318725,
"grad_norm": 1.8425960578334477,
"learning_rate": 5.979287501257531e-06,
"loss": 0.1045,
"step": 439
},
{
"epoch": 0.8764940239043825,
"grad_norm": 1.9096751566518075,
"learning_rate": 5.963940364405425e-06,
"loss": 0.1094,
"step": 440
},
{
"epoch": 0.8784860557768924,
"grad_norm": 2.0206284611558396,
"learning_rate": 5.9485837895064e-06,
"loss": 0.1147,
"step": 441
},
{
"epoch": 0.8804780876494024,
"grad_norm": 1.7978035110634043,
"learning_rate": 5.933217926918386e-06,
"loss": 0.1039,
"step": 442
},
{
"epoch": 0.8824701195219123,
"grad_norm": 1.3448754139603254,
"learning_rate": 5.9178429270902445e-06,
"loss": 0.0751,
"step": 443
},
{
"epoch": 0.8844621513944223,
"grad_norm": 1.8906404995180646,
"learning_rate": 5.902458940560304e-06,
"loss": 0.1118,
"step": 444
},
{
"epoch": 0.8864541832669323,
"grad_norm": 2.248607528732412,
"learning_rate": 5.88706611795488e-06,
"loss": 0.1344,
"step": 445
},
{
"epoch": 0.8884462151394422,
"grad_norm": 2.0500336844672566,
"learning_rate": 5.871664609986804e-06,
"loss": 0.1172,
"step": 446
},
{
"epoch": 0.8904382470119522,
"grad_norm": 1.7749390516298367,
"learning_rate": 5.85625456745395e-06,
"loss": 0.096,
"step": 447
},
{
"epoch": 0.8924302788844621,
"grad_norm": 1.6650651187511873,
"learning_rate": 5.8408361412377475e-06,
"loss": 0.112,
"step": 448
},
{
"epoch": 0.8944223107569721,
"grad_norm": 1.8698848014452956,
"learning_rate": 5.8254094823017195e-06,
"loss": 0.1094,
"step": 449
},
{
"epoch": 0.896414342629482,
"grad_norm": 2.31893715626561,
"learning_rate": 5.80997474168999e-06,
"loss": 0.1524,
"step": 450
},
{
"epoch": 0.898406374501992,
"grad_norm": 1.7326523317597042,
"learning_rate": 5.794532070525817e-06,
"loss": 0.1145,
"step": 451
},
{
"epoch": 0.900398406374502,
"grad_norm": 1.8551714873826095,
"learning_rate": 5.779081620010104e-06,
"loss": 0.1064,
"step": 452
},
{
"epoch": 0.9023904382470119,
"grad_norm": 2.000481014536217,
"learning_rate": 5.763623541419925e-06,
"loss": 0.1152,
"step": 453
},
{
"epoch": 0.9043824701195219,
"grad_norm": 1.514470739576286,
"learning_rate": 5.748157986107038e-06,
"loss": 0.0937,
"step": 454
},
{
"epoch": 0.9063745019920318,
"grad_norm": 1.7848046901968333,
"learning_rate": 5.73268510549641e-06,
"loss": 0.1182,
"step": 455
},
{
"epoch": 0.9083665338645418,
"grad_norm": 2.123090333270882,
"learning_rate": 5.717205051084731e-06,
"loss": 0.1207,
"step": 456
},
{
"epoch": 0.9103585657370518,
"grad_norm": 1.9230718298635634,
"learning_rate": 5.7017179744389276e-06,
"loss": 0.1105,
"step": 457
},
{
"epoch": 0.9123505976095617,
"grad_norm": 2.218085940556976,
"learning_rate": 5.686224027194682e-06,
"loss": 0.1313,
"step": 458
},
{
"epoch": 0.9143426294820717,
"grad_norm": 1.8057081110378266,
"learning_rate": 5.6707233610549505e-06,
"loss": 0.1248,
"step": 459
},
{
"epoch": 0.9163346613545816,
"grad_norm": 2.3840484640685715,
"learning_rate": 5.655216127788472e-06,
"loss": 0.1432,
"step": 460
},
{
"epoch": 0.9183266932270916,
"grad_norm": 2.020040126603478,
"learning_rate": 5.639702479228286e-06,
"loss": 0.1113,
"step": 461
},
{
"epoch": 0.9203187250996016,
"grad_norm": 2.074741713662112,
"learning_rate": 5.6241825672702444e-06,
"loss": 0.1067,
"step": 462
},
{
"epoch": 0.9223107569721115,
"grad_norm": 2.0160262890682854,
"learning_rate": 5.608656543871524e-06,
"loss": 0.1058,
"step": 463
},
{
"epoch": 0.9243027888446215,
"grad_norm": 1.576835822191225,
"learning_rate": 5.593124561049141e-06,
"loss": 0.1148,
"step": 464
},
{
"epoch": 0.9262948207171314,
"grad_norm": 1.7157626959132268,
"learning_rate": 5.57758677087846e-06,
"loss": 0.101,
"step": 465
},
{
"epoch": 0.9282868525896414,
"grad_norm": 2.0879882110494465,
"learning_rate": 5.5620433254917075e-06,
"loss": 0.1223,
"step": 466
},
{
"epoch": 0.9302788844621513,
"grad_norm": 1.6905070173375571,
"learning_rate": 5.546494377076478e-06,
"loss": 0.0903,
"step": 467
},
{
"epoch": 0.9322709163346613,
"grad_norm": 1.8563993306347122,
"learning_rate": 5.530940077874248e-06,
"loss": 0.1065,
"step": 468
},
{
"epoch": 0.9342629482071713,
"grad_norm": 1.9938558563881068,
"learning_rate": 5.515380580178887e-06,
"loss": 0.1157,
"step": 469
},
{
"epoch": 0.9362549800796812,
"grad_norm": 1.9502947540268583,
"learning_rate": 5.499816036335157e-06,
"loss": 0.1213,
"step": 470
},
{
"epoch": 0.9382470119521913,
"grad_norm": 2.0454369850534997,
"learning_rate": 5.484246598737234e-06,
"loss": 0.1321,
"step": 471
},
{
"epoch": 0.9402390438247012,
"grad_norm": 2.215669709261003,
"learning_rate": 5.468672419827208e-06,
"loss": 0.1102,
"step": 472
},
{
"epoch": 0.9422310756972112,
"grad_norm": 1.9756062583329823,
"learning_rate": 5.453093652093588e-06,
"loss": 0.1232,
"step": 473
},
{
"epoch": 0.9442231075697212,
"grad_norm": 1.799263076671508,
"learning_rate": 5.437510448069815e-06,
"loss": 0.1081,
"step": 474
},
{
"epoch": 0.9462151394422311,
"grad_norm": 2.3780944126238475,
"learning_rate": 5.421922960332767e-06,
"loss": 0.1121,
"step": 475
},
{
"epoch": 0.9482071713147411,
"grad_norm": 1.8280247291319718,
"learning_rate": 5.406331341501264e-06,
"loss": 0.1324,
"step": 476
},
{
"epoch": 0.950199203187251,
"grad_norm": 2.15631656729653,
"learning_rate": 5.390735744234573e-06,
"loss": 0.1287,
"step": 477
},
{
"epoch": 0.952191235059761,
"grad_norm": 1.7263537522584287,
"learning_rate": 5.375136321230915e-06,
"loss": 0.086,
"step": 478
},
{
"epoch": 0.954183266932271,
"grad_norm": 2.3095667041223753,
"learning_rate": 5.359533225225971e-06,
"loss": 0.1374,
"step": 479
},
{
"epoch": 0.9561752988047809,
"grad_norm": 1.7602753374015587,
"learning_rate": 5.34392660899138e-06,
"loss": 0.1078,
"step": 480
},
{
"epoch": 0.9581673306772909,
"grad_norm": 2.1582500460632263,
"learning_rate": 5.328316625333251e-06,
"loss": 0.1415,
"step": 481
},
{
"epoch": 0.9601593625498008,
"grad_norm": 1.8078245176783199,
"learning_rate": 5.312703427090665e-06,
"loss": 0.1202,
"step": 482
},
{
"epoch": 0.9621513944223108,
"grad_norm": 1.6747521348323762,
"learning_rate": 5.297087167134176e-06,
"loss": 0.1105,
"step": 483
},
{
"epoch": 0.9641434262948207,
"grad_norm": 2.108324269500925,
"learning_rate": 5.281467998364314e-06,
"loss": 0.1166,
"step": 484
},
{
"epoch": 0.9661354581673307,
"grad_norm": 1.899301062782763,
"learning_rate": 5.265846073710093e-06,
"loss": 0.1032,
"step": 485
},
{
"epoch": 0.9681274900398407,
"grad_norm": 1.7986533480735443,
"learning_rate": 5.250221546127508e-06,
"loss": 0.1023,
"step": 486
},
{
"epoch": 0.9701195219123506,
"grad_norm": 1.7143716184742455,
"learning_rate": 5.2345945685980404e-06,
"loss": 0.1056,
"step": 487
},
{
"epoch": 0.9721115537848606,
"grad_norm": 1.8989132351788962,
"learning_rate": 5.218965294127155e-06,
"loss": 0.1246,
"step": 488
},
{
"epoch": 0.9741035856573705,
"grad_norm": 1.692065620829879,
"learning_rate": 5.203333875742814e-06,
"loss": 0.1002,
"step": 489
},
{
"epoch": 0.9760956175298805,
"grad_norm": 2.1756954006047207,
"learning_rate": 5.187700466493966e-06,
"loss": 0.1316,
"step": 490
},
{
"epoch": 0.9780876494023905,
"grad_norm": 1.5992575019291566,
"learning_rate": 5.1720652194490504e-06,
"loss": 0.0879,
"step": 491
},
{
"epoch": 0.9800796812749004,
"grad_norm": 1.8188244998440222,
"learning_rate": 5.156428287694508e-06,
"loss": 0.113,
"step": 492
},
{
"epoch": 0.9820717131474104,
"grad_norm": 1.8310353091664167,
"learning_rate": 5.140789824333266e-06,
"loss": 0.0925,
"step": 493
},
{
"epoch": 0.9840637450199203,
"grad_norm": 2.2645161516308674,
"learning_rate": 5.125149982483255e-06,
"loss": 0.1333,
"step": 494
},
{
"epoch": 0.9860557768924303,
"grad_norm": 2.38898722297588,
"learning_rate": 5.109508915275898e-06,
"loss": 0.1262,
"step": 495
},
{
"epoch": 0.9880478087649402,
"grad_norm": 1.894617587966868,
"learning_rate": 5.093866775854618e-06,
"loss": 0.0988,
"step": 496
},
{
"epoch": 0.9900398406374502,
"grad_norm": 1.8643326954112833,
"learning_rate": 5.078223717373334e-06,
"loss": 0.1151,
"step": 497
},
{
"epoch": 0.9920318725099602,
"grad_norm": 2.381725171456074,
"learning_rate": 5.062579892994966e-06,
"loss": 0.1243,
"step": 498
},
{
"epoch": 0.9940239043824701,
"grad_norm": 1.7622591670635168,
"learning_rate": 5.046935455889933e-06,
"loss": 0.1052,
"step": 499
},
{
"epoch": 0.9960159362549801,
"grad_norm": 2.133367063969825,
"learning_rate": 5.03129055923465e-06,
"loss": 0.1098,
"step": 500
},
{
"epoch": 0.99800796812749,
"grad_norm": 1.800851217571952,
"learning_rate": 5.0156453562100325e-06,
"loss": 0.1187,
"step": 501
},
{
"epoch": 1.0,
"grad_norm": 1.515740716232249,
"learning_rate": 5e-06,
"loss": 0.0727,
"step": 502
},
{
"epoch": 1.00199203187251,
"grad_norm": 1.454807758568271,
"learning_rate": 4.984354643789968e-06,
"loss": 0.0645,
"step": 503
},
{
"epoch": 1.00398406374502,
"grad_norm": 1.0906220710604444,
"learning_rate": 4.968709440765352e-06,
"loss": 0.0463,
"step": 504
},
{
"epoch": 1.0059760956175299,
"grad_norm": 1.5597035712618907,
"learning_rate": 4.953064544110069e-06,
"loss": 0.0638,
"step": 505
},
{
"epoch": 1.0079681274900398,
"grad_norm": 1.3271308187362474,
"learning_rate": 4.9374201070050345e-06,
"loss": 0.0564,
"step": 506
},
{
"epoch": 1.0099601593625498,
"grad_norm": 1.6371457735090003,
"learning_rate": 4.9217762826266665e-06,
"loss": 0.0652,
"step": 507
},
{
"epoch": 1.0119521912350598,
"grad_norm": 1.3010690622611256,
"learning_rate": 4.906133224145384e-06,
"loss": 0.0478,
"step": 508
},
{
"epoch": 1.0139442231075697,
"grad_norm": 1.6385417730692136,
"learning_rate": 4.8904910847241025e-06,
"loss": 0.0571,
"step": 509
},
{
"epoch": 1.0159362549800797,
"grad_norm": 1.460130563576597,
"learning_rate": 4.874850017516746e-06,
"loss": 0.0478,
"step": 510
},
{
"epoch": 1.0179282868525896,
"grad_norm": 1.218172055244865,
"learning_rate": 4.8592101756667345e-06,
"loss": 0.0406,
"step": 511
},
{
"epoch": 1.0199203187250996,
"grad_norm": 1.380650876589631,
"learning_rate": 4.843571712305493e-06,
"loss": 0.0488,
"step": 512
},
{
"epoch": 1.0219123505976095,
"grad_norm": 1.4404478206417628,
"learning_rate": 4.82793478055095e-06,
"loss": 0.0423,
"step": 513
},
{
"epoch": 1.0239043824701195,
"grad_norm": 1.4907418600568456,
"learning_rate": 4.8122995335060365e-06,
"loss": 0.0524,
"step": 514
},
{
"epoch": 1.0258964143426295,
"grad_norm": 1.3967454054494113,
"learning_rate": 4.796666124257187e-06,
"loss": 0.0427,
"step": 515
},
{
"epoch": 1.0278884462151394,
"grad_norm": 1.3774126162481908,
"learning_rate": 4.781034705872846e-06,
"loss": 0.0424,
"step": 516
},
{
"epoch": 1.0298804780876494,
"grad_norm": 1.4995564243577384,
"learning_rate": 4.765405431401961e-06,
"loss": 0.0429,
"step": 517
},
{
"epoch": 1.0318725099601593,
"grad_norm": 1.4776829368859978,
"learning_rate": 4.7497784538724925e-06,
"loss": 0.0439,
"step": 518
},
{
"epoch": 1.0338645418326693,
"grad_norm": 1.5766123608089488,
"learning_rate": 4.7341539262899075e-06,
"loss": 0.047,
"step": 519
},
{
"epoch": 1.0358565737051793,
"grad_norm": 2.010013759539753,
"learning_rate": 4.7185320016356865e-06,
"loss": 0.0495,
"step": 520
},
{
"epoch": 1.0378486055776892,
"grad_norm": 1.7137673308746173,
"learning_rate": 4.7029128328658255e-06,
"loss": 0.0519,
"step": 521
},
{
"epoch": 1.0398406374501992,
"grad_norm": 1.8846039575121651,
"learning_rate": 4.687296572909336e-06,
"loss": 0.0518,
"step": 522
},
{
"epoch": 1.0418326693227091,
"grad_norm": 1.517157653051372,
"learning_rate": 4.671683374666751e-06,
"loss": 0.0376,
"step": 523
},
{
"epoch": 1.043824701195219,
"grad_norm": 1.5477788120437967,
"learning_rate": 4.656073391008622e-06,
"loss": 0.0509,
"step": 524
},
{
"epoch": 1.045816733067729,
"grad_norm": 1.5961149461761268,
"learning_rate": 4.64046677477403e-06,
"loss": 0.0406,
"step": 525
},
{
"epoch": 1.047808764940239,
"grad_norm": 1.7872243106969083,
"learning_rate": 4.624863678769086e-06,
"loss": 0.0484,
"step": 526
},
{
"epoch": 1.049800796812749,
"grad_norm": 2.0305402714348038,
"learning_rate": 4.609264255765429e-06,
"loss": 0.06,
"step": 527
},
{
"epoch": 1.051792828685259,
"grad_norm": 1.5196304776495169,
"learning_rate": 4.593668658498737e-06,
"loss": 0.0379,
"step": 528
},
{
"epoch": 1.0537848605577689,
"grad_norm": 1.8563914910463752,
"learning_rate": 4.578077039667235e-06,
"loss": 0.0579,
"step": 529
},
{
"epoch": 1.0557768924302788,
"grad_norm": 2.0399962964080927,
"learning_rate": 4.562489551930187e-06,
"loss": 0.0554,
"step": 530
},
{
"epoch": 1.0577689243027888,
"grad_norm": 1.5558523714376704,
"learning_rate": 4.546906347906414e-06,
"loss": 0.0487,
"step": 531
},
{
"epoch": 1.0597609561752988,
"grad_norm": 2.067131962038375,
"learning_rate": 4.531327580172794e-06,
"loss": 0.0616,
"step": 532
},
{
"epoch": 1.0617529880478087,
"grad_norm": 1.8515752338818168,
"learning_rate": 4.515753401262767e-06,
"loss": 0.0476,
"step": 533
},
{
"epoch": 1.0637450199203187,
"grad_norm": 1.6512814928931692,
"learning_rate": 4.5001839636648456e-06,
"loss": 0.0457,
"step": 534
},
{
"epoch": 1.0657370517928286,
"grad_norm": 1.6309985963797304,
"learning_rate": 4.484619419821116e-06,
"loss": 0.0479,
"step": 535
},
{
"epoch": 1.0677290836653386,
"grad_norm": 1.3534892174684972,
"learning_rate": 4.469059922125753e-06,
"loss": 0.0351,
"step": 536
},
{
"epoch": 1.0697211155378485,
"grad_norm": 1.5618587225235019,
"learning_rate": 4.453505622923524e-06,
"loss": 0.0514,
"step": 537
},
{
"epoch": 1.0717131474103585,
"grad_norm": 1.8799695188430705,
"learning_rate": 4.437956674508295e-06,
"loss": 0.0564,
"step": 538
},
{
"epoch": 1.0737051792828685,
"grad_norm": 1.6371009327782438,
"learning_rate": 4.422413229121541e-06,
"loss": 0.05,
"step": 539
},
{
"epoch": 1.0756972111553784,
"grad_norm": 1.7149994790253227,
"learning_rate": 4.4068754389508616e-06,
"loss": 0.0526,
"step": 540
},
{
"epoch": 1.0776892430278884,
"grad_norm": 1.5057791531519986,
"learning_rate": 4.391343456128479e-06,
"loss": 0.0401,
"step": 541
},
{
"epoch": 1.0796812749003983,
"grad_norm": 1.6556428738570654,
"learning_rate": 4.375817432729759e-06,
"loss": 0.0501,
"step": 542
},
{
"epoch": 1.0816733067729083,
"grad_norm": 1.8035332314518027,
"learning_rate": 4.360297520771716e-06,
"loss": 0.0598,
"step": 543
},
{
"epoch": 1.0836653386454183,
"grad_norm": 1.9373725794619105,
"learning_rate": 4.34478387221153e-06,
"loss": 0.0493,
"step": 544
},
{
"epoch": 1.0856573705179282,
"grad_norm": 1.2034769948690562,
"learning_rate": 4.329276638945051e-06,
"loss": 0.0397,
"step": 545
},
{
"epoch": 1.0876494023904382,
"grad_norm": 1.329498987282073,
"learning_rate": 4.3137759728053206e-06,
"loss": 0.0489,
"step": 546
},
{
"epoch": 1.0896414342629481,
"grad_norm": 1.3953725126886292,
"learning_rate": 4.298282025561076e-06,
"loss": 0.0478,
"step": 547
},
{
"epoch": 1.091633466135458,
"grad_norm": 1.561219438568973,
"learning_rate": 4.282794948915271e-06,
"loss": 0.0476,
"step": 548
},
{
"epoch": 1.093625498007968,
"grad_norm": 1.7067688119812607,
"learning_rate": 4.267314894503591e-06,
"loss": 0.0413,
"step": 549
},
{
"epoch": 1.095617529880478,
"grad_norm": 1.4480366291730635,
"learning_rate": 4.2518420138929645e-06,
"loss": 0.0452,
"step": 550
},
{
"epoch": 1.097609561752988,
"grad_norm": 1.8458558711170783,
"learning_rate": 4.2363764585800775e-06,
"loss": 0.0598,
"step": 551
},
{
"epoch": 1.099601593625498,
"grad_norm": 1.60919784354473,
"learning_rate": 4.220918379989898e-06,
"loss": 0.0578,
"step": 552
},
{
"epoch": 1.1015936254980079,
"grad_norm": 1.5149797237579112,
"learning_rate": 4.205467929474186e-06,
"loss": 0.0414,
"step": 553
},
{
"epoch": 1.1035856573705178,
"grad_norm": 1.9355788387540067,
"learning_rate": 4.190025258310013e-06,
"loss": 0.0605,
"step": 554
},
{
"epoch": 1.1055776892430278,
"grad_norm": 1.2712828264468141,
"learning_rate": 4.174590517698284e-06,
"loss": 0.0476,
"step": 555
},
{
"epoch": 1.1075697211155378,
"grad_norm": 1.7052649700043325,
"learning_rate": 4.159163858762255e-06,
"loss": 0.0448,
"step": 556
},
{
"epoch": 1.1095617529880477,
"grad_norm": 1.4892279816307104,
"learning_rate": 4.143745432546053e-06,
"loss": 0.0454,
"step": 557
},
{
"epoch": 1.1115537848605577,
"grad_norm": 1.2192774756681568,
"learning_rate": 4.1283353900131965e-06,
"loss": 0.0389,
"step": 558
},
{
"epoch": 1.1135458167330676,
"grad_norm": 1.748484313536612,
"learning_rate": 4.112933882045121e-06,
"loss": 0.0527,
"step": 559
},
{
"epoch": 1.1155378486055776,
"grad_norm": 1.5189552368858088,
"learning_rate": 4.097541059439698e-06,
"loss": 0.0538,
"step": 560
},
{
"epoch": 1.1175298804780875,
"grad_norm": 1.4923045394934673,
"learning_rate": 4.082157072909757e-06,
"loss": 0.0494,
"step": 561
},
{
"epoch": 1.1195219123505975,
"grad_norm": 1.7886352200727111,
"learning_rate": 4.066782073081616e-06,
"loss": 0.0516,
"step": 562
},
{
"epoch": 1.1215139442231075,
"grad_norm": 1.6403729404088143,
"learning_rate": 4.0514162104936025e-06,
"loss": 0.0504,
"step": 563
},
{
"epoch": 1.1235059760956174,
"grad_norm": 1.794559862930424,
"learning_rate": 4.036059635594578e-06,
"loss": 0.0539,
"step": 564
},
{
"epoch": 1.1254980079681274,
"grad_norm": 1.27443198183184,
"learning_rate": 4.020712498742469e-06,
"loss": 0.033,
"step": 565
},
{
"epoch": 1.1274900398406373,
"grad_norm": 1.528559195651957,
"learning_rate": 4.005374950202795e-06,
"loss": 0.0485,
"step": 566
},
{
"epoch": 1.1294820717131473,
"grad_norm": 1.6682443502200057,
"learning_rate": 3.990047140147192e-06,
"loss": 0.0472,
"step": 567
},
{
"epoch": 1.1314741035856573,
"grad_norm": 1.5977930972583627,
"learning_rate": 3.974729218651946e-06,
"loss": 0.0435,
"step": 568
},
{
"epoch": 1.1334661354581672,
"grad_norm": 1.8138279047378771,
"learning_rate": 3.959421335696524e-06,
"loss": 0.0525,
"step": 569
},
{
"epoch": 1.1354581673306772,
"grad_norm": 1.861112915058331,
"learning_rate": 3.944123641162106e-06,
"loss": 0.0539,
"step": 570
},
{
"epoch": 1.1374501992031871,
"grad_norm": 1.5212525579613898,
"learning_rate": 3.928836284830113e-06,
"loss": 0.0493,
"step": 571
},
{
"epoch": 1.139442231075697,
"grad_norm": 1.3446527061200657,
"learning_rate": 3.913559416380743e-06,
"loss": 0.0499,
"step": 572
},
{
"epoch": 1.1414342629482073,
"grad_norm": 1.2939076783357535,
"learning_rate": 3.898293185391509e-06,
"loss": 0.0323,
"step": 573
},
{
"epoch": 1.1434262948207172,
"grad_norm": 1.5273046564926365,
"learning_rate": 3.883037741335772e-06,
"loss": 0.0437,
"step": 574
},
{
"epoch": 1.1454183266932272,
"grad_norm": 1.3539627956837923,
"learning_rate": 3.867793233581272e-06,
"loss": 0.047,
"step": 575
},
{
"epoch": 1.1474103585657371,
"grad_norm": 1.2884254132839787,
"learning_rate": 3.852559811388676e-06,
"loss": 0.0435,
"step": 576
},
{
"epoch": 1.149402390438247,
"grad_norm": 1.3084774623960684,
"learning_rate": 3.8373376239101076e-06,
"loss": 0.0483,
"step": 577
},
{
"epoch": 1.151394422310757,
"grad_norm": 1.242637903895707,
"learning_rate": 3.822126820187691e-06,
"loss": 0.0313,
"step": 578
},
{
"epoch": 1.153386454183267,
"grad_norm": 1.4593599661421381,
"learning_rate": 3.806927549152091e-06,
"loss": 0.0401,
"step": 579
},
{
"epoch": 1.155378486055777,
"grad_norm": 1.5535605047130172,
"learning_rate": 3.791739959621054e-06,
"loss": 0.0527,
"step": 580
},
{
"epoch": 1.157370517928287,
"grad_norm": 1.7540657914683675,
"learning_rate": 3.776564200297953e-06,
"loss": 0.0527,
"step": 581
},
{
"epoch": 1.159362549800797,
"grad_norm": 1.57101285496225,
"learning_rate": 3.761400419770328e-06,
"loss": 0.0395,
"step": 582
},
{
"epoch": 1.1613545816733069,
"grad_norm": 1.7555114294741245,
"learning_rate": 3.746248766508435e-06,
"loss": 0.0434,
"step": 583
},
{
"epoch": 1.1633466135458168,
"grad_norm": 2.1592609021335227,
"learning_rate": 3.7311093888637906e-06,
"loss": 0.0495,
"step": 584
},
{
"epoch": 1.1653386454183268,
"grad_norm": 1.2940156736695578,
"learning_rate": 3.7159824350677177e-06,
"loss": 0.0387,
"step": 585
},
{
"epoch": 1.1673306772908367,
"grad_norm": 1.5202353281570202,
"learning_rate": 3.7008680532298962e-06,
"loss": 0.0432,
"step": 586
},
{
"epoch": 1.1693227091633467,
"grad_norm": 1.4400071402140493,
"learning_rate": 3.685766391336916e-06,
"loss": 0.0463,
"step": 587
},
{
"epoch": 1.1713147410358566,
"grad_norm": 1.7100573375487627,
"learning_rate": 3.670677597250819e-06,
"loss": 0.0511,
"step": 588
},
{
"epoch": 1.1733067729083666,
"grad_norm": 1.68459514966065,
"learning_rate": 3.6556018187076624e-06,
"loss": 0.0478,
"step": 589
},
{
"epoch": 1.1752988047808766,
"grad_norm": 1.6034087639429946,
"learning_rate": 3.6405392033160637e-06,
"loss": 0.04,
"step": 590
},
{
"epoch": 1.1772908366533865,
"grad_norm": 1.7092195009028075,
"learning_rate": 3.6254898985557598e-06,
"loss": 0.0474,
"step": 591
},
{
"epoch": 1.1792828685258965,
"grad_norm": 1.5626916752981477,
"learning_rate": 3.6104540517761594e-06,
"loss": 0.0551,
"step": 592
},
{
"epoch": 1.1812749003984064,
"grad_norm": 1.3965756525646038,
"learning_rate": 3.5954318101949047e-06,
"loss": 0.0369,
"step": 593
},
{
"epoch": 1.1832669322709164,
"grad_norm": 1.8211192187451488,
"learning_rate": 3.580423320896429e-06,
"loss": 0.0583,
"step": 594
},
{
"epoch": 1.1852589641434264,
"grad_norm": 1.348709853436774,
"learning_rate": 3.5654287308305137e-06,
"loss": 0.0385,
"step": 595
},
{
"epoch": 1.1872509960159363,
"grad_norm": 1.5559379215487086,
"learning_rate": 3.55044818681085e-06,
"loss": 0.0456,
"step": 596
},
{
"epoch": 1.1892430278884463,
"grad_norm": 2.0539321664502976,
"learning_rate": 3.5354818355136058e-06,
"loss": 0.0677,
"step": 597
},
{
"epoch": 1.1912350597609562,
"grad_norm": 1.4411394411205194,
"learning_rate": 3.5205298234759854e-06,
"loss": 0.0451,
"step": 598
},
{
"epoch": 1.1932270916334662,
"grad_norm": 1.7774567393386194,
"learning_rate": 3.5055922970947943e-06,
"loss": 0.0492,
"step": 599
},
{
"epoch": 1.1952191235059761,
"grad_norm": 1.5120643301720722,
"learning_rate": 3.4906694026250075e-06,
"loss": 0.0457,
"step": 600
},
{
"epoch": 1.1952191235059761,
"eval_loss": 0.11691030859947205,
"eval_runtime": 3.1386,
"eval_samples_per_second": 13.063,
"eval_steps_per_second": 3.505,
"step": 600
},
{
"epoch": 1.197211155378486,
"grad_norm": 1.364354387569068,
"learning_rate": 3.475761286178341e-06,
"loss": 0.0498,
"step": 601
},
{
"epoch": 1.199203187250996,
"grad_norm": 1.3894575596183438,
"learning_rate": 3.460868093721812e-06,
"loss": 0.038,
"step": 602
},
{
"epoch": 1.201195219123506,
"grad_norm": 1.8217847559393374,
"learning_rate": 3.44598997107632e-06,
"loss": 0.0567,
"step": 603
},
{
"epoch": 1.203187250996016,
"grad_norm": 2.1685604089404857,
"learning_rate": 3.431127063915213e-06,
"loss": 0.071,
"step": 604
},
{
"epoch": 1.205179282868526,
"grad_norm": 1.7472889111167405,
"learning_rate": 3.416279517762858e-06,
"loss": 0.0504,
"step": 605
},
{
"epoch": 1.207171314741036,
"grad_norm": 1.789030781154645,
"learning_rate": 3.4014474779932295e-06,
"loss": 0.0535,
"step": 606
},
{
"epoch": 1.2091633466135459,
"grad_norm": 1.5379198221444872,
"learning_rate": 3.386631089828468e-06,
"loss": 0.05,
"step": 607
},
{
"epoch": 1.2111553784860558,
"grad_norm": 1.3874042612095352,
"learning_rate": 3.371830498337475e-06,
"loss": 0.0373,
"step": 608
},
{
"epoch": 1.2131474103585658,
"grad_norm": 1.5231435277419703,
"learning_rate": 3.35704584843448e-06,
"loss": 0.037,
"step": 609
},
{
"epoch": 1.2151394422310757,
"grad_norm": 1.6029887156514784,
"learning_rate": 3.342277284877629e-06,
"loss": 0.0461,
"step": 610
},
{
"epoch": 1.2171314741035857,
"grad_norm": 1.5816632438715723,
"learning_rate": 3.3275249522675656e-06,
"loss": 0.046,
"step": 611
},
{
"epoch": 1.2191235059760956,
"grad_norm": 1.5428519312912603,
"learning_rate": 3.3127889950460094e-06,
"loss": 0.0472,
"step": 612
},
{
"epoch": 1.2211155378486056,
"grad_norm": 1.625742641074798,
"learning_rate": 3.2980695574943532e-06,
"loss": 0.0437,
"step": 613
},
{
"epoch": 1.2231075697211156,
"grad_norm": 1.3751070319547616,
"learning_rate": 3.28336678373224e-06,
"loss": 0.0375,
"step": 614
},
{
"epoch": 1.2250996015936255,
"grad_norm": 1.9737969684729024,
"learning_rate": 3.268680817716158e-06,
"loss": 0.0585,
"step": 615
},
{
"epoch": 1.2270916334661355,
"grad_norm": 1.9490150643596276,
"learning_rate": 3.254011803238026e-06,
"loss": 0.0504,
"step": 616
},
{
"epoch": 1.2290836653386454,
"grad_norm": 1.332216726824679,
"learning_rate": 3.2393598839237903e-06,
"loss": 0.0342,
"step": 617
},
{
"epoch": 1.2310756972111554,
"grad_norm": 2.1004912830561744,
"learning_rate": 3.22472520323202e-06,
"loss": 0.0488,
"step": 618
},
{
"epoch": 1.2330677290836654,
"grad_norm": 2.068005000040964,
"learning_rate": 3.2101079044524895e-06,
"loss": 0.065,
"step": 619
},
{
"epoch": 1.2350597609561753,
"grad_norm": 1.2584068242022144,
"learning_rate": 3.195508130704795e-06,
"loss": 0.0378,
"step": 620
},
{
"epoch": 1.2370517928286853,
"grad_norm": 1.3495383443316602,
"learning_rate": 3.1809260249369373e-06,
"loss": 0.0377,
"step": 621
},
{
"epoch": 1.2390438247011952,
"grad_norm": 1.74209098701691,
"learning_rate": 3.1663617299239303e-06,
"loss": 0.0582,
"step": 622
},
{
"epoch": 1.2410358565737052,
"grad_norm": 2.3541586375494767,
"learning_rate": 3.1518153882663994e-06,
"loss": 0.0658,
"step": 623
},
{
"epoch": 1.2430278884462151,
"grad_norm": 1.5107261868901234,
"learning_rate": 3.1372871423891894e-06,
"loss": 0.0413,
"step": 624
},
{
"epoch": 1.245019920318725,
"grad_norm": 2.007995750567458,
"learning_rate": 3.1227771345399647e-06,
"loss": 0.0477,
"step": 625
},
{
"epoch": 1.247011952191235,
"grad_norm": 1.3321234729078801,
"learning_rate": 3.1082855067878182e-06,
"loss": 0.0353,
"step": 626
},
{
"epoch": 1.249003984063745,
"grad_norm": 1.5773251846473662,
"learning_rate": 3.093812401021885e-06,
"loss": 0.0446,
"step": 627
},
{
"epoch": 1.250996015936255,
"grad_norm": 1.7010297581371503,
"learning_rate": 3.079357958949946e-06,
"loss": 0.0489,
"step": 628
},
{
"epoch": 1.252988047808765,
"grad_norm": 1.4948785976133703,
"learning_rate": 3.0649223220970458e-06,
"loss": 0.0346,
"step": 629
},
{
"epoch": 1.254980079681275,
"grad_norm": 1.580233346726061,
"learning_rate": 3.050505631804105e-06,
"loss": 0.0397,
"step": 630
},
{
"epoch": 1.2569721115537849,
"grad_norm": 1.4427856316940244,
"learning_rate": 3.0361080292265354e-06,
"loss": 0.0462,
"step": 631
},
{
"epoch": 1.2589641434262948,
"grad_norm": 1.5405358822480004,
"learning_rate": 3.021729655332858e-06,
"loss": 0.0446,
"step": 632
},
{
"epoch": 1.2609561752988048,
"grad_norm": 1.4069898266648972,
"learning_rate": 3.0073706509033257e-06,
"loss": 0.0481,
"step": 633
},
{
"epoch": 1.2629482071713147,
"grad_norm": 1.46434016713666,
"learning_rate": 2.993031156528542e-06,
"loss": 0.0405,
"step": 634
},
{
"epoch": 1.2649402390438247,
"grad_norm": 1.535938715676213,
"learning_rate": 2.978711312608084e-06,
"loss": 0.0404,
"step": 635
},
{
"epoch": 1.2669322709163346,
"grad_norm": 1.6527329484718531,
"learning_rate": 2.9644112593491315e-06,
"loss": 0.0378,
"step": 636
},
{
"epoch": 1.2689243027888446,
"grad_norm": 1.3342944497270683,
"learning_rate": 2.9501311367650908e-06,
"loss": 0.0458,
"step": 637
},
{
"epoch": 1.2709163346613546,
"grad_norm": 2.0538614960525874,
"learning_rate": 2.9358710846742237e-06,
"loss": 0.0536,
"step": 638
},
{
"epoch": 1.2729083665338645,
"grad_norm": 1.905491659479835,
"learning_rate": 2.92163124269828e-06,
"loss": 0.0531,
"step": 639
},
{
"epoch": 1.2749003984063745,
"grad_norm": 1.374868869463802,
"learning_rate": 2.90741175026113e-06,
"loss": 0.0371,
"step": 640
},
{
"epoch": 1.2768924302788844,
"grad_norm": 1.4720536952560632,
"learning_rate": 2.8932127465874004e-06,
"loss": 0.0444,
"step": 641
},
{
"epoch": 1.2788844621513944,
"grad_norm": 1.756270327258482,
"learning_rate": 2.8790343707011114e-06,
"loss": 0.0508,
"step": 642
},
{
"epoch": 1.2808764940239044,
"grad_norm": 1.1919142011447958,
"learning_rate": 2.864876761424309e-06,
"loss": 0.0361,
"step": 643
},
{
"epoch": 1.2828685258964143,
"grad_norm": 1.392440748957012,
"learning_rate": 2.850740057375716e-06,
"loss": 0.0379,
"step": 644
},
{
"epoch": 1.2848605577689243,
"grad_norm": 1.3950602927829037,
"learning_rate": 2.8366243969693674e-06,
"loss": 0.0338,
"step": 645
},
{
"epoch": 1.2868525896414342,
"grad_norm": 1.5201761557299762,
"learning_rate": 2.822529918413259e-06,
"loss": 0.0411,
"step": 646
},
{
"epoch": 1.2888446215139442,
"grad_norm": 1.3120599984550645,
"learning_rate": 2.8084567597079915e-06,
"loss": 0.0402,
"step": 647
},
{
"epoch": 1.2908366533864541,
"grad_norm": 1.4249084044129727,
"learning_rate": 2.7944050586454215e-06,
"loss": 0.0416,
"step": 648
},
{
"epoch": 1.292828685258964,
"grad_norm": 1.7782238234601455,
"learning_rate": 2.7803749528073108e-06,
"loss": 0.0488,
"step": 649
},
{
"epoch": 1.294820717131474,
"grad_norm": 1.6503484254463683,
"learning_rate": 2.7663665795639815e-06,
"loss": 0.0495,
"step": 650
},
{
"epoch": 1.296812749003984,
"grad_norm": 1.5251619493297126,
"learning_rate": 2.752380076072967e-06,
"loss": 0.0503,
"step": 651
},
{
"epoch": 1.298804780876494,
"grad_norm": 1.3125100617788958,
"learning_rate": 2.7384155792776724e-06,
"loss": 0.0356,
"step": 652
},
{
"epoch": 1.300796812749004,
"grad_norm": 1.5295843374836597,
"learning_rate": 2.7244732259060335e-06,
"loss": 0.044,
"step": 653
},
{
"epoch": 1.302788844621514,
"grad_norm": 1.5090295425620295,
"learning_rate": 2.710553152469178e-06,
"loss": 0.0426,
"step": 654
},
{
"epoch": 1.3047808764940239,
"grad_norm": 1.2597916390063753,
"learning_rate": 2.6966554952600886e-06,
"loss": 0.0359,
"step": 655
},
{
"epoch": 1.3067729083665338,
"grad_norm": 1.8988044363314096,
"learning_rate": 2.682780390352262e-06,
"loss": 0.0575,
"step": 656
},
{
"epoch": 1.3087649402390438,
"grad_norm": 1.6686947788798556,
"learning_rate": 2.668927973598392e-06,
"loss": 0.0424,
"step": 657
},
{
"epoch": 1.3107569721115537,
"grad_norm": 1.681750415123868,
"learning_rate": 2.655098380629024e-06,
"loss": 0.047,
"step": 658
},
{
"epoch": 1.3127490039840637,
"grad_norm": 1.5736992626946802,
"learning_rate": 2.6412917468512354e-06,
"loss": 0.034,
"step": 659
},
{
"epoch": 1.3147410358565736,
"grad_norm": 1.2994141414494456,
"learning_rate": 2.627508207447308e-06,
"loss": 0.0323,
"step": 660
},
{
"epoch": 1.3167330677290836,
"grad_norm": 1.5957841782185134,
"learning_rate": 2.613747897373403e-06,
"loss": 0.0537,
"step": 661
},
{
"epoch": 1.3187250996015936,
"grad_norm": 1.62545382658515,
"learning_rate": 2.6000109513582417e-06,
"loss": 0.0523,
"step": 662
},
{
"epoch": 1.3207171314741035,
"grad_norm": 2.230115579239022,
"learning_rate": 2.5862975039017835e-06,
"loss": 0.0607,
"step": 663
},
{
"epoch": 1.3227091633466135,
"grad_norm": 1.405368943781722,
"learning_rate": 2.5726076892739127e-06,
"loss": 0.0448,
"step": 664
},
{
"epoch": 1.3247011952191234,
"grad_norm": 1.7969792652573766,
"learning_rate": 2.5589416415131215e-06,
"loss": 0.0517,
"step": 665
},
{
"epoch": 1.3266932270916334,
"grad_norm": 1.530151249792159,
"learning_rate": 2.5452994944251962e-06,
"loss": 0.0331,
"step": 666
},
{
"epoch": 1.3286852589641434,
"grad_norm": 1.4807902414940988,
"learning_rate": 2.531681381581913e-06,
"loss": 0.0378,
"step": 667
},
{
"epoch": 1.3306772908366533,
"grad_norm": 1.7906460874526582,
"learning_rate": 2.5180874363197217e-06,
"loss": 0.0409,
"step": 668
},
{
"epoch": 1.3326693227091633,
"grad_norm": 1.6166848567176595,
"learning_rate": 2.504517791738449e-06,
"loss": 0.0464,
"step": 669
},
{
"epoch": 1.3346613545816732,
"grad_norm": 1.6676564078427882,
"learning_rate": 2.4909725806999847e-06,
"loss": 0.0466,
"step": 670
},
{
"epoch": 1.3366533864541832,
"grad_norm": 1.8873962166889124,
"learning_rate": 2.4774519358269932e-06,
"loss": 0.0472,
"step": 671
},
{
"epoch": 1.3386454183266931,
"grad_norm": 1.498576664515062,
"learning_rate": 2.463955989501607e-06,
"loss": 0.0464,
"step": 672
},
{
"epoch": 1.340637450199203,
"grad_norm": 1.4936995031717264,
"learning_rate": 2.4504848738641313e-06,
"loss": 0.0319,
"step": 673
},
{
"epoch": 1.342629482071713,
"grad_norm": 1.7166013235605342,
"learning_rate": 2.437038720811752e-06,
"loss": 0.0458,
"step": 674
},
{
"epoch": 1.3446215139442232,
"grad_norm": 1.9084481577153407,
"learning_rate": 2.4236176619972436e-06,
"loss": 0.0479,
"step": 675
},
{
"epoch": 1.3466135458167332,
"grad_norm": 1.4070919253542853,
"learning_rate": 2.41022182882768e-06,
"loss": 0.0322,
"step": 676
},
{
"epoch": 1.3486055776892432,
"grad_norm": 2.0522106017301285,
"learning_rate": 2.3968513524631483e-06,
"loss": 0.0533,
"step": 677
},
{
"epoch": 1.3505976095617531,
"grad_norm": 1.4028748923424417,
"learning_rate": 2.3835063638154636e-06,
"loss": 0.0371,
"step": 678
},
{
"epoch": 1.352589641434263,
"grad_norm": 1.7132235697547435,
"learning_rate": 2.3701869935468893e-06,
"loss": 0.045,
"step": 679
},
{
"epoch": 1.354581673306773,
"grad_norm": 1.3468986017248528,
"learning_rate": 2.356893372068855e-06,
"loss": 0.0374,
"step": 680
},
{
"epoch": 1.356573705179283,
"grad_norm": 1.7023203932118756,
"learning_rate": 2.343625629540681e-06,
"loss": 0.0431,
"step": 681
},
{
"epoch": 1.358565737051793,
"grad_norm": 1.7614144791766995,
"learning_rate": 2.3303838958683077e-06,
"loss": 0.0469,
"step": 682
},
{
"epoch": 1.360557768924303,
"grad_norm": 1.713466874008201,
"learning_rate": 2.3171683007030117e-06,
"loss": 0.0403,
"step": 683
},
{
"epoch": 1.3625498007968129,
"grad_norm": 1.8072313944179408,
"learning_rate": 2.3039789734401524e-06,
"loss": 0.0514,
"step": 684
},
{
"epoch": 1.3645418326693228,
"grad_norm": 1.8285876817770546,
"learning_rate": 2.2908160432178937e-06,
"loss": 0.047,
"step": 685
},
{
"epoch": 1.3665338645418328,
"grad_norm": 1.835704830271512,
"learning_rate": 2.277679638915945e-06,
"loss": 0.0491,
"step": 686
},
{
"epoch": 1.3685258964143427,
"grad_norm": 1.809690467670344,
"learning_rate": 2.264569889154295e-06,
"loss": 0.0527,
"step": 687
},
{
"epoch": 1.3705179282868527,
"grad_norm": 1.3476668235326381,
"learning_rate": 2.251486922291957e-06,
"loss": 0.0385,
"step": 688
},
{
"epoch": 1.3725099601593627,
"grad_norm": 2.0206149912840194,
"learning_rate": 2.23843086642571e-06,
"loss": 0.0537,
"step": 689
},
{
"epoch": 1.3745019920318726,
"grad_norm": 1.5720430933164902,
"learning_rate": 2.225401849388842e-06,
"loss": 0.048,
"step": 690
},
{
"epoch": 1.3764940239043826,
"grad_norm": 1.499247928447303,
"learning_rate": 2.2123999987499015e-06,
"loss": 0.0422,
"step": 691
},
{
"epoch": 1.3784860557768925,
"grad_norm": 1.276340108431395,
"learning_rate": 2.1994254418114524e-06,
"loss": 0.0423,
"step": 692
},
{
"epoch": 1.3804780876494025,
"grad_norm": 1.5216579445173712,
"learning_rate": 2.186478305608819e-06,
"loss": 0.0412,
"step": 693
},
{
"epoch": 1.3824701195219125,
"grad_norm": 1.9598772663646258,
"learning_rate": 2.1735587169088435e-06,
"loss": 0.0477,
"step": 694
},
{
"epoch": 1.3844621513944224,
"grad_norm": 1.6029428282921836,
"learning_rate": 2.1606668022086517e-06,
"loss": 0.0481,
"step": 695
},
{
"epoch": 1.3864541832669324,
"grad_norm": 1.0893162038986566,
"learning_rate": 2.147802687734409e-06,
"loss": 0.031,
"step": 696
},
{
"epoch": 1.3884462151394423,
"grad_norm": 1.48589149356358,
"learning_rate": 2.1349664994400853e-06,
"loss": 0.0382,
"step": 697
},
{
"epoch": 1.3904382470119523,
"grad_norm": 1.8673421468251405,
"learning_rate": 2.122158363006223e-06,
"loss": 0.0605,
"step": 698
},
{
"epoch": 1.3924302788844622,
"grad_norm": 1.4982653691530576,
"learning_rate": 2.109378403838705e-06,
"loss": 0.0441,
"step": 699
},
{
"epoch": 1.3944223107569722,
"grad_norm": 1.6272971890188588,
"learning_rate": 2.0966267470675273e-06,
"loss": 0.0387,
"step": 700
},
{
"epoch": 1.3964143426294822,
"grad_norm": 1.3009436508973116,
"learning_rate": 2.0839035175455748e-06,
"loss": 0.0364,
"step": 701
},
{
"epoch": 1.3984063745019921,
"grad_norm": 1.5070361743640346,
"learning_rate": 2.071208839847397e-06,
"loss": 0.0384,
"step": 702
},
{
"epoch": 1.400398406374502,
"grad_norm": 1.8403315806429654,
"learning_rate": 2.0585428382679894e-06,
"loss": 0.0566,
"step": 703
},
{
"epoch": 1.402390438247012,
"grad_norm": 1.9768359787018837,
"learning_rate": 2.0459056368215786e-06,
"loss": 0.049,
"step": 704
},
{
"epoch": 1.404382470119522,
"grad_norm": 1.1994848396934092,
"learning_rate": 2.0332973592404027e-06,
"loss": 0.0357,
"step": 705
},
{
"epoch": 1.406374501992032,
"grad_norm": 1.4693306467977794,
"learning_rate": 2.0207181289735073e-06,
"loss": 0.0359,
"step": 706
},
{
"epoch": 1.408366533864542,
"grad_norm": 1.5649069322977223,
"learning_rate": 2.008168069185525e-06,
"loss": 0.042,
"step": 707
},
{
"epoch": 1.4103585657370519,
"grad_norm": 1.3393789831970144,
"learning_rate": 1.9956473027554846e-06,
"loss": 0.0383,
"step": 708
},
{
"epoch": 1.4123505976095618,
"grad_norm": 1.7830258693931829,
"learning_rate": 1.9831559522755976e-06,
"loss": 0.0469,
"step": 709
},
{
"epoch": 1.4143426294820718,
"grad_norm": 1.6147368592510511,
"learning_rate": 1.97069414005006e-06,
"loss": 0.0439,
"step": 710
},
{
"epoch": 1.4163346613545817,
"grad_norm": 1.3999929614334758,
"learning_rate": 1.9582619880938565e-06,
"loss": 0.0463,
"step": 711
},
{
"epoch": 1.4183266932270917,
"grad_norm": 1.836089297310526,
"learning_rate": 1.9458596181315643e-06,
"loss": 0.0573,
"step": 712
},
{
"epoch": 1.4203187250996017,
"grad_norm": 1.5850159120530725,
"learning_rate": 1.9334871515961616e-06,
"loss": 0.0378,
"step": 713
},
{
"epoch": 1.4223107569721116,
"grad_norm": 1.3946476121954683,
"learning_rate": 1.9211447096278403e-06,
"loss": 0.0405,
"step": 714
},
{
"epoch": 1.4243027888446216,
"grad_norm": 1.3008967894406476,
"learning_rate": 1.9088324130728164e-06,
"loss": 0.0326,
"step": 715
},
{
"epoch": 1.4262948207171315,
"grad_norm": 1.3349769350489,
"learning_rate": 1.8965503824821496e-06,
"loss": 0.0429,
"step": 716
},
{
"epoch": 1.4282868525896415,
"grad_norm": 1.6914951673768632,
"learning_rate": 1.8842987381105626e-06,
"loss": 0.0367,
"step": 717
},
{
"epoch": 1.4302788844621515,
"grad_norm": 1.3515680548078475,
"learning_rate": 1.872077599915263e-06,
"loss": 0.0379,
"step": 718
},
{
"epoch": 1.4322709163346614,
"grad_norm": 1.3377996405235246,
"learning_rate": 1.8598870875547691e-06,
"loss": 0.0402,
"step": 719
},
{
"epoch": 1.4342629482071714,
"grad_norm": 1.264515354052566,
"learning_rate": 1.84772732038774e-06,
"loss": 0.0323,
"step": 720
},
{
"epoch": 1.4362549800796813,
"grad_norm": 1.555389355360336,
"learning_rate": 1.8355984174717994e-06,
"loss": 0.0382,
"step": 721
},
{
"epoch": 1.4382470119521913,
"grad_norm": 1.3182929471725358,
"learning_rate": 1.8235004975623816e-06,
"loss": 0.0307,
"step": 722
},
{
"epoch": 1.4402390438247012,
"grad_norm": 1.4783822554476882,
"learning_rate": 1.811433679111561e-06,
"loss": 0.0391,
"step": 723
},
{
"epoch": 1.4422310756972112,
"grad_norm": 1.7708760927848832,
"learning_rate": 1.7993980802668947e-06,
"loss": 0.0391,
"step": 724
},
{
"epoch": 1.4442231075697212,
"grad_norm": 1.6784698019995254,
"learning_rate": 1.787393818870264e-06,
"loss": 0.0487,
"step": 725
},
{
"epoch": 1.4462151394422311,
"grad_norm": 1.2911428050631453,
"learning_rate": 1.7754210124567216e-06,
"loss": 0.0325,
"step": 726
},
{
"epoch": 1.448207171314741,
"grad_norm": 1.6758556031327247,
"learning_rate": 1.7634797782533436e-06,
"loss": 0.0378,
"step": 727
},
{
"epoch": 1.450199203187251,
"grad_norm": 1.319567821261113,
"learning_rate": 1.7515702331780753e-06,
"loss": 0.0368,
"step": 728
},
{
"epoch": 1.452191235059761,
"grad_norm": 1.2921652740567358,
"learning_rate": 1.7396924938385933e-06,
"loss": 0.0331,
"step": 729
},
{
"epoch": 1.454183266932271,
"grad_norm": 1.1429340732722788,
"learning_rate": 1.7278466765311597e-06,
"loss": 0.0336,
"step": 730
},
{
"epoch": 1.456175298804781,
"grad_norm": 1.8196882924447233,
"learning_rate": 1.7160328972394835e-06,
"loss": 0.0418,
"step": 731
},
{
"epoch": 1.4581673306772909,
"grad_norm": 1.6752103021159248,
"learning_rate": 1.7042512716335873e-06,
"loss": 0.0397,
"step": 732
},
{
"epoch": 1.4601593625498008,
"grad_norm": 1.797041780271012,
"learning_rate": 1.6925019150686744e-06,
"loss": 0.055,
"step": 733
},
{
"epoch": 1.4621513944223108,
"grad_norm": 1.4651037298048166,
"learning_rate": 1.6807849425839933e-06,
"loss": 0.0361,
"step": 734
},
{
"epoch": 1.4641434262948207,
"grad_norm": 1.1462478664298237,
"learning_rate": 1.669100468901722e-06,
"loss": 0.0308,
"step": 735
},
{
"epoch": 1.4661354581673307,
"grad_norm": 1.7311028494874292,
"learning_rate": 1.6574486084258369e-06,
"loss": 0.0447,
"step": 736
},
{
"epoch": 1.4681274900398407,
"grad_norm": 1.7747156927182413,
"learning_rate": 1.6458294752409943e-06,
"loss": 0.0423,
"step": 737
},
{
"epoch": 1.4701195219123506,
"grad_norm": 1.765615529033668,
"learning_rate": 1.6342431831114153e-06,
"loss": 0.0419,
"step": 738
},
{
"epoch": 1.4721115537848606,
"grad_norm": 1.588016885038511,
"learning_rate": 1.6226898454797697e-06,
"loss": 0.0437,
"step": 739
},
{
"epoch": 1.4741035856573705,
"grad_norm": 2.3682981845889954,
"learning_rate": 1.6111695754660667e-06,
"loss": 0.0548,
"step": 740
},
{
"epoch": 1.4760956175298805,
"grad_norm": 2.196815477790404,
"learning_rate": 1.599682485866546e-06,
"loss": 0.0471,
"step": 741
},
{
"epoch": 1.4780876494023905,
"grad_norm": 1.3771418350242575,
"learning_rate": 1.5882286891525755e-06,
"loss": 0.0428,
"step": 742
},
{
"epoch": 1.4800796812749004,
"grad_norm": 1.4903254185233181,
"learning_rate": 1.5768082974695476e-06,
"loss": 0.0404,
"step": 743
},
{
"epoch": 1.4820717131474104,
"grad_norm": 1.4985158317383274,
"learning_rate": 1.5654214226357822e-06,
"loss": 0.0329,
"step": 744
},
{
"epoch": 1.4840637450199203,
"grad_norm": 1.7223869396300333,
"learning_rate": 1.5540681761414327e-06,
"loss": 0.047,
"step": 745
},
{
"epoch": 1.4860557768924303,
"grad_norm": 1.570813419108133,
"learning_rate": 1.5427486691473942e-06,
"loss": 0.0383,
"step": 746
},
{
"epoch": 1.4880478087649402,
"grad_norm": 1.69930983396748,
"learning_rate": 1.5314630124842144e-06,
"loss": 0.0399,
"step": 747
},
{
"epoch": 1.4900398406374502,
"grad_norm": 1.7421371319863044,
"learning_rate": 1.5202113166510058e-06,
"loss": 0.0481,
"step": 748
},
{
"epoch": 1.4920318725099602,
"grad_norm": 1.5181018716448114,
"learning_rate": 1.5089936918143705e-06,
"loss": 0.0349,
"step": 749
},
{
"epoch": 1.4940239043824701,
"grad_norm": 1.5458296783975771,
"learning_rate": 1.4978102478073165e-06,
"loss": 0.0469,
"step": 750
},
{
"epoch": 1.49601593625498,
"grad_norm": 1.5770259773948998,
"learning_rate": 1.4866610941281823e-06,
"loss": 0.0325,
"step": 751
},
{
"epoch": 1.49800796812749,
"grad_norm": 1.5341462678859672,
"learning_rate": 1.475546339939568e-06,
"loss": 0.0397,
"step": 752
},
{
"epoch": 1.5,
"grad_norm": 1.2027634510806997,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.028,
"step": 753
},
{
"epoch": 1.50199203187251,
"grad_norm": 1.5111370601251886,
"learning_rate": 1.4534204649991817e-06,
"loss": 0.0386,
"step": 754
},
{
"epoch": 1.50398406374502,
"grad_norm": 1.482657437759512,
"learning_rate": 1.4424095608843036e-06,
"loss": 0.036,
"step": 755
},
{
"epoch": 1.5059760956175299,
"grad_norm": 1.077070456287382,
"learning_rate": 1.4314334895316095e-06,
"loss": 0.0334,
"step": 756
},
{
"epoch": 1.5079681274900398,
"grad_norm": 1.4629517388254307,
"learning_rate": 1.4204923584090314e-06,
"loss": 0.036,
"step": 757
},
{
"epoch": 1.5099601593625498,
"grad_norm": 1.3805538263025956,
"learning_rate": 1.4095862746423961e-06,
"loss": 0.0356,
"step": 758
},
{
"epoch": 1.5119521912350598,
"grad_norm": 1.8358312305710611,
"learning_rate": 1.3987153450143775e-06,
"loss": 0.0465,
"step": 759
},
{
"epoch": 1.5139442231075697,
"grad_norm": 1.1684136646906567,
"learning_rate": 1.3878796759634544e-06,
"loss": 0.0332,
"step": 760
},
{
"epoch": 1.5159362549800797,
"grad_norm": 1.2688724525634016,
"learning_rate": 1.3770793735828603e-06,
"loss": 0.0346,
"step": 761
},
{
"epoch": 1.5179282868525896,
"grad_norm": 1.8410552958221407,
"learning_rate": 1.366314543619553e-06,
"loss": 0.0464,
"step": 762
},
{
"epoch": 1.5199203187250996,
"grad_norm": 1.6641830537225055,
"learning_rate": 1.355585291473176e-06,
"loss": 0.0402,
"step": 763
},
{
"epoch": 1.5219123505976095,
"grad_norm": 1.4605320398418256,
"learning_rate": 1.3448917221950264e-06,
"loss": 0.0386,
"step": 764
},
{
"epoch": 1.5239043824701195,
"grad_norm": 1.628381572331683,
"learning_rate": 1.3342339404870253e-06,
"loss": 0.0401,
"step": 765
},
{
"epoch": 1.5258964143426295,
"grad_norm": 2.3659907711059467,
"learning_rate": 1.3236120507006945e-06,
"loss": 0.0553,
"step": 766
},
{
"epoch": 1.5278884462151394,
"grad_norm": 2.019268389843473,
"learning_rate": 1.3130261568361335e-06,
"loss": 0.0456,
"step": 767
},
{
"epoch": 1.5298804780876494,
"grad_norm": 1.903479551165333,
"learning_rate": 1.3024763625410025e-06,
"loss": 0.0452,
"step": 768
},
{
"epoch": 1.5318725099601593,
"grad_norm": 1.6250609527356852,
"learning_rate": 1.2919627711095068e-06,
"loss": 0.0362,
"step": 769
},
{
"epoch": 1.5338645418326693,
"grad_norm": 1.2317333249828415,
"learning_rate": 1.281485485481384e-06,
"loss": 0.0304,
"step": 770
},
{
"epoch": 1.5358565737051793,
"grad_norm": 1.8649886760609837,
"learning_rate": 1.2710446082408996e-06,
"loss": 0.0464,
"step": 771
},
{
"epoch": 1.5378486055776892,
"grad_norm": 1.8171004964117743,
"learning_rate": 1.2606402416158391e-06,
"loss": 0.0465,
"step": 772
},
{
"epoch": 1.5398406374501992,
"grad_norm": 1.862322448579373,
"learning_rate": 1.2502724874765087e-06,
"loss": 0.0479,
"step": 773
},
{
"epoch": 1.5418326693227091,
"grad_norm": 1.8583353335497257,
"learning_rate": 1.2399414473347405e-06,
"loss": 0.0424,
"step": 774
},
{
"epoch": 1.543824701195219,
"grad_norm": 1.4852985609369138,
"learning_rate": 1.229647222342889e-06,
"loss": 0.037,
"step": 775
},
{
"epoch": 1.545816733067729,
"grad_norm": 1.701665909805954,
"learning_rate": 1.2193899132928539e-06,
"loss": 0.0435,
"step": 776
},
{
"epoch": 1.547808764940239,
"grad_norm": 1.5368339751180022,
"learning_rate": 1.2091696206150843e-06,
"loss": 0.0405,
"step": 777
},
{
"epoch": 1.549800796812749,
"grad_norm": 1.4211500123116818,
"learning_rate": 1.1989864443775984e-06,
"loss": 0.0355,
"step": 778
},
{
"epoch": 1.551792828685259,
"grad_norm": 2.0587268481909646,
"learning_rate": 1.1888404842850031e-06,
"loss": 0.0555,
"step": 779
},
{
"epoch": 1.5537848605577689,
"grad_norm": 1.2088357311119127,
"learning_rate": 1.1787318396775188e-06,
"loss": 0.0348,
"step": 780
},
{
"epoch": 1.5557768924302788,
"grad_norm": 1.4388643186761485,
"learning_rate": 1.1686606095300034e-06,
"loss": 0.0344,
"step": 781
},
{
"epoch": 1.5577689243027888,
"grad_norm": 1.2409791290088654,
"learning_rate": 1.158626892450988e-06,
"loss": 0.0344,
"step": 782
},
{
"epoch": 1.5597609561752988,
"grad_norm": 1.6545764811493544,
"learning_rate": 1.1486307866817082e-06,
"loss": 0.0455,
"step": 783
},
{
"epoch": 1.5617529880478087,
"grad_norm": 1.8070275154016138,
"learning_rate": 1.138672390095143e-06,
"loss": 0.033,
"step": 784
},
{
"epoch": 1.5637450199203187,
"grad_norm": 1.6600852203195762,
"learning_rate": 1.128751800195057e-06,
"loss": 0.0467,
"step": 785
},
{
"epoch": 1.5657370517928286,
"grad_norm": 1.4417059533847947,
"learning_rate": 1.1188691141150455e-06,
"loss": 0.0383,
"step": 786
},
{
"epoch": 1.5677290836653386,
"grad_norm": 1.2631911364466095,
"learning_rate": 1.1090244286175834e-06,
"loss": 0.0262,
"step": 787
},
{
"epoch": 1.5697211155378485,
"grad_norm": 1.7996066106500415,
"learning_rate": 1.0992178400930753e-06,
"loss": 0.0428,
"step": 788
},
{
"epoch": 1.5717131474103585,
"grad_norm": 1.661450747231149,
"learning_rate": 1.0894494445589171e-06,
"loss": 0.0475,
"step": 789
},
{
"epoch": 1.5737051792828685,
"grad_norm": 1.9431205504964344,
"learning_rate": 1.0797193376585518e-06,
"loss": 0.0451,
"step": 790
},
{
"epoch": 1.5756972111553784,
"grad_norm": 1.3821996487700146,
"learning_rate": 1.0700276146605349e-06,
"loss": 0.0385,
"step": 791
},
{
"epoch": 1.5776892430278884,
"grad_norm": 1.4713583961609482,
"learning_rate": 1.0603743704575992e-06,
"loss": 0.0402,
"step": 792
},
{
"epoch": 1.5796812749003983,
"grad_norm": 1.396562745448481,
"learning_rate": 1.0507596995657288e-06,
"loss": 0.0407,
"step": 793
},
{
"epoch": 1.5816733067729083,
"grad_norm": 1.584255914198357,
"learning_rate": 1.0411836961232312e-06,
"loss": 0.0362,
"step": 794
},
{
"epoch": 1.5836653386454183,
"grad_norm": 1.2525048404082542,
"learning_rate": 1.031646453889818e-06,
"loss": 0.0375,
"step": 795
},
{
"epoch": 1.5856573705179282,
"grad_norm": 1.692525265149603,
"learning_rate": 1.0221480662456845e-06,
"loss": 0.0451,
"step": 796
},
{
"epoch": 1.5876494023904382,
"grad_norm": 1.5918519161268234,
"learning_rate": 1.012688626190596e-06,
"loss": 0.0479,
"step": 797
},
{
"epoch": 1.5896414342629481,
"grad_norm": 1.4243746693979769,
"learning_rate": 1.0032682263429788e-06,
"loss": 0.0331,
"step": 798
},
{
"epoch": 1.591633466135458,
"grad_norm": 1.926086405446464,
"learning_rate": 9.93886958939011e-07,
"loss": 0.0427,
"step": 799
},
{
"epoch": 1.593625498007968,
"grad_norm": 1.4477256907812834,
"learning_rate": 9.845449158317216e-07,
"loss": 0.0359,
"step": 800
},
{
"epoch": 1.593625498007968,
"eval_loss": 0.10596468299627304,
"eval_runtime": 3.1343,
"eval_samples_per_second": 13.081,
"eval_steps_per_second": 3.51,
"step": 800
},
{
"epoch": 1.595617529880478,
"grad_norm": 1.421779703061797,
"learning_rate": 9.752421884900915e-07,
"loss": 0.0305,
"step": 801
},
{
"epoch": 1.597609561752988,
"grad_norm": 1.7789603558111702,
"learning_rate": 9.65978867998152e-07,
"loss": 0.0345,
"step": 802
},
{
"epoch": 1.599601593625498,
"grad_norm": 1.9525946533919227,
"learning_rate": 9.567550450541012e-07,
"loss": 0.0536,
"step": 803
},
{
"epoch": 1.6015936254980079,
"grad_norm": 1.4805860459441311,
"learning_rate": 9.475708099694125e-07,
"loss": 0.0425,
"step": 804
},
{
"epoch": 1.6035856573705178,
"grad_norm": 1.6320592012302073,
"learning_rate": 9.384262526679488e-07,
"loss": 0.0385,
"step": 805
},
{
"epoch": 1.6055776892430278,
"grad_norm": 1.407033676035743,
"learning_rate": 9.293214626850838e-07,
"loss": 0.0398,
"step": 806
},
{
"epoch": 1.6075697211155378,
"grad_norm": 1.1535555756252225,
"learning_rate": 9.202565291668253e-07,
"loss": 0.0314,
"step": 807
},
{
"epoch": 1.6095617529880477,
"grad_norm": 1.4977437784858982,
"learning_rate": 9.112315408689415e-07,
"loss": 0.0374,
"step": 808
},
{
"epoch": 1.6115537848605577,
"grad_norm": 1.732572156525876,
"learning_rate": 9.022465861560931e-07,
"loss": 0.0446,
"step": 809
},
{
"epoch": 1.6135458167330676,
"grad_norm": 1.6891276330650864,
"learning_rate": 8.933017530009669e-07,
"loss": 0.0383,
"step": 810
},
{
"epoch": 1.6155378486055776,
"grad_norm": 1.897852297890652,
"learning_rate": 8.843971289834157e-07,
"loss": 0.0441,
"step": 811
},
{
"epoch": 1.6175298804780875,
"grad_norm": 1.5594431631506627,
"learning_rate": 8.755328012896002e-07,
"loss": 0.0405,
"step": 812
},
{
"epoch": 1.6195219123505975,
"grad_norm": 1.436757522414069,
"learning_rate": 8.667088567111348e-07,
"loss": 0.0396,
"step": 813
},
{
"epoch": 1.6215139442231075,
"grad_norm": 1.9484255207208512,
"learning_rate": 8.579253816442401e-07,
"loss": 0.0516,
"step": 814
},
{
"epoch": 1.6235059760956174,
"grad_norm": 1.380709884926527,
"learning_rate": 8.491824620888906e-07,
"loss": 0.0356,
"step": 815
},
{
"epoch": 1.6254980079681274,
"grad_norm": 1.7133470278748053,
"learning_rate": 8.404801836479809e-07,
"loss": 0.0434,
"step": 816
},
{
"epoch": 1.6274900398406373,
"grad_norm": 1.583809883914204,
"learning_rate": 8.318186315264859e-07,
"loss": 0.0394,
"step": 817
},
{
"epoch": 1.6294820717131473,
"grad_norm": 1.468174216651124,
"learning_rate": 8.231978905306204e-07,
"loss": 0.0268,
"step": 818
},
{
"epoch": 1.6314741035856573,
"grad_norm": 1.4408279947153464,
"learning_rate": 8.146180450670155e-07,
"loss": 0.0313,
"step": 819
},
{
"epoch": 1.6334661354581672,
"grad_norm": 1.5748697433294543,
"learning_rate": 8.060791791418887e-07,
"loss": 0.0363,
"step": 820
},
{
"epoch": 1.6354581673306772,
"grad_norm": 1.6393304512543407,
"learning_rate": 7.975813763602219e-07,
"loss": 0.0325,
"step": 821
},
{
"epoch": 1.6374501992031871,
"grad_norm": 2.1412321953511935,
"learning_rate": 7.891247199249441e-07,
"loss": 0.0532,
"step": 822
},
{
"epoch": 1.639442231075697,
"grad_norm": 1.3138050863993294,
"learning_rate": 7.807092926361154e-07,
"loss": 0.0331,
"step": 823
},
{
"epoch": 1.641434262948207,
"grad_norm": 1.5541240912252534,
"learning_rate": 7.723351768901172e-07,
"loss": 0.0426,
"step": 824
},
{
"epoch": 1.643426294820717,
"grad_norm": 1.406024578499524,
"learning_rate": 7.640024546788449e-07,
"loss": 0.0342,
"step": 825
},
{
"epoch": 1.645418326693227,
"grad_norm": 1.2539556880610443,
"learning_rate": 7.557112075889034e-07,
"loss": 0.026,
"step": 826
},
{
"epoch": 1.647410358565737,
"grad_norm": 1.394792639284168,
"learning_rate": 7.474615168008126e-07,
"loss": 0.0437,
"step": 827
},
{
"epoch": 1.6494023904382469,
"grad_norm": 1.2380166612525048,
"learning_rate": 7.392534630882092e-07,
"loss": 0.0308,
"step": 828
},
{
"epoch": 1.6513944223107568,
"grad_norm": 1.6208908227480252,
"learning_rate": 7.310871268170566e-07,
"loss": 0.0391,
"step": 829
},
{
"epoch": 1.6533864541832668,
"grad_norm": 1.3790210574012849,
"learning_rate": 7.229625879448577e-07,
"loss": 0.0341,
"step": 830
},
{
"epoch": 1.6553784860557768,
"grad_norm": 1.3562415289883447,
"learning_rate": 7.148799260198736e-07,
"loss": 0.0323,
"step": 831
},
{
"epoch": 1.6573705179282867,
"grad_norm": 1.4745348978618424,
"learning_rate": 7.06839220180342e-07,
"loss": 0.0394,
"step": 832
},
{
"epoch": 1.6593625498007967,
"grad_norm": 1.8763390835724343,
"learning_rate": 6.988405491537054e-07,
"loss": 0.0403,
"step": 833
},
{
"epoch": 1.6613545816733066,
"grad_norm": 1.5395306915776477,
"learning_rate": 6.908839912558374e-07,
"loss": 0.0375,
"step": 834
},
{
"epoch": 1.6633466135458166,
"grad_norm": 1.6645400590228017,
"learning_rate": 6.829696243902784e-07,
"loss": 0.0448,
"step": 835
},
{
"epoch": 1.6653386454183265,
"grad_norm": 1.8788295421219408,
"learning_rate": 6.750975260474718e-07,
"loss": 0.0425,
"step": 836
},
{
"epoch": 1.6673306772908365,
"grad_norm": 1.9902864466850982,
"learning_rate": 6.67267773304004e-07,
"loss": 0.048,
"step": 837
},
{
"epoch": 1.6693227091633465,
"grad_norm": 1.0800153432067903,
"learning_rate": 6.594804428218527e-07,
"loss": 0.0327,
"step": 838
},
{
"epoch": 1.6713147410358564,
"grad_norm": 1.500062093936282,
"learning_rate": 6.517356108476314e-07,
"loss": 0.0366,
"step": 839
},
{
"epoch": 1.6733067729083664,
"grad_norm": 1.9099325765908919,
"learning_rate": 6.440333532118503e-07,
"loss": 0.0432,
"step": 840
},
{
"epoch": 1.6752988047808763,
"grad_norm": 1.16559644327572,
"learning_rate": 6.36373745328166e-07,
"loss": 0.0258,
"step": 841
},
{
"epoch": 1.6772908366533863,
"grad_norm": 1.3024121385631138,
"learning_rate": 6.287568621926482e-07,
"loss": 0.0329,
"step": 842
},
{
"epoch": 1.6792828685258963,
"grad_norm": 1.7905458621338421,
"learning_rate": 6.211827783830443e-07,
"loss": 0.0388,
"step": 843
},
{
"epoch": 1.6812749003984062,
"grad_norm": 1.507626826133329,
"learning_rate": 6.136515680580479e-07,
"loss": 0.0365,
"step": 844
},
{
"epoch": 1.6832669322709162,
"grad_norm": 1.5818672866222618,
"learning_rate": 6.061633049565735e-07,
"loss": 0.0427,
"step": 845
},
{
"epoch": 1.6852589641434261,
"grad_norm": 1.7500474110140714,
"learning_rate": 5.987180623970351e-07,
"loss": 0.0419,
"step": 846
},
{
"epoch": 1.687250996015936,
"grad_norm": 1.6005728580669865,
"learning_rate": 5.913159132766272e-07,
"loss": 0.0386,
"step": 847
},
{
"epoch": 1.6892430278884463,
"grad_norm": 1.4765492940539946,
"learning_rate": 5.839569300706127e-07,
"loss": 0.035,
"step": 848
},
{
"epoch": 1.6912350597609562,
"grad_norm": 1.7073641068426852,
"learning_rate": 5.766411848316111e-07,
"loss": 0.0453,
"step": 849
},
{
"epoch": 1.6932270916334662,
"grad_norm": 1.3266813195045528,
"learning_rate": 5.693687491888944e-07,
"loss": 0.0269,
"step": 850
},
{
"epoch": 1.6952191235059761,
"grad_norm": 1.7375537306550803,
"learning_rate": 5.621396943476865e-07,
"loss": 0.0371,
"step": 851
},
{
"epoch": 1.697211155378486,
"grad_norm": 1.16829582732014,
"learning_rate": 5.549540910884649e-07,
"loss": 0.0292,
"step": 852
},
{
"epoch": 1.699203187250996,
"grad_norm": 1.2524818853083746,
"learning_rate": 5.478120097662654e-07,
"loss": 0.0379,
"step": 853
},
{
"epoch": 1.701195219123506,
"grad_norm": 1.6095889326273223,
"learning_rate": 5.407135203099984e-07,
"loss": 0.0348,
"step": 854
},
{
"epoch": 1.703187250996016,
"grad_norm": 1.7855589434412364,
"learning_rate": 5.336586922217607e-07,
"loss": 0.0426,
"step": 855
},
{
"epoch": 1.705179282868526,
"grad_norm": 1.3350483084942124,
"learning_rate": 5.266475945761562e-07,
"loss": 0.0324,
"step": 856
},
{
"epoch": 1.707171314741036,
"grad_norm": 1.9650070108762112,
"learning_rate": 5.19680296019619e-07,
"loss": 0.0422,
"step": 857
},
{
"epoch": 1.7091633466135459,
"grad_norm": 1.5395346984385803,
"learning_rate": 5.127568647697407e-07,
"loss": 0.0389,
"step": 858
},
{
"epoch": 1.7111553784860558,
"grad_norm": 1.4407452070494002,
"learning_rate": 5.05877368614604e-07,
"loss": 0.0336,
"step": 859
},
{
"epoch": 1.7131474103585658,
"grad_norm": 1.7638225778002696,
"learning_rate": 4.990418749121179e-07,
"loss": 0.0386,
"step": 860
},
{
"epoch": 1.7151394422310757,
"grad_norm": 1.3281767084541467,
"learning_rate": 4.922504505893583e-07,
"loss": 0.0292,
"step": 861
},
{
"epoch": 1.7171314741035857,
"grad_norm": 1.626746762497501,
"learning_rate": 4.855031621419143e-07,
"loss": 0.033,
"step": 862
},
{
"epoch": 1.7191235059760956,
"grad_norm": 1.432244514359786,
"learning_rate": 4.788000756332339e-07,
"loss": 0.0342,
"step": 863
},
{
"epoch": 1.7211155378486056,
"grad_norm": 1.355048293376289,
"learning_rate": 4.721412566939804e-07,
"loss": 0.0339,
"step": 864
},
{
"epoch": 1.7231075697211156,
"grad_norm": 1.8150166104991199,
"learning_rate": 4.655267705213884e-07,
"loss": 0.0332,
"step": 865
},
{
"epoch": 1.7250996015936255,
"grad_norm": 1.3905664903519097,
"learning_rate": 4.5895668187862283e-07,
"loss": 0.0355,
"step": 866
},
{
"epoch": 1.7270916334661355,
"grad_norm": 1.4892701437662028,
"learning_rate": 4.524310550941513e-07,
"loss": 0.0389,
"step": 867
},
{
"epoch": 1.7290836653386454,
"grad_norm": 1.4991400329744464,
"learning_rate": 4.4594995406110785e-07,
"loss": 0.0288,
"step": 868
},
{
"epoch": 1.7310756972111554,
"grad_norm": 2.1763244133944717,
"learning_rate": 4.395134422366715e-07,
"loss": 0.0597,
"step": 869
},
{
"epoch": 1.7330677290836654,
"grad_norm": 1.4364589154779435,
"learning_rate": 4.33121582641442e-07,
"loss": 0.029,
"step": 870
},
{
"epoch": 1.7350597609561753,
"grad_norm": 1.4528953350538367,
"learning_rate": 4.2677443785882566e-07,
"loss": 0.0299,
"step": 871
},
{
"epoch": 1.7370517928286853,
"grad_norm": 1.9612964941834787,
"learning_rate": 4.2047207003442003e-07,
"loss": 0.0395,
"step": 872
},
{
"epoch": 1.7390438247011952,
"grad_norm": 2.167595941204298,
"learning_rate": 4.142145408754061e-07,
"loss": 0.0443,
"step": 873
},
{
"epoch": 1.7410358565737052,
"grad_norm": 1.91106994896413,
"learning_rate": 4.0800191164994675e-07,
"loss": 0.0389,
"step": 874
},
{
"epoch": 1.7430278884462151,
"grad_norm": 1.4525532274137143,
"learning_rate": 4.018342431865818e-07,
"loss": 0.032,
"step": 875
},
{
"epoch": 1.745019920318725,
"grad_norm": 1.6060381855232735,
"learning_rate": 3.957115958736374e-07,
"loss": 0.0425,
"step": 876
},
{
"epoch": 1.747011952191235,
"grad_norm": 1.3887626940467759,
"learning_rate": 3.8963402965863094e-07,
"loss": 0.0397,
"step": 877
},
{
"epoch": 1.749003984063745,
"grad_norm": 1.372684240605827,
"learning_rate": 3.8360160404768755e-07,
"loss": 0.0391,
"step": 878
},
{
"epoch": 1.750996015936255,
"grad_norm": 1.3181667404064525,
"learning_rate": 3.7761437810495517e-07,
"loss": 0.0335,
"step": 879
},
{
"epoch": 1.752988047808765,
"grad_norm": 1.6789671311971048,
"learning_rate": 3.7167241045202474e-07,
"loss": 0.0407,
"step": 880
},
{
"epoch": 1.754980079681275,
"grad_norm": 1.3469532000945257,
"learning_rate": 3.657757592673611e-07,
"loss": 0.0274,
"step": 881
},
{
"epoch": 1.7569721115537849,
"grad_norm": 1.4431142582457726,
"learning_rate": 3.5992448228572895e-07,
"loss": 0.0276,
"step": 882
},
{
"epoch": 1.7589641434262948,
"grad_norm": 1.2459499701396597,
"learning_rate": 3.5411863679762956e-07,
"loss": 0.0313,
"step": 883
},
{
"epoch": 1.7609561752988048,
"grad_norm": 1.8366872404033463,
"learning_rate": 3.483582796487395e-07,
"loss": 0.0434,
"step": 884
},
{
"epoch": 1.7629482071713147,
"grad_norm": 1.6841292031312827,
"learning_rate": 3.426434672393542e-07,
"loss": 0.0375,
"step": 885
},
{
"epoch": 1.7649402390438247,
"grad_norm": 1.696643515004684,
"learning_rate": 3.3697425552383536e-07,
"loss": 0.0421,
"step": 886
},
{
"epoch": 1.7669322709163346,
"grad_norm": 1.316459868698009,
"learning_rate": 3.3135070001006186e-07,
"loss": 0.033,
"step": 887
},
{
"epoch": 1.7689243027888446,
"grad_norm": 1.9166835989066684,
"learning_rate": 3.257728557588902e-07,
"loss": 0.0438,
"step": 888
},
{
"epoch": 1.7709163346613546,
"grad_norm": 1.6334683496335292,
"learning_rate": 3.202407773836108e-07,
"loss": 0.0356,
"step": 889
},
{
"epoch": 1.7729083665338645,
"grad_norm": 1.7333830974632702,
"learning_rate": 3.1475451904941613e-07,
"loss": 0.0392,
"step": 890
},
{
"epoch": 1.7749003984063745,
"grad_norm": 1.6486004340046463,
"learning_rate": 3.093141344728695e-07,
"loss": 0.0429,
"step": 891
},
{
"epoch": 1.7768924302788844,
"grad_norm": 1.7192233169908389,
"learning_rate": 3.039196769213787e-07,
"loss": 0.0334,
"step": 892
},
{
"epoch": 1.7788844621513944,
"grad_norm": 1.4986249160852925,
"learning_rate": 2.985711992126772e-07,
"loss": 0.0332,
"step": 893
},
{
"epoch": 1.7808764940239044,
"grad_norm": 1.391955958968789,
"learning_rate": 2.932687537143003e-07,
"loss": 0.0395,
"step": 894
},
{
"epoch": 1.7828685258964143,
"grad_norm": 1.5022523308748341,
"learning_rate": 2.8801239234308e-07,
"loss": 0.0328,
"step": 895
},
{
"epoch": 1.7848605577689243,
"grad_norm": 1.5896216675986345,
"learning_rate": 2.828021665646341e-07,
"loss": 0.0395,
"step": 896
},
{
"epoch": 1.7868525896414342,
"grad_norm": 1.5943964105881356,
"learning_rate": 2.776381273928597e-07,
"loss": 0.0335,
"step": 897
},
{
"epoch": 1.7888446215139442,
"grad_norm": 1.4573130385904005,
"learning_rate": 2.725203253894365e-07,
"loss": 0.0328,
"step": 898
},
{
"epoch": 1.7908366533864541,
"grad_norm": 1.5957847319376306,
"learning_rate": 2.6744881066333104e-07,
"loss": 0.0302,
"step": 899
},
{
"epoch": 1.792828685258964,
"grad_norm": 1.5938499333127716,
"learning_rate": 2.6242363287030617e-07,
"loss": 0.0334,
"step": 900
},
{
"epoch": 1.794820717131474,
"grad_norm": 1.493471769426448,
"learning_rate": 2.5744484121243416e-07,
"loss": 0.0376,
"step": 901
},
{
"epoch": 1.796812749003984,
"grad_norm": 1.4679887246269543,
"learning_rate": 2.5251248443761644e-07,
"loss": 0.0308,
"step": 902
},
{
"epoch": 1.798804780876494,
"grad_norm": 1.4811334241506124,
"learning_rate": 2.47626610839104e-07,
"loss": 0.0461,
"step": 903
},
{
"epoch": 1.800796812749004,
"grad_norm": 1.4378347029531395,
"learning_rate": 2.4278726825502696e-07,
"loss": 0.0308,
"step": 904
},
{
"epoch": 1.802788844621514,
"grad_norm": 1.3874168126207334,
"learning_rate": 2.3799450406792435e-07,
"loss": 0.0383,
"step": 905
},
{
"epoch": 1.8047808764940239,
"grad_norm": 1.309299013582529,
"learning_rate": 2.3324836520428275e-07,
"loss": 0.0313,
"step": 906
},
{
"epoch": 1.8067729083665338,
"grad_norm": 1.724858228945609,
"learning_rate": 2.285488981340722e-07,
"loss": 0.0355,
"step": 907
},
{
"epoch": 1.8087649402390438,
"grad_norm": 1.696250952718899,
"learning_rate": 2.2389614887029564e-07,
"loss": 0.0314,
"step": 908
},
{
"epoch": 1.8107569721115537,
"grad_norm": 1.4712298958031025,
"learning_rate": 2.1929016296853679e-07,
"loss": 0.0371,
"step": 909
},
{
"epoch": 1.812749003984064,
"grad_norm": 1.2608615587029985,
"learning_rate": 2.147309855265145e-07,
"loss": 0.0313,
"step": 910
},
{
"epoch": 1.8147410358565739,
"grad_norm": 1.6486609933665899,
"learning_rate": 2.1021866118363987e-07,
"loss": 0.0372,
"step": 911
},
{
"epoch": 1.8167330677290838,
"grad_norm": 1.456648149760954,
"learning_rate": 2.0575323412058036e-07,
"loss": 0.036,
"step": 912
},
{
"epoch": 1.8187250996015938,
"grad_norm": 1.6643795113487712,
"learning_rate": 2.0133474805882735e-07,
"loss": 0.0357,
"step": 913
},
{
"epoch": 1.8207171314741037,
"grad_norm": 1.4592297656846365,
"learning_rate": 1.9696324626026774e-07,
"loss": 0.0343,
"step": 914
},
{
"epoch": 1.8227091633466137,
"grad_norm": 1.5024913962281268,
"learning_rate": 1.926387715267597e-07,
"loss": 0.0332,
"step": 915
},
{
"epoch": 1.8247011952191237,
"grad_norm": 1.5222331132128035,
"learning_rate": 1.8836136619971468e-07,
"loss": 0.0274,
"step": 916
},
{
"epoch": 1.8266932270916336,
"grad_norm": 1.470688430167571,
"learning_rate": 1.8413107215968174e-07,
"loss": 0.0358,
"step": 917
},
{
"epoch": 1.8286852589641436,
"grad_norm": 1.2370854006861078,
"learning_rate": 1.7994793082593942e-07,
"loss": 0.0294,
"step": 918
},
{
"epoch": 1.8306772908366535,
"grad_norm": 1.479773183899037,
"learning_rate": 1.7581198315608727e-07,
"loss": 0.0404,
"step": 919
},
{
"epoch": 1.8326693227091635,
"grad_norm": 1.9167324471302913,
"learning_rate": 1.7172326964564777e-07,
"loss": 0.0444,
"step": 920
},
{
"epoch": 1.8346613545816735,
"grad_norm": 1.4329590983432605,
"learning_rate": 1.6768183032766728e-07,
"loss": 0.0372,
"step": 921
},
{
"epoch": 1.8366533864541834,
"grad_norm": 1.2780231630551226,
"learning_rate": 1.6368770477232622e-07,
"loss": 0.0328,
"step": 922
},
{
"epoch": 1.8386454183266934,
"grad_norm": 1.4118812529232878,
"learning_rate": 1.597409320865506e-07,
"loss": 0.0415,
"step": 923
},
{
"epoch": 1.8406374501992033,
"grad_norm": 1.836114736553896,
"learning_rate": 1.5584155091362907e-07,
"loss": 0.0489,
"step": 924
},
{
"epoch": 1.8426294820717133,
"grad_norm": 1.3679200385396149,
"learning_rate": 1.5198959943283466e-07,
"loss": 0.0323,
"step": 925
},
{
"epoch": 1.8446215139442232,
"grad_norm": 1.5872343873759975,
"learning_rate": 1.4818511535905077e-07,
"loss": 0.0395,
"step": 926
},
{
"epoch": 1.8466135458167332,
"grad_norm": 1.4421140886448538,
"learning_rate": 1.444281359424038e-07,
"loss": 0.0315,
"step": 927
},
{
"epoch": 1.8486055776892432,
"grad_norm": 1.7604548086333676,
"learning_rate": 1.4071869796789427e-07,
"loss": 0.05,
"step": 928
},
{
"epoch": 1.8505976095617531,
"grad_norm": 1.4556017520765743,
"learning_rate": 1.3705683775504075e-07,
"loss": 0.0281,
"step": 929
},
{
"epoch": 1.852589641434263,
"grad_norm": 1.3271732449749567,
"learning_rate": 1.3344259115752268e-07,
"loss": 0.0324,
"step": 930
},
{
"epoch": 1.854581673306773,
"grad_norm": 1.2761857510150398,
"learning_rate": 1.2987599356282853e-07,
"loss": 0.0391,
"step": 931
},
{
"epoch": 1.856573705179283,
"grad_norm": 1.5833152792310408,
"learning_rate": 1.263570798919106e-07,
"loss": 0.0339,
"step": 932
},
{
"epoch": 1.858565737051793,
"grad_norm": 1.2639695829653441,
"learning_rate": 1.2288588459884344e-07,
"loss": 0.0345,
"step": 933
},
{
"epoch": 1.860557768924303,
"grad_norm": 1.3658369812613829,
"learning_rate": 1.1946244167048314e-07,
"loss": 0.0315,
"step": 934
},
{
"epoch": 1.8625498007968129,
"grad_norm": 1.4408384970498218,
"learning_rate": 1.1608678462613987e-07,
"loss": 0.0405,
"step": 935
},
{
"epoch": 1.8645418326693228,
"grad_norm": 1.8837289894616709,
"learning_rate": 1.1275894651724517e-07,
"loss": 0.0432,
"step": 936
},
{
"epoch": 1.8665338645418328,
"grad_norm": 1.5772528501243686,
"learning_rate": 1.0947895992703129e-07,
"loss": 0.034,
"step": 937
},
{
"epoch": 1.8685258964143427,
"grad_norm": 1.498705914254895,
"learning_rate": 1.062468569702102e-07,
"loss": 0.0348,
"step": 938
},
{
"epoch": 1.8705179282868527,
"grad_norm": 1.6273002546701671,
"learning_rate": 1.0306266929265951e-07,
"loss": 0.0444,
"step": 939
},
{
"epoch": 1.8725099601593627,
"grad_norm": 1.8115132687464455,
"learning_rate": 9.992642807111486e-08,
"loss": 0.045,
"step": 940
},
{
"epoch": 1.8745019920318726,
"grad_norm": 1.2331103677449673,
"learning_rate": 9.683816401286017e-08,
"loss": 0.0384,
"step": 941
},
{
"epoch": 1.8764940239043826,
"grad_norm": 1.719077840733274,
"learning_rate": 9.379790735543182e-08,
"loss": 0.0426,
"step": 942
},
{
"epoch": 1.8784860557768925,
"grad_norm": 1.3832648508966021,
"learning_rate": 9.080568786631939e-08,
"loss": 0.0305,
"step": 943
},
{
"epoch": 1.8804780876494025,
"grad_norm": 1.6655589837620999,
"learning_rate": 8.78615348426759e-08,
"loss": 0.0387,
"step": 944
},
{
"epoch": 1.8824701195219125,
"grad_norm": 1.3436243075109746,
"learning_rate": 8.49654771110292e-08,
"loss": 0.0336,
"step": 945
},
{
"epoch": 1.8844621513944224,
"grad_norm": 1.4941725985898642,
"learning_rate": 8.211754302700159e-08,
"loss": 0.035,
"step": 946
},
{
"epoch": 1.8864541832669324,
"grad_norm": 1.2284626460508044,
"learning_rate": 7.93177604750317e-08,
"loss": 0.0285,
"step": 947
},
{
"epoch": 1.8884462151394423,
"grad_norm": 1.225195965424142,
"learning_rate": 7.656615686809976e-08,
"loss": 0.0269,
"step": 948
},
{
"epoch": 1.8904382470119523,
"grad_norm": 1.3770985891500578,
"learning_rate": 7.386275914746222e-08,
"loss": 0.0324,
"step": 949
},
{
"epoch": 1.8924302788844622,
"grad_norm": 1.2826488338620294,
"learning_rate": 7.120759378238585e-08,
"loss": 0.032,
"step": 950
},
{
"epoch": 1.8944223107569722,
"grad_norm": 1.6006341483416981,
"learning_rate": 6.860068676988907e-08,
"loss": 0.0392,
"step": 951
},
{
"epoch": 1.8964143426294822,
"grad_norm": 1.4187750994999075,
"learning_rate": 6.604206363448662e-08,
"loss": 0.0321,
"step": 952
},
{
"epoch": 1.8984063745019921,
"grad_norm": 1.3893878947453275,
"learning_rate": 6.353174942794138e-08,
"loss": 0.0405,
"step": 953
},
{
"epoch": 1.900398406374502,
"grad_norm": 1.7017032311402343,
"learning_rate": 6.106976872901793e-08,
"loss": 0.0358,
"step": 954
},
{
"epoch": 1.902390438247012,
"grad_norm": 1.5607283128871816,
"learning_rate": 5.865614564324273e-08,
"loss": 0.0366,
"step": 955
},
{
"epoch": 1.904382470119522,
"grad_norm": 1.8745845801550434,
"learning_rate": 5.6290903802665444e-08,
"loss": 0.0493,
"step": 956
},
{
"epoch": 1.906374501992032,
"grad_norm": 1.8941896384592207,
"learning_rate": 5.397406636563296e-08,
"loss": 0.0482,
"step": 957
},
{
"epoch": 1.908366533864542,
"grad_norm": 1.7755712845597584,
"learning_rate": 5.1705656016555196e-08,
"loss": 0.0377,
"step": 958
},
{
"epoch": 1.9103585657370519,
"grad_norm": 1.5081972730474071,
"learning_rate": 4.948569496569078e-08,
"loss": 0.0424,
"step": 959
},
{
"epoch": 1.9123505976095618,
"grad_norm": 1.5670265082331885,
"learning_rate": 4.7314204948923356e-08,
"loss": 0.0352,
"step": 960
},
{
"epoch": 1.9143426294820718,
"grad_norm": 1.5649598609019262,
"learning_rate": 4.5191207227553437e-08,
"loss": 0.0325,
"step": 961
},
{
"epoch": 1.9163346613545817,
"grad_norm": 1.2941707907708186,
"learning_rate": 4.311672258808575e-08,
"loss": 0.0282,
"step": 962
},
{
"epoch": 1.9183266932270917,
"grad_norm": 1.5926908997623397,
"learning_rate": 4.109077134202999e-08,
"loss": 0.0338,
"step": 963
},
{
"epoch": 1.9203187250996017,
"grad_norm": 1.6503076572985267,
"learning_rate": 3.911337332569876e-08,
"loss": 0.0406,
"step": 964
},
{
"epoch": 1.9223107569721116,
"grad_norm": 1.7379935919621892,
"learning_rate": 3.718454790001546e-08,
"loss": 0.0402,
"step": 965
},
{
"epoch": 1.9243027888446216,
"grad_norm": 1.2573758118317542,
"learning_rate": 3.530431395032396e-08,
"loss": 0.0308,
"step": 966
},
{
"epoch": 1.9262948207171315,
"grad_norm": 1.8088122825079003,
"learning_rate": 3.347268988620256e-08,
"loss": 0.0416,
"step": 967
},
{
"epoch": 1.9282868525896415,
"grad_norm": 1.5128035830310647,
"learning_rate": 3.168969364128527e-08,
"loss": 0.0314,
"step": 968
},
{
"epoch": 1.9302788844621515,
"grad_norm": 1.5006247617840172,
"learning_rate": 2.995534267308697e-08,
"loss": 0.0336,
"step": 969
},
{
"epoch": 1.9322709163346614,
"grad_norm": 1.4914522988842465,
"learning_rate": 2.8269653962829104e-08,
"loss": 0.0391,
"step": 970
},
{
"epoch": 1.9342629482071714,
"grad_norm": 1.795362403964496,
"learning_rate": 2.6632644015276987e-08,
"loss": 0.0516,
"step": 971
},
{
"epoch": 1.9362549800796813,
"grad_norm": 1.516180508821745,
"learning_rate": 2.5044328858576105e-08,
"loss": 0.0379,
"step": 972
},
{
"epoch": 1.9382470119521913,
"grad_norm": 2.0020652857017813,
"learning_rate": 2.3504724044097206e-08,
"loss": 0.0469,
"step": 973
},
{
"epoch": 1.9402390438247012,
"grad_norm": 1.5598747529026058,
"learning_rate": 2.2013844646280313e-08,
"loss": 0.0396,
"step": 974
},
{
"epoch": 1.9422310756972112,
"grad_norm": 1.6142472802956638,
"learning_rate": 2.057170526249097e-08,
"loss": 0.0404,
"step": 975
},
{
"epoch": 1.9442231075697212,
"grad_norm": 1.2019709175359712,
"learning_rate": 1.917832001287645e-08,
"loss": 0.0287,
"step": 976
},
{
"epoch": 1.9462151394422311,
"grad_norm": 1.251311639854653,
"learning_rate": 1.783370254022587e-08,
"loss": 0.0376,
"step": 977
},
{
"epoch": 1.948207171314741,
"grad_norm": 1.6937487382902727,
"learning_rate": 1.6537866009837533e-08,
"loss": 0.0487,
"step": 978
},
{
"epoch": 1.950199203187251,
"grad_norm": 1.9336485273597244,
"learning_rate": 1.5290823109390673e-08,
"loss": 0.0436,
"step": 979
},
{
"epoch": 1.952191235059761,
"grad_norm": 1.2972478534447982,
"learning_rate": 1.4092586048820578e-08,
"loss": 0.0355,
"step": 980
},
{
"epoch": 1.954183266932271,
"grad_norm": 1.5530751540422953,
"learning_rate": 1.2943166560199228e-08,
"loss": 0.0328,
"step": 981
},
{
"epoch": 1.956175298804781,
"grad_norm": 1.5748922448723512,
"learning_rate": 1.1842575897619835e-08,
"loss": 0.0325,
"step": 982
},
{
"epoch": 1.9581673306772909,
"grad_norm": 1.4683412245601868,
"learning_rate": 1.0790824837088043e-08,
"loss": 0.0305,
"step": 983
},
{
"epoch": 1.9601593625498008,
"grad_norm": 1.704418853645297,
"learning_rate": 9.787923676414235e-09,
"loss": 0.043,
"step": 984
},
{
"epoch": 1.9621513944223108,
"grad_norm": 1.2313086946224918,
"learning_rate": 8.833882235115277e-09,
"loss": 0.041,
"step": 985
},
{
"epoch": 1.9641434262948207,
"grad_norm": 1.5411732148234083,
"learning_rate": 7.928709854316818e-09,
"loss": 0.0319,
"step": 986
},
{
"epoch": 1.9661354581673307,
"grad_norm": 1.803560123914151,
"learning_rate": 7.072415396661703e-09,
"loss": 0.0405,
"step": 987
},
{
"epoch": 1.9681274900398407,
"grad_norm": 1.4510860737042486,
"learning_rate": 6.265007246223365e-09,
"loss": 0.0348,
"step": 988
},
{
"epoch": 1.9701195219123506,
"grad_norm": 1.508510166730464,
"learning_rate": 5.506493308425342e-09,
"loss": 0.0334,
"step": 989
},
{
"epoch": 1.9721115537848606,
"grad_norm": 1.7329289625239936,
"learning_rate": 4.796881009961341e-09,
"loss": 0.0427,
"step": 990
},
{
"epoch": 1.9741035856573705,
"grad_norm": 1.6711540109669145,
"learning_rate": 4.136177298724176e-09,
"loss": 0.0321,
"step": 991
},
{
"epoch": 1.9760956175298805,
"grad_norm": 1.5519285892825812,
"learning_rate": 3.524388643736387e-09,
"loss": 0.0337,
"step": 992
},
{
"epoch": 1.9780876494023905,
"grad_norm": 1.5062338202236834,
"learning_rate": 2.9615210350891764e-09,
"loss": 0.0342,
"step": 993
},
{
"epoch": 1.9800796812749004,
"grad_norm": 1.4830859961400638,
"learning_rate": 2.447579983881343e-09,
"loss": 0.0316,
"step": 994
},
{
"epoch": 1.9820717131474104,
"grad_norm": 1.723742367527845,
"learning_rate": 1.9825705221665493e-09,
"loss": 0.0375,
"step": 995
},
{
"epoch": 1.9840637450199203,
"grad_norm": 1.41394511694927,
"learning_rate": 1.566497202904471e-09,
"loss": 0.0296,
"step": 996
},
{
"epoch": 1.9860557768924303,
"grad_norm": 1.3088020276395265,
"learning_rate": 1.1993640999147238e-09,
"loss": 0.0267,
"step": 997
},
{
"epoch": 1.9880478087649402,
"grad_norm": 1.6100561146810857,
"learning_rate": 8.811748078385584e-10,
"loss": 0.0358,
"step": 998
},
{
"epoch": 1.9900398406374502,
"grad_norm": 1.535784061618093,
"learning_rate": 6.119324421016704e-10,
"loss": 0.035,
"step": 999
},
{
"epoch": 1.9920318725099602,
"grad_norm": 1.4352594424209704,
"learning_rate": 3.916396388869981e-10,
"loss": 0.0378,
"step": 1000
},
{
"epoch": 1.9920318725099602,
"eval_loss": 0.10359270870685577,
"eval_runtime": 3.1378,
"eval_samples_per_second": 13.067,
"eval_steps_per_second": 3.506,
"step": 1000
},
{
"epoch": 1.9940239043824701,
"grad_norm": 1.4155470820818645,
"learning_rate": 2.2029855510474762e-10,
"loss": 0.0368,
"step": 1001
},
{
"epoch": 1.99601593625498,
"grad_norm": 1.7851317595092946,
"learning_rate": 9.791086837573905e-11,
"loss": 0.0397,
"step": 1002
},
{
"epoch": 1.99800796812749,
"grad_norm": 1.4719784863811807,
"learning_rate": 2.4477777010312175e-11,
"loss": 0.0351,
"step": 1003
},
{
"epoch": 2.0,
"grad_norm": 1.1952136070405424,
"learning_rate": 0.0,
"loss": 0.028,
"step": 1004
},
{
"epoch": 2.0,
"step": 1004,
"total_flos": 18718017454080.0,
"train_loss": 0.09493647176267735,
"train_runtime": 1399.7108,
"train_samples_per_second": 5.73,
"train_steps_per_second": 0.717
}
],
"logging_steps": 1,
"max_steps": 1004,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 18718017454080.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
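
The record above closes the Trainer state: `log_history` holds one entry per optimizer step (loss, gradient norm, learning rate), plus the evaluation record at step 1000 and the final aggregate summary, while the trailing keys capture the trainer configuration at save time. As a minimal sketch of how this file can be consumed, the Python snippet below loads it and plots the per-step loss and learning-rate curves; the filename `trainer_state.json` and the matplotlib dependency are assumptions for illustration, not part of the checkpoint itself.

import json

import matplotlib.pyplot as plt

# Assumed path: Hugging Face Trainer normally writes this state file as
# "trainer_state.json" inside each checkpoint directory.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step training records (the ones carrying a "loss" key);
# the eval record and the final summary use different keys and drop out.
train_records = [r for r in state["log_history"] if "loss" in r]

steps = [r["step"] for r in train_records]
losses = [r["loss"] for r in train_records]
lrs = [r["learning_rate"] for r in train_records]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()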