{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 1376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014534883720930232, "grad_norm": 4.90753978197305, "learning_rate": 9.999986968251269e-06, "loss": 0.2756, "step": 1 }, { "epoch": 0.0029069767441860465, "grad_norm": 2.0826321480239645, "learning_rate": 9.99994787307301e-06, "loss": 0.2055, "step": 2 }, { "epoch": 0.00436046511627907, "grad_norm": 2.0071189367521165, "learning_rate": 9.999882714669009e-06, "loss": 0.2428, "step": 3 }, { "epoch": 0.005813953488372093, "grad_norm": 1.8308409567606185, "learning_rate": 9.99979149337892e-06, "loss": 0.1988, "step": 4 }, { "epoch": 0.007267441860465116, "grad_norm": 2.1523095681813, "learning_rate": 9.999674209678253e-06, "loss": 0.2308, "step": 5 }, { "epoch": 0.00872093023255814, "grad_norm": 2.042843363698041, "learning_rate": 9.999530864178371e-06, "loss": 0.1847, "step": 6 }, { "epoch": 0.010174418604651164, "grad_norm": 1.7333272867252485, "learning_rate": 9.999361457626493e-06, "loss": 0.2195, "step": 7 }, { "epoch": 0.011627906976744186, "grad_norm": 1.68765684619863, "learning_rate": 9.999165990905684e-06, "loss": 0.1968, "step": 8 }, { "epoch": 0.01308139534883721, "grad_norm": 2.3397968827666062, "learning_rate": 9.99894446503485e-06, "loss": 0.1655, "step": 9 }, { "epoch": 0.014534883720930232, "grad_norm": 1.8740327980184355, "learning_rate": 9.998696881168743e-06, "loss": 0.2018, "step": 10 }, { "epoch": 0.015988372093023256, "grad_norm": 1.8903939988289815, "learning_rate": 9.998423240597942e-06, "loss": 0.1755, "step": 11 }, { "epoch": 0.01744186046511628, "grad_norm": 2.536766624700735, "learning_rate": 9.998123544748852e-06, "loss": 0.2422, "step": 12 }, { "epoch": 0.0188953488372093, "grad_norm": 2.019676282269962, "learning_rate": 9.997797795183699e-06, "loss": 0.1937, "step": 13 }, { "epoch": 0.020348837209302327, "grad_norm": 1.717985207230675, "learning_rate": 9.997445993600516e-06, "loss": 0.1596, "step": 14 }, { "epoch": 0.02180232558139535, "grad_norm": 1.7865449356650134, "learning_rate": 9.99706814183314e-06, "loss": 0.1943, "step": 15 }, { "epoch": 0.023255813953488372, "grad_norm": 1.9620864278100416, "learning_rate": 9.996664241851197e-06, "loss": 0.1623, "step": 16 }, { "epoch": 0.024709302325581394, "grad_norm": 1.8373307354674375, "learning_rate": 9.996234295760099e-06, "loss": 0.2007, "step": 17 }, { "epoch": 0.02616279069767442, "grad_norm": 2.227861617466709, "learning_rate": 9.995778305801025e-06, "loss": 0.18, "step": 18 }, { "epoch": 0.027616279069767442, "grad_norm": 1.6895392880355353, "learning_rate": 9.995296274350912e-06, "loss": 0.1483, "step": 19 }, { "epoch": 0.029069767441860465, "grad_norm": 1.7640692153593953, "learning_rate": 9.994788203922447e-06, "loss": 0.149, "step": 20 }, { "epoch": 0.030523255813953487, "grad_norm": 1.455602398611072, "learning_rate": 9.994254097164047e-06, "loss": 0.1534, "step": 21 }, { "epoch": 0.03197674418604651, "grad_norm": 1.932687065520904, "learning_rate": 9.993693956859849e-06, "loss": 0.1628, "step": 22 }, { "epoch": 0.03343023255813953, "grad_norm": 1.86140975395309, "learning_rate": 9.9931077859297e-06, "loss": 0.189, "step": 23 }, { "epoch": 0.03488372093023256, "grad_norm": 1.6940825298691309, "learning_rate": 9.99249558742913e-06, "loss": 0.1636, "step": 24 }, { "epoch": 0.036337209302325583, "grad_norm": 1.9970017291615039, "learning_rate": 9.991857364549347e-06, "loss": 0.1354, "step": 25 }, { "epoch": 0.0377906976744186, "grad_norm": 1.611614767401661, "learning_rate": 9.991193120617213e-06, "loss": 0.1271, "step": 26 }, { "epoch": 0.03924418604651163, "grad_norm": 1.6668640058803244, "learning_rate": 9.990502859095234e-06, "loss": 0.1471, "step": 27 }, { "epoch": 0.040697674418604654, "grad_norm": 1.6279188049671565, "learning_rate": 9.989786583581535e-06, "loss": 0.1672, "step": 28 }, { "epoch": 0.04215116279069767, "grad_norm": 1.7050930977268255, "learning_rate": 9.989044297809846e-06, "loss": 0.1621, "step": 29 }, { "epoch": 0.0436046511627907, "grad_norm": 1.6059761856688315, "learning_rate": 9.98827600564948e-06, "loss": 0.1413, "step": 30 }, { "epoch": 0.04505813953488372, "grad_norm": 1.657144516453862, "learning_rate": 9.987481711105312e-06, "loss": 0.1747, "step": 31 }, { "epoch": 0.046511627906976744, "grad_norm": 1.5640565805511477, "learning_rate": 9.986661418317759e-06, "loss": 0.1553, "step": 32 }, { "epoch": 0.04796511627906977, "grad_norm": 1.3799874670130874, "learning_rate": 9.985815131562765e-06, "loss": 0.1185, "step": 33 }, { "epoch": 0.04941860465116279, "grad_norm": 1.6674345854485941, "learning_rate": 9.984942855251765e-06, "loss": 0.1666, "step": 34 }, { "epoch": 0.050872093023255814, "grad_norm": 1.6727808497465833, "learning_rate": 9.984044593931674e-06, "loss": 0.1918, "step": 35 }, { "epoch": 0.05232558139534884, "grad_norm": 1.5015093467589995, "learning_rate": 9.983120352284861e-06, "loss": 0.1464, "step": 36 }, { "epoch": 0.05377906976744186, "grad_norm": 1.505665041651918, "learning_rate": 9.982170135129116e-06, "loss": 0.1259, "step": 37 }, { "epoch": 0.055232558139534885, "grad_norm": 1.5003368863482498, "learning_rate": 9.981193947417638e-06, "loss": 0.1187, "step": 38 }, { "epoch": 0.056686046511627904, "grad_norm": 1.900977624703376, "learning_rate": 9.980191794239e-06, "loss": 0.1584, "step": 39 }, { "epoch": 0.05813953488372093, "grad_norm": 2.1817255633060855, "learning_rate": 9.979163680817124e-06, "loss": 0.1711, "step": 40 }, { "epoch": 0.059593023255813955, "grad_norm": 1.9211415894532136, "learning_rate": 9.978109612511257e-06, "loss": 0.1723, "step": 41 }, { "epoch": 0.061046511627906974, "grad_norm": 3.230526894351387, "learning_rate": 9.977029594815942e-06, "loss": 0.2277, "step": 42 }, { "epoch": 0.0625, "grad_norm": 1.6529179532438223, "learning_rate": 9.975923633360985e-06, "loss": 0.1547, "step": 43 }, { "epoch": 0.06395348837209303, "grad_norm": 1.7575964357251361, "learning_rate": 9.974791733911431e-06, "loss": 0.2112, "step": 44 }, { "epoch": 0.06540697674418605, "grad_norm": 1.7248448102436156, "learning_rate": 9.973633902367532e-06, "loss": 0.1583, "step": 45 }, { "epoch": 0.06686046511627906, "grad_norm": 1.747621900146478, "learning_rate": 9.972450144764713e-06, "loss": 0.1747, "step": 46 }, { "epoch": 0.06831395348837209, "grad_norm": 1.8290141742667902, "learning_rate": 9.971240467273552e-06, "loss": 0.1524, "step": 47 }, { "epoch": 0.06976744186046512, "grad_norm": 1.8430702236241727, "learning_rate": 9.970004876199731e-06, "loss": 0.1496, "step": 48 }, { "epoch": 0.07122093023255814, "grad_norm": 2.2055666937974614, "learning_rate": 9.968743377984013e-06, "loss": 0.17, "step": 49 }, { "epoch": 0.07267441860465117, "grad_norm": 1.538052515526653, "learning_rate": 9.967455979202214e-06, "loss": 0.154, "step": 50 }, { "epoch": 0.07412790697674419, "grad_norm": 1.633366596107583, "learning_rate": 9.966142686565155e-06, "loss": 0.1578, "step": 51 }, { "epoch": 0.0755813953488372, "grad_norm": 1.6546648500780583, "learning_rate": 9.964803506918634e-06, "loss": 0.16, "step": 52 }, { "epoch": 0.07703488372093023, "grad_norm": 1.6111923238734047, "learning_rate": 9.963438447243394e-06, "loss": 0.1182, "step": 53 }, { "epoch": 0.07848837209302326, "grad_norm": 1.5663545251276552, "learning_rate": 9.96204751465508e-06, "loss": 0.1348, "step": 54 }, { "epoch": 0.07994186046511628, "grad_norm": 1.5491604249911004, "learning_rate": 9.960630716404205e-06, "loss": 0.1305, "step": 55 }, { "epoch": 0.08139534883720931, "grad_norm": 1.5904201245888179, "learning_rate": 9.959188059876115e-06, "loss": 0.1485, "step": 56 }, { "epoch": 0.08284883720930232, "grad_norm": 1.8216437492726838, "learning_rate": 9.957719552590944e-06, "loss": 0.1787, "step": 57 }, { "epoch": 0.08430232558139535, "grad_norm": 1.423635987100293, "learning_rate": 9.956225202203576e-06, "loss": 0.1335, "step": 58 }, { "epoch": 0.08575581395348837, "grad_norm": 1.651615225486003, "learning_rate": 9.954705016503614e-06, "loss": 0.1339, "step": 59 }, { "epoch": 0.0872093023255814, "grad_norm": 1.658410900976534, "learning_rate": 9.95315900341533e-06, "loss": 0.1618, "step": 60 }, { "epoch": 0.08866279069767442, "grad_norm": 1.7421372069577428, "learning_rate": 9.951587170997621e-06, "loss": 0.1559, "step": 61 }, { "epoch": 0.09011627906976744, "grad_norm": 1.6410793659457708, "learning_rate": 9.949989527443982e-06, "loss": 0.1187, "step": 62 }, { "epoch": 0.09156976744186046, "grad_norm": 1.3084758751505552, "learning_rate": 9.948366081082446e-06, "loss": 0.1078, "step": 63 }, { "epoch": 0.09302325581395349, "grad_norm": 1.8532352795685818, "learning_rate": 9.946716840375552e-06, "loss": 0.1787, "step": 64 }, { "epoch": 0.09447674418604651, "grad_norm": 1.7832244807577888, "learning_rate": 9.945041813920296e-06, "loss": 0.1327, "step": 65 }, { "epoch": 0.09593023255813954, "grad_norm": 1.5586347200013577, "learning_rate": 9.943341010448086e-06, "loss": 0.1254, "step": 66 }, { "epoch": 0.09738372093023256, "grad_norm": 1.3472131423550082, "learning_rate": 9.941614438824703e-06, "loss": 0.1473, "step": 67 }, { "epoch": 0.09883720930232558, "grad_norm": 1.6722596945098753, "learning_rate": 9.939862108050244e-06, "loss": 0.1816, "step": 68 }, { "epoch": 0.1002906976744186, "grad_norm": 1.5162995990299708, "learning_rate": 9.93808402725908e-06, "loss": 0.1418, "step": 69 }, { "epoch": 0.10174418604651163, "grad_norm": 1.7264251629725236, "learning_rate": 9.936280205719817e-06, "loss": 0.1845, "step": 70 }, { "epoch": 0.10319767441860465, "grad_norm": 1.8868123011708366, "learning_rate": 9.934450652835233e-06, "loss": 0.1454, "step": 71 }, { "epoch": 0.10465116279069768, "grad_norm": 1.6996113668126296, "learning_rate": 9.932595378142233e-06, "loss": 0.1611, "step": 72 }, { "epoch": 0.10610465116279069, "grad_norm": 1.3926857562718415, "learning_rate": 9.930714391311813e-06, "loss": 0.13, "step": 73 }, { "epoch": 0.10755813953488372, "grad_norm": 1.3730967025306435, "learning_rate": 9.928807702148986e-06, "loss": 0.14, "step": 74 }, { "epoch": 0.10901162790697674, "grad_norm": 1.6148257033625841, "learning_rate": 9.926875320592756e-06, "loss": 0.2184, "step": 75 }, { "epoch": 0.11046511627906977, "grad_norm": 1.6770555686431594, "learning_rate": 9.924917256716042e-06, "loss": 0.1343, "step": 76 }, { "epoch": 0.1119186046511628, "grad_norm": 1.5552428567558088, "learning_rate": 9.922933520725645e-06, "loss": 0.1228, "step": 77 }, { "epoch": 0.11337209302325581, "grad_norm": 1.6238590358211495, "learning_rate": 9.920924122962185e-06, "loss": 0.1883, "step": 78 }, { "epoch": 0.11482558139534883, "grad_norm": 1.3715125415573524, "learning_rate": 9.918889073900046e-06, "loss": 0.1204, "step": 79 }, { "epoch": 0.11627906976744186, "grad_norm": 1.5437673536268413, "learning_rate": 9.91682838414733e-06, "loss": 0.1443, "step": 80 }, { "epoch": 0.11773255813953488, "grad_norm": 1.9238885575325282, "learning_rate": 9.914742064445795e-06, "loss": 0.151, "step": 81 }, { "epoch": 0.11918604651162791, "grad_norm": 1.5000041779719893, "learning_rate": 9.912630125670793e-06, "loss": 0.1478, "step": 82 }, { "epoch": 0.12063953488372094, "grad_norm": 1.4161989982638625, "learning_rate": 9.910492578831231e-06, "loss": 0.1205, "step": 83 }, { "epoch": 0.12209302325581395, "grad_norm": 1.887456331836179, "learning_rate": 9.908329435069495e-06, "loss": 0.1892, "step": 84 }, { "epoch": 0.12354651162790697, "grad_norm": 1.4188322937106472, "learning_rate": 9.906140705661406e-06, "loss": 0.1354, "step": 85 }, { "epoch": 0.125, "grad_norm": 1.4674049503326925, "learning_rate": 9.903926402016153e-06, "loss": 0.1188, "step": 86 }, { "epoch": 0.12645348837209303, "grad_norm": 1.7991239782209092, "learning_rate": 9.901686535676233e-06, "loss": 0.1758, "step": 87 }, { "epoch": 0.12790697674418605, "grad_norm": 1.755012463932795, "learning_rate": 9.899421118317399e-06, "loss": 0.1811, "step": 88 }, { "epoch": 0.12936046511627908, "grad_norm": 1.6961857675852638, "learning_rate": 9.897130161748588e-06, "loss": 0.1617, "step": 89 }, { "epoch": 0.1308139534883721, "grad_norm": 1.8355331445231158, "learning_rate": 9.894813677911868e-06, "loss": 0.1709, "step": 90 }, { "epoch": 0.13226744186046513, "grad_norm": 1.637508412196323, "learning_rate": 9.892471678882377e-06, "loss": 0.1467, "step": 91 }, { "epoch": 0.13372093023255813, "grad_norm": 2.2746840658968677, "learning_rate": 9.890104176868246e-06, "loss": 0.1416, "step": 92 }, { "epoch": 0.13517441860465115, "grad_norm": 1.7646971278296149, "learning_rate": 9.887711184210559e-06, "loss": 0.1597, "step": 93 }, { "epoch": 0.13662790697674418, "grad_norm": 1.8116059351522782, "learning_rate": 9.885292713383264e-06, "loss": 0.1599, "step": 94 }, { "epoch": 0.1380813953488372, "grad_norm": 1.8626165104101766, "learning_rate": 9.882848776993119e-06, "loss": 0.1377, "step": 95 }, { "epoch": 0.13953488372093023, "grad_norm": 1.5910215156996428, "learning_rate": 9.880379387779637e-06, "loss": 0.1596, "step": 96 }, { "epoch": 0.14098837209302326, "grad_norm": 1.3431975319409974, "learning_rate": 9.877884558614997e-06, "loss": 0.1346, "step": 97 }, { "epoch": 0.14244186046511628, "grad_norm": 1.3362805614728352, "learning_rate": 9.875364302503995e-06, "loss": 0.1127, "step": 98 }, { "epoch": 0.1438953488372093, "grad_norm": 1.8199783165196086, "learning_rate": 9.872818632583969e-06, "loss": 0.1604, "step": 99 }, { "epoch": 0.14534883720930233, "grad_norm": 1.6827249026814992, "learning_rate": 9.870247562124731e-06, "loss": 0.1346, "step": 100 }, { "epoch": 0.14680232558139536, "grad_norm": 1.4674119242601242, "learning_rate": 9.8676511045285e-06, "loss": 0.1168, "step": 101 }, { "epoch": 0.14825581395348839, "grad_norm": 1.6729272753403375, "learning_rate": 9.865029273329826e-06, "loss": 0.1478, "step": 102 }, { "epoch": 0.14970930232558138, "grad_norm": 1.716840091927574, "learning_rate": 9.862382082195531e-06, "loss": 0.1494, "step": 103 }, { "epoch": 0.1511627906976744, "grad_norm": 1.8089594266192164, "learning_rate": 9.859709544924624e-06, "loss": 0.1362, "step": 104 }, { "epoch": 0.15261627906976744, "grad_norm": 1.5150282280782363, "learning_rate": 9.85701167544824e-06, "loss": 0.1684, "step": 105 }, { "epoch": 0.15406976744186046, "grad_norm": 1.523111761533529, "learning_rate": 9.854288487829561e-06, "loss": 0.145, "step": 106 }, { "epoch": 0.1555232558139535, "grad_norm": 1.5473976095842301, "learning_rate": 9.851539996263748e-06, "loss": 0.1349, "step": 107 }, { "epoch": 0.1569767441860465, "grad_norm": 1.9497016837218222, "learning_rate": 9.848766215077859e-06, "loss": 0.1751, "step": 108 }, { "epoch": 0.15843023255813954, "grad_norm": 1.8085748930229228, "learning_rate": 9.845967158730783e-06, "loss": 0.1401, "step": 109 }, { "epoch": 0.15988372093023256, "grad_norm": 1.7391260062163252, "learning_rate": 9.843142841813158e-06, "loss": 0.1599, "step": 110 }, { "epoch": 0.1613372093023256, "grad_norm": 1.6195797255004982, "learning_rate": 9.840293279047302e-06, "loss": 0.1632, "step": 111 }, { "epoch": 0.16279069767441862, "grad_norm": 1.82089731812025, "learning_rate": 9.837418485287126e-06, "loss": 0.1355, "step": 112 }, { "epoch": 0.16424418604651161, "grad_norm": 1.7953380194093425, "learning_rate": 9.83451847551807e-06, "loss": 0.1635, "step": 113 }, { "epoch": 0.16569767441860464, "grad_norm": 1.6497571195888396, "learning_rate": 9.831593264857011e-06, "loss": 0.1563, "step": 114 }, { "epoch": 0.16715116279069767, "grad_norm": 1.4404520426344958, "learning_rate": 9.828642868552195e-06, "loss": 0.142, "step": 115 }, { "epoch": 0.1686046511627907, "grad_norm": 1.5292175625418722, "learning_rate": 9.825667301983149e-06, "loss": 0.1322, "step": 116 }, { "epoch": 0.17005813953488372, "grad_norm": 2.0789140113841236, "learning_rate": 9.822666580660606e-06, "loss": 0.1272, "step": 117 }, { "epoch": 0.17151162790697674, "grad_norm": 1.5437780941386026, "learning_rate": 9.819640720226429e-06, "loss": 0.1699, "step": 118 }, { "epoch": 0.17296511627906977, "grad_norm": 1.5471057200671505, "learning_rate": 9.816589736453516e-06, "loss": 0.1233, "step": 119 }, { "epoch": 0.1744186046511628, "grad_norm": 1.5206963754435927, "learning_rate": 9.81351364524573e-06, "loss": 0.1265, "step": 120 }, { "epoch": 0.17587209302325582, "grad_norm": 2.2824687888973947, "learning_rate": 9.81041246263781e-06, "loss": 0.1806, "step": 121 }, { "epoch": 0.17732558139534885, "grad_norm": 1.490348186092453, "learning_rate": 9.807286204795287e-06, "loss": 0.1254, "step": 122 }, { "epoch": 0.17877906976744187, "grad_norm": 1.4547407899623217, "learning_rate": 9.804134888014407e-06, "loss": 0.1669, "step": 123 }, { "epoch": 0.18023255813953487, "grad_norm": 1.3955416003002072, "learning_rate": 9.800958528722035e-06, "loss": 0.1227, "step": 124 }, { "epoch": 0.1816860465116279, "grad_norm": 1.366189834603044, "learning_rate": 9.797757143475577e-06, "loss": 0.1141, "step": 125 }, { "epoch": 0.18313953488372092, "grad_norm": 1.784363750973548, "learning_rate": 9.794530748962894e-06, "loss": 0.1545, "step": 126 }, { "epoch": 0.18459302325581395, "grad_norm": 1.694786135391199, "learning_rate": 9.791279362002212e-06, "loss": 0.1441, "step": 127 }, { "epoch": 0.18604651162790697, "grad_norm": 1.5799996656623245, "learning_rate": 9.78800299954203e-06, "loss": 0.1495, "step": 128 }, { "epoch": 0.1875, "grad_norm": 1.810120051088718, "learning_rate": 9.784701678661045e-06, "loss": 0.1489, "step": 129 }, { "epoch": 0.18895348837209303, "grad_norm": 1.6065641177622767, "learning_rate": 9.781375416568048e-06, "loss": 0.1498, "step": 130 }, { "epoch": 0.19040697674418605, "grad_norm": 1.6146187321984817, "learning_rate": 9.778024230601846e-06, "loss": 0.1616, "step": 131 }, { "epoch": 0.19186046511627908, "grad_norm": 1.5540139065991918, "learning_rate": 9.774648138231163e-06, "loss": 0.15, "step": 132 }, { "epoch": 0.1933139534883721, "grad_norm": 1.692543845359626, "learning_rate": 9.771247157054554e-06, "loss": 0.1459, "step": 133 }, { "epoch": 0.19476744186046513, "grad_norm": 1.9872344731203477, "learning_rate": 9.767821304800312e-06, "loss": 0.153, "step": 134 }, { "epoch": 0.19622093023255813, "grad_norm": 2.1160080220932027, "learning_rate": 9.764370599326375e-06, "loss": 0.179, "step": 135 }, { "epoch": 0.19767441860465115, "grad_norm": 1.6129189257388363, "learning_rate": 9.760895058620236e-06, "loss": 0.1689, "step": 136 }, { "epoch": 0.19912790697674418, "grad_norm": 1.650799570186251, "learning_rate": 9.75739470079884e-06, "loss": 0.1671, "step": 137 }, { "epoch": 0.2005813953488372, "grad_norm": 2.030672631597455, "learning_rate": 9.753869544108504e-06, "loss": 0.18, "step": 138 }, { "epoch": 0.20203488372093023, "grad_norm": 1.319224461569878, "learning_rate": 9.75031960692481e-06, "loss": 0.1483, "step": 139 }, { "epoch": 0.20348837209302326, "grad_norm": 1.407548702023548, "learning_rate": 9.74674490775251e-06, "loss": 0.1333, "step": 140 }, { "epoch": 0.20494186046511628, "grad_norm": 1.4802679481612602, "learning_rate": 9.743145465225443e-06, "loss": 0.1117, "step": 141 }, { "epoch": 0.2063953488372093, "grad_norm": 1.4078486946857556, "learning_rate": 9.739521298106417e-06, "loss": 0.1307, "step": 142 }, { "epoch": 0.20784883720930233, "grad_norm": 1.9988611788754047, "learning_rate": 9.735872425287124e-06, "loss": 0.1269, "step": 143 }, { "epoch": 0.20930232558139536, "grad_norm": 1.4797178696285465, "learning_rate": 9.732198865788047e-06, "loss": 0.1559, "step": 144 }, { "epoch": 0.21075581395348839, "grad_norm": 1.6049649292469328, "learning_rate": 9.728500638758345e-06, "loss": 0.1381, "step": 145 }, { "epoch": 0.21220930232558138, "grad_norm": 1.811631968427576, "learning_rate": 9.724777763475765e-06, "loss": 0.1637, "step": 146 }, { "epoch": 0.2136627906976744, "grad_norm": 1.4755303711009382, "learning_rate": 9.721030259346536e-06, "loss": 0.1054, "step": 147 }, { "epoch": 0.21511627906976744, "grad_norm": 1.5259450627037843, "learning_rate": 9.71725814590527e-06, "loss": 0.1422, "step": 148 }, { "epoch": 0.21656976744186046, "grad_norm": 1.6229825202968946, "learning_rate": 9.713461442814862e-06, "loss": 0.1298, "step": 149 }, { "epoch": 0.2180232558139535, "grad_norm": 1.4725135801149893, "learning_rate": 9.709640169866385e-06, "loss": 0.1361, "step": 150 }, { "epoch": 0.2194767441860465, "grad_norm": 1.5398373955451008, "learning_rate": 9.705794346978988e-06, "loss": 0.1531, "step": 151 }, { "epoch": 0.22093023255813954, "grad_norm": 1.676675600727517, "learning_rate": 9.701923994199784e-06, "loss": 0.1579, "step": 152 }, { "epoch": 0.22238372093023256, "grad_norm": 1.6397792048445756, "learning_rate": 9.698029131703766e-06, "loss": 0.1446, "step": 153 }, { "epoch": 0.2238372093023256, "grad_norm": 1.5894721901549633, "learning_rate": 9.694109779793677e-06, "loss": 0.1533, "step": 154 }, { "epoch": 0.22529069767441862, "grad_norm": 1.6862648872718184, "learning_rate": 9.690165958899923e-06, "loss": 0.1483, "step": 155 }, { "epoch": 0.22674418604651161, "grad_norm": 1.465800326444994, "learning_rate": 9.686197689580457e-06, "loss": 0.1338, "step": 156 }, { "epoch": 0.22819767441860464, "grad_norm": 1.4653415401590757, "learning_rate": 9.682204992520674e-06, "loss": 0.1045, "step": 157 }, { "epoch": 0.22965116279069767, "grad_norm": 1.8737300808849864, "learning_rate": 9.678187888533302e-06, "loss": 0.1457, "step": 158 }, { "epoch": 0.2311046511627907, "grad_norm": 1.9112821169487715, "learning_rate": 9.674146398558303e-06, "loss": 0.157, "step": 159 }, { "epoch": 0.23255813953488372, "grad_norm": 1.6172822347717775, "learning_rate": 9.670080543662742e-06, "loss": 0.1708, "step": 160 }, { "epoch": 0.23401162790697674, "grad_norm": 1.5162309678255983, "learning_rate": 9.665990345040702e-06, "loss": 0.1407, "step": 161 }, { "epoch": 0.23546511627906977, "grad_norm": 1.5939441326156882, "learning_rate": 9.66187582401316e-06, "loss": 0.155, "step": 162 }, { "epoch": 0.2369186046511628, "grad_norm": 1.6355525068645504, "learning_rate": 9.657737002027878e-06, "loss": 0.1847, "step": 163 }, { "epoch": 0.23837209302325582, "grad_norm": 1.3046079540322713, "learning_rate": 9.653573900659292e-06, "loss": 0.1282, "step": 164 }, { "epoch": 0.23982558139534885, "grad_norm": 1.5490255723693547, "learning_rate": 9.649386541608395e-06, "loss": 0.1477, "step": 165 }, { "epoch": 0.24127906976744187, "grad_norm": 1.595779809647306, "learning_rate": 9.645174946702634e-06, "loss": 0.1678, "step": 166 }, { "epoch": 0.24273255813953487, "grad_norm": 1.2880223015165764, "learning_rate": 9.640939137895788e-06, "loss": 0.1463, "step": 167 }, { "epoch": 0.2441860465116279, "grad_norm": 1.5824936872282962, "learning_rate": 9.636679137267852e-06, "loss": 0.1709, "step": 168 }, { "epoch": 0.24563953488372092, "grad_norm": 1.5803584865776295, "learning_rate": 9.632394967024934e-06, "loss": 0.1744, "step": 169 }, { "epoch": 0.24709302325581395, "grad_norm": 1.4525033434391654, "learning_rate": 9.628086649499121e-06, "loss": 0.1441, "step": 170 }, { "epoch": 0.24854651162790697, "grad_norm": 1.3887804855032542, "learning_rate": 9.623754207148382e-06, "loss": 0.1482, "step": 171 }, { "epoch": 0.25, "grad_norm": 1.4357196619635686, "learning_rate": 9.619397662556434e-06, "loss": 0.1281, "step": 172 }, { "epoch": 0.251453488372093, "grad_norm": 1.7625555681963379, "learning_rate": 9.615017038432636e-06, "loss": 0.1772, "step": 173 }, { "epoch": 0.25290697674418605, "grad_norm": 1.493762339052703, "learning_rate": 9.610612357611868e-06, "loss": 0.1158, "step": 174 }, { "epoch": 0.2543604651162791, "grad_norm": 1.429067156820555, "learning_rate": 9.606183643054401e-06, "loss": 0.1427, "step": 175 }, { "epoch": 0.2558139534883721, "grad_norm": 1.6273610891658072, "learning_rate": 9.601730917845798e-06, "loss": 0.1567, "step": 176 }, { "epoch": 0.25726744186046513, "grad_norm": 1.7484274417059458, "learning_rate": 9.597254205196775e-06, "loss": 0.1352, "step": 177 }, { "epoch": 0.25872093023255816, "grad_norm": 1.4768497051427878, "learning_rate": 9.592753528443092e-06, "loss": 0.1554, "step": 178 }, { "epoch": 0.2601744186046512, "grad_norm": 1.792017449974729, "learning_rate": 9.588228911045423e-06, "loss": 0.1611, "step": 179 }, { "epoch": 0.2616279069767442, "grad_norm": 1.8929106049989604, "learning_rate": 9.58368037658924e-06, "loss": 0.1599, "step": 180 }, { "epoch": 0.26308139534883723, "grad_norm": 1.6939699226935878, "learning_rate": 9.579107948784684e-06, "loss": 0.1625, "step": 181 }, { "epoch": 0.26453488372093026, "grad_norm": 1.564868234763488, "learning_rate": 9.57451165146645e-06, "loss": 0.1533, "step": 182 }, { "epoch": 0.26598837209302323, "grad_norm": 1.5170878496402254, "learning_rate": 9.569891508593654e-06, "loss": 0.1435, "step": 183 }, { "epoch": 0.26744186046511625, "grad_norm": 1.3597169726380491, "learning_rate": 9.565247544249709e-06, "loss": 0.1419, "step": 184 }, { "epoch": 0.2688953488372093, "grad_norm": 1.3285568618373091, "learning_rate": 9.56057978264221e-06, "loss": 0.1199, "step": 185 }, { "epoch": 0.2703488372093023, "grad_norm": 1.610052139476346, "learning_rate": 9.55588824810279e-06, "loss": 0.1329, "step": 186 }, { "epoch": 0.27180232558139533, "grad_norm": 1.8398199886190283, "learning_rate": 9.551172965087017e-06, "loss": 0.161, "step": 187 }, { "epoch": 0.27325581395348836, "grad_norm": 1.7116096397748537, "learning_rate": 9.54643395817424e-06, "loss": 0.1918, "step": 188 }, { "epoch": 0.2747093023255814, "grad_norm": 1.8785142775891301, "learning_rate": 9.541671252067475e-06, "loss": 0.2001, "step": 189 }, { "epoch": 0.2761627906976744, "grad_norm": 1.9515749354758625, "learning_rate": 9.53688487159328e-06, "loss": 0.195, "step": 190 }, { "epoch": 0.27761627906976744, "grad_norm": 1.539979313310192, "learning_rate": 9.532074841701619e-06, "loss": 0.1586, "step": 191 }, { "epoch": 0.27906976744186046, "grad_norm": 1.6618019039238547, "learning_rate": 9.527241187465735e-06, "loss": 0.1625, "step": 192 }, { "epoch": 0.2805232558139535, "grad_norm": 1.534997769425568, "learning_rate": 9.522383934082009e-06, "loss": 0.1421, "step": 193 }, { "epoch": 0.2819767441860465, "grad_norm": 1.6350100832134988, "learning_rate": 9.517503106869845e-06, "loss": 0.1254, "step": 194 }, { "epoch": 0.28343023255813954, "grad_norm": 1.4730057580868088, "learning_rate": 9.512598731271532e-06, "loss": 0.1575, "step": 195 }, { "epoch": 0.28488372093023256, "grad_norm": 2.071129803071774, "learning_rate": 9.507670832852103e-06, "loss": 0.1589, "step": 196 }, { "epoch": 0.2863372093023256, "grad_norm": 2.002706015726789, "learning_rate": 9.502719437299212e-06, "loss": 0.1739, "step": 197 }, { "epoch": 0.2877906976744186, "grad_norm": 1.0738206830624688, "learning_rate": 9.497744570422997e-06, "loss": 0.1045, "step": 198 }, { "epoch": 0.28924418604651164, "grad_norm": 1.6809668906114832, "learning_rate": 9.492746258155944e-06, "loss": 0.1688, "step": 199 }, { "epoch": 0.29069767441860467, "grad_norm": 1.568763317573355, "learning_rate": 9.487724526552753e-06, "loss": 0.154, "step": 200 }, { "epoch": 0.29069767441860467, "eval_loss": 0.14336518943309784, "eval_runtime": 2.2549, "eval_samples_per_second": 24.835, "eval_steps_per_second": 6.209, "step": 200 }, { "epoch": 0.2921511627906977, "grad_norm": 1.4702550707766895, "learning_rate": 9.4826794017902e-06, "loss": 0.1549, "step": 201 }, { "epoch": 0.2936046511627907, "grad_norm": 1.5525125309981525, "learning_rate": 9.477610910167005e-06, "loss": 0.138, "step": 202 }, { "epoch": 0.29505813953488375, "grad_norm": 1.9512544957864502, "learning_rate": 9.472519078103693e-06, "loss": 0.191, "step": 203 }, { "epoch": 0.29651162790697677, "grad_norm": 1.6914763992393016, "learning_rate": 9.467403932142452e-06, "loss": 0.1415, "step": 204 }, { "epoch": 0.29796511627906974, "grad_norm": 1.4472857294705104, "learning_rate": 9.462265498947002e-06, "loss": 0.1429, "step": 205 }, { "epoch": 0.29941860465116277, "grad_norm": 1.5823405481416377, "learning_rate": 9.457103805302454e-06, "loss": 0.1326, "step": 206 }, { "epoch": 0.3008720930232558, "grad_norm": 1.420341202696158, "learning_rate": 9.451918878115163e-06, "loss": 0.128, "step": 207 }, { "epoch": 0.3023255813953488, "grad_norm": 1.3234585094233786, "learning_rate": 9.446710744412595e-06, "loss": 0.1409, "step": 208 }, { "epoch": 0.30377906976744184, "grad_norm": 1.4054727085825658, "learning_rate": 9.441479431343189e-06, "loss": 0.1295, "step": 209 }, { "epoch": 0.30523255813953487, "grad_norm": 1.762998737448682, "learning_rate": 9.436224966176205e-06, "loss": 0.1832, "step": 210 }, { "epoch": 0.3066860465116279, "grad_norm": 1.6833002091383082, "learning_rate": 9.430947376301593e-06, "loss": 0.1393, "step": 211 }, { "epoch": 0.3081395348837209, "grad_norm": 1.4120192730188195, "learning_rate": 9.425646689229843e-06, "loss": 0.1295, "step": 212 }, { "epoch": 0.30959302325581395, "grad_norm": 1.1626004083245838, "learning_rate": 9.420322932591842e-06, "loss": 0.1202, "step": 213 }, { "epoch": 0.311046511627907, "grad_norm": 1.5259387861927707, "learning_rate": 9.414976134138736e-06, "loss": 0.1193, "step": 214 }, { "epoch": 0.3125, "grad_norm": 1.619894973779481, "learning_rate": 9.409606321741776e-06, "loss": 0.1698, "step": 215 }, { "epoch": 0.313953488372093, "grad_norm": 1.7822094110301971, "learning_rate": 9.404213523392183e-06, "loss": 0.1393, "step": 216 }, { "epoch": 0.31540697674418605, "grad_norm": 1.5931388190429225, "learning_rate": 9.39879776720099e-06, "loss": 0.1383, "step": 217 }, { "epoch": 0.3168604651162791, "grad_norm": 2.072625803623837, "learning_rate": 9.393359081398914e-06, "loss": 0.1834, "step": 218 }, { "epoch": 0.3183139534883721, "grad_norm": 1.7269561451971633, "learning_rate": 9.387897494336182e-06, "loss": 0.2005, "step": 219 }, { "epoch": 0.31976744186046513, "grad_norm": 1.9777378749032901, "learning_rate": 9.38241303448241e-06, "loss": 0.1648, "step": 220 }, { "epoch": 0.32122093023255816, "grad_norm": 1.7036324366034152, "learning_rate": 9.376905730426438e-06, "loss": 0.1661, "step": 221 }, { "epoch": 0.3226744186046512, "grad_norm": 1.6114418788776652, "learning_rate": 9.371375610876189e-06, "loss": 0.1871, "step": 222 }, { "epoch": 0.3241279069767442, "grad_norm": 1.6496270048093367, "learning_rate": 9.365822704658511e-06, "loss": 0.1683, "step": 223 }, { "epoch": 0.32558139534883723, "grad_norm": 1.9563162812494876, "learning_rate": 9.36024704071904e-06, "loss": 0.1941, "step": 224 }, { "epoch": 0.32703488372093026, "grad_norm": 2.0820637027256086, "learning_rate": 9.354648648122032e-06, "loss": 0.1951, "step": 225 }, { "epoch": 0.32848837209302323, "grad_norm": 1.5149036427355425, "learning_rate": 9.349027556050225e-06, "loss": 0.1985, "step": 226 }, { "epoch": 0.32994186046511625, "grad_norm": 1.3330622481681265, "learning_rate": 9.343383793804688e-06, "loss": 0.0971, "step": 227 }, { "epoch": 0.3313953488372093, "grad_norm": 1.917698823268353, "learning_rate": 9.337717390804653e-06, "loss": 0.1743, "step": 228 }, { "epoch": 0.3328488372093023, "grad_norm": 1.477889283501478, "learning_rate": 9.332028376587377e-06, "loss": 0.1367, "step": 229 }, { "epoch": 0.33430232558139533, "grad_norm": 1.35154982485599, "learning_rate": 9.326316780807982e-06, "loss": 0.1498, "step": 230 }, { "epoch": 0.33575581395348836, "grad_norm": 1.9401902636815154, "learning_rate": 9.320582633239303e-06, "loss": 0.1633, "step": 231 }, { "epoch": 0.3372093023255814, "grad_norm": 1.5805620021792157, "learning_rate": 9.314825963771724e-06, "loss": 0.172, "step": 232 }, { "epoch": 0.3386627906976744, "grad_norm": 1.2624039358903756, "learning_rate": 9.309046802413033e-06, "loss": 0.1445, "step": 233 }, { "epoch": 0.34011627906976744, "grad_norm": 1.6329242629844094, "learning_rate": 9.303245179288265e-06, "loss": 0.1617, "step": 234 }, { "epoch": 0.34156976744186046, "grad_norm": 1.572417591830697, "learning_rate": 9.297421124639534e-06, "loss": 0.1901, "step": 235 }, { "epoch": 0.3430232558139535, "grad_norm": 1.4594857772846002, "learning_rate": 9.29157466882589e-06, "loss": 0.1408, "step": 236 }, { "epoch": 0.3444767441860465, "grad_norm": 1.341145951847404, "learning_rate": 9.28570584232315e-06, "loss": 0.1468, "step": 237 }, { "epoch": 0.34593023255813954, "grad_norm": 1.3186091526025143, "learning_rate": 9.27981467572374e-06, "loss": 0.1269, "step": 238 }, { "epoch": 0.34738372093023256, "grad_norm": 1.2632212032396861, "learning_rate": 9.273901199736544e-06, "loss": 0.1329, "step": 239 }, { "epoch": 0.3488372093023256, "grad_norm": 1.2397475494586065, "learning_rate": 9.267965445186733e-06, "loss": 0.1188, "step": 240 }, { "epoch": 0.3502906976744186, "grad_norm": 1.4040827076553737, "learning_rate": 9.262007443015614e-06, "loss": 0.1217, "step": 241 }, { "epoch": 0.35174418604651164, "grad_norm": 1.5619651302909152, "learning_rate": 9.25602722428046e-06, "loss": 0.1372, "step": 242 }, { "epoch": 0.35319767441860467, "grad_norm": 1.759985209874878, "learning_rate": 9.250024820154356e-06, "loss": 0.1545, "step": 243 }, { "epoch": 0.3546511627906977, "grad_norm": 1.426120234598387, "learning_rate": 9.24400026192603e-06, "loss": 0.1254, "step": 244 }, { "epoch": 0.3561046511627907, "grad_norm": 2.1333005251482238, "learning_rate": 9.237953580999694e-06, "loss": 0.1715, "step": 245 }, { "epoch": 0.35755813953488375, "grad_norm": 1.4669057043638045, "learning_rate": 9.231884808894877e-06, "loss": 0.1589, "step": 246 }, { "epoch": 0.35901162790697677, "grad_norm": 1.70352194581075, "learning_rate": 9.225793977246267e-06, "loss": 0.1714, "step": 247 }, { "epoch": 0.36046511627906974, "grad_norm": 1.6341050240990393, "learning_rate": 9.219681117803537e-06, "loss": 0.1715, "step": 248 }, { "epoch": 0.36191860465116277, "grad_norm": 1.5094144987016003, "learning_rate": 9.213546262431185e-06, "loss": 0.1195, "step": 249 }, { "epoch": 0.3633720930232558, "grad_norm": 1.4662745386322011, "learning_rate": 9.207389443108372e-06, "loss": 0.1502, "step": 250 }, { "epoch": 0.3648255813953488, "grad_norm": 1.2772800586235213, "learning_rate": 9.201210691928745e-06, "loss": 0.1211, "step": 251 }, { "epoch": 0.36627906976744184, "grad_norm": 1.4892966499782139, "learning_rate": 9.195010041100276e-06, "loss": 0.1168, "step": 252 }, { "epoch": 0.36773255813953487, "grad_norm": 1.423847678645897, "learning_rate": 9.188787522945098e-06, "loss": 0.1338, "step": 253 }, { "epoch": 0.3691860465116279, "grad_norm": 1.5018029337938783, "learning_rate": 9.182543169899325e-06, "loss": 0.1324, "step": 254 }, { "epoch": 0.3706395348837209, "grad_norm": 1.3714052238212613, "learning_rate": 9.176277014512894e-06, "loss": 0.1568, "step": 255 }, { "epoch": 0.37209302325581395, "grad_norm": 1.409010922605016, "learning_rate": 9.16998908944939e-06, "loss": 0.1604, "step": 256 }, { "epoch": 0.373546511627907, "grad_norm": 1.6529698075548698, "learning_rate": 9.163679427485878e-06, "loss": 0.1567, "step": 257 }, { "epoch": 0.375, "grad_norm": 1.3541902426493249, "learning_rate": 9.157348061512728e-06, "loss": 0.1557, "step": 258 }, { "epoch": 0.376453488372093, "grad_norm": 1.4327109786235201, "learning_rate": 9.150995024533446e-06, "loss": 0.1578, "step": 259 }, { "epoch": 0.37790697674418605, "grad_norm": 1.3363028855734624, "learning_rate": 9.14462034966451e-06, "loss": 0.1404, "step": 260 }, { "epoch": 0.3793604651162791, "grad_norm": 1.8919021627256019, "learning_rate": 9.138224070135183e-06, "loss": 0.1841, "step": 261 }, { "epoch": 0.3808139534883721, "grad_norm": 1.48877635650518, "learning_rate": 9.131806219287344e-06, "loss": 0.1292, "step": 262 }, { "epoch": 0.38226744186046513, "grad_norm": 1.4141106832003385, "learning_rate": 9.125366830575325e-06, "loss": 0.1355, "step": 263 }, { "epoch": 0.38372093023255816, "grad_norm": 1.3866461149312288, "learning_rate": 9.118905937565723e-06, "loss": 0.1493, "step": 264 }, { "epoch": 0.3851744186046512, "grad_norm": 1.3927609057507213, "learning_rate": 9.112423573937232e-06, "loss": 0.1187, "step": 265 }, { "epoch": 0.3866279069767442, "grad_norm": 1.63044313572066, "learning_rate": 9.105919773480464e-06, "loss": 0.1604, "step": 266 }, { "epoch": 0.38808139534883723, "grad_norm": 1.6214111304578023, "learning_rate": 9.09939457009778e-06, "loss": 0.1917, "step": 267 }, { "epoch": 0.38953488372093026, "grad_norm": 1.4173048655266718, "learning_rate": 9.092847997803098e-06, "loss": 0.17, "step": 268 }, { "epoch": 0.39098837209302323, "grad_norm": 1.6513577218808286, "learning_rate": 9.08628009072174e-06, "loss": 0.1725, "step": 269 }, { "epoch": 0.39244186046511625, "grad_norm": 1.5628095311374188, "learning_rate": 9.079690883090227e-06, "loss": 0.1608, "step": 270 }, { "epoch": 0.3938953488372093, "grad_norm": 1.9629889905768871, "learning_rate": 9.073080409256118e-06, "loss": 0.1587, "step": 271 }, { "epoch": 0.3953488372093023, "grad_norm": 1.947239430105324, "learning_rate": 9.066448703677828e-06, "loss": 0.2092, "step": 272 }, { "epoch": 0.39680232558139533, "grad_norm": 1.6096007627861755, "learning_rate": 9.059795800924445e-06, "loss": 0.2076, "step": 273 }, { "epoch": 0.39825581395348836, "grad_norm": 1.4682775876201015, "learning_rate": 9.053121735675552e-06, "loss": 0.1338, "step": 274 }, { "epoch": 0.3997093023255814, "grad_norm": 1.541753647412304, "learning_rate": 9.046426542721046e-06, "loss": 0.1714, "step": 275 }, { "epoch": 0.4011627906976744, "grad_norm": 1.413689581379683, "learning_rate": 9.039710256960956e-06, "loss": 0.1346, "step": 276 }, { "epoch": 0.40261627906976744, "grad_norm": 1.6313953237696512, "learning_rate": 9.03297291340526e-06, "loss": 0.1596, "step": 277 }, { "epoch": 0.40406976744186046, "grad_norm": 1.2910181632753897, "learning_rate": 9.026214547173706e-06, "loss": 0.1553, "step": 278 }, { "epoch": 0.4055232558139535, "grad_norm": 1.4228459831893367, "learning_rate": 9.019435193495627e-06, "loss": 0.1377, "step": 279 }, { "epoch": 0.4069767441860465, "grad_norm": 1.459679839660881, "learning_rate": 9.012634887709755e-06, "loss": 0.1404, "step": 280 }, { "epoch": 0.40843023255813954, "grad_norm": 1.6321159835021386, "learning_rate": 9.005813665264042e-06, "loss": 0.1393, "step": 281 }, { "epoch": 0.40988372093023256, "grad_norm": 1.9115197748274007, "learning_rate": 8.998971561715468e-06, "loss": 0.164, "step": 282 }, { "epoch": 0.4113372093023256, "grad_norm": 1.5888048449790153, "learning_rate": 8.992108612729868e-06, "loss": 0.1422, "step": 283 }, { "epoch": 0.4127906976744186, "grad_norm": 2.3019049975561585, "learning_rate": 8.985224854081727e-06, "loss": 0.1863, "step": 284 }, { "epoch": 0.41424418604651164, "grad_norm": 1.5757906262469172, "learning_rate": 8.978320321654014e-06, "loss": 0.1531, "step": 285 }, { "epoch": 0.41569767441860467, "grad_norm": 1.3184585148618004, "learning_rate": 8.97139505143798e-06, "loss": 0.1035, "step": 286 }, { "epoch": 0.4171511627906977, "grad_norm": 1.9152332078193026, "learning_rate": 8.964449079532978e-06, "loss": 0.1982, "step": 287 }, { "epoch": 0.4186046511627907, "grad_norm": 1.3318524817371034, "learning_rate": 8.957482442146271e-06, "loss": 0.1422, "step": 288 }, { "epoch": 0.42005813953488375, "grad_norm": 1.2994676654595296, "learning_rate": 8.950495175592849e-06, "loss": 0.0954, "step": 289 }, { "epoch": 0.42151162790697677, "grad_norm": 1.3863595317355537, "learning_rate": 8.94348731629523e-06, "loss": 0.1481, "step": 290 }, { "epoch": 0.42296511627906974, "grad_norm": 1.4930615065996538, "learning_rate": 8.93645890078328e-06, "loss": 0.1474, "step": 291 }, { "epoch": 0.42441860465116277, "grad_norm": 1.1744911565495457, "learning_rate": 8.929409965694016e-06, "loss": 0.1228, "step": 292 }, { "epoch": 0.4258720930232558, "grad_norm": 1.3229206282158534, "learning_rate": 8.92234054777142e-06, "loss": 0.1165, "step": 293 }, { "epoch": 0.4273255813953488, "grad_norm": 1.500633285810525, "learning_rate": 8.915250683866242e-06, "loss": 0.1561, "step": 294 }, { "epoch": 0.42877906976744184, "grad_norm": 1.4979998345117733, "learning_rate": 8.908140410935813e-06, "loss": 0.1466, "step": 295 }, { "epoch": 0.43023255813953487, "grad_norm": 1.2712924829276973, "learning_rate": 8.901009766043846e-06, "loss": 0.1026, "step": 296 }, { "epoch": 0.4316860465116279, "grad_norm": 1.6863134441377277, "learning_rate": 8.893858786360255e-06, "loss": 0.1687, "step": 297 }, { "epoch": 0.4331395348837209, "grad_norm": 1.5887492952986007, "learning_rate": 8.886687509160944e-06, "loss": 0.1973, "step": 298 }, { "epoch": 0.43459302325581395, "grad_norm": 1.6354886094555063, "learning_rate": 8.879495971827628e-06, "loss": 0.1881, "step": 299 }, { "epoch": 0.436046511627907, "grad_norm": 1.5874859267729486, "learning_rate": 8.872284211847629e-06, "loss": 0.1105, "step": 300 }, { "epoch": 0.4375, "grad_norm": 1.7021255827480994, "learning_rate": 8.865052266813686e-06, "loss": 0.1592, "step": 301 }, { "epoch": 0.438953488372093, "grad_norm": 1.1905429804396237, "learning_rate": 8.857800174423754e-06, "loss": 0.1569, "step": 302 }, { "epoch": 0.44040697674418605, "grad_norm": 1.7925350240691666, "learning_rate": 8.850527972480812e-06, "loss": 0.1823, "step": 303 }, { "epoch": 0.4418604651162791, "grad_norm": 1.682150104412601, "learning_rate": 8.843235698892661e-06, "loss": 0.1725, "step": 304 }, { "epoch": 0.4433139534883721, "grad_norm": 1.4842418533887083, "learning_rate": 8.835923391671735e-06, "loss": 0.1095, "step": 305 }, { "epoch": 0.44476744186046513, "grad_norm": 1.4624470494811705, "learning_rate": 8.828591088934894e-06, "loss": 0.1286, "step": 306 }, { "epoch": 0.44622093023255816, "grad_norm": 1.3843653917366865, "learning_rate": 8.821238828903227e-06, "loss": 0.1423, "step": 307 }, { "epoch": 0.4476744186046512, "grad_norm": 1.4949237211694288, "learning_rate": 8.813866649901857e-06, "loss": 0.1426, "step": 308 }, { "epoch": 0.4491279069767442, "grad_norm": 1.338762423782917, "learning_rate": 8.806474590359736e-06, "loss": 0.1609, "step": 309 }, { "epoch": 0.45058139534883723, "grad_norm": 1.3119459351922982, "learning_rate": 8.799062688809452e-06, "loss": 0.1508, "step": 310 }, { "epoch": 0.45203488372093026, "grad_norm": 1.4964356641654488, "learning_rate": 8.79163098388702e-06, "loss": 0.1536, "step": 311 }, { "epoch": 0.45348837209302323, "grad_norm": 1.8958424478348768, "learning_rate": 8.784179514331683e-06, "loss": 0.2154, "step": 312 }, { "epoch": 0.45494186046511625, "grad_norm": 1.616943621767028, "learning_rate": 8.776708318985712e-06, "loss": 0.1338, "step": 313 }, { "epoch": 0.4563953488372093, "grad_norm": 1.3369024538015155, "learning_rate": 8.769217436794205e-06, "loss": 0.1481, "step": 314 }, { "epoch": 0.4578488372093023, "grad_norm": 1.5002304811430793, "learning_rate": 8.761706906804878e-06, "loss": 0.1484, "step": 315 }, { "epoch": 0.45930232558139533, "grad_norm": 1.5317354250290076, "learning_rate": 8.75417676816787e-06, "loss": 0.1388, "step": 316 }, { "epoch": 0.46075581395348836, "grad_norm": 1.5881770732342904, "learning_rate": 8.746627060135528e-06, "loss": 0.1607, "step": 317 }, { "epoch": 0.4622093023255814, "grad_norm": 1.2933179184781483, "learning_rate": 8.73905782206221e-06, "loss": 0.1425, "step": 318 }, { "epoch": 0.4636627906976744, "grad_norm": 1.4900348199011015, "learning_rate": 8.731469093404086e-06, "loss": 0.1686, "step": 319 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2187036161656735, "learning_rate": 8.72386091371891e-06, "loss": 0.156, "step": 320 }, { "epoch": 0.46656976744186046, "grad_norm": 1.5361392026631566, "learning_rate": 8.71623332266584e-06, "loss": 0.1697, "step": 321 }, { "epoch": 0.4680232558139535, "grad_norm": 1.5624029046245063, "learning_rate": 8.708586360005218e-06, "loss": 0.1565, "step": 322 }, { "epoch": 0.4694767441860465, "grad_norm": 1.2313528832746294, "learning_rate": 8.700920065598358e-06, "loss": 0.1319, "step": 323 }, { "epoch": 0.47093023255813954, "grad_norm": 1.4770385920550462, "learning_rate": 8.693234479407353e-06, "loss": 0.1212, "step": 324 }, { "epoch": 0.47238372093023256, "grad_norm": 2.1148845935494913, "learning_rate": 8.685529641494852e-06, "loss": 0.1714, "step": 325 }, { "epoch": 0.4738372093023256, "grad_norm": 1.3818082146663953, "learning_rate": 8.677805592023858e-06, "loss": 0.1328, "step": 326 }, { "epoch": 0.4752906976744186, "grad_norm": 2.0939762202676935, "learning_rate": 8.670062371257525e-06, "loss": 0.2428, "step": 327 }, { "epoch": 0.47674418604651164, "grad_norm": 1.5066771176481124, "learning_rate": 8.662300019558931e-06, "loss": 0.1397, "step": 328 }, { "epoch": 0.47819767441860467, "grad_norm": 1.3186753628942027, "learning_rate": 8.654518577390885e-06, "loss": 0.1513, "step": 329 }, { "epoch": 0.4796511627906977, "grad_norm": 1.6619235949225295, "learning_rate": 8.646718085315707e-06, "loss": 0.1637, "step": 330 }, { "epoch": 0.4811046511627907, "grad_norm": 1.5149079703407748, "learning_rate": 8.638898583995016e-06, "loss": 0.1614, "step": 331 }, { "epoch": 0.48255813953488375, "grad_norm": 1.6949649519406504, "learning_rate": 8.631060114189526e-06, "loss": 0.1613, "step": 332 }, { "epoch": 0.48401162790697677, "grad_norm": 1.6341150247952272, "learning_rate": 8.62320271675882e-06, "loss": 0.139, "step": 333 }, { "epoch": 0.48546511627906974, "grad_norm": 1.5722940293537715, "learning_rate": 8.615326432661155e-06, "loss": 0.1389, "step": 334 }, { "epoch": 0.48691860465116277, "grad_norm": 1.6784313452366777, "learning_rate": 8.607431302953229e-06, "loss": 0.1586, "step": 335 }, { "epoch": 0.4883720930232558, "grad_norm": 1.579470915019441, "learning_rate": 8.599517368789981e-06, "loss": 0.1628, "step": 336 }, { "epoch": 0.4898255813953488, "grad_norm": 1.8175899396499493, "learning_rate": 8.591584671424371e-06, "loss": 0.1778, "step": 337 }, { "epoch": 0.49127906976744184, "grad_norm": 1.3679144266460825, "learning_rate": 8.583633252207171e-06, "loss": 0.145, "step": 338 }, { "epoch": 0.49273255813953487, "grad_norm": 1.5973543371186025, "learning_rate": 8.575663152586735e-06, "loss": 0.1371, "step": 339 }, { "epoch": 0.4941860465116279, "grad_norm": 1.4223743951319792, "learning_rate": 8.5676744141088e-06, "loss": 0.1349, "step": 340 }, { "epoch": 0.4956395348837209, "grad_norm": 1.7867654597125795, "learning_rate": 8.559667078416257e-06, "loss": 0.1652, "step": 341 }, { "epoch": 0.49709302325581395, "grad_norm": 1.6721772672683666, "learning_rate": 8.551641187248942e-06, "loss": 0.1827, "step": 342 }, { "epoch": 0.498546511627907, "grad_norm": 1.5334190767352909, "learning_rate": 8.543596782443415e-06, "loss": 0.1367, "step": 343 }, { "epoch": 0.5, "grad_norm": 1.4823817376465154, "learning_rate": 8.535533905932739e-06, "loss": 0.1427, "step": 344 }, { "epoch": 0.501453488372093, "grad_norm": 1.5846228452591529, "learning_rate": 8.527452599746265e-06, "loss": 0.1705, "step": 345 }, { "epoch": 0.502906976744186, "grad_norm": 1.3811944073529825, "learning_rate": 8.519352906009417e-06, "loss": 0.135, "step": 346 }, { "epoch": 0.5043604651162791, "grad_norm": 1.4875376615322078, "learning_rate": 8.511234866943463e-06, "loss": 0.1388, "step": 347 }, { "epoch": 0.5058139534883721, "grad_norm": 1.402324894315079, "learning_rate": 8.5030985248653e-06, "loss": 0.1474, "step": 348 }, { "epoch": 0.5072674418604651, "grad_norm": 1.463484821168073, "learning_rate": 8.494943922187236e-06, "loss": 0.1234, "step": 349 }, { "epoch": 0.5087209302325582, "grad_norm": 1.6398780382743268, "learning_rate": 8.486771101416765e-06, "loss": 0.1335, "step": 350 }, { "epoch": 0.5101744186046512, "grad_norm": 1.244410077865948, "learning_rate": 8.47858010515634e-06, "loss": 0.1499, "step": 351 }, { "epoch": 0.5116279069767442, "grad_norm": 1.7990830264360216, "learning_rate": 8.470370976103171e-06, "loss": 0.1662, "step": 352 }, { "epoch": 0.5130813953488372, "grad_norm": 1.7219602924486799, "learning_rate": 8.462143757048976e-06, "loss": 0.1294, "step": 353 }, { "epoch": 0.5145348837209303, "grad_norm": 1.1737595986370917, "learning_rate": 8.453898490879776e-06, "loss": 0.1391, "step": 354 }, { "epoch": 0.5159883720930233, "grad_norm": 1.5033196146866203, "learning_rate": 8.445635220575663e-06, "loss": 0.1207, "step": 355 }, { "epoch": 0.5174418604651163, "grad_norm": 1.526089460093771, "learning_rate": 8.43735398921059e-06, "loss": 0.1575, "step": 356 }, { "epoch": 0.5188953488372093, "grad_norm": 1.0613154614063016, "learning_rate": 8.429054839952122e-06, "loss": 0.107, "step": 357 }, { "epoch": 0.5203488372093024, "grad_norm": 1.5465351587584268, "learning_rate": 8.42073781606123e-06, "loss": 0.1412, "step": 358 }, { "epoch": 0.5218023255813954, "grad_norm": 1.6263257643507738, "learning_rate": 8.412402960892061e-06, "loss": 0.1528, "step": 359 }, { "epoch": 0.5232558139534884, "grad_norm": 1.4428369608603286, "learning_rate": 8.40405031789171e-06, "loss": 0.1229, "step": 360 }, { "epoch": 0.5247093023255814, "grad_norm": 1.5887049579267734, "learning_rate": 8.395679930599997e-06, "loss": 0.1768, "step": 361 }, { "epoch": 0.5261627906976745, "grad_norm": 1.6132505441168516, "learning_rate": 8.387291842649234e-06, "loss": 0.147, "step": 362 }, { "epoch": 0.5276162790697675, "grad_norm": 1.6221175313012306, "learning_rate": 8.378886097764001e-06, "loss": 0.1393, "step": 363 }, { "epoch": 0.5290697674418605, "grad_norm": 1.3815088653312464, "learning_rate": 8.370462739760922e-06, "loss": 0.1346, "step": 364 }, { "epoch": 0.5305232558139535, "grad_norm": 1.613218374432377, "learning_rate": 8.362021812548433e-06, "loss": 0.1296, "step": 365 }, { "epoch": 0.5319767441860465, "grad_norm": 1.5239164352989918, "learning_rate": 8.353563360126548e-06, "loss": 0.2012, "step": 366 }, { "epoch": 0.5334302325581395, "grad_norm": 1.4465320300081779, "learning_rate": 8.345087426586638e-06, "loss": 0.1436, "step": 367 }, { "epoch": 0.5348837209302325, "grad_norm": 1.6179330922890798, "learning_rate": 8.336594056111197e-06, "loss": 0.1699, "step": 368 }, { "epoch": 0.5363372093023255, "grad_norm": 1.2917432875508248, "learning_rate": 8.328083292973617e-06, "loss": 0.1294, "step": 369 }, { "epoch": 0.5377906976744186, "grad_norm": 1.823166535138271, "learning_rate": 8.319555181537942e-06, "loss": 0.1693, "step": 370 }, { "epoch": 0.5392441860465116, "grad_norm": 1.5964311134632778, "learning_rate": 8.311009766258659e-06, "loss": 0.1838, "step": 371 }, { "epoch": 0.5406976744186046, "grad_norm": 1.5287488881908324, "learning_rate": 8.30244709168045e-06, "loss": 0.164, "step": 372 }, { "epoch": 0.5421511627906976, "grad_norm": 1.617466008500704, "learning_rate": 8.293867202437962e-06, "loss": 0.1675, "step": 373 }, { "epoch": 0.5436046511627907, "grad_norm": 1.81589698419327, "learning_rate": 8.285270143255579e-06, "loss": 0.158, "step": 374 }, { "epoch": 0.5450581395348837, "grad_norm": 1.496211076073237, "learning_rate": 8.27665595894719e-06, "loss": 0.1332, "step": 375 }, { "epoch": 0.5465116279069767, "grad_norm": 1.528765465197372, "learning_rate": 8.268024694415949e-06, "loss": 0.1313, "step": 376 }, { "epoch": 0.5479651162790697, "grad_norm": 1.7252370903269931, "learning_rate": 8.25937639465404e-06, "loss": 0.1635, "step": 377 }, { "epoch": 0.5494186046511628, "grad_norm": 1.3299953218632383, "learning_rate": 8.250711104742453e-06, "loss": 0.1198, "step": 378 }, { "epoch": 0.5508720930232558, "grad_norm": 1.6165817454268103, "learning_rate": 8.242028869850743e-06, "loss": 0.142, "step": 379 }, { "epoch": 0.5523255813953488, "grad_norm": 1.4748568202518297, "learning_rate": 8.23332973523679e-06, "loss": 0.1134, "step": 380 }, { "epoch": 0.5537790697674418, "grad_norm": 1.327698827166681, "learning_rate": 8.224613746246565e-06, "loss": 0.1465, "step": 381 }, { "epoch": 0.5552325581395349, "grad_norm": 1.3793313249754087, "learning_rate": 8.215880948313904e-06, "loss": 0.1304, "step": 382 }, { "epoch": 0.5566860465116279, "grad_norm": 1.7504804510682308, "learning_rate": 8.207131386960256e-06, "loss": 0.1673, "step": 383 }, { "epoch": 0.5581395348837209, "grad_norm": 1.595850264406846, "learning_rate": 8.198365107794457e-06, "loss": 0.1444, "step": 384 }, { "epoch": 0.559593023255814, "grad_norm": 1.5020289813450294, "learning_rate": 8.189582156512484e-06, "loss": 0.1403, "step": 385 }, { "epoch": 0.561046511627907, "grad_norm": 1.4326207754311362, "learning_rate": 8.180782578897225e-06, "loss": 0.0998, "step": 386 }, { "epoch": 0.5625, "grad_norm": 1.7057272776604826, "learning_rate": 8.171966420818227e-06, "loss": 0.1388, "step": 387 }, { "epoch": 0.563953488372093, "grad_norm": 1.7222325633827718, "learning_rate": 8.163133728231482e-06, "loss": 0.1897, "step": 388 }, { "epoch": 0.565406976744186, "grad_norm": 1.473100288550089, "learning_rate": 8.154284547179158e-06, "loss": 0.1419, "step": 389 }, { "epoch": 0.5668604651162791, "grad_norm": 1.9904385322120357, "learning_rate": 8.145418923789375e-06, "loss": 0.1935, "step": 390 }, { "epoch": 0.5683139534883721, "grad_norm": 1.8647312855206553, "learning_rate": 8.136536904275965e-06, "loss": 0.2022, "step": 391 }, { "epoch": 0.5697674418604651, "grad_norm": 1.7630172852357997, "learning_rate": 8.127638534938227e-06, "loss": 0.1924, "step": 392 }, { "epoch": 0.5712209302325582, "grad_norm": 1.3019122719624554, "learning_rate": 8.118723862160687e-06, "loss": 0.1469, "step": 393 }, { "epoch": 0.5726744186046512, "grad_norm": 1.860210908405091, "learning_rate": 8.109792932412853e-06, "loss": 0.1508, "step": 394 }, { "epoch": 0.5741279069767442, "grad_norm": 1.345826572185921, "learning_rate": 8.10084579224898e-06, "loss": 0.155, "step": 395 }, { "epoch": 0.5755813953488372, "grad_norm": 1.9559560979112873, "learning_rate": 8.09188248830782e-06, "loss": 0.183, "step": 396 }, { "epoch": 0.5770348837209303, "grad_norm": 1.7929107775519544, "learning_rate": 8.082903067312384e-06, "loss": 0.1219, "step": 397 }, { "epoch": 0.5784883720930233, "grad_norm": 1.4217667098877558, "learning_rate": 8.073907576069692e-06, "loss": 0.1615, "step": 398 }, { "epoch": 0.5799418604651163, "grad_norm": 1.74509835604015, "learning_rate": 8.064896061470542e-06, "loss": 0.1638, "step": 399 }, { "epoch": 0.5813953488372093, "grad_norm": 1.412746062615197, "learning_rate": 8.055868570489247e-06, "loss": 0.1497, "step": 400 }, { "epoch": 0.5813953488372093, "eval_loss": 0.1344260424375534, "eval_runtime": 2.2004, "eval_samples_per_second": 25.449, "eval_steps_per_second": 6.362, "step": 400 }, { "epoch": 0.5828488372093024, "grad_norm": 1.2068327544738824, "learning_rate": 8.046825150183406e-06, "loss": 0.1201, "step": 401 }, { "epoch": 0.5843023255813954, "grad_norm": 1.6435893838564368, "learning_rate": 8.037765847693652e-06, "loss": 0.1145, "step": 402 }, { "epoch": 0.5857558139534884, "grad_norm": 1.6608170672171527, "learning_rate": 8.028690710243407e-06, "loss": 0.1279, "step": 403 }, { "epoch": 0.5872093023255814, "grad_norm": 1.4328859838303152, "learning_rate": 8.019599785138635e-06, "loss": 0.1373, "step": 404 }, { "epoch": 0.5886627906976745, "grad_norm": 1.4343862675807864, "learning_rate": 8.010493119767596e-06, "loss": 0.1498, "step": 405 }, { "epoch": 0.5901162790697675, "grad_norm": 1.6086081681876832, "learning_rate": 8.001370761600598e-06, "loss": 0.141, "step": 406 }, { "epoch": 0.5915697674418605, "grad_norm": 2.1039051172285, "learning_rate": 7.992232758189756e-06, "loss": 0.1973, "step": 407 }, { "epoch": 0.5930232558139535, "grad_norm": 1.8140948838773037, "learning_rate": 7.983079157168736e-06, "loss": 0.1748, "step": 408 }, { "epoch": 0.5944767441860465, "grad_norm": 1.3362665282466653, "learning_rate": 7.973910006252508e-06, "loss": 0.1397, "step": 409 }, { "epoch": 0.5959302325581395, "grad_norm": 1.5133021367871378, "learning_rate": 7.9647253532371e-06, "loss": 0.1227, "step": 410 }, { "epoch": 0.5973837209302325, "grad_norm": 1.3742813696710579, "learning_rate": 7.955525245999348e-06, "loss": 0.1292, "step": 411 }, { "epoch": 0.5988372093023255, "grad_norm": 1.1612211072347856, "learning_rate": 7.946309732496646e-06, "loss": 0.1167, "step": 412 }, { "epoch": 0.6002906976744186, "grad_norm": 1.2766729466996067, "learning_rate": 7.9370788607667e-06, "loss": 0.1229, "step": 413 }, { "epoch": 0.6017441860465116, "grad_norm": 1.5577927339728042, "learning_rate": 7.927832678927265e-06, "loss": 0.1267, "step": 414 }, { "epoch": 0.6031976744186046, "grad_norm": 1.252636212314083, "learning_rate": 7.918571235175914e-06, "loss": 0.1487, "step": 415 }, { "epoch": 0.6046511627906976, "grad_norm": 1.4878931355701959, "learning_rate": 7.909294577789765e-06, "loss": 0.1456, "step": 416 }, { "epoch": 0.6061046511627907, "grad_norm": 1.6750463601592944, "learning_rate": 7.900002755125249e-06, "loss": 0.1539, "step": 417 }, { "epoch": 0.6075581395348837, "grad_norm": 1.670161301173932, "learning_rate": 7.890695815617844e-06, "loss": 0.1588, "step": 418 }, { "epoch": 0.6090116279069767, "grad_norm": 1.5574686810516989, "learning_rate": 7.881373807781827e-06, "loss": 0.1598, "step": 419 }, { "epoch": 0.6104651162790697, "grad_norm": 1.4407087883766663, "learning_rate": 7.872036780210025e-06, "loss": 0.1292, "step": 420 }, { "epoch": 0.6119186046511628, "grad_norm": 1.532394578818048, "learning_rate": 7.86268478157356e-06, "loss": 0.1417, "step": 421 }, { "epoch": 0.6133720930232558, "grad_norm": 1.558475109143411, "learning_rate": 7.853317860621586e-06, "loss": 0.1243, "step": 422 }, { "epoch": 0.6148255813953488, "grad_norm": 1.2251102819390556, "learning_rate": 7.843936066181049e-06, "loss": 0.1218, "step": 423 }, { "epoch": 0.6162790697674418, "grad_norm": 1.3841095970618222, "learning_rate": 7.834539447156424e-06, "loss": 0.1085, "step": 424 }, { "epoch": 0.6177325581395349, "grad_norm": 1.3356825728851078, "learning_rate": 7.825128052529462e-06, "loss": 0.1116, "step": 425 }, { "epoch": 0.6191860465116279, "grad_norm": 1.5573547752820238, "learning_rate": 7.815701931358934e-06, "loss": 0.1388, "step": 426 }, { "epoch": 0.6206395348837209, "grad_norm": 1.7038054695327371, "learning_rate": 7.80626113278038e-06, "loss": 0.1519, "step": 427 }, { "epoch": 0.622093023255814, "grad_norm": 1.5094414416286532, "learning_rate": 7.796805706005843e-06, "loss": 0.1149, "step": 428 }, { "epoch": 0.623546511627907, "grad_norm": 1.4273226525726925, "learning_rate": 7.787335700323622e-06, "loss": 0.1254, "step": 429 }, { "epoch": 0.625, "grad_norm": 1.4966730289367747, "learning_rate": 7.777851165098012e-06, "loss": 0.156, "step": 430 }, { "epoch": 0.626453488372093, "grad_norm": 1.6315240876401846, "learning_rate": 7.768352149769044e-06, "loss": 0.1621, "step": 431 }, { "epoch": 0.627906976744186, "grad_norm": 1.7333824682174062, "learning_rate": 7.75883870385223e-06, "loss": 0.1376, "step": 432 }, { "epoch": 0.6293604651162791, "grad_norm": 1.602227708449946, "learning_rate": 7.749310876938306e-06, "loss": 0.1735, "step": 433 }, { "epoch": 0.6308139534883721, "grad_norm": 1.3977532588731882, "learning_rate": 7.739768718692969e-06, "loss": 0.1289, "step": 434 }, { "epoch": 0.6322674418604651, "grad_norm": 1.268259604423468, "learning_rate": 7.730212278856625e-06, "loss": 0.1017, "step": 435 }, { "epoch": 0.6337209302325582, "grad_norm": 1.3411672369388048, "learning_rate": 7.72064160724412e-06, "loss": 0.1362, "step": 436 }, { "epoch": 0.6351744186046512, "grad_norm": 1.2808388499819041, "learning_rate": 7.71105675374449e-06, "loss": 0.1394, "step": 437 }, { "epoch": 0.6366279069767442, "grad_norm": 1.295520262094418, "learning_rate": 7.701457768320696e-06, "loss": 0.1442, "step": 438 }, { "epoch": 0.6380813953488372, "grad_norm": 2.109775997490956, "learning_rate": 7.691844701009365e-06, "loss": 0.1762, "step": 439 }, { "epoch": 0.6395348837209303, "grad_norm": 1.8500489048534585, "learning_rate": 7.682217601920529e-06, "loss": 0.1796, "step": 440 }, { "epoch": 0.6409883720930233, "grad_norm": 1.5939072801392646, "learning_rate": 7.672576521237361e-06, "loss": 0.1516, "step": 441 }, { "epoch": 0.6424418604651163, "grad_norm": 1.6138091025138628, "learning_rate": 7.662921509215916e-06, "loss": 0.1829, "step": 442 }, { "epoch": 0.6438953488372093, "grad_norm": 1.676099177115984, "learning_rate": 7.653252616184875e-06, "loss": 0.1237, "step": 443 }, { "epoch": 0.6453488372093024, "grad_norm": 1.2244457381485303, "learning_rate": 7.643569892545267e-06, "loss": 0.1306, "step": 444 }, { "epoch": 0.6468023255813954, "grad_norm": 1.5503434065388142, "learning_rate": 7.633873388770223e-06, "loss": 0.1432, "step": 445 }, { "epoch": 0.6482558139534884, "grad_norm": 1.3861212995292134, "learning_rate": 7.624163155404702e-06, "loss": 0.1246, "step": 446 }, { "epoch": 0.6497093023255814, "grad_norm": 1.4732536785794919, "learning_rate": 7.614439243065235e-06, "loss": 0.171, "step": 447 }, { "epoch": 0.6511627906976745, "grad_norm": 1.9155294406879655, "learning_rate": 7.604701702439652e-06, "loss": 0.159, "step": 448 }, { "epoch": 0.6526162790697675, "grad_norm": 1.866142590003918, "learning_rate": 7.594950584286826e-06, "loss": 0.1705, "step": 449 }, { "epoch": 0.6540697674418605, "grad_norm": 1.3523787659587572, "learning_rate": 7.585185939436409e-06, "loss": 0.139, "step": 450 }, { "epoch": 0.6555232558139535, "grad_norm": 1.314104202140968, "learning_rate": 7.5754078187885586e-06, "loss": 0.1222, "step": 451 }, { "epoch": 0.6569767441860465, "grad_norm": 1.3649972766280627, "learning_rate": 7.5656162733136776e-06, "loss": 0.1429, "step": 452 }, { "epoch": 0.6584302325581395, "grad_norm": 1.3599029511484766, "learning_rate": 7.555811354052152e-06, "loss": 0.1483, "step": 453 }, { "epoch": 0.6598837209302325, "grad_norm": 1.249236779922474, "learning_rate": 7.545993112114078e-06, "loss": 0.135, "step": 454 }, { "epoch": 0.6613372093023255, "grad_norm": 1.4068263051558898, "learning_rate": 7.536161598679002e-06, "loss": 0.14, "step": 455 }, { "epoch": 0.6627906976744186, "grad_norm": 1.873871839563441, "learning_rate": 7.526316864995648e-06, "loss": 0.1585, "step": 456 }, { "epoch": 0.6642441860465116, "grad_norm": 1.506919292093952, "learning_rate": 7.516458962381654e-06, "loss": 0.1308, "step": 457 }, { "epoch": 0.6656976744186046, "grad_norm": 1.561352936275751, "learning_rate": 7.506587942223305e-06, "loss": 0.1374, "step": 458 }, { "epoch": 0.6671511627906976, "grad_norm": 1.520294700430976, "learning_rate": 7.4967038559752626e-06, "loss": 0.1181, "step": 459 }, { "epoch": 0.6686046511627907, "grad_norm": 1.6731627584388915, "learning_rate": 7.486806755160298e-06, "loss": 0.1595, "step": 460 }, { "epoch": 0.6700581395348837, "grad_norm": 1.3704388693068905, "learning_rate": 7.476896691369023e-06, "loss": 0.1188, "step": 461 }, { "epoch": 0.6715116279069767, "grad_norm": 1.4337091782306097, "learning_rate": 7.466973716259622e-06, "loss": 0.1132, "step": 462 }, { "epoch": 0.6729651162790697, "grad_norm": 1.3716017344315858, "learning_rate": 7.457037881557585e-06, "loss": 0.1334, "step": 463 }, { "epoch": 0.6744186046511628, "grad_norm": 1.5766750432421248, "learning_rate": 7.447089239055428e-06, "loss": 0.1143, "step": 464 }, { "epoch": 0.6758720930232558, "grad_norm": 1.5864606252067934, "learning_rate": 7.437127840612438e-06, "loss": 0.1309, "step": 465 }, { "epoch": 0.6773255813953488, "grad_norm": 1.4567542884868516, "learning_rate": 7.4271537381543916e-06, "loss": 0.147, "step": 466 }, { "epoch": 0.6787790697674418, "grad_norm": 1.4514082699066, "learning_rate": 7.417166983673286e-06, "loss": 0.1551, "step": 467 }, { "epoch": 0.6802325581395349, "grad_norm": 1.2445772110855982, "learning_rate": 7.407167629227072e-06, "loss": 0.1066, "step": 468 }, { "epoch": 0.6816860465116279, "grad_norm": 1.370255809316678, "learning_rate": 7.3971557269393805e-06, "loss": 0.1377, "step": 469 }, { "epoch": 0.6831395348837209, "grad_norm": 1.4592622873191892, "learning_rate": 7.3871313289992466e-06, "loss": 0.1314, "step": 470 }, { "epoch": 0.684593023255814, "grad_norm": 1.6311413610361725, "learning_rate": 7.377094487660847e-06, "loss": 0.1628, "step": 471 }, { "epoch": 0.686046511627907, "grad_norm": 1.5762097334368965, "learning_rate": 7.367045255243217e-06, "loss": 0.156, "step": 472 }, { "epoch": 0.6875, "grad_norm": 1.7627902314989499, "learning_rate": 7.3569836841299905e-06, "loss": 0.1315, "step": 473 }, { "epoch": 0.688953488372093, "grad_norm": 2.222043724851836, "learning_rate": 7.346909826769107e-06, "loss": 0.1731, "step": 474 }, { "epoch": 0.690406976744186, "grad_norm": 1.5406069442857668, "learning_rate": 7.336823735672563e-06, "loss": 0.1386, "step": 475 }, { "epoch": 0.6918604651162791, "grad_norm": 1.5796855692443843, "learning_rate": 7.326725463416118e-06, "loss": 0.15, "step": 476 }, { "epoch": 0.6933139534883721, "grad_norm": 1.4662737935999774, "learning_rate": 7.316615062639031e-06, "loss": 0.1435, "step": 477 }, { "epoch": 0.6947674418604651, "grad_norm": 1.2313632203692446, "learning_rate": 7.306492586043783e-06, "loss": 0.0919, "step": 478 }, { "epoch": 0.6962209302325582, "grad_norm": 1.7509061840145403, "learning_rate": 7.296358086395803e-06, "loss": 0.1372, "step": 479 }, { "epoch": 0.6976744186046512, "grad_norm": 1.6433660385495745, "learning_rate": 7.286211616523193e-06, "loss": 0.131, "step": 480 }, { "epoch": 0.6991279069767442, "grad_norm": 1.8160423455057901, "learning_rate": 7.276053229316451e-06, "loss": 0.1945, "step": 481 }, { "epoch": 0.7005813953488372, "grad_norm": 1.4727605332371438, "learning_rate": 7.265882977728195e-06, "loss": 0.1382, "step": 482 }, { "epoch": 0.7020348837209303, "grad_norm": 1.4064135407009675, "learning_rate": 7.255700914772891e-06, "loss": 0.1266, "step": 483 }, { "epoch": 0.7034883720930233, "grad_norm": 1.6003335768968785, "learning_rate": 7.245507093526575e-06, "loss": 0.169, "step": 484 }, { "epoch": 0.7049418604651163, "grad_norm": 1.3092194579961982, "learning_rate": 7.2353015671265716e-06, "loss": 0.1184, "step": 485 }, { "epoch": 0.7063953488372093, "grad_norm": 1.5489983894775146, "learning_rate": 7.225084388771226e-06, "loss": 0.1524, "step": 486 }, { "epoch": 0.7078488372093024, "grad_norm": 1.4796324887099654, "learning_rate": 7.214855611719616e-06, "loss": 0.1592, "step": 487 }, { "epoch": 0.7093023255813954, "grad_norm": 1.4347850977601093, "learning_rate": 7.204615289291283e-06, "loss": 0.1618, "step": 488 }, { "epoch": 0.7107558139534884, "grad_norm": 1.5275682896148945, "learning_rate": 7.194363474865951e-06, "loss": 0.1281, "step": 489 }, { "epoch": 0.7122093023255814, "grad_norm": 1.785937536200274, "learning_rate": 7.184100221883248e-06, "loss": 0.1454, "step": 490 }, { "epoch": 0.7136627906976745, "grad_norm": 1.2976847404347416, "learning_rate": 7.173825583842427e-06, "loss": 0.1317, "step": 491 }, { "epoch": 0.7151162790697675, "grad_norm": 1.166513744560416, "learning_rate": 7.163539614302088e-06, "loss": 0.13, "step": 492 }, { "epoch": 0.7165697674418605, "grad_norm": 1.3524452986795923, "learning_rate": 7.153242366879903e-06, "loss": 0.1514, "step": 493 }, { "epoch": 0.7180232558139535, "grad_norm": 1.1994423678946693, "learning_rate": 7.142933895252324e-06, "loss": 0.1194, "step": 494 }, { "epoch": 0.7194767441860465, "grad_norm": 1.2798330620724652, "learning_rate": 7.1326142531543184e-06, "loss": 0.1322, "step": 495 }, { "epoch": 0.7209302325581395, "grad_norm": 1.2830117846056146, "learning_rate": 7.122283494379076e-06, "loss": 0.1368, "step": 496 }, { "epoch": 0.7223837209302325, "grad_norm": 1.6773615252194154, "learning_rate": 7.1119416727777414e-06, "loss": 0.152, "step": 497 }, { "epoch": 0.7238372093023255, "grad_norm": 1.4919923732243248, "learning_rate": 7.101588842259122e-06, "loss": 0.1712, "step": 498 }, { "epoch": 0.7252906976744186, "grad_norm": 1.4335685593529308, "learning_rate": 7.09122505678941e-06, "loss": 0.1476, "step": 499 }, { "epoch": 0.7267441860465116, "grad_norm": 1.6645358118933977, "learning_rate": 7.080850370391907e-06, "loss": 0.1275, "step": 500 }, { "epoch": 0.7281976744186046, "grad_norm": 1.4570433505689948, "learning_rate": 7.0704648371467355e-06, "loss": 0.1173, "step": 501 }, { "epoch": 0.7296511627906976, "grad_norm": 1.4500084348795894, "learning_rate": 7.060068511190559e-06, "loss": 0.1343, "step": 502 }, { "epoch": 0.7311046511627907, "grad_norm": 1.3583549655605316, "learning_rate": 7.049661446716298e-06, "loss": 0.1392, "step": 503 }, { "epoch": 0.7325581395348837, "grad_norm": 1.678903155981633, "learning_rate": 7.039243697972856e-06, "loss": 0.1237, "step": 504 }, { "epoch": 0.7340116279069767, "grad_norm": 1.1813005908965784, "learning_rate": 7.028815319264825e-06, "loss": 0.1239, "step": 505 }, { "epoch": 0.7354651162790697, "grad_norm": 1.6621537638966262, "learning_rate": 7.01837636495221e-06, "loss": 0.1373, "step": 506 }, { "epoch": 0.7369186046511628, "grad_norm": 1.4624579516595546, "learning_rate": 7.007926889450142e-06, "loss": 0.1074, "step": 507 }, { "epoch": 0.7383720930232558, "grad_norm": 1.6615046264229711, "learning_rate": 6.997466947228596e-06, "loss": 0.149, "step": 508 }, { "epoch": 0.7398255813953488, "grad_norm": 1.4750586970164907, "learning_rate": 6.9869965928121095e-06, "loss": 0.11, "step": 509 }, { "epoch": 0.7412790697674418, "grad_norm": 1.2507783508104329, "learning_rate": 6.976515880779492e-06, "loss": 0.1141, "step": 510 }, { "epoch": 0.7427325581395349, "grad_norm": 1.545121618824877, "learning_rate": 6.966024865763546e-06, "loss": 0.1291, "step": 511 }, { "epoch": 0.7441860465116279, "grad_norm": 1.438312213107679, "learning_rate": 6.95552360245078e-06, "loss": 0.1502, "step": 512 }, { "epoch": 0.7456395348837209, "grad_norm": 1.6592935733066172, "learning_rate": 6.945012145581127e-06, "loss": 0.175, "step": 513 }, { "epoch": 0.747093023255814, "grad_norm": 1.514707717226784, "learning_rate": 6.9344905499476475e-06, "loss": 0.15, "step": 514 }, { "epoch": 0.748546511627907, "grad_norm": 1.604276122246373, "learning_rate": 6.9239588703962625e-06, "loss": 0.1572, "step": 515 }, { "epoch": 0.75, "grad_norm": 1.3281897884109624, "learning_rate": 6.913417161825449e-06, "loss": 0.1519, "step": 516 }, { "epoch": 0.751453488372093, "grad_norm": 1.4267995539927456, "learning_rate": 6.90286547918597e-06, "loss": 0.1335, "step": 517 }, { "epoch": 0.752906976744186, "grad_norm": 1.5170579839155427, "learning_rate": 6.8923038774805705e-06, "loss": 0.1519, "step": 518 }, { "epoch": 0.7543604651162791, "grad_norm": 1.3797508342143487, "learning_rate": 6.881732411763712e-06, "loss": 0.1343, "step": 519 }, { "epoch": 0.7558139534883721, "grad_norm": 1.5104127395592026, "learning_rate": 6.871151137141266e-06, "loss": 0.138, "step": 520 }, { "epoch": 0.7572674418604651, "grad_norm": 1.3568718550092838, "learning_rate": 6.860560108770238e-06, "loss": 0.1367, "step": 521 }, { "epoch": 0.7587209302325582, "grad_norm": 1.5091260876222339, "learning_rate": 6.849959381858475e-06, "loss": 0.1186, "step": 522 }, { "epoch": 0.7601744186046512, "grad_norm": 1.318107897408335, "learning_rate": 6.839349011664381e-06, "loss": 0.111, "step": 523 }, { "epoch": 0.7616279069767442, "grad_norm": 1.4214672012040783, "learning_rate": 6.828729053496629e-06, "loss": 0.1174, "step": 524 }, { "epoch": 0.7630813953488372, "grad_norm": 1.4659206181489879, "learning_rate": 6.8180995627138665e-06, "loss": 0.1492, "step": 525 }, { "epoch": 0.7645348837209303, "grad_norm": 1.7334326757131346, "learning_rate": 6.80746059472444e-06, "loss": 0.1363, "step": 526 }, { "epoch": 0.7659883720930233, "grad_norm": 1.4971209402674073, "learning_rate": 6.796812204986087e-06, "loss": 0.1465, "step": 527 }, { "epoch": 0.7674418604651163, "grad_norm": 1.4222274425753152, "learning_rate": 6.786154449005664e-06, "loss": 0.1356, "step": 528 }, { "epoch": 0.7688953488372093, "grad_norm": 1.1331281504639361, "learning_rate": 6.775487382338854e-06, "loss": 0.0901, "step": 529 }, { "epoch": 0.7703488372093024, "grad_norm": 1.4096278465211138, "learning_rate": 6.764811060589867e-06, "loss": 0.1083, "step": 530 }, { "epoch": 0.7718023255813954, "grad_norm": 1.4098544395095702, "learning_rate": 6.754125539411159e-06, "loss": 0.1217, "step": 531 }, { "epoch": 0.7732558139534884, "grad_norm": 1.131188089335205, "learning_rate": 6.743430874503143e-06, "loss": 0.1337, "step": 532 }, { "epoch": 0.7747093023255814, "grad_norm": 1.2696620283091575, "learning_rate": 6.732727121613894e-06, "loss": 0.097, "step": 533 }, { "epoch": 0.7761627906976745, "grad_norm": 1.5173238831421139, "learning_rate": 6.722014336538858e-06, "loss": 0.1178, "step": 534 }, { "epoch": 0.7776162790697675, "grad_norm": 1.393380969750145, "learning_rate": 6.7112925751205636e-06, "loss": 0.1409, "step": 535 }, { "epoch": 0.7790697674418605, "grad_norm": 1.5020699777762692, "learning_rate": 6.700561893248332e-06, "loss": 0.1635, "step": 536 }, { "epoch": 0.7805232558139535, "grad_norm": 1.319982727969388, "learning_rate": 6.689822346857983e-06, "loss": 0.1047, "step": 537 }, { "epoch": 0.7819767441860465, "grad_norm": 1.6061685140083173, "learning_rate": 6.679073991931544e-06, "loss": 0.1513, "step": 538 }, { "epoch": 0.7834302325581395, "grad_norm": 1.451224509588678, "learning_rate": 6.66831688449696e-06, "loss": 0.1223, "step": 539 }, { "epoch": 0.7848837209302325, "grad_norm": 1.3257073380514315, "learning_rate": 6.657551080627801e-06, "loss": 0.147, "step": 540 }, { "epoch": 0.7863372093023255, "grad_norm": 1.412854606695532, "learning_rate": 6.646776636442964e-06, "loss": 0.1427, "step": 541 }, { "epoch": 0.7877906976744186, "grad_norm": 1.2307963654566814, "learning_rate": 6.63599360810639e-06, "loss": 0.1149, "step": 542 }, { "epoch": 0.7892441860465116, "grad_norm": 1.2378486319546478, "learning_rate": 6.6252020518267664e-06, "loss": 0.145, "step": 543 }, { "epoch": 0.7906976744186046, "grad_norm": 1.4375410232495538, "learning_rate": 6.614402023857231e-06, "loss": 0.1458, "step": 544 }, { "epoch": 0.7921511627906976, "grad_norm": 1.7682907013920814, "learning_rate": 6.603593580495088e-06, "loss": 0.1492, "step": 545 }, { "epoch": 0.7936046511627907, "grad_norm": 1.5849164073586606, "learning_rate": 6.5927767780815e-06, "loss": 0.1244, "step": 546 }, { "epoch": 0.7950581395348837, "grad_norm": 1.4013284382226423, "learning_rate": 6.581951673001212e-06, "loss": 0.1575, "step": 547 }, { "epoch": 0.7965116279069767, "grad_norm": 1.6703536565707453, "learning_rate": 6.5711183216822405e-06, "loss": 0.1345, "step": 548 }, { "epoch": 0.7979651162790697, "grad_norm": 1.5575391293614544, "learning_rate": 6.56027678059559e-06, "loss": 0.1424, "step": 549 }, { "epoch": 0.7994186046511628, "grad_norm": 1.1506378616299937, "learning_rate": 6.549427106254959e-06, "loss": 0.1204, "step": 550 }, { "epoch": 0.8008720930232558, "grad_norm": 1.6831420345046888, "learning_rate": 6.5385693552164375e-06, "loss": 0.1533, "step": 551 }, { "epoch": 0.8023255813953488, "grad_norm": 1.5433557869849786, "learning_rate": 6.527703584078219e-06, "loss": 0.1287, "step": 552 }, { "epoch": 0.8037790697674418, "grad_norm": 1.37938065902107, "learning_rate": 6.516829849480304e-06, "loss": 0.1466, "step": 553 }, { "epoch": 0.8052325581395349, "grad_norm": 1.4138166190724506, "learning_rate": 6.505948208104202e-06, "loss": 0.1336, "step": 554 }, { "epoch": 0.8066860465116279, "grad_norm": 1.2958346006809633, "learning_rate": 6.495058716672641e-06, "loss": 0.1155, "step": 555 }, { "epoch": 0.8081395348837209, "grad_norm": 1.536894893444166, "learning_rate": 6.4841614319492665e-06, "loss": 0.1467, "step": 556 }, { "epoch": 0.809593023255814, "grad_norm": 1.42823984914434, "learning_rate": 6.473256410738349e-06, "loss": 0.1245, "step": 557 }, { "epoch": 0.811046511627907, "grad_norm": 1.738051363991152, "learning_rate": 6.462343709884488e-06, "loss": 0.1431, "step": 558 }, { "epoch": 0.8125, "grad_norm": 1.199922342897142, "learning_rate": 6.451423386272312e-06, "loss": 0.1328, "step": 559 }, { "epoch": 0.813953488372093, "grad_norm": 1.8691018081903936, "learning_rate": 6.440495496826189e-06, "loss": 0.1695, "step": 560 }, { "epoch": 0.815406976744186, "grad_norm": 1.431507205382704, "learning_rate": 6.429560098509919e-06, "loss": 0.1519, "step": 561 }, { "epoch": 0.8168604651162791, "grad_norm": 1.764161344903358, "learning_rate": 6.4186172483264505e-06, "loss": 0.1503, "step": 562 }, { "epoch": 0.8183139534883721, "grad_norm": 1.3488150562790815, "learning_rate": 6.4076670033175725e-06, "loss": 0.1419, "step": 563 }, { "epoch": 0.8197674418604651, "grad_norm": 1.6253830612966447, "learning_rate": 6.396709420563621e-06, "loss": 0.1779, "step": 564 }, { "epoch": 0.8212209302325582, "grad_norm": 1.628949856290011, "learning_rate": 6.385744557183181e-06, "loss": 0.1325, "step": 565 }, { "epoch": 0.8226744186046512, "grad_norm": 2.1235072067385237, "learning_rate": 6.374772470332793e-06, "loss": 0.1833, "step": 566 }, { "epoch": 0.8241279069767442, "grad_norm": 1.9172827402856294, "learning_rate": 6.363793217206645e-06, "loss": 0.1903, "step": 567 }, { "epoch": 0.8255813953488372, "grad_norm": 1.4322727783843214, "learning_rate": 6.352806855036287e-06, "loss": 0.1271, "step": 568 }, { "epoch": 0.8270348837209303, "grad_norm": 1.2966960086456427, "learning_rate": 6.341813441090323e-06, "loss": 0.146, "step": 569 }, { "epoch": 0.8284883720930233, "grad_norm": 1.550482403879787, "learning_rate": 6.330813032674116e-06, "loss": 0.1432, "step": 570 }, { "epoch": 0.8299418604651163, "grad_norm": 1.3655414999390942, "learning_rate": 6.3198056871294885e-06, "loss": 0.109, "step": 571 }, { "epoch": 0.8313953488372093, "grad_norm": 1.4217140051202517, "learning_rate": 6.308791461834427e-06, "loss": 0.1318, "step": 572 }, { "epoch": 0.8328488372093024, "grad_norm": 1.7714886028240058, "learning_rate": 6.297770414202778e-06, "loss": 0.1777, "step": 573 }, { "epoch": 0.8343023255813954, "grad_norm": 1.4570998125079995, "learning_rate": 6.286742601683947e-06, "loss": 0.1237, "step": 574 }, { "epoch": 0.8357558139534884, "grad_norm": 1.4949515901539128, "learning_rate": 6.275708081762611e-06, "loss": 0.131, "step": 575 }, { "epoch": 0.8372093023255814, "grad_norm": 1.3854089542275734, "learning_rate": 6.264666911958404e-06, "loss": 0.1125, "step": 576 }, { "epoch": 0.8386627906976745, "grad_norm": 1.5479190064330035, "learning_rate": 6.253619149825627e-06, "loss": 0.1402, "step": 577 }, { "epoch": 0.8401162790697675, "grad_norm": 1.4026025728759723, "learning_rate": 6.242564852952946e-06, "loss": 0.1209, "step": 578 }, { "epoch": 0.8415697674418605, "grad_norm": 1.4121319145908862, "learning_rate": 6.231504078963087e-06, "loss": 0.138, "step": 579 }, { "epoch": 0.8430232558139535, "grad_norm": 2.209836743441391, "learning_rate": 6.220436885512539e-06, "loss": 0.1362, "step": 580 }, { "epoch": 0.8444767441860465, "grad_norm": 1.585787336115818, "learning_rate": 6.209363330291261e-06, "loss": 0.201, "step": 581 }, { "epoch": 0.8459302325581395, "grad_norm": 1.9740958700357052, "learning_rate": 6.198283471022362e-06, "loss": 0.1469, "step": 582 }, { "epoch": 0.8473837209302325, "grad_norm": 1.4412498217025638, "learning_rate": 6.187197365461822e-06, "loss": 0.1708, "step": 583 }, { "epoch": 0.8488372093023255, "grad_norm": 1.4582305731121776, "learning_rate": 6.1761050713981795e-06, "loss": 0.1266, "step": 584 }, { "epoch": 0.8502906976744186, "grad_norm": 1.450107224126935, "learning_rate": 6.165006646652227e-06, "loss": 0.1561, "step": 585 }, { "epoch": 0.8517441860465116, "grad_norm": 2.004083462986721, "learning_rate": 6.1539021490767206e-06, "loss": 0.2133, "step": 586 }, { "epoch": 0.8531976744186046, "grad_norm": 1.6247079616074993, "learning_rate": 6.1427916365560666e-06, "loss": 0.1742, "step": 587 }, { "epoch": 0.8546511627906976, "grad_norm": 1.552175859788209, "learning_rate": 6.1316751670060295e-06, "loss": 0.136, "step": 588 }, { "epoch": 0.8561046511627907, "grad_norm": 1.276124479823587, "learning_rate": 6.120552798373423e-06, "loss": 0.1694, "step": 589 }, { "epoch": 0.8575581395348837, "grad_norm": 1.5410147115702781, "learning_rate": 6.109424588635814e-06, "loss": 0.105, "step": 590 }, { "epoch": 0.8590116279069767, "grad_norm": 1.2952469635559278, "learning_rate": 6.098290595801215e-06, "loss": 0.1179, "step": 591 }, { "epoch": 0.8604651162790697, "grad_norm": 1.1306177176882681, "learning_rate": 6.087150877907786e-06, "loss": 0.081, "step": 592 }, { "epoch": 0.8619186046511628, "grad_norm": 1.7371127866399059, "learning_rate": 6.076005493023527e-06, "loss": 0.1893, "step": 593 }, { "epoch": 0.8633720930232558, "grad_norm": 1.4638570381751588, "learning_rate": 6.0648544992459804e-06, "loss": 0.1572, "step": 594 }, { "epoch": 0.8648255813953488, "grad_norm": 1.6634088466354064, "learning_rate": 6.053697954701927e-06, "loss": 0.1517, "step": 595 }, { "epoch": 0.8662790697674418, "grad_norm": 1.620767793264197, "learning_rate": 6.04253591754708e-06, "loss": 0.1678, "step": 596 }, { "epoch": 0.8677325581395349, "grad_norm": 1.6119437429007382, "learning_rate": 6.031368445965784e-06, "loss": 0.159, "step": 597 }, { "epoch": 0.8691860465116279, "grad_norm": 1.78819833988465, "learning_rate": 6.0201955981707135e-06, "loss": 0.1993, "step": 598 }, { "epoch": 0.8706395348837209, "grad_norm": 1.95544535687204, "learning_rate": 6.009017432402569e-06, "loss": 0.1759, "step": 599 }, { "epoch": 0.872093023255814, "grad_norm": 1.5590305502883546, "learning_rate": 5.997834006929765e-06, "loss": 0.1469, "step": 600 }, { "epoch": 0.872093023255814, "eval_loss": 0.1277909129858017, "eval_runtime": 2.208, "eval_samples_per_second": 25.362, "eval_steps_per_second": 6.341, "step": 600 }, { "epoch": 0.873546511627907, "grad_norm": 1.3164705679840971, "learning_rate": 5.98664538004814e-06, "loss": 0.1003, "step": 601 }, { "epoch": 0.875, "grad_norm": 1.2239480234296696, "learning_rate": 5.975451610080643e-06, "loss": 0.1121, "step": 602 }, { "epoch": 0.876453488372093, "grad_norm": 1.894849529945871, "learning_rate": 5.964252755377033e-06, "loss": 0.1654, "step": 603 }, { "epoch": 0.877906976744186, "grad_norm": 1.2492208085247638, "learning_rate": 5.953048874313575e-06, "loss": 0.1122, "step": 604 }, { "epoch": 0.8793604651162791, "grad_norm": 1.8022455407233433, "learning_rate": 5.941840025292733e-06, "loss": 0.1257, "step": 605 }, { "epoch": 0.8808139534883721, "grad_norm": 1.3177318316240192, "learning_rate": 5.930626266742871e-06, "loss": 0.1383, "step": 606 }, { "epoch": 0.8822674418604651, "grad_norm": 1.140730369983588, "learning_rate": 5.9194076571179415e-06, "loss": 0.116, "step": 607 }, { "epoch": 0.8837209302325582, "grad_norm": 1.3723451817269945, "learning_rate": 5.908184254897183e-06, "loss": 0.1549, "step": 608 }, { "epoch": 0.8851744186046512, "grad_norm": 1.5626353100716068, "learning_rate": 5.89695611858482e-06, "loss": 0.1442, "step": 609 }, { "epoch": 0.8866279069767442, "grad_norm": 1.8839870797491676, "learning_rate": 5.885723306709754e-06, "loss": 0.1286, "step": 610 }, { "epoch": 0.8880813953488372, "grad_norm": 1.082745017222708, "learning_rate": 5.8744858778252555e-06, "loss": 0.1223, "step": 611 }, { "epoch": 0.8895348837209303, "grad_norm": 1.4674626548779213, "learning_rate": 5.8632438905086685e-06, "loss": 0.1456, "step": 612 }, { "epoch": 0.8909883720930233, "grad_norm": 1.358671032886957, "learning_rate": 5.851997403361089e-06, "loss": 0.1299, "step": 613 }, { "epoch": 0.8924418604651163, "grad_norm": 1.296241900835058, "learning_rate": 5.840746475007079e-06, "loss": 0.1419, "step": 614 }, { "epoch": 0.8938953488372093, "grad_norm": 1.3842657938075646, "learning_rate": 5.8294911640943455e-06, "loss": 0.1215, "step": 615 }, { "epoch": 0.8953488372093024, "grad_norm": 1.3467341878188124, "learning_rate": 5.818231529293441e-06, "loss": 0.1123, "step": 616 }, { "epoch": 0.8968023255813954, "grad_norm": 1.4251072382018615, "learning_rate": 5.80696762929746e-06, "loss": 0.1423, "step": 617 }, { "epoch": 0.8982558139534884, "grad_norm": 1.5077208770452775, "learning_rate": 5.795699522821727e-06, "loss": 0.1728, "step": 618 }, { "epoch": 0.8997093023255814, "grad_norm": 1.248864820858094, "learning_rate": 5.784427268603498e-06, "loss": 0.1195, "step": 619 }, { "epoch": 0.9011627906976745, "grad_norm": 1.4092688361917134, "learning_rate": 5.773150925401642e-06, "loss": 0.1266, "step": 620 }, { "epoch": 0.9026162790697675, "grad_norm": 1.688504971336848, "learning_rate": 5.761870551996349e-06, "loss": 0.1379, "step": 621 }, { "epoch": 0.9040697674418605, "grad_norm": 1.2541754034602337, "learning_rate": 5.750586207188817e-06, "loss": 0.0917, "step": 622 }, { "epoch": 0.9055232558139535, "grad_norm": 1.2487120647276626, "learning_rate": 5.7392979498009445e-06, "loss": 0.1241, "step": 623 }, { "epoch": 0.9069767441860465, "grad_norm": 1.445654790451785, "learning_rate": 5.728005838675026e-06, "loss": 0.1456, "step": 624 }, { "epoch": 0.9084302325581395, "grad_norm": 1.7499610557683432, "learning_rate": 5.7167099326734385e-06, "loss": 0.1405, "step": 625 }, { "epoch": 0.9098837209302325, "grad_norm": 1.648183080801681, "learning_rate": 5.7054102906783526e-06, "loss": 0.1822, "step": 626 }, { "epoch": 0.9113372093023255, "grad_norm": 1.6571510451841074, "learning_rate": 5.6941069715914e-06, "loss": 0.1181, "step": 627 }, { "epoch": 0.9127906976744186, "grad_norm": 1.944131860744343, "learning_rate": 5.6828000343333904e-06, "loss": 0.1719, "step": 628 }, { "epoch": 0.9142441860465116, "grad_norm": 1.7309656879851627, "learning_rate": 5.671489537843987e-06, "loss": 0.1689, "step": 629 }, { "epoch": 0.9156976744186046, "grad_norm": 1.7575196308441996, "learning_rate": 5.660175541081411e-06, "loss": 0.182, "step": 630 }, { "epoch": 0.9171511627906976, "grad_norm": 1.277009648528385, "learning_rate": 5.648858103022128e-06, "loss": 0.1516, "step": 631 }, { "epoch": 0.9186046511627907, "grad_norm": 1.340132227949514, "learning_rate": 5.63753728266054e-06, "loss": 0.1176, "step": 632 }, { "epoch": 0.9200581395348837, "grad_norm": 1.7630027193552753, "learning_rate": 5.626213139008684e-06, "loss": 0.1677, "step": 633 }, { "epoch": 0.9215116279069767, "grad_norm": 1.5111942011086947, "learning_rate": 5.614885731095915e-06, "loss": 0.1763, "step": 634 }, { "epoch": 0.9229651162790697, "grad_norm": 1.3355309378554394, "learning_rate": 5.603555117968607e-06, "loss": 0.1073, "step": 635 }, { "epoch": 0.9244186046511628, "grad_norm": 1.404078498056489, "learning_rate": 5.592221358689843e-06, "loss": 0.1249, "step": 636 }, { "epoch": 0.9258720930232558, "grad_norm": 1.4270282877283114, "learning_rate": 5.580884512339103e-06, "loss": 0.1035, "step": 637 }, { "epoch": 0.9273255813953488, "grad_norm": 1.5176928637053217, "learning_rate": 5.56954463801196e-06, "loss": 0.1373, "step": 638 }, { "epoch": 0.9287790697674418, "grad_norm": 1.3456243259305802, "learning_rate": 5.558201794819773e-06, "loss": 0.1868, "step": 639 }, { "epoch": 0.9302325581395349, "grad_norm": 1.5593018977581026, "learning_rate": 5.546856041889374e-06, "loss": 0.1429, "step": 640 }, { "epoch": 0.9316860465116279, "grad_norm": 1.6976998517010211, "learning_rate": 5.53550743836276e-06, "loss": 0.1523, "step": 641 }, { "epoch": 0.9331395348837209, "grad_norm": 1.3798256912079117, "learning_rate": 5.524156043396796e-06, "loss": 0.1179, "step": 642 }, { "epoch": 0.934593023255814, "grad_norm": 1.4294475115030438, "learning_rate": 5.512801916162891e-06, "loss": 0.1309, "step": 643 }, { "epoch": 0.936046511627907, "grad_norm": 2.7198043975539137, "learning_rate": 5.501445115846697e-06, "loss": 0.207, "step": 644 }, { "epoch": 0.9375, "grad_norm": 1.4833934130748365, "learning_rate": 5.490085701647805e-06, "loss": 0.1355, "step": 645 }, { "epoch": 0.938953488372093, "grad_norm": 1.3355964013808177, "learning_rate": 5.478723732779422e-06, "loss": 0.1374, "step": 646 }, { "epoch": 0.940406976744186, "grad_norm": 1.7207500799533988, "learning_rate": 5.467359268468081e-06, "loss": 0.1605, "step": 647 }, { "epoch": 0.9418604651162791, "grad_norm": 1.3090162322795527, "learning_rate": 5.455992367953318e-06, "loss": 0.1445, "step": 648 }, { "epoch": 0.9433139534883721, "grad_norm": 1.2788214859695954, "learning_rate": 5.444623090487371e-06, "loss": 0.1047, "step": 649 }, { "epoch": 0.9447674418604651, "grad_norm": 2.0071938164433365, "learning_rate": 5.433251495334864e-06, "loss": 0.1546, "step": 650 }, { "epoch": 0.9462209302325582, "grad_norm": 1.3782058820605825, "learning_rate": 5.4218776417725095e-06, "loss": 0.136, "step": 651 }, { "epoch": 0.9476744186046512, "grad_norm": 1.281462748132549, "learning_rate": 5.410501589088786e-06, "loss": 0.1035, "step": 652 }, { "epoch": 0.9491279069767442, "grad_norm": 1.9048686906108325, "learning_rate": 5.3991233965836365e-06, "loss": 0.1683, "step": 653 }, { "epoch": 0.9505813953488372, "grad_norm": 1.6316273841656876, "learning_rate": 5.387743123568161e-06, "loss": 0.1446, "step": 654 }, { "epoch": 0.9520348837209303, "grad_norm": 1.4861937009380384, "learning_rate": 5.376360829364301e-06, "loss": 0.1527, "step": 655 }, { "epoch": 0.9534883720930233, "grad_norm": 1.8814574763895897, "learning_rate": 5.364976573304538e-06, "loss": 0.1614, "step": 656 }, { "epoch": 0.9549418604651163, "grad_norm": 1.4646624685889762, "learning_rate": 5.3535904147315765e-06, "loss": 0.1543, "step": 657 }, { "epoch": 0.9563953488372093, "grad_norm": 1.287864663906961, "learning_rate": 5.34220241299804e-06, "loss": 0.1279, "step": 658 }, { "epoch": 0.9578488372093024, "grad_norm": 1.7385545460984706, "learning_rate": 5.330812627466159e-06, "loss": 0.1639, "step": 659 }, { "epoch": 0.9593023255813954, "grad_norm": 1.7081543020679955, "learning_rate": 5.319421117507461e-06, "loss": 0.1597, "step": 660 }, { "epoch": 0.9607558139534884, "grad_norm": 1.3995536288669932, "learning_rate": 5.308027942502467e-06, "loss": 0.1103, "step": 661 }, { "epoch": 0.9622093023255814, "grad_norm": 1.2943413923835272, "learning_rate": 5.296633161840374e-06, "loss": 0.159, "step": 662 }, { "epoch": 0.9636627906976745, "grad_norm": 1.5010261384648311, "learning_rate": 5.285236834918749e-06, "loss": 0.1101, "step": 663 }, { "epoch": 0.9651162790697675, "grad_norm": 1.59839024793064, "learning_rate": 5.273839021143217e-06, "loss": 0.1438, "step": 664 }, { "epoch": 0.9665697674418605, "grad_norm": 1.3887266849885156, "learning_rate": 5.262439779927163e-06, "loss": 0.1336, "step": 665 }, { "epoch": 0.9680232558139535, "grad_norm": 1.2605940576749957, "learning_rate": 5.251039170691399e-06, "loss": 0.1168, "step": 666 }, { "epoch": 0.9694767441860465, "grad_norm": 1.4056700350437759, "learning_rate": 5.2396372528638785e-06, "loss": 0.1501, "step": 667 }, { "epoch": 0.9709302325581395, "grad_norm": 1.4534488094786742, "learning_rate": 5.22823408587937e-06, "loss": 0.1336, "step": 668 }, { "epoch": 0.9723837209302325, "grad_norm": 1.336814955827563, "learning_rate": 5.216829729179158e-06, "loss": 0.1126, "step": 669 }, { "epoch": 0.9738372093023255, "grad_norm": 1.5142125626046417, "learning_rate": 5.205424242210727e-06, "loss": 0.1096, "step": 670 }, { "epoch": 0.9752906976744186, "grad_norm": 1.4755522779468442, "learning_rate": 5.194017684427453e-06, "loss": 0.1087, "step": 671 }, { "epoch": 0.9767441860465116, "grad_norm": 1.5661924416341968, "learning_rate": 5.182610115288296e-06, "loss": 0.1274, "step": 672 }, { "epoch": 0.9781976744186046, "grad_norm": 1.9124058904385859, "learning_rate": 5.171201594257481e-06, "loss": 0.1314, "step": 673 }, { "epoch": 0.9796511627906976, "grad_norm": 1.504304863764823, "learning_rate": 5.159792180804204e-06, "loss": 0.1759, "step": 674 }, { "epoch": 0.9811046511627907, "grad_norm": 1.6754651562035756, "learning_rate": 5.148381934402306e-06, "loss": 0.1911, "step": 675 }, { "epoch": 0.9825581395348837, "grad_norm": 1.4176711979057277, "learning_rate": 5.136970914529975e-06, "loss": 0.1305, "step": 676 }, { "epoch": 0.9840116279069767, "grad_norm": 1.5962091642432297, "learning_rate": 5.125559180669427e-06, "loss": 0.1385, "step": 677 }, { "epoch": 0.9854651162790697, "grad_norm": 1.223911365889511, "learning_rate": 5.1141467923066016e-06, "loss": 0.1169, "step": 678 }, { "epoch": 0.9869186046511628, "grad_norm": 1.5617758354090914, "learning_rate": 5.102733808930851e-06, "loss": 0.1172, "step": 679 }, { "epoch": 0.9883720930232558, "grad_norm": 1.6974851747495288, "learning_rate": 5.0913202900346246e-06, "loss": 0.13, "step": 680 }, { "epoch": 0.9898255813953488, "grad_norm": 1.631411370867565, "learning_rate": 5.07990629511317e-06, "loss": 0.1454, "step": 681 }, { "epoch": 0.9912790697674418, "grad_norm": 1.5300412665655438, "learning_rate": 5.068491883664212e-06, "loss": 0.109, "step": 682 }, { "epoch": 0.9927325581395349, "grad_norm": 1.442218618017608, "learning_rate": 5.057077115187645e-06, "loss": 0.1429, "step": 683 }, { "epoch": 0.9941860465116279, "grad_norm": 1.347588478220433, "learning_rate": 5.04566204918523e-06, "loss": 0.1255, "step": 684 }, { "epoch": 0.9956395348837209, "grad_norm": 1.605701847815844, "learning_rate": 5.034246745160275e-06, "loss": 0.1533, "step": 685 }, { "epoch": 0.997093023255814, "grad_norm": 1.4017801022069876, "learning_rate": 5.022831262617328e-06, "loss": 0.1291, "step": 686 }, { "epoch": 0.998546511627907, "grad_norm": 1.5723087938578602, "learning_rate": 5.011415661061869e-06, "loss": 0.106, "step": 687 }, { "epoch": 1.0, "grad_norm": 1.160944506207306, "learning_rate": 5e-06, "loss": 0.0753, "step": 688 }, { "epoch": 1.001453488372093, "grad_norm": 0.9804794119294494, "learning_rate": 4.988584338938133e-06, "loss": 0.0865, "step": 689 }, { "epoch": 1.002906976744186, "grad_norm": 0.8710757605721173, "learning_rate": 4.977168737382674e-06, "loss": 0.0666, "step": 690 }, { "epoch": 1.004360465116279, "grad_norm": 1.195545295432579, "learning_rate": 4.965753254839727e-06, "loss": 0.0836, "step": 691 }, { "epoch": 1.005813953488372, "grad_norm": 1.2127630890866723, "learning_rate": 4.954337950814771e-06, "loss": 0.0551, "step": 692 }, { "epoch": 1.007267441860465, "grad_norm": 1.5404241417212168, "learning_rate": 4.942922884812357e-06, "loss": 0.1009, "step": 693 }, { "epoch": 1.0087209302325582, "grad_norm": 1.1556879357489478, "learning_rate": 4.9315081163357905e-06, "loss": 0.0675, "step": 694 }, { "epoch": 1.010174418604651, "grad_norm": 1.1774111310837216, "learning_rate": 4.920093704886832e-06, "loss": 0.0639, "step": 695 }, { "epoch": 1.0116279069767442, "grad_norm": 1.1528720659312301, "learning_rate": 4.908679709965376e-06, "loss": 0.0548, "step": 696 }, { "epoch": 1.0130813953488371, "grad_norm": 1.1786682354719682, "learning_rate": 4.897266191069152e-06, "loss": 0.0793, "step": 697 }, { "epoch": 1.0145348837209303, "grad_norm": 1.1681775175087292, "learning_rate": 4.8858532076934e-06, "loss": 0.0621, "step": 698 }, { "epoch": 1.0159883720930232, "grad_norm": 1.396149038821934, "learning_rate": 4.874440819330576e-06, "loss": 0.0595, "step": 699 }, { "epoch": 1.0174418604651163, "grad_norm": 1.0444832188691113, "learning_rate": 4.8630290854700264e-06, "loss": 0.06, "step": 700 }, { "epoch": 1.0188953488372092, "grad_norm": 1.3215192237780464, "learning_rate": 4.851618065597696e-06, "loss": 0.0594, "step": 701 }, { "epoch": 1.0203488372093024, "grad_norm": 1.3022953280724392, "learning_rate": 4.840207819195797e-06, "loss": 0.0582, "step": 702 }, { "epoch": 1.0218023255813953, "grad_norm": 1.2305716639772417, "learning_rate": 4.82879840574252e-06, "loss": 0.0626, "step": 703 }, { "epoch": 1.0232558139534884, "grad_norm": 1.612550300365213, "learning_rate": 4.817389884711706e-06, "loss": 0.0739, "step": 704 }, { "epoch": 1.0247093023255813, "grad_norm": 1.2261189219475501, "learning_rate": 4.805982315572547e-06, "loss": 0.0842, "step": 705 }, { "epoch": 1.0261627906976745, "grad_norm": 1.5497272626438672, "learning_rate": 4.794575757789274e-06, "loss": 0.0808, "step": 706 }, { "epoch": 1.0276162790697674, "grad_norm": 1.5930387295456183, "learning_rate": 4.7831702708208445e-06, "loss": 0.0617, "step": 707 }, { "epoch": 1.0290697674418605, "grad_norm": 1.3103541839768624, "learning_rate": 4.7717659141206315e-06, "loss": 0.0728, "step": 708 }, { "epoch": 1.0305232558139534, "grad_norm": 1.2251435492506793, "learning_rate": 4.760362747136125e-06, "loss": 0.0707, "step": 709 }, { "epoch": 1.0319767441860466, "grad_norm": 1.3796327304677727, "learning_rate": 4.748960829308601e-06, "loss": 0.0681, "step": 710 }, { "epoch": 1.0334302325581395, "grad_norm": 1.1725032499441765, "learning_rate": 4.737560220072839e-06, "loss": 0.0565, "step": 711 }, { "epoch": 1.0348837209302326, "grad_norm": 1.245077983095157, "learning_rate": 4.726160978856782e-06, "loss": 0.0528, "step": 712 }, { "epoch": 1.0363372093023255, "grad_norm": 1.1988572535104782, "learning_rate": 4.714763165081253e-06, "loss": 0.0677, "step": 713 }, { "epoch": 1.0377906976744187, "grad_norm": 1.4960786476723407, "learning_rate": 4.703366838159627e-06, "loss": 0.0639, "step": 714 }, { "epoch": 1.0392441860465116, "grad_norm": 1.4091361813904233, "learning_rate": 4.691972057497534e-06, "loss": 0.0602, "step": 715 }, { "epoch": 1.0406976744186047, "grad_norm": 1.5037404883411951, "learning_rate": 4.6805788824925395e-06, "loss": 0.0705, "step": 716 }, { "epoch": 1.0421511627906976, "grad_norm": 1.2607654648800852, "learning_rate": 4.669187372533843e-06, "loss": 0.053, "step": 717 }, { "epoch": 1.0436046511627908, "grad_norm": 1.6239843045120586, "learning_rate": 4.657797587001961e-06, "loss": 0.0704, "step": 718 }, { "epoch": 1.0450581395348837, "grad_norm": 1.5045378575805983, "learning_rate": 4.646409585268425e-06, "loss": 0.0606, "step": 719 }, { "epoch": 1.0465116279069768, "grad_norm": 1.291286419079524, "learning_rate": 4.635023426695462e-06, "loss": 0.0644, "step": 720 }, { "epoch": 1.0479651162790697, "grad_norm": 1.370959757593235, "learning_rate": 4.6236391706357e-06, "loss": 0.0704, "step": 721 }, { "epoch": 1.0494186046511629, "grad_norm": 1.5664627647333818, "learning_rate": 4.612256876431839e-06, "loss": 0.0498, "step": 722 }, { "epoch": 1.0508720930232558, "grad_norm": 1.2766329620982255, "learning_rate": 4.600876603416364e-06, "loss": 0.0473, "step": 723 }, { "epoch": 1.052325581395349, "grad_norm": 1.5959190709847693, "learning_rate": 4.589498410911215e-06, "loss": 0.083, "step": 724 }, { "epoch": 1.0537790697674418, "grad_norm": 1.2654889143492782, "learning_rate": 4.578122358227492e-06, "loss": 0.061, "step": 725 }, { "epoch": 1.055232558139535, "grad_norm": 1.232952014205962, "learning_rate": 4.566748504665136e-06, "loss": 0.0601, "step": 726 }, { "epoch": 1.056686046511628, "grad_norm": 1.3155701382257572, "learning_rate": 4.555376909512631e-06, "loss": 0.0537, "step": 727 }, { "epoch": 1.058139534883721, "grad_norm": 1.6520885025976517, "learning_rate": 4.544007632046682e-06, "loss": 0.0798, "step": 728 }, { "epoch": 1.059593023255814, "grad_norm": 1.2269767633790387, "learning_rate": 4.532640731531921e-06, "loss": 0.0454, "step": 729 }, { "epoch": 1.0610465116279069, "grad_norm": 1.6055062515206473, "learning_rate": 4.52127626722058e-06, "loss": 0.1011, "step": 730 }, { "epoch": 1.0625, "grad_norm": 1.5283525172896468, "learning_rate": 4.509914298352197e-06, "loss": 0.0507, "step": 731 }, { "epoch": 1.0639534883720931, "grad_norm": 1.4633436434507294, "learning_rate": 4.4985548841533035e-06, "loss": 0.0948, "step": 732 }, { "epoch": 1.065406976744186, "grad_norm": 1.3019061971094648, "learning_rate": 4.487198083837111e-06, "loss": 0.0631, "step": 733 }, { "epoch": 1.066860465116279, "grad_norm": 1.3266926997772812, "learning_rate": 4.475843956603205e-06, "loss": 0.0604, "step": 734 }, { "epoch": 1.068313953488372, "grad_norm": 1.3235367351231275, "learning_rate": 4.4644925616372405e-06, "loss": 0.0605, "step": 735 }, { "epoch": 1.069767441860465, "grad_norm": 1.295361077164503, "learning_rate": 4.4531439581106295e-06, "loss": 0.06, "step": 736 }, { "epoch": 1.0712209302325582, "grad_norm": 1.6139195727409428, "learning_rate": 4.441798205180228e-06, "loss": 0.0797, "step": 737 }, { "epoch": 1.072674418604651, "grad_norm": 1.2563569911360324, "learning_rate": 4.430455361988041e-06, "loss": 0.0705, "step": 738 }, { "epoch": 1.0741279069767442, "grad_norm": 1.2037437189059568, "learning_rate": 4.419115487660899e-06, "loss": 0.06, "step": 739 }, { "epoch": 1.0755813953488371, "grad_norm": 1.2888284666571121, "learning_rate": 4.40777864131016e-06, "loss": 0.0574, "step": 740 }, { "epoch": 1.0770348837209303, "grad_norm": 1.4666943653511182, "learning_rate": 4.396444882031394e-06, "loss": 0.0748, "step": 741 }, { "epoch": 1.0784883720930232, "grad_norm": 1.1651868897402284, "learning_rate": 4.3851142689040885e-06, "loss": 0.0624, "step": 742 }, { "epoch": 1.0799418604651163, "grad_norm": 1.6627210680829636, "learning_rate": 4.373786860991318e-06, "loss": 0.0521, "step": 743 }, { "epoch": 1.0813953488372092, "grad_norm": 1.7854261575212669, "learning_rate": 4.3624627173394615e-06, "loss": 0.0956, "step": 744 }, { "epoch": 1.0828488372093024, "grad_norm": 1.281448055061643, "learning_rate": 4.351141896977874e-06, "loss": 0.0618, "step": 745 }, { "epoch": 1.0843023255813953, "grad_norm": 1.9115172580935775, "learning_rate": 4.339824458918592e-06, "loss": 0.0778, "step": 746 }, { "epoch": 1.0857558139534884, "grad_norm": 1.1325100660387584, "learning_rate": 4.328510462156015e-06, "loss": 0.0447, "step": 747 }, { "epoch": 1.0872093023255813, "grad_norm": 1.2777657933057078, "learning_rate": 4.317199965666613e-06, "loss": 0.0757, "step": 748 }, { "epoch": 1.0886627906976745, "grad_norm": 1.3785018962846736, "learning_rate": 4.305893028408601e-06, "loss": 0.0654, "step": 749 }, { "epoch": 1.0901162790697674, "grad_norm": 1.2206761011457805, "learning_rate": 4.294589709321651e-06, "loss": 0.0724, "step": 750 }, { "epoch": 1.0915697674418605, "grad_norm": 1.4018327959477974, "learning_rate": 4.283290067326562e-06, "loss": 0.0904, "step": 751 }, { "epoch": 1.0930232558139534, "grad_norm": 1.5155226786120202, "learning_rate": 4.271994161324977e-06, "loss": 0.0475, "step": 752 }, { "epoch": 1.0944767441860466, "grad_norm": 1.5020821031236455, "learning_rate": 4.260702050199056e-06, "loss": 0.0627, "step": 753 }, { "epoch": 1.0959302325581395, "grad_norm": 1.1544970293337866, "learning_rate": 4.2494137928111835e-06, "loss": 0.0731, "step": 754 }, { "epoch": 1.0973837209302326, "grad_norm": 1.156709990172585, "learning_rate": 4.238129448003651e-06, "loss": 0.0496, "step": 755 }, { "epoch": 1.0988372093023255, "grad_norm": 1.0821603719730972, "learning_rate": 4.22684907459836e-06, "loss": 0.0619, "step": 756 }, { "epoch": 1.1002906976744187, "grad_norm": 1.1819763770544474, "learning_rate": 4.215572731396504e-06, "loss": 0.0582, "step": 757 }, { "epoch": 1.1017441860465116, "grad_norm": 1.3810737668324913, "learning_rate": 4.204300477178274e-06, "loss": 0.0484, "step": 758 }, { "epoch": 1.1031976744186047, "grad_norm": 1.6953078691365866, "learning_rate": 4.19303237070254e-06, "loss": 0.0856, "step": 759 }, { "epoch": 1.1046511627906976, "grad_norm": 1.2505887912927889, "learning_rate": 4.181768470706561e-06, "loss": 0.0498, "step": 760 }, { "epoch": 1.1061046511627908, "grad_norm": 1.3867145509235537, "learning_rate": 4.170508835905655e-06, "loss": 0.0529, "step": 761 }, { "epoch": 1.1075581395348837, "grad_norm": 1.731884519992483, "learning_rate": 4.159253524992922e-06, "loss": 0.0649, "step": 762 }, { "epoch": 1.1090116279069768, "grad_norm": 1.6560829967990272, "learning_rate": 4.148002596638911e-06, "loss": 0.0575, "step": 763 }, { "epoch": 1.1104651162790697, "grad_norm": 1.95162595456697, "learning_rate": 4.136756109491333e-06, "loss": 0.081, "step": 764 }, { "epoch": 1.1119186046511629, "grad_norm": 1.1885528527482156, "learning_rate": 4.1255141221747445e-06, "loss": 0.0531, "step": 765 }, { "epoch": 1.1133720930232558, "grad_norm": 1.5782587513465858, "learning_rate": 4.1142766932902475e-06, "loss": 0.0698, "step": 766 }, { "epoch": 1.114825581395349, "grad_norm": 1.7115336320491816, "learning_rate": 4.103043881415181e-06, "loss": 0.0743, "step": 767 }, { "epoch": 1.1162790697674418, "grad_norm": 1.3705963951525217, "learning_rate": 4.091815745102818e-06, "loss": 0.0634, "step": 768 }, { "epoch": 1.117732558139535, "grad_norm": 1.2666345308767393, "learning_rate": 4.080592342882059e-06, "loss": 0.0557, "step": 769 }, { "epoch": 1.119186046511628, "grad_norm": 1.466660608860313, "learning_rate": 4.069373733257129e-06, "loss": 0.0532, "step": 770 }, { "epoch": 1.120639534883721, "grad_norm": 1.9380585631227165, "learning_rate": 4.058159974707267e-06, "loss": 0.0838, "step": 771 }, { "epoch": 1.122093023255814, "grad_norm": 1.3123759677425353, "learning_rate": 4.046951125686427e-06, "loss": 0.0661, "step": 772 }, { "epoch": 1.1235465116279069, "grad_norm": 1.3861730341247065, "learning_rate": 4.035747244622968e-06, "loss": 0.0732, "step": 773 }, { "epoch": 1.125, "grad_norm": 1.5843091362943043, "learning_rate": 4.02454838991936e-06, "loss": 0.0601, "step": 774 }, { "epoch": 1.1264534883720931, "grad_norm": 1.1871559950997301, "learning_rate": 4.013354619951864e-06, "loss": 0.0542, "step": 775 }, { "epoch": 1.127906976744186, "grad_norm": 1.4833725641864457, "learning_rate": 4.002165993070237e-06, "loss": 0.0421, "step": 776 }, { "epoch": 1.129360465116279, "grad_norm": 1.2607191005902103, "learning_rate": 3.990982567597434e-06, "loss": 0.0675, "step": 777 }, { "epoch": 1.130813953488372, "grad_norm": 1.302371033029548, "learning_rate": 3.979804401829287e-06, "loss": 0.0552, "step": 778 }, { "epoch": 1.1322674418604652, "grad_norm": 1.0982776342344591, "learning_rate": 3.968631554034219e-06, "loss": 0.0417, "step": 779 }, { "epoch": 1.1337209302325582, "grad_norm": 1.0186685111120157, "learning_rate": 3.957464082452922e-06, "loss": 0.0433, "step": 780 }, { "epoch": 1.135174418604651, "grad_norm": 1.405227738570767, "learning_rate": 3.946302045298076e-06, "loss": 0.0589, "step": 781 }, { "epoch": 1.1366279069767442, "grad_norm": 1.7360986285508477, "learning_rate": 3.93514550075402e-06, "loss": 0.0831, "step": 782 }, { "epoch": 1.1380813953488371, "grad_norm": 1.3982460147182094, "learning_rate": 3.923994506976475e-06, "loss": 0.0717, "step": 783 }, { "epoch": 1.1395348837209303, "grad_norm": 1.67771687763503, "learning_rate": 3.912849122092216e-06, "loss": 0.0623, "step": 784 }, { "epoch": 1.1409883720930232, "grad_norm": 1.36066170200081, "learning_rate": 3.901709404198787e-06, "loss": 0.0632, "step": 785 }, { "epoch": 1.1424418604651163, "grad_norm": 1.724665327424766, "learning_rate": 3.890575411364187e-06, "loss": 0.0904, "step": 786 }, { "epoch": 1.1438953488372092, "grad_norm": 1.292310436918187, "learning_rate": 3.879447201626579e-06, "loss": 0.0486, "step": 787 }, { "epoch": 1.1453488372093024, "grad_norm": 1.3385498197221257, "learning_rate": 3.868324832993972e-06, "loss": 0.0543, "step": 788 }, { "epoch": 1.1468023255813953, "grad_norm": 1.3062276729009943, "learning_rate": 3.857208363443936e-06, "loss": 0.0607, "step": 789 }, { "epoch": 1.1482558139534884, "grad_norm": 1.6713853170193331, "learning_rate": 3.84609785092328e-06, "loss": 0.0541, "step": 790 }, { "epoch": 1.1497093023255813, "grad_norm": 1.4621166936575898, "learning_rate": 3.834993353347774e-06, "loss": 0.0822, "step": 791 }, { "epoch": 1.1511627906976745, "grad_norm": 1.3920933935968336, "learning_rate": 3.823894928601822e-06, "loss": 0.0502, "step": 792 }, { "epoch": 1.1526162790697674, "grad_norm": 1.3690857531456042, "learning_rate": 3.8128026345381804e-06, "loss": 0.0726, "step": 793 }, { "epoch": 1.1540697674418605, "grad_norm": 1.435007037574927, "learning_rate": 3.8017165289776397e-06, "loss": 0.0733, "step": 794 }, { "epoch": 1.1555232558139534, "grad_norm": 1.3209050161532427, "learning_rate": 3.7906366697087426e-06, "loss": 0.0625, "step": 795 }, { "epoch": 1.1569767441860466, "grad_norm": 1.3609715123113075, "learning_rate": 3.7795631144874607e-06, "loss": 0.064, "step": 796 }, { "epoch": 1.1584302325581395, "grad_norm": 1.35359644663218, "learning_rate": 3.768495921036915e-06, "loss": 0.0786, "step": 797 }, { "epoch": 1.1598837209302326, "grad_norm": 1.4861713191531716, "learning_rate": 3.7574351470470547e-06, "loss": 0.0777, "step": 798 }, { "epoch": 1.1613372093023255, "grad_norm": 1.256830182158305, "learning_rate": 3.7463808501743736e-06, "loss": 0.0649, "step": 799 }, { "epoch": 1.1627906976744187, "grad_norm": 1.18227664622794, "learning_rate": 3.7353330880415963e-06, "loss": 0.0614, "step": 800 }, { "epoch": 1.1627906976744187, "eval_loss": 0.13292963802814484, "eval_runtime": 2.205, "eval_samples_per_second": 25.396, "eval_steps_per_second": 6.349, "step": 800 }, { "epoch": 1.1642441860465116, "grad_norm": 1.3697674011754686, "learning_rate": 3.724291918237391e-06, "loss": 0.0802, "step": 801 }, { "epoch": 1.1656976744186047, "grad_norm": 1.5758218959573673, "learning_rate": 3.7132573983160538e-06, "loss": 0.0555, "step": 802 }, { "epoch": 1.1671511627906976, "grad_norm": 1.4752518250909372, "learning_rate": 3.7022295857972244e-06, "loss": 0.0598, "step": 803 }, { "epoch": 1.1686046511627908, "grad_norm": 1.1107721704788946, "learning_rate": 3.691208538165574e-06, "loss": 0.0526, "step": 804 }, { "epoch": 1.1700581395348837, "grad_norm": 1.5316239040252175, "learning_rate": 3.6801943128705128e-06, "loss": 0.0642, "step": 805 }, { "epoch": 1.1715116279069768, "grad_norm": 1.1009808525064773, "learning_rate": 3.6691869673258847e-06, "loss": 0.0579, "step": 806 }, { "epoch": 1.1729651162790697, "grad_norm": 1.2132447629779688, "learning_rate": 3.6581865589096784e-06, "loss": 0.0534, "step": 807 }, { "epoch": 1.1744186046511629, "grad_norm": 1.21822644945824, "learning_rate": 3.6471931449637127e-06, "loss": 0.0478, "step": 808 }, { "epoch": 1.1758720930232558, "grad_norm": 2.1071653837605484, "learning_rate": 3.6362067827933555e-06, "loss": 0.0591, "step": 809 }, { "epoch": 1.177325581395349, "grad_norm": 1.2907948357744514, "learning_rate": 3.625227529667209e-06, "loss": 0.0492, "step": 810 }, { "epoch": 1.1787790697674418, "grad_norm": 1.3081083590425786, "learning_rate": 3.6142554428168208e-06, "loss": 0.0641, "step": 811 }, { "epoch": 1.1802325581395348, "grad_norm": 1.497246846467104, "learning_rate": 3.6032905794363805e-06, "loss": 0.0657, "step": 812 }, { "epoch": 1.181686046511628, "grad_norm": 1.3989638492911172, "learning_rate": 3.5923329966824288e-06, "loss": 0.0677, "step": 813 }, { "epoch": 1.183139534883721, "grad_norm": 1.4702223684938749, "learning_rate": 3.5813827516735487e-06, "loss": 0.0858, "step": 814 }, { "epoch": 1.184593023255814, "grad_norm": 1.2980406228178047, "learning_rate": 3.5704399014900814e-06, "loss": 0.0547, "step": 815 }, { "epoch": 1.1860465116279069, "grad_norm": 1.2387292846340658, "learning_rate": 3.5595045031738123e-06, "loss": 0.0477, "step": 816 }, { "epoch": 1.1875, "grad_norm": 1.2267059696407838, "learning_rate": 3.5485766137276894e-06, "loss": 0.0499, "step": 817 }, { "epoch": 1.1889534883720931, "grad_norm": 1.5850681514152112, "learning_rate": 3.5376562901155138e-06, "loss": 0.0608, "step": 818 }, { "epoch": 1.190406976744186, "grad_norm": 1.4591340811318565, "learning_rate": 3.526743589261652e-06, "loss": 0.0553, "step": 819 }, { "epoch": 1.191860465116279, "grad_norm": 1.3466268312861094, "learning_rate": 3.5158385680507356e-06, "loss": 0.0601, "step": 820 }, { "epoch": 1.193313953488372, "grad_norm": 1.28302549307778, "learning_rate": 3.50494128332736e-06, "loss": 0.0614, "step": 821 }, { "epoch": 1.1947674418604652, "grad_norm": 1.5064832595806237, "learning_rate": 3.4940517918958e-06, "loss": 0.053, "step": 822 }, { "epoch": 1.1962209302325582, "grad_norm": 1.241785767259479, "learning_rate": 3.483170150519697e-06, "loss": 0.0648, "step": 823 }, { "epoch": 1.197674418604651, "grad_norm": 1.9169832841939474, "learning_rate": 3.472296415921783e-06, "loss": 0.0653, "step": 824 }, { "epoch": 1.1991279069767442, "grad_norm": 1.1075005753142289, "learning_rate": 3.4614306447835646e-06, "loss": 0.0623, "step": 825 }, { "epoch": 1.2005813953488371, "grad_norm": 1.4624159678780777, "learning_rate": 3.4505728937450437e-06, "loss": 0.0502, "step": 826 }, { "epoch": 1.2020348837209303, "grad_norm": 1.4873573941031213, "learning_rate": 3.439723219404411e-06, "loss": 0.0589, "step": 827 }, { "epoch": 1.2034883720930232, "grad_norm": 1.3071195675919425, "learning_rate": 3.4288816783177624e-06, "loss": 0.059, "step": 828 }, { "epoch": 1.2049418604651163, "grad_norm": 1.3306075906173456, "learning_rate": 3.41804832699879e-06, "loss": 0.0557, "step": 829 }, { "epoch": 1.2063953488372092, "grad_norm": 1.2154802175356552, "learning_rate": 3.407223221918501e-06, "loss": 0.0593, "step": 830 }, { "epoch": 1.2078488372093024, "grad_norm": 1.368819793191012, "learning_rate": 3.396406419504914e-06, "loss": 0.0756, "step": 831 }, { "epoch": 1.2093023255813953, "grad_norm": 1.6765723354799054, "learning_rate": 3.3855979761427705e-06, "loss": 0.0546, "step": 832 }, { "epoch": 1.2107558139534884, "grad_norm": 1.4117226452497533, "learning_rate": 3.3747979481732352e-06, "loss": 0.0665, "step": 833 }, { "epoch": 1.2122093023255813, "grad_norm": 1.2207966641443189, "learning_rate": 3.364006391893612e-06, "loss": 0.0481, "step": 834 }, { "epoch": 1.2136627906976745, "grad_norm": 1.4940343341477946, "learning_rate": 3.3532233635570377e-06, "loss": 0.0578, "step": 835 }, { "epoch": 1.2151162790697674, "grad_norm": 1.3800156284226655, "learning_rate": 3.3424489193722016e-06, "loss": 0.0642, "step": 836 }, { "epoch": 1.2165697674418605, "grad_norm": 1.5526090076765855, "learning_rate": 3.331683115503041e-06, "loss": 0.0639, "step": 837 }, { "epoch": 1.2180232558139534, "grad_norm": 1.2911489672133427, "learning_rate": 3.320926008068458e-06, "loss": 0.0577, "step": 838 }, { "epoch": 1.2194767441860466, "grad_norm": 1.546598681848458, "learning_rate": 3.310177653142018e-06, "loss": 0.0593, "step": 839 }, { "epoch": 1.2209302325581395, "grad_norm": 1.188966656927106, "learning_rate": 3.2994381067516702e-06, "loss": 0.067, "step": 840 }, { "epoch": 1.2223837209302326, "grad_norm": 1.3683724251027738, "learning_rate": 3.2887074248794372e-06, "loss": 0.0661, "step": 841 }, { "epoch": 1.2238372093023255, "grad_norm": 1.4058614221882173, "learning_rate": 3.2779856634611433e-06, "loss": 0.0519, "step": 842 }, { "epoch": 1.2252906976744187, "grad_norm": 1.1579274317787078, "learning_rate": 3.267272878386106e-06, "loss": 0.0699, "step": 843 }, { "epoch": 1.2267441860465116, "grad_norm": 1.2336765642615921, "learning_rate": 3.256569125496858e-06, "loss": 0.0526, "step": 844 }, { "epoch": 1.2281976744186047, "grad_norm": 1.590986386650674, "learning_rate": 3.2458744605888414e-06, "loss": 0.0653, "step": 845 }, { "epoch": 1.2296511627906976, "grad_norm": 1.2675149654017217, "learning_rate": 3.2351889394101356e-06, "loss": 0.0517, "step": 846 }, { "epoch": 1.2311046511627908, "grad_norm": 1.4746783771397893, "learning_rate": 3.224512617661147e-06, "loss": 0.0597, "step": 847 }, { "epoch": 1.2325581395348837, "grad_norm": 1.3951470526945027, "learning_rate": 3.2138455509943365e-06, "loss": 0.06, "step": 848 }, { "epoch": 1.2340116279069768, "grad_norm": 1.6377226127179718, "learning_rate": 3.2031877950139138e-06, "loss": 0.0634, "step": 849 }, { "epoch": 1.2354651162790697, "grad_norm": 1.3590191499897781, "learning_rate": 3.192539405275562e-06, "loss": 0.0585, "step": 850 }, { "epoch": 1.2369186046511629, "grad_norm": 1.197442084738658, "learning_rate": 3.181900437286133e-06, "loss": 0.0522, "step": 851 }, { "epoch": 1.2383720930232558, "grad_norm": 1.457106341008041, "learning_rate": 3.171270946503373e-06, "loss": 0.0757, "step": 852 }, { "epoch": 1.239825581395349, "grad_norm": 1.1462139163166636, "learning_rate": 3.160650988335619e-06, "loss": 0.066, "step": 853 }, { "epoch": 1.2412790697674418, "grad_norm": 1.422733421409013, "learning_rate": 3.1500406181415266e-06, "loss": 0.0574, "step": 854 }, { "epoch": 1.2427325581395348, "grad_norm": 1.3764162372997557, "learning_rate": 3.1394398912297623e-06, "loss": 0.0569, "step": 855 }, { "epoch": 1.244186046511628, "grad_norm": 1.3748066305788302, "learning_rate": 3.1288488628587343e-06, "loss": 0.079, "step": 856 }, { "epoch": 1.245639534883721, "grad_norm": 1.5587103993805567, "learning_rate": 3.118267588236288e-06, "loss": 0.048, "step": 857 }, { "epoch": 1.247093023255814, "grad_norm": 1.0821878347641152, "learning_rate": 3.1076961225194303e-06, "loss": 0.0373, "step": 858 }, { "epoch": 1.2485465116279069, "grad_norm": 1.3196026626843278, "learning_rate": 3.0971345208140315e-06, "loss": 0.0762, "step": 859 }, { "epoch": 1.25, "grad_norm": 1.2188350465078943, "learning_rate": 3.0865828381745515e-06, "loss": 0.0603, "step": 860 }, { "epoch": 1.2514534883720931, "grad_norm": 1.2251652362284926, "learning_rate": 3.07604112960374e-06, "loss": 0.0622, "step": 861 }, { "epoch": 1.252906976744186, "grad_norm": 1.3179349347598328, "learning_rate": 3.065509450052353e-06, "loss": 0.0834, "step": 862 }, { "epoch": 1.254360465116279, "grad_norm": 1.5906225203629438, "learning_rate": 3.054987854418876e-06, "loss": 0.0809, "step": 863 }, { "epoch": 1.255813953488372, "grad_norm": 1.4130566616345097, "learning_rate": 3.044476397549221e-06, "loss": 0.0654, "step": 864 }, { "epoch": 1.2572674418604652, "grad_norm": 1.743946208757902, "learning_rate": 3.0339751342364563e-06, "loss": 0.0625, "step": 865 }, { "epoch": 1.2587209302325582, "grad_norm": 1.7612769821987542, "learning_rate": 3.02348411922051e-06, "loss": 0.0642, "step": 866 }, { "epoch": 1.260174418604651, "grad_norm": 3.522383016452111, "learning_rate": 3.0130034071878935e-06, "loss": 0.0452, "step": 867 }, { "epoch": 1.2616279069767442, "grad_norm": 1.8108815533825797, "learning_rate": 3.002533052771405e-06, "loss": 0.069, "step": 868 }, { "epoch": 1.2630813953488373, "grad_norm": 1.4705889091500777, "learning_rate": 2.99207311054986e-06, "loss": 0.0665, "step": 869 }, { "epoch": 1.2645348837209303, "grad_norm": 1.2345293570590237, "learning_rate": 2.9816236350477924e-06, "loss": 0.0471, "step": 870 }, { "epoch": 1.2659883720930232, "grad_norm": 1.3814272608337272, "learning_rate": 2.9711846807351775e-06, "loss": 0.0497, "step": 871 }, { "epoch": 1.2674418604651163, "grad_norm": 1.4737560562363339, "learning_rate": 2.9607563020271446e-06, "loss": 0.0754, "step": 872 }, { "epoch": 1.2688953488372092, "grad_norm": 1.2213469734360816, "learning_rate": 2.950338553283704e-06, "loss": 0.0439, "step": 873 }, { "epoch": 1.2703488372093024, "grad_norm": 1.0228539211087246, "learning_rate": 2.939931488809443e-06, "loss": 0.0453, "step": 874 }, { "epoch": 1.2718023255813953, "grad_norm": 1.3148270871243914, "learning_rate": 2.9295351628532666e-06, "loss": 0.0691, "step": 875 }, { "epoch": 1.2732558139534884, "grad_norm": 1.6172892866219821, "learning_rate": 2.9191496296080935e-06, "loss": 0.0716, "step": 876 }, { "epoch": 1.2747093023255813, "grad_norm": 1.5950616252422385, "learning_rate": 2.9087749432105917e-06, "loss": 0.0691, "step": 877 }, { "epoch": 1.2761627906976745, "grad_norm": 1.465303805672259, "learning_rate": 2.898411157740879e-06, "loss": 0.0642, "step": 878 }, { "epoch": 1.2776162790697674, "grad_norm": 1.234388737205486, "learning_rate": 2.8880583272222594e-06, "loss": 0.0485, "step": 879 }, { "epoch": 1.2790697674418605, "grad_norm": 1.088371039773677, "learning_rate": 2.8777165056209256e-06, "loss": 0.0435, "step": 880 }, { "epoch": 1.2805232558139534, "grad_norm": 1.2648066111949126, "learning_rate": 2.867385746845685e-06, "loss": 0.0707, "step": 881 }, { "epoch": 1.2819767441860466, "grad_norm": 1.429685475726185, "learning_rate": 2.8570661047476773e-06, "loss": 0.0561, "step": 882 }, { "epoch": 1.2834302325581395, "grad_norm": 1.9989759755669896, "learning_rate": 2.8467576331200986e-06, "loss": 0.0794, "step": 883 }, { "epoch": 1.2848837209302326, "grad_norm": 1.2470700343219876, "learning_rate": 2.836460385697911e-06, "loss": 0.046, "step": 884 }, { "epoch": 1.2863372093023255, "grad_norm": 1.3164750906741658, "learning_rate": 2.8261744161575745e-06, "loss": 0.0658, "step": 885 }, { "epoch": 1.2877906976744187, "grad_norm": 1.1137226528892454, "learning_rate": 2.815899778116753e-06, "loss": 0.0438, "step": 886 }, { "epoch": 1.2892441860465116, "grad_norm": 1.5963496710705627, "learning_rate": 2.80563652513405e-06, "loss": 0.0768, "step": 887 }, { "epoch": 1.2906976744186047, "grad_norm": 1.2418886512188267, "learning_rate": 2.7953847107087173e-06, "loss": 0.0476, "step": 888 }, { "epoch": 1.2921511627906976, "grad_norm": 1.4960278840419248, "learning_rate": 2.785144388280385e-06, "loss": 0.0737, "step": 889 }, { "epoch": 1.2936046511627908, "grad_norm": 1.078019987993341, "learning_rate": 2.7749156112287746e-06, "loss": 0.0686, "step": 890 }, { "epoch": 1.2950581395348837, "grad_norm": 1.3385018973715532, "learning_rate": 2.7646984328734284e-06, "loss": 0.0422, "step": 891 }, { "epoch": 1.2965116279069768, "grad_norm": 1.5906816734841456, "learning_rate": 2.754492906473425e-06, "loss": 0.0555, "step": 892 }, { "epoch": 1.2979651162790697, "grad_norm": 1.2925137226713737, "learning_rate": 2.744299085227109e-06, "loss": 0.0472, "step": 893 }, { "epoch": 1.2994186046511627, "grad_norm": 1.259018228463516, "learning_rate": 2.7341170222718073e-06, "loss": 0.0517, "step": 894 }, { "epoch": 1.3008720930232558, "grad_norm": 1.350283378016509, "learning_rate": 2.723946770683552e-06, "loss": 0.0496, "step": 895 }, { "epoch": 1.302325581395349, "grad_norm": 1.3897000129677248, "learning_rate": 2.7137883834768076e-06, "loss": 0.0555, "step": 896 }, { "epoch": 1.3037790697674418, "grad_norm": 1.4501986699541858, "learning_rate": 2.703641913604198e-06, "loss": 0.0734, "step": 897 }, { "epoch": 1.3052325581395348, "grad_norm": 1.316272770181074, "learning_rate": 2.6935074139562174e-06, "loss": 0.0598, "step": 898 }, { "epoch": 1.306686046511628, "grad_norm": 1.1543542777127582, "learning_rate": 2.683384937360971e-06, "loss": 0.0635, "step": 899 }, { "epoch": 1.308139534883721, "grad_norm": 1.2569738112859057, "learning_rate": 2.673274536583883e-06, "loss": 0.0493, "step": 900 }, { "epoch": 1.309593023255814, "grad_norm": 1.5911605897872676, "learning_rate": 2.663176264327439e-06, "loss": 0.064, "step": 901 }, { "epoch": 1.3110465116279069, "grad_norm": 1.398413502674623, "learning_rate": 2.6530901732308934e-06, "loss": 0.0542, "step": 902 }, { "epoch": 1.3125, "grad_norm": 1.481895453387704, "learning_rate": 2.6430163158700116e-06, "loss": 0.0423, "step": 903 }, { "epoch": 1.3139534883720931, "grad_norm": 1.5404341782721638, "learning_rate": 2.632954744756784e-06, "loss": 0.0743, "step": 904 }, { "epoch": 1.315406976744186, "grad_norm": 1.6089449641602316, "learning_rate": 2.6229055123391545e-06, "loss": 0.0762, "step": 905 }, { "epoch": 1.316860465116279, "grad_norm": 1.3087109651209734, "learning_rate": 2.612868671000755e-06, "loss": 0.063, "step": 906 }, { "epoch": 1.318313953488372, "grad_norm": 1.3502162014519732, "learning_rate": 2.602844273060623e-06, "loss": 0.0613, "step": 907 }, { "epoch": 1.3197674418604652, "grad_norm": 1.757116221919893, "learning_rate": 2.592832370772931e-06, "loss": 0.0598, "step": 908 }, { "epoch": 1.3212209302325582, "grad_norm": 1.440632285315959, "learning_rate": 2.582833016326716e-06, "loss": 0.0603, "step": 909 }, { "epoch": 1.322674418604651, "grad_norm": 1.3695411985419055, "learning_rate": 2.5728462618456114e-06, "loss": 0.0603, "step": 910 }, { "epoch": 1.3241279069767442, "grad_norm": 1.3717727081965483, "learning_rate": 2.562872159387563e-06, "loss": 0.0489, "step": 911 }, { "epoch": 1.3255813953488373, "grad_norm": 1.3965140784846173, "learning_rate": 2.5529107609445737e-06, "loss": 0.053, "step": 912 }, { "epoch": 1.3270348837209303, "grad_norm": 1.3605550236382897, "learning_rate": 2.542962118442417e-06, "loss": 0.0652, "step": 913 }, { "epoch": 1.3284883720930232, "grad_norm": 1.5017426449957705, "learning_rate": 2.5330262837403795e-06, "loss": 0.059, "step": 914 }, { "epoch": 1.3299418604651163, "grad_norm": 1.2686790614066732, "learning_rate": 2.523103308630978e-06, "loss": 0.059, "step": 915 }, { "epoch": 1.3313953488372092, "grad_norm": 1.4932765351503499, "learning_rate": 2.513193244839704e-06, "loss": 0.0801, "step": 916 }, { "epoch": 1.3328488372093024, "grad_norm": 1.277822451358384, "learning_rate": 2.5032961440247382e-06, "loss": 0.0563, "step": 917 }, { "epoch": 1.3343023255813953, "grad_norm": 1.1466090066744306, "learning_rate": 2.4934120577766963e-06, "loss": 0.0482, "step": 918 }, { "epoch": 1.3357558139534884, "grad_norm": 1.3716592416122075, "learning_rate": 2.483541037618346e-06, "loss": 0.0618, "step": 919 }, { "epoch": 1.3372093023255813, "grad_norm": 1.365751544441765, "learning_rate": 2.473683135004354e-06, "loss": 0.062, "step": 920 }, { "epoch": 1.3386627906976745, "grad_norm": 2.264876190842683, "learning_rate": 2.4638384013210004e-06, "loss": 0.1066, "step": 921 }, { "epoch": 1.3401162790697674, "grad_norm": 1.6250492792457274, "learning_rate": 2.4540068878859247e-06, "loss": 0.0857, "step": 922 }, { "epoch": 1.3415697674418605, "grad_norm": 1.821123714104965, "learning_rate": 2.4441886459478502e-06, "loss": 0.0484, "step": 923 }, { "epoch": 1.3430232558139534, "grad_norm": 1.3628978122194464, "learning_rate": 2.4343837266863245e-06, "loss": 0.0716, "step": 924 }, { "epoch": 1.3444767441860466, "grad_norm": 1.2104456979218408, "learning_rate": 2.4245921812114427e-06, "loss": 0.0413, "step": 925 }, { "epoch": 1.3459302325581395, "grad_norm": 1.4006337692831987, "learning_rate": 2.4148140605635923e-06, "loss": 0.0884, "step": 926 }, { "epoch": 1.3473837209302326, "grad_norm": 1.5775325655062982, "learning_rate": 2.405049415713173e-06, "loss": 0.0826, "step": 927 }, { "epoch": 1.3488372093023255, "grad_norm": 1.3469035877818512, "learning_rate": 2.3952982975603494e-06, "loss": 0.0528, "step": 928 }, { "epoch": 1.3502906976744187, "grad_norm": 1.3912044037862688, "learning_rate": 2.385560756934765e-06, "loss": 0.0683, "step": 929 }, { "epoch": 1.3517441860465116, "grad_norm": 1.4330392863334833, "learning_rate": 2.3758368445952977e-06, "loss": 0.0615, "step": 930 }, { "epoch": 1.3531976744186047, "grad_norm": 1.353177391111986, "learning_rate": 2.3661266112297765e-06, "loss": 0.0441, "step": 931 }, { "epoch": 1.3546511627906976, "grad_norm": 1.4685441680460056, "learning_rate": 2.356430107454733e-06, "loss": 0.0719, "step": 932 }, { "epoch": 1.3561046511627908, "grad_norm": 1.3373925778152866, "learning_rate": 2.346747383815126e-06, "loss": 0.0581, "step": 933 }, { "epoch": 1.3575581395348837, "grad_norm": 1.6807992341418785, "learning_rate": 2.337078490784084e-06, "loss": 0.083, "step": 934 }, { "epoch": 1.3590116279069768, "grad_norm": 1.3439853412810583, "learning_rate": 2.32742347876264e-06, "loss": 0.0601, "step": 935 }, { "epoch": 1.3604651162790697, "grad_norm": 1.3060456175816213, "learning_rate": 2.317782398079473e-06, "loss": 0.055, "step": 936 }, { "epoch": 1.3619186046511627, "grad_norm": 1.5527718046211736, "learning_rate": 2.3081552989906347e-06, "loss": 0.0734, "step": 937 }, { "epoch": 1.3633720930232558, "grad_norm": 1.5997787553142, "learning_rate": 2.298542231679305e-06, "loss": 0.0727, "step": 938 }, { "epoch": 1.364825581395349, "grad_norm": 1.647243597942959, "learning_rate": 2.2889432462555106e-06, "loss": 0.069, "step": 939 }, { "epoch": 1.3662790697674418, "grad_norm": 1.4131937959777625, "learning_rate": 2.279358392755882e-06, "loss": 0.0572, "step": 940 }, { "epoch": 1.3677325581395348, "grad_norm": 1.2921469692871526, "learning_rate": 2.269787721143376e-06, "loss": 0.0558, "step": 941 }, { "epoch": 1.369186046511628, "grad_norm": 1.5778632142343019, "learning_rate": 2.2602312813070315e-06, "loss": 0.0653, "step": 942 }, { "epoch": 1.370639534883721, "grad_norm": 1.1650103416898503, "learning_rate": 2.250689123061694e-06, "loss": 0.0583, "step": 943 }, { "epoch": 1.372093023255814, "grad_norm": 1.638353937587653, "learning_rate": 2.2411612961477704e-06, "loss": 0.0954, "step": 944 }, { "epoch": 1.3735465116279069, "grad_norm": 1.3332240689990287, "learning_rate": 2.2316478502309576e-06, "loss": 0.0686, "step": 945 }, { "epoch": 1.375, "grad_norm": 1.4897450673269428, "learning_rate": 2.2221488349019903e-06, "loss": 0.0637, "step": 946 }, { "epoch": 1.3764534883720931, "grad_norm": 1.2433687422683675, "learning_rate": 2.2126642996763793e-06, "loss": 0.0636, "step": 947 }, { "epoch": 1.377906976744186, "grad_norm": 1.5438951823738334, "learning_rate": 2.203194293994159e-06, "loss": 0.063, "step": 948 }, { "epoch": 1.379360465116279, "grad_norm": 1.353307173474805, "learning_rate": 2.193738867219623e-06, "loss": 0.0551, "step": 949 }, { "epoch": 1.380813953488372, "grad_norm": 1.3926599213141382, "learning_rate": 2.184298068641067e-06, "loss": 0.0658, "step": 950 }, { "epoch": 1.3822674418604652, "grad_norm": 1.5876307826339826, "learning_rate": 2.174871947470541e-06, "loss": 0.0714, "step": 951 }, { "epoch": 1.3837209302325582, "grad_norm": 1.5940606667679074, "learning_rate": 2.1654605528435774e-06, "loss": 0.0984, "step": 952 }, { "epoch": 1.385174418604651, "grad_norm": 1.6476409988363916, "learning_rate": 2.1560639338189533e-06, "loss": 0.0618, "step": 953 }, { "epoch": 1.3866279069767442, "grad_norm": 1.3123274183423979, "learning_rate": 2.1466821393784148e-06, "loss": 0.0526, "step": 954 }, { "epoch": 1.3880813953488373, "grad_norm": 1.1675619599323548, "learning_rate": 2.137315218426442e-06, "loss": 0.0465, "step": 955 }, { "epoch": 1.3895348837209303, "grad_norm": 1.6048177965652641, "learning_rate": 2.127963219789974e-06, "loss": 0.1021, "step": 956 }, { "epoch": 1.3909883720930232, "grad_norm": 0.8988094880823243, "learning_rate": 2.1186261922181746e-06, "loss": 0.0387, "step": 957 }, { "epoch": 1.3924418604651163, "grad_norm": 1.287189982666728, "learning_rate": 2.109304184382157e-06, "loss": 0.065, "step": 958 }, { "epoch": 1.3938953488372092, "grad_norm": 1.1802701284704842, "learning_rate": 2.0999972448747525e-06, "loss": 0.0453, "step": 959 }, { "epoch": 1.3953488372093024, "grad_norm": 1.3499018387202095, "learning_rate": 2.0907054222102367e-06, "loss": 0.0439, "step": 960 }, { "epoch": 1.3968023255813953, "grad_norm": 1.620339483385674, "learning_rate": 2.081428764824089e-06, "loss": 0.075, "step": 961 }, { "epoch": 1.3982558139534884, "grad_norm": 1.2075011912654863, "learning_rate": 2.072167321072736e-06, "loss": 0.0653, "step": 962 }, { "epoch": 1.3997093023255813, "grad_norm": 1.4261627536661536, "learning_rate": 2.0629211392333033e-06, "loss": 0.0605, "step": 963 }, { "epoch": 1.4011627906976745, "grad_norm": 1.5799723516279018, "learning_rate": 2.0536902675033547e-06, "loss": 0.0667, "step": 964 }, { "epoch": 1.4026162790697674, "grad_norm": 1.4908299612274243, "learning_rate": 2.044474754000655e-06, "loss": 0.058, "step": 965 }, { "epoch": 1.4040697674418605, "grad_norm": 1.4424465604879053, "learning_rate": 2.0352746467629018e-06, "loss": 0.0749, "step": 966 }, { "epoch": 1.4055232558139534, "grad_norm": 1.3986123725352064, "learning_rate": 2.0260899937474943e-06, "loss": 0.0587, "step": 967 }, { "epoch": 1.4069767441860466, "grad_norm": 1.5870575696903628, "learning_rate": 2.0169208428312647e-06, "loss": 0.0825, "step": 968 }, { "epoch": 1.4084302325581395, "grad_norm": 1.4921609261163944, "learning_rate": 2.0077672418102443e-06, "loss": 0.0796, "step": 969 }, { "epoch": 1.4098837209302326, "grad_norm": 1.5995313772665938, "learning_rate": 1.998629238399402e-06, "loss": 0.0659, "step": 970 }, { "epoch": 1.4113372093023255, "grad_norm": 1.2823207017129816, "learning_rate": 1.9895068802324065e-06, "loss": 0.0484, "step": 971 }, { "epoch": 1.4127906976744187, "grad_norm": 1.3944837875822929, "learning_rate": 1.980400214861367e-06, "loss": 0.0507, "step": 972 }, { "epoch": 1.4142441860465116, "grad_norm": 1.3570706618634196, "learning_rate": 1.971309289756595e-06, "loss": 0.0553, "step": 973 }, { "epoch": 1.4156976744186047, "grad_norm": 1.3823236957095917, "learning_rate": 1.9622341523063484e-06, "loss": 0.0604, "step": 974 }, { "epoch": 1.4171511627906976, "grad_norm": 1.5503182481651412, "learning_rate": 1.953174849816595e-06, "loss": 0.1027, "step": 975 }, { "epoch": 1.4186046511627908, "grad_norm": 1.25459622511254, "learning_rate": 1.944131429510754e-06, "loss": 0.0646, "step": 976 }, { "epoch": 1.4200581395348837, "grad_norm": 1.6351810496565538, "learning_rate": 1.93510393852946e-06, "loss": 0.0659, "step": 977 }, { "epoch": 1.4215116279069768, "grad_norm": 1.2975727336300502, "learning_rate": 1.9260924239303075e-06, "loss": 0.0672, "step": 978 }, { "epoch": 1.4229651162790697, "grad_norm": 1.3534696019811543, "learning_rate": 1.9170969326876177e-06, "loss": 0.0516, "step": 979 }, { "epoch": 1.4244186046511627, "grad_norm": 1.5117325218714435, "learning_rate": 1.90811751169218e-06, "loss": 0.0512, "step": 980 }, { "epoch": 1.4258720930232558, "grad_norm": 1.2260873105062584, "learning_rate": 1.8991542077510205e-06, "loss": 0.0535, "step": 981 }, { "epoch": 1.427325581395349, "grad_norm": 1.4430073129022352, "learning_rate": 1.8902070675871465e-06, "loss": 0.0623, "step": 982 }, { "epoch": 1.4287790697674418, "grad_norm": 1.4117041417091303, "learning_rate": 1.881276137839314e-06, "loss": 0.0367, "step": 983 }, { "epoch": 1.4302325581395348, "grad_norm": 1.3665830676976485, "learning_rate": 1.8723614650617721e-06, "loss": 0.0658, "step": 984 }, { "epoch": 1.431686046511628, "grad_norm": 1.4391996547793735, "learning_rate": 1.8634630957240352e-06, "loss": 0.0762, "step": 985 }, { "epoch": 1.433139534883721, "grad_norm": 1.5377368090806955, "learning_rate": 1.8545810762106263e-06, "loss": 0.0709, "step": 986 }, { "epoch": 1.434593023255814, "grad_norm": 1.3753400620423242, "learning_rate": 1.845715452820845e-06, "loss": 0.0501, "step": 987 }, { "epoch": 1.4360465116279069, "grad_norm": 1.593242899745401, "learning_rate": 1.8368662717685188e-06, "loss": 0.0799, "step": 988 }, { "epoch": 1.4375, "grad_norm": 1.6328684919856264, "learning_rate": 1.8280335791817733e-06, "loss": 0.053, "step": 989 }, { "epoch": 1.4389534883720931, "grad_norm": 1.432266744388245, "learning_rate": 1.819217421102779e-06, "loss": 0.0617, "step": 990 }, { "epoch": 1.440406976744186, "grad_norm": 1.2883150490212059, "learning_rate": 1.8104178434875175e-06, "loss": 0.0646, "step": 991 }, { "epoch": 1.441860465116279, "grad_norm": 1.364249538543468, "learning_rate": 1.8016348922055448e-06, "loss": 0.0465, "step": 992 }, { "epoch": 1.443313953488372, "grad_norm": 1.3168346576696832, "learning_rate": 1.7928686130397443e-06, "loss": 0.0792, "step": 993 }, { "epoch": 1.4447674418604652, "grad_norm": 1.8107406359055818, "learning_rate": 1.7841190516860973e-06, "loss": 0.0732, "step": 994 }, { "epoch": 1.4462209302325582, "grad_norm": 1.2063299992756713, "learning_rate": 1.7753862537534356e-06, "loss": 0.0762, "step": 995 }, { "epoch": 1.447674418604651, "grad_norm": 1.8674037040685265, "learning_rate": 1.7666702647632128e-06, "loss": 0.0695, "step": 996 }, { "epoch": 1.4491279069767442, "grad_norm": 1.1951714079592195, "learning_rate": 1.7579711301492574e-06, "loss": 0.0597, "step": 997 }, { "epoch": 1.4505813953488373, "grad_norm": 2.0661364210414725, "learning_rate": 1.7492888952575475e-06, "loss": 0.0786, "step": 998 }, { "epoch": 1.4520348837209303, "grad_norm": 1.3804303251215748, "learning_rate": 1.740623605345963e-06, "loss": 0.0632, "step": 999 }, { "epoch": 1.4534883720930232, "grad_norm": 1.3877478679606865, "learning_rate": 1.7319753055840555e-06, "loss": 0.0467, "step": 1000 }, { "epoch": 1.4534883720930232, "eval_loss": 0.1322409063577652, "eval_runtime": 2.2065, "eval_samples_per_second": 25.379, "eval_steps_per_second": 6.345, "step": 1000 }, { "epoch": 1.4549418604651163, "grad_norm": 1.2437202205338127, "learning_rate": 1.7233440410528117e-06, "loss": 0.0504, "step": 1001 }, { "epoch": 1.4563953488372092, "grad_norm": 1.6237467009740467, "learning_rate": 1.7147298567444231e-06, "loss": 0.0938, "step": 1002 }, { "epoch": 1.4578488372093024, "grad_norm": 1.5164187899961117, "learning_rate": 1.7061327975620402e-06, "loss": 0.0772, "step": 1003 }, { "epoch": 1.4593023255813953, "grad_norm": 1.5187557348768597, "learning_rate": 1.697552908319553e-06, "loss": 0.0624, "step": 1004 }, { "epoch": 1.4607558139534884, "grad_norm": 1.178181594838555, "learning_rate": 1.6889902337413415e-06, "loss": 0.0655, "step": 1005 }, { "epoch": 1.4622093023255813, "grad_norm": 1.4575686492434048, "learning_rate": 1.6804448184620598e-06, "loss": 0.0631, "step": 1006 }, { "epoch": 1.4636627906976745, "grad_norm": 2.0108235684971327, "learning_rate": 1.6719167070263848e-06, "loss": 0.093, "step": 1007 }, { "epoch": 1.4651162790697674, "grad_norm": 1.3503757474217617, "learning_rate": 1.6634059438888034e-06, "loss": 0.0498, "step": 1008 }, { "epoch": 1.4665697674418605, "grad_norm": 1.4598338826538206, "learning_rate": 1.6549125734133625e-06, "loss": 0.0543, "step": 1009 }, { "epoch": 1.4680232558139534, "grad_norm": 1.4195207368199612, "learning_rate": 1.6464366398734532e-06, "loss": 0.0598, "step": 1010 }, { "epoch": 1.4694767441860466, "grad_norm": 1.357120211132288, "learning_rate": 1.6379781874515666e-06, "loss": 0.0511, "step": 1011 }, { "epoch": 1.4709302325581395, "grad_norm": 1.8474857055851184, "learning_rate": 1.6295372602390768e-06, "loss": 0.0676, "step": 1012 }, { "epoch": 1.4723837209302326, "grad_norm": 1.4685941935514055, "learning_rate": 1.6211139022359995e-06, "loss": 0.0616, "step": 1013 }, { "epoch": 1.4738372093023255, "grad_norm": 1.379812903529308, "learning_rate": 1.6127081573507685e-06, "loss": 0.0589, "step": 1014 }, { "epoch": 1.4752906976744187, "grad_norm": 1.2647342925759666, "learning_rate": 1.6043200694000038e-06, "loss": 0.0754, "step": 1015 }, { "epoch": 1.4767441860465116, "grad_norm": 1.13404075461665, "learning_rate": 1.5959496821082905e-06, "loss": 0.0544, "step": 1016 }, { "epoch": 1.4781976744186047, "grad_norm": 1.4701585201110707, "learning_rate": 1.5875970391079393e-06, "loss": 0.0625, "step": 1017 }, { "epoch": 1.4796511627906976, "grad_norm": 1.2418075850932098, "learning_rate": 1.5792621839387717e-06, "loss": 0.0489, "step": 1018 }, { "epoch": 1.4811046511627908, "grad_norm": 1.191695849684784, "learning_rate": 1.5709451600478787e-06, "loss": 0.0439, "step": 1019 }, { "epoch": 1.4825581395348837, "grad_norm": 1.2753380078997356, "learning_rate": 1.562646010789411e-06, "loss": 0.0616, "step": 1020 }, { "epoch": 1.4840116279069768, "grad_norm": 1.5221939760552072, "learning_rate": 1.5543647794243355e-06, "loss": 0.0827, "step": 1021 }, { "epoch": 1.4854651162790697, "grad_norm": 1.7991818130452302, "learning_rate": 1.5461015091202263e-06, "loss": 0.0717, "step": 1022 }, { "epoch": 1.4869186046511627, "grad_norm": 1.3744111180193537, "learning_rate": 1.5378562429510257e-06, "loss": 0.0639, "step": 1023 }, { "epoch": 1.4883720930232558, "grad_norm": 1.6295469366842923, "learning_rate": 1.5296290238968303e-06, "loss": 0.0615, "step": 1024 }, { "epoch": 1.489825581395349, "grad_norm": 1.4986295841575483, "learning_rate": 1.5214198948436604e-06, "loss": 0.0512, "step": 1025 }, { "epoch": 1.4912790697674418, "grad_norm": 1.5258968983298233, "learning_rate": 1.5132288985832383e-06, "loss": 0.0567, "step": 1026 }, { "epoch": 1.4927325581395348, "grad_norm": 1.3320359313785561, "learning_rate": 1.5050560778127648e-06, "loss": 0.0475, "step": 1027 }, { "epoch": 1.494186046511628, "grad_norm": 1.4535765609363385, "learning_rate": 1.496901475134701e-06, "loss": 0.0491, "step": 1028 }, { "epoch": 1.495639534883721, "grad_norm": 1.7901069273417733, "learning_rate": 1.4887651330565378e-06, "loss": 0.066, "step": 1029 }, { "epoch": 1.497093023255814, "grad_norm": 1.7585562014185228, "learning_rate": 1.4806470939905842e-06, "loss": 0.054, "step": 1030 }, { "epoch": 1.4985465116279069, "grad_norm": 1.6626210530787378, "learning_rate": 1.472547400253735e-06, "loss": 0.0757, "step": 1031 }, { "epoch": 1.5, "grad_norm": 1.4156229167835312, "learning_rate": 1.4644660940672628e-06, "loss": 0.063, "step": 1032 }, { "epoch": 1.5014534883720931, "grad_norm": 1.4973808057494127, "learning_rate": 1.4564032175565873e-06, "loss": 0.0539, "step": 1033 }, { "epoch": 1.502906976744186, "grad_norm": 1.462014108284378, "learning_rate": 1.4483588127510585e-06, "loss": 0.047, "step": 1034 }, { "epoch": 1.504360465116279, "grad_norm": 1.511766423540138, "learning_rate": 1.440332921583744e-06, "loss": 0.0576, "step": 1035 }, { "epoch": 1.505813953488372, "grad_norm": 1.4551371173160883, "learning_rate": 1.432325585891201e-06, "loss": 0.0611, "step": 1036 }, { "epoch": 1.5072674418604652, "grad_norm": 1.6008143073565722, "learning_rate": 1.4243368474132663e-06, "loss": 0.0612, "step": 1037 }, { "epoch": 1.5087209302325582, "grad_norm": 1.5177354364722053, "learning_rate": 1.41636674779283e-06, "loss": 0.0667, "step": 1038 }, { "epoch": 1.510174418604651, "grad_norm": 1.4839839114250544, "learning_rate": 1.408415328575629e-06, "loss": 0.0749, "step": 1039 }, { "epoch": 1.5116279069767442, "grad_norm": 1.4529745164922419, "learning_rate": 1.4004826312100218e-06, "loss": 0.0411, "step": 1040 }, { "epoch": 1.5130813953488373, "grad_norm": 1.1970103229686213, "learning_rate": 1.3925686970467745e-06, "loss": 0.0527, "step": 1041 }, { "epoch": 1.5145348837209303, "grad_norm": 1.3891332095713593, "learning_rate": 1.3846735673388473e-06, "loss": 0.059, "step": 1042 }, { "epoch": 1.5159883720930232, "grad_norm": 1.191612023245594, "learning_rate": 1.3767972832411813e-06, "loss": 0.0644, "step": 1043 }, { "epoch": 1.5174418604651163, "grad_norm": 1.2633634289004876, "learning_rate": 1.3689398858104753e-06, "loss": 0.0518, "step": 1044 }, { "epoch": 1.5188953488372094, "grad_norm": 1.259697007622928, "learning_rate": 1.3611014160049846e-06, "loss": 0.0399, "step": 1045 }, { "epoch": 1.5203488372093024, "grad_norm": 1.2500835906399796, "learning_rate": 1.3532819146842934e-06, "loss": 0.0555, "step": 1046 }, { "epoch": 1.5218023255813953, "grad_norm": 1.2215856052507712, "learning_rate": 1.3454814226091156e-06, "loss": 0.0541, "step": 1047 }, { "epoch": 1.5232558139534884, "grad_norm": 1.0651853757254646, "learning_rate": 1.337699980441069e-06, "loss": 0.0597, "step": 1048 }, { "epoch": 1.5247093023255816, "grad_norm": 1.4449454027688438, "learning_rate": 1.3299376287424763e-06, "loss": 0.0528, "step": 1049 }, { "epoch": 1.5261627906976745, "grad_norm": 1.3052426229372127, "learning_rate": 1.3221944079761413e-06, "loss": 0.0575, "step": 1050 }, { "epoch": 1.5276162790697674, "grad_norm": 1.3323801979825336, "learning_rate": 1.3144703585051498e-06, "loss": 0.0471, "step": 1051 }, { "epoch": 1.5290697674418605, "grad_norm": 1.3097362291071397, "learning_rate": 1.3067655205926488e-06, "loss": 0.0686, "step": 1052 }, { "epoch": 1.5305232558139537, "grad_norm": 1.556833363521575, "learning_rate": 1.2990799344016436e-06, "loss": 0.0567, "step": 1053 }, { "epoch": 1.5319767441860463, "grad_norm": 1.2414839248874836, "learning_rate": 1.2914136399947841e-06, "loss": 0.0461, "step": 1054 }, { "epoch": 1.5334302325581395, "grad_norm": 1.4598506706462977, "learning_rate": 1.283766677334161e-06, "loss": 0.0661, "step": 1055 }, { "epoch": 1.5348837209302326, "grad_norm": 1.5053648322674895, "learning_rate": 1.2761390862810907e-06, "loss": 0.0684, "step": 1056 }, { "epoch": 1.5363372093023255, "grad_norm": 1.533086553131052, "learning_rate": 1.2685309065959168e-06, "loss": 0.0623, "step": 1057 }, { "epoch": 1.5377906976744184, "grad_norm": 1.376493396330026, "learning_rate": 1.260942177937789e-06, "loss": 0.0523, "step": 1058 }, { "epoch": 1.5392441860465116, "grad_norm": 1.4332881734009741, "learning_rate": 1.2533729398644735e-06, "loss": 0.042, "step": 1059 }, { "epoch": 1.5406976744186047, "grad_norm": 0.9999597570352821, "learning_rate": 1.2458232318321306e-06, "loss": 0.0336, "step": 1060 }, { "epoch": 1.5421511627906976, "grad_norm": 1.211215207564261, "learning_rate": 1.238293093195122e-06, "loss": 0.052, "step": 1061 }, { "epoch": 1.5436046511627906, "grad_norm": 1.394715555164229, "learning_rate": 1.2307825632057952e-06, "loss": 0.0753, "step": 1062 }, { "epoch": 1.5450581395348837, "grad_norm": 1.3381942952433081, "learning_rate": 1.2232916810142886e-06, "loss": 0.058, "step": 1063 }, { "epoch": 1.5465116279069768, "grad_norm": 1.7578672319163982, "learning_rate": 1.2158204856683176e-06, "loss": 0.0657, "step": 1064 }, { "epoch": 1.5479651162790697, "grad_norm": 1.5353885598622956, "learning_rate": 1.2083690161129808e-06, "loss": 0.059, "step": 1065 }, { "epoch": 1.5494186046511627, "grad_norm": 1.405316093009901, "learning_rate": 1.2009373111905487e-06, "loss": 0.0579, "step": 1066 }, { "epoch": 1.5508720930232558, "grad_norm": 1.607692815378601, "learning_rate": 1.1935254096402655e-06, "loss": 0.0653, "step": 1067 }, { "epoch": 1.552325581395349, "grad_norm": 1.497479074300615, "learning_rate": 1.1861333500981449e-06, "loss": 0.0523, "step": 1068 }, { "epoch": 1.5537790697674418, "grad_norm": 1.1203473988887975, "learning_rate": 1.1787611710967751e-06, "loss": 0.0452, "step": 1069 }, { "epoch": 1.5552325581395348, "grad_norm": 1.4222136382752362, "learning_rate": 1.1714089110651071e-06, "loss": 0.0635, "step": 1070 }, { "epoch": 1.556686046511628, "grad_norm": 0.9658885164743157, "learning_rate": 1.1640766083282662e-06, "loss": 0.0771, "step": 1071 }, { "epoch": 1.558139534883721, "grad_norm": 1.3063613380960122, "learning_rate": 1.1567643011073393e-06, "loss": 0.0573, "step": 1072 }, { "epoch": 1.559593023255814, "grad_norm": 1.6416890877169144, "learning_rate": 1.1494720275191901e-06, "loss": 0.0616, "step": 1073 }, { "epoch": 1.5610465116279069, "grad_norm": 1.3817386853715168, "learning_rate": 1.1421998255762468e-06, "loss": 0.0503, "step": 1074 }, { "epoch": 1.5625, "grad_norm": 1.3346841117708024, "learning_rate": 1.134947733186315e-06, "loss": 0.047, "step": 1075 }, { "epoch": 1.5639534883720931, "grad_norm": 1.3343322414067098, "learning_rate": 1.127715788152372e-06, "loss": 0.0636, "step": 1076 }, { "epoch": 1.565406976744186, "grad_norm": 1.7154646419432018, "learning_rate": 1.1205040281723728e-06, "loss": 0.0645, "step": 1077 }, { "epoch": 1.566860465116279, "grad_norm": 1.4443691543089068, "learning_rate": 1.1133124908390575e-06, "loss": 0.0569, "step": 1078 }, { "epoch": 1.568313953488372, "grad_norm": 1.9549846970535834, "learning_rate": 1.106141213639747e-06, "loss": 0.0843, "step": 1079 }, { "epoch": 1.5697674418604652, "grad_norm": 1.4036582458578477, "learning_rate": 1.0989902339561554e-06, "loss": 0.0541, "step": 1080 }, { "epoch": 1.5712209302325582, "grad_norm": 1.7620858308016654, "learning_rate": 1.0918595890641891e-06, "loss": 0.0571, "step": 1081 }, { "epoch": 1.572674418604651, "grad_norm": 2.0116431611899657, "learning_rate": 1.0847493161337602e-06, "loss": 0.0509, "step": 1082 }, { "epoch": 1.5741279069767442, "grad_norm": 1.3745830759736906, "learning_rate": 1.077659452228581e-06, "loss": 0.0506, "step": 1083 }, { "epoch": 1.5755813953488373, "grad_norm": 1.491007758939216, "learning_rate": 1.0705900343059856e-06, "loss": 0.0615, "step": 1084 }, { "epoch": 1.5770348837209303, "grad_norm": 1.369265137859038, "learning_rate": 1.0635410992167212e-06, "loss": 0.0674, "step": 1085 }, { "epoch": 1.5784883720930232, "grad_norm": 1.0623500340337344, "learning_rate": 1.0565126837047718e-06, "loss": 0.0467, "step": 1086 }, { "epoch": 1.5799418604651163, "grad_norm": 1.3868730182724978, "learning_rate": 1.049504824407152e-06, "loss": 0.0583, "step": 1087 }, { "epoch": 1.5813953488372094, "grad_norm": 1.5500696399009597, "learning_rate": 1.04251755785373e-06, "loss": 0.054, "step": 1088 }, { "epoch": 1.5828488372093024, "grad_norm": 1.416069042767884, "learning_rate": 1.0355509204670234e-06, "loss": 0.0462, "step": 1089 }, { "epoch": 1.5843023255813953, "grad_norm": 1.2069842940803852, "learning_rate": 1.0286049485620213e-06, "loss": 0.0483, "step": 1090 }, { "epoch": 1.5857558139534884, "grad_norm": 1.6409720581547536, "learning_rate": 1.0216796783459866e-06, "loss": 0.0489, "step": 1091 }, { "epoch": 1.5872093023255816, "grad_norm": 1.7346156581015553, "learning_rate": 1.0147751459182737e-06, "loss": 0.0452, "step": 1092 }, { "epoch": 1.5886627906976745, "grad_norm": 1.304713086631584, "learning_rate": 1.007891387270134e-06, "loss": 0.0652, "step": 1093 }, { "epoch": 1.5901162790697674, "grad_norm": 1.169058686538856, "learning_rate": 1.001028438284533e-06, "loss": 0.0395, "step": 1094 }, { "epoch": 1.5915697674418605, "grad_norm": 1.581843571962872, "learning_rate": 9.941863347359597e-07, "loss": 0.0593, "step": 1095 }, { "epoch": 1.5930232558139537, "grad_norm": 1.492544567921207, "learning_rate": 9.873651122902472e-07, "loss": 0.0834, "step": 1096 }, { "epoch": 1.5944767441860463, "grad_norm": 1.459822032594516, "learning_rate": 9.805648065043745e-07, "loss": 0.0511, "step": 1097 }, { "epoch": 1.5959302325581395, "grad_norm": 1.3911943572138405, "learning_rate": 9.737854528262953e-07, "loss": 0.0531, "step": 1098 }, { "epoch": 1.5973837209302326, "grad_norm": 1.0428194925897607, "learning_rate": 9.670270865947406e-07, "loss": 0.0646, "step": 1099 }, { "epoch": 1.5988372093023255, "grad_norm": 1.3576620238676198, "learning_rate": 9.602897430390456e-07, "loss": 0.0471, "step": 1100 }, { "epoch": 1.6002906976744184, "grad_norm": 1.5766868187890286, "learning_rate": 9.53573457278954e-07, "loss": 0.0637, "step": 1101 }, { "epoch": 1.6017441860465116, "grad_norm": 1.4800012500421134, "learning_rate": 9.468782643244484e-07, "loss": 0.0695, "step": 1102 }, { "epoch": 1.6031976744186047, "grad_norm": 1.7029559921767845, "learning_rate": 9.40204199075555e-07, "loss": 0.0608, "step": 1103 }, { "epoch": 1.6046511627906976, "grad_norm": 1.1192989686723849, "learning_rate": 9.335512963221732e-07, "loss": 0.0501, "step": 1104 }, { "epoch": 1.6061046511627906, "grad_norm": 1.392049569057171, "learning_rate": 9.269195907438843e-07, "loss": 0.0956, "step": 1105 }, { "epoch": 1.6075581395348837, "grad_norm": 1.4377153631105137, "learning_rate": 9.203091169097761e-07, "loss": 0.0639, "step": 1106 }, { "epoch": 1.6090116279069768, "grad_norm": 1.4194367382834814, "learning_rate": 9.137199092782617e-07, "loss": 0.0504, "step": 1107 }, { "epoch": 1.6104651162790697, "grad_norm": 1.0856317537640523, "learning_rate": 9.071520021969027e-07, "loss": 0.0385, "step": 1108 }, { "epoch": 1.6119186046511627, "grad_norm": 1.3095443403571647, "learning_rate": 9.006054299022227e-07, "loss": 0.058, "step": 1109 }, { "epoch": 1.6133720930232558, "grad_norm": 1.377541034515128, "learning_rate": 8.940802265195375e-07, "loss": 0.0688, "step": 1110 }, { "epoch": 1.614825581395349, "grad_norm": 1.1095826182633532, "learning_rate": 8.875764260627695e-07, "loss": 0.0473, "step": 1111 }, { "epoch": 1.6162790697674418, "grad_norm": 1.5224215809822206, "learning_rate": 8.810940624342784e-07, "loss": 0.0825, "step": 1112 }, { "epoch": 1.6177325581395348, "grad_norm": 1.8482839553995425, "learning_rate": 8.746331694246756e-07, "loss": 0.0744, "step": 1113 }, { "epoch": 1.619186046511628, "grad_norm": 1.6017104682800916, "learning_rate": 8.681937807126567e-07, "loss": 0.069, "step": 1114 }, { "epoch": 1.620639534883721, "grad_norm": 1.2376828031612417, "learning_rate": 8.617759298648182e-07, "loss": 0.0495, "step": 1115 }, { "epoch": 1.622093023255814, "grad_norm": 1.284668072013395, "learning_rate": 8.553796503354899e-07, "loss": 0.0771, "step": 1116 }, { "epoch": 1.6235465116279069, "grad_norm": 1.3931203949754822, "learning_rate": 8.490049754665541e-07, "loss": 0.0574, "step": 1117 }, { "epoch": 1.625, "grad_norm": 1.4765649037778599, "learning_rate": 8.426519384872733e-07, "loss": 0.0682, "step": 1118 }, { "epoch": 1.6264534883720931, "grad_norm": 1.181845082363929, "learning_rate": 8.363205725141238e-07, "loss": 0.0494, "step": 1119 }, { "epoch": 1.627906976744186, "grad_norm": 1.3928319550967374, "learning_rate": 8.30010910550611e-07, "loss": 0.059, "step": 1120 }, { "epoch": 1.629360465116279, "grad_norm": 1.1141954284915958, "learning_rate": 8.237229854871076e-07, "loss": 0.0471, "step": 1121 }, { "epoch": 1.630813953488372, "grad_norm": 1.4876650888680079, "learning_rate": 8.174568301006763e-07, "loss": 0.0805, "step": 1122 }, { "epoch": 1.6322674418604652, "grad_norm": 1.4355426352193548, "learning_rate": 8.11212477054904e-07, "loss": 0.0412, "step": 1123 }, { "epoch": 1.6337209302325582, "grad_norm": 1.3219907781653484, "learning_rate": 8.049899588997246e-07, "loss": 0.0644, "step": 1124 }, { "epoch": 1.635174418604651, "grad_norm": 1.2661461720313956, "learning_rate": 7.987893080712572e-07, "loss": 0.0647, "step": 1125 }, { "epoch": 1.6366279069767442, "grad_norm": 1.4112271603442867, "learning_rate": 7.926105568916292e-07, "loss": 0.0559, "step": 1126 }, { "epoch": 1.6380813953488373, "grad_norm": 1.4885958147808351, "learning_rate": 7.864537375688164e-07, "loss": 0.0665, "step": 1127 }, { "epoch": 1.6395348837209303, "grad_norm": 1.2255901712760742, "learning_rate": 7.803188821964652e-07, "loss": 0.0447, "step": 1128 }, { "epoch": 1.6409883720930232, "grad_norm": 1.5452975728529494, "learning_rate": 7.742060227537351e-07, "loss": 0.067, "step": 1129 }, { "epoch": 1.6424418604651163, "grad_norm": 1.3595014210127743, "learning_rate": 7.681151911051232e-07, "loss": 0.0613, "step": 1130 }, { "epoch": 1.6438953488372094, "grad_norm": 1.4222742042571062, "learning_rate": 7.620464190003074e-07, "loss": 0.0725, "step": 1131 }, { "epoch": 1.6453488372093024, "grad_norm": 1.2218037474493977, "learning_rate": 7.559997380739714e-07, "loss": 0.0518, "step": 1132 }, { "epoch": 1.6468023255813953, "grad_norm": 1.7245104586600535, "learning_rate": 7.499751798456456e-07, "loss": 0.072, "step": 1133 }, { "epoch": 1.6482558139534884, "grad_norm": 1.3548373797393403, "learning_rate": 7.439727757195408e-07, "loss": 0.0655, "step": 1134 }, { "epoch": 1.6497093023255816, "grad_norm": 1.4843313971694068, "learning_rate": 7.379925569843877e-07, "loss": 0.0701, "step": 1135 }, { "epoch": 1.6511627906976745, "grad_norm": 1.2524145522085295, "learning_rate": 7.320345548132679e-07, "loss": 0.0429, "step": 1136 }, { "epoch": 1.6526162790697674, "grad_norm": 1.3051416609601543, "learning_rate": 7.260988002634584e-07, "loss": 0.0709, "step": 1137 }, { "epoch": 1.6540697674418605, "grad_norm": 1.1280777447038597, "learning_rate": 7.201853242762613e-07, "loss": 0.0653, "step": 1138 }, { "epoch": 1.6555232558139537, "grad_norm": 1.524319939704524, "learning_rate": 7.142941576768526e-07, "loss": 0.0671, "step": 1139 }, { "epoch": 1.6569767441860463, "grad_norm": 1.1315251169649219, "learning_rate": 7.084253311741101e-07, "loss": 0.0415, "step": 1140 }, { "epoch": 1.6584302325581395, "grad_norm": 1.5134458313250108, "learning_rate": 7.025788753604668e-07, "loss": 0.0507, "step": 1141 }, { "epoch": 1.6598837209302326, "grad_norm": 1.3932510174316692, "learning_rate": 6.967548207117364e-07, "loss": 0.0653, "step": 1142 }, { "epoch": 1.6613372093023255, "grad_norm": 1.369278789308021, "learning_rate": 6.909531975869682e-07, "loss": 0.0602, "step": 1143 }, { "epoch": 1.6627906976744184, "grad_norm": 1.335104392777719, "learning_rate": 6.851740362282788e-07, "loss": 0.0505, "step": 1144 }, { "epoch": 1.6642441860465116, "grad_norm": 1.7252990354329707, "learning_rate": 6.794173667606995e-07, "loss": 0.0679, "step": 1145 }, { "epoch": 1.6656976744186047, "grad_norm": 1.3931797125192535, "learning_rate": 6.736832191920184e-07, "loss": 0.0689, "step": 1146 }, { "epoch": 1.6671511627906976, "grad_norm": 1.270578497522883, "learning_rate": 6.679716234126243e-07, "loss": 0.0663, "step": 1147 }, { "epoch": 1.6686046511627906, "grad_norm": 1.1833811909595593, "learning_rate": 6.622826091953483e-07, "loss": 0.043, "step": 1148 }, { "epoch": 1.6700581395348837, "grad_norm": 1.2856539604876163, "learning_rate": 6.566162061953141e-07, "loss": 0.0689, "step": 1149 }, { "epoch": 1.6715116279069768, "grad_norm": 1.2026108031855738, "learning_rate": 6.50972443949775e-07, "loss": 0.0436, "step": 1150 }, { "epoch": 1.6729651162790697, "grad_norm": 1.4050042046526028, "learning_rate": 6.453513518779708e-07, "loss": 0.0683, "step": 1151 }, { "epoch": 1.6744186046511627, "grad_norm": 1.172885792221737, "learning_rate": 6.397529592809615e-07, "loss": 0.0457, "step": 1152 }, { "epoch": 1.6758720930232558, "grad_norm": 1.6645132359630175, "learning_rate": 6.341772953414893e-07, "loss": 0.0656, "step": 1153 }, { "epoch": 1.677325581395349, "grad_norm": 1.203947351057658, "learning_rate": 6.286243891238114e-07, "loss": 0.0508, "step": 1154 }, { "epoch": 1.6787790697674418, "grad_norm": 1.5653427701253289, "learning_rate": 6.23094269573562e-07, "loss": 0.0581, "step": 1155 }, { "epoch": 1.6802325581395348, "grad_norm": 1.3692678493335273, "learning_rate": 6.175869655175898e-07, "loss": 0.0574, "step": 1156 }, { "epoch": 1.681686046511628, "grad_norm": 1.223293248941118, "learning_rate": 6.121025056638186e-07, "loss": 0.0645, "step": 1157 }, { "epoch": 1.683139534883721, "grad_norm": 1.1984686741863968, "learning_rate": 6.06640918601088e-07, "loss": 0.0492, "step": 1158 }, { "epoch": 1.684593023255814, "grad_norm": 1.8607181067232295, "learning_rate": 6.012022327990097e-07, "loss": 0.0717, "step": 1159 }, { "epoch": 1.6860465116279069, "grad_norm": 1.3129459081678303, "learning_rate": 5.957864766078186e-07, "loss": 0.0527, "step": 1160 }, { "epoch": 1.6875, "grad_norm": 1.2419623459260931, "learning_rate": 5.903936782582253e-07, "loss": 0.0381, "step": 1161 }, { "epoch": 1.6889534883720931, "grad_norm": 1.5778350174178633, "learning_rate": 5.850238658612667e-07, "loss": 0.0657, "step": 1162 }, { "epoch": 1.690406976744186, "grad_norm": 1.227703263910834, "learning_rate": 5.796770674081592e-07, "loss": 0.0578, "step": 1163 }, { "epoch": 1.691860465116279, "grad_norm": 1.2163654872782266, "learning_rate": 5.743533107701593e-07, "loss": 0.0622, "step": 1164 }, { "epoch": 1.693313953488372, "grad_norm": 1.7561357257514982, "learning_rate": 5.690526236984079e-07, "loss": 0.0706, "step": 1165 }, { "epoch": 1.6947674418604652, "grad_norm": 1.236485378770411, "learning_rate": 5.637750338237963e-07, "loss": 0.0476, "step": 1166 }, { "epoch": 1.6962209302325582, "grad_norm": 1.273190952518188, "learning_rate": 5.585205686568123e-07, "loss": 0.0572, "step": 1167 }, { "epoch": 1.697674418604651, "grad_norm": 1.3840145983571588, "learning_rate": 5.532892555874059e-07, "loss": 0.0498, "step": 1168 }, { "epoch": 1.6991279069767442, "grad_norm": 1.6682935105457355, "learning_rate": 5.48081121884838e-07, "loss": 0.0501, "step": 1169 }, { "epoch": 1.7005813953488373, "grad_norm": 1.7215714569023008, "learning_rate": 5.428961946975464e-07, "loss": 0.0621, "step": 1170 }, { "epoch": 1.7020348837209303, "grad_norm": 1.2802300699738327, "learning_rate": 5.377345010529977e-07, "loss": 0.0507, "step": 1171 }, { "epoch": 1.7034883720930232, "grad_norm": 1.179288667096262, "learning_rate": 5.325960678575498e-07, "loss": 0.0622, "step": 1172 }, { "epoch": 1.7049418604651163, "grad_norm": 1.397312032777205, "learning_rate": 5.274809218963089e-07, "loss": 0.048, "step": 1173 }, { "epoch": 1.7063953488372094, "grad_norm": 1.0705846916376756, "learning_rate": 5.22389089832997e-07, "loss": 0.0462, "step": 1174 }, { "epoch": 1.7078488372093024, "grad_norm": 1.5708429842240574, "learning_rate": 5.173205982098018e-07, "loss": 0.0777, "step": 1175 }, { "epoch": 1.7093023255813953, "grad_norm": 1.6720126005052158, "learning_rate": 5.122754734472496e-07, "loss": 0.0751, "step": 1176 }, { "epoch": 1.7107558139534884, "grad_norm": 1.4286721495638015, "learning_rate": 5.072537418440565e-07, "loss": 0.0509, "step": 1177 }, { "epoch": 1.7122093023255816, "grad_norm": 1.594631174541215, "learning_rate": 5.022554295770038e-07, "loss": 0.0578, "step": 1178 }, { "epoch": 1.7136627906976745, "grad_norm": 1.2897023400093752, "learning_rate": 4.972805627007881e-07, "loss": 0.059, "step": 1179 }, { "epoch": 1.7151162790697674, "grad_norm": 1.0706258806400843, "learning_rate": 4.92329167147898e-07, "loss": 0.0403, "step": 1180 }, { "epoch": 1.7165697674418605, "grad_norm": 1.247773190344945, "learning_rate": 4.874012687284685e-07, "loss": 0.0588, "step": 1181 }, { "epoch": 1.7180232558139537, "grad_norm": 1.1610372428474574, "learning_rate": 4.824968931301549e-07, "loss": 0.0493, "step": 1182 }, { "epoch": 1.7194767441860463, "grad_norm": 0.9925566701240426, "learning_rate": 4.776160659179918e-07, "loss": 0.0423, "step": 1183 }, { "epoch": 1.7209302325581395, "grad_norm": 1.2198238537851194, "learning_rate": 4.727588125342669e-07, "loss": 0.0434, "step": 1184 }, { "epoch": 1.7223837209302326, "grad_norm": 1.0814209449015801, "learning_rate": 4.679251582983807e-07, "loss": 0.0515, "step": 1185 }, { "epoch": 1.7238372093023255, "grad_norm": 1.253442945461474, "learning_rate": 4.631151284067209e-07, "loss": 0.0401, "step": 1186 }, { "epoch": 1.7252906976744184, "grad_norm": 1.6572342387258825, "learning_rate": 4.583287479325266e-07, "loss": 0.0851, "step": 1187 }, { "epoch": 1.7267441860465116, "grad_norm": 1.5674512288501314, "learning_rate": 4.5356604182576315e-07, "loss": 0.0748, "step": 1188 }, { "epoch": 1.7281976744186047, "grad_norm": 1.4246277140745789, "learning_rate": 4.4882703491298364e-07, "loss": 0.0507, "step": 1189 }, { "epoch": 1.7296511627906976, "grad_norm": 1.4692955325669175, "learning_rate": 4.4411175189720935e-07, "loss": 0.0599, "step": 1190 }, { "epoch": 1.7311046511627906, "grad_norm": 1.3880435931686506, "learning_rate": 4.3942021735779163e-07, "loss": 0.0801, "step": 1191 }, { "epoch": 1.7325581395348837, "grad_norm": 1.2441163102111796, "learning_rate": 4.347524557502919e-07, "loss": 0.0656, "step": 1192 }, { "epoch": 1.7340116279069768, "grad_norm": 1.333242236616478, "learning_rate": 4.301084914063475e-07, "loss": 0.051, "step": 1193 }, { "epoch": 1.7354651162790697, "grad_norm": 1.48014650555478, "learning_rate": 4.2548834853355036e-07, "loss": 0.0678, "step": 1194 }, { "epoch": 1.7369186046511627, "grad_norm": 1.4136502655814862, "learning_rate": 4.2089205121531475e-07, "loss": 0.0423, "step": 1195 }, { "epoch": 1.7383720930232558, "grad_norm": 1.466619588107673, "learning_rate": 4.163196234107603e-07, "loss": 0.0422, "step": 1196 }, { "epoch": 1.739825581395349, "grad_norm": 1.0867167600455685, "learning_rate": 4.117710889545767e-07, "loss": 0.0465, "step": 1197 }, { "epoch": 1.7412790697674418, "grad_norm": 1.1097337994586927, "learning_rate": 4.0724647155690855e-07, "loss": 0.049, "step": 1198 }, { "epoch": 1.7427325581395348, "grad_norm": 1.4162752653560682, "learning_rate": 4.0274579480322485e-07, "loss": 0.0706, "step": 1199 }, { "epoch": 1.744186046511628, "grad_norm": 1.6314238706254214, "learning_rate": 3.9826908215420344e-07, "loss": 0.0648, "step": 1200 }, { "epoch": 1.744186046511628, "eval_loss": 0.13038784265518188, "eval_runtime": 2.2036, "eval_samples_per_second": 25.414, "eval_steps_per_second": 6.353, "step": 1200 }, { "epoch": 1.745639534883721, "grad_norm": 1.4924894866086462, "learning_rate": 3.938163569455999e-07, "loss": 0.065, "step": 1201 }, { "epoch": 1.747093023255814, "grad_norm": 1.2812945325918792, "learning_rate": 3.893876423881343e-07, "loss": 0.056, "step": 1202 }, { "epoch": 1.7485465116279069, "grad_norm": 1.4847911329450767, "learning_rate": 3.8498296156736336e-07, "loss": 0.0609, "step": 1203 }, { "epoch": 1.75, "grad_norm": 1.1933603478148918, "learning_rate": 3.8060233744356634e-07, "loss": 0.0528, "step": 1204 }, { "epoch": 1.7514534883720931, "grad_norm": 1.2151612672851098, "learning_rate": 3.7624579285161945e-07, "loss": 0.0448, "step": 1205 }, { "epoch": 1.752906976744186, "grad_norm": 1.2721240144688912, "learning_rate": 3.719133505008793e-07, "loss": 0.0638, "step": 1206 }, { "epoch": 1.754360465116279, "grad_norm": 1.5815876693660877, "learning_rate": 3.67605032975068e-07, "loss": 0.0611, "step": 1207 }, { "epoch": 1.755813953488372, "grad_norm": 1.4791508048535231, "learning_rate": 3.633208627321483e-07, "loss": 0.0565, "step": 1208 }, { "epoch": 1.7572674418604652, "grad_norm": 1.9027980020443158, "learning_rate": 3.590608621042141e-07, "loss": 0.0835, "step": 1209 }, { "epoch": 1.7587209302325582, "grad_norm": 1.2720247109280667, "learning_rate": 3.548250532973663e-07, "loss": 0.041, "step": 1210 }, { "epoch": 1.760174418604651, "grad_norm": 1.3313592730700925, "learning_rate": 3.50613458391606e-07, "loss": 0.0662, "step": 1211 }, { "epoch": 1.7616279069767442, "grad_norm": 1.742038899061449, "learning_rate": 3.464260993407098e-07, "loss": 0.0625, "step": 1212 }, { "epoch": 1.7630813953488373, "grad_norm": 1.313671785304789, "learning_rate": 3.422629979721226e-07, "loss": 0.058, "step": 1213 }, { "epoch": 1.7645348837209303, "grad_norm": 1.4381256471907784, "learning_rate": 3.381241759868403e-07, "loss": 0.0577, "step": 1214 }, { "epoch": 1.7659883720930232, "grad_norm": 1.1247184686299345, "learning_rate": 3.340096549592997e-07, "loss": 0.0381, "step": 1215 }, { "epoch": 1.7674418604651163, "grad_norm": 1.3749991805427006, "learning_rate": 3.299194563372604e-07, "loss": 0.05, "step": 1216 }, { "epoch": 1.7688953488372094, "grad_norm": 1.2620255884743692, "learning_rate": 3.258536014417002e-07, "loss": 0.0404, "step": 1217 }, { "epoch": 1.7703488372093024, "grad_norm": 1.4310474211303066, "learning_rate": 3.2181211146669835e-07, "loss": 0.0637, "step": 1218 }, { "epoch": 1.7718023255813953, "grad_norm": 1.372997486141924, "learning_rate": 3.177950074793279e-07, "loss": 0.059, "step": 1219 }, { "epoch": 1.7732558139534884, "grad_norm": 1.4514336664056386, "learning_rate": 3.1380231041954366e-07, "loss": 0.0802, "step": 1220 }, { "epoch": 1.7747093023255816, "grad_norm": 1.294363423127495, "learning_rate": 3.0983404110007775e-07, "loss": 0.0463, "step": 1221 }, { "epoch": 1.7761627906976745, "grad_norm": 1.2489012530557877, "learning_rate": 3.05890220206323e-07, "loss": 0.0758, "step": 1222 }, { "epoch": 1.7776162790697674, "grad_norm": 1.3733868017342423, "learning_rate": 3.0197086829623524e-07, "loss": 0.0433, "step": 1223 }, { "epoch": 1.7790697674418605, "grad_norm": 1.2696169610865513, "learning_rate": 2.980760058002163e-07, "loss": 0.0695, "step": 1224 }, { "epoch": 1.7805232558139537, "grad_norm": 1.1885642085760084, "learning_rate": 2.9420565302101467e-07, "loss": 0.0689, "step": 1225 }, { "epoch": 1.7819767441860463, "grad_norm": 1.7975663880717683, "learning_rate": 2.9035983013361524e-07, "loss": 0.0832, "step": 1226 }, { "epoch": 1.7834302325581395, "grad_norm": 1.4668495811562468, "learning_rate": 2.8653855718513867e-07, "loss": 0.0839, "step": 1227 }, { "epoch": 1.7848837209302326, "grad_norm": 1.7943430675185803, "learning_rate": 2.827418540947313e-07, "loss": 0.0734, "step": 1228 }, { "epoch": 1.7863372093023255, "grad_norm": 1.414532030794273, "learning_rate": 2.7896974065346636e-07, "loss": 0.0591, "step": 1229 }, { "epoch": 1.7877906976744184, "grad_norm": 1.3282221407021015, "learning_rate": 2.7522223652423627e-07, "loss": 0.0574, "step": 1230 }, { "epoch": 1.7892441860465116, "grad_norm": 1.4616173540064281, "learning_rate": 2.7149936124165556e-07, "loss": 0.088, "step": 1231 }, { "epoch": 1.7906976744186047, "grad_norm": 1.62055262401236, "learning_rate": 2.67801134211953e-07, "loss": 0.0555, "step": 1232 }, { "epoch": 1.7921511627906976, "grad_norm": 1.5054710143448626, "learning_rate": 2.6412757471287633e-07, "loss": 0.0355, "step": 1233 }, { "epoch": 1.7936046511627906, "grad_norm": 1.2551896585231108, "learning_rate": 2.6047870189358504e-07, "loss": 0.0619, "step": 1234 }, { "epoch": 1.7950581395348837, "grad_norm": 1.4827616308813487, "learning_rate": 2.568545347745582e-07, "loss": 0.0699, "step": 1235 }, { "epoch": 1.7965116279069768, "grad_norm": 1.1634489682279106, "learning_rate": 2.5325509224748965e-07, "loss": 0.0424, "step": 1236 }, { "epoch": 1.7979651162790697, "grad_norm": 1.2805820993346737, "learning_rate": 2.4968039307519174e-07, "loss": 0.0738, "step": 1237 }, { "epoch": 1.7994186046511627, "grad_norm": 1.2970028495965267, "learning_rate": 2.461304558914973e-07, "loss": 0.0483, "step": 1238 }, { "epoch": 1.8008720930232558, "grad_norm": 1.3774251049609605, "learning_rate": 2.426052992011613e-07, "loss": 0.0594, "step": 1239 }, { "epoch": 1.802325581395349, "grad_norm": 1.2741543577811263, "learning_rate": 2.3910494137976526e-07, "loss": 0.061, "step": 1240 }, { "epoch": 1.8037790697674418, "grad_norm": 1.4946869686916726, "learning_rate": 2.356294006736254e-07, "loss": 0.0422, "step": 1241 }, { "epoch": 1.8052325581395348, "grad_norm": 1.1976454792671984, "learning_rate": 2.321786951996885e-07, "loss": 0.049, "step": 1242 }, { "epoch": 1.806686046511628, "grad_norm": 1.7640200270984723, "learning_rate": 2.2875284294544663e-07, "loss": 0.0654, "step": 1243 }, { "epoch": 1.808139534883721, "grad_norm": 1.4131166766489878, "learning_rate": 2.2535186176883771e-07, "loss": 0.0615, "step": 1244 }, { "epoch": 1.809593023255814, "grad_norm": 1.3382405156149042, "learning_rate": 2.2197576939815447e-07, "loss": 0.0596, "step": 1245 }, { "epoch": 1.8110465116279069, "grad_norm": 1.0375341190167515, "learning_rate": 2.186245834319517e-07, "loss": 0.0378, "step": 1246 }, { "epoch": 1.8125, "grad_norm": 1.26799708959962, "learning_rate": 2.152983213389559e-07, "loss": 0.0612, "step": 1247 }, { "epoch": 1.8139534883720931, "grad_norm": 0.9680143896503044, "learning_rate": 2.1199700045797077e-07, "loss": 0.0391, "step": 1248 }, { "epoch": 1.815406976744186, "grad_norm": 1.4351941135226602, "learning_rate": 2.0872063799778908e-07, "loss": 0.0465, "step": 1249 }, { "epoch": 1.816860465116279, "grad_norm": 1.2212499026345323, "learning_rate": 2.054692510371059e-07, "loss": 0.0596, "step": 1250 }, { "epoch": 1.818313953488372, "grad_norm": 1.4124067831447158, "learning_rate": 2.0224285652442332e-07, "loss": 0.0434, "step": 1251 }, { "epoch": 1.8197674418604652, "grad_norm": 1.6900102388116847, "learning_rate": 1.9904147127796646e-07, "loss": 0.0633, "step": 1252 }, { "epoch": 1.8212209302325582, "grad_norm": 1.1245930914451603, "learning_rate": 1.9586511198559422e-07, "loss": 0.0518, "step": 1253 }, { "epoch": 1.822674418604651, "grad_norm": 1.4535155723070285, "learning_rate": 1.9271379520471366e-07, "loss": 0.0611, "step": 1254 }, { "epoch": 1.8241279069767442, "grad_norm": 1.1651708233377704, "learning_rate": 1.8958753736219137e-07, "loss": 0.0437, "step": 1255 }, { "epoch": 1.8255813953488373, "grad_norm": 1.698268844667189, "learning_rate": 1.8648635475427112e-07, "loss": 0.0661, "step": 1256 }, { "epoch": 1.8270348837209303, "grad_norm": 1.2897615950696786, "learning_rate": 1.8341026354648461e-07, "loss": 0.0653, "step": 1257 }, { "epoch": 1.8284883720930232, "grad_norm": 1.454033537605278, "learning_rate": 1.8035927977357204e-07, "loss": 0.0824, "step": 1258 }, { "epoch": 1.8299418604651163, "grad_norm": 1.4782681121621621, "learning_rate": 1.773334193393944e-07, "loss": 0.0695, "step": 1259 }, { "epoch": 1.8313953488372094, "grad_norm": 1.2472901471200917, "learning_rate": 1.7433269801685304e-07, "loss": 0.0535, "step": 1260 }, { "epoch": 1.8328488372093024, "grad_norm": 1.411827026048705, "learning_rate": 1.713571314478063e-07, "loss": 0.0558, "step": 1261 }, { "epoch": 1.8343023255813953, "grad_norm": 1.3101061668799474, "learning_rate": 1.684067351429891e-07, "loss": 0.0847, "step": 1262 }, { "epoch": 1.8357558139534884, "grad_norm": 1.3283465609162761, "learning_rate": 1.6548152448193021e-07, "loss": 0.0517, "step": 1263 }, { "epoch": 1.8372093023255816, "grad_norm": 1.073779839976608, "learning_rate": 1.6258151471287397e-07, "loss": 0.059, "step": 1264 }, { "epoch": 1.8386627906976745, "grad_norm": 1.5768683051386514, "learning_rate": 1.5970672095269978e-07, "loss": 0.0715, "step": 1265 }, { "epoch": 1.8401162790697674, "grad_norm": 1.6425224461963919, "learning_rate": 1.5685715818684332e-07, "loss": 0.0742, "step": 1266 }, { "epoch": 1.8415697674418605, "grad_norm": 1.2460670529438014, "learning_rate": 1.540328412692188e-07, "loss": 0.0509, "step": 1267 }, { "epoch": 1.8430232558139537, "grad_norm": 1.1780779637487353, "learning_rate": 1.512337849221429e-07, "loss": 0.0558, "step": 1268 }, { "epoch": 1.8444767441860463, "grad_norm": 1.3379396790966653, "learning_rate": 1.4846000373625325e-07, "loss": 0.039, "step": 1269 }, { "epoch": 1.8459302325581395, "grad_norm": 1.6926159045939793, "learning_rate": 1.4571151217043944e-07, "loss": 0.0581, "step": 1270 }, { "epoch": 1.8473837209302326, "grad_norm": 1.358687319927915, "learning_rate": 1.4298832455176104e-07, "loss": 0.0442, "step": 1271 }, { "epoch": 1.8488372093023255, "grad_norm": 1.1372613393653885, "learning_rate": 1.4029045507537696e-07, "loss": 0.0619, "step": 1272 }, { "epoch": 1.8502906976744184, "grad_norm": 1.761528640147089, "learning_rate": 1.376179178044701e-07, "loss": 0.1053, "step": 1273 }, { "epoch": 1.8517441860465116, "grad_norm": 1.103400969846263, "learning_rate": 1.3497072667017497e-07, "loss": 0.0444, "step": 1274 }, { "epoch": 1.8531976744186047, "grad_norm": 1.6546592470023223, "learning_rate": 1.3234889547150132e-07, "loss": 0.0705, "step": 1275 }, { "epoch": 1.8546511627906976, "grad_norm": 1.4348034154756075, "learning_rate": 1.297524378752696e-07, "loss": 0.0479, "step": 1276 }, { "epoch": 1.8561046511627906, "grad_norm": 1.571516172687651, "learning_rate": 1.2718136741603216e-07, "loss": 0.0694, "step": 1277 }, { "epoch": 1.8575581395348837, "grad_norm": 1.3840050658107048, "learning_rate": 1.2463569749600613e-07, "loss": 0.035, "step": 1278 }, { "epoch": 1.8590116279069768, "grad_norm": 1.4947152523030978, "learning_rate": 1.2211544138500452e-07, "loss": 0.0561, "step": 1279 }, { "epoch": 1.8604651162790697, "grad_norm": 1.1233071008724007, "learning_rate": 1.196206122203647e-07, "loss": 0.036, "step": 1280 }, { "epoch": 1.8619186046511627, "grad_norm": 1.314094801957361, "learning_rate": 1.1715122300688109e-07, "loss": 0.0535, "step": 1281 }, { "epoch": 1.8633720930232558, "grad_norm": 1.2805535837964788, "learning_rate": 1.1470728661673814e-07, "loss": 0.0556, "step": 1282 }, { "epoch": 1.864825581395349, "grad_norm": 1.6182345359056034, "learning_rate": 1.122888157894414e-07, "loss": 0.0446, "step": 1283 }, { "epoch": 1.8662790697674418, "grad_norm": 1.5370590795498948, "learning_rate": 1.0989582313175373e-07, "loss": 0.0648, "step": 1284 }, { "epoch": 1.8677325581395348, "grad_norm": 1.2775753759686193, "learning_rate": 1.0752832111762479e-07, "loss": 0.0485, "step": 1285 }, { "epoch": 1.869186046511628, "grad_norm": 1.3721105986516784, "learning_rate": 1.0518632208813274e-07, "loss": 0.0461, "step": 1286 }, { "epoch": 1.870639534883721, "grad_norm": 1.3509315346274664, "learning_rate": 1.0286983825141373e-07, "loss": 0.0591, "step": 1287 }, { "epoch": 1.872093023255814, "grad_norm": 1.3166068602170886, "learning_rate": 1.0057888168260311e-07, "loss": 0.0667, "step": 1288 }, { "epoch": 1.8735465116279069, "grad_norm": 1.0841449480013747, "learning_rate": 9.831346432376765e-08, "loss": 0.0296, "step": 1289 }, { "epoch": 1.875, "grad_norm": 1.4139829336218244, "learning_rate": 9.607359798384785e-08, "loss": 0.0899, "step": 1290 }, { "epoch": 1.8764534883720931, "grad_norm": 1.3681050302392366, "learning_rate": 9.385929433859353e-08, "loss": 0.04, "step": 1291 }, { "epoch": 1.877906976744186, "grad_norm": 1.4537407830939912, "learning_rate": 9.167056493050497e-08, "loss": 0.0582, "step": 1292 }, { "epoch": 1.879360465116279, "grad_norm": 1.437948138954234, "learning_rate": 8.95074211687702e-08, "loss": 0.0558, "step": 1293 }, { "epoch": 1.880813953488372, "grad_norm": 1.2655853734528568, "learning_rate": 8.736987432920785e-08, "loss": 0.0562, "step": 1294 }, { "epoch": 1.8822674418604652, "grad_norm": 1.2886785360003277, "learning_rate": 8.525793555420714e-08, "loss": 0.0642, "step": 1295 }, { "epoch": 1.8837209302325582, "grad_norm": 1.5650728826423228, "learning_rate": 8.317161585266964e-08, "loss": 0.048, "step": 1296 }, { "epoch": 1.885174418604651, "grad_norm": 1.75943043500276, "learning_rate": 8.111092609995375e-08, "loss": 0.0516, "step": 1297 }, { "epoch": 1.8866279069767442, "grad_norm": 2.137820572547727, "learning_rate": 7.907587703781583e-08, "loss": 0.1197, "step": 1298 }, { "epoch": 1.8880813953488373, "grad_norm": 1.4094974542843408, "learning_rate": 7.706647927435528e-08, "loss": 0.0446, "step": 1299 }, { "epoch": 1.8895348837209303, "grad_norm": 1.5046480179949109, "learning_rate": 7.508274328395848e-08, "loss": 0.0717, "step": 1300 }, { "epoch": 1.8909883720930232, "grad_norm": 0.9467909119425738, "learning_rate": 7.312467940724488e-08, "loss": 0.0391, "step": 1301 }, { "epoch": 1.8924418604651163, "grad_norm": 1.159518695387788, "learning_rate": 7.119229785101322e-08, "loss": 0.0499, "step": 1302 }, { "epoch": 1.8938953488372094, "grad_norm": 1.1359551931047827, "learning_rate": 6.928560868818823e-08, "loss": 0.0528, "step": 1303 }, { "epoch": 1.8953488372093024, "grad_norm": 1.3754138749662996, "learning_rate": 6.74046218577673e-08, "loss": 0.0901, "step": 1304 }, { "epoch": 1.8968023255813953, "grad_norm": 1.1173372045284649, "learning_rate": 6.554934716476946e-08, "loss": 0.0416, "step": 1305 }, { "epoch": 1.8982558139534884, "grad_norm": 1.6715320242870557, "learning_rate": 6.371979428018371e-08, "loss": 0.051, "step": 1306 }, { "epoch": 1.8997093023255816, "grad_norm": 1.6108507509250944, "learning_rate": 6.191597274091965e-08, "loss": 0.048, "step": 1307 }, { "epoch": 1.9011627906976745, "grad_norm": 1.1025808474832854, "learning_rate": 6.01378919497575e-08, "loss": 0.0473, "step": 1308 }, { "epoch": 1.9026162790697674, "grad_norm": 1.1565768061022645, "learning_rate": 5.838556117529759e-08, "loss": 0.0532, "step": 1309 }, { "epoch": 1.9040697674418605, "grad_norm": 1.3641230663168908, "learning_rate": 5.6658989551913736e-08, "loss": 0.059, "step": 1310 }, { "epoch": 1.9055232558139537, "grad_norm": 1.3356709104324593, "learning_rate": 5.495818607970549e-08, "loss": 0.0612, "step": 1311 }, { "epoch": 1.9069767441860463, "grad_norm": 1.660638299272094, "learning_rate": 5.3283159624448745e-08, "loss": 0.0496, "step": 1312 }, { "epoch": 1.9084302325581395, "grad_norm": 1.7498825229557364, "learning_rate": 5.16339189175552e-08, "loss": 0.0577, "step": 1313 }, { "epoch": 1.9098837209302326, "grad_norm": 1.5655598568640097, "learning_rate": 5.0010472556019096e-08, "loss": 0.0553, "step": 1314 }, { "epoch": 1.9113372093023255, "grad_norm": 1.3860084050687909, "learning_rate": 4.841282900237942e-08, "loss": 0.0631, "step": 1315 }, { "epoch": 1.9127906976744184, "grad_norm": 1.3887503495723006, "learning_rate": 4.684099658467223e-08, "loss": 0.0699, "step": 1316 }, { "epoch": 1.9142441860465116, "grad_norm": 1.2678955663748521, "learning_rate": 4.529498349638728e-08, "loss": 0.0402, "step": 1317 }, { "epoch": 1.9156976744186047, "grad_norm": 1.284564731699548, "learning_rate": 4.377479779642535e-08, "loss": 0.0502, "step": 1318 }, { "epoch": 1.9171511627906976, "grad_norm": 1.2529895690190205, "learning_rate": 4.228044740905879e-08, "loss": 0.0626, "step": 1319 }, { "epoch": 1.9186046511627906, "grad_norm": 1.5762385143793198, "learning_rate": 4.081194012388601e-08, "loss": 0.0627, "step": 1320 }, { "epoch": 1.9200581395348837, "grad_norm": 1.6231285336791623, "learning_rate": 3.936928359579539e-08, "loss": 0.0667, "step": 1321 }, { "epoch": 1.9215116279069768, "grad_norm": 1.5262840772998052, "learning_rate": 3.7952485344921465e-08, "loss": 0.0425, "step": 1322 }, { "epoch": 1.9229651162790697, "grad_norm": 1.145845214252006, "learning_rate": 3.656155275660711e-08, "loss": 0.0455, "step": 1323 }, { "epoch": 1.9244186046511627, "grad_norm": 2.0089578761429916, "learning_rate": 3.5196493081366966e-08, "loss": 0.0779, "step": 1324 }, { "epoch": 1.9258720930232558, "grad_norm": 1.613161432653411, "learning_rate": 3.385731343484633e-08, "loss": 0.0759, "step": 1325 }, { "epoch": 1.927325581395349, "grad_norm": 1.1432087881946933, "learning_rate": 3.254402079778618e-08, "loss": 0.0547, "step": 1326 }, { "epoch": 1.9287790697674418, "grad_norm": 1.4249087522542705, "learning_rate": 3.125662201598656e-08, "loss": 0.0663, "step": 1327 }, { "epoch": 1.9302325581395348, "grad_norm": 1.2132853467810891, "learning_rate": 2.9995123800270476e-08, "loss": 0.0499, "step": 1328 }, { "epoch": 1.931686046511628, "grad_norm": 1.193012210008745, "learning_rate": 2.8759532726448937e-08, "loss": 0.0523, "step": 1329 }, { "epoch": 1.933139534883721, "grad_norm": 1.358877307198463, "learning_rate": 2.754985523528708e-08, "loss": 0.0695, "step": 1330 }, { "epoch": 1.934593023255814, "grad_norm": 1.6866286956608185, "learning_rate": 2.6366097632469778e-08, "loss": 0.0625, "step": 1331 }, { "epoch": 1.9360465116279069, "grad_norm": 1.585941621289744, "learning_rate": 2.5208266088569966e-08, "loss": 0.0511, "step": 1332 }, { "epoch": 1.9375, "grad_norm": 1.3872165917004133, "learning_rate": 2.4076366639015914e-08, "loss": 0.0568, "step": 1333 }, { "epoch": 1.9389534883720931, "grad_norm": 1.457747420185785, "learning_rate": 2.2970405184058463e-08, "loss": 0.0547, "step": 1334 }, { "epoch": 1.940406976744186, "grad_norm": 1.1949184875255823, "learning_rate": 2.1890387488742726e-08, "loss": 0.0399, "step": 1335 }, { "epoch": 1.941860465116279, "grad_norm": 1.5940091425661185, "learning_rate": 2.083631918287643e-08, "loss": 0.0518, "step": 1336 }, { "epoch": 1.943313953488372, "grad_norm": 1.382910110487527, "learning_rate": 1.9808205761001065e-08, "loss": 0.0532, "step": 1337 }, { "epoch": 1.9447674418604652, "grad_norm": 1.4736697924051638, "learning_rate": 1.880605258236301e-08, "loss": 0.0588, "step": 1338 }, { "epoch": 1.9462209302325582, "grad_norm": 1.295565534954337, "learning_rate": 1.782986487088467e-08, "loss": 0.0724, "step": 1339 }, { "epoch": 1.947674418604651, "grad_norm": 1.3488063432605657, "learning_rate": 1.6879647715140613e-08, "loss": 0.0923, "step": 1340 }, { "epoch": 1.9491279069767442, "grad_norm": 1.3676570070169922, "learning_rate": 1.5955406068326462e-08, "loss": 0.0515, "step": 1341 }, { "epoch": 1.9505813953488373, "grad_norm": 1.2884297596799819, "learning_rate": 1.5057144748236162e-08, "loss": 0.0462, "step": 1342 }, { "epoch": 1.9520348837209303, "grad_norm": 1.2443705721719838, "learning_rate": 1.4184868437236987e-08, "loss": 0.0576, "step": 1343 }, { "epoch": 1.9534883720930232, "grad_norm": 1.3053643973067475, "learning_rate": 1.333858168224178e-08, "loss": 0.0537, "step": 1344 }, { "epoch": 1.9549418604651163, "grad_norm": 1.546574318041773, "learning_rate": 1.2518288894690089e-08, "loss": 0.0372, "step": 1345 }, { "epoch": 1.9563953488372094, "grad_norm": 1.2490005145711904, "learning_rate": 1.1723994350521518e-08, "loss": 0.0541, "step": 1346 }, { "epoch": 1.9578488372093024, "grad_norm": 1.5464557013202604, "learning_rate": 1.0955702190154072e-08, "loss": 0.0558, "step": 1347 }, { "epoch": 1.9593023255813953, "grad_norm": 1.2582934072656768, "learning_rate": 1.0213416418465294e-08, "loss": 0.0567, "step": 1348 }, { "epoch": 1.9607558139534884, "grad_norm": 1.9089698225345129, "learning_rate": 9.497140904766722e-09, "loss": 0.0556, "step": 1349 }, { "epoch": 1.9622093023255816, "grad_norm": 1.509899723394668, "learning_rate": 8.806879382788347e-09, "loss": 0.0603, "step": 1350 }, { "epoch": 1.9636627906976745, "grad_norm": 1.4511032534590458, "learning_rate": 8.142635450654746e-09, "loss": 0.0762, "step": 1351 }, { "epoch": 1.9651162790697674, "grad_norm": 1.6080149924866183, "learning_rate": 7.5044125708712e-09, "loss": 0.0609, "step": 1352 }, { "epoch": 1.9665697674418605, "grad_norm": 1.1922215779000969, "learning_rate": 6.89221407030094e-09, "loss": 0.0444, "step": 1353 }, { "epoch": 1.9680232558139537, "grad_norm": 1.3256995449929614, "learning_rate": 6.3060431401512634e-09, "loss": 0.0474, "step": 1354 }, { "epoch": 1.9694767441860463, "grad_norm": 1.1949382933968422, "learning_rate": 5.7459028359546645e-09, "loss": 0.0467, "step": 1355 }, { "epoch": 1.9709302325581395, "grad_norm": 1.693812650124421, "learning_rate": 5.211796077554399e-09, "loss": 0.0645, "step": 1356 }, { "epoch": 1.9723837209302326, "grad_norm": 1.454630218156985, "learning_rate": 4.703725649088941e-09, "loss": 0.0514, "step": 1357 }, { "epoch": 1.9738372093023255, "grad_norm": 1.4686563612295154, "learning_rate": 4.221694198976445e-09, "loss": 0.0654, "step": 1358 }, { "epoch": 1.9752906976744184, "grad_norm": 1.7602700723605103, "learning_rate": 3.765704239901413e-09, "loss": 0.0678, "step": 1359 }, { "epoch": 1.9767441860465116, "grad_norm": 1.2613024941113347, "learning_rate": 3.3357581488030476e-09, "loss": 0.0627, "step": 1360 }, { "epoch": 1.9781976744186047, "grad_norm": 1.7048864018499121, "learning_rate": 2.9318581668613676e-09, "loss": 0.0691, "step": 1361 }, { "epoch": 1.9796511627906976, "grad_norm": 1.1475541829943066, "learning_rate": 2.5540063994849982e-09, "loss": 0.0512, "step": 1362 }, { "epoch": 1.9811046511627906, "grad_norm": 1.6476817749953838, "learning_rate": 2.202204816302289e-09, "loss": 0.05, "step": 1363 }, { "epoch": 1.9825581395348837, "grad_norm": 1.0248136833336827, "learning_rate": 1.8764552511485457e-09, "loss": 0.0374, "step": 1364 }, { "epoch": 1.9840116279069768, "grad_norm": 1.248745978654899, "learning_rate": 1.576759402058814e-09, "loss": 0.0581, "step": 1365 }, { "epoch": 1.9854651162790697, "grad_norm": 1.4353092543675106, "learning_rate": 1.3031188312573328e-09, "loss": 0.0512, "step": 1366 }, { "epoch": 1.9869186046511627, "grad_norm": 1.6080768674831933, "learning_rate": 1.0555349651503178e-09, "loss": 0.063, "step": 1367 }, { "epoch": 1.9883720930232558, "grad_norm": 1.426444153430007, "learning_rate": 8.340090943176338e-10, "loss": 0.0526, "step": 1368 }, { "epoch": 1.989825581395349, "grad_norm": 1.3214408477997244, "learning_rate": 6.385423735078e-10, "loss": 0.0463, "step": 1369 }, { "epoch": 1.9912790697674418, "grad_norm": 1.7983231171481742, "learning_rate": 4.691358216291075e-10, "loss": 0.058, "step": 1370 }, { "epoch": 1.9927325581395348, "grad_norm": 1.4390939528927378, "learning_rate": 3.257903217479541e-10, "loss": 0.0639, "step": 1371 }, { "epoch": 1.994186046511628, "grad_norm": 1.1021839312652135, "learning_rate": 2.0850662108051755e-10, "loss": 0.0402, "step": 1372 }, { "epoch": 1.995639534883721, "grad_norm": 1.3590522618876162, "learning_rate": 1.1728533099220063e-10, "loss": 0.0509, "step": 1373 }, { "epoch": 1.997093023255814, "grad_norm": 1.4976019289330444, "learning_rate": 5.2126926991524774e-11, "loss": 0.0671, "step": 1374 }, { "epoch": 1.9985465116279069, "grad_norm": 1.3973669217539937, "learning_rate": 1.3031748730685246e-11, "loss": 0.0565, "step": 1375 }, { "epoch": 2.0, "grad_norm": 0.8399439880889858, "learning_rate": 0.0, "loss": 0.0345, "step": 1376 }, { "epoch": 2.0, "step": 1376, "total_flos": 7660524011520.0, "train_loss": 0.10411996046909629, "train_runtime": 1065.2694, "train_samples_per_second": 10.32, "train_steps_per_second": 1.292 } ], "logging_steps": 1, "max_steps": 1376, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7660524011520.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }