|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 200, |
|
"global_step": 754, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002652519893899204, |
|
"grad_norm": 3.143817743419433, |
|
"learning_rate": 9.999956599329999e-06, |
|
"loss": 0.4703, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005305039787798408, |
|
"grad_norm": 2.9153346228246715, |
|
"learning_rate": 9.999826398073438e-06, |
|
"loss": 0.4676, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007957559681697613, |
|
"grad_norm": 2.1837013669916683, |
|
"learning_rate": 9.999609398490651e-06, |
|
"loss": 0.3752, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010610079575596816, |
|
"grad_norm": 2.7009872655116145, |
|
"learning_rate": 9.999305604348804e-06, |
|
"loss": 0.494, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013262599469496022, |
|
"grad_norm": 2.71509007103024, |
|
"learning_rate": 9.998915020921847e-06, |
|
"loss": 0.4101, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015915119363395226, |
|
"grad_norm": 2.046773566123141, |
|
"learning_rate": 9.998437654990412e-06, |
|
"loss": 0.3118, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01856763925729443, |
|
"grad_norm": 1.6805054009773157, |
|
"learning_rate": 9.997873514841703e-06, |
|
"loss": 0.2808, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.021220159151193633, |
|
"grad_norm": 1.9031301046910736, |
|
"learning_rate": 9.997222610269339e-06, |
|
"loss": 0.288, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.023872679045092837, |
|
"grad_norm": 2.2790419571248, |
|
"learning_rate": 9.996484952573203e-06, |
|
"loss": 0.3187, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.026525198938992044, |
|
"grad_norm": 2.005476307323896, |
|
"learning_rate": 9.995660554559225e-06, |
|
"loss": 0.2684, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.029177718832891247, |
|
"grad_norm": 2.1118955586623542, |
|
"learning_rate": 9.99474943053918e-06, |
|
"loss": 0.2861, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03183023872679045, |
|
"grad_norm": 1.9084863447700302, |
|
"learning_rate": 9.993751596330421e-06, |
|
"loss": 0.2447, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.034482758620689655, |
|
"grad_norm": 2.443033048528926, |
|
"learning_rate": 9.99266706925562e-06, |
|
"loss": 0.295, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03713527851458886, |
|
"grad_norm": 2.252694252488703, |
|
"learning_rate": 9.991495868142457e-06, |
|
"loss": 0.2895, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03978779840848806, |
|
"grad_norm": 2.134403724246919, |
|
"learning_rate": 9.990238013323298e-06, |
|
"loss": 0.2704, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.042440318302387266, |
|
"grad_norm": 2.28401373031769, |
|
"learning_rate": 9.98889352663484e-06, |
|
"loss": 0.2648, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04509283819628647, |
|
"grad_norm": 2.1064551964073286, |
|
"learning_rate": 9.987462431417732e-06, |
|
"loss": 0.2033, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04774535809018567, |
|
"grad_norm": 2.1590960874250102, |
|
"learning_rate": 9.985944752516168e-06, |
|
"loss": 0.206, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.050397877984084884, |
|
"grad_norm": 1.787435615555535, |
|
"learning_rate": 9.984340516277464e-06, |
|
"loss": 0.2446, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05305039787798409, |
|
"grad_norm": 2.154695176620596, |
|
"learning_rate": 9.982649750551589e-06, |
|
"loss": 0.2428, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05570291777188329, |
|
"grad_norm": 2.013092950603686, |
|
"learning_rate": 9.980872484690689e-06, |
|
"loss": 0.2095, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.058355437665782495, |
|
"grad_norm": 1.871221689867333, |
|
"learning_rate": 9.979008749548575e-06, |
|
"loss": 0.2119, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0610079575596817, |
|
"grad_norm": 2.1428411294867975, |
|
"learning_rate": 9.977058577480192e-06, |
|
"loss": 0.2031, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0636604774535809, |
|
"grad_norm": 2.029431401764423, |
|
"learning_rate": 9.975022002341045e-06, |
|
"loss": 0.2279, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06631299734748011, |
|
"grad_norm": 1.9037807432106393, |
|
"learning_rate": 9.972899059486629e-06, |
|
"loss": 0.2236, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 1.5412274357921503, |
|
"learning_rate": 9.970689785771798e-06, |
|
"loss": 0.1816, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07161803713527852, |
|
"grad_norm": 2.230329801114519, |
|
"learning_rate": 9.968394219550136e-06, |
|
"loss": 0.2428, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07427055702917772, |
|
"grad_norm": 1.902794098881427, |
|
"learning_rate": 9.966012400673291e-06, |
|
"loss": 0.2154, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 1.8780742288675212, |
|
"learning_rate": 9.96354437049027e-06, |
|
"loss": 0.1972, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07957559681697612, |
|
"grad_norm": 1.8047024624739505, |
|
"learning_rate": 9.960990171846745e-06, |
|
"loss": 0.2003, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08222811671087533, |
|
"grad_norm": 1.99420196004025, |
|
"learning_rate": 9.958349849084286e-06, |
|
"loss": 0.2229, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08488063660477453, |
|
"grad_norm": 1.9295211213650523, |
|
"learning_rate": 9.955623448039605e-06, |
|
"loss": 0.2077, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08753315649867374, |
|
"grad_norm": 1.9560816295756256, |
|
"learning_rate": 9.952811016043753e-06, |
|
"loss": 0.2035, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09018567639257294, |
|
"grad_norm": 1.8994575592237317, |
|
"learning_rate": 9.949912601921306e-06, |
|
"loss": 0.2365, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09283819628647215, |
|
"grad_norm": 1.7820353508663918, |
|
"learning_rate": 9.946928255989507e-06, |
|
"loss": 0.1889, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09549071618037135, |
|
"grad_norm": 1.9972563802629155, |
|
"learning_rate": 9.943858030057404e-06, |
|
"loss": 0.1992, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09814323607427056, |
|
"grad_norm": 1.9858847962279158, |
|
"learning_rate": 9.94070197742494e-06, |
|
"loss": 0.2126, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10079575596816977, |
|
"grad_norm": 2.2468613321703375, |
|
"learning_rate": 9.937460152882035e-06, |
|
"loss": 0.1966, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10344827586206896, |
|
"grad_norm": 1.9047848108957046, |
|
"learning_rate": 9.934132612707631e-06, |
|
"loss": 0.1929, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10610079575596817, |
|
"grad_norm": 1.5198039037792352, |
|
"learning_rate": 9.93071941466872e-06, |
|
"loss": 0.1454, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10875331564986737, |
|
"grad_norm": 1.942256631038327, |
|
"learning_rate": 9.927220618019331e-06, |
|
"loss": 0.1927, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11140583554376658, |
|
"grad_norm": 1.9039836584761076, |
|
"learning_rate": 9.923636283499513e-06, |
|
"loss": 0.1683, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11405835543766578, |
|
"grad_norm": 2.3882871093584015, |
|
"learning_rate": 9.919966473334278e-06, |
|
"loss": 0.1925, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11671087533156499, |
|
"grad_norm": 1.9482729522397495, |
|
"learning_rate": 9.916211251232507e-06, |
|
"loss": 0.204, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11936339522546419, |
|
"grad_norm": 1.9541832414635472, |
|
"learning_rate": 9.912370682385866e-06, |
|
"loss": 0.1718, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1220159151193634, |
|
"grad_norm": 2.2469271806449744, |
|
"learning_rate": 9.908444833467659e-06, |
|
"loss": 0.2099, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1246684350132626, |
|
"grad_norm": 1.9273893068530785, |
|
"learning_rate": 9.904433772631674e-06, |
|
"loss": 0.1896, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1273209549071618, |
|
"grad_norm": 2.020005570778388, |
|
"learning_rate": 9.900337569511003e-06, |
|
"loss": 0.2135, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.129973474801061, |
|
"grad_norm": 1.787168676896887, |
|
"learning_rate": 9.896156295216832e-06, |
|
"loss": 0.1895, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13262599469496023, |
|
"grad_norm": 1.9492709359169609, |
|
"learning_rate": 9.891890022337201e-06, |
|
"loss": 0.1948, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13527851458885942, |
|
"grad_norm": 1.4707392860636022, |
|
"learning_rate": 9.88753882493575e-06, |
|
"loss": 0.1494, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 1.8405786005165676, |
|
"learning_rate": 9.883102778550434e-06, |
|
"loss": 0.1752, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14058355437665782, |
|
"grad_norm": 1.8735255704087945, |
|
"learning_rate": 9.878581960192206e-06, |
|
"loss": 0.1779, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14323607427055704, |
|
"grad_norm": 2.0141493172675746, |
|
"learning_rate": 9.873976448343685e-06, |
|
"loss": 0.2168, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14588859416445624, |
|
"grad_norm": 1.7017677167638687, |
|
"learning_rate": 9.86928632295779e-06, |
|
"loss": 0.1589, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14854111405835543, |
|
"grad_norm": 1.6283124591177214, |
|
"learning_rate": 9.864511665456355e-06, |
|
"loss": 0.1399, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15119363395225463, |
|
"grad_norm": 2.2298913035183845, |
|
"learning_rate": 9.859652558728714e-06, |
|
"loss": 0.2046, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 2.12248040462289, |
|
"learning_rate": 9.854709087130261e-06, |
|
"loss": 0.1859, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.15649867374005305, |
|
"grad_norm": 2.4614522882556247, |
|
"learning_rate": 9.84968133648099e-06, |
|
"loss": 0.2284, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15915119363395225, |
|
"grad_norm": 2.0728088827577578, |
|
"learning_rate": 9.844569394063997e-06, |
|
"loss": 0.1826, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16180371352785147, |
|
"grad_norm": 1.626832715567298, |
|
"learning_rate": 9.839373348623976e-06, |
|
"loss": 0.144, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16445623342175067, |
|
"grad_norm": 2.095258455090102, |
|
"learning_rate": 9.834093290365665e-06, |
|
"loss": 0.1762, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.16710875331564987, |
|
"grad_norm": 2.1180496536142495, |
|
"learning_rate": 9.828729310952292e-06, |
|
"loss": 0.1754, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16976127320954906, |
|
"grad_norm": 2.523281048591522, |
|
"learning_rate": 9.823281503503976e-06, |
|
"loss": 0.2063, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 1.7605117934687937, |
|
"learning_rate": 9.817749962596115e-06, |
|
"loss": 0.1513, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.17506631299734748, |
|
"grad_norm": 1.8068568519736308, |
|
"learning_rate": 9.812134784257743e-06, |
|
"loss": 0.1695, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17771883289124668, |
|
"grad_norm": 2.194003391788825, |
|
"learning_rate": 9.80643606596986e-06, |
|
"loss": 0.1873, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18037135278514588, |
|
"grad_norm": 1.8221102873639325, |
|
"learning_rate": 9.80065390666374e-06, |
|
"loss": 0.1709, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1830238726790451, |
|
"grad_norm": 1.81681516720243, |
|
"learning_rate": 9.794788406719223e-06, |
|
"loss": 0.1611, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1856763925729443, |
|
"grad_norm": 1.6215357337836858, |
|
"learning_rate": 9.788839667962956e-06, |
|
"loss": 0.1491, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1883289124668435, |
|
"grad_norm": 1.9544453657582719, |
|
"learning_rate": 9.78280779366664e-06, |
|
"loss": 0.1666, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1909814323607427, |
|
"grad_norm": 1.9891058976186013, |
|
"learning_rate": 9.77669288854523e-06, |
|
"loss": 0.1652, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19363395225464192, |
|
"grad_norm": 1.7657716439562443, |
|
"learning_rate": 9.770495058755113e-06, |
|
"loss": 0.1665, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.1962864721485411, |
|
"grad_norm": 2.0702179893057657, |
|
"learning_rate": 9.764214411892283e-06, |
|
"loss": 0.1839, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1989389920424403, |
|
"grad_norm": 1.94174040080256, |
|
"learning_rate": 9.757851056990446e-06, |
|
"loss": 0.1514, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.20159151193633953, |
|
"grad_norm": 2.0359509600829355, |
|
"learning_rate": 9.751405104519151e-06, |
|
"loss": 0.168, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20424403183023873, |
|
"grad_norm": 1.8488897921580416, |
|
"learning_rate": 9.744876666381861e-06, |
|
"loss": 0.1642, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 1.9493633096974006, |
|
"learning_rate": 9.738265855914014e-06, |
|
"loss": 0.1451, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20954907161803712, |
|
"grad_norm": 1.969779554700488, |
|
"learning_rate": 9.731572787881045e-06, |
|
"loss": 0.1738, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21220159151193635, |
|
"grad_norm": 1.6792825100474953, |
|
"learning_rate": 9.724797578476414e-06, |
|
"loss": 0.1243, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21485411140583555, |
|
"grad_norm": 1.9140356369318925, |
|
"learning_rate": 9.71794034531957e-06, |
|
"loss": 0.1591, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.21750663129973474, |
|
"grad_norm": 1.7940577658445638, |
|
"learning_rate": 9.711001207453919e-06, |
|
"loss": 0.1411, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22015915119363394, |
|
"grad_norm": 2.383028621766121, |
|
"learning_rate": 9.703980285344752e-06, |
|
"loss": 0.1752, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.22281167108753316, |
|
"grad_norm": 2.051261568492118, |
|
"learning_rate": 9.696877700877162e-06, |
|
"loss": 0.1974, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22546419098143236, |
|
"grad_norm": 1.694394298477383, |
|
"learning_rate": 9.689693577353917e-06, |
|
"loss": 0.1395, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.22811671087533156, |
|
"grad_norm": 1.8051445315092405, |
|
"learning_rate": 9.682428039493325e-06, |
|
"loss": 0.1576, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 1.988060978591615, |
|
"learning_rate": 9.675081213427076e-06, |
|
"loss": 0.152, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.23342175066312998, |
|
"grad_norm": 1.6413056627568852, |
|
"learning_rate": 9.667653226698033e-06, |
|
"loss": 0.157, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23607427055702918, |
|
"grad_norm": 1.7068088087742874, |
|
"learning_rate": 9.660144208258039e-06, |
|
"loss": 0.1518, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.23872679045092837, |
|
"grad_norm": 2.0128746338247856, |
|
"learning_rate": 9.652554288465668e-06, |
|
"loss": 0.1792, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2413793103448276, |
|
"grad_norm": 1.7874268938466609, |
|
"learning_rate": 9.644883599083959e-06, |
|
"loss": 0.1459, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2440318302387268, |
|
"grad_norm": 2.15165919348902, |
|
"learning_rate": 9.637132273278135e-06, |
|
"loss": 0.1745, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.246684350132626, |
|
"grad_norm": 2.3979359764341415, |
|
"learning_rate": 9.629300445613294e-06, |
|
"loss": 0.2075, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2493368700265252, |
|
"grad_norm": 2.250801983136737, |
|
"learning_rate": 9.62138825205206e-06, |
|
"loss": 0.1958, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2519893899204244, |
|
"grad_norm": 2.021920489405669, |
|
"learning_rate": 9.613395829952233e-06, |
|
"loss": 0.1608, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2546419098143236, |
|
"grad_norm": 2.2291104046239076, |
|
"learning_rate": 9.605323318064403e-06, |
|
"loss": 0.196, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2572944297082228, |
|
"grad_norm": 1.7681014524697132, |
|
"learning_rate": 9.59717085652954e-06, |
|
"loss": 0.1367, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.259946949602122, |
|
"grad_norm": 2.186660089739903, |
|
"learning_rate": 9.588938586876564e-06, |
|
"loss": 0.1485, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2625994694960212, |
|
"grad_norm": 2.2507510523601044, |
|
"learning_rate": 9.580626652019878e-06, |
|
"loss": 0.1795, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.26525198938992045, |
|
"grad_norm": 1.9046066926852447, |
|
"learning_rate": 9.5722351962569e-06, |
|
"loss": 0.1676, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26790450928381965, |
|
"grad_norm": 1.8505248856341252, |
|
"learning_rate": 9.563764365265553e-06, |
|
"loss": 0.1539, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.27055702917771884, |
|
"grad_norm": 2.0270020661057755, |
|
"learning_rate": 9.555214306101732e-06, |
|
"loss": 0.1702, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.27320954907161804, |
|
"grad_norm": 1.6795319874110077, |
|
"learning_rate": 9.546585167196755e-06, |
|
"loss": 0.1503, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 2.341752387924688, |
|
"learning_rate": 9.537877098354787e-06, |
|
"loss": 0.1766, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.27851458885941643, |
|
"grad_norm": 2.5362314705747093, |
|
"learning_rate": 9.529090250750234e-06, |
|
"loss": 0.1854, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.28116710875331563, |
|
"grad_norm": 1.6421242469941448, |
|
"learning_rate": 9.52022477692513e-06, |
|
"loss": 0.12, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2838196286472148, |
|
"grad_norm": 1.7705962516319114, |
|
"learning_rate": 9.511280830786471e-06, |
|
"loss": 0.15, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2864721485411141, |
|
"grad_norm": 1.8018966800140481, |
|
"learning_rate": 9.502258567603563e-06, |
|
"loss": 0.1446, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2891246684350133, |
|
"grad_norm": 2.154026465895243, |
|
"learning_rate": 9.493158144005314e-06, |
|
"loss": 0.175, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2917771883289125, |
|
"grad_norm": 2.091201257414692, |
|
"learning_rate": 9.483979717977513e-06, |
|
"loss": 0.1533, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.29442970822281167, |
|
"grad_norm": 2.049226329317327, |
|
"learning_rate": 9.474723448860096e-06, |
|
"loss": 0.1582, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.29708222811671087, |
|
"grad_norm": 1.6937916325037254, |
|
"learning_rate": 9.46538949734438e-06, |
|
"loss": 0.1319, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29973474801061006, |
|
"grad_norm": 2.2804043446780193, |
|
"learning_rate": 9.455978025470257e-06, |
|
"loss": 0.1744, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.30238726790450926, |
|
"grad_norm": 2.427110301571042, |
|
"learning_rate": 9.44648919662341e-06, |
|
"loss": 0.1972, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3050397877984085, |
|
"grad_norm": 2.4719255725338307, |
|
"learning_rate": 9.436923175532442e-06, |
|
"loss": 0.1863, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 1.9198368860593722, |
|
"learning_rate": 9.427280128266049e-06, |
|
"loss": 0.1428, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3103448275862069, |
|
"grad_norm": 1.9531949152663541, |
|
"learning_rate": 9.417560222230115e-06, |
|
"loss": 0.1723, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3129973474801061, |
|
"grad_norm": 2.1363324241184456, |
|
"learning_rate": 9.407763626164812e-06, |
|
"loss": 0.1758, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3156498673740053, |
|
"grad_norm": 1.8882375069693482, |
|
"learning_rate": 9.397890510141674e-06, |
|
"loss": 0.1248, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3183023872679045, |
|
"grad_norm": 2.1194646896664584, |
|
"learning_rate": 9.387941045560641e-06, |
|
"loss": 0.1589, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3209549071618037, |
|
"grad_norm": 1.5127624036717937, |
|
"learning_rate": 9.377915405147085e-06, |
|
"loss": 0.1446, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.32360742705570295, |
|
"grad_norm": 2.158233386717242, |
|
"learning_rate": 9.367813762948809e-06, |
|
"loss": 0.1584, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.32625994694960214, |
|
"grad_norm": 1.941105678166514, |
|
"learning_rate": 9.357636294333031e-06, |
|
"loss": 0.1272, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.32891246684350134, |
|
"grad_norm": 1.8205147183171333, |
|
"learning_rate": 9.347383175983333e-06, |
|
"loss": 0.1545, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.33156498673740054, |
|
"grad_norm": 1.6927779015644462, |
|
"learning_rate": 9.337054585896596e-06, |
|
"loss": 0.1202, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.33421750663129973, |
|
"grad_norm": 1.9276160632994672, |
|
"learning_rate": 9.326650703379913e-06, |
|
"loss": 0.1432, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.33687002652519893, |
|
"grad_norm": 1.9253639897291404, |
|
"learning_rate": 9.316171709047475e-06, |
|
"loss": 0.148, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3395225464190981, |
|
"grad_norm": 1.8807065660047437, |
|
"learning_rate": 9.305617784817426e-06, |
|
"loss": 0.1401, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3421750663129973, |
|
"grad_norm": 1.95250475420529, |
|
"learning_rate": 9.294989113908726e-06, |
|
"loss": 0.1548, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 2.3715397805842255, |
|
"learning_rate": 9.284285880837947e-06, |
|
"loss": 0.1756, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.34748010610079577, |
|
"grad_norm": 1.93914417403847, |
|
"learning_rate": 9.273508271416082e-06, |
|
"loss": 0.1399, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.35013262599469497, |
|
"grad_norm": 1.9560254067556455, |
|
"learning_rate": 9.262656472745324e-06, |
|
"loss": 0.1389, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.35278514588859416, |
|
"grad_norm": 1.9150433290760882, |
|
"learning_rate": 9.251730673215802e-06, |
|
"loss": 0.1407, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.35543766578249336, |
|
"grad_norm": 1.8135844757186297, |
|
"learning_rate": 9.240731062502323e-06, |
|
"loss": 0.1306, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.35809018567639256, |
|
"grad_norm": 2.162238929625922, |
|
"learning_rate": 9.229657831561082e-06, |
|
"loss": 0.1496, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.36074270557029176, |
|
"grad_norm": 2.055713876885785, |
|
"learning_rate": 9.218511172626333e-06, |
|
"loss": 0.1753, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.363395225464191, |
|
"grad_norm": 1.6946475181680336, |
|
"learning_rate": 9.207291279207058e-06, |
|
"loss": 0.1335, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3660477453580902, |
|
"grad_norm": 2.101107547619466, |
|
"learning_rate": 9.195998346083621e-06, |
|
"loss": 0.1493, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3687002652519894, |
|
"grad_norm": 1.8055607972774566, |
|
"learning_rate": 9.184632569304365e-06, |
|
"loss": 0.1262, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3713527851458886, |
|
"grad_norm": 1.690201779069974, |
|
"learning_rate": 9.173194146182219e-06, |
|
"loss": 0.1499, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3740053050397878, |
|
"grad_norm": 2.1397994585808955, |
|
"learning_rate": 9.161683275291275e-06, |
|
"loss": 0.157, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.376657824933687, |
|
"grad_norm": 1.9870860707292484, |
|
"learning_rate": 9.150100156463337e-06, |
|
"loss": 0.1529, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3793103448275862, |
|
"grad_norm": 2.207156912182982, |
|
"learning_rate": 9.138444990784455e-06, |
|
"loss": 0.1859, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3819628647214854, |
|
"grad_norm": 1.9713520675081648, |
|
"learning_rate": 9.126717980591422e-06, |
|
"loss": 0.1459, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 1.7065232206005256, |
|
"learning_rate": 9.114919329468283e-06, |
|
"loss": 0.1461, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.38726790450928383, |
|
"grad_norm": 2.07601306564582, |
|
"learning_rate": 9.103049242242781e-06, |
|
"loss": 0.1785, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.38992042440318303, |
|
"grad_norm": 1.776835787519842, |
|
"learning_rate": 9.091107924982814e-06, |
|
"loss": 0.1525, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3925729442970822, |
|
"grad_norm": 1.8663700533344434, |
|
"learning_rate": 9.079095584992848e-06, |
|
"loss": 0.1409, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3952254641909814, |
|
"grad_norm": 2.2784042310709407, |
|
"learning_rate": 9.067012430810326e-06, |
|
"loss": 0.1785, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3978779840848806, |
|
"grad_norm": 1.7721799125937119, |
|
"learning_rate": 9.05485867220204e-06, |
|
"loss": 0.1346, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4005305039787798, |
|
"grad_norm": 2.13950597469927, |
|
"learning_rate": 9.0426345201605e-06, |
|
"loss": 0.1652, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.40318302387267907, |
|
"grad_norm": 1.8810097271179547, |
|
"learning_rate": 9.03034018690026e-06, |
|
"loss": 0.1449, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.40583554376657827, |
|
"grad_norm": 2.4159078987026863, |
|
"learning_rate": 9.01797588585424e-06, |
|
"loss": 0.1683, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.40848806366047746, |
|
"grad_norm": 2.0980888650490668, |
|
"learning_rate": 9.00554183167002e-06, |
|
"loss": 0.1424, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.41114058355437666, |
|
"grad_norm": 2.062288135177168, |
|
"learning_rate": 8.993038240206114e-06, |
|
"loss": 0.1392, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 1.9204203170622036, |
|
"learning_rate": 8.98046532852822e-06, |
|
"loss": 0.1418, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.41644562334217505, |
|
"grad_norm": 2.08299140855318, |
|
"learning_rate": 8.967823314905452e-06, |
|
"loss": 0.1486, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.41909814323607425, |
|
"grad_norm": 1.844754808254566, |
|
"learning_rate": 8.95511241880656e-06, |
|
"loss": 0.1263, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4217506631299735, |
|
"grad_norm": 1.902160167609469, |
|
"learning_rate": 8.942332860896102e-06, |
|
"loss": 0.1435, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.4244031830238727, |
|
"grad_norm": 1.4047023346474345, |
|
"learning_rate": 8.929484863030631e-06, |
|
"loss": 0.1204, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4270557029177719, |
|
"grad_norm": 1.5383340677271746, |
|
"learning_rate": 8.91656864825483e-06, |
|
"loss": 0.1307, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4297082228116711, |
|
"grad_norm": 1.8646076049063893, |
|
"learning_rate": 8.903584440797652e-06, |
|
"loss": 0.1403, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4323607427055703, |
|
"grad_norm": 1.6149573488861944, |
|
"learning_rate": 8.890532466068417e-06, |
|
"loss": 0.1381, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4350132625994695, |
|
"grad_norm": 2.1580924076315715, |
|
"learning_rate": 8.877412950652907e-06, |
|
"loss": 0.1577, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4376657824933687, |
|
"grad_norm": 2.2128873070419375, |
|
"learning_rate": 8.864226122309423e-06, |
|
"loss": 0.1526, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4403183023872679, |
|
"grad_norm": 2.174869482049831, |
|
"learning_rate": 8.850972209964837e-06, |
|
"loss": 0.1473, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.44297082228116713, |
|
"grad_norm": 1.6128773258978748, |
|
"learning_rate": 8.837651443710623e-06, |
|
"loss": 0.1217, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.44562334217506633, |
|
"grad_norm": 1.8324959002361592, |
|
"learning_rate": 8.824264054798852e-06, |
|
"loss": 0.1509, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4482758620689655, |
|
"grad_norm": 1.708365841100358, |
|
"learning_rate": 8.810810275638183e-06, |
|
"loss": 0.124, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4509283819628647, |
|
"grad_norm": 1.7502467085637508, |
|
"learning_rate": 8.797290339789827e-06, |
|
"loss": 0.1299, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4535809018567639, |
|
"grad_norm": 2.060960880635761, |
|
"learning_rate": 8.783704481963498e-06, |
|
"loss": 0.1428, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4562334217506631, |
|
"grad_norm": 1.8464905297693734, |
|
"learning_rate": 8.770052938013323e-06, |
|
"loss": 0.1502, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4588859416445623, |
|
"grad_norm": 1.8087559826720645, |
|
"learning_rate": 8.756335944933768e-06, |
|
"loss": 0.1351, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 1.7162671809778718, |
|
"learning_rate": 8.742553740855507e-06, |
|
"loss": 0.1244, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.46419098143236076, |
|
"grad_norm": 2.25916831884875, |
|
"learning_rate": 8.728706565041296e-06, |
|
"loss": 0.176, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.46684350132625996, |
|
"grad_norm": 1.9927732799007924, |
|
"learning_rate": 8.714794657881818e-06, |
|
"loss": 0.1329, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.46949602122015915, |
|
"grad_norm": 1.9150506644030154, |
|
"learning_rate": 8.700818260891512e-06, |
|
"loss": 0.1455, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.47214854111405835, |
|
"grad_norm": 1.918021311795205, |
|
"learning_rate": 8.686777616704375e-06, |
|
"loss": 0.14, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.47480106100795755, |
|
"grad_norm": 2.0258194425535203, |
|
"learning_rate": 8.67267296906975e-06, |
|
"loss": 0.1676, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.47745358090185674, |
|
"grad_norm": 1.9171122906217821, |
|
"learning_rate": 8.658504562848104e-06, |
|
"loss": 0.1369, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.48010610079575594, |
|
"grad_norm": 2.218531055729975, |
|
"learning_rate": 8.644272644006764e-06, |
|
"loss": 0.152, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 2.0038883017886375, |
|
"learning_rate": 8.629977459615655e-06, |
|
"loss": 0.1426, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4854111405835544, |
|
"grad_norm": 1.7175445641951717, |
|
"learning_rate": 8.61561925784301e-06, |
|
"loss": 0.1249, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4880636604774536, |
|
"grad_norm": 1.815071151822735, |
|
"learning_rate": 8.601198287951059e-06, |
|
"loss": 0.1301, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4907161803713528, |
|
"grad_norm": 2.1561780015246463, |
|
"learning_rate": 8.586714800291704e-06, |
|
"loss": 0.1397, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.493368700265252, |
|
"grad_norm": 1.9294815320965064, |
|
"learning_rate": 8.572169046302174e-06, |
|
"loss": 0.141, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4960212201591512, |
|
"grad_norm": 1.659333790717064, |
|
"learning_rate": 8.557561278500656e-06, |
|
"loss": 0.1214, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4986737400530504, |
|
"grad_norm": 1.6753928986149604, |
|
"learning_rate": 8.542891750481913e-06, |
|
"loss": 0.1238, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5013262599469496, |
|
"grad_norm": 1.909985436763465, |
|
"learning_rate": 8.528160716912882e-06, |
|
"loss": 0.1189, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5039787798408488, |
|
"grad_norm": 1.956919058433968, |
|
"learning_rate": 8.513368433528255e-06, |
|
"loss": 0.1328, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.506631299734748, |
|
"grad_norm": 1.7825907922117281, |
|
"learning_rate": 8.498515157126038e-06, |
|
"loss": 0.1362, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5092838196286472, |
|
"grad_norm": 2.1745948430153232, |
|
"learning_rate": 8.483601145563087e-06, |
|
"loss": 0.1403, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5119363395225465, |
|
"grad_norm": 1.5767065409411627, |
|
"learning_rate": 8.46862665775064e-06, |
|
"loss": 0.121, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5145888594164456, |
|
"grad_norm": 2.1206690113650737, |
|
"learning_rate": 8.45359195364982e-06, |
|
"loss": 0.1435, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 2.0215180716784453, |
|
"learning_rate": 8.438497294267117e-06, |
|
"loss": 0.1362, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.519893899204244, |
|
"grad_norm": 2.2299825948556697, |
|
"learning_rate": 8.423342941649866e-06, |
|
"loss": 0.1337, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5225464190981433, |
|
"grad_norm": 2.262534468463096, |
|
"learning_rate": 8.40812915888169e-06, |
|
"loss": 0.1538, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5251989389920424, |
|
"grad_norm": 2.306393950737034, |
|
"learning_rate": 8.392856210077932e-06, |
|
"loss": 0.1696, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5278514588859416, |
|
"grad_norm": 1.9803460856435193, |
|
"learning_rate": 8.37752436038108e-06, |
|
"loss": 0.1307, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5305039787798409, |
|
"grad_norm": 1.7912221503357104, |
|
"learning_rate": 8.36213387595615e-06, |
|
"loss": 0.1292, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5305039787798409, |
|
"eval_loss": 0.14994381368160248, |
|
"eval_runtime": 1.3023, |
|
"eval_samples_per_second": 23.804, |
|
"eval_steps_per_second": 6.143, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.53315649867374, |
|
"grad_norm": 1.6059764600949016, |
|
"learning_rate": 8.34668502398608e-06, |
|
"loss": 0.106, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5358090185676393, |
|
"grad_norm": 1.706895090532193, |
|
"learning_rate": 8.331178072667079e-06, |
|
"loss": 0.1261, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 1.7309972720553148, |
|
"learning_rate": 8.315613291203977e-06, |
|
"loss": 0.1177, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5411140583554377, |
|
"grad_norm": 2.119393668480105, |
|
"learning_rate": 8.299990949805551e-06, |
|
"loss": 0.1512, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5437665782493368, |
|
"grad_norm": 2.1446534939795194, |
|
"learning_rate": 8.28431131967984e-06, |
|
"loss": 0.1451, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5464190981432361, |
|
"grad_norm": 2.3040675508103057, |
|
"learning_rate": 8.268574673029415e-06, |
|
"loss": 0.1528, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5490716180371353, |
|
"grad_norm": 1.828325753916526, |
|
"learning_rate": 8.252781283046688e-06, |
|
"loss": 0.1346, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 1.8710705926707558, |
|
"learning_rate": 8.23693142390914e-06, |
|
"loss": 0.1463, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5543766578249337, |
|
"grad_norm": 1.8384322195422618, |
|
"learning_rate": 8.22102537077457e-06, |
|
"loss": 0.1383, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5570291777188329, |
|
"grad_norm": 1.9467404806792865, |
|
"learning_rate": 8.205063399776326e-06, |
|
"loss": 0.1415, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5596816976127321, |
|
"grad_norm": 1.7702552779136205, |
|
"learning_rate": 8.189045788018502e-06, |
|
"loss": 0.141, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5623342175066313, |
|
"grad_norm": 2.0515294131254525, |
|
"learning_rate": 8.172972813571132e-06, |
|
"loss": 0.1527, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5649867374005305, |
|
"grad_norm": 2.0940031140217865, |
|
"learning_rate": 8.156844755465357e-06, |
|
"loss": 0.15, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5676392572944297, |
|
"grad_norm": 1.9660531752973018, |
|
"learning_rate": 8.14066189368859e-06, |
|
"loss": 0.1408, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5702917771883289, |
|
"grad_norm": 2.1910231506350186, |
|
"learning_rate": 8.124424509179648e-06, |
|
"loss": 0.1463, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5729442970822282, |
|
"grad_norm": 1.5430710287754699, |
|
"learning_rate": 8.108132883823878e-06, |
|
"loss": 0.1097, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5755968169761273, |
|
"grad_norm": 1.8265329192409332, |
|
"learning_rate": 8.091787300448264e-06, |
|
"loss": 0.1395, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5782493368700266, |
|
"grad_norm": 1.8822040988497644, |
|
"learning_rate": 8.07538804281651e-06, |
|
"loss": 0.1298, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5809018567639257, |
|
"grad_norm": 2.0002279443157445, |
|
"learning_rate": 8.058935395624128e-06, |
|
"loss": 0.1421, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.583554376657825, |
|
"grad_norm": 2.1521750298282964, |
|
"learning_rate": 8.042429644493479e-06, |
|
"loss": 0.1379, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5862068965517241, |
|
"grad_norm": 1.5640533332285234, |
|
"learning_rate": 8.025871075968828e-06, |
|
"loss": 0.1228, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5888594164456233, |
|
"grad_norm": 2.0211033096735864, |
|
"learning_rate": 8.00925997751136e-06, |
|
"loss": 0.1402, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5915119363395226, |
|
"grad_norm": 1.7401492380650114, |
|
"learning_rate": 7.992596637494199e-06, |
|
"loss": 0.1223, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5941644562334217, |
|
"grad_norm": 1.5482263543431414, |
|
"learning_rate": 7.975881345197394e-06, |
|
"loss": 0.1278, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.596816976127321, |
|
"grad_norm": 2.012786559192079, |
|
"learning_rate": 7.959114390802894e-06, |
|
"loss": 0.1322, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5994694960212201, |
|
"grad_norm": 1.9101656055747547, |
|
"learning_rate": 7.942296065389528e-06, |
|
"loss": 0.1411, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6021220159151194, |
|
"grad_norm": 1.8476462070875777, |
|
"learning_rate": 7.925426660927926e-06, |
|
"loss": 0.1325, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6047745358090185, |
|
"grad_norm": 1.9254949919992999, |
|
"learning_rate": 7.908506470275474e-06, |
|
"loss": 0.1226, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6074270557029178, |
|
"grad_norm": 2.0758812690239052, |
|
"learning_rate": 7.891535787171216e-06, |
|
"loss": 0.1263, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.610079575596817, |
|
"grad_norm": 1.9368560259395753, |
|
"learning_rate": 7.874514906230757e-06, |
|
"loss": 0.1308, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6127320954907162, |
|
"grad_norm": 1.7832950240034287, |
|
"learning_rate": 7.857444122941155e-06, |
|
"loss": 0.1229, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 1.9284054045996681, |
|
"learning_rate": 7.84032373365578e-06, |
|
"loss": 0.1315, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6180371352785146, |
|
"grad_norm": 1.8237229630031635, |
|
"learning_rate": 7.82315403558918e-06, |
|
"loss": 0.1263, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 2.3980431877447552, |
|
"learning_rate": 7.805935326811913e-06, |
|
"loss": 0.1594, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.623342175066313, |
|
"grad_norm": 1.908328991517894, |
|
"learning_rate": 7.78866790624538e-06, |
|
"loss": 0.1221, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6259946949602122, |
|
"grad_norm": 1.9070140557762951, |
|
"learning_rate": 7.771352073656628e-06, |
|
"loss": 0.1099, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6286472148541115, |
|
"grad_norm": 1.9086850445705497, |
|
"learning_rate": 7.753988129653152e-06, |
|
"loss": 0.1217, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6312997347480106, |
|
"grad_norm": 2.298658640053533, |
|
"learning_rate": 7.736576375677676e-06, |
|
"loss": 0.1534, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6339522546419099, |
|
"grad_norm": 2.012272147602231, |
|
"learning_rate": 7.719117114002912e-06, |
|
"loss": 0.1367, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.636604774535809, |
|
"grad_norm": 2.054870475879218, |
|
"learning_rate": 7.701610647726323e-06, |
|
"loss": 0.1528, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6392572944297082, |
|
"grad_norm": 1.931987531988978, |
|
"learning_rate": 7.684057280764855e-06, |
|
"loss": 0.1359, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6419098143236074, |
|
"grad_norm": 1.6653702816540303, |
|
"learning_rate": 7.666457317849663e-06, |
|
"loss": 0.1271, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6445623342175066, |
|
"grad_norm": 1.8059464643897476, |
|
"learning_rate": 7.648811064520821e-06, |
|
"loss": 0.1355, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6472148541114059, |
|
"grad_norm": 1.8078224433971741, |
|
"learning_rate": 7.631118827122013e-06, |
|
"loss": 0.1202, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.649867374005305, |
|
"grad_norm": 1.948459636065191, |
|
"learning_rate": 7.613380912795225e-06, |
|
"loss": 0.1429, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6525198938992043, |
|
"grad_norm": 2.2076773590525316, |
|
"learning_rate": 7.595597629475402e-06, |
|
"loss": 0.1516, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6551724137931034, |
|
"grad_norm": 1.8887891109191988, |
|
"learning_rate": 7.57776928588511e-06, |
|
"loss": 0.1334, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6578249336870027, |
|
"grad_norm": 2.3599876519640297, |
|
"learning_rate": 7.559896191529169e-06, |
|
"loss": 0.1523, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6604774535809018, |
|
"grad_norm": 1.6073366184090068, |
|
"learning_rate": 7.54197865668929e-06, |
|
"loss": 0.1094, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6631299734748011, |
|
"grad_norm": 1.6066208266739346, |
|
"learning_rate": 7.524016992418676e-06, |
|
"loss": 0.1133, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6657824933687002, |
|
"grad_norm": 2.402024874117795, |
|
"learning_rate": 7.506011510536635e-06, |
|
"loss": 0.1683, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6684350132625995, |
|
"grad_norm": 2.4880666218264955, |
|
"learning_rate": 7.487962523623159e-06, |
|
"loss": 0.1604, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6710875331564987, |
|
"grad_norm": 2.1970208087255076, |
|
"learning_rate": 7.469870345013495e-06, |
|
"loss": 0.1261, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6737400530503979, |
|
"grad_norm": 1.949833364046787, |
|
"learning_rate": 7.451735288792716e-06, |
|
"loss": 0.1283, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6763925729442971, |
|
"grad_norm": 2.043764786717034, |
|
"learning_rate": 7.4335576697902546e-06, |
|
"loss": 0.1217, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6790450928381963, |
|
"grad_norm": 1.7800715009896282, |
|
"learning_rate": 7.415337803574449e-06, |
|
"loss": 0.1134, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6816976127320955, |
|
"grad_norm": 2.036030495974864, |
|
"learning_rate": 7.3970760064470634e-06, |
|
"loss": 0.1346, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6843501326259946, |
|
"grad_norm": 1.7953348831678584, |
|
"learning_rate": 7.378772595437785e-06, |
|
"loss": 0.1068, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6870026525198939, |
|
"grad_norm": 2.151685228258153, |
|
"learning_rate": 7.360427888298737e-06, |
|
"loss": 0.1374, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 1.6830864009549993, |
|
"learning_rate": 7.342042203498952e-06, |
|
"loss": 0.0968, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 2.0738297934683545, |
|
"learning_rate": 7.323615860218844e-06, |
|
"loss": 0.1274, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6949602122015915, |
|
"grad_norm": 2.416943322684093, |
|
"learning_rate": 7.3051491783446705e-06, |
|
"loss": 0.1395, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6976127320954907, |
|
"grad_norm": 1.8473933982642847, |
|
"learning_rate": 7.2866424784629806e-06, |
|
"loss": 0.1189, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7002652519893899, |
|
"grad_norm": 1.990601440702459, |
|
"learning_rate": 7.26809608185504e-06, |
|
"loss": 0.1202, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7029177718832891, |
|
"grad_norm": 1.904814403012209, |
|
"learning_rate": 7.249510310491268e-06, |
|
"loss": 0.1208, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7055702917771883, |
|
"grad_norm": 1.8589441949655492, |
|
"learning_rate": 7.230885487025635e-06, |
|
"loss": 0.1181, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7082228116710876, |
|
"grad_norm": 1.6795096831283485, |
|
"learning_rate": 7.212221934790067e-06, |
|
"loss": 0.1192, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7108753315649867, |
|
"grad_norm": 1.723252840070737, |
|
"learning_rate": 7.193519977788834e-06, |
|
"loss": 0.1236, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.713527851458886, |
|
"grad_norm": 2.418801413723791, |
|
"learning_rate": 7.174779940692922e-06, |
|
"loss": 0.1544, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7161803713527851, |
|
"grad_norm": 1.770338034548602, |
|
"learning_rate": 7.1560021488343956e-06, |
|
"loss": 0.1286, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7188328912466844, |
|
"grad_norm": 1.6417602409918115, |
|
"learning_rate": 7.1371869282007545e-06, |
|
"loss": 0.1142, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7214854111405835, |
|
"grad_norm": 2.2143362328692207, |
|
"learning_rate": 7.118334605429272e-06, |
|
"loss": 0.1542, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7241379310344828, |
|
"grad_norm": 2.316061054037307, |
|
"learning_rate": 7.099445507801324e-06, |
|
"loss": 0.1376, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.726790450928382, |
|
"grad_norm": 2.184752996333078, |
|
"learning_rate": 7.080519963236706e-06, |
|
"loss": 0.1268, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7294429708222812, |
|
"grad_norm": 1.840884510423046, |
|
"learning_rate": 7.0615583002879465e-06, |
|
"loss": 0.1346, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7320954907161804, |
|
"grad_norm": 1.811306716138802, |
|
"learning_rate": 7.042560848134592e-06, |
|
"loss": 0.1149, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7347480106100795, |
|
"grad_norm": 2.0428053121611476, |
|
"learning_rate": 7.023527936577507e-06, |
|
"loss": 0.1407, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7374005305039788, |
|
"grad_norm": 1.6116834909252478, |
|
"learning_rate": 7.004459896033137e-06, |
|
"loss": 0.1193, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7400530503978779, |
|
"grad_norm": 2.1213770604701976, |
|
"learning_rate": 6.985357057527774e-06, |
|
"loss": 0.1434, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7427055702917772, |
|
"grad_norm": 1.91726573137005, |
|
"learning_rate": 6.966219752691814e-06, |
|
"loss": 0.125, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7453580901856764, |
|
"grad_norm": 1.67695736254988, |
|
"learning_rate": 6.947048313753998e-06, |
|
"loss": 0.1338, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7480106100795756, |
|
"grad_norm": 2.489354393443295, |
|
"learning_rate": 6.927843073535645e-06, |
|
"loss": 0.1473, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7506631299734748, |
|
"grad_norm": 1.7295592511699343, |
|
"learning_rate": 6.9086043654448734e-06, |
|
"loss": 0.1256, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.753315649867374, |
|
"grad_norm": 1.9395856232269375, |
|
"learning_rate": 6.889332523470808e-06, |
|
"loss": 0.1195, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7559681697612732, |
|
"grad_norm": 2.066706586377597, |
|
"learning_rate": 6.870027882177791e-06, |
|
"loss": 0.1358, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7586206896551724, |
|
"grad_norm": 2.0770046146432395, |
|
"learning_rate": 6.850690776699574e-06, |
|
"loss": 0.1429, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7612732095490716, |
|
"grad_norm": 1.881483786265112, |
|
"learning_rate": 6.831321542733482e-06, |
|
"loss": 0.1266, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7639257294429708, |
|
"grad_norm": 1.695062472576317, |
|
"learning_rate": 6.811920516534616e-06, |
|
"loss": 0.1202, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.76657824933687, |
|
"grad_norm": 2.1054959851607506, |
|
"learning_rate": 6.7924880349099855e-06, |
|
"loss": 0.137, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 1.559652722034954, |
|
"learning_rate": 6.773024435212678e-06, |
|
"loss": 0.0985, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7718832891246684, |
|
"grad_norm": 2.2208427457963342, |
|
"learning_rate": 6.753530055336006e-06, |
|
"loss": 0.1466, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7745358090185677, |
|
"grad_norm": 2.1893216240483753, |
|
"learning_rate": 6.734005233707624e-06, |
|
"loss": 0.1537, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7771883289124668, |
|
"grad_norm": 2.3583536185079703, |
|
"learning_rate": 6.714450309283671e-06, |
|
"loss": 0.1384, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7798408488063661, |
|
"grad_norm": 2.116909607447533, |
|
"learning_rate": 6.694865621542873e-06, |
|
"loss": 0.1525, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7824933687002652, |
|
"grad_norm": 1.553790399379545, |
|
"learning_rate": 6.675251510480662e-06, |
|
"loss": 0.1074, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7851458885941645, |
|
"grad_norm": 1.702247544650182, |
|
"learning_rate": 6.655608316603257e-06, |
|
"loss": 0.1037, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7877984084880637, |
|
"grad_norm": 1.4027176458517787, |
|
"learning_rate": 6.635936380921774e-06, |
|
"loss": 0.0842, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7904509283819628, |
|
"grad_norm": 1.8615502107309188, |
|
"learning_rate": 6.616236044946283e-06, |
|
"loss": 0.1335, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7931034482758621, |
|
"grad_norm": 1.6648222471112546, |
|
"learning_rate": 6.5965076506799e-06, |
|
"loss": 0.1181, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7957559681697612, |
|
"grad_norm": 1.9881029740135223, |
|
"learning_rate": 6.576751540612835e-06, |
|
"loss": 0.1112, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7984084880636605, |
|
"grad_norm": 2.2013679784949667, |
|
"learning_rate": 6.556968057716457e-06, |
|
"loss": 0.1237, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8010610079575596, |
|
"grad_norm": 2.1049495467849697, |
|
"learning_rate": 6.537157545437326e-06, |
|
"loss": 0.14, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8037135278514589, |
|
"grad_norm": 1.9391684954075283, |
|
"learning_rate": 6.517320347691245e-06, |
|
"loss": 0.1432, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8063660477453581, |
|
"grad_norm": 1.897903152535271, |
|
"learning_rate": 6.497456808857286e-06, |
|
"loss": 0.1242, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8090185676392573, |
|
"grad_norm": 2.400907859514575, |
|
"learning_rate": 6.477567273771807e-06, |
|
"loss": 0.1363, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8116710875331565, |
|
"grad_norm": 2.0519874786243095, |
|
"learning_rate": 6.4576520877224644e-06, |
|
"loss": 0.149, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8143236074270557, |
|
"grad_norm": 1.613637672772973, |
|
"learning_rate": 6.437711596442228e-06, |
|
"loss": 0.1078, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8169761273209549, |
|
"grad_norm": 2.0653398136129675, |
|
"learning_rate": 6.4177461461033675e-06, |
|
"loss": 0.1416, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8196286472148541, |
|
"grad_norm": 2.2371819869165073, |
|
"learning_rate": 6.397756083311454e-06, |
|
"loss": 0.1395, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8222811671087533, |
|
"grad_norm": 1.7682314840535782, |
|
"learning_rate": 6.377741755099334e-06, |
|
"loss": 0.1248, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8249336870026526, |
|
"grad_norm": 1.7180875056368419, |
|
"learning_rate": 6.357703508921109e-06, |
|
"loss": 0.1146, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 1.837539966421522, |
|
"learning_rate": 6.337641692646106e-06, |
|
"loss": 0.1063, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.830238726790451, |
|
"grad_norm": 1.9061111225956069, |
|
"learning_rate": 6.317556654552825e-06, |
|
"loss": 0.1261, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8328912466843501, |
|
"grad_norm": 1.7785361235235786, |
|
"learning_rate": 6.297448743322918e-06, |
|
"loss": 0.1213, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8355437665782494, |
|
"grad_norm": 2.285655601737043, |
|
"learning_rate": 6.277318308035109e-06, |
|
"loss": 0.136, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8381962864721485, |
|
"grad_norm": 2.292822876847262, |
|
"learning_rate": 6.257165698159149e-06, |
|
"loss": 0.1437, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8408488063660478, |
|
"grad_norm": 1.925635900709222, |
|
"learning_rate": 6.236991263549748e-06, |
|
"loss": 0.1244, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.843501326259947, |
|
"grad_norm": 1.8764634867307084, |
|
"learning_rate": 6.2167953544404955e-06, |
|
"loss": 0.1269, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 1.9227813803757845, |
|
"learning_rate": 6.1965783214377895e-06, |
|
"loss": 0.1052, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8488063660477454, |
|
"grad_norm": 1.6243389871471685, |
|
"learning_rate": 6.176340515514738e-06, |
|
"loss": 0.0946, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8514588859416445, |
|
"grad_norm": 1.9721230457586518, |
|
"learning_rate": 6.156082288005078e-06, |
|
"loss": 0.1242, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8541114058355438, |
|
"grad_norm": 1.939736016873224, |
|
"learning_rate": 6.135803990597066e-06, |
|
"loss": 0.1107, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8567639257294429, |
|
"grad_norm": 1.910794512321765, |
|
"learning_rate": 6.115505975327382e-06, |
|
"loss": 0.1157, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8594164456233422, |
|
"grad_norm": 2.3310014277848006, |
|
"learning_rate": 6.095188594575008e-06, |
|
"loss": 0.1473, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 2.021424856899511, |
|
"learning_rate": 6.074852201055121e-06, |
|
"loss": 0.1412, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8647214854111406, |
|
"grad_norm": 1.8755075770524758, |
|
"learning_rate": 6.054497147812962e-06, |
|
"loss": 0.1374, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8673740053050398, |
|
"grad_norm": 2.0283809132037947, |
|
"learning_rate": 6.034123788217712e-06, |
|
"loss": 0.1225, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.870026525198939, |
|
"grad_norm": 1.77815086557222, |
|
"learning_rate": 6.013732475956352e-06, |
|
"loss": 0.1105, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8726790450928382, |
|
"grad_norm": 1.698228853578335, |
|
"learning_rate": 5.993323565027528e-06, |
|
"loss": 0.0964, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8753315649867374, |
|
"grad_norm": 1.8428502419687727, |
|
"learning_rate": 5.972897409735403e-06, |
|
"loss": 0.1227, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8779840848806366, |
|
"grad_norm": 1.7797504117579037, |
|
"learning_rate": 5.952454364683507e-06, |
|
"loss": 0.1129, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8806366047745358, |
|
"grad_norm": 2.5588750953515143, |
|
"learning_rate": 5.931994784768582e-06, |
|
"loss": 0.1512, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.883289124668435, |
|
"grad_norm": 2.4302985612966697, |
|
"learning_rate": 5.911519025174419e-06, |
|
"loss": 0.1419, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8859416445623343, |
|
"grad_norm": 2.116715417993045, |
|
"learning_rate": 5.891027441365689e-06, |
|
"loss": 0.1248, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8885941644562334, |
|
"grad_norm": 1.8450184290705178, |
|
"learning_rate": 5.870520389081782e-06, |
|
"loss": 0.1093, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8912466843501327, |
|
"grad_norm": 1.8378061965275745, |
|
"learning_rate": 5.849998224330621e-06, |
|
"loss": 0.1103, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8938992042440318, |
|
"grad_norm": 2.0712714287437963, |
|
"learning_rate": 5.829461303382484e-06, |
|
"loss": 0.1318, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 2.1172154065660034, |
|
"learning_rate": 5.808909982763825e-06, |
|
"loss": 0.1185, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8992042440318302, |
|
"grad_norm": 2.600554195545317, |
|
"learning_rate": 5.788344619251076e-06, |
|
"loss": 0.1462, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9018567639257294, |
|
"grad_norm": 2.08199632146431, |
|
"learning_rate": 5.767765569864459e-06, |
|
"loss": 0.1383, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9045092838196287, |
|
"grad_norm": 1.806422329448522, |
|
"learning_rate": 5.747173191861788e-06, |
|
"loss": 0.0905, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9071618037135278, |
|
"grad_norm": 1.8050690831990468, |
|
"learning_rate": 5.726567842732262e-06, |
|
"loss": 0.1156, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.9098143236074271, |
|
"grad_norm": 2.091128781060801, |
|
"learning_rate": 5.705949880190266e-06, |
|
"loss": 0.1195, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9124668435013262, |
|
"grad_norm": 1.8670275726747676, |
|
"learning_rate": 5.685319662169157e-06, |
|
"loss": 0.1029, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9151193633952255, |
|
"grad_norm": 2.039989303735285, |
|
"learning_rate": 5.664677546815043e-06, |
|
"loss": 0.1312, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9177718832891246, |
|
"grad_norm": 1.859016569528762, |
|
"learning_rate": 5.644023892480583e-06, |
|
"loss": 0.1055, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9204244031830239, |
|
"grad_norm": 1.5903154857571018, |
|
"learning_rate": 5.623359057718752e-06, |
|
"loss": 0.0973, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 1.9024912783079182, |
|
"learning_rate": 5.6026834012766155e-06, |
|
"loss": 0.1147, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9257294429708223, |
|
"grad_norm": 2.4600571478007645, |
|
"learning_rate": 5.581997282089114e-06, |
|
"loss": 0.1274, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9283819628647215, |
|
"grad_norm": 2.1252658277426493, |
|
"learning_rate": 5.561301059272821e-06, |
|
"loss": 0.1192, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9310344827586207, |
|
"grad_norm": 2.2663754831446528, |
|
"learning_rate": 5.540595092119709e-06, |
|
"loss": 0.1368, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9336870026525199, |
|
"grad_norm": 2.1656127015955637, |
|
"learning_rate": 5.519879740090918e-06, |
|
"loss": 0.1072, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9363395225464191, |
|
"grad_norm": 2.1805472380714552, |
|
"learning_rate": 5.499155362810512e-06, |
|
"loss": 0.1475, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.9389920424403183, |
|
"grad_norm": 1.9313395207402062, |
|
"learning_rate": 5.478422320059231e-06, |
|
"loss": 0.1138, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9416445623342176, |
|
"grad_norm": 1.9593835652421479, |
|
"learning_rate": 5.457680971768258e-06, |
|
"loss": 0.1236, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9442970822281167, |
|
"grad_norm": 1.9523449371463568, |
|
"learning_rate": 5.436931678012956e-06, |
|
"loss": 0.1191, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.946949602122016, |
|
"grad_norm": 2.213437974106433, |
|
"learning_rate": 5.4161747990066235e-06, |
|
"loss": 0.157, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9496021220159151, |
|
"grad_norm": 1.7830925038957812, |
|
"learning_rate": 5.395410695094246e-06, |
|
"loss": 0.1048, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9522546419098143, |
|
"grad_norm": 1.9526782766464377, |
|
"learning_rate": 5.374639726746232e-06, |
|
"loss": 0.1298, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9549071618037135, |
|
"grad_norm": 2.225209885017297, |
|
"learning_rate": 5.353862254552159e-06, |
|
"loss": 0.1114, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9575596816976127, |
|
"grad_norm": 1.8717932182864292, |
|
"learning_rate": 5.333078639214511e-06, |
|
"loss": 0.114, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9602122015915119, |
|
"grad_norm": 2.037766974038598, |
|
"learning_rate": 5.31228924154242e-06, |
|
"loss": 0.1061, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9628647214854111, |
|
"grad_norm": 1.7551149566575617, |
|
"learning_rate": 5.2914944224454e-06, |
|
"loss": 0.1105, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 1.6746920582972595, |
|
"learning_rate": 5.270694542927089e-06, |
|
"loss": 0.1129, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9681697612732095, |
|
"grad_norm": 1.7421649434309925, |
|
"learning_rate": 5.249889964078965e-06, |
|
"loss": 0.1095, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9708222811671088, |
|
"grad_norm": 1.705953446601723, |
|
"learning_rate": 5.2290810470740925e-06, |
|
"loss": 0.1009, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9734748010610079, |
|
"grad_norm": 1.9937643181748503, |
|
"learning_rate": 5.2082681531608505e-06, |
|
"loss": 0.1374, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9761273209549072, |
|
"grad_norm": 2.173137907211324, |
|
"learning_rate": 5.187451643656654e-06, |
|
"loss": 0.1279, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9787798408488063, |
|
"grad_norm": 1.8358057318165615, |
|
"learning_rate": 5.166631879941686e-06, |
|
"loss": 0.1032, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9814323607427056, |
|
"grad_norm": 2.097800711750244, |
|
"learning_rate": 5.145809223452625e-06, |
|
"loss": 0.119, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9840848806366048, |
|
"grad_norm": 1.7975003014708764, |
|
"learning_rate": 5.124984035676366e-06, |
|
"loss": 0.0984, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.986737400530504, |
|
"grad_norm": 1.6452454858458356, |
|
"learning_rate": 5.1041566781437525e-06, |
|
"loss": 0.0995, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9893899204244032, |
|
"grad_norm": 1.6148448625331262, |
|
"learning_rate": 5.083327512423294e-06, |
|
"loss": 0.087, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9920424403183024, |
|
"grad_norm": 1.8548088873299826, |
|
"learning_rate": 5.062496900114887e-06, |
|
"loss": 0.1028, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9946949602122016, |
|
"grad_norm": 2.286016586242463, |
|
"learning_rate": 5.041665202843543e-06, |
|
"loss": 0.1394, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9973474801061007, |
|
"grad_norm": 1.9248524614602884, |
|
"learning_rate": 5.020832782253115e-06, |
|
"loss": 0.1206, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.7947903575090804, |
|
"learning_rate": 5e-06, |
|
"loss": 0.108, |
|
"step": 377 |
|
}, |
|
{
"epoch": 1.0026525198938991,
"grad_norm": 1.1448965437517742,
"learning_rate": 4.979167217746888e-06,
"loss": 0.0474,
"step": 378
},
{
"epoch": 1.0053050397877985,
"grad_norm": 1.4782795600036578,
"learning_rate": 4.9583347971564575e-06,
"loss": 0.0546,
"step": 379
},
{
"epoch": 1.0079575596816976,
"grad_norm": 1.4977052145475058,
"learning_rate": 4.937503099885115e-06,
"loss": 0.0527,
"step": 380
},
{
"epoch": 1.0106100795755968,
"grad_norm": 1.3509330137945688,
"learning_rate": 4.916672487576708e-06,
"loss": 0.0581,
"step": 381
},
{
"epoch": 1.013262599469496,
"grad_norm": 1.3766430452266287,
"learning_rate": 4.895843321856249e-06,
"loss": 0.0543,
"step": 382
},
{
"epoch": 1.0159151193633953,
"grad_norm": 1.1838930001612804,
"learning_rate": 4.875015964323635e-06,
"loss": 0.0501,
"step": 383
},
{
"epoch": 1.0185676392572944,
"grad_norm": 1.1327638132836573,
"learning_rate": 4.854190776547377e-06,
"loss": 0.0488,
"step": 384
},
{
"epoch": 1.0212201591511936,
"grad_norm": 1.3242917574913802,
"learning_rate": 4.833368120058317e-06,
"loss": 0.0528,
"step": 385
},
{
"epoch": 1.023872679045093,
"grad_norm": 1.3957886304337215,
"learning_rate": 4.812548356343347e-06,
"loss": 0.0435,
"step": 386
},
{
"epoch": 1.026525198938992,
"grad_norm": 1.3976646283902743,
"learning_rate": 4.79173184683915e-06,
"loss": 0.0504,
"step": 387
},
{
"epoch": 1.0291777188328912,
"grad_norm": 1.594064921848588,
"learning_rate": 4.770918952925908e-06,
"loss": 0.0467,
"step": 388
},
{
"epoch": 1.0318302387267904,
"grad_norm": 1.8424058890187611,
"learning_rate": 4.750110035921038e-06,
"loss": 0.0592,
"step": 389
},
{
"epoch": 1.0344827586206897,
"grad_norm": 1.5198759678201303,
"learning_rate": 4.729305457072913e-06,
"loss": 0.0493,
"step": 390
},
{
"epoch": 1.0371352785145889,
"grad_norm": 1.63737016645813,
"learning_rate": 4.708505577554601e-06,
"loss": 0.0637,
"step": 391
},
{
"epoch": 1.039787798408488,
"grad_norm": 1.513812328618622,
"learning_rate": 4.687710758457583e-06,
"loss": 0.0452,
"step": 392
},
{
"epoch": 1.0424403183023874,
"grad_norm": 1.6651422093934103,
"learning_rate": 4.6669213607854915e-06,
"loss": 0.0498,
"step": 393
},
{
"epoch": 1.0450928381962865,
"grad_norm": 1.5964261962470236,
"learning_rate": 4.646137745447843e-06,
"loss": 0.0623,
"step": 394
},
{
"epoch": 1.0477453580901857,
"grad_norm": 1.7129301697378825,
"learning_rate": 4.6253602732537685e-06,
"loss": 0.0455,
"step": 395
},
{
"epoch": 1.0503978779840848,
"grad_norm": 1.5300658689026623,
"learning_rate": 4.6045893049057544e-06,
"loss": 0.0555,
"step": 396
},
{
"epoch": 1.0530503978779842,
"grad_norm": 1.3274779219248138,
"learning_rate": 4.583825200993377e-06,
"loss": 0.0461,
"step": 397
},
{
"epoch": 1.0557029177718833,
"grad_norm": 1.6225406314000637,
"learning_rate": 4.563068321987047e-06,
"loss": 0.0531,
"step": 398
},
{
"epoch": 1.0583554376657824,
"grad_norm": 1.5953486175793383,
"learning_rate": 4.542319028231744e-06,
"loss": 0.0477,
"step": 399
},
{
"epoch": 1.0610079575596818,
"grad_norm": 1.5805509793613008,
"learning_rate": 4.521577679940769e-06,
"loss": 0.0429,
"step": 400
},
{
"epoch": 1.0610079575596818,
"eval_loss": 0.1300889253616333,
"eval_runtime": 1.2993,
"eval_samples_per_second": 23.859,
"eval_steps_per_second": 6.157,
"step": 400
},
{
"epoch": 1.063660477453581,
"grad_norm": 1.6980952582384556,
"learning_rate": 4.50084463718949e-06,
"loss": 0.06,
"step": 401
},
{
"epoch": 1.06631299734748,
"grad_norm": 1.5737277597483659,
"learning_rate": 4.480120259909084e-06,
"loss": 0.0488,
"step": 402
},
{
"epoch": 1.0689655172413792,
"grad_norm": 1.6771874133081281,
"learning_rate": 4.459404907880293e-06,
"loss": 0.0539,
"step": 403
},
{
"epoch": 1.0716180371352786,
"grad_norm": 1.6529014957836754,
"learning_rate": 4.438698940727179e-06,
"loss": 0.0572,
"step": 404
},
{
"epoch": 1.0742705570291777,
"grad_norm": 1.5970118458627822,
"learning_rate": 4.418002717910887e-06,
"loss": 0.06,
"step": 405
},
{
"epoch": 1.0769230769230769,
"grad_norm": 1.4471958707171573,
"learning_rate": 4.397316598723385e-06,
"loss": 0.0538,
"step": 406
},
{
"epoch": 1.079575596816976,
"grad_norm": 1.0348681849940542,
"learning_rate": 4.37664094228125e-06,
"loss": 0.035,
"step": 407
},
{
"epoch": 1.0822281167108754,
"grad_norm": 1.3544554476316786,
"learning_rate": 4.3559761075194185e-06,
"loss": 0.0417,
"step": 408
},
{
"epoch": 1.0848806366047745,
"grad_norm": 1.9097492769272106,
"learning_rate": 4.335322453184959e-06,
"loss": 0.0564,
"step": 409
},
{
"epoch": 1.0875331564986737,
"grad_norm": 1.337428576441879,
"learning_rate": 4.314680337830847e-06,
"loss": 0.0471,
"step": 410
},
{
"epoch": 1.090185676392573,
"grad_norm": 1.6610870125693307,
"learning_rate": 4.294050119809735e-06,
"loss": 0.0495,
"step": 411
},
{
"epoch": 1.0928381962864722,
"grad_norm": 1.648323552864328,
"learning_rate": 4.273432157267739e-06,
"loss": 0.0541,
"step": 412
},
{
"epoch": 1.0954907161803713,
"grad_norm": 1.5486084688969948,
"learning_rate": 4.252826808138214e-06,
"loss": 0.0559,
"step": 413
},
{
"epoch": 1.0981432360742707,
"grad_norm": 1.7853234719100994,
"learning_rate": 4.232234430135542e-06,
"loss": 0.0688,
"step": 414
},
{
"epoch": 1.1007957559681698,
"grad_norm": 1.5356971600614069,
"learning_rate": 4.2116553807489255e-06,
"loss": 0.0516,
"step": 415
},
{
"epoch": 1.103448275862069,
"grad_norm": 1.6247663576609628,
"learning_rate": 4.191090017236177e-06,
"loss": 0.0514,
"step": 416
},
{
"epoch": 1.106100795755968,
"grad_norm": 1.534189806530592,
"learning_rate": 4.170538696617518e-06,
"loss": 0.0524,
"step": 417
},
{
"epoch": 1.1087533156498675,
"grad_norm": 1.3158821951034483,
"learning_rate": 4.15000177566938e-06,
"loss": 0.0403,
"step": 418
},
{
"epoch": 1.1114058355437666,
"grad_norm": 1.5138948007814246,
"learning_rate": 4.129479610918219e-06,
"loss": 0.049,
"step": 419
},
{
"epoch": 1.1140583554376657,
"grad_norm": 1.2320891847094047,
"learning_rate": 4.108972558634312e-06,
"loss": 0.042,
"step": 420
},
{
"epoch": 1.1167108753315649,
"grad_norm": 1.6729860721769905,
"learning_rate": 4.088480974825584e-06,
"loss": 0.0527,
"step": 421
},
{
"epoch": 1.1193633952254642,
"grad_norm": 2.128191222629532,
"learning_rate": 4.0680052152314185e-06,
"loss": 0.064,
"step": 422
},
{
"epoch": 1.1220159151193634,
"grad_norm": 1.5415156129118721,
"learning_rate": 4.047545635316494e-06,
"loss": 0.0479,
"step": 423
},
{
"epoch": 1.1246684350132625,
"grad_norm": 1.4478769280496202,
"learning_rate": 4.0271025902646e-06,
"loss": 0.0461,
"step": 424
},
{
"epoch": 1.1273209549071619,
"grad_norm": 1.4168078919989058,
"learning_rate": 4.006676434972474e-06,
"loss": 0.0461,
"step": 425
},
{
"epoch": 1.129973474801061,
"grad_norm": 1.7163680394879697,
"learning_rate": 3.98626752404365e-06,
"loss": 0.0563,
"step": 426
},
{
"epoch": 1.1326259946949602,
"grad_norm": 1.4561458702273795,
"learning_rate": 3.96587621178229e-06,
"loss": 0.0451,
"step": 427
},
{
"epoch": 1.1352785145888595,
"grad_norm": 1.3054592699914147,
"learning_rate": 3.94550285218704e-06,
"loss": 0.0413,
"step": 428
},
{
"epoch": 1.1379310344827587,
"grad_norm": 1.410529232372516,
"learning_rate": 3.92514779894488e-06,
"loss": 0.0475,
"step": 429
},
{
"epoch": 1.1405835543766578,
"grad_norm": 1.1401984741886695,
"learning_rate": 3.904811405424993e-06,
"loss": 0.034,
"step": 430
},
{
"epoch": 1.143236074270557,
"grad_norm": 1.677403926891428,
"learning_rate": 3.8844940246726206e-06,
"loss": 0.0537,
"step": 431
},
{
"epoch": 1.1458885941644563,
"grad_norm": 2.0010687043565856,
"learning_rate": 3.864196009402935e-06,
"loss": 0.0571,
"step": 432
},
{
"epoch": 1.1485411140583555,
"grad_norm": 1.4276836740813328,
"learning_rate": 3.843917711994923e-06,
"loss": 0.039,
"step": 433
},
{
"epoch": 1.1511936339522546,
"grad_norm": 1.7186301264192605,
"learning_rate": 3.823659484485264e-06,
"loss": 0.0566,
"step": 434
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.6127857163652946,
"learning_rate": 3.803421678562213e-06,
"loss": 0.0392,
"step": 435
},
{
"epoch": 1.156498673740053,
"grad_norm": 1.6060058035757456,
"learning_rate": 3.783204645559504e-06,
"loss": 0.058,
"step": 436
},
{
"epoch": 1.1591511936339522,
"grad_norm": 1.8031413344908938,
"learning_rate": 3.7630087364502545e-06,
"loss": 0.0565,
"step": 437
},
{
"epoch": 1.1618037135278514,
"grad_norm": 1.3528132025697588,
"learning_rate": 3.742834301840853e-06,
"loss": 0.0488,
"step": 438
},
{
"epoch": 1.1644562334217508,
"grad_norm": 1.4452161790201725,
"learning_rate": 3.722681691964892e-06,
"loss": 0.052,
"step": 439
},
{
"epoch": 1.16710875331565,
"grad_norm": 1.506462400743829,
"learning_rate": 3.702551256677083e-06,
"loss": 0.0517,
"step": 440
},
{
"epoch": 1.169761273209549,
"grad_norm": 1.4482316284963157,
"learning_rate": 3.6824433454471755e-06,
"loss": 0.0486,
"step": 441
},
{
"epoch": 1.1724137931034484,
"grad_norm": 1.4651862565906717,
"learning_rate": 3.662358307353897e-06,
"loss": 0.0528,
"step": 442
},
{
"epoch": 1.1750663129973475,
"grad_norm": 1.7861455263109471,
"learning_rate": 3.6422964910788917e-06,
"loss": 0.068,
"step": 443
},
{
"epoch": 1.1777188328912467,
"grad_norm": 1.8641910946820022,
"learning_rate": 3.6222582449006673e-06,
"loss": 0.0557,
"step": 444
},
{
"epoch": 1.1803713527851458,
"grad_norm": 1.493182090383967,
"learning_rate": 3.602243916688548e-06,
"loss": 0.0468,
"step": 445
},
{
"epoch": 1.1830238726790452,
"grad_norm": 1.458973918132782,
"learning_rate": 3.5822538538966333e-06,
"loss": 0.0546,
"step": 446
},
{
"epoch": 1.1856763925729443,
"grad_norm": 1.1386325113824567,
"learning_rate": 3.5622884035577743e-06,
"loss": 0.044,
"step": 447
},
{
"epoch": 1.1883289124668435,
"grad_norm": 1.8301480285009966,
"learning_rate": 3.542347912277537e-06,
"loss": 0.0532,
"step": 448
},
{
"epoch": 1.1909814323607426,
"grad_norm": 1.4956585692430135,
"learning_rate": 3.5224327262281956e-06,
"loss": 0.0583,
"step": 449
},
{
"epoch": 1.193633952254642,
"grad_norm": 1.5435072195981274,
"learning_rate": 3.502543191142713e-06,
"loss": 0.0527,
"step": 450
},
{
"epoch": 1.1962864721485411,
"grad_norm": 1.2942065638898985,
"learning_rate": 3.482679652308756e-06,
"loss": 0.0458,
"step": 451
},
{
"epoch": 1.1989389920424403,
"grad_norm": 1.5335416817545946,
"learning_rate": 3.462842454562677e-06,
"loss": 0.0435,
"step": 452
},
{
"epoch": 1.2015915119363396,
"grad_norm": 1.4166517541759323,
"learning_rate": 3.443031942283544e-06,
"loss": 0.0423,
"step": 453
},
{
"epoch": 1.2042440318302388,
"grad_norm": 1.8664420121487215,
"learning_rate": 3.423248459387165e-06,
"loss": 0.0546,
"step": 454
},
{
"epoch": 1.206896551724138,
"grad_norm": 1.3815730200823022,
"learning_rate": 3.403492349320101e-06,
"loss": 0.043,
"step": 455
},
{
"epoch": 1.209549071618037,
"grad_norm": 1.581543542046171,
"learning_rate": 3.3837639550537183e-06,
"loss": 0.0563,
"step": 456
},
{
"epoch": 1.2122015915119364,
"grad_norm": 1.420673121312355,
"learning_rate": 3.364063619078228e-06,
"loss": 0.0364,
"step": 457
},
{
"epoch": 1.2148541114058355,
"grad_norm": 1.4971596799850846,
"learning_rate": 3.344391683396744e-06,
"loss": 0.0446,
"step": 458
},
{
"epoch": 1.2175066312997347,
"grad_norm": 1.701718210938796,
"learning_rate": 3.3247484895193406e-06,
"loss": 0.052,
"step": 459
},
{
"epoch": 1.2201591511936338,
"grad_norm": 1.4921935610407966,
"learning_rate": 3.305134378457127e-06,
"loss": 0.046,
"step": 460
},
{
"epoch": 1.2228116710875332,
"grad_norm": 1.6030117015462066,
"learning_rate": 3.2855496907163296e-06,
"loss": 0.0472,
"step": 461
},
{
"epoch": 1.2254641909814323,
"grad_norm": 1.2527893295170924,
"learning_rate": 3.2659947662923767e-06,
"loss": 0.0443,
"step": 462
},
{
"epoch": 1.2281167108753315,
"grad_norm": 1.9885749044696137,
"learning_rate": 3.246469944663996e-06,
"loss": 0.0619,
"step": 463
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.4623016319017155,
"learning_rate": 3.226975564787322e-06,
"loss": 0.042,
"step": 464
},
{
"epoch": 1.23342175066313,
"grad_norm": 1.372530389209216,
"learning_rate": 3.2075119650900166e-06,
"loss": 0.0406,
"step": 465
},
{
"epoch": 1.2360742705570291,
"grad_norm": 1.3245791205337774,
"learning_rate": 3.1880794834653872e-06,
"loss": 0.0471,
"step": 466
},
{
"epoch": 1.2387267904509285,
"grad_norm": 1.765042349753763,
"learning_rate": 3.1686784572665176e-06,
"loss": 0.0545,
"step": 467
},
{
"epoch": 1.2413793103448276,
"grad_norm": 1.6880726450034362,
"learning_rate": 3.149309223300428e-06,
"loss": 0.0417,
"step": 468
},
{
"epoch": 1.2440318302387268,
"grad_norm": 1.6867768845931341,
"learning_rate": 3.12997211782221e-06,
"loss": 0.064,
"step": 469
},
{
"epoch": 1.246684350132626,
"grad_norm": 1.6829202873944922,
"learning_rate": 3.1106674765291943e-06,
"loss": 0.0487,
"step": 470
},
{
"epoch": 1.2493368700265253,
"grad_norm": 1.482635437530892,
"learning_rate": 3.0913956345551287e-06,
"loss": 0.0437,
"step": 471
},
{
"epoch": 1.2519893899204244,
"grad_norm": 1.0243355165306798,
"learning_rate": 3.072156926464356e-06,
"loss": 0.0286,
"step": 472
},
{
"epoch": 1.2546419098143236,
"grad_norm": 1.7086250346584169,
"learning_rate": 3.052951686246003e-06,
"loss": 0.0524,
"step": 473
},
{
"epoch": 1.2572944297082227,
"grad_norm": 1.5669584106420342,
"learning_rate": 3.033780247308187e-06,
"loss": 0.0416,
"step": 474
},
{
"epoch": 1.259946949602122,
"grad_norm": 1.5314026636512095,
"learning_rate": 3.0146429424722277e-06,
"loss": 0.0459,
"step": 475
},
{
"epoch": 1.2625994694960212,
"grad_norm": 1.6510097722189716,
"learning_rate": 2.9955401039668642e-06,
"loss": 0.0451,
"step": 476
},
{
"epoch": 1.2652519893899203,
"grad_norm": 1.1998735054502896,
"learning_rate": 2.976472063422493e-06,
"loss": 0.0421,
"step": 477
},
{
"epoch": 1.2679045092838197,
"grad_norm": 1.8230740287638594,
"learning_rate": 2.9574391518654077e-06,
"loss": 0.0483,
"step": 478
},
{
"epoch": 1.2705570291777188,
"grad_norm": 1.8679306651778758,
"learning_rate": 2.938441699712055e-06,
"loss": 0.0541,
"step": 479
},
{
"epoch": 1.273209549071618,
"grad_norm": 1.5955196355028194,
"learning_rate": 2.9194800367632946e-06,
"loss": 0.0558,
"step": 480
},
{
"epoch": 1.2758620689655173,
"grad_norm": 1.8476586232402528,
"learning_rate": 2.9005544921986774e-06,
"loss": 0.0564,
"step": 481
},
{
"epoch": 1.2785145888594165,
"grad_norm": 1.4298742037812726,
"learning_rate": 2.8816653945707286e-06,
"loss": 0.0442,
"step": 482
},
{
"epoch": 1.2811671087533156,
"grad_norm": 1.287640370036359,
"learning_rate": 2.8628130717992463e-06,
"loss": 0.0373,
"step": 483
},
{
"epoch": 1.2838196286472148,
"grad_norm": 1.4067433199098056,
"learning_rate": 2.8439978511656057e-06,
"loss": 0.045,
"step": 484
},
{
"epoch": 1.2864721485411141,
"grad_norm": 1.4224319432151542,
"learning_rate": 2.82522005930708e-06,
"loss": 0.0385,
"step": 485
},
{
"epoch": 1.2891246684350133,
"grad_norm": 1.5527398960311367,
"learning_rate": 2.8064800222111673e-06,
"loss": 0.0413,
"step": 486
},
{
"epoch": 1.2917771883289124,
"grad_norm": 1.4424002142583947,
"learning_rate": 2.787778065209934e-06,
"loss": 0.0398,
"step": 487
},
{
"epoch": 1.2944297082228116,
"grad_norm": 1.3110803703822922,
"learning_rate": 2.7691145129743645e-06,
"loss": 0.048,
"step": 488
},
{
"epoch": 1.297082228116711,
"grad_norm": 1.5121856468389003,
"learning_rate": 2.7504896895087317e-06,
"loss": 0.0414,
"step": 489
},
{
"epoch": 1.29973474801061,
"grad_norm": 1.7902702451543397,
"learning_rate": 2.7319039181449604e-06,
"loss": 0.0616,
"step": 490
},
{
"epoch": 1.3023872679045092,
"grad_norm": 1.834255575531623,
"learning_rate": 2.713357521537023e-06,
"loss": 0.0479,
"step": 491
},
{
"epoch": 1.3050397877984086,
"grad_norm": 1.3600003850482532,
"learning_rate": 2.6948508216553304e-06,
"loss": 0.042,
"step": 492
},
{
"epoch": 1.3076923076923077,
"grad_norm": 1.9237485999672874,
"learning_rate": 2.6763841397811576e-06,
"loss": 0.0462,
"step": 493
},
{
"epoch": 1.3103448275862069,
"grad_norm": 1.2574491259586364,
"learning_rate": 2.65795779650105e-06,
"loss": 0.0382,
"step": 494
},
{
"epoch": 1.3129973474801062,
"grad_norm": 1.8546812652508733,
"learning_rate": 2.6395721117012648e-06,
"loss": 0.0582,
"step": 495
},
{
"epoch": 1.3156498673740054,
"grad_norm": 1.4303414853405272,
"learning_rate": 2.6212274045622167e-06,
"loss": 0.0437,
"step": 496
},
{
"epoch": 1.3183023872679045,
"grad_norm": 1.3475863489270175,
"learning_rate": 2.6029239935529395e-06,
"loss": 0.0374,
"step": 497
},
{
"epoch": 1.3209549071618036,
"grad_norm": 1.3885729140928034,
"learning_rate": 2.5846621964255524e-06,
"loss": 0.0378,
"step": 498
},
{
"epoch": 1.323607427055703,
"grad_norm": 1.6004829128070286,
"learning_rate": 2.5664423302097462e-06,
"loss": 0.0456,
"step": 499
},
{
"epoch": 1.3262599469496021,
"grad_norm": 1.4483756915235027,
"learning_rate": 2.5482647112072857e-06,
"loss": 0.0483,
"step": 500
},
{
"epoch": 1.3289124668435013,
"grad_norm": 1.5083194734193153,
"learning_rate": 2.530129654986505e-06,
"loss": 0.0403,
"step": 501
},
{
"epoch": 1.3315649867374004,
"grad_norm": 1.0241725585916064,
"learning_rate": 2.5120374763768422e-06,
"loss": 0.0309,
"step": 502
},
{
"epoch": 1.3342175066312998,
"grad_norm": 1.4775342162666805,
"learning_rate": 2.493988489463366e-06,
"loss": 0.0449,
"step": 503
},
{
"epoch": 1.336870026525199,
"grad_norm": 1.453802729887559,
"learning_rate": 2.475983007581326e-06,
"loss": 0.0552,
"step": 504
},
{
"epoch": 1.339522546419098,
"grad_norm": 1.4761120060128954,
"learning_rate": 2.458021343310713e-06,
"loss": 0.0538,
"step": 505
},
{
"epoch": 1.3421750663129974,
"grad_norm": 1.4602044460085675,
"learning_rate": 2.4401038084708313e-06,
"loss": 0.0367,
"step": 506
},
{
"epoch": 1.3448275862068966,
"grad_norm": 1.1064714894292946,
"learning_rate": 2.422230714114891e-06,
"loss": 0.0318,
"step": 507
},
{
"epoch": 1.3474801061007957,
"grad_norm": 1.5984242375021522,
"learning_rate": 2.4044023705246e-06,
"loss": 0.0451,
"step": 508
},
{
"epoch": 1.350132625994695,
"grad_norm": 1.8076623418841897,
"learning_rate": 2.3866190872047775e-06,
"loss": 0.055,
"step": 509
},
{
"epoch": 1.3527851458885942,
"grad_norm": 1.53849591572484,
"learning_rate": 2.3688811728779875e-06,
"loss": 0.0413,
"step": 510
},
{
"epoch": 1.3554376657824934,
"grad_norm": 1.3925465600995954,
"learning_rate": 2.351188935479181e-06,
"loss": 0.0405,
"step": 511
},
{
"epoch": 1.3580901856763925,
"grad_norm": 1.4430777345183743,
"learning_rate": 2.333542682150339e-06,
"loss": 0.0418,
"step": 512
},
{
"epoch": 1.3607427055702916,
"grad_norm": 1.7058397273368744,
"learning_rate": 2.3159427192351467e-06,
"loss": 0.0417,
"step": 513
},
{
"epoch": 1.363395225464191,
"grad_norm": 1.6101298515299973,
"learning_rate": 2.2983893522736795e-06,
"loss": 0.0436,
"step": 514
},
{
"epoch": 1.3660477453580901,
"grad_norm": 1.5663612607810036,
"learning_rate": 2.2808828859970905e-06,
"loss": 0.0415,
"step": 515
},
{
"epoch": 1.3687002652519893,
"grad_norm": 1.3127711478830895,
"learning_rate": 2.263423624322326e-06,
"loss": 0.0429,
"step": 516
},
{
"epoch": 1.3713527851458887,
"grad_norm": 1.4631985318610559,
"learning_rate": 2.2460118703468475e-06,
"loss": 0.0426,
"step": 517
},
{
"epoch": 1.3740053050397878,
"grad_norm": 2.056857172754129,
"learning_rate": 2.228647926343373e-06,
"loss": 0.0549,
"step": 518
},
{
"epoch": 1.376657824933687,
"grad_norm": 1.4596328383955115,
"learning_rate": 2.211332093754622e-06,
"loss": 0.0463,
"step": 519
},
{
"epoch": 1.3793103448275863,
"grad_norm": 1.4795909661114393,
"learning_rate": 2.1940646731880887e-06,
"loss": 0.0382,
"step": 520
},
{
"epoch": 1.3819628647214854,
"grad_norm": 1.79070132377592,
"learning_rate": 2.1768459644108223e-06,
"loss": 0.0493,
"step": 521
},
{
"epoch": 1.3846153846153846,
"grad_norm": 1.6598245589918261,
"learning_rate": 2.159676266344222e-06,
"loss": 0.0504,
"step": 522
},
{
"epoch": 1.387267904509284,
"grad_norm": 1.5697697079805137,
"learning_rate": 2.142555877058847e-06,
"loss": 0.056,
"step": 523
},
{
"epoch": 1.389920424403183,
"grad_norm": 1.5250399923513906,
"learning_rate": 2.125485093769242e-06,
"loss": 0.0451,
"step": 524
},
{
"epoch": 1.3925729442970822,
"grad_norm": 1.488259185423536,
"learning_rate": 2.108464212828786e-06,
"loss": 0.0403,
"step": 525
},
{
"epoch": 1.3952254641909814,
"grad_norm": 1.7336708194270882,
"learning_rate": 2.091493529724528e-06,
"loss": 0.0545,
"step": 526
},
{
"epoch": 1.3978779840848805,
"grad_norm": 1.457566134032916,
"learning_rate": 2.0745733390720744e-06,
"loss": 0.0401,
"step": 527
},
{
"epoch": 1.4005305039787799,
"grad_norm": 2.0446606459323173,
"learning_rate": 2.057703934610474e-06,
"loss": 0.0511,
"step": 528
},
{
"epoch": 1.403183023872679,
"grad_norm": 1.8179579055212491,
"learning_rate": 2.0408856091971063e-06,
"loss": 0.0415,
"step": 529
},
{
"epoch": 1.4058355437665782,
"grad_norm": 1.4476314828089778,
"learning_rate": 2.024118654802608e-06,
"loss": 0.043,
"step": 530
},
{
"epoch": 1.4084880636604775,
"grad_norm": 1.4148575078926533,
"learning_rate": 2.007403362505802e-06,
"loss": 0.0408,
"step": 531
},
{
"epoch": 1.4111405835543767,
"grad_norm": 1.6321368893473254,
"learning_rate": 1.990740022488642e-06,
"loss": 0.0381,
"step": 532
},
{
"epoch": 1.4137931034482758,
"grad_norm": 1.5202544820877246,
"learning_rate": 1.9741289240311757e-06,
"loss": 0.045,
"step": 533
},
{
"epoch": 1.4164456233421752,
"grad_norm": 1.5924780405260186,
"learning_rate": 1.957570355506522e-06,
"loss": 0.0453,
"step": 534
},
{
"epoch": 1.4190981432360743,
"grad_norm": 1.7962035777442131,
"learning_rate": 1.9410646043758737e-06,
"loss": 0.0514,
"step": 535
},
{
"epoch": 1.4217506631299734,
"grad_norm": 1.3406272487252804,
"learning_rate": 1.9246119571834904e-06,
"loss": 0.0333,
"step": 536
},
{
"epoch": 1.4244031830238728,
"grad_norm": 1.730688912537288,
"learning_rate": 1.9082126995517376e-06,
"loss": 0.0475,
"step": 537
},
{
"epoch": 1.427055702917772,
"grad_norm": 1.5376121127489066,
"learning_rate": 1.8918671161761227e-06,
"loss": 0.0343,
"step": 538
},
{
"epoch": 1.429708222811671,
"grad_norm": 1.4633947145114203,
"learning_rate": 1.8755754908203528e-06,
"loss": 0.0409,
"step": 539
},
{
"epoch": 1.4323607427055702,
"grad_norm": 1.6079257325417993,
"learning_rate": 1.8593381063114113e-06,
"loss": 0.0418,
"step": 540
},
{
"epoch": 1.4350132625994694,
"grad_norm": 1.36008605859255,
"learning_rate": 1.8431552445346434e-06,
"loss": 0.04,
"step": 541
},
{
"epoch": 1.4376657824933687,
"grad_norm": 1.7947332913494305,
"learning_rate": 1.827027186428869e-06,
"loss": 0.0601,
"step": 542
},
{
"epoch": 1.4403183023872679,
"grad_norm": 1.3927853248296074,
"learning_rate": 1.8109542119815e-06,
"loss": 0.0425,
"step": 543
},
{
"epoch": 1.442970822281167,
"grad_norm": 1.415485606603946,
"learning_rate": 1.7949366002236762e-06,
"loss": 0.0408,
"step": 544
},
{
"epoch": 1.4456233421750664,
"grad_norm": 1.3417207244235902,
"learning_rate": 1.7789746292254313e-06,
"loss": 0.0415,
"step": 545
},
{
"epoch": 1.4482758620689655,
"grad_norm": 1.5705396456844423,
"learning_rate": 1.7630685760908623e-06,
"loss": 0.0414,
"step": 546
},
{
"epoch": 1.4509283819628647,
"grad_norm": 1.4346574390804177,
"learning_rate": 1.7472187169533128e-06,
"loss": 0.0399,
"step": 547
},
{
"epoch": 1.453580901856764,
"grad_norm": 1.5831645649543884,
"learning_rate": 1.7314253269705854e-06,
"loss": 0.0456,
"step": 548
},
{
"epoch": 1.4562334217506632,
"grad_norm": 1.5188840072828362,
"learning_rate": 1.7156886803201638e-06,
"loss": 0.0508,
"step": 549
},
{
"epoch": 1.4588859416445623,
"grad_norm": 1.739652082715737,
"learning_rate": 1.70000905019445e-06,
"loss": 0.0495,
"step": 550
},
{
"epoch": 1.4615384615384617,
"grad_norm": 1.826264975830559,
"learning_rate": 1.6843867087960252e-06,
"loss": 0.0437,
"step": 551
},
{
"epoch": 1.4641909814323608,
"grad_norm": 1.30588237840256,
"learning_rate": 1.6688219273329215e-06,
"loss": 0.0383,
"step": 552
},
{
"epoch": 1.46684350132626,
"grad_norm": 1.4986591185883977,
"learning_rate": 1.6533149760139206e-06,
"loss": 0.0389,
"step": 553
},
{
"epoch": 1.469496021220159,
"grad_norm": 1.4584576788521406,
"learning_rate": 1.6378661240438498e-06,
"loss": 0.0398,
"step": 554
},
{
"epoch": 1.4721485411140582,
"grad_norm": 1.738259374230387,
"learning_rate": 1.6224756396189216e-06,
"loss": 0.0489,
"step": 555
},
{
"epoch": 1.4748010610079576,
"grad_norm": 1.4204392194550715,
"learning_rate": 1.6071437899220688e-06,
"loss": 0.0414,
"step": 556
},
{
"epoch": 1.4774535809018567,
"grad_norm": 1.5679568822862189,
"learning_rate": 1.591870841118312e-06,
"loss": 0.0527,
"step": 557
},
{
"epoch": 1.4801061007957559,
"grad_norm": 1.595033631355667,
"learning_rate": 1.576657058350135e-06,
"loss": 0.043,
"step": 558
},
{
"epoch": 1.4827586206896552,
"grad_norm": 1.738257177021954,
"learning_rate": 1.561502705732883e-06,
"loss": 0.0479,
"step": 559
},
{
"epoch": 1.4854111405835544,
"grad_norm": 1.365927651900338,
"learning_rate": 1.546408046350183e-06,
"loss": 0.0408,
"step": 560
},
{
"epoch": 1.4880636604774535,
"grad_norm": 1.5461936213104102,
"learning_rate": 1.5313733422493626e-06,
"loss": 0.0432,
"step": 561
},
{
"epoch": 1.490716180371353,
"grad_norm": 1.4928548013692324,
"learning_rate": 1.516398854436914e-06,
"loss": 0.04,
"step": 562
},
{
"epoch": 1.493368700265252,
"grad_norm": 1.640223366433278,
"learning_rate": 1.501484842873963e-06,
"loss": 0.0546,
"step": 563
},
{
"epoch": 1.4960212201591512,
"grad_norm": 1.7973098844829862,
"learning_rate": 1.486631566471745e-06,
"loss": 0.046,
"step": 564
},
{
"epoch": 1.4986737400530503,
"grad_norm": 1.6511015774830353,
"learning_rate": 1.4718392830871192e-06,
"loss": 0.0445,
"step": 565
},
{
"epoch": 1.5013262599469495,
"grad_norm": 2.1489351880145637,
"learning_rate": 1.457108249518089e-06,
"loss": 0.0626,
"step": 566
},
{
"epoch": 1.5039787798408488,
"grad_norm": 1.4283531671752674,
"learning_rate": 1.4424387214993457e-06,
"loss": 0.0388,
"step": 567
},
{
"epoch": 1.506631299734748,
"grad_norm": 1.9589195614651327,
"learning_rate": 1.4278309536978275e-06,
"loss": 0.0403,
"step": 568
},
{
"epoch": 1.509283819628647,
"grad_norm": 1.6259539938234473,
"learning_rate": 1.4132851997082969e-06,
"loss": 0.0453,
"step": 569
},
{
"epoch": 1.5119363395225465,
"grad_norm": 1.52635618170666,
"learning_rate": 1.3988017120489417e-06,
"loss": 0.0487,
"step": 570
},
{
"epoch": 1.5145888594164456,
"grad_norm": 1.166944034701237,
"learning_rate": 1.384380742156991e-06,
"loss": 0.0345,
"step": 571
},
{
"epoch": 1.5172413793103448,
"grad_norm": 1.4683349538414516,
"learning_rate": 1.370022540384347e-06,
"loss": 0.0456,
"step": 572
},
{
"epoch": 1.5198938992042441,
"grad_norm": 1.2722220574404348,
"learning_rate": 1.3557273559932372e-06,
"loss": 0.0321,
"step": 573
},
{
"epoch": 1.5225464190981433,
"grad_norm": 1.420343043420184,
"learning_rate": 1.3414954371518968e-06,
"loss": 0.0486,
"step": 574
},
{
"epoch": 1.5251989389920424,
"grad_norm": 1.613818956433735,
"learning_rate": 1.32732703093025e-06,
"loss": 0.0462,
"step": 575
},
{
"epoch": 1.5278514588859418,
"grad_norm": 1.5717020920513154,
"learning_rate": 1.3132223832956265e-06,
"loss": 0.0393,
"step": 576
},
{
"epoch": 1.530503978779841,
"grad_norm": 1.899833644607642,
"learning_rate": 1.2991817391084887e-06,
"loss": 0.0571,
"step": 577
},
{
"epoch": 1.53315649867374,
"grad_norm": 1.3467128729397928,
"learning_rate": 1.2852053421181826e-06,
"loss": 0.0436,
"step": 578
},
{
"epoch": 1.5358090185676394,
"grad_norm": 1.4684591070731103,
"learning_rate": 1.2712934349587063e-06,
"loss": 0.0409,
"step": 579
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.4659121730561506,
"learning_rate": 1.257446259144494e-06,
"loss": 0.0343,
"step": 580
},
{
"epoch": 1.5411140583554377,
"grad_norm": 1.2296857151247282,
"learning_rate": 1.2436640550662333e-06,
"loss": 0.0315,
"step": 581
},
{
"epoch": 1.5437665782493368,
"grad_norm": 1.5658367180498545,
"learning_rate": 1.2299470619866778e-06,
"loss": 0.0468,
"step": 582
},
{
"epoch": 1.546419098143236,
"grad_norm": 1.5447401683597295,
"learning_rate": 1.2162955180365033e-06,
"loss": 0.0406,
"step": 583
},
{
"epoch": 1.5490716180371353,
"grad_norm": 1.563584941111247,
"learning_rate": 1.2027096602101728e-06,
"loss": 0.0385,
"step": 584
},
{
"epoch": 1.5517241379310345,
"grad_norm": 1.6434334673624074,
"learning_rate": 1.1891897243618184e-06,
"loss": 0.0452,
"step": 585
},
{
"epoch": 1.5543766578249336,
"grad_norm": 1.4760334609742514,
"learning_rate": 1.1757359452011497e-06,
"loss": 0.0366,
"step": 586
},
{
"epoch": 1.557029177718833,
"grad_norm": 1.232360571349415,
"learning_rate": 1.1623485562893772e-06,
"loss": 0.0373,
"step": 587
},
{
"epoch": 1.5596816976127321,
"grad_norm": 1.2603274281429182,
"learning_rate": 1.1490277900351637e-06,
"loss": 0.0336,
"step": 588
},
{
"epoch": 1.5623342175066313,
"grad_norm": 1.4672611395225155,
"learning_rate": 1.1357738776905802e-06,
"loss": 0.0499,
"step": 589
},
{
"epoch": 1.5649867374005306,
"grad_norm": 1.3938518014710124,
"learning_rate": 1.1225870493470952e-06,
"loss": 0.0407,
"step": 590
},
{
"epoch": 1.5676392572944295,
"grad_norm": 1.5266979684533877,
"learning_rate": 1.1094675339315825e-06,
"loss": 0.0389,
"step": 591
},
{
"epoch": 1.570291777188329,
"grad_norm": 1.8368858591777315,
"learning_rate": 1.0964155592023483e-06,
"loss": 0.0353,
"step": 592
},
{
"epoch": 1.5729442970822283,
"grad_norm": 1.6847631793792286,
"learning_rate": 1.083431351745171e-06,
"loss": 0.0492,
"step": 593
},
{
"epoch": 1.5755968169761272,
"grad_norm": 1.457143006494126,
"learning_rate": 1.0705151369693712e-06,
"loss": 0.0373,
"step": 594
},
{
"epoch": 1.5782493368700266,
"grad_norm": 1.4537669375386875,
"learning_rate": 1.0576671391038996e-06,
"loss": 0.0404,
"step": 595
},
{
"epoch": 1.5809018567639257,
"grad_norm": 1.570259411391911,
"learning_rate": 1.0448875811934417e-06,
"loss": 0.0374,
"step": 596
},
{
"epoch": 1.5835543766578248,
"grad_norm": 1.4551622624887812,
"learning_rate": 1.0321766850945486e-06,
"loss": 0.0349,
"step": 597
},
{
"epoch": 1.5862068965517242,
"grad_norm": 1.320645141600928,
"learning_rate": 1.0195346714717813e-06,
"loss": 0.0365,
"step": 598
},
{
"epoch": 1.5888594164456233,
"grad_norm": 1.7279741174166037,
"learning_rate": 1.0069617597938869e-06,
"loss": 0.048,
"step": 599
},
{
"epoch": 1.5915119363395225,
"grad_norm": 1.3238694043973904,
"learning_rate": 9.944581683299804e-07,
"loss": 0.0402,
"step": 600
},
{
"epoch": 1.5915119363395225,
"eval_loss": 0.1134517639875412,
"eval_runtime": 1.2987,
"eval_samples_per_second": 23.87,
"eval_steps_per_second": 6.16,
"step": 600
},
{
"epoch": 1.5941644562334218,
"grad_norm": 1.3051093843669757,
"learning_rate": 9.82024114145761e-07,
"loss": 0.0382,
"step": 601
},
{
"epoch": 1.596816976127321,
"grad_norm": 1.162659102103195,
"learning_rate": 9.696598130997415e-07,
"loss": 0.036,
"step": 602
},
{
"epoch": 1.5994694960212201,
"grad_norm": 1.7546331982712047,
"learning_rate": 9.57365479839501e-07,
"loss": 0.0494,
"step": 603
},
{
"epoch": 1.6021220159151195,
"grad_norm": 1.3672793279747577,
"learning_rate": 9.45141327797961e-07,
"loss": 0.0322,
"step": 604
},
{
"epoch": 1.6047745358090184,
"grad_norm": 1.3661103077040575,
"learning_rate": 9.32987569189675e-07,
"loss": 0.0391,
"step": 605
},
{
"epoch": 1.6074270557029178,
"grad_norm": 1.5456527347090927,
"learning_rate": 9.209044150071522e-07,
"loss": 0.042,
"step": 606
},
{
"epoch": 1.6100795755968171,
"grad_norm": 1.6924497065293256,
"learning_rate": 9.088920750171876e-07,
"loss": 0.0415,
"step": 607
},
{
"epoch": 1.612732095490716,
"grad_norm": 1.74031693520605,
"learning_rate": 8.969507577572189e-07,
"loss": 0.0431,
"step": 608
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.551498689243845,
"learning_rate": 8.850806705317183e-07,
"loss": 0.0361,
"step": 609
},
{
"epoch": 1.6180371352785146,
"grad_norm": 1.542261921192749,
"learning_rate": 8.732820194085794e-07,
"loss": 0.0428,
"step": 610
},
{
"epoch": 1.6206896551724137,
"grad_norm": 1.78892779608987,
"learning_rate": 8.615550092155478e-07,
"loss": 0.0451,
"step": 611
},
{
"epoch": 1.623342175066313,
"grad_norm": 1.555805233678888,
"learning_rate": 8.498998435366634e-07,
"loss": 0.0365,
"step": 612
},
{
"epoch": 1.6259946949602122,
"grad_norm": 1.5060342455389297,
"learning_rate": 8.383167247087259e-07,
"loss": 0.04,
"step": 613
},
{
"epoch": 1.6286472148541113,
"grad_norm": 1.611093579148822,
"learning_rate": 8.268058538177826e-07,
"loss": 0.0375,
"step": 614
},
{
"epoch": 1.6312997347480107,
"grad_norm": 1.588047950817851,
"learning_rate": 8.15367430695636e-07,
"loss": 0.0456,
"step": 615
},
{
"epoch": 1.6339522546419099,
"grad_norm": 1.3816401812608983,
"learning_rate": 8.040016539163792e-07,
"loss": 0.0312,
"step": 616
},
{
"epoch": 1.636604774535809,
"grad_norm": 1.4853951274849326,
"learning_rate": 7.927087207929418e-07,
"loss": 0.0371,
"step": 617
},
{
"epoch": 1.6392572944297084,
"grad_norm": 1.4635552701452224,
"learning_rate": 7.814888273736698e-07,
"loss": 0.0398,
"step": 618
},
{
"epoch": 1.6419098143236073,
"grad_norm": 1.4395599081772887,
"learning_rate": 7.70342168438919e-07,
"loss": 0.0377,
"step": 619
},
{
"epoch": 1.6445623342175066,
"grad_norm": 1.4904004060637046,
"learning_rate": 7.592689374976769e-07,
"loss": 0.0422,
"step": 620
},
{
"epoch": 1.647214854111406,
"grad_norm": 1.3555033892449118,
"learning_rate": 7.482693267842e-07,
"loss": 0.0298,
"step": 621
},
{
"epoch": 1.649867374005305,
"grad_norm": 1.6737297709368,
"learning_rate": 7.373435272546764e-07,
"loss": 0.0426,
"step": 622
},
{
"epoch": 1.6525198938992043,
"grad_norm": 1.4564908618465116,
"learning_rate": 7.264917285839168e-07,
"loss": 0.0329,
"step": 623
},
{
"epoch": 1.6551724137931034,
"grad_norm": 1.2263290141969596,
"learning_rate": 7.157141191620548e-07,
"loss": 0.0405,
"step": 624
},
{
"epoch": 1.6578249336870026,
"grad_norm": 1.5061155245041684,
"learning_rate": 7.050108860912752e-07,
"loss": 0.0423,
"step": 625
},
{
"epoch": 1.660477453580902,
"grad_norm": 1.342531472718274,
"learning_rate": 6.943822151825735e-07,
"loss": 0.0381,
"step": 626
},
{
"epoch": 1.663129973474801,
"grad_norm": 1.529969425604032,
"learning_rate": 6.838282909525268e-07,
"loss": 0.0378,
"step": 627
},
{
"epoch": 1.6657824933687002,
"grad_norm": 1.4693891043473597,
"learning_rate": 6.733492966200872e-07,
"loss": 0.0409,
"step": 628
},
{
"epoch": 1.6684350132625996,
"grad_norm": 1.2856071654517978,
"learning_rate": 6.629454141034053e-07,
"loss": 0.0359,
"step": 629
},
{
"epoch": 1.6710875331564987,
"grad_norm": 1.261936660004472,
"learning_rate": 6.526168240166686e-07,
"loss": 0.0375,
"step": 630
},
{
"epoch": 1.6737400530503979,
"grad_norm": 1.9177656406918595,
"learning_rate": 6.423637056669702e-07,
"loss": 0.0528,
"step": 631
},
{
"epoch": 1.6763925729442972,
"grad_norm": 1.1739232814066218,
"learning_rate": 6.321862370511922e-07,
"loss": 0.0339,
"step": 632
},
{
"epoch": 1.6790450928381961,
"grad_norm": 1.5997908198741042,
"learning_rate": 6.220845948529159e-07,
"loss": 0.0365,
"step": 633
},
{
"epoch": 1.6816976127320955,
"grad_norm": 1.9896095612620952,
"learning_rate": 6.120589544393596e-07,
"loss": 0.0442,
"step": 634
},
{
"epoch": 1.6843501326259946,
"grad_norm": 1.4447133472783895,
"learning_rate": 6.021094898583269e-07,
"loss": 0.0417,
"step": 635
},
{
"epoch": 1.6870026525198938,
"grad_norm": 1.423453466191924,
"learning_rate": 5.922363738351888e-07,
"loss": 0.0416,
"step": 636
},
{
"epoch": 1.6896551724137931,
"grad_norm": 1.6412520906518056,
"learning_rate": 5.824397777698859e-07,
"loss": 0.0376,
"step": 637
},
{
"epoch": 1.6923076923076923,
"grad_norm": 1.3500786343856588,
"learning_rate": 5.727198717339511e-07,
"loss": 0.0365,
"step": 638
},
{
"epoch": 1.6949602122015914,
"grad_norm": 1.3281753235494516,
"learning_rate": 5.630768244675583e-07,
"loss": 0.0355,
"step": 639
},
{
"epoch": 1.6976127320954908,
"grad_norm": 1.5686317332318083,
"learning_rate": 5.535108033765913e-07,
"loss": 0.047,
"step": 640
},
{
"epoch": 1.70026525198939,
"grad_norm": 1.8508472340714566,
"learning_rate": 5.440219745297432e-07,
"loss": 0.049,
"step": 641
},
{
"epoch": 1.702917771883289,
"grad_norm": 1.6565736316956416,
"learning_rate": 5.346105026556226e-07,
"loss": 0.0397,
"step": 642
},
{
"epoch": 1.7055702917771884,
"grad_norm": 1.5711209136989794,
"learning_rate": 5.252765511399044e-07,
"loss": 0.0436,
"step": 643
},
{
"epoch": 1.7082228116710876,
"grad_norm": 1.5673409139078343,
"learning_rate": 5.160202820224875e-07,
"loss": 0.0441,
"step": 644
},
{
"epoch": 1.7108753315649867,
"grad_norm": 1.545089411507962,
"learning_rate": 5.068418559946864e-07,
"loss": 0.0348,
"step": 645
},
{
"epoch": 1.713527851458886,
"grad_norm": 1.432485291304248,
"learning_rate": 4.977414323964364e-07,
"loss": 0.0375,
"step": 646
},
{
"epoch": 1.716180371352785,
"grad_norm": 1.6399078665274962,
"learning_rate": 4.88719169213529e-07,
"loss": 0.039,
"step": 647
},
{
"epoch": 1.7188328912466844,
"grad_norm": 1.987599098491381,
"learning_rate": 4.797752230748721e-07,
"loss": 0.0515,
"step": 648
},
{
"epoch": 1.7214854111405835,
"grad_norm": 1.1831605370816431,
"learning_rate": 4.7090974924976716e-07,
"loss": 0.0313,
"step": 649
},
{
"epoch": 1.7241379310344827,
"grad_norm": 1.462425034427707,
"learning_rate": 4.6212290164521554e-07,
"loss": 0.0384,
"step": 650
},
{
"epoch": 1.726790450928382,
"grad_norm": 1.2550042615331936,
"learning_rate": 4.534148328032456e-07,
"loss": 0.0333,
"step": 651
},
{
"epoch": 1.7294429708222812,
"grad_norm": 2.006128939436255,
"learning_rate": 4.4478569389826864e-07,
"loss": 0.0362,
"step": 652
},
{
"epoch": 1.7320954907161803,
"grad_norm": 1.0400629902188836,
"learning_rate": 4.3623563473444817e-07,
"loss": 0.0389,
"step": 653
},
{
"epoch": 1.7347480106100797,
"grad_norm": 1.40192347825387,
"learning_rate": 4.277648037430998e-07,
"loss": 0.039,
"step": 654
},
{
"epoch": 1.7374005305039788,
"grad_norm": 1.183366553658279,
"learning_rate": 4.193733479801232e-07,
"loss": 0.0288,
"step": 655
},
{
"epoch": 1.740053050397878,
"grad_norm": 1.3787704383311228,
"learning_rate": 4.110614131234375e-07,
"loss": 0.0391,
"step": 656
},
{
"epoch": 1.7427055702917773,
"grad_norm": 1.5234821215536667,
"learning_rate": 4.028291434704601e-07,
"loss": 0.0402,
"step": 657
},
{
"epoch": 1.7453580901856764,
"grad_norm": 1.7063010359505684,
"learning_rate": 3.946766819355985e-07,
"loss": 0.0477,
"step": 658
},
{
"epoch": 1.7480106100795756,
"grad_norm": 1.6813393539550572,
"learning_rate": 3.866041700477691e-07,
"loss": 0.0451,
"step": 659
},
{
"epoch": 1.750663129973475,
"grad_norm": 1.7013026951961887,
"learning_rate": 3.786117479479423e-07,
"loss": 0.0375,
"step": 660
},
{
"epoch": 1.7533156498673739,
"grad_norm": 1.335932666043299,
"learning_rate": 3.7069955438670704e-07,
"loss": 0.0365,
"step": 661
},
{
"epoch": 1.7559681697612732,
"grad_norm": 1.3147540821216455,
"learning_rate": 3.62867726721865e-07,
"loss": 0.033,
"step": 662
},
{
"epoch": 1.7586206896551724,
"grad_norm": 1.6551386695113186,
"learning_rate": 3.5511640091604293e-07,
"loss": 0.0443,
"step": 663
},
{
"epoch": 1.7612732095490715,
"grad_norm": 1.2796382269681643,
"learning_rate": 3.474457115343344e-07,
"loss": 0.0341,
"step": 664
},
{
"epoch": 1.7639257294429709,
"grad_norm": 1.4215576260782057,
"learning_rate": 3.398557917419626e-07,
"loss": 0.0353,
"step": 665
},
{
"epoch": 1.76657824933687,
"grad_norm": 1.701821067856708,
"learning_rate": 3.3234677330196865e-07,
"loss": 0.0451,
"step": 666
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.182220074597278,
"learning_rate": 3.2491878657292643e-07,
"loss": 0.0366,
"step": 667
},
{
"epoch": 1.7718832891246685,
"grad_norm": 1.6450299221574771,
"learning_rate": 3.175719605066746e-07,
"loss": 0.0405,
"step": 668
},
{
"epoch": 1.7745358090185677,
"grad_norm": 1.9308466623075535,
"learning_rate": 3.1030642264608393e-07,
"loss": 0.0425,
"step": 669
},
{
"epoch": 1.7771883289124668,
"grad_norm": 2.0326661438505202,
"learning_rate": 3.0312229912283884e-07,
"loss": 0.048,
"step": 670
},
{
"epoch": 1.7798408488063662,
"grad_norm": 1.576985336913738,
"learning_rate": 2.96019714655249e-07,
"loss": 0.0385,
"step": 671
},
{
"epoch": 1.782493368700265,
"grad_norm": 1.4496388975425247,
"learning_rate": 2.88998792546083e-07,
"loss": 0.0366,
"step": 672
},
{
"epoch": 1.7851458885941645,
"grad_norm": 1.3474637091777282,
"learning_rate": 2.820596546804316e-07,
"loss": 0.0445,
"step": 673
},
{
"epoch": 1.7877984084880638,
"grad_norm": 1.6312145174464268,
"learning_rate": 2.7520242152358767e-07,
"loss": 0.047,
"step": 674
},
{
"epoch": 1.7904509283819627,
"grad_norm": 1.1102230652695808,
"learning_rate": 2.6842721211895516e-07,
"loss": 0.0309,
"step": 675
},
{
"epoch": 1.793103448275862,
"grad_norm": 1.6712723138079337,
"learning_rate": 2.617341440859883e-07,
"loss": 0.0391,
"step": 676
},
{
"epoch": 1.7957559681697612,
"grad_norm": 1.4567044509094966,
"learning_rate": 2.551233336181386e-07,
"loss": 0.0368,
"step": 677
},
{
"epoch": 1.7984084880636604,
"grad_norm": 1.4754171869798691,
"learning_rate": 2.485948954808493e-07,
"loss": 0.0386,
"step": 678
},
{
"epoch": 1.8010610079575597,
"grad_norm": 1.2965369200067902,
"learning_rate": 2.421489430095547e-07,
"loss": 0.0272,
"step": 679
},
{
"epoch": 1.8037135278514589,
"grad_norm": 1.4954201899471793,
"learning_rate": 2.357855881077181e-07,
"loss": 0.0436,
"step": 680
},
{
"epoch": 1.806366047745358,
"grad_norm": 1.081445215280861,
"learning_rate": 2.2950494124488687e-07,
"loss": 0.0355,
"step": 681
},
{
"epoch": 1.8090185676392574,
"grad_norm": 1.319457308590872,
"learning_rate": 2.2330711145477247e-07,
"loss": 0.0301,
"step": 682
},
{
"epoch": 1.8116710875331565,
"grad_norm": 1.5878396866106896,
"learning_rate": 2.1719220633336147e-07,
"loss": 0.0335,
"step": 683
},
{
"epoch": 1.8143236074270557,
"grad_norm": 1.4109191318255065,
"learning_rate": 2.1116033203704534e-07,
"loss": 0.0321,
"step": 684
},
{
"epoch": 1.816976127320955,
"grad_norm": 1.7498102959049076,
"learning_rate": 2.0521159328077856e-07,
"loss": 0.037,
"step": 685
},
{
"epoch": 1.819628647214854,
"grad_norm": 1.4473167506873907,
"learning_rate": 1.993460933362601e-07,
"loss": 0.0364,
"step": 686
},
{
"epoch": 1.8222811671087533,
"grad_norm": 1.2905934999941218,
"learning_rate": 1.935639340301415e-07,
"loss": 0.0304,
"step": 687
},
{
"epoch": 1.8249336870026527,
"grad_norm": 1.6396632866378924,
"learning_rate": 1.8786521574225837e-07,
"loss": 0.0492,
"step": 688
},
{
"epoch": 1.8275862068965516,
"grad_norm": 1.4628611231715085,
"learning_rate": 1.8225003740388546e-07,
"loss": 0.0384,
"step": 689
},
{
"epoch": 1.830238726790451,
"grad_norm": 1.4318855930579866,
"learning_rate": 1.7671849649602502e-07,
"loss": 0.0363,
"step": 690
},
{
"epoch": 1.83289124668435,
"grad_norm": 1.1555536281682646,
"learning_rate": 1.7127068904770948e-07,
"loss": 0.0316,
"step": 691
},
{
"epoch": 1.8355437665782492,
"grad_norm": 1.276197266535223,
"learning_rate": 1.6590670963433642e-07,
"loss": 0.0322,
"step": 692
},
{
"epoch": 1.8381962864721486,
"grad_norm": 1.540221628307776,
"learning_rate": 1.6062665137602572e-07,
"loss": 0.039,
"step": 693
},
{
"epoch": 1.8408488063660478,
"grad_norm": 1.488141776444817,
"learning_rate": 1.5543060593600334e-07,
"loss": 0.0497,
"step": 694
},
{
"epoch": 1.843501326259947,
"grad_norm": 1.4064075758646781,
"learning_rate": 1.5031866351901182e-07,
"loss": 0.0354,
"step": 695
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.7289030418499938,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.047,
"step": 696
},
{
"epoch": 1.8488063660477454,
"grad_norm": 1.4042197094833593,
"learning_rate": 1.403474412712874e-07,
"loss": 0.0352,
"step": 697
},
{
"epoch": 1.8514588859416445,
"grad_norm": 1.0019472105701877,
"learning_rate": 1.3548833454364641e-07,
"loss": 0.024,
"step": 698
},
{
"epoch": 1.854111405835544,
"grad_norm": 1.773377229970334,
"learning_rate": 1.3071367704221129e-07,
"loss": 0.0444,
"step": 699
},
{
"epoch": 1.8567639257294428,
"grad_norm": 1.1177532519030482,
"learning_rate": 1.260235516563163e-07,
"loss": 0.0363,
"step": 700
},
{ |
|
"epoch": 1.8594164456233422, |
|
"grad_norm": 1.2900623934609892, |
|
"learning_rate": 1.2141803980779464e-07, |
|
"loss": 0.0317, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 1.8620689655172413, |
|
"grad_norm": 1.8304735271621675, |
|
"learning_rate": 1.1689722144956672e-07, |
|
"loss": 0.0432, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.8647214854111405, |
|
"grad_norm": 1.4188940996465451, |
|
"learning_rate": 1.1246117506425014e-07, |
|
"loss": 0.0392, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.8673740053050398, |
|
"grad_norm": 1.4825923835504944, |
|
"learning_rate": 1.0810997766279974e-07, |
|
"loss": 0.0382, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.870026525198939, |
|
"grad_norm": 1.1014643278241296, |
|
"learning_rate": 1.0384370478316919e-07, |
|
"loss": 0.0311, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.8726790450928381, |
|
"grad_norm": 1.3678036067136727, |
|
"learning_rate": 9.966243048899704e-08, |
|
"loss": 0.0331, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.8753315649867375, |
|
"grad_norm": 1.673390021703709, |
|
"learning_rate": 9.556622736832665e-08, |
|
"loss": 0.0481, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 1.8779840848806366, |
|
"grad_norm": 1.5656233630923257, |
|
"learning_rate": 9.155516653234276e-08, |
|
"loss": 0.0442, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.8806366047745358, |
|
"grad_norm": 1.320780746379846, |
|
"learning_rate": 8.762931761413573e-08, |
|
"loss": 0.0382, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 1.8832891246684351, |
|
"grad_norm": 1.8359864667941603, |
|
"learning_rate": 8.378874876749433e-08, |
|
"loss": 0.0369, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.8859416445623343, |
|
"grad_norm": 1.6235154801266736, |
|
"learning_rate": 8.003352666572428e-08, |
|
"loss": 0.0404, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 1.8885941644562334, |
|
"grad_norm": 1.8357177886604772, |
|
"learning_rate": 7.636371650048658e-08, |
|
"loss": 0.0458, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.8912466843501328, |
|
"grad_norm": 1.4570512705105305, |
|
"learning_rate": 7.277938198066992e-08, |
|
"loss": 0.0418, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 1.8938992042440317, |
|
"grad_norm": 1.2604162586852787, |
|
"learning_rate": 6.928058533128112e-08, |
|
"loss": 0.0323, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 1.6009023727823846, |
|
"learning_rate": 6.58673872923693e-08, |
|
"loss": 0.0359, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.8992042440318302, |
|
"grad_norm": 1.2976436667625482, |
|
"learning_rate": 6.253984711796612e-08, |
|
"loss": 0.0371, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.9018567639257293, |
|
"grad_norm": 1.2870991478173943, |
|
"learning_rate": 5.929802257506112e-08, |
|
"loss": 0.045, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 1.9045092838196287, |
|
"grad_norm": 1.6902715191719975, |
|
"learning_rate": 5.6141969942596906e-08, |
|
"loss": 0.0389, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.9071618037135278, |
|
"grad_norm": 1.7050325681735479, |
|
"learning_rate": 5.307174401049275e-08, |
|
"loss": 0.0334, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 1.909814323607427, |
|
"grad_norm": 1.5808736002449604, |
|
"learning_rate": 5.0087398078694785e-08, |
|
"loss": 0.0417, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9124668435013263, |
|
"grad_norm": 1.3430964938486658, |
|
"learning_rate": 4.718898395624671e-08, |
|
"loss": 0.0339, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 1.9151193633952255, |
|
"grad_norm": 1.489257458379661, |
|
"learning_rate": 4.437655196039559e-08, |
|
"loss": 0.0429, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.9177718832891246, |
|
"grad_norm": 1.4210307447912756, |
|
"learning_rate": 4.1650150915714674e-08, |
|
"loss": 0.0421, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 1.920424403183024, |
|
"grad_norm": 1.2959156290024842, |
|
"learning_rate": 3.900982815325582e-08, |
|
"loss": 0.0355, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.4827766464308814, |
|
"learning_rate": 3.645562950973014e-08, |
|
"loss": 0.0369, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.9257294429708223, |
|
"grad_norm": 1.572959594152032, |
|
"learning_rate": 3.3987599326710806e-08, |
|
"loss": 0.0523, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.9283819628647216, |
|
"grad_norm": 1.519959930918152, |
|
"learning_rate": 3.160578044986373e-08, |
|
"loss": 0.0354, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 1.9310344827586206, |
|
"grad_norm": 1.2687605705293235, |
|
"learning_rate": 2.9310214228202016e-08, |
|
"loss": 0.0367, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.93368700265252, |
|
"grad_norm": 1.2589757041102903, |
|
"learning_rate": 2.7100940513370976e-08, |
|
"loss": 0.037, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 1.936339522546419, |
|
"grad_norm": 1.3802142564158149, |
|
"learning_rate": 2.4977997658954257e-08, |
|
"loss": 0.0384, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9389920424403182, |
|
"grad_norm": 1.1728085570996578, |
|
"learning_rate": 2.29414225198088e-08, |
|
"loss": 0.0332, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 1.9416445623342176, |
|
"grad_norm": 1.4829028934918602, |
|
"learning_rate": 2.0991250451424806e-08, |
|
"loss": 0.0382, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.9442970822281167, |
|
"grad_norm": 1.0525766488908528, |
|
"learning_rate": 1.912751530931234e-08, |
|
"loss": 0.0307, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.9469496021220158, |
|
"grad_norm": 1.7747299163019683, |
|
"learning_rate": 1.735024944841235e-08, |
|
"loss": 0.0412, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.9496021220159152, |
|
"grad_norm": 1.3308814219146978, |
|
"learning_rate": 1.5659483722537117e-08, |
|
"loss": 0.0332, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.9522546419098143, |
|
"grad_norm": 1.6064157720427565, |
|
"learning_rate": 1.4055247483832356e-08, |
|
"loss": 0.0434, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.9549071618037135, |
|
"grad_norm": 1.334355441410993, |
|
"learning_rate": 1.2537568582269289e-08, |
|
"loss": 0.0407, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 1.9575596816976129, |
|
"grad_norm": 1.8576856489215392, |
|
"learning_rate": 1.110647336516002e-08, |
|
"loss": 0.041, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.9602122015915118, |
|
"grad_norm": 1.3915797639255931, |
|
"learning_rate": 9.761986676701251e-09, |
|
"loss": 0.0338, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 1.9628647214854111, |
|
"grad_norm": 1.696616311407856, |
|
"learning_rate": 8.504131857542952e-09, |
|
"loss": 0.0438, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9655172413793105, |
|
"grad_norm": 1.2509363269884906, |
|
"learning_rate": 7.332930744380906e-09, |
|
"loss": 0.0359, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 1.9681697612732094, |
|
"grad_norm": 1.5867898086364787, |
|
"learning_rate": 6.24840366958035e-09, |
|
"loss": 0.0337, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.9708222811671088, |
|
"grad_norm": 1.0944506086655037, |
|
"learning_rate": 5.250569460822363e-09, |
|
"loss": 0.0304, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 1.973474801061008, |
|
"grad_norm": 1.6628593056793242, |
|
"learning_rate": 4.339445440776358e-09, |
|
"loss": 0.0377, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.976127320954907, |
|
"grad_norm": 1.7268640463935314, |
|
"learning_rate": 3.5150474267992007e-09, |
|
"loss": 0.0435, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.9787798408488064, |
|
"grad_norm": 1.469911347318018, |
|
"learning_rate": 2.7773897306615504e-09, |
|
"loss": 0.0377, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.9814323607427056, |
|
"grad_norm": 1.420365368396037, |
|
"learning_rate": 2.126485158298608e-09, |
|
"loss": 0.0384, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 1.9840848806366047, |
|
"grad_norm": 1.1547923580578376, |
|
"learning_rate": 1.5623450095880731e-09, |
|
"loss": 0.0289, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.986737400530504, |
|
"grad_norm": 1.7325651456934643, |
|
"learning_rate": 1.0849790781541913e-09, |
|
"loss": 0.0369, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 1.9893899204244032, |
|
"grad_norm": 1.6534729465614353, |
|
"learning_rate": 6.943956511973326e-10, |
|
"loss": 0.0513, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.9920424403183024, |
|
"grad_norm": 1.2257336512995223, |
|
"learning_rate": 3.9060150935077425e-10, |
|
"loss": 0.0341, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 1.9946949602122017, |
|
"grad_norm": 1.5013042404201287, |
|
"learning_rate": 1.7360192656246112e-10, |
|
"loss": 0.0428, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.9973474801061006, |
|
"grad_norm": 1.5234604261794584, |
|
"learning_rate": 4.340067000230264e-11, |
|
"loss": 0.0356, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.1204706677910448, |
|
"learning_rate": 0.0, |
|
"loss": 0.0257, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 754, |
|
"total_flos": 7513536872448.0, |
|
"train_loss": 0.09835794550769367, |
|
"train_runtime": 628.4293, |
|
"train_samples_per_second": 9.589, |
|
"train_steps_per_second": 1.2 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 754, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7513536872448.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|