{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998003992015968, "eval_steps": 500, "global_step": 3567, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008404244143292362, "grad_norm": 0.7383340001106262, "learning_rate": 2e-05, "loss": 2.4066, "step": 10 }, { "epoch": 0.016808488286584725, "grad_norm": 0.46394309401512146, "learning_rate": 4e-05, "loss": 2.0375, "step": 20 }, { "epoch": 0.025212732429877087, "grad_norm": 0.4739478528499603, "learning_rate": 6e-05, "loss": 1.5044, "step": 30 }, { "epoch": 0.03361697657316945, "grad_norm": 0.20930196344852448, "learning_rate": 8e-05, "loss": 0.8704, "step": 40 }, { "epoch": 0.04202122071646181, "grad_norm": 0.15288038551807404, "learning_rate": 0.0001, "loss": 0.6533, "step": 50 }, { "epoch": 0.050425464859754174, "grad_norm": 0.13073962926864624, "learning_rate": 0.00012, "loss": 0.586, "step": 60 }, { "epoch": 0.058829709003046536, "grad_norm": 0.14555367827415466, "learning_rate": 0.00014, "loss": 0.5793, "step": 70 }, { "epoch": 0.0672339531463389, "grad_norm": 0.12397414445877075, "learning_rate": 0.00016, "loss": 0.581, "step": 80 }, { "epoch": 0.07563819728963127, "grad_norm": 0.13021130859851837, "learning_rate": 0.00018, "loss": 0.5512, "step": 90 }, { "epoch": 0.08404244143292362, "grad_norm": 0.13012883067131042, "learning_rate": 0.0002, "loss": 0.5403, "step": 100 }, { "epoch": 0.09244668557621599, "grad_norm": 0.11942347884178162, "learning_rate": 0.00019942313239111625, "loss": 0.5247, "step": 110 }, { "epoch": 0.10085092971950835, "grad_norm": 0.11690942198038101, "learning_rate": 0.0001988462647822325, "loss": 0.5417, "step": 120 }, { "epoch": 0.10925517386280072, "grad_norm": 0.1355101615190506, "learning_rate": 0.00019826939717334873, "loss": 0.5273, "step": 130 }, { "epoch": 0.11765941800609307, "grad_norm": 0.1345665603876114, "learning_rate": 0.00019769252956446497, "loss": 0.5243, "step": 140 }, { "epoch": 0.12606366214938544, "grad_norm": 0.12515193223953247, "learning_rate": 0.0001971156619555812, "loss": 0.5344, "step": 150 }, { "epoch": 0.1344679062926778, "grad_norm": 0.15686553716659546, "learning_rate": 0.00019653879434669745, "loss": 0.5118, "step": 160 }, { "epoch": 0.14287215043597015, "grad_norm": 0.12068944424390793, "learning_rate": 0.0001959619267378137, "loss": 0.4979, "step": 170 }, { "epoch": 0.15127639457926254, "grad_norm": 0.13319459557533264, "learning_rate": 0.00019538505912892993, "loss": 0.503, "step": 180 }, { "epoch": 0.1596806387225549, "grad_norm": 0.11806949228048325, "learning_rate": 0.00019480819152004617, "loss": 0.49, "step": 190 }, { "epoch": 0.16808488286584725, "grad_norm": 0.12932075560092926, "learning_rate": 0.00019423132391116238, "loss": 0.514, "step": 200 }, { "epoch": 0.17648912700913963, "grad_norm": 0.11743929982185364, "learning_rate": 0.00019365445630227862, "loss": 0.4788, "step": 210 }, { "epoch": 0.18489337115243198, "grad_norm": 0.11788313835859299, "learning_rate": 0.00019307758869339486, "loss": 0.4891, "step": 220 }, { "epoch": 0.19329761529572434, "grad_norm": 0.11414741724729538, "learning_rate": 0.0001925007210845111, "loss": 0.5033, "step": 230 }, { "epoch": 0.2017018594390167, "grad_norm": 0.11419043689966202, "learning_rate": 0.00019192385347562737, "loss": 0.4844, "step": 240 }, { "epoch": 0.21010610358230908, "grad_norm": 0.12788020074367523, "learning_rate": 0.0001913469858667436, "loss": 0.4697, "step": 250 }, { "epoch": 0.21851034772560143, "grad_norm": 
0.13661302626132965, "learning_rate": 0.00019077011825785982, "loss": 0.4627, "step": 260 }, { "epoch": 0.2269145918688938, "grad_norm": 0.12041325867176056, "learning_rate": 0.00019019325064897606, "loss": 0.4964, "step": 270 }, { "epoch": 0.23531883601218614, "grad_norm": 0.133742094039917, "learning_rate": 0.0001896163830400923, "loss": 0.4658, "step": 280 }, { "epoch": 0.24372308015547853, "grad_norm": 0.1261977106332779, "learning_rate": 0.00018903951543120854, "loss": 0.4781, "step": 290 }, { "epoch": 0.2521273242987709, "grad_norm": 0.130150705575943, "learning_rate": 0.00018846264782232478, "loss": 0.4922, "step": 300 }, { "epoch": 0.26053156844206327, "grad_norm": 0.13174410164356232, "learning_rate": 0.00018788578021344102, "loss": 0.4559, "step": 310 }, { "epoch": 0.2689358125853556, "grad_norm": 0.1186077669262886, "learning_rate": 0.00018730891260455726, "loss": 0.4722, "step": 320 }, { "epoch": 0.277340056728648, "grad_norm": 0.116569384932518, "learning_rate": 0.0001867320449956735, "loss": 0.4457, "step": 330 }, { "epoch": 0.2857443008719403, "grad_norm": 0.12219471484422684, "learning_rate": 0.00018615517738678974, "loss": 0.4849, "step": 340 }, { "epoch": 0.2941485450152327, "grad_norm": 0.12746909260749817, "learning_rate": 0.00018557830977790598, "loss": 0.4821, "step": 350 }, { "epoch": 0.30255278915852507, "grad_norm": 0.14125944674015045, "learning_rate": 0.00018500144216902222, "loss": 0.4605, "step": 360 }, { "epoch": 0.3109570333018174, "grad_norm": 0.19157269597053528, "learning_rate": 0.00018442457456013846, "loss": 0.4541, "step": 370 }, { "epoch": 0.3193612774451098, "grad_norm": 0.12603330612182617, "learning_rate": 0.0001838477069512547, "loss": 0.4536, "step": 380 }, { "epoch": 0.32776552158840216, "grad_norm": 0.12653909623622894, "learning_rate": 0.00018327083934237091, "loss": 0.4468, "step": 390 }, { "epoch": 0.3361697657316945, "grad_norm": 0.15930472314357758, "learning_rate": 0.00018269397173348718, "loss": 0.4542, "step": 400 }, { "epoch": 0.3445740098749869, "grad_norm": 0.13266988098621368, "learning_rate": 0.00018211710412460342, "loss": 0.4335, "step": 410 }, { "epoch": 0.35297825401827926, "grad_norm": 0.12103667855262756, "learning_rate": 0.00018154023651571966, "loss": 0.4575, "step": 420 }, { "epoch": 0.3613824981615716, "grad_norm": 0.14439740777015686, "learning_rate": 0.0001809633689068359, "loss": 0.4377, "step": 430 }, { "epoch": 0.36978674230486397, "grad_norm": 0.12652407586574554, "learning_rate": 0.00018038650129795214, "loss": 0.4363, "step": 440 }, { "epoch": 0.3781909864481563, "grad_norm": 0.14594405889511108, "learning_rate": 0.00017980963368906835, "loss": 0.4306, "step": 450 }, { "epoch": 0.3865952305914487, "grad_norm": 0.12562687695026398, "learning_rate": 0.0001792327660801846, "loss": 0.4501, "step": 460 }, { "epoch": 0.39499947473474106, "grad_norm": 0.14584492146968842, "learning_rate": 0.00017865589847130083, "loss": 0.4509, "step": 470 }, { "epoch": 0.4034037188780334, "grad_norm": 0.13192500174045563, "learning_rate": 0.00017807903086241707, "loss": 0.4505, "step": 480 }, { "epoch": 0.4118079630213258, "grad_norm": 0.14266645908355713, "learning_rate": 0.00017750216325353331, "loss": 0.4585, "step": 490 }, { "epoch": 0.42021220716461816, "grad_norm": 0.1400412619113922, "learning_rate": 0.00017692529564464958, "loss": 0.4365, "step": 500 }, { "epoch": 0.4286164513079105, "grad_norm": 0.14728468656539917, "learning_rate": 0.0001763484280357658, "loss": 0.4303, "step": 510 }, { "epoch": 0.43702069545120287, 
"grad_norm": 0.15791365504264832, "learning_rate": 0.00017577156042688203, "loss": 0.4407, "step": 520 }, { "epoch": 0.4454249395944952, "grad_norm": 0.15447258949279785, "learning_rate": 0.00017519469281799827, "loss": 0.4365, "step": 530 }, { "epoch": 0.4538291837377876, "grad_norm": 0.1518252044916153, "learning_rate": 0.00017461782520911451, "loss": 0.4305, "step": 540 }, { "epoch": 0.46223342788107996, "grad_norm": 0.1154065877199173, "learning_rate": 0.00017404095760023075, "loss": 0.4212, "step": 550 }, { "epoch": 0.4706376720243723, "grad_norm": 0.12900012731552124, "learning_rate": 0.000173464089991347, "loss": 0.4277, "step": 560 }, { "epoch": 0.47904191616766467, "grad_norm": 0.1349458247423172, "learning_rate": 0.00017288722238246323, "loss": 0.4051, "step": 570 }, { "epoch": 0.48744616031095706, "grad_norm": 0.16337165236473083, "learning_rate": 0.00017231035477357947, "loss": 0.407, "step": 580 }, { "epoch": 0.4958504044542494, "grad_norm": 0.13420593738555908, "learning_rate": 0.0001717334871646957, "loss": 0.4138, "step": 590 }, { "epoch": 0.5042546485975418, "grad_norm": 0.13840581476688385, "learning_rate": 0.00017115661955581195, "loss": 0.4099, "step": 600 }, { "epoch": 0.5126588927408341, "grad_norm": 0.1378021389245987, "learning_rate": 0.0001705797519469282, "loss": 0.4254, "step": 610 }, { "epoch": 0.5210631368841265, "grad_norm": 0.1607150137424469, "learning_rate": 0.00017000288433804443, "loss": 0.4353, "step": 620 }, { "epoch": 0.5294673810274189, "grad_norm": 0.13462169468402863, "learning_rate": 0.00016942601672916067, "loss": 0.4267, "step": 630 }, { "epoch": 0.5378716251707112, "grad_norm": 0.14311543107032776, "learning_rate": 0.00016884914912027689, "loss": 0.4301, "step": 640 }, { "epoch": 0.5462758693140036, "grad_norm": 0.15559442341327667, "learning_rate": 0.00016827228151139313, "loss": 0.4102, "step": 650 }, { "epoch": 0.554680113457296, "grad_norm": 0.15557149052619934, "learning_rate": 0.00016769541390250937, "loss": 0.4136, "step": 660 }, { "epoch": 0.5630843576005883, "grad_norm": 0.135511115193367, "learning_rate": 0.00016711854629362563, "loss": 0.4153, "step": 670 }, { "epoch": 0.5714886017438806, "grad_norm": 0.13760776817798615, "learning_rate": 0.00016654167868474187, "loss": 0.4145, "step": 680 }, { "epoch": 0.579892845887173, "grad_norm": 0.14971590042114258, "learning_rate": 0.0001659648110758581, "loss": 0.3875, "step": 690 }, { "epoch": 0.5882970900304654, "grad_norm": 0.16005663573741913, "learning_rate": 0.00016538794346697433, "loss": 0.3938, "step": 700 }, { "epoch": 0.5967013341737577, "grad_norm": 0.1625218689441681, "learning_rate": 0.00016481107585809057, "loss": 0.3871, "step": 710 }, { "epoch": 0.6051055783170501, "grad_norm": 0.17047689855098724, "learning_rate": 0.0001642342082492068, "loss": 0.412, "step": 720 }, { "epoch": 0.6135098224603425, "grad_norm": 0.13825903832912445, "learning_rate": 0.00016365734064032305, "loss": 0.3948, "step": 730 }, { "epoch": 0.6219140666036348, "grad_norm": 0.14830929040908813, "learning_rate": 0.00016308047303143929, "loss": 0.3927, "step": 740 }, { "epoch": 0.6303183107469272, "grad_norm": 0.13950933516025543, "learning_rate": 0.00016250360542255553, "loss": 0.4051, "step": 750 }, { "epoch": 0.6387225548902196, "grad_norm": 0.15511371195316315, "learning_rate": 0.0001619267378136718, "loss": 0.4041, "step": 760 }, { "epoch": 0.6471267990335119, "grad_norm": 0.14828190207481384, "learning_rate": 0.000161349870204788, "loss": 0.3824, "step": 770 }, { "epoch": 0.6555310431768043, 
"grad_norm": 0.144051194190979, "learning_rate": 0.00016077300259590425, "loss": 0.3829, "step": 780 }, { "epoch": 0.6639352873200967, "grad_norm": 0.14780694246292114, "learning_rate": 0.00016019613498702049, "loss": 0.3814, "step": 790 }, { "epoch": 0.672339531463389, "grad_norm": 0.15042325854301453, "learning_rate": 0.00015961926737813673, "loss": 0.3962, "step": 800 }, { "epoch": 0.6807437756066814, "grad_norm": 0.16325107216835022, "learning_rate": 0.00015904239976925297, "loss": 0.3801, "step": 810 }, { "epoch": 0.6891480197499738, "grad_norm": 0.14843328297138214, "learning_rate": 0.0001584655321603692, "loss": 0.4082, "step": 820 }, { "epoch": 0.6975522638932661, "grad_norm": 0.16731064021587372, "learning_rate": 0.00015788866455148545, "loss": 0.4192, "step": 830 }, { "epoch": 0.7059565080365585, "grad_norm": 0.18703435361385345, "learning_rate": 0.00015731179694260169, "loss": 0.4009, "step": 840 }, { "epoch": 0.7143607521798508, "grad_norm": 0.13935630023479462, "learning_rate": 0.00015673492933371793, "loss": 0.3618, "step": 850 }, { "epoch": 0.7227649963231432, "grad_norm": 0.13263636827468872, "learning_rate": 0.00015615806172483417, "loss": 0.3963, "step": 860 }, { "epoch": 0.7311692404664355, "grad_norm": 0.14940643310546875, "learning_rate": 0.0001555811941159504, "loss": 0.3585, "step": 870 }, { "epoch": 0.7395734846097279, "grad_norm": 0.14807912707328796, "learning_rate": 0.00015500432650706665, "loss": 0.3748, "step": 880 }, { "epoch": 0.7479777287530203, "grad_norm": 0.15254080295562744, "learning_rate": 0.00015442745889818286, "loss": 0.3718, "step": 890 }, { "epoch": 0.7563819728963126, "grad_norm": 0.16590768098831177, "learning_rate": 0.0001538505912892991, "loss": 0.386, "step": 900 }, { "epoch": 0.764786217039605, "grad_norm": 0.15733902156352997, "learning_rate": 0.00015327372368041534, "loss": 0.3756, "step": 910 }, { "epoch": 0.7731904611828974, "grad_norm": 0.13757385313510895, "learning_rate": 0.00015269685607153158, "loss": 0.3843, "step": 920 }, { "epoch": 0.7815947053261897, "grad_norm": 0.14952607452869415, "learning_rate": 0.00015211998846264784, "loss": 0.3634, "step": 930 }, { "epoch": 0.7899989494694821, "grad_norm": 0.1516282558441162, "learning_rate": 0.00015154312085376408, "loss": 0.3798, "step": 940 }, { "epoch": 0.7984031936127745, "grad_norm": 0.17785628139972687, "learning_rate": 0.00015096625324488032, "loss": 0.3681, "step": 950 }, { "epoch": 0.8068074377560668, "grad_norm": 0.171351820230484, "learning_rate": 0.00015038938563599654, "loss": 0.3686, "step": 960 }, { "epoch": 0.8152116818993592, "grad_norm": 0.1742231398820877, "learning_rate": 0.00014981251802711278, "loss": 0.3792, "step": 970 }, { "epoch": 0.8236159260426515, "grad_norm": 0.16650599241256714, "learning_rate": 0.00014923565041822902, "loss": 0.3577, "step": 980 }, { "epoch": 0.8320201701859439, "grad_norm": 0.1497887670993805, "learning_rate": 0.00014865878280934526, "loss": 0.3553, "step": 990 }, { "epoch": 0.8404244143292363, "grad_norm": 0.14781557023525238, "learning_rate": 0.0001480819152004615, "loss": 0.3538, "step": 1000 }, { "epoch": 0.8488286584725286, "grad_norm": 0.15724751353263855, "learning_rate": 0.00014750504759157774, "loss": 0.3597, "step": 1010 }, { "epoch": 0.857232902615821, "grad_norm": 0.18635571002960205, "learning_rate": 0.00014692817998269398, "loss": 0.3615, "step": 1020 }, { "epoch": 0.8656371467591134, "grad_norm": 0.17742526531219482, "learning_rate": 0.00014635131237381022, "loss": 0.348, "step": 1030 }, { "epoch": 0.8740413909024057, 
"grad_norm": 0.20535768568515778, "learning_rate": 0.00014577444476492646, "loss": 0.3343, "step": 1040 }, { "epoch": 0.8824456350456981, "grad_norm": 0.18968522548675537, "learning_rate": 0.0001451975771560427, "loss": 0.3615, "step": 1050 }, { "epoch": 0.8908498791889904, "grad_norm": 0.1528492122888565, "learning_rate": 0.00014462070954715894, "loss": 0.3786, "step": 1060 }, { "epoch": 0.8992541233322828, "grad_norm": 0.15841075778007507, "learning_rate": 0.00014404384193827518, "loss": 0.3761, "step": 1070 }, { "epoch": 0.9076583674755752, "grad_norm": 0.15167982876300812, "learning_rate": 0.0001434669743293914, "loss": 0.3528, "step": 1080 }, { "epoch": 0.9160626116188675, "grad_norm": 0.14096671342849731, "learning_rate": 0.00014289010672050766, "loss": 0.371, "step": 1090 }, { "epoch": 0.9244668557621599, "grad_norm": 0.1579194813966751, "learning_rate": 0.0001423132391116239, "loss": 0.3491, "step": 1100 }, { "epoch": 0.9328710999054523, "grad_norm": 0.16789057850837708, "learning_rate": 0.00014173637150274014, "loss": 0.3536, "step": 1110 }, { "epoch": 0.9412753440487446, "grad_norm": 0.13980717957019806, "learning_rate": 0.00014115950389385638, "loss": 0.3423, "step": 1120 }, { "epoch": 0.949679588192037, "grad_norm": 0.19879643619060516, "learning_rate": 0.00014058263628497262, "loss": 0.3285, "step": 1130 }, { "epoch": 0.9580838323353293, "grad_norm": 0.16574440896511078, "learning_rate": 0.00014000576867608886, "loss": 0.3568, "step": 1140 }, { "epoch": 0.9664880764786217, "grad_norm": 0.15376180410385132, "learning_rate": 0.00013942890106720507, "loss": 0.3558, "step": 1150 }, { "epoch": 0.9748923206219141, "grad_norm": 0.17232170701026917, "learning_rate": 0.0001388520334583213, "loss": 0.342, "step": 1160 }, { "epoch": 0.9832965647652064, "grad_norm": 0.1959993690252304, "learning_rate": 0.00013827516584943755, "loss": 0.3458, "step": 1170 }, { "epoch": 0.9917008089084988, "grad_norm": 0.14029347896575928, "learning_rate": 0.0001376982982405538, "loss": 0.3297, "step": 1180 }, { "epoch": 1.0002101061035824, "grad_norm": 0.20758652687072754, "learning_rate": 0.00013712143063167006, "loss": 0.3642, "step": 1190 }, { "epoch": 1.0086143502468747, "grad_norm": 0.15599438548088074, "learning_rate": 0.0001365445630227863, "loss": 0.3004, "step": 1200 }, { "epoch": 1.017018594390167, "grad_norm": 0.16680683195590973, "learning_rate": 0.0001359676954139025, "loss": 0.2915, "step": 1210 }, { "epoch": 1.0254228385334594, "grad_norm": 0.1668105274438858, "learning_rate": 0.00013539082780501875, "loss": 0.2963, "step": 1220 }, { "epoch": 1.0338270826767517, "grad_norm": 0.16461539268493652, "learning_rate": 0.000134813960196135, "loss": 0.3041, "step": 1230 }, { "epoch": 1.042231326820044, "grad_norm": 0.18869394063949585, "learning_rate": 0.00013423709258725123, "loss": 0.3046, "step": 1240 }, { "epoch": 1.0506355709633364, "grad_norm": 0.16899700462818146, "learning_rate": 0.00013366022497836747, "loss": 0.2921, "step": 1250 }, { "epoch": 1.059039815106629, "grad_norm": 0.1905297338962555, "learning_rate": 0.0001330833573694837, "loss": 0.2879, "step": 1260 }, { "epoch": 1.0674440592499213, "grad_norm": 0.17273731529712677, "learning_rate": 0.00013250648976059995, "loss": 0.3038, "step": 1270 }, { "epoch": 1.0758483033932136, "grad_norm": 0.1947745531797409, "learning_rate": 0.0001319296221517162, "loss": 0.3029, "step": 1280 }, { "epoch": 1.084252547536506, "grad_norm": 0.1741725355386734, "learning_rate": 0.00013135275454283243, "loss": 0.3073, "step": 1290 }, { "epoch": 
1.0926567916797982, "grad_norm": 0.18244194984436035, "learning_rate": 0.00013077588693394867, "loss": 0.287, "step": 1300 }, { "epoch": 1.1010610358230906, "grad_norm": 0.18360966444015503, "learning_rate": 0.0001301990193250649, "loss": 0.307, "step": 1310 }, { "epoch": 1.1094652799663831, "grad_norm": 0.16066686809062958, "learning_rate": 0.00012962215171618115, "loss": 0.2712, "step": 1320 }, { "epoch": 1.1178695241096754, "grad_norm": 0.16239213943481445, "learning_rate": 0.00012904528410729736, "loss": 0.2857, "step": 1330 }, { "epoch": 1.1262737682529678, "grad_norm": 0.16966617107391357, "learning_rate": 0.0001284684164984136, "loss": 0.3087, "step": 1340 }, { "epoch": 1.13467801239626, "grad_norm": 0.16753819584846497, "learning_rate": 0.00012789154888952984, "loss": 0.2852, "step": 1350 }, { "epoch": 1.1430822565395524, "grad_norm": 0.19184084236621857, "learning_rate": 0.0001273146812806461, "loss": 0.3138, "step": 1360 }, { "epoch": 1.1514865006828447, "grad_norm": 0.15949766337871552, "learning_rate": 0.00012673781367176235, "loss": 0.2812, "step": 1370 }, { "epoch": 1.159890744826137, "grad_norm": 0.16187496483325958, "learning_rate": 0.0001261609460628786, "loss": 0.2841, "step": 1380 }, { "epoch": 1.1682949889694296, "grad_norm": 0.1778268665075302, "learning_rate": 0.00012558407845399483, "loss": 0.3181, "step": 1390 }, { "epoch": 1.176699233112722, "grad_norm": 0.17179737985134125, "learning_rate": 0.00012500721084511104, "loss": 0.2904, "step": 1400 }, { "epoch": 1.1851034772560143, "grad_norm": 0.16989010572433472, "learning_rate": 0.00012443034323622728, "loss": 0.2856, "step": 1410 }, { "epoch": 1.1935077213993066, "grad_norm": 0.21040703356266022, "learning_rate": 0.00012385347562734352, "loss": 0.2743, "step": 1420 }, { "epoch": 1.201911965542599, "grad_norm": 0.19255656003952026, "learning_rate": 0.00012327660801845976, "loss": 0.316, "step": 1430 }, { "epoch": 1.2103162096858915, "grad_norm": 0.16303245723247528, "learning_rate": 0.000122699740409576, "loss": 0.2671, "step": 1440 }, { "epoch": 1.2187204538291838, "grad_norm": 0.21385671198368073, "learning_rate": 0.00012212287280069227, "loss": 0.2865, "step": 1450 }, { "epoch": 1.2271246979724761, "grad_norm": 0.18770861625671387, "learning_rate": 0.00012154600519180848, "loss": 0.2795, "step": 1460 }, { "epoch": 1.2355289421157685, "grad_norm": 0.20827870070934296, "learning_rate": 0.00012096913758292472, "loss": 0.2769, "step": 1470 }, { "epoch": 1.2439331862590608, "grad_norm": 0.1704486757516861, "learning_rate": 0.00012039226997404096, "loss": 0.2993, "step": 1480 }, { "epoch": 1.2523374304023531, "grad_norm": 0.21233461797237396, "learning_rate": 0.0001198154023651572, "loss": 0.2912, "step": 1490 }, { "epoch": 1.2607416745456455, "grad_norm": 0.1879620999097824, "learning_rate": 0.00011923853475627344, "loss": 0.2885, "step": 1500 }, { "epoch": 1.2691459186889378, "grad_norm": 0.14288674294948578, "learning_rate": 0.00011866166714738968, "loss": 0.2794, "step": 1510 }, { "epoch": 1.2775501628322303, "grad_norm": 0.1654644012451172, "learning_rate": 0.00011808479953850591, "loss": 0.2762, "step": 1520 }, { "epoch": 1.2859544069755227, "grad_norm": 0.15648572146892548, "learning_rate": 0.00011750793192962215, "loss": 0.2853, "step": 1530 }, { "epoch": 1.294358651118815, "grad_norm": 0.14321617782115936, "learning_rate": 0.00011693106432073839, "loss": 0.2949, "step": 1540 }, { "epoch": 1.3027628952621073, "grad_norm": 0.18823479115962982, "learning_rate": 0.00011635419671185464, "loss": 0.2734, "step": 1550 
}, { "epoch": 1.3111671394053999, "grad_norm": 0.1524640917778015, "learning_rate": 0.00011577732910297088, "loss": 0.2668, "step": 1560 }, { "epoch": 1.3195713835486922, "grad_norm": 0.1731933057308197, "learning_rate": 0.00011520046149408712, "loss": 0.2815, "step": 1570 }, { "epoch": 1.3279756276919845, "grad_norm": 0.19858598709106445, "learning_rate": 0.00011462359388520336, "loss": 0.2863, "step": 1580 }, { "epoch": 1.3363798718352768, "grad_norm": 0.20350554585456848, "learning_rate": 0.00011404672627631959, "loss": 0.2974, "step": 1590 }, { "epoch": 1.3447841159785692, "grad_norm": 0.16735605895519257, "learning_rate": 0.00011346985866743583, "loss": 0.2742, "step": 1600 }, { "epoch": 1.3531883601218615, "grad_norm": 0.18708328902721405, "learning_rate": 0.00011289299105855207, "loss": 0.2877, "step": 1610 }, { "epoch": 1.3615926042651538, "grad_norm": 0.19334456324577332, "learning_rate": 0.00011231612344966831, "loss": 0.2735, "step": 1620 }, { "epoch": 1.3699968484084462, "grad_norm": 0.20367129147052765, "learning_rate": 0.00011173925584078455, "loss": 0.2801, "step": 1630 }, { "epoch": 1.3784010925517387, "grad_norm": 0.18539854884147644, "learning_rate": 0.00011116238823190079, "loss": 0.2842, "step": 1640 }, { "epoch": 1.386805336695031, "grad_norm": 0.2150140106678009, "learning_rate": 0.00011058552062301701, "loss": 0.2611, "step": 1650 }, { "epoch": 1.3952095808383234, "grad_norm": 0.162113755941391, "learning_rate": 0.00011000865301413325, "loss": 0.289, "step": 1660 }, { "epoch": 1.4036138249816157, "grad_norm": 0.18180853128433228, "learning_rate": 0.0001094317854052495, "loss": 0.2808, "step": 1670 }, { "epoch": 1.412018069124908, "grad_norm": 0.17916476726531982, "learning_rate": 0.00010885491779636575, "loss": 0.2912, "step": 1680 }, { "epoch": 1.4204223132682006, "grad_norm": 0.22721944749355316, "learning_rate": 0.00010827805018748199, "loss": 0.2611, "step": 1690 }, { "epoch": 1.428826557411493, "grad_norm": 0.16184848546981812, "learning_rate": 0.00010770118257859823, "loss": 0.2722, "step": 1700 }, { "epoch": 1.4372308015547852, "grad_norm": 0.19588448107242584, "learning_rate": 0.00010712431496971444, "loss": 0.2817, "step": 1710 }, { "epoch": 1.4456350456980775, "grad_norm": 0.1870766133069992, "learning_rate": 0.0001065474473608307, "loss": 0.2835, "step": 1720 }, { "epoch": 1.4540392898413699, "grad_norm": 0.1768248826265335, "learning_rate": 0.00010597057975194693, "loss": 0.2643, "step": 1730 }, { "epoch": 1.4624435339846622, "grad_norm": 0.1726955771446228, "learning_rate": 0.00010539371214306317, "loss": 0.2674, "step": 1740 }, { "epoch": 1.4708477781279545, "grad_norm": 0.1709883064031601, "learning_rate": 0.00010481684453417941, "loss": 0.262, "step": 1750 }, { "epoch": 1.4792520222712469, "grad_norm": 0.2008083164691925, "learning_rate": 0.00010423997692529565, "loss": 0.2634, "step": 1760 }, { "epoch": 1.4876562664145394, "grad_norm": 0.17773209512233734, "learning_rate": 0.0001036631093164119, "loss": 0.2805, "step": 1770 }, { "epoch": 1.4960605105578317, "grad_norm": 0.18000538647174835, "learning_rate": 0.00010308624170752812, "loss": 0.2443, "step": 1780 }, { "epoch": 1.504464754701124, "grad_norm": 0.2176659256219864, "learning_rate": 0.00010250937409864436, "loss": 0.2594, "step": 1790 }, { "epoch": 1.5128689988444164, "grad_norm": 0.15863171219825745, "learning_rate": 0.0001019325064897606, "loss": 0.2751, "step": 1800 }, { "epoch": 1.521273242987709, "grad_norm": 0.19906319677829742, "learning_rate": 0.00010135563888087685, "loss": 0.2865, 
"step": 1810 }, { "epoch": 1.5296774871310013, "grad_norm": 0.21247649192810059, "learning_rate": 0.00010077877127199309, "loss": 0.2892, "step": 1820 }, { "epoch": 1.5380817312742936, "grad_norm": 0.21099700033664703, "learning_rate": 0.00010020190366310933, "loss": 0.3008, "step": 1830 }, { "epoch": 1.546485975417586, "grad_norm": 0.15469135344028473, "learning_rate": 9.962503605422556e-05, "loss": 0.2672, "step": 1840 }, { "epoch": 1.5548902195608783, "grad_norm": 0.16477440297603607, "learning_rate": 9.90481684453418e-05, "loss": 0.2799, "step": 1850 }, { "epoch": 1.5632944637041706, "grad_norm": 0.17361459136009216, "learning_rate": 9.847130083645804e-05, "loss": 0.2756, "step": 1860 }, { "epoch": 1.571698707847463, "grad_norm": 0.15138483047485352, "learning_rate": 9.789443322757428e-05, "loss": 0.2785, "step": 1870 }, { "epoch": 1.5801029519907552, "grad_norm": 0.16653598845005035, "learning_rate": 9.731756561869052e-05, "loss": 0.2814, "step": 1880 }, { "epoch": 1.5885071961340476, "grad_norm": 0.16785801947116852, "learning_rate": 9.674069800980675e-05, "loss": 0.2752, "step": 1890 }, { "epoch": 1.59691144027734, "grad_norm": 0.21643054485321045, "learning_rate": 9.6163830400923e-05, "loss": 0.2623, "step": 1900 }, { "epoch": 1.6053156844206324, "grad_norm": 0.15368995070457458, "learning_rate": 9.558696279203924e-05, "loss": 0.2722, "step": 1910 }, { "epoch": 1.6137199285639248, "grad_norm": 0.21962004899978638, "learning_rate": 9.501009518315547e-05, "loss": 0.2563, "step": 1920 }, { "epoch": 1.622124172707217, "grad_norm": 0.14919191598892212, "learning_rate": 9.44332275742717e-05, "loss": 0.2502, "step": 1930 }, { "epoch": 1.6305284168505096, "grad_norm": 0.2036961317062378, "learning_rate": 9.385635996538795e-05, "loss": 0.2539, "step": 1940 }, { "epoch": 1.638932660993802, "grad_norm": 0.19002236425876617, "learning_rate": 9.327949235650419e-05, "loss": 0.2464, "step": 1950 }, { "epoch": 1.6473369051370943, "grad_norm": 0.16677500307559967, "learning_rate": 9.270262474762043e-05, "loss": 0.2684, "step": 1960 }, { "epoch": 1.6557411492803866, "grad_norm": 0.15206314623355865, "learning_rate": 9.212575713873667e-05, "loss": 0.242, "step": 1970 }, { "epoch": 1.664145393423679, "grad_norm": 0.17641034722328186, "learning_rate": 9.15488895298529e-05, "loss": 0.2604, "step": 1980 }, { "epoch": 1.6725496375669713, "grad_norm": 0.17574937641620636, "learning_rate": 9.097202192096915e-05, "loss": 0.2547, "step": 1990 }, { "epoch": 1.6809538817102636, "grad_norm": 0.16344806551933289, "learning_rate": 9.039515431208539e-05, "loss": 0.2681, "step": 2000 }, { "epoch": 1.689358125853556, "grad_norm": 0.18498322367668152, "learning_rate": 8.981828670320161e-05, "loss": 0.2713, "step": 2010 }, { "epoch": 1.6977623699968483, "grad_norm": 0.14767137169837952, "learning_rate": 8.924141909431785e-05, "loss": 0.2604, "step": 2020 }, { "epoch": 1.7061666141401408, "grad_norm": 0.1902410387992859, "learning_rate": 8.86645514854341e-05, "loss": 0.2516, "step": 2030 }, { "epoch": 1.7145708582834331, "grad_norm": 0.1728687733411789, "learning_rate": 8.808768387655033e-05, "loss": 0.2711, "step": 2040 }, { "epoch": 1.7229751024267255, "grad_norm": 0.1836615651845932, "learning_rate": 8.751081626766657e-05, "loss": 0.2717, "step": 2050 }, { "epoch": 1.731379346570018, "grad_norm": 0.1553170531988144, "learning_rate": 8.693394865878281e-05, "loss": 0.2303, "step": 2060 }, { "epoch": 1.7397835907133103, "grad_norm": 0.1942613571882248, "learning_rate": 8.635708104989905e-05, "loss": 0.2581, "step": 2070 
}, { "epoch": 1.7481878348566027, "grad_norm": 0.1734922230243683, "learning_rate": 8.578021344101529e-05, "loss": 0.259, "step": 2080 }, { "epoch": 1.756592078999895, "grad_norm": 0.1309240758419037, "learning_rate": 8.520334583213153e-05, "loss": 0.2381, "step": 2090 }, { "epoch": 1.7649963231431873, "grad_norm": 0.17716042697429657, "learning_rate": 8.462647822324777e-05, "loss": 0.2413, "step": 2100 }, { "epoch": 1.7734005672864797, "grad_norm": 0.16437722742557526, "learning_rate": 8.404961061436401e-05, "loss": 0.2699, "step": 2110 }, { "epoch": 1.781804811429772, "grad_norm": 0.15865294635295868, "learning_rate": 8.347274300548025e-05, "loss": 0.2515, "step": 2120 }, { "epoch": 1.7902090555730643, "grad_norm": 0.16365793347358704, "learning_rate": 8.289587539659649e-05, "loss": 0.2507, "step": 2130 }, { "epoch": 1.7986132997163566, "grad_norm": 0.19089579582214355, "learning_rate": 8.231900778771272e-05, "loss": 0.2572, "step": 2140 }, { "epoch": 1.807017543859649, "grad_norm": 0.1750141978263855, "learning_rate": 8.174214017882896e-05, "loss": 0.2692, "step": 2150 }, { "epoch": 1.8154217880029415, "grad_norm": 0.14101552963256836, "learning_rate": 8.116527256994521e-05, "loss": 0.2658, "step": 2160 }, { "epoch": 1.8238260321462338, "grad_norm": 0.14396284520626068, "learning_rate": 8.058840496106144e-05, "loss": 0.2556, "step": 2170 }, { "epoch": 1.8322302762895262, "grad_norm": 0.15593650937080383, "learning_rate": 8.001153735217768e-05, "loss": 0.2442, "step": 2180 }, { "epoch": 1.8406345204328187, "grad_norm": 0.18202078342437744, "learning_rate": 7.943466974329392e-05, "loss": 0.2509, "step": 2190 }, { "epoch": 1.849038764576111, "grad_norm": 0.17855936288833618, "learning_rate": 7.885780213441016e-05, "loss": 0.2595, "step": 2200 }, { "epoch": 1.8574430087194034, "grad_norm": 0.16823212802410126, "learning_rate": 7.82809345255264e-05, "loss": 0.2469, "step": 2210 }, { "epoch": 1.8658472528626957, "grad_norm": 0.15248893201351166, "learning_rate": 7.770406691664264e-05, "loss": 0.2603, "step": 2220 }, { "epoch": 1.874251497005988, "grad_norm": 0.16229604184627533, "learning_rate": 7.712719930775886e-05, "loss": 0.2434, "step": 2230 }, { "epoch": 1.8826557411492804, "grad_norm": 0.18594375252723694, "learning_rate": 7.655033169887512e-05, "loss": 0.266, "step": 2240 }, { "epoch": 1.8910599852925727, "grad_norm": 0.18467053771018982, "learning_rate": 7.597346408999136e-05, "loss": 0.2535, "step": 2250 }, { "epoch": 1.899464229435865, "grad_norm": 0.18451227247714996, "learning_rate": 7.539659648110758e-05, "loss": 0.2579, "step": 2260 }, { "epoch": 1.9078684735791573, "grad_norm": 0.15458305180072784, "learning_rate": 7.481972887222382e-05, "loss": 0.2506, "step": 2270 }, { "epoch": 1.91627271772245, "grad_norm": 0.17949137091636658, "learning_rate": 7.424286126334006e-05, "loss": 0.2659, "step": 2280 }, { "epoch": 1.9246769618657422, "grad_norm": 0.1898379623889923, "learning_rate": 7.366599365445632e-05, "loss": 0.2882, "step": 2290 }, { "epoch": 1.9330812060090345, "grad_norm": 0.14720788598060608, "learning_rate": 7.308912604557254e-05, "loss": 0.2367, "step": 2300 }, { "epoch": 1.9414854501523269, "grad_norm": 0.15253467857837677, "learning_rate": 7.251225843668878e-05, "loss": 0.256, "step": 2310 }, { "epoch": 1.9498896942956194, "grad_norm": 0.1564057618379593, "learning_rate": 7.193539082780502e-05, "loss": 0.2536, "step": 2320 }, { "epoch": 1.9582939384389118, "grad_norm": 0.15893864631652832, "learning_rate": 7.135852321892126e-05, "loss": 0.2347, "step": 2330 }, { 
"epoch": 1.966698182582204, "grad_norm": 0.20592626929283142, "learning_rate": 7.07816556100375e-05, "loss": 0.2419, "step": 2340 }, { "epoch": 1.9751024267254964, "grad_norm": 0.20137999951839447, "learning_rate": 7.020478800115374e-05, "loss": 0.2415, "step": 2350 }, { "epoch": 1.9835066708687887, "grad_norm": 0.19287312030792236, "learning_rate": 6.962792039226997e-05, "loss": 0.2484, "step": 2360 }, { "epoch": 1.991910915012081, "grad_norm": 0.1620776355266571, "learning_rate": 6.905105278338622e-05, "loss": 0.2599, "step": 2370 }, { "epoch": 2.000420212207165, "grad_norm": 0.19407401978969574, "learning_rate": 6.847418517450246e-05, "loss": 0.2438, "step": 2380 }, { "epoch": 2.008824456350457, "grad_norm": 0.16983546316623688, "learning_rate": 6.789731756561869e-05, "loss": 0.2069, "step": 2390 }, { "epoch": 2.0172287004937495, "grad_norm": 0.1806386411190033, "learning_rate": 6.732044995673493e-05, "loss": 0.197, "step": 2400 }, { "epoch": 2.025632944637042, "grad_norm": 0.189859077334404, "learning_rate": 6.674358234785117e-05, "loss": 0.2059, "step": 2410 }, { "epoch": 2.034037188780334, "grad_norm": 0.1526906043291092, "learning_rate": 6.616671473896741e-05, "loss": 0.202, "step": 2420 }, { "epoch": 2.0424414329236265, "grad_norm": 0.18099629878997803, "learning_rate": 6.558984713008365e-05, "loss": 0.2047, "step": 2430 }, { "epoch": 2.050845677066919, "grad_norm": 0.1844697743654251, "learning_rate": 6.501297952119989e-05, "loss": 0.2124, "step": 2440 }, { "epoch": 2.059249921210211, "grad_norm": 0.1806306689977646, "learning_rate": 6.443611191231612e-05, "loss": 0.2232, "step": 2450 }, { "epoch": 2.0676541653535034, "grad_norm": 0.23650456964969635, "learning_rate": 6.385924430343237e-05, "loss": 0.2072, "step": 2460 }, { "epoch": 2.0760584094967958, "grad_norm": 0.20731420814990997, "learning_rate": 6.328237669454861e-05, "loss": 0.205, "step": 2470 }, { "epoch": 2.084462653640088, "grad_norm": 0.17086872458457947, "learning_rate": 6.270550908566484e-05, "loss": 0.2072, "step": 2480 }, { "epoch": 2.0928668977833804, "grad_norm": 0.16823840141296387, "learning_rate": 6.212864147678108e-05, "loss": 0.1932, "step": 2490 }, { "epoch": 2.1012711419266727, "grad_norm": 0.15792180597782135, "learning_rate": 6.155177386789732e-05, "loss": 0.216, "step": 2500 }, { "epoch": 2.1096753860699655, "grad_norm": 0.1540854424238205, "learning_rate": 6.097490625901356e-05, "loss": 0.2164, "step": 2510 }, { "epoch": 2.118079630213258, "grad_norm": 0.1553662121295929, "learning_rate": 6.0398038650129795e-05, "loss": 0.2259, "step": 2520 }, { "epoch": 2.12648387435655, "grad_norm": 0.17120260000228882, "learning_rate": 5.9821171041246035e-05, "loss": 0.1974, "step": 2530 }, { "epoch": 2.1348881184998425, "grad_norm": 0.16605545580387115, "learning_rate": 5.924430343236228e-05, "loss": 0.1997, "step": 2540 }, { "epoch": 2.143292362643135, "grad_norm": 0.18276792764663696, "learning_rate": 5.866743582347851e-05, "loss": 0.2224, "step": 2550 }, { "epoch": 2.151696606786427, "grad_norm": 0.18557056784629822, "learning_rate": 5.8090568214594755e-05, "loss": 0.2052, "step": 2560 }, { "epoch": 2.1601008509297195, "grad_norm": 0.1803399622440338, "learning_rate": 5.7513700605710995e-05, "loss": 0.2133, "step": 2570 }, { "epoch": 2.168505095073012, "grad_norm": 0.16556379199028015, "learning_rate": 5.693683299682723e-05, "loss": 0.2063, "step": 2580 }, { "epoch": 2.176909339216304, "grad_norm": 0.1620313972234726, "learning_rate": 5.635996538794347e-05, "loss": 0.2052, "step": 2590 }, { "epoch": 
2.1853135833595965, "grad_norm": 0.2085953652858734, "learning_rate": 5.5783097779059715e-05, "loss": 0.2269, "step": 2600 }, { "epoch": 2.193717827502889, "grad_norm": 0.1796538531780243, "learning_rate": 5.520623017017594e-05, "loss": 0.1968, "step": 2610 }, { "epoch": 2.202122071646181, "grad_norm": 0.17472976446151733, "learning_rate": 5.462936256129219e-05, "loss": 0.2118, "step": 2620 }, { "epoch": 2.2105263157894735, "grad_norm": 0.22609752416610718, "learning_rate": 5.405249495240843e-05, "loss": 0.2122, "step": 2630 }, { "epoch": 2.2189305599327662, "grad_norm": 0.2001248002052307, "learning_rate": 5.347562734352466e-05, "loss": 0.2133, "step": 2640 }, { "epoch": 2.2273348040760585, "grad_norm": 0.17724989354610443, "learning_rate": 5.28987597346409e-05, "loss": 0.2098, "step": 2650 }, { "epoch": 2.235739048219351, "grad_norm": 0.1948956549167633, "learning_rate": 5.232189212575714e-05, "loss": 0.2106, "step": 2660 }, { "epoch": 2.244143292362643, "grad_norm": 0.17181238532066345, "learning_rate": 5.1745024516873374e-05, "loss": 0.2112, "step": 2670 }, { "epoch": 2.2525475365059355, "grad_norm": 0.17248126864433289, "learning_rate": 5.1168156907989614e-05, "loss": 0.2066, "step": 2680 }, { "epoch": 2.260951780649228, "grad_norm": 0.16288314759731293, "learning_rate": 5.059128929910586e-05, "loss": 0.1934, "step": 2690 }, { "epoch": 2.26935602479252, "grad_norm": 0.16555948555469513, "learning_rate": 5.00144216902221e-05, "loss": 0.203, "step": 2700 }, { "epoch": 2.2777602689358125, "grad_norm": 0.1704113483428955, "learning_rate": 4.9437554081338334e-05, "loss": 0.1935, "step": 2710 }, { "epoch": 2.286164513079105, "grad_norm": 0.1625906527042389, "learning_rate": 4.8860686472454574e-05, "loss": 0.2204, "step": 2720 }, { "epoch": 2.294568757222397, "grad_norm": 0.19332656264305115, "learning_rate": 4.8283818863570814e-05, "loss": 0.218, "step": 2730 }, { "epoch": 2.3029730013656895, "grad_norm": 0.1706068366765976, "learning_rate": 4.770695125468705e-05, "loss": 0.2012, "step": 2740 }, { "epoch": 2.3113772455089823, "grad_norm": 0.1740259975194931, "learning_rate": 4.7130083645803294e-05, "loss": 0.2177, "step": 2750 }, { "epoch": 2.319781489652274, "grad_norm": 0.17610688507556915, "learning_rate": 4.655321603691953e-05, "loss": 0.2109, "step": 2760 }, { "epoch": 2.328185733795567, "grad_norm": 0.16697686910629272, "learning_rate": 4.597634842803577e-05, "loss": 0.1963, "step": 2770 }, { "epoch": 2.3365899779388593, "grad_norm": 0.1630171686410904, "learning_rate": 4.539948081915201e-05, "loss": 0.2057, "step": 2780 }, { "epoch": 2.3449942220821516, "grad_norm": 0.17797575891017914, "learning_rate": 4.482261321026825e-05, "loss": 0.187, "step": 2790 }, { "epoch": 2.353398466225444, "grad_norm": 0.15808750689029694, "learning_rate": 4.424574560138449e-05, "loss": 0.2084, "step": 2800 }, { "epoch": 2.3618027103687362, "grad_norm": 0.15329663455486298, "learning_rate": 4.366887799250072e-05, "loss": 0.2135, "step": 2810 }, { "epoch": 2.3702069545120286, "grad_norm": 0.17800621688365936, "learning_rate": 4.309201038361696e-05, "loss": 0.2137, "step": 2820 }, { "epoch": 2.378611198655321, "grad_norm": 0.17937782406806946, "learning_rate": 4.25151427747332e-05, "loss": 0.2128, "step": 2830 }, { "epoch": 2.387015442798613, "grad_norm": 0.19397765398025513, "learning_rate": 4.193827516584944e-05, "loss": 0.2023, "step": 2840 }, { "epoch": 2.3954196869419055, "grad_norm": 0.23468880355358124, "learning_rate": 4.1361407556965673e-05, "loss": 0.2142, "step": 2850 }, { "epoch": 
2.403823931085198, "grad_norm": 0.16991160809993744, "learning_rate": 4.078453994808192e-05, "loss": 0.1922, "step": 2860 }, { "epoch": 2.41222817522849, "grad_norm": 0.1695888191461563, "learning_rate": 4.020767233919815e-05, "loss": 0.2131, "step": 2870 }, { "epoch": 2.420632419371783, "grad_norm": 0.1804046779870987, "learning_rate": 3.96308047303144e-05, "loss": 0.2112, "step": 2880 }, { "epoch": 2.4290366635150753, "grad_norm": 0.16802121698856354, "learning_rate": 3.905393712143063e-05, "loss": 0.225, "step": 2890 }, { "epoch": 2.4374409076583676, "grad_norm": 0.16845154762268066, "learning_rate": 3.847706951254687e-05, "loss": 0.2003, "step": 2900 }, { "epoch": 2.44584515180166, "grad_norm": 0.11345178633928299, "learning_rate": 3.790020190366311e-05, "loss": 0.2049, "step": 2910 }, { "epoch": 2.4542493959449523, "grad_norm": 0.15802818536758423, "learning_rate": 3.7323334294779346e-05, "loss": 0.2205, "step": 2920 }, { "epoch": 2.4626536400882446, "grad_norm": 0.21586033701896667, "learning_rate": 3.6746466685895586e-05, "loss": 0.2203, "step": 2930 }, { "epoch": 2.471057884231537, "grad_norm": 0.19281122088432312, "learning_rate": 3.6169599077011826e-05, "loss": 0.2045, "step": 2940 }, { "epoch": 2.4794621283748293, "grad_norm": 0.223149836063385, "learning_rate": 3.5592731468128066e-05, "loss": 0.2124, "step": 2950 }, { "epoch": 2.4878663725181216, "grad_norm": 0.1529054492712021, "learning_rate": 3.5015863859244306e-05, "loss": 0.1921, "step": 2960 }, { "epoch": 2.496270616661414, "grad_norm": 0.15139955282211304, "learning_rate": 3.4438996250360546e-05, "loss": 0.2168, "step": 2970 }, { "epoch": 2.5046748608047062, "grad_norm": 0.19425025582313538, "learning_rate": 3.386212864147678e-05, "loss": 0.2107, "step": 2980 }, { "epoch": 2.5130791049479986, "grad_norm": 0.17192526161670685, "learning_rate": 3.3285261032593026e-05, "loss": 0.1988, "step": 2990 }, { "epoch": 2.521483349091291, "grad_norm": 0.2090197652578354, "learning_rate": 3.270839342370926e-05, "loss": 0.2096, "step": 3000 }, { "epoch": 2.5298875932345837, "grad_norm": 0.15847593545913696, "learning_rate": 3.21315258148255e-05, "loss": 0.1998, "step": 3010 }, { "epoch": 2.5382918373778756, "grad_norm": 0.19396276772022247, "learning_rate": 3.155465820594174e-05, "loss": 0.2051, "step": 3020 }, { "epoch": 2.5466960815211683, "grad_norm": 0.19245147705078125, "learning_rate": 3.097779059705798e-05, "loss": 0.2055, "step": 3030 }, { "epoch": 2.5551003256644607, "grad_norm": 0.18058356642723083, "learning_rate": 3.0400922988174212e-05, "loss": 0.1981, "step": 3040 }, { "epoch": 2.563504569807753, "grad_norm": 0.1646842509508133, "learning_rate": 2.9824055379290456e-05, "loss": 0.1989, "step": 3050 }, { "epoch": 2.5719088139510453, "grad_norm": 0.15872938930988312, "learning_rate": 2.9247187770406692e-05, "loss": 0.2018, "step": 3060 }, { "epoch": 2.5803130580943376, "grad_norm": 0.1905873417854309, "learning_rate": 2.8670320161522936e-05, "loss": 0.1999, "step": 3070 }, { "epoch": 2.58871730223763, "grad_norm": 0.19063328206539154, "learning_rate": 2.8093452552639172e-05, "loss": 0.2084, "step": 3080 }, { "epoch": 2.5971215463809223, "grad_norm": 0.22434838116168976, "learning_rate": 2.751658494375541e-05, "loss": 0.196, "step": 3090 }, { "epoch": 2.6055257905242146, "grad_norm": 0.20004242658615112, "learning_rate": 2.693971733487165e-05, "loss": 0.2077, "step": 3100 }, { "epoch": 2.613930034667507, "grad_norm": 0.15297284722328186, "learning_rate": 2.636284972598789e-05, "loss": 0.1827, "step": 3110 }, { "epoch": 
2.6223342788107997, "grad_norm": 0.1892947554588318, "learning_rate": 2.5785982117104125e-05, "loss": 0.1991, "step": 3120 }, { "epoch": 2.6307385229540916, "grad_norm": 0.15705682337284088, "learning_rate": 2.5209114508220365e-05, "loss": 0.1918, "step": 3130 }, { "epoch": 2.6391427670973844, "grad_norm": 0.19605481624603271, "learning_rate": 2.4632246899336602e-05, "loss": 0.1988, "step": 3140 }, { "epoch": 2.6475470112406767, "grad_norm": 0.1448083370923996, "learning_rate": 2.4055379290452842e-05, "loss": 0.2002, "step": 3150 }, { "epoch": 2.655951255383969, "grad_norm": 0.17756974697113037, "learning_rate": 2.3478511681569082e-05, "loss": 0.2017, "step": 3160 }, { "epoch": 2.6643554995272614, "grad_norm": 0.18558846414089203, "learning_rate": 2.2901644072685318e-05, "loss": 0.1893, "step": 3170 }, { "epoch": 2.6727597436705537, "grad_norm": 0.15384823083877563, "learning_rate": 2.2324776463801558e-05, "loss": 0.1997, "step": 3180 }, { "epoch": 2.681163987813846, "grad_norm": 0.1683199107646942, "learning_rate": 2.1747908854917798e-05, "loss": 0.1965, "step": 3190 }, { "epoch": 2.6895682319571383, "grad_norm": 0.17967212200164795, "learning_rate": 2.1171041246034038e-05, "loss": 0.191, "step": 3200 }, { "epoch": 2.6979724761004307, "grad_norm": 0.17232255637645721, "learning_rate": 2.0594173637150275e-05, "loss": 0.2099, "step": 3210 }, { "epoch": 2.706376720243723, "grad_norm": 0.13474784791469574, "learning_rate": 2.0017306028266515e-05, "loss": 0.186, "step": 3220 }, { "epoch": 2.7147809643870153, "grad_norm": 0.15341779589653015, "learning_rate": 1.9440438419382755e-05, "loss": 0.2121, "step": 3230 }, { "epoch": 2.7231852085303077, "grad_norm": 0.15999628603458405, "learning_rate": 1.886357081049899e-05, "loss": 0.2033, "step": 3240 }, { "epoch": 2.7315894526736004, "grad_norm": 0.19280874729156494, "learning_rate": 1.828670320161523e-05, "loss": 0.2011, "step": 3250 }, { "epoch": 2.7399936968168923, "grad_norm": 0.19871552288532257, "learning_rate": 1.7709835592731468e-05, "loss": 0.1959, "step": 3260 }, { "epoch": 2.748397940960185, "grad_norm": 0.16481180489063263, "learning_rate": 1.7132967983847708e-05, "loss": 0.1936, "step": 3270 }, { "epoch": 2.7568021851034774, "grad_norm": 0.16249620914459229, "learning_rate": 1.6556100374963944e-05, "loss": 0.2062, "step": 3280 }, { "epoch": 2.7652064292467697, "grad_norm": 0.1833724081516266, "learning_rate": 1.5979232766080184e-05, "loss": 0.206, "step": 3290 }, { "epoch": 2.773610673390062, "grad_norm": 0.17345350980758667, "learning_rate": 1.5402365157196424e-05, "loss": 0.204, "step": 3300 }, { "epoch": 2.7820149175333544, "grad_norm": 0.18603332340717316, "learning_rate": 1.4825497548312664e-05, "loss": 0.2034, "step": 3310 }, { "epoch": 2.7904191616766467, "grad_norm": 0.22087831795215607, "learning_rate": 1.42486299394289e-05, "loss": 0.1862, "step": 3320 }, { "epoch": 2.798823405819939, "grad_norm": 0.16892971098423004, "learning_rate": 1.367176233054514e-05, "loss": 0.2096, "step": 3330 }, { "epoch": 2.8072276499632314, "grad_norm": 0.1363382637500763, "learning_rate": 1.309489472166138e-05, "loss": 0.195, "step": 3340 }, { "epoch": 2.8156318941065237, "grad_norm": 0.17548972368240356, "learning_rate": 1.2518027112777619e-05, "loss": 0.1899, "step": 3350 }, { "epoch": 2.824036138249816, "grad_norm": 0.21062695980072021, "learning_rate": 1.1941159503893857e-05, "loss": 0.1909, "step": 3360 }, { "epoch": 2.8324403823931084, "grad_norm": 0.16137580573558807, "learning_rate": 1.1364291895010095e-05, "loss": 0.2135, "step": 3370 }, 
{ "epoch": 2.840844626536401, "grad_norm": 0.1788797825574875, "learning_rate": 1.0787424286126334e-05, "loss": 0.1963, "step": 3380 }, { "epoch": 2.849248870679693, "grad_norm": 0.1574256718158722, "learning_rate": 1.0210556677242574e-05, "loss": 0.2075, "step": 3390 }, { "epoch": 2.857653114822986, "grad_norm": 0.15146087110042572, "learning_rate": 9.633689068358812e-06, "loss": 0.2027, "step": 3400 }, { "epoch": 2.866057358966278, "grad_norm": 0.14105765521526337, "learning_rate": 9.056821459475052e-06, "loss": 0.1976, "step": 3410 }, { "epoch": 2.8744616031095704, "grad_norm": 0.19882114231586456, "learning_rate": 8.479953850591289e-06, "loss": 0.2077, "step": 3420 }, { "epoch": 2.8828658472528628, "grad_norm": 0.14719258248806, "learning_rate": 7.903086241707528e-06, "loss": 0.1976, "step": 3430 }, { "epoch": 2.891270091396155, "grad_norm": 0.21373219788074493, "learning_rate": 7.326218632823767e-06, "loss": 0.1882, "step": 3440 }, { "epoch": 2.8996743355394474, "grad_norm": 0.1744794398546219, "learning_rate": 6.749351023940007e-06, "loss": 0.1935, "step": 3450 }, { "epoch": 2.9080785796827398, "grad_norm": 0.15825729072093964, "learning_rate": 6.172483415056245e-06, "loss": 0.1994, "step": 3460 }, { "epoch": 2.916482823826032, "grad_norm": 0.1830754429101944, "learning_rate": 5.595615806172483e-06, "loss": 0.1839, "step": 3470 }, { "epoch": 2.9248870679693244, "grad_norm": 0.12078092247247696, "learning_rate": 5.018748197288722e-06, "loss": 0.1952, "step": 3480 }, { "epoch": 2.9332913121126167, "grad_norm": 0.16265451908111572, "learning_rate": 4.4418805884049615e-06, "loss": 0.197, "step": 3490 }, { "epoch": 2.941695556255909, "grad_norm": 0.18411016464233398, "learning_rate": 3.8650129795212e-06, "loss": 0.197, "step": 3500 }, { "epoch": 2.950099800399202, "grad_norm": 0.18464697897434235, "learning_rate": 3.288145370637439e-06, "loss": 0.2013, "step": 3510 }, { "epoch": 2.9585040445424937, "grad_norm": 0.21990764141082764, "learning_rate": 2.711277761753678e-06, "loss": 0.1985, "step": 3520 }, { "epoch": 2.9669082886857865, "grad_norm": 0.163679301738739, "learning_rate": 2.1344101528699166e-06, "loss": 0.1942, "step": 3530 }, { "epoch": 2.975312532829079, "grad_norm": 0.1887105107307434, "learning_rate": 1.5575425439861553e-06, "loss": 0.1936, "step": 3540 }, { "epoch": 2.983716776972371, "grad_norm": 0.2435252070426941, "learning_rate": 9.806749351023942e-07, "loss": 0.1978, "step": 3550 }, { "epoch": 2.9921210211156635, "grad_norm": 0.16136085987091064, "learning_rate": 4.038073262186328e-07, "loss": 0.1951, "step": 3560 } ], "logging_steps": 10, "max_steps": 3567, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.419033588234076e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }