{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004616805170821791, "grad_norm": 15.672144611335952, "learning_rate": 9.216589861751152e-08, "loss": 1.3168, "step": 1 }, { "epoch": 0.0023084025854108957, "grad_norm": 14.680930201512727, "learning_rate": 4.608294930875577e-07, "loss": 1.2513, "step": 5 }, { "epoch": 0.0046168051708217915, "grad_norm": 8.024031806354767, "learning_rate": 9.216589861751154e-07, "loss": 1.1965, "step": 10 }, { "epoch": 0.006925207756232687, "grad_norm": 5.48716759630272, "learning_rate": 1.382488479262673e-06, "loss": 1.0985, "step": 15 }, { "epoch": 0.009233610341643583, "grad_norm": 5.068857847099135, "learning_rate": 1.8433179723502307e-06, "loss": 1.0416, "step": 20 }, { "epoch": 0.011542012927054479, "grad_norm": 5.784416390233076, "learning_rate": 2.3041474654377884e-06, "loss": 1.0414, "step": 25 }, { "epoch": 0.013850415512465374, "grad_norm": 5.088059220204017, "learning_rate": 2.764976958525346e-06, "loss": 1.0977, "step": 30 }, { "epoch": 0.016158818097876268, "grad_norm": 4.831013532516624, "learning_rate": 3.225806451612903e-06, "loss": 1.0864, "step": 35 }, { "epoch": 0.018467220683287166, "grad_norm": 5.01522341202905, "learning_rate": 3.6866359447004615e-06, "loss": 1.0875, "step": 40 }, { "epoch": 0.02077562326869806, "grad_norm": 5.2743430965797495, "learning_rate": 4.147465437788019e-06, "loss": 1.0582, "step": 45 }, { "epoch": 0.023084025854108958, "grad_norm": 4.868797420697276, "learning_rate": 4.608294930875577e-06, "loss": 1.0706, "step": 50 }, { "epoch": 0.025392428439519853, "grad_norm": 4.866881384682284, "learning_rate": 5.0691244239631346e-06, "loss": 1.0874, "step": 55 }, { "epoch": 0.027700831024930747, "grad_norm": 4.942215298185941, "learning_rate": 5.529953917050692e-06, "loss": 1.0765, "step": 60 }, { "epoch": 0.030009233610341645, "grad_norm": 4.510624758005134, "learning_rate": 5.9907834101382485e-06, "loss": 1.0651, "step": 65 }, { "epoch": 0.032317636195752536, "grad_norm": 4.961160516804359, "learning_rate": 6.451612903225806e-06, "loss": 1.0826, "step": 70 }, { "epoch": 0.03462603878116344, "grad_norm": 4.999318871040395, "learning_rate": 6.912442396313365e-06, "loss": 1.0966, "step": 75 }, { "epoch": 0.03693444136657433, "grad_norm": 4.639315170945839, "learning_rate": 7.373271889400923e-06, "loss": 1.0934, "step": 80 }, { "epoch": 0.039242843951985226, "grad_norm": 4.79620290699333, "learning_rate": 7.83410138248848e-06, "loss": 1.0932, "step": 85 }, { "epoch": 0.04155124653739612, "grad_norm": 4.957993386602933, "learning_rate": 8.294930875576038e-06, "loss": 1.1032, "step": 90 }, { "epoch": 0.043859649122807015, "grad_norm": 4.669607524515842, "learning_rate": 8.755760368663595e-06, "loss": 1.0875, "step": 95 }, { "epoch": 0.046168051708217916, "grad_norm": 4.602861109332021, "learning_rate": 9.216589861751153e-06, "loss": 1.0809, "step": 100 }, { "epoch": 0.04847645429362881, "grad_norm": 4.548726172146522, "learning_rate": 9.67741935483871e-06, "loss": 1.1218, "step": 105 }, { "epoch": 0.050784856879039705, "grad_norm": 4.724149571099335, "learning_rate": 1.0138248847926269e-05, "loss": 1.1007, "step": 110 }, { "epoch": 0.0530932594644506, "grad_norm": 5.309010432204349, "learning_rate": 1.0599078341013826e-05, "loss": 1.1368, "step": 115 }, { "epoch": 0.055401662049861494, "grad_norm": 4.839305137997795, "learning_rate": 1.1059907834101385e-05, "loss": 1.1055, "step": 120 }, { "epoch": 0.05771006463527239, "grad_norm": 4.5796294161615565, "learning_rate": 1.152073732718894e-05, "loss": 1.1155, "step": 125 }, { "epoch": 0.06001846722068329, "grad_norm": 4.706959812240538, "learning_rate": 1.1981566820276497e-05, "loss": 1.1387, "step": 130 }, { "epoch": 0.062326869806094184, "grad_norm": 4.492165983938348, "learning_rate": 1.2442396313364056e-05, "loss": 1.1733, "step": 135 }, { "epoch": 0.06463527239150507, "grad_norm": 4.746032213098828, "learning_rate": 1.2903225806451613e-05, "loss": 1.1375, "step": 140 }, { "epoch": 0.06694367497691597, "grad_norm": 4.713817907356248, "learning_rate": 1.3364055299539171e-05, "loss": 1.158, "step": 145 }, { "epoch": 0.06925207756232687, "grad_norm": 4.342905646572964, "learning_rate": 1.382488479262673e-05, "loss": 1.1607, "step": 150 }, { "epoch": 0.07156048014773776, "grad_norm": 4.502102336400582, "learning_rate": 1.4285714285714287e-05, "loss": 1.1382, "step": 155 }, { "epoch": 0.07386888273314866, "grad_norm": 4.300393542300411, "learning_rate": 1.4746543778801846e-05, "loss": 1.1518, "step": 160 }, { "epoch": 0.07617728531855955, "grad_norm": 4.400546990325483, "learning_rate": 1.5207373271889403e-05, "loss": 1.1436, "step": 165 }, { "epoch": 0.07848568790397045, "grad_norm": 4.77590791643038, "learning_rate": 1.566820276497696e-05, "loss": 1.2173, "step": 170 }, { "epoch": 0.08079409048938135, "grad_norm": 4.32969974114785, "learning_rate": 1.6129032258064517e-05, "loss": 1.1654, "step": 175 }, { "epoch": 0.08310249307479224, "grad_norm": 5.285074448558262, "learning_rate": 1.6589861751152075e-05, "loss": 1.2185, "step": 180 }, { "epoch": 0.08541089566020314, "grad_norm": 6.312179413881035, "learning_rate": 1.705069124423963e-05, "loss": 1.2063, "step": 185 }, { "epoch": 0.08771929824561403, "grad_norm": 4.351482667809684, "learning_rate": 1.751152073732719e-05, "loss": 1.1814, "step": 190 }, { "epoch": 0.09002770083102493, "grad_norm": 4.468079454686115, "learning_rate": 1.7972350230414748e-05, "loss": 1.2058, "step": 195 }, { "epoch": 0.09233610341643583, "grad_norm": 5.51425273025908, "learning_rate": 1.8433179723502307e-05, "loss": 1.1646, "step": 200 }, { "epoch": 0.09464450600184672, "grad_norm": 4.661323669253999, "learning_rate": 1.8894009216589862e-05, "loss": 1.1689, "step": 205 }, { "epoch": 0.09695290858725762, "grad_norm": 726.888849011745, "learning_rate": 1.935483870967742e-05, "loss": 1.8785, "step": 210 }, { "epoch": 0.09926131117266851, "grad_norm": 5.867844131835615, "learning_rate": 1.981566820276498e-05, "loss": 1.2661, "step": 215 }, { "epoch": 0.10156971375807941, "grad_norm": 6.2684199277472015, "learning_rate": 1.9999883080288618e-05, "loss": 1.2545, "step": 220 }, { "epoch": 0.1038781163434903, "grad_norm": 5.426004523317811, "learning_rate": 1.999916858084231e-05, "loss": 1.2259, "step": 225 }, { "epoch": 0.1061865189289012, "grad_norm": 4.617593739028291, "learning_rate": 1.999780458369908e-05, "loss": 1.177, "step": 230 }, { "epoch": 0.1084949215143121, "grad_norm": 4.412649452769939, "learning_rate": 1.9995791177457598e-05, "loss": 1.2127, "step": 235 }, { "epoch": 0.11080332409972299, "grad_norm": 4.3422685444059965, "learning_rate": 1.9993128492899012e-05, "loss": 1.2398, "step": 240 }, { "epoch": 0.11311172668513389, "grad_norm": 4.837187367612426, "learning_rate": 1.9989816702978447e-05, "loss": 1.2189, "step": 245 }, { "epoch": 0.11542012927054478, "grad_norm": 4.203624655101154, "learning_rate": 1.998585602281378e-05, "loss": 1.1641, "step": 250 }, { "epoch": 0.11772853185595568, "grad_norm": 4.172078683953242, "learning_rate": 1.9981246709671668e-05, "loss": 1.217, "step": 255 }, { "epoch": 0.12003693444136658, "grad_norm": 4.445815868978626, "learning_rate": 1.9975989062950828e-05, "loss": 1.2198, "step": 260 }, { "epoch": 0.12234533702677747, "grad_norm": 4.5591861583880045, "learning_rate": 1.9970083424162598e-05, "loss": 1.2971, "step": 265 }, { "epoch": 0.12465373961218837, "grad_norm": 8.794456155689286, "learning_rate": 1.9963530176908752e-05, "loss": 1.2543, "step": 270 }, { "epoch": 0.12696214219759927, "grad_norm": 4.296337355363852, "learning_rate": 1.9956329746856583e-05, "loss": 1.1902, "step": 275 }, { "epoch": 0.12927054478301014, "grad_norm": 4.210037276606183, "learning_rate": 1.9948482601711245e-05, "loss": 1.2119, "step": 280 }, { "epoch": 0.13157894736842105, "grad_norm": 4.525819047133829, "learning_rate": 1.9939989251185386e-05, "loss": 1.2267, "step": 285 }, { "epoch": 0.13388734995383195, "grad_norm": 4.617114491344834, "learning_rate": 1.993085024696604e-05, "loss": 1.253, "step": 290 }, { "epoch": 0.13619575253924285, "grad_norm": 4.306884827311129, "learning_rate": 1.992106618267878e-05, "loss": 1.2968, "step": 295 }, { "epoch": 0.13850415512465375, "grad_norm": 4.06156778374873, "learning_rate": 1.9910637693849166e-05, "loss": 1.2523, "step": 300 }, { "epoch": 0.14081255771006462, "grad_norm": 4.1725194301873225, "learning_rate": 1.9899565457861463e-05, "loss": 1.2465, "step": 305 }, { "epoch": 0.14312096029547552, "grad_norm": 6.1582094432239005, "learning_rate": 1.988785019391465e-05, "loss": 1.2893, "step": 310 }, { "epoch": 0.14542936288088643, "grad_norm": 4.468419480755536, "learning_rate": 1.987549266297568e-05, "loss": 1.2684, "step": 315 }, { "epoch": 0.14773776546629733, "grad_norm": 4.663314719431853, "learning_rate": 1.986249366773009e-05, "loss": 1.2472, "step": 320 }, { "epoch": 0.15004616805170823, "grad_norm": 4.557295583444763, "learning_rate": 1.9848854052529822e-05, "loss": 1.2856, "step": 325 }, { "epoch": 0.1523545706371191, "grad_norm": 4.128322557091226, "learning_rate": 1.9834574703338406e-05, "loss": 1.2717, "step": 330 }, { "epoch": 0.15466297322253, "grad_norm": 4.265562249971871, "learning_rate": 1.9819656547673393e-05, "loss": 1.2614, "step": 335 }, { "epoch": 0.1569713758079409, "grad_norm": 4.189648461283852, "learning_rate": 1.9804100554546127e-05, "loss": 1.2221, "step": 340 }, { "epoch": 0.1592797783933518, "grad_norm": 4.753781028146877, "learning_rate": 1.9787907734398785e-05, "loss": 1.2641, "step": 345 }, { "epoch": 0.1615881809787627, "grad_norm": 4.624571477954883, "learning_rate": 1.9771079139038765e-05, "loss": 1.3082, "step": 350 }, { "epoch": 0.16389658356417358, "grad_norm": 4.51192227446534, "learning_rate": 1.9753615861570338e-05, "loss": 1.3116, "step": 355 }, { "epoch": 0.16620498614958448, "grad_norm": 4.392313959090253, "learning_rate": 1.9735519036323656e-05, "loss": 1.2304, "step": 360 }, { "epoch": 0.16851338873499538, "grad_norm": 4.979240339208881, "learning_rate": 1.9716789838781095e-05, "loss": 1.2682, "step": 365 }, { "epoch": 0.17082179132040629, "grad_norm": 4.96937836441046, "learning_rate": 1.9697429485500862e-05, "loss": 1.3054, "step": 370 }, { "epoch": 0.1731301939058172, "grad_norm": 3.935739346153204, "learning_rate": 1.9677439234038004e-05, "loss": 1.2704, "step": 375 }, { "epoch": 0.17543859649122806, "grad_norm": 4.366123456450803, "learning_rate": 1.96568203828627e-05, "loss": 1.236, "step": 380 }, { "epoch": 0.17774699907663896, "grad_norm": 4.003638705307624, "learning_rate": 1.963557427127594e-05, "loss": 1.2134, "step": 385 }, { "epoch": 0.18005540166204986, "grad_norm": 4.711836278485082, "learning_rate": 1.9613702279322518e-05, "loss": 1.2424, "step": 390 }, { "epoch": 0.18236380424746076, "grad_norm": 4.7756346414851345, "learning_rate": 1.95912058277014e-05, "loss": 1.2513, "step": 395 }, { "epoch": 0.18467220683287167, "grad_norm": 4.055556447653374, "learning_rate": 1.9568086377673422e-05, "loss": 1.2305, "step": 400 }, { "epoch": 0.18698060941828254, "grad_norm": 3.9870929086001605, "learning_rate": 1.9544345430966398e-05, "loss": 1.2766, "step": 405 }, { "epoch": 0.18928901200369344, "grad_norm": 4.3683569271591525, "learning_rate": 1.951998452967756e-05, "loss": 1.2701, "step": 410 }, { "epoch": 0.19159741458910434, "grad_norm": 4.282177503327308, "learning_rate": 1.9495005256173398e-05, "loss": 1.2173, "step": 415 }, { "epoch": 0.19390581717451524, "grad_norm": 4.122228465513596, "learning_rate": 1.9469409232986876e-05, "loss": 1.293, "step": 420 }, { "epoch": 0.19621421975992612, "grad_norm": 4.391730062186428, "learning_rate": 1.9443198122712036e-05, "loss": 1.3013, "step": 425 }, { "epoch": 0.19852262234533702, "grad_norm": 4.2533205751093, "learning_rate": 1.9416373627896002e-05, "loss": 1.2478, "step": 430 }, { "epoch": 0.20083102493074792, "grad_norm": 4.982151398275928, "learning_rate": 1.9388937490928402e-05, "loss": 1.289, "step": 435 }, { "epoch": 0.20313942751615882, "grad_norm": 4.254393940238592, "learning_rate": 1.9360891493928186e-05, "loss": 1.2773, "step": 440 }, { "epoch": 0.20544783010156972, "grad_norm": 4.812233488623846, "learning_rate": 1.933223745862786e-05, "loss": 1.2571, "step": 445 }, { "epoch": 0.2077562326869806, "grad_norm": 4.193819364681046, "learning_rate": 1.930297724625516e-05, "loss": 1.3167, "step": 450 }, { "epoch": 0.2100646352723915, "grad_norm": 4.318967687699199, "learning_rate": 1.9273112757412165e-05, "loss": 1.2578, "step": 455 }, { "epoch": 0.2123730378578024, "grad_norm": 4.021438837096732, "learning_rate": 1.9242645931951833e-05, "loss": 1.2703, "step": 460 }, { "epoch": 0.2146814404432133, "grad_norm": 3.9988355301981344, "learning_rate": 1.921157874885199e-05, "loss": 1.2702, "step": 465 }, { "epoch": 0.2169898430286242, "grad_norm": 3.866018897785007, "learning_rate": 1.91799132260868e-05, "loss": 1.2651, "step": 470 }, { "epoch": 0.21929824561403508, "grad_norm": 4.228145732575894, "learning_rate": 1.9147651420495696e-05, "loss": 1.2429, "step": 475 }, { "epoch": 0.22160664819944598, "grad_norm": 4.16044625111994, "learning_rate": 1.9114795427649735e-05, "loss": 1.2263, "step": 480 }, { "epoch": 0.22391505078485688, "grad_norm": 3.7071606709047678, "learning_rate": 1.9081347381715535e-05, "loss": 1.2592, "step": 485 }, { "epoch": 0.22622345337026778, "grad_norm": 4.093983584879632, "learning_rate": 1.904730945531661e-05, "loss": 1.2819, "step": 490 }, { "epoch": 0.22853185595567868, "grad_norm": 4.247421291613911, "learning_rate": 1.901268385939226e-05, "loss": 1.3118, "step": 495 }, { "epoch": 0.23084025854108955, "grad_norm": 4.088704419142061, "learning_rate": 1.8977472843053962e-05, "loss": 1.2529, "step": 500 }, { "epoch": 0.23314866112650046, "grad_norm": 3.9526614218286698, "learning_rate": 1.8941678693439272e-05, "loss": 1.2254, "step": 505 }, { "epoch": 0.23545706371191136, "grad_norm": 3.767319095108075, "learning_rate": 1.8905303735563274e-05, "loss": 1.2705, "step": 510 }, { "epoch": 0.23776546629732226, "grad_norm": 4.1464464034097, "learning_rate": 1.886835033216755e-05, "loss": 1.2841, "step": 515 }, { "epoch": 0.24007386888273316, "grad_norm": 4.154511161776497, "learning_rate": 1.88308208835667e-05, "loss": 1.2715, "step": 520 }, { "epoch": 0.24238227146814403, "grad_norm": 4.815166096996458, "learning_rate": 1.8792717827492446e-05, "loss": 1.3034, "step": 525 }, { "epoch": 0.24469067405355494, "grad_norm": 22.245546847367528, "learning_rate": 1.8754043638935283e-05, "loss": 1.2532, "step": 530 }, { "epoch": 0.24699907663896584, "grad_norm": 4.177323522295811, "learning_rate": 1.871480082998371e-05, "loss": 1.2501, "step": 535 }, { "epoch": 0.24930747922437674, "grad_norm": 3.9426463777773346, "learning_rate": 1.867499194966106e-05, "loss": 1.2683, "step": 540 }, { "epoch": 0.2516158818097876, "grad_norm": 3.912690873331932, "learning_rate": 1.8634619583759933e-05, "loss": 1.2874, "step": 545 }, { "epoch": 0.25392428439519854, "grad_norm": 3.972529239438344, "learning_rate": 1.8593686354674223e-05, "loss": 1.2698, "step": 550 }, { "epoch": 0.2562326869806094, "grad_norm": 3.958572886167977, "learning_rate": 1.8552194921228793e-05, "loss": 1.2293, "step": 555 }, { "epoch": 0.2585410895660203, "grad_norm": 3.7553829117034767, "learning_rate": 1.851014797850676e-05, "loss": 1.2818, "step": 560 }, { "epoch": 0.2608494921514312, "grad_norm": 4.352268879736511, "learning_rate": 1.8467548257674453e-05, "loss": 1.2552, "step": 565 }, { "epoch": 0.2631578947368421, "grad_norm": 5.014139215739045, "learning_rate": 1.8424398525803983e-05, "loss": 1.2228, "step": 570 }, { "epoch": 0.265466297322253, "grad_norm": 4.192590762422093, "learning_rate": 1.8380701585693526e-05, "loss": 1.2526, "step": 575 }, { "epoch": 0.2677746999076639, "grad_norm": 4.209340122955672, "learning_rate": 1.8336460275685267e-05, "loss": 1.2681, "step": 580 }, { "epoch": 0.27008310249307477, "grad_norm": 3.801129619164067, "learning_rate": 1.8291677469481025e-05, "loss": 1.2623, "step": 585 }, { "epoch": 0.2723915050784857, "grad_norm": 5.60448449703679, "learning_rate": 1.8246356075955594e-05, "loss": 1.2778, "step": 590 }, { "epoch": 0.27469990766389657, "grad_norm": 3.8415685450636143, "learning_rate": 1.820049903896782e-05, "loss": 1.2546, "step": 595 }, { "epoch": 0.2770083102493075, "grad_norm": 3.766423848242755, "learning_rate": 1.8154109337169326e-05, "loss": 1.2994, "step": 600 }, { "epoch": 0.2793167128347184, "grad_norm": 3.8445299977202363, "learning_rate": 1.8107189983811094e-05, "loss": 1.2779, "step": 605 }, { "epoch": 0.28162511542012925, "grad_norm": 4.20182793655244, "learning_rate": 1.8059744026547713e-05, "loss": 1.2794, "step": 610 }, { "epoch": 0.2839335180055402, "grad_norm": 3.6927184982852554, "learning_rate": 1.8011774547239403e-05, "loss": 1.2217, "step": 615 }, { "epoch": 0.28624192059095105, "grad_norm": 3.906241578603264, "learning_rate": 1.796328466175186e-05, "loss": 1.3162, "step": 620 }, { "epoch": 0.288550323176362, "grad_norm": 3.7221850266429675, "learning_rate": 1.791427751975385e-05, "loss": 1.2591, "step": 625 }, { "epoch": 0.29085872576177285, "grad_norm": 4.11815775927983, "learning_rate": 1.786475630451262e-05, "loss": 1.2572, "step": 630 }, { "epoch": 0.2931671283471837, "grad_norm": 3.8995508626898454, "learning_rate": 1.781472423268713e-05, "loss": 1.2604, "step": 635 }, { "epoch": 0.29547553093259465, "grad_norm": 4.5219499712986035, "learning_rate": 1.776418455411913e-05, "loss": 1.298, "step": 640 }, { "epoch": 0.29778393351800553, "grad_norm": 4.5899598168207785, "learning_rate": 1.7713140551622032e-05, "loss": 1.2664, "step": 645 }, { "epoch": 0.30009233610341646, "grad_norm": 4.641570078800192, "learning_rate": 1.7661595540767714e-05, "loss": 1.2689, "step": 650 }, { "epoch": 0.30240073868882733, "grad_norm": 4.383087991217795, "learning_rate": 1.7609552869671126e-05, "loss": 1.2551, "step": 655 }, { "epoch": 0.3047091412742382, "grad_norm": 3.9687899547292576, "learning_rate": 1.7557015918772822e-05, "loss": 1.2379, "step": 660 }, { "epoch": 0.30701754385964913, "grad_norm": 4.133840300932013, "learning_rate": 1.750398810061939e-05, "loss": 1.2779, "step": 665 }, { "epoch": 0.30932594644506, "grad_norm": 3.84778329275165, "learning_rate": 1.745047285964179e-05, "loss": 1.2306, "step": 670 }, { "epoch": 0.31163434903047094, "grad_norm": 4.054603771464119, "learning_rate": 1.7396473671931597e-05, "loss": 1.2089, "step": 675 }, { "epoch": 0.3139427516158818, "grad_norm": 4.013882196193361, "learning_rate": 1.7341994045015245e-05, "loss": 1.2225, "step": 680 }, { "epoch": 0.3162511542012927, "grad_norm": 4.076399340438248, "learning_rate": 1.7287037517626174e-05, "loss": 1.3166, "step": 685 }, { "epoch": 0.3185595567867036, "grad_norm": 3.991144267549364, "learning_rate": 1.7231607659474972e-05, "loss": 1.2706, "step": 690 }, { "epoch": 0.3208679593721145, "grad_norm": 3.592102167186549, "learning_rate": 1.7175708071017503e-05, "loss": 1.2066, "step": 695 }, { "epoch": 0.3231763619575254, "grad_norm": 4.2490266329322655, "learning_rate": 1.7119342383221055e-05, "loss": 1.3011, "step": 700 }, { "epoch": 0.3254847645429363, "grad_norm": 3.7487591296204266, "learning_rate": 1.7062514257328474e-05, "loss": 1.2587, "step": 705 }, { "epoch": 0.32779316712834716, "grad_norm": 3.6111287365523466, "learning_rate": 1.7005227384620336e-05, "loss": 1.2626, "step": 710 }, { "epoch": 0.3301015697137581, "grad_norm": 3.8624035554609892, "learning_rate": 1.6947485486175223e-05, "loss": 1.266, "step": 715 }, { "epoch": 0.33240997229916897, "grad_norm": 4.191574332500623, "learning_rate": 1.688929231262797e-05, "loss": 1.2275, "step": 720 }, { "epoch": 0.3347183748845799, "grad_norm": 3.931766819485826, "learning_rate": 1.683065164392606e-05, "loss": 1.2525, "step": 725 }, { "epoch": 0.33702677746999077, "grad_norm": 3.8224846577065685, "learning_rate": 1.6771567289084122e-05, "loss": 1.228, "step": 730 }, { "epoch": 0.33933518005540164, "grad_norm": 3.7975499971303024, "learning_rate": 1.6712043085936473e-05, "loss": 1.2121, "step": 735 }, { "epoch": 0.34164358264081257, "grad_norm": 3.7233983105114326, "learning_rate": 1.6652082900887858e-05, "loss": 1.2439, "step": 740 }, { "epoch": 0.34395198522622344, "grad_norm": 4.0496534376278674, "learning_rate": 1.6591690628662305e-05, "loss": 1.3064, "step": 745 }, { "epoch": 0.3462603878116344, "grad_norm": 4.397682055950332, "learning_rate": 1.6530870192050134e-05, "loss": 1.2433, "step": 750 }, { "epoch": 0.34856879039704525, "grad_norm": 3.999160650641557, "learning_rate": 1.6469625541653152e-05, "loss": 1.2117, "step": 755 }, { "epoch": 0.3508771929824561, "grad_norm": 4.475385002364299, "learning_rate": 1.6407960655628055e-05, "loss": 1.203, "step": 760 }, { "epoch": 0.35318559556786705, "grad_norm": 3.5042875341184416, "learning_rate": 1.6345879539428e-05, "loss": 1.2567, "step": 765 }, { "epoch": 0.3554939981532779, "grad_norm": 3.678612416780679, "learning_rate": 1.6283386225542467e-05, "loss": 1.2276, "step": 770 }, { "epoch": 0.35780240073868885, "grad_norm": 5.063348081613382, "learning_rate": 1.622048477323529e-05, "loss": 1.2297, "step": 775 }, { "epoch": 0.3601108033240997, "grad_norm": 4.04397764374825, "learning_rate": 1.6157179268281007e-05, "loss": 1.2498, "step": 780 }, { "epoch": 0.3624192059095106, "grad_norm": 3.7786600086660553, "learning_rate": 1.6093473822699467e-05, "loss": 1.2156, "step": 785 }, { "epoch": 0.36472760849492153, "grad_norm": 3.726670143436363, "learning_rate": 1.6029372574488732e-05, "loss": 1.248, "step": 790 }, { "epoch": 0.3670360110803324, "grad_norm": 3.6023664901819115, "learning_rate": 1.5964879687356286e-05, "loss": 1.2762, "step": 795 }, { "epoch": 0.36934441366574333, "grad_norm": 3.684618843127009, "learning_rate": 1.589999935044859e-05, "loss": 1.2269, "step": 800 }, { "epoch": 0.3716528162511542, "grad_norm": 3.6119834291134465, "learning_rate": 1.5834735778078968e-05, "loss": 1.2078, "step": 805 }, { "epoch": 0.3739612188365651, "grad_norm": 3.66332363718426, "learning_rate": 1.5769093209453876e-05, "loss": 1.2713, "step": 810 }, { "epoch": 0.376269621421976, "grad_norm": 4.137676249046753, "learning_rate": 1.5703075908397523e-05, "loss": 1.2816, "step": 815 }, { "epoch": 0.3785780240073869, "grad_norm": 3.8481468093108475, "learning_rate": 1.563668816307494e-05, "loss": 1.2203, "step": 820 }, { "epoch": 0.3808864265927978, "grad_norm": 3.7158307301305156, "learning_rate": 1.556993428571342e-05, "loss": 1.2163, "step": 825 }, { "epoch": 0.3831948291782087, "grad_norm": 3.851222452502614, "learning_rate": 1.550281861232243e-05, "loss": 1.243, "step": 830 }, { "epoch": 0.38550323176361956, "grad_norm": 3.6817891692377978, "learning_rate": 1.5435345502411956e-05, "loss": 1.2821, "step": 835 }, { "epoch": 0.3878116343490305, "grad_norm": 3.9683025462284998, "learning_rate": 1.536751933870934e-05, "loss": 1.2019, "step": 840 }, { "epoch": 0.39012003693444136, "grad_norm": 3.94265762295689, "learning_rate": 1.5299344526874576e-05, "loss": 1.2774, "step": 845 }, { "epoch": 0.39242843951985223, "grad_norm": 4.123641725136207, "learning_rate": 1.5230825495214184e-05, "loss": 1.2352, "step": 850 }, { "epoch": 0.39473684210526316, "grad_norm": 3.9570109790957653, "learning_rate": 1.5161966694393516e-05, "loss": 1.215, "step": 855 }, { "epoch": 0.39704524469067404, "grad_norm": 3.6427091867450714, "learning_rate": 1.5092772597147707e-05, "loss": 1.2202, "step": 860 }, { "epoch": 0.39935364727608497, "grad_norm": 3.8425754107191796, "learning_rate": 1.5023247697991114e-05, "loss": 1.2432, "step": 865 }, { "epoch": 0.40166204986149584, "grad_norm": 3.759319372367797, "learning_rate": 1.4953396512925398e-05, "loss": 1.1838, "step": 870 }, { "epoch": 0.4039704524469067, "grad_norm": 3.872324982369786, "learning_rate": 1.4883223579146167e-05, "loss": 1.2331, "step": 875 }, { "epoch": 0.40627885503231764, "grad_norm": 3.8616658245003435, "learning_rate": 1.4812733454748283e-05, "loss": 1.2277, "step": 880 }, { "epoch": 0.4085872576177285, "grad_norm": 3.5624714154298163, "learning_rate": 1.4741930718429772e-05, "loss": 1.2051, "step": 885 }, { "epoch": 0.41089566020313945, "grad_norm": 3.6961173549363924, "learning_rate": 1.4670819969194416e-05, "loss": 1.2309, "step": 890 }, { "epoch": 0.4132040627885503, "grad_norm": 3.5654510220296847, "learning_rate": 1.4599405826053039e-05, "loss": 1.1884, "step": 895 }, { "epoch": 0.4155124653739612, "grad_norm": 4.205884899208378, "learning_rate": 1.4527692927723465e-05, "loss": 1.2223, "step": 900 }, { "epoch": 0.4178208679593721, "grad_norm": 3.9431786244545997, "learning_rate": 1.4455685932329204e-05, "loss": 1.2389, "step": 905 }, { "epoch": 0.420129270544783, "grad_norm": 3.579703652121505, "learning_rate": 1.4383389517096899e-05, "loss": 1.2429, "step": 910 }, { "epoch": 0.4224376731301939, "grad_norm": 3.7807582830713105, "learning_rate": 1.4310808378052506e-05, "loss": 1.1874, "step": 915 }, { "epoch": 0.4247460757156048, "grad_norm": 3.9020463886513914, "learning_rate": 1.4237947229716262e-05, "loss": 1.2587, "step": 920 }, { "epoch": 0.42705447830101567, "grad_norm": 3.7663448915088633, "learning_rate": 1.4164810804796464e-05, "loss": 1.184, "step": 925 }, { "epoch": 0.4293628808864266, "grad_norm": 3.7907471270783937, "learning_rate": 1.409140385388203e-05, "loss": 1.2445, "step": 930 }, { "epoch": 0.4316712834718375, "grad_norm": 3.791543245723202, "learning_rate": 1.4017731145133955e-05, "loss": 1.2527, "step": 935 }, { "epoch": 0.4339796860572484, "grad_norm": 3.8566751713668666, "learning_rate": 1.3943797463975575e-05, "loss": 1.2048, "step": 940 }, { "epoch": 0.4362880886426593, "grad_norm": 3.943257567360323, "learning_rate": 1.3869607612781733e-05, "loss": 1.2773, "step": 945 }, { "epoch": 0.43859649122807015, "grad_norm": 3.53206021655625, "learning_rate": 1.3795166410566834e-05, "loss": 1.2066, "step": 950 }, { "epoch": 0.4409048938134811, "grad_norm": 3.8322607840339504, "learning_rate": 1.372047869267184e-05, "loss": 1.2104, "step": 955 }, { "epoch": 0.44321329639889195, "grad_norm": 4.982802180271467, "learning_rate": 1.364554931045018e-05, "loss": 1.2782, "step": 960 }, { "epoch": 0.4455216989843029, "grad_norm": 4.121927772157904, "learning_rate": 1.3570383130952627e-05, "loss": 1.2221, "step": 965 }, { "epoch": 0.44783010156971376, "grad_norm": 3.5401426054616674, "learning_rate": 1.349498503661116e-05, "loss": 1.249, "step": 970 }, { "epoch": 0.45013850415512463, "grad_norm": 3.8347876039826647, "learning_rate": 1.3419359924921833e-05, "loss": 1.2736, "step": 975 }, { "epoch": 0.45244690674053556, "grad_norm": 4.86416192250325, "learning_rate": 1.3343512708126642e-05, "loss": 1.2032, "step": 980 }, { "epoch": 0.45475530932594643, "grad_norm": 3.8508803970513004, "learning_rate": 1.326744831289447e-05, "loss": 1.2465, "step": 985 }, { "epoch": 0.45706371191135736, "grad_norm": 3.276661833625774, "learning_rate": 1.3191171680001048e-05, "loss": 1.1905, "step": 990 }, { "epoch": 0.45937211449676824, "grad_norm": 3.6488550777243933, "learning_rate": 1.3114687764008048e-05, "loss": 1.1991, "step": 995 }, { "epoch": 0.4616805170821791, "grad_norm": 3.9637997706000223, "learning_rate": 1.3038001532941249e-05, "loss": 1.1994, "step": 1000 }, { "epoch": 0.46398891966759004, "grad_norm": 3.7798295608326447, "learning_rate": 1.2961117967967844e-05, "loss": 1.2327, "step": 1005 }, { "epoch": 0.4662973222530009, "grad_norm": 3.742363753899004, "learning_rate": 1.2884042063072881e-05, "loss": 1.2415, "step": 1010 }, { "epoch": 0.46860572483841184, "grad_norm": 4.00995610689072, "learning_rate": 1.280677882473488e-05, "loss": 1.2449, "step": 1015 }, { "epoch": 0.4709141274238227, "grad_norm": 3.7802768150285284, "learning_rate": 1.272933327160063e-05, "loss": 1.2055, "step": 1020 }, { "epoch": 0.4732225300092336, "grad_norm": 3.979719082398227, "learning_rate": 1.2651710434159223e-05, "loss": 1.1452, "step": 1025 }, { "epoch": 0.4755309325946445, "grad_norm": 3.7987734509998012, "learning_rate": 1.2573915354415274e-05, "loss": 1.2266, "step": 1030 }, { "epoch": 0.4778393351800554, "grad_norm": 3.4449265105850344, "learning_rate": 1.2495953085561426e-05, "loss": 1.1678, "step": 1035 }, { "epoch": 0.4801477377654663, "grad_norm": 4.703831538180476, "learning_rate": 1.241782869165012e-05, "loss": 1.1893, "step": 1040 }, { "epoch": 0.4824561403508772, "grad_norm": 3.56138065098868, "learning_rate": 1.2339547247264658e-05, "loss": 1.2285, "step": 1045 }, { "epoch": 0.48476454293628807, "grad_norm": 3.8664090630676147, "learning_rate": 1.2261113837189587e-05, "loss": 1.1995, "step": 1050 }, { "epoch": 0.487072945521699, "grad_norm": 3.6587622685467553, "learning_rate": 1.2182533556080402e-05, "loss": 1.2456, "step": 1055 }, { "epoch": 0.48938134810710987, "grad_norm": 3.4219623018934615, "learning_rate": 1.2103811508132642e-05, "loss": 1.1904, "step": 1060 }, { "epoch": 0.4916897506925208, "grad_norm": 3.91141223990254, "learning_rate": 1.2024952806750321e-05, "loss": 1.1811, "step": 1065 }, { "epoch": 0.4939981532779317, "grad_norm": 3.707066130468398, "learning_rate": 1.1945962574213814e-05, "loss": 1.212, "step": 1070 }, { "epoch": 0.49630655586334255, "grad_norm": 3.5782501836947653, "learning_rate": 1.1866845941347118e-05, "loss": 1.2255, "step": 1075 }, { "epoch": 0.4986149584487535, "grad_norm": 4.303350644777213, "learning_rate": 1.1787608047184583e-05, "loss": 1.1376, "step": 1080 }, { "epoch": 0.5009233610341643, "grad_norm": 3.419543860379626, "learning_rate": 1.1708254038637115e-05, "loss": 1.1872, "step": 1085 }, { "epoch": 0.5032317636195752, "grad_norm": 3.586294780528409, "learning_rate": 1.1628789070157836e-05, "loss": 1.2114, "step": 1090 }, { "epoch": 0.5055401662049861, "grad_norm": 3.6647616517214496, "learning_rate": 1.1549218303407305e-05, "loss": 1.2088, "step": 1095 }, { "epoch": 0.5078485687903971, "grad_norm": 3.6209405687157794, "learning_rate": 1.1469546906918219e-05, "loss": 1.1535, "step": 1100 }, { "epoch": 0.510156971375808, "grad_norm": 3.4760951984933777, "learning_rate": 1.1389780055759689e-05, "loss": 1.1692, "step": 1105 }, { "epoch": 0.5124653739612188, "grad_norm": 3.523587148397925, "learning_rate": 1.1309922931201114e-05, "loss": 1.1795, "step": 1110 }, { "epoch": 0.5147737765466297, "grad_norm": 3.399747435026194, "learning_rate": 1.1229980720375609e-05, "loss": 1.1913, "step": 1115 }, { "epoch": 0.5170821791320406, "grad_norm": 3.802970464768176, "learning_rate": 1.114995861594308e-05, "loss": 1.1692, "step": 1120 }, { "epoch": 0.5193905817174516, "grad_norm": 3.571347595436078, "learning_rate": 1.1069861815752944e-05, "loss": 1.1575, "step": 1125 }, { "epoch": 0.5216989843028624, "grad_norm": 3.702241350827994, "learning_rate": 1.0989695522506486e-05, "loss": 1.1776, "step": 1130 }, { "epoch": 0.5240073868882733, "grad_norm": 4.396145181294285, "learning_rate": 1.0909464943418926e-05, "loss": 1.2055, "step": 1135 }, { "epoch": 0.5263157894736842, "grad_norm": 3.402649511273165, "learning_rate": 1.0829175289881188e-05, "loss": 1.2024, "step": 1140 }, { "epoch": 0.528624192059095, "grad_norm": 3.321901777095843, "learning_rate": 1.074883177712138e-05, "loss": 1.1317, "step": 1145 }, { "epoch": 0.530932594644506, "grad_norm": 4.575011114858196, "learning_rate": 1.0668439623866043e-05, "loss": 1.1516, "step": 1150 }, { "epoch": 0.5332409972299169, "grad_norm": 3.428811319179132, "learning_rate": 1.0588004052001177e-05, "loss": 1.1326, "step": 1155 }, { "epoch": 0.5355493998153278, "grad_norm": 3.758823500740248, "learning_rate": 1.0507530286233042e-05, "loss": 1.1523, "step": 1160 }, { "epoch": 0.5378578024007387, "grad_norm": 3.828420445656179, "learning_rate": 1.0427023553748792e-05, "loss": 1.215, "step": 1165 }, { "epoch": 0.5401662049861495, "grad_norm": 3.872474623427253, "learning_rate": 1.0346489083876928e-05, "loss": 1.1798, "step": 1170 }, { "epoch": 0.5424746075715605, "grad_norm": 4.343223419966708, "learning_rate": 1.0265932107747656e-05, "loss": 1.1964, "step": 1175 }, { "epoch": 0.5447830101569714, "grad_norm": 3.4458152638291533, "learning_rate": 1.0185357857953064e-05, "loss": 1.188, "step": 1180 }, { "epoch": 0.5470914127423823, "grad_norm": 3.3343026801443765, "learning_rate": 1.0104771568207266e-05, "loss": 1.1524, "step": 1185 }, { "epoch": 0.5493998153277931, "grad_norm": 3.8325280372919774, "learning_rate": 1.0024178473006418e-05, "loss": 1.1445, "step": 1190 }, { "epoch": 0.551708217913204, "grad_norm": 3.913934401934443, "learning_rate": 9.943583807288746e-06, "loss": 1.1497, "step": 1195 }, { "epoch": 0.554016620498615, "grad_norm": 3.8771337742661585, "learning_rate": 9.862992806094473e-06, "loss": 1.1584, "step": 1200 }, { "epoch": 0.5563250230840259, "grad_norm": 3.385706053842486, "learning_rate": 9.782410704225793e-06, "loss": 1.133, "step": 1205 }, { "epoch": 0.5586334256694367, "grad_norm": 3.228558572718497, "learning_rate": 9.701842735906855e-06, "loss": 1.1714, "step": 1210 }, { "epoch": 0.5609418282548476, "grad_norm": 3.376489834368575, "learning_rate": 9.621294134443747e-06, "loss": 1.1782, "step": 1215 }, { "epoch": 0.5632502308402585, "grad_norm": 4.101023970778267, "learning_rate": 9.54077013188459e-06, "loss": 1.1679, "step": 1220 }, { "epoch": 0.5655586334256695, "grad_norm": 3.459693677788322, "learning_rate": 9.460275958679674e-06, "loss": 1.2272, "step": 1225 }, { "epoch": 0.5678670360110804, "grad_norm": 3.5741244509053556, "learning_rate": 9.379816843341715e-06, "loss": 1.1679, "step": 1230 }, { "epoch": 0.5701754385964912, "grad_norm": 14.959841662019736, "learning_rate": 9.299398012106246e-06, "loss": 1.1557, "step": 1235 }, { "epoch": 0.5724838411819021, "grad_norm": 3.479142794568544, "learning_rate": 9.219024688592136e-06, "loss": 1.191, "step": 1240 }, { "epoch": 0.574792243767313, "grad_norm": 3.4791994405128195, "learning_rate": 9.138702093462286e-06, "loss": 1.1632, "step": 1245 }, { "epoch": 0.577100646352724, "grad_norm": 3.378297795269278, "learning_rate": 9.058435444084543e-06, "loss": 1.2058, "step": 1250 }, { "epoch": 0.5794090489381348, "grad_norm": 3.3312286796444948, "learning_rate": 8.978229954192775e-06, "loss": 1.2072, "step": 1255 }, { "epoch": 0.5817174515235457, "grad_norm": 3.2936946867277497, "learning_rate": 8.898090833548226e-06, "loss": 1.1479, "step": 1260 }, { "epoch": 0.5840258541089566, "grad_norm": 3.5657195698986306, "learning_rate": 8.818023287601117e-06, "loss": 1.1579, "step": 1265 }, { "epoch": 0.5863342566943675, "grad_norm": 3.85534125869907, "learning_rate": 8.738032517152523e-06, "loss": 1.1748, "step": 1270 }, { "epoch": 0.5886426592797784, "grad_norm": 3.3807308381583585, "learning_rate": 8.658123718016548e-06, "loss": 1.1365, "step": 1275 }, { "epoch": 0.5909510618651893, "grad_norm": 3.75547737356039, "learning_rate": 8.578302080682844e-06, "loss": 1.1657, "step": 1280 }, { "epoch": 0.5932594644506002, "grad_norm": 3.334058557259955, "learning_rate": 8.498572789979446e-06, "loss": 1.1653, "step": 1285 }, { "epoch": 0.5955678670360111, "grad_norm": 3.596795067568704, "learning_rate": 8.418941024735997e-06, "loss": 1.1909, "step": 1290 }, { "epoch": 0.5978762696214219, "grad_norm": 3.754106205642103, "learning_rate": 8.33941195744737e-06, "loss": 1.1595, "step": 1295 }, { "epoch": 0.6001846722068329, "grad_norm": 3.3575559431036988, "learning_rate": 8.259990753937662e-06, "loss": 1.1378, "step": 1300 }, { "epoch": 0.6024930747922438, "grad_norm": 4.011372021010383, "learning_rate": 8.18068257302466e-06, "loss": 1.1832, "step": 1305 }, { "epoch": 0.6048014773776547, "grad_norm": 3.404379906541828, "learning_rate": 8.101492566184757e-06, "loss": 1.1592, "step": 1310 }, { "epoch": 0.6071098799630655, "grad_norm": 3.498132121516362, "learning_rate": 8.022425877218321e-06, "loss": 1.1591, "step": 1315 }, { "epoch": 0.6094182825484764, "grad_norm": 3.523586580045349, "learning_rate": 7.943487641915595e-06, "loss": 1.1525, "step": 1320 }, { "epoch": 0.6117266851338874, "grad_norm": 3.6229726894839858, "learning_rate": 7.864682987723082e-06, "loss": 1.1618, "step": 1325 }, { "epoch": 0.6140350877192983, "grad_norm": 3.696989469787097, "learning_rate": 7.78601703341051e-06, "loss": 1.1824, "step": 1330 }, { "epoch": 0.6163434903047091, "grad_norm": 3.567967173775001, "learning_rate": 7.70749488873833e-06, "loss": 1.1792, "step": 1335 }, { "epoch": 0.61865189289012, "grad_norm": 3.399928397766497, "learning_rate": 7.629121654125808e-06, "loss": 1.1438, "step": 1340 }, { "epoch": 0.6209602954755309, "grad_norm": 3.6344006441397414, "learning_rate": 7.550902420319742e-06, "loss": 1.1591, "step": 1345 }, { "epoch": 0.6232686980609419, "grad_norm": 3.538106316840523, "learning_rate": 7.472842268063776e-06, "loss": 1.1311, "step": 1350 }, { "epoch": 0.6255771006463527, "grad_norm": 3.661558906665894, "learning_rate": 7.394946267768381e-06, "loss": 1.1621, "step": 1355 }, { "epoch": 0.6278855032317636, "grad_norm": 3.6197107279149954, "learning_rate": 7.317219479181517e-06, "loss": 1.1028, "step": 1360 }, { "epoch": 0.6301939058171745, "grad_norm": 3.4094252840241355, "learning_rate": 7.23966695105996e-06, "loss": 1.119, "step": 1365 }, { "epoch": 0.6325023084025854, "grad_norm": 3.4085855538144467, "learning_rate": 7.162293720841378e-06, "loss": 1.1438, "step": 1370 }, { "epoch": 0.6348107109879964, "grad_norm": 4.073406312500022, "learning_rate": 7.085104814317101e-06, "loss": 1.1729, "step": 1375 }, { "epoch": 0.6371191135734072, "grad_norm": 3.572178264074241, "learning_rate": 7.008105245305699e-06, "loss": 1.1661, "step": 1380 }, { "epoch": 0.6394275161588181, "grad_norm": 3.81528951625221, "learning_rate": 6.931300015327274e-06, "loss": 1.1571, "step": 1385 }, { "epoch": 0.641735918744229, "grad_norm": 3.2846636335941763, "learning_rate": 6.854694113278614e-06, "loss": 1.154, "step": 1390 }, { "epoch": 0.6440443213296398, "grad_norm": 3.2544013227776007, "learning_rate": 6.7782925151091224e-06, "loss": 1.0823, "step": 1395 }, { "epoch": 0.6463527239150508, "grad_norm": 3.482450898904014, "learning_rate": 6.702100183497613e-06, "loss": 1.1803, "step": 1400 }, { "epoch": 0.6486611265004617, "grad_norm": 3.412256349030684, "learning_rate": 6.62612206752995e-06, "loss": 1.1643, "step": 1405 }, { "epoch": 0.6509695290858726, "grad_norm": 3.75196899322532, "learning_rate": 6.550363102377588e-06, "loss": 1.1117, "step": 1410 }, { "epoch": 0.6532779316712835, "grad_norm": 3.3485189294016258, "learning_rate": 6.474828208976998e-06, "loss": 1.1466, "step": 1415 }, { "epoch": 0.6555863342566943, "grad_norm": 3.4421443761863104, "learning_rate": 6.3995222937100455e-06, "loss": 1.1468, "step": 1420 }, { "epoch": 0.6578947368421053, "grad_norm": 3.4653107797221683, "learning_rate": 6.324450248085265e-06, "loss": 1.1418, "step": 1425 }, { "epoch": 0.6602031394275162, "grad_norm": 3.450235228111911, "learning_rate": 6.249616948420161e-06, "loss": 1.1393, "step": 1430 }, { "epoch": 0.6625115420129271, "grad_norm": 3.648594332616919, "learning_rate": 6.175027255524446e-06, "loss": 1.1263, "step": 1435 }, { "epoch": 0.6648199445983379, "grad_norm": 3.50804118935427, "learning_rate": 6.100686014384315e-06, "loss": 1.1497, "step": 1440 }, { "epoch": 0.6671283471837488, "grad_norm": 3.407303145023877, "learning_rate": 6.026598053847743e-06, "loss": 1.1217, "step": 1445 }, { "epoch": 0.6694367497691598, "grad_norm": 3.6049741156075426, "learning_rate": 5.952768186310813e-06, "loss": 1.2134, "step": 1450 }, { "epoch": 0.6717451523545707, "grad_norm": 3.347553717603198, "learning_rate": 5.879201207405136e-06, "loss": 1.1189, "step": 1455 }, { "epoch": 0.6740535549399815, "grad_norm": 3.7624263901785087, "learning_rate": 5.805901895686344e-06, "loss": 1.1217, "step": 1460 }, { "epoch": 0.6763619575253924, "grad_norm": 3.6359056480115193, "learning_rate": 5.732875012323712e-06, "loss": 1.1275, "step": 1465 }, { "epoch": 0.6786703601108033, "grad_norm": 3.5660085050284946, "learning_rate": 5.660125300790873e-06, "loss": 1.153, "step": 1470 }, { "epoch": 0.6809787626962143, "grad_norm": 3.4188915438262946, "learning_rate": 5.58765748655772e-06, "loss": 1.126, "step": 1475 }, { "epoch": 0.6832871652816251, "grad_norm": 3.7409360766713995, "learning_rate": 5.5154762767834605e-06, "loss": 1.1312, "step": 1480 }, { "epoch": 0.685595567867036, "grad_norm": 3.5176838276710787, "learning_rate": 5.443586360010859e-06, "loss": 1.118, "step": 1485 }, { "epoch": 0.6879039704524469, "grad_norm": 3.940112737940071, "learning_rate": 5.3719924058616975e-06, "loss": 1.1084, "step": 1490 }, { "epoch": 0.6902123730378578, "grad_norm": 3.5725656073039516, "learning_rate": 5.30069906473345e-06, "loss": 1.1462, "step": 1495 }, { "epoch": 0.6925207756232687, "grad_norm": 3.585328985764251, "learning_rate": 5.2297109674972166e-06, "loss": 1.1275, "step": 1500 }, { "epoch": 0.6948291782086796, "grad_norm": 3.7150630899084276, "learning_rate": 5.159032725196946e-06, "loss": 1.1573, "step": 1505 }, { "epoch": 0.6971375807940905, "grad_norm": 3.4991531847637893, "learning_rate": 5.088668928749891e-06, "loss": 1.1339, "step": 1510 }, { "epoch": 0.6994459833795014, "grad_norm": 3.337315702796277, "learning_rate": 5.0186241486484245e-06, "loss": 1.1121, "step": 1515 }, { "epoch": 0.7017543859649122, "grad_norm": 3.166495977462906, "learning_rate": 4.948902934663158e-06, "loss": 1.1207, "step": 1520 }, { "epoch": 0.7040627885503232, "grad_norm": 3.269883473204096, "learning_rate": 4.879509815547413e-06, "loss": 1.1067, "step": 1525 }, { "epoch": 0.7063711911357341, "grad_norm": 3.2549683491943138, "learning_rate": 4.810449298743051e-06, "loss": 1.0858, "step": 1530 }, { "epoch": 0.708679593721145, "grad_norm": 3.673192940396545, "learning_rate": 4.741725870087693e-06, "loss": 1.1674, "step": 1535 }, { "epoch": 0.7109879963065558, "grad_norm": 3.295243146355197, "learning_rate": 4.673343993523347e-06, "loss": 1.1087, "step": 1540 }, { "epoch": 0.7132963988919667, "grad_norm": 3.4162872942710867, "learning_rate": 4.605308110806436e-06, "loss": 1.1224, "step": 1545 }, { "epoch": 0.7156048014773777, "grad_norm": 3.3989883160652865, "learning_rate": 4.537622641219309e-06, "loss": 1.1307, "step": 1550 }, { "epoch": 0.7179132040627886, "grad_norm": 3.2956559559454663, "learning_rate": 4.47029198128316e-06, "loss": 1.0944, "step": 1555 }, { "epoch": 0.7202216066481995, "grad_norm": 3.3797718456778765, "learning_rate": 4.403320504472463e-06, "loss": 1.1426, "step": 1560 }, { "epoch": 0.7225300092336103, "grad_norm": 3.1832339015639826, "learning_rate": 4.336712560930891e-06, "loss": 1.1223, "step": 1565 }, { "epoch": 0.7248384118190212, "grad_norm": 3.40273969921815, "learning_rate": 4.270472477188755e-06, "loss": 1.1151, "step": 1570 }, { "epoch": 0.7271468144044322, "grad_norm": 3.32953363908172, "learning_rate": 4.204604555881967e-06, "loss": 1.1055, "step": 1575 }, { "epoch": 0.7294552169898431, "grad_norm": 3.363759228103856, "learning_rate": 4.139113075472565e-06, "loss": 1.15, "step": 1580 }, { "epoch": 0.7317636195752539, "grad_norm": 3.5692625205390214, "learning_rate": 4.074002289970801e-06, "loss": 1.1249, "step": 1585 }, { "epoch": 0.7340720221606648, "grad_norm": 3.6298117912857895, "learning_rate": 4.009276428658836e-06, "loss": 1.0911, "step": 1590 }, { "epoch": 0.7363804247460757, "grad_norm": 3.501911680130801, "learning_rate": 3.944939695816005e-06, "loss": 1.0591, "step": 1595 }, { "epoch": 0.7386888273314867, "grad_norm": 3.314254645913856, "learning_rate": 3.8809962704457375e-06, "loss": 1.122, "step": 1600 }, { "epoch": 0.7409972299168975, "grad_norm": 3.56145944415269, "learning_rate": 3.81745030600411e-06, "loss": 1.1036, "step": 1605 }, { "epoch": 0.7433056325023084, "grad_norm": 3.4910849192084235, "learning_rate": 3.75430593013006e-06, "loss": 1.1353, "step": 1610 }, { "epoch": 0.7456140350877193, "grad_norm": 3.325715619787326, "learning_rate": 3.6915672443772644e-06, "loss": 1.1538, "step": 1615 }, { "epoch": 0.7479224376731302, "grad_norm": 3.5950013679874724, "learning_rate": 3.62923832394774e-06, "loss": 1.0909, "step": 1620 }, { "epoch": 0.7502308402585411, "grad_norm": 3.1524005532212334, "learning_rate": 3.56732321742712e-06, "loss": 1.1125, "step": 1625 }, { "epoch": 0.752539242843952, "grad_norm": 3.6760451234626124, "learning_rate": 3.5058259465216828e-06, "loss": 1.1039, "step": 1630 }, { "epoch": 0.7548476454293629, "grad_norm": 3.341546948891595, "learning_rate": 3.444750505797123e-06, "loss": 1.0531, "step": 1635 }, { "epoch": 0.7571560480147738, "grad_norm": 3.35627649123262, "learning_rate": 3.384100862419096e-06, "loss": 1.0931, "step": 1640 }, { "epoch": 0.7594644506001846, "grad_norm": 3.6221419833131527, "learning_rate": 3.3238809558955054e-06, "loss": 1.0797, "step": 1645 }, { "epoch": 0.7617728531855956, "grad_norm": 3.3487671296828267, "learning_rate": 3.2640946978206266e-06, "loss": 1.0812, "step": 1650 }, { "epoch": 0.7640812557710065, "grad_norm": 3.441031645390376, "learning_rate": 3.2047459716210306e-06, "loss": 1.1155, "step": 1655 }, { "epoch": 0.7663896583564174, "grad_norm": 3.4825057106301096, "learning_rate": 3.145838632303325e-06, "loss": 1.096, "step": 1660 }, { "epoch": 0.7686980609418282, "grad_norm": 3.4525699686491875, "learning_rate": 3.087376506203763e-06, "loss": 1.145, "step": 1665 }, { "epoch": 0.7710064635272391, "grad_norm": 3.2639030957505715, "learning_rate": 3.0293633907396903e-06, "loss": 1.0711, "step": 1670 }, { "epoch": 0.7733148661126501, "grad_norm": 3.247147491878351, "learning_rate": 2.971803054162903e-06, "loss": 1.0367, "step": 1675 }, { "epoch": 0.775623268698061, "grad_norm": 3.3628039668359824, "learning_rate": 2.914699235314855e-06, "loss": 1.1311, "step": 1680 }, { "epoch": 0.7779316712834718, "grad_norm": 3.294560766749018, "learning_rate": 2.858055643383818e-06, "loss": 1.1303, "step": 1685 }, { "epoch": 0.7802400738688827, "grad_norm": 3.252460881051861, "learning_rate": 2.8018759576639478e-06, "loss": 1.0894, "step": 1690 }, { "epoch": 0.7825484764542936, "grad_norm": 3.6541818791755083, "learning_rate": 2.7461638273162895e-06, "loss": 1.1416, "step": 1695 }, { "epoch": 0.7848568790397045, "grad_norm": 3.3018114290440286, "learning_rate": 2.6909228711317526e-06, "loss": 1.0898, "step": 1700 }, { "epoch": 0.7871652816251155, "grad_norm": 3.5110479717681704, "learning_rate": 2.6361566772960466e-06, "loss": 1.0887, "step": 1705 }, { "epoch": 0.7894736842105263, "grad_norm": 3.469571849173682, "learning_rate": 2.5818688031566132e-06, "loss": 1.0182, "step": 1710 }, { "epoch": 0.7917820867959372, "grad_norm": 3.761287355693432, "learning_rate": 2.5280627749915544e-06, "loss": 1.1246, "step": 1715 }, { "epoch": 0.7940904893813481, "grad_norm": 3.7171990367681866, "learning_rate": 2.4747420877805905e-06, "loss": 1.1008, "step": 1720 }, { "epoch": 0.796398891966759, "grad_norm": 3.583342537171837, "learning_rate": 2.421910204978033e-06, "loss": 1.092, "step": 1725 }, { "epoch": 0.7987072945521699, "grad_norm": 3.3105866570237343, "learning_rate": 2.369570558287819e-06, "loss": 1.0495, "step": 1730 }, { "epoch": 0.8010156971375808, "grad_norm": 3.453250654565143, "learning_rate": 2.3177265474406084e-06, "loss": 1.0952, "step": 1735 }, { "epoch": 0.8033240997229917, "grad_norm": 3.2111312681793294, "learning_rate": 2.2663815399729495e-06, "loss": 1.0756, "step": 1740 }, { "epoch": 0.8056325023084026, "grad_norm": 3.398739502823191, "learning_rate": 2.215538871008538e-06, "loss": 1.0855, "step": 1745 }, { "epoch": 0.8079409048938134, "grad_norm": 3.4089573083048883, "learning_rate": 2.1652018430415923e-06, "loss": 1.0707, "step": 1750 }, { "epoch": 0.8102493074792244, "grad_norm": 3.7996382043873744, "learning_rate": 2.115373725722326e-06, "loss": 1.1419, "step": 1755 }, { "epoch": 0.8125577100646353, "grad_norm": 3.4303103622199203, "learning_rate": 2.066057755644587e-06, "loss": 1.1101, "step": 1760 }, { "epoch": 0.8148661126500462, "grad_norm": 3.3758394994097363, "learning_rate": 2.0172571361356007e-06, "loss": 1.0975, "step": 1765 }, { "epoch": 0.817174515235457, "grad_norm": 3.2901551940425673, "learning_rate": 1.9689750370479134e-06, "loss": 1.0797, "step": 1770 }, { "epoch": 0.8194829178208679, "grad_norm": 3.661068632899665, "learning_rate": 1.921214594553488e-06, "loss": 1.1287, "step": 1775 }, { "epoch": 0.8217913204062789, "grad_norm": 3.5442080312978415, "learning_rate": 1.8739789109399954e-06, "loss": 1.1514, "step": 1780 }, { "epoch": 0.8240997229916898, "grad_norm": 3.3534741257777325, "learning_rate": 1.8272710544093019e-06, "loss": 1.0824, "step": 1785 }, { "epoch": 0.8264081255771006, "grad_norm": 3.570055818522298, "learning_rate": 1.7810940588781811e-06, "loss": 1.1313, "step": 1790 }, { "epoch": 0.8287165281625115, "grad_norm": 3.3907592881825352, "learning_rate": 1.7354509237812334e-06, "loss": 1.0458, "step": 1795 }, { "epoch": 0.8310249307479224, "grad_norm": 3.7660635086416794, "learning_rate": 1.690344613876066e-06, "loss": 1.109, "step": 1800 }, { "epoch": 0.8333333333333334, "grad_norm": 20.624336407323348, "learning_rate": 1.64577805905072e-06, "loss": 1.0872, "step": 1805 }, { "epoch": 0.8356417359187442, "grad_norm": 3.38434013035599, "learning_rate": 1.601754154133347e-06, "loss": 1.0943, "step": 1810 }, { "epoch": 0.8379501385041551, "grad_norm": 3.3282071197431318, "learning_rate": 1.558275758704183e-06, "loss": 1.0983, "step": 1815 }, { "epoch": 0.840258541089566, "grad_norm": 3.4156960745203286, "learning_rate": 1.5153456969098013e-06, "loss": 1.0381, "step": 1820 }, { "epoch": 0.8425669436749769, "grad_norm": 3.3418973703656274, "learning_rate": 1.4729667572796735e-06, "loss": 1.1452, "step": 1825 }, { "epoch": 0.8448753462603878, "grad_norm": 3.333897377962453, "learning_rate": 1.431141692545036e-06, "loss": 1.1076, "step": 1830 }, { "epoch": 0.8471837488457987, "grad_norm": 3.402941306050666, "learning_rate": 1.389873219460085e-06, "loss": 1.0869, "step": 1835 }, { "epoch": 0.8494921514312096, "grad_norm": 3.3313186519496423, "learning_rate": 1.349164018625513e-06, "loss": 1.0765, "step": 1840 }, { "epoch": 0.8518005540166205, "grad_norm": 3.6011720414080566, "learning_rate": 1.3090167343143911e-06, "loss": 1.0846, "step": 1845 }, { "epoch": 0.8541089566020313, "grad_norm": 3.629326020817196, "learning_rate": 1.2694339743004037e-06, "loss": 1.1088, "step": 1850 }, { "epoch": 0.8564173591874423, "grad_norm": 3.6305906598709767, "learning_rate": 1.2304183096884626e-06, "loss": 1.0875, "step": 1855 }, { "epoch": 0.8587257617728532, "grad_norm": 3.35865168543221, "learning_rate": 1.1919722747477024e-06, "loss": 1.1143, "step": 1860 }, { "epoch": 0.8610341643582641, "grad_norm": 3.3889339992199177, "learning_rate": 1.1540983667468686e-06, "loss": 1.0916, "step": 1865 }, { "epoch": 0.863342566943675, "grad_norm": 3.3133014347890324, "learning_rate": 1.1167990457920985e-06, "loss": 1.0877, "step": 1870 }, { "epoch": 0.8656509695290858, "grad_norm": 3.415023896862017, "learning_rate": 1.0800767346671347e-06, "loss": 1.0284, "step": 1875 }, { "epoch": 0.8679593721144968, "grad_norm": 3.322962975732958, "learning_rate": 1.043933818675944e-06, "loss": 1.0782, "step": 1880 }, { "epoch": 0.8702677746999077, "grad_norm": 3.583896655771928, "learning_rate": 1.008372645487785e-06, "loss": 1.08, "step": 1885 }, { "epoch": 0.8725761772853186, "grad_norm": 3.3057678718948726, "learning_rate": 9.733955249847183e-07, "loss": 1.1034, "step": 1890 }, { "epoch": 0.8748845798707294, "grad_norm": 3.4387092657320997, "learning_rate": 9.390047291115567e-07, "loss": 1.0915, "step": 1895 }, { "epoch": 0.8771929824561403, "grad_norm": 3.8029482282950324, "learning_rate": 9.052024917282987e-07, "loss": 1.057, "step": 1900 }, { "epoch": 0.8795013850415513, "grad_norm": 3.3990790831971465, "learning_rate": 8.719910084650262e-07, "loss": 1.0725, "step": 1905 }, { "epoch": 0.8818097876269622, "grad_norm": 3.262416726762208, "learning_rate": 8.393724365792866e-07, "loss": 1.1028, "step": 1910 }, { "epoch": 0.884118190212373, "grad_norm": 3.551691283783414, "learning_rate": 8.073488948159691e-07, "loss": 1.0546, "step": 1915 }, { "epoch": 0.8864265927977839, "grad_norm": 3.5211563130144197, "learning_rate": 7.759224632696793e-07, "loss": 1.1024, "step": 1920 }, { "epoch": 0.8887349953831948, "grad_norm": 3.5958803804208976, "learning_rate": 7.450951832496233e-07, "loss": 1.0698, "step": 1925 }, { "epoch": 0.8910433979686058, "grad_norm": 4.107811963680795, "learning_rate": 7.148690571470251e-07, "loss": 1.0613, "step": 1930 }, { "epoch": 0.8933518005540166, "grad_norm": 3.6280688174940416, "learning_rate": 6.852460483050494e-07, "loss": 1.0987, "step": 1935 }, { "epoch": 0.8956602031394275, "grad_norm": 3.4197153407779055, "learning_rate": 6.562280808912768e-07, "loss": 1.081, "step": 1940 }, { "epoch": 0.8979686057248384, "grad_norm": 3.3975321682078494, "learning_rate": 6.278170397727179e-07, "loss": 1.0881, "step": 1945 }, { "epoch": 0.9002770083102493, "grad_norm": 3.385824657440924, "learning_rate": 6.000147703933845e-07, "loss": 1.0725, "step": 1950 }, { "epoch": 0.9025854108956602, "grad_norm": 3.733106691108023, "learning_rate": 5.728230786544153e-07, "loss": 1.0886, "step": 1955 }, { "epoch": 0.9048938134810711, "grad_norm": 3.3831515124529288, "learning_rate": 5.46243730796776e-07, "loss": 1.0854, "step": 1960 }, { "epoch": 0.907202216066482, "grad_norm": 3.4106013139907065, "learning_rate": 5.202784532865302e-07, "loss": 1.114, "step": 1965 }, { "epoch": 0.9095106186518929, "grad_norm": 3.130011381325973, "learning_rate": 4.949289327026952e-07, "loss": 1.0873, "step": 1970 }, { "epoch": 0.9118190212373037, "grad_norm": 3.3600219468750394, "learning_rate": 4.7019681562769816e-07, "loss": 1.0689, "step": 1975 }, { "epoch": 0.9141274238227147, "grad_norm": 3.379655670825615, "learning_rate": 4.460837085404113e-07, "loss": 1.0874, "step": 1980 }, { "epoch": 0.9164358264081256, "grad_norm": 3.324809563310868, "learning_rate": 4.225911777118097e-07, "loss": 1.0894, "step": 1985 }, { "epoch": 0.9187442289935365, "grad_norm": 3.4668181744618196, "learning_rate": 3.9972074910323066e-07, "loss": 1.0896, "step": 1990 }, { "epoch": 0.9210526315789473, "grad_norm": 3.4175120046363276, "learning_rate": 3.7747390826725736e-07, "loss": 1.0608, "step": 1995 }, { "epoch": 0.9233610341643582, "grad_norm": 3.365932789028912, "learning_rate": 3.5585210025122166e-07, "loss": 1.0465, "step": 2000 }, { "epoch": 0.9256694367497692, "grad_norm": 3.3721429412301442, "learning_rate": 3.3485672950334447e-07, "loss": 1.0782, "step": 2005 }, { "epoch": 0.9279778393351801, "grad_norm": 3.402893452692765, "learning_rate": 3.1448915978150365e-07, "loss": 1.0575, "step": 2010 }, { "epoch": 0.930286241920591, "grad_norm": 3.3246351042614606, "learning_rate": 2.947507140646588e-07, "loss": 1.093, "step": 2015 }, { "epoch": 0.9325946445060018, "grad_norm": 3.42392243323848, "learning_rate": 2.756426744669105e-07, "loss": 1.0709, "step": 2020 }, { "epoch": 0.9349030470914127, "grad_norm": 3.3870385964627565, "learning_rate": 2.57166282154222e-07, "loss": 1.0944, "step": 2025 }, { "epoch": 0.9372114496768237, "grad_norm": 3.4345800654530128, "learning_rate": 2.393227372638018e-07, "loss": 1.0829, "step": 2030 }, { "epoch": 0.9395198522622346, "grad_norm": 3.2304527099741094, "learning_rate": 2.221131988261438e-07, "loss": 1.0663, "step": 2035 }, { "epoch": 0.9418282548476454, "grad_norm": 3.4212248154000324, "learning_rate": 2.055387846897472e-07, "loss": 1.0608, "step": 2040 }, { "epoch": 0.9441366574330563, "grad_norm": 3.3424495231710356, "learning_rate": 1.8960057144850163e-07, "loss": 1.0513, "step": 2045 }, { "epoch": 0.9464450600184672, "grad_norm": 8.42913586604929, "learning_rate": 1.742995943717607e-07, "loss": 1.0698, "step": 2050 }, { "epoch": 0.9487534626038782, "grad_norm": 4.03605470816158, "learning_rate": 1.5963684733709462e-07, "loss": 1.0787, "step": 2055 }, { "epoch": 0.951061865189289, "grad_norm": 3.572766321551919, "learning_rate": 1.4561328276573415e-07, "loss": 1.0625, "step": 2060 }, { "epoch": 0.9533702677746999, "grad_norm": 3.213406555168112, "learning_rate": 1.3222981156070126e-07, "loss": 1.0861, "step": 2065 }, { "epoch": 0.9556786703601108, "grad_norm": 3.216022210724082, "learning_rate": 1.1948730304764622e-07, "loss": 1.0572, "step": 2070 }, { "epoch": 0.9579870729455217, "grad_norm": 3.8142801195990237, "learning_rate": 1.073865849183786e-07, "loss": 1.1151, "step": 2075 }, { "epoch": 0.9602954755309326, "grad_norm": 3.2011503381896413, "learning_rate": 9.592844317710238e-08, "loss": 1.0585, "step": 2080 }, { "epoch": 0.9626038781163435, "grad_norm": 3.3780038857652226, "learning_rate": 8.511362208936447e-08, "loss": 1.0591, "step": 2085 }, { "epoch": 0.9649122807017544, "grad_norm": 3.3212612452494295, "learning_rate": 7.494282413371135e-08, "loss": 1.0787, "step": 2090 }, { "epoch": 0.9672206832871653, "grad_norm": 3.797857330316498, "learning_rate": 6.541670995605321e-08, "loss": 1.0859, "step": 2095 }, { "epoch": 0.9695290858725761, "grad_norm": 3.153773745189338, "learning_rate": 5.653589832675943e-08, "loss": 1.0983, "step": 2100 }, { "epoch": 0.9718374884579871, "grad_norm": 3.4652822167549906, "learning_rate": 4.830096610045854e-08, "loss": 1.0713, "step": 2105 }, { "epoch": 0.974145891043398, "grad_norm": 3.6601967632905796, "learning_rate": 4.071244817857589e-08, "loss": 1.1118, "step": 2110 }, { "epoch": 0.9764542936288089, "grad_norm": 3.135385406063897, "learning_rate": 3.3770837474584874e-08, "loss": 1.072, "step": 2115 }, { "epoch": 0.9787626962142197, "grad_norm": 3.4884714571677784, "learning_rate": 2.747658488199023e-08, "loss": 1.0738, "step": 2120 }, { "epoch": 0.9810710987996306, "grad_norm": 3.7803263448925706, "learning_rate": 2.1830099245040427e-08, "loss": 1.0549, "step": 2125 }, { "epoch": 0.9833795013850416, "grad_norm": 3.3045019552603585, "learning_rate": 1.683174733216997e-08, "loss": 1.1129, "step": 2130 }, { "epoch": 0.9856879039704525, "grad_norm": 3.2212668180182784, "learning_rate": 1.248185381217848e-08, "loss": 1.0777, "step": 2135 }, { "epoch": 0.9879963065558633, "grad_norm": 3.324768260260177, "learning_rate": 8.780701233139789e-09, "loss": 1.0503, "step": 2140 }, { "epoch": 0.9903047091412742, "grad_norm": 3.214869100486745, "learning_rate": 5.728530004051047e-09, "loss": 1.0367, "step": 2145 }, { "epoch": 0.9926131117266851, "grad_norm": 3.3583215428853666, "learning_rate": 3.325538379211901e-09, "loss": 1.0554, "step": 2150 }, { "epoch": 0.9949215143120961, "grad_norm": 4.075445923312751, "learning_rate": 1.5718824453525572e-09, "loss": 1.1222, "step": 2155 }, { "epoch": 0.997229916897507, "grad_norm": 3.3599147688364903, "learning_rate": 4.676761114941197e-10, "loss": 1.0646, "step": 2160 }, { "epoch": 0.9995383194829178, "grad_norm": 3.4496692841727543, "learning_rate": 1.2991101545622998e-11, "loss": 1.1038, "step": 2165 }, { "epoch": 1.0, "eval_loss": 1.1177629232406616, "eval_runtime": 1154.8442, "eval_samples_per_second": 26.579, "eval_steps_per_second": 0.831, "step": 2166 }, { "epoch": 1.0, "step": 2166, "total_flos": 113379083550720.0, "train_loss": 1.171416565762112, "train_runtime": 11018.7418, "train_samples_per_second": 6.29, "train_steps_per_second": 0.197 } ], "logging_steps": 5, "max_steps": 2166, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 113379083550720.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }