{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 789, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012674271229404308, "grad_norm": 0.8615043469450224, "learning_rate": 2.531645569620253e-06, "loss": 0.479, "step": 1 }, { "epoch": 0.0063371356147021544, "grad_norm": 1.2064683000577734, "learning_rate": 1.2658227848101267e-05, "loss": 0.5395, "step": 5 }, { "epoch": 0.012674271229404309, "grad_norm": 1.0410453983480228, "learning_rate": 2.5316455696202533e-05, "loss": 0.6061, "step": 10 }, { "epoch": 0.019011406844106463, "grad_norm": 0.6487558463156515, "learning_rate": 3.79746835443038e-05, "loss": 0.5462, "step": 15 }, { "epoch": 0.025348542458808618, "grad_norm": 0.5370587969238408, "learning_rate": 5.0632911392405066e-05, "loss": 0.4583, "step": 20 }, { "epoch": 0.031685678073510776, "grad_norm": 0.5034416625324662, "learning_rate": 6.329113924050633e-05, "loss": 0.4883, "step": 25 }, { "epoch": 0.03802281368821293, "grad_norm": 0.39983399371170963, "learning_rate": 7.59493670886076e-05, "loss": 0.3405, "step": 30 }, { "epoch": 0.044359949302915085, "grad_norm": 0.43348705894690165, "learning_rate": 8.860759493670887e-05, "loss": 0.4164, "step": 35 }, { "epoch": 0.050697084917617236, "grad_norm": 0.38455150629020124, "learning_rate": 0.00010126582278481013, "loss": 0.3655, "step": 40 }, { "epoch": 0.057034220532319393, "grad_norm": 0.4620346635793738, "learning_rate": 0.0001139240506329114, "loss": 0.321, "step": 45 }, { "epoch": 0.06337135614702155, "grad_norm": 0.5487179791839946, "learning_rate": 0.00012658227848101267, "loss": 0.3917, "step": 50 }, { "epoch": 0.0697084917617237, "grad_norm": 0.4322988318215336, "learning_rate": 0.00013924050632911395, "loss": 0.2954, "step": 55 }, { "epoch": 0.07604562737642585, "grad_norm": 0.45659465975830865, "learning_rate": 0.0001518987341772152, "loss": 0.3362, "step": 60 }, { "epoch": 0.08238276299112801, "grad_norm": 0.5012029391973714, "learning_rate": 0.00016455696202531648, "loss": 0.3201, "step": 65 }, { "epoch": 0.08871989860583017, "grad_norm": 0.4257121777036391, "learning_rate": 0.00017721518987341773, "loss": 0.3103, "step": 70 }, { "epoch": 0.09505703422053231, "grad_norm": 0.49930786967179713, "learning_rate": 0.00018987341772151899, "loss": 0.3895, "step": 75 }, { "epoch": 0.10139416983523447, "grad_norm": 0.3987283812690162, "learning_rate": 0.00019999902106840922, "loss": 0.2607, "step": 80 }, { "epoch": 0.10773130544993663, "grad_norm": 0.4941124105408692, "learning_rate": 0.00019996476047513454, "loss": 0.3509, "step": 85 }, { "epoch": 0.11406844106463879, "grad_norm": 0.35089312674321954, "learning_rate": 0.00019988157246677513, "loss": 0.323, "step": 90 }, { "epoch": 0.12040557667934093, "grad_norm": 0.4238705680350401, "learning_rate": 0.00019974949775942134, "loss": 0.2834, "step": 95 }, { "epoch": 0.1267427122940431, "grad_norm": 0.45602469539148643, "learning_rate": 0.00019956860099659346, "loss": 0.3492, "step": 100 }, { "epoch": 0.13307984790874525, "grad_norm": 0.40420721250262537, "learning_rate": 0.00019933897071760235, "loss": 0.2865, "step": 105 }, { "epoch": 0.1394169835234474, "grad_norm": 0.47242229343887376, "learning_rate": 0.00019906071931421413, "loss": 0.3047, "step": 110 }, { "epoch": 0.14575411913814956, "grad_norm": 0.3662230634874705, "learning_rate": 0.00019873398297564037, "loss": 0.3183, "step": 115 }, { "epoch": 0.1520912547528517, "grad_norm": 0.3786550636833906, "learning_rate": 0.00019835892162188066, "loss": 0.2903, "step": 120 }, { "epoch": 0.15842839036755388, "grad_norm": 0.42556611928998644, "learning_rate": 0.00019793571882545047, "loss": 0.3298, "step": 125 }, { "epoch": 0.16476552598225602, "grad_norm": 0.3477579859642712, "learning_rate": 0.0001974645817215322, "loss": 0.2811, "step": 130 }, { "epoch": 0.17110266159695817, "grad_norm": 0.338149840344514, "learning_rate": 0.0001969457409065933, "loss": 0.3218, "step": 135 }, { "epoch": 0.17743979721166034, "grad_norm": 0.31604418234996284, "learning_rate": 0.0001963794503255219, "loss": 0.3181, "step": 140 }, { "epoch": 0.18377693282636248, "grad_norm": 0.37941148877648684, "learning_rate": 0.00019576598714733431, "loss": 0.2708, "step": 145 }, { "epoch": 0.19011406844106463, "grad_norm": 0.41662892297732984, "learning_rate": 0.00019510565162951537, "loss": 0.3133, "step": 150 }, { "epoch": 0.1964512040557668, "grad_norm": 0.33440472234808516, "learning_rate": 0.0001943987669710586, "loss": 0.3027, "step": 155 }, { "epoch": 0.20278833967046894, "grad_norm": 0.3858834779514916, "learning_rate": 0.0001936456791542776, "loss": 0.3148, "step": 160 }, { "epoch": 0.20912547528517111, "grad_norm": 0.3588666651678752, "learning_rate": 0.000192846756775466, "loss": 0.3038, "step": 165 }, { "epoch": 0.21546261089987326, "grad_norm": 0.3923595420574365, "learning_rate": 0.00019200239086448933, "loss": 0.251, "step": 170 }, { "epoch": 0.2217997465145754, "grad_norm": 0.3720411933468231, "learning_rate": 0.0001911129946933968, "loss": 0.3066, "step": 175 }, { "epoch": 0.22813688212927757, "grad_norm": 0.32287305381502635, "learning_rate": 0.00019017900357414669, "loss": 0.3021, "step": 180 }, { "epoch": 0.23447401774397972, "grad_norm": 0.3574163723783811, "learning_rate": 0.00018920087464554427, "loss": 0.3156, "step": 185 }, { "epoch": 0.24081115335868186, "grad_norm": 0.4152681195493882, "learning_rate": 0.0001881790866494969, "loss": 0.2839, "step": 190 }, { "epoch": 0.24714828897338403, "grad_norm": 0.36834582922377895, "learning_rate": 0.00018711413969669526, "loss": 0.2945, "step": 195 }, { "epoch": 0.2534854245880862, "grad_norm": 0.39981519235640095, "learning_rate": 0.00018600655502183612, "loss": 0.3189, "step": 200 }, { "epoch": 0.2534854245880862, "eval_loss": 0.28277337551116943, "eval_runtime": 1084.4496, "eval_samples_per_second": 3.689, "eval_steps_per_second": 0.115, "step": 200 }, { "epoch": 0.2598225602027883, "grad_norm": 0.31219177577589374, "learning_rate": 0.0001848568747285054, "loss": 0.2299, "step": 205 }, { "epoch": 0.2661596958174905, "grad_norm": 0.37017559919032633, "learning_rate": 0.00018366566152384773, "loss": 0.2753, "step": 210 }, { "epoch": 0.27249683143219267, "grad_norm": 0.29894743373031823, "learning_rate": 0.00018243349844315117, "loss": 0.2734, "step": 215 }, { "epoch": 0.2788339670468948, "grad_norm": 0.3227139696142255, "learning_rate": 0.00018116098856448253, "loss": 0.2595, "step": 220 }, { "epoch": 0.28517110266159695, "grad_norm": 0.3780707564332847, "learning_rate": 0.00017984875471351302, "loss": 0.3107, "step": 225 }, { "epoch": 0.2915082382762991, "grad_norm": 0.31519655631233245, "learning_rate": 0.00017849743915867807, "loss": 0.2336, "step": 230 }, { "epoch": 0.29784537389100124, "grad_norm": 0.35840275346083816, "learning_rate": 0.00017710770329682144, "loss": 0.2939, "step": 235 }, { "epoch": 0.3041825095057034, "grad_norm": 0.2971942370747859, "learning_rate": 0.0001756802273294766, "loss": 0.2959, "step": 240 }, { "epoch": 0.3105196451204056, "grad_norm": 0.3247193570102783, "learning_rate": 0.0001742157099299445, "loss": 0.2373, "step": 245 }, { "epoch": 0.31685678073510776, "grad_norm": 0.38491700907647547, "learning_rate": 0.00017271486790133023, "loss": 0.3272, "step": 250 }, { "epoch": 0.3231939163498099, "grad_norm": 0.3075011053275653, "learning_rate": 0.00017117843582570608, "loss": 0.2377, "step": 255 }, { "epoch": 0.32953105196451205, "grad_norm": 0.31453423737207853, "learning_rate": 0.00016960716570457292, "loss": 0.2842, "step": 260 }, { "epoch": 0.3358681875792142, "grad_norm": 0.24817557882025235, "learning_rate": 0.00016800182659079568, "loss": 0.2721, "step": 265 }, { "epoch": 0.34220532319391633, "grad_norm": 0.3377039172873109, "learning_rate": 0.00016636320421219278, "loss": 0.2463, "step": 270 }, { "epoch": 0.3485424588086185, "grad_norm": 0.3661471684768712, "learning_rate": 0.00016469210058696446, "loss": 0.3025, "step": 275 }, { "epoch": 0.3548795944233207, "grad_norm": 0.27351558488185734, "learning_rate": 0.0001629893336311477, "loss": 0.2335, "step": 280 }, { "epoch": 0.3612167300380228, "grad_norm": 0.4358770290773004, "learning_rate": 0.00016125573675828983, "loss": 0.279, "step": 285 }, { "epoch": 0.36755386565272496, "grad_norm": 0.2774172437400498, "learning_rate": 0.00015949215847153717, "loss": 0.2581, "step": 290 }, { "epoch": 0.37389100126742714, "grad_norm": 0.28991353506835355, "learning_rate": 0.00015769946194833817, "loss": 0.2314, "step": 295 }, { "epoch": 0.38022813688212925, "grad_norm": 0.4117767724226869, "learning_rate": 0.00015587852461796376, "loss": 0.3218, "step": 300 }, { "epoch": 0.3865652724968314, "grad_norm": 0.31038917537934363, "learning_rate": 0.00015403023773205286, "loss": 0.2315, "step": 305 }, { "epoch": 0.3929024081115336, "grad_norm": 0.3358971163136089, "learning_rate": 0.00015215550592839218, "loss": 0.302, "step": 310 }, { "epoch": 0.39923954372623577, "grad_norm": 0.24064351622215968, "learning_rate": 0.00015025524678814427, "loss": 0.2795, "step": 315 }, { "epoch": 0.4055766793409379, "grad_norm": 0.32648334764287845, "learning_rate": 0.00014833039038674047, "loss": 0.2398, "step": 320 }, { "epoch": 0.41191381495564006, "grad_norm": 0.4100611479861272, "learning_rate": 0.0001463818788386588, "loss": 0.2849, "step": 325 }, { "epoch": 0.41825095057034223, "grad_norm": 0.333871597957756, "learning_rate": 0.00014441066583630906, "loss": 0.2297, "step": 330 }, { "epoch": 0.42458808618504434, "grad_norm": 0.34174204454819196, "learning_rate": 0.00014241771618325123, "loss": 0.2704, "step": 335 }, { "epoch": 0.4309252217997465, "grad_norm": 0.2539128471983557, "learning_rate": 0.00014040400532197583, "loss": 0.2579, "step": 340 }, { "epoch": 0.4372623574144487, "grad_norm": 0.29762852904975656, "learning_rate": 0.0001383705188564767, "loss": 0.255, "step": 345 }, { "epoch": 0.4435994930291508, "grad_norm": 0.3477101390224288, "learning_rate": 0.00013631825206985063, "loss": 0.2841, "step": 350 }, { "epoch": 0.449936628643853, "grad_norm": 0.2807172517529508, "learning_rate": 0.0001342482094371591, "loss": 0.2112, "step": 355 }, { "epoch": 0.45627376425855515, "grad_norm": 0.3838095185531826, "learning_rate": 0.00013216140413379167, "loss": 0.282, "step": 360 }, { "epoch": 0.46261089987325726, "grad_norm": 0.2421625693621495, "learning_rate": 0.00013005885753957048, "loss": 0.2473, "step": 365 }, { "epoch": 0.46894803548795944, "grad_norm": 0.32700829134754156, "learning_rate": 0.0001279415987388395, "loss": 0.2174, "step": 370 }, { "epoch": 0.4752851711026616, "grad_norm": 0.3713274528328548, "learning_rate": 0.0001258106640167826, "loss": 0.283, "step": 375 }, { "epoch": 0.4816223067173637, "grad_norm": 0.27730345855582755, "learning_rate": 0.0001236670963522172, "loss": 0.2021, "step": 380 }, { "epoch": 0.4879594423320659, "grad_norm": 0.34446917830376517, "learning_rate": 0.00012151194490711178, "loss": 0.2561, "step": 385 }, { "epoch": 0.49429657794676807, "grad_norm": 0.2522933072667094, "learning_rate": 0.00011934626451307726, "loss": 0.2433, "step": 390 }, { "epoch": 0.5006337135614702, "grad_norm": 0.3469006061112617, "learning_rate": 0.00011717111515508319, "loss": 0.2266, "step": 395 }, { "epoch": 0.5069708491761724, "grad_norm": 0.3778944814729631, "learning_rate": 0.00011498756145265144, "loss": 0.2906, "step": 400 }, { "epoch": 0.5069708491761724, "eval_loss": 0.2465018332004547, "eval_runtime": 1086.0248, "eval_samples_per_second": 3.683, "eval_steps_per_second": 0.115, "step": 400 }, { "epoch": 0.5133079847908745, "grad_norm": 0.2906640770941992, "learning_rate": 0.00011279667213878205, "loss": 0.2329, "step": 405 }, { "epoch": 0.5196451204055766, "grad_norm": 0.3562675808467966, "learning_rate": 0.00011059951953686535, "loss": 0.2727, "step": 410 }, { "epoch": 0.5259822560202788, "grad_norm": 0.23100289329465973, "learning_rate": 0.00010839717903583684, "loss": 0.2559, "step": 415 }, { "epoch": 0.532319391634981, "grad_norm": 0.31979110117969145, "learning_rate": 0.00010619072856383181, "loss": 0.2413, "step": 420 }, { "epoch": 0.5386565272496832, "grad_norm": 0.3131449805956835, "learning_rate": 0.00010398124806059701, "loss": 0.2807, "step": 425 }, { "epoch": 0.5449936628643853, "grad_norm": 0.30855801225213353, "learning_rate": 0.00010176981894891768, "loss": 0.1961, "step": 430 }, { "epoch": 0.5513307984790875, "grad_norm": 0.3422494582211193, "learning_rate": 9.955752360531896e-05, "loss": 0.2805, "step": 435 }, { "epoch": 0.5576679340937896, "grad_norm": 0.26688868743798966, "learning_rate": 9.734544483030026e-05, "loss": 0.2597, "step": 440 }, { "epoch": 0.5640050697084917, "grad_norm": 0.3161252521303147, "learning_rate": 9.513466531836221e-05, "loss": 0.2166, "step": 445 }, { "epoch": 0.5703422053231939, "grad_norm": 0.3401785018630494, "learning_rate": 9.292626712808556e-05, "loss": 0.2827, "step": 450 }, { "epoch": 0.5766793409378961, "grad_norm": 0.3036061299387351, "learning_rate": 9.072133115252112e-05, "loss": 0.2254, "step": 455 }, { "epoch": 0.5830164765525983, "grad_norm": 0.35415945685790207, "learning_rate": 8.85209365901505e-05, "loss": 0.2692, "step": 460 }, { "epoch": 0.5893536121673004, "grad_norm": 0.25401069755470806, "learning_rate": 8.632616041667577e-05, "loss": 0.2539, "step": 465 }, { "epoch": 0.5956907477820025, "grad_norm": 0.31003086790259704, "learning_rate": 8.41380768578976e-05, "loss": 0.2271, "step": 470 }, { "epoch": 0.6020278833967047, "grad_norm": 0.3237135373224526, "learning_rate": 8.195775686393897e-05, "loss": 0.2792, "step": 475 }, { "epoch": 0.6083650190114068, "grad_norm": 0.2952913468902984, "learning_rate": 7.978626758507217e-05, "loss": 0.2033, "step": 480 }, { "epoch": 0.614702154626109, "grad_norm": 0.32514176968318775, "learning_rate": 7.762467184940574e-05, "loss": 0.2632, "step": 485 }, { "epoch": 0.6210392902408112, "grad_norm": 0.2704128994845209, "learning_rate": 7.547402764268689e-05, "loss": 0.2657, "step": 490 }, { "epoch": 0.6273764258555133, "grad_norm": 0.3419677722251191, "learning_rate": 7.333538759047389e-05, "loss": 0.2432, "step": 495 }, { "epoch": 0.6337135614702155, "grad_norm": 0.3336718483492428, "learning_rate": 7.120979844293201e-05, "loss": 0.2789, "step": 500 }, { "epoch": 0.6400506970849176, "grad_norm": 0.28861366582406006, "learning_rate": 6.909830056250527e-05, "loss": 0.2185, "step": 505 }, { "epoch": 0.6463878326996197, "grad_norm": 0.3292639675337998, "learning_rate": 6.700192741471447e-05, "loss": 0.2622, "step": 510 }, { "epoch": 0.6527249683143219, "grad_norm": 0.2539285743910528, "learning_rate": 6.4921705062331e-05, "loss": 0.2442, "step": 515 }, { "epoch": 0.6590621039290241, "grad_norm": 0.3067269034698268, "learning_rate": 6.285865166317386e-05, "loss": 0.2143, "step": 520 }, { "epoch": 0.6653992395437263, "grad_norm": 0.3245132022152399, "learning_rate": 6.081377697177576e-05, "loss": 0.2719, "step": 525 }, { "epoch": 0.6717363751584284, "grad_norm": 0.27285602978719187, "learning_rate": 5.8788081845162246e-05, "loss": 0.205, "step": 530 }, { "epoch": 0.6780735107731305, "grad_norm": 0.33017438859174614, "learning_rate": 5.678255775298542e-05, "loss": 0.2595, "step": 535 }, { "epoch": 0.6844106463878327, "grad_norm": 0.22042843479196494, "learning_rate": 5.479818629225259e-05, "loss": 0.2508, "step": 540 }, { "epoch": 0.6907477820025348, "grad_norm": 0.3262775579247169, "learning_rate": 5.2835938706886966e-05, "loss": 0.2252, "step": 545 }, { "epoch": 0.697084917617237, "grad_norm": 0.3604718135367501, "learning_rate": 5.0896775412355434e-05, "loss": 0.2703, "step": 550 }, { "epoch": 0.7034220532319392, "grad_norm": 0.2849330234510489, "learning_rate": 4.89816455255966e-05, "loss": 0.1977, "step": 555 }, { "epoch": 0.7097591888466414, "grad_norm": 0.2992590502661103, "learning_rate": 4.7091486400478604e-05, "loss": 0.252, "step": 560 }, { "epoch": 0.7160963244613435, "grad_norm": 0.2424976560859796, "learning_rate": 4.5227223169014456e-05, "loss": 0.2465, "step": 565 }, { "epoch": 0.7224334600760456, "grad_norm": 0.35965307208087977, "learning_rate": 4.338976828855939e-05, "loss": 0.2249, "step": 570 }, { "epoch": 0.7287705956907478, "grad_norm": 0.3741830397170476, "learning_rate": 4.1580021095211486e-05, "loss": 0.284, "step": 575 }, { "epoch": 0.7351077313054499, "grad_norm": 0.2903803822883755, "learning_rate": 3.9798867363634814e-05, "loss": 0.2123, "step": 580 }, { "epoch": 0.7414448669201521, "grad_norm": 0.31710020807356026, "learning_rate": 3.804717887351991e-05, "loss": 0.2605, "step": 585 }, { "epoch": 0.7477820025348543, "grad_norm": 0.24733722916292933, "learning_rate": 3.632581298289427e-05, "loss": 0.2289, "step": 590 }, { "epoch": 0.7541191381495564, "grad_norm": 0.3001878742603873, "learning_rate": 3.4635612208491194e-05, "loss": 0.2322, "step": 595 }, { "epoch": 0.7604562737642585, "grad_norm": 0.3250883573895976, "learning_rate": 3.2977403813382926e-05, "loss": 0.2525, "step": 600 }, { "epoch": 0.7604562737642585, "eval_loss": 0.2325838953256607, "eval_runtime": 1086.1174, "eval_samples_per_second": 3.683, "eval_steps_per_second": 0.115, "step": 600 }, { "epoch": 0.7667934093789607, "grad_norm": 0.3246331902911399, "learning_rate": 3.135199940207947e-05, "loss": 0.2206, "step": 605 }, { "epoch": 0.7731305449936628, "grad_norm": 0.3364965552008111, "learning_rate": 2.976019452329153e-05, "loss": 0.2566, "step": 610 }, { "epoch": 0.779467680608365, "grad_norm": 0.22924356326780843, "learning_rate": 2.8202768280551894e-05, "loss": 0.2448, "step": 615 }, { "epoch": 0.7858048162230672, "grad_norm": 0.3207071242715804, "learning_rate": 2.6680482950885777e-05, "loss": 0.2269, "step": 620 }, { "epoch": 0.7921419518377694, "grad_norm": 0.37606836614121164, "learning_rate": 2.5194083611716935e-05, "loss": 0.27, "step": 625 }, { "epoch": 0.7984790874524715, "grad_norm": 0.2830616844795413, "learning_rate": 2.374429777619205e-05, "loss": 0.1848, "step": 630 }, { "epoch": 0.8048162230671736, "grad_norm": 0.3123281734582231, "learning_rate": 2.2331835037101823e-05, "loss": 0.2562, "step": 635 }, { "epoch": 0.8111533586818758, "grad_norm": 0.27281906149266577, "learning_rate": 2.0957386719573224e-05, "loss": 0.2417, "step": 640 }, { "epoch": 0.8174904942965779, "grad_norm": 0.3463842809980822, "learning_rate": 1.962162554270267e-05, "loss": 0.2108, "step": 645 }, { "epoch": 0.8238276299112801, "grad_norm": 0.35511851586585724, "learning_rate": 1.83252052902961e-05, "loss": 0.2789, "step": 650 }, { "epoch": 0.8301647655259823, "grad_norm": 0.2720460231999165, "learning_rate": 1.7068760490876422e-05, "loss": 0.1886, "step": 655 }, { "epoch": 0.8365019011406845, "grad_norm": 0.323785370919597, "learning_rate": 1.5852906107115893e-05, "loss": 0.2617, "step": 660 }, { "epoch": 0.8428390367553865, "grad_norm": 0.23954436901187928, "learning_rate": 1.4678237234844649e-05, "loss": 0.2501, "step": 665 }, { "epoch": 0.8491761723700887, "grad_norm": 0.3169133202131331, "learning_rate": 1.354532881178301e-05, "loss": 0.1988, "step": 670 }, { "epoch": 0.8555133079847909, "grad_norm": 0.3516256902668012, "learning_rate": 1.2454735336140167e-05, "loss": 0.2765, "step": 675 }, { "epoch": 0.861850443599493, "grad_norm": 0.2923347791144997, "learning_rate": 1.1406990595216971e-05, "loss": 0.2065, "step": 680 }, { "epoch": 0.8681875792141952, "grad_norm": 0.30823497046755327, "learning_rate": 1.0402607404145449e-05, "loss": 0.2623, "step": 685 }, { "epoch": 0.8745247148288974, "grad_norm": 0.25292131222665615, "learning_rate": 9.442077354893198e-06, "loss": 0.2461, "step": 690 }, { "epoch": 0.8808618504435995, "grad_norm": 0.32414756933713984, "learning_rate": 8.525870575655392e-06, "loss": 0.217, "step": 695 }, { "epoch": 0.8871989860583016, "grad_norm": 0.3423100582401461, "learning_rate": 7.654435500752055e-06, "loss": 0.2675, "step": 700 }, { "epoch": 0.8935361216730038, "grad_norm": 0.29544160874192754, "learning_rate": 6.828198651143425e-06, "loss": 0.1927, "step": 705 }, { "epoch": 0.899873257287706, "grad_norm": 0.32477510117980407, "learning_rate": 6.047564425670749e-06, "loss": 0.2613, "step": 710 }, { "epoch": 0.9062103929024081, "grad_norm": 0.23676655623304554, "learning_rate": 5.312914903124566e-06, "loss": 0.2414, "step": 715 }, { "epoch": 0.9125475285171103, "grad_norm": 0.2967989522178623, "learning_rate": 4.624609655237544e-06, "loss": 0.2002, "step": 720 }, { "epoch": 0.9188846641318125, "grad_norm": 0.340334151463555, "learning_rate": 3.982985570693354e-06, "loss": 0.2764, "step": 725 }, { "epoch": 0.9252217997465145, "grad_norm": 0.2776495387493925, "learning_rate": 3.388356690237582e-06, "loss": 0.1859, "step": 730 }, { "epoch": 0.9315589353612167, "grad_norm": 0.37073108669893173, "learning_rate": 2.84101405297158e-06, "loss": 0.2378, "step": 735 }, { "epoch": 0.9378960709759189, "grad_norm": 0.2570912693772944, "learning_rate": 2.341225553904336e-06, "loss": 0.2398, "step": 740 }, { "epoch": 0.944233206590621, "grad_norm": 0.3306131510091784, "learning_rate": 1.8892358128322018e-06, "loss": 0.2092, "step": 745 }, { "epoch": 0.9505703422053232, "grad_norm": 0.33888565708232804, "learning_rate": 1.4852660546105234e-06, "loss": 0.2873, "step": 750 }, { "epoch": 0.9569074778200254, "grad_norm": 0.2906867586955816, "learning_rate": 1.1295140008758864e-06, "loss": 0.2079, "step": 755 }, { "epoch": 0.9632446134347274, "grad_norm": 0.34402820600782946, "learning_rate": 8.221537732719275e-07, "loss": 0.2648, "step": 760 }, { "epoch": 0.9695817490494296, "grad_norm": 0.26017431588284967, "learning_rate": 5.633358082260954e-07, "loss": 0.2481, "step": 765 }, { "epoch": 0.9759188846641318, "grad_norm": 0.3339069885340444, "learning_rate": 3.5318678331904833e-07, "loss": 0.2237, "step": 770 }, { "epoch": 0.982256020278834, "grad_norm": 0.34754105161997045, "learning_rate": 1.9180955528270706e-07, "loss": 0.27, "step": 775 }, { "epoch": 0.9885931558935361, "grad_norm": 0.3246226295094946, "learning_rate": 7.928310965742425e-08, "loss": 0.2226, "step": 780 }, { "epoch": 0.9949302915082383, "grad_norm": 0.31594487456225406, "learning_rate": 1.5662522132742218e-08, "loss": 0.2444, "step": 785 }, { "epoch": 1.0, "step": 789, "total_flos": 1.0371979095834624e+16, "train_loss": 0.2726454405050314, "train_runtime": 20804.2809, "train_samples_per_second": 1.213, "train_steps_per_second": 0.038 } ], "logging_steps": 5, "max_steps": 789, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0371979095834624e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }