{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991537376586743, "eval_steps": 500, "global_step": 5316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005641748942172073, "grad_norm": 0.8860281773045255, "learning_rate": 5e-06, "loss": 0.5781, "step": 10 }, { "epoch": 0.011283497884344146, "grad_norm": 0.9275466531112635, "learning_rate": 5e-06, "loss": 0.5493, "step": 20 }, { "epoch": 0.01692524682651622, "grad_norm": 0.9785057374089036, "learning_rate": 5e-06, "loss": 0.5144, "step": 30 }, { "epoch": 0.022566995768688293, "grad_norm": 0.9445832962940219, "learning_rate": 5e-06, "loss": 0.5132, "step": 40 }, { "epoch": 0.028208744710860368, "grad_norm": 0.8794347209187481, "learning_rate": 5e-06, "loss": 0.4958, "step": 50 }, { "epoch": 0.03385049365303244, "grad_norm": 0.8768425701561404, "learning_rate": 5e-06, "loss": 0.5032, "step": 60 }, { "epoch": 0.039492242595204514, "grad_norm": 0.7246974546492897, "learning_rate": 5e-06, "loss": 0.4798, "step": 70 }, { "epoch": 0.045133991537376586, "grad_norm": 0.554012169044732, "learning_rate": 5e-06, "loss": 0.4888, "step": 80 }, { "epoch": 0.05077574047954866, "grad_norm": 0.5618297520336772, "learning_rate": 5e-06, "loss": 0.4811, "step": 90 }, { "epoch": 0.056417489421720736, "grad_norm": 0.5213657220782494, "learning_rate": 5e-06, "loss": 0.468, "step": 100 }, { "epoch": 0.06205923836389281, "grad_norm": 0.509393805640559, "learning_rate": 5e-06, "loss": 0.4829, "step": 110 }, { "epoch": 0.06770098730606489, "grad_norm": 0.543532182255718, "learning_rate": 5e-06, "loss": 0.4737, "step": 120 }, { "epoch": 0.07334273624823695, "grad_norm": 0.5016209370031858, "learning_rate": 5e-06, "loss": 0.4787, "step": 130 }, { "epoch": 0.07898448519040903, "grad_norm": 0.5209031521531445, "learning_rate": 5e-06, "loss": 0.4848, "step": 140 }, { "epoch": 0.0846262341325811, "grad_norm": 0.47470655842824117, "learning_rate": 5e-06, "loss": 0.4776, "step": 150 }, { "epoch": 0.09026798307475317, "grad_norm": 0.5098693274003744, "learning_rate": 5e-06, "loss": 0.4765, "step": 160 }, { "epoch": 0.09590973201692525, "grad_norm": 0.5148841472543283, "learning_rate": 5e-06, "loss": 0.4744, "step": 170 }, { "epoch": 0.10155148095909731, "grad_norm": 0.5303116836505042, "learning_rate": 5e-06, "loss": 0.4789, "step": 180 }, { "epoch": 0.1071932299012694, "grad_norm": 0.5035762913816794, "learning_rate": 5e-06, "loss": 0.4763, "step": 190 }, { "epoch": 0.11283497884344147, "grad_norm": 0.5096690168519262, "learning_rate": 5e-06, "loss": 0.4687, "step": 200 }, { "epoch": 0.11847672778561354, "grad_norm": 0.50737809697083, "learning_rate": 5e-06, "loss": 0.4615, "step": 210 }, { "epoch": 0.12411847672778561, "grad_norm": 0.4961749979329462, "learning_rate": 5e-06, "loss": 0.4713, "step": 220 }, { "epoch": 0.12976022566995768, "grad_norm": 0.47944422939763603, "learning_rate": 5e-06, "loss": 0.475, "step": 230 }, { "epoch": 0.13540197461212977, "grad_norm": 0.49717123601985425, "learning_rate": 5e-06, "loss": 0.4739, "step": 240 }, { "epoch": 0.14104372355430184, "grad_norm": 0.47611353329941747, "learning_rate": 5e-06, "loss": 0.4649, "step": 250 }, { "epoch": 0.1466854724964739, "grad_norm": 0.4894294603649133, "learning_rate": 5e-06, "loss": 0.4917, "step": 260 }, { "epoch": 0.152327221438646, "grad_norm": 0.48373950578804115, "learning_rate": 5e-06, "loss": 0.4493, "step": 270 }, { "epoch": 0.15796897038081806, "grad_norm": 0.522789579136924, "learning_rate": 5e-06, "loss": 0.4664, "step": 280 }, { "epoch": 0.16361071932299012, "grad_norm": 0.4789421152666509, "learning_rate": 5e-06, "loss": 0.459, "step": 290 }, { "epoch": 0.1692524682651622, "grad_norm": 0.5012557865235248, "learning_rate": 5e-06, "loss": 0.4552, "step": 300 }, { "epoch": 0.17489421720733428, "grad_norm": 0.4624585048654735, "learning_rate": 5e-06, "loss": 0.4596, "step": 310 }, { "epoch": 0.18053596614950634, "grad_norm": 0.47502867235282875, "learning_rate": 5e-06, "loss": 0.4782, "step": 320 }, { "epoch": 0.1861777150916784, "grad_norm": 0.48965795082153923, "learning_rate": 5e-06, "loss": 0.4645, "step": 330 }, { "epoch": 0.1918194640338505, "grad_norm": 0.48242874081163245, "learning_rate": 5e-06, "loss": 0.4647, "step": 340 }, { "epoch": 0.19746121297602257, "grad_norm": 0.5121876755000992, "learning_rate": 5e-06, "loss": 0.4665, "step": 350 }, { "epoch": 0.20310296191819463, "grad_norm": 0.49770668244489025, "learning_rate": 5e-06, "loss": 0.4707, "step": 360 }, { "epoch": 0.20874471086036672, "grad_norm": 0.5159796946721876, "learning_rate": 5e-06, "loss": 0.4626, "step": 370 }, { "epoch": 0.2143864598025388, "grad_norm": 0.48687862998047, "learning_rate": 5e-06, "loss": 0.4605, "step": 380 }, { "epoch": 0.22002820874471085, "grad_norm": 0.5040230352920434, "learning_rate": 5e-06, "loss": 0.4582, "step": 390 }, { "epoch": 0.22566995768688294, "grad_norm": 0.4841608785680818, "learning_rate": 5e-06, "loss": 0.4655, "step": 400 }, { "epoch": 0.231311706629055, "grad_norm": 0.49039176099743137, "learning_rate": 5e-06, "loss": 0.4495, "step": 410 }, { "epoch": 0.23695345557122707, "grad_norm": 0.5040217827748269, "learning_rate": 5e-06, "loss": 0.4529, "step": 420 }, { "epoch": 0.24259520451339917, "grad_norm": 0.47120041333093315, "learning_rate": 5e-06, "loss": 0.4569, "step": 430 }, { "epoch": 0.24823695345557123, "grad_norm": 0.4890234120450319, "learning_rate": 5e-06, "loss": 0.4613, "step": 440 }, { "epoch": 0.2538787023977433, "grad_norm": 0.48217359915393404, "learning_rate": 5e-06, "loss": 0.4472, "step": 450 }, { "epoch": 0.25952045133991536, "grad_norm": 0.467804639174959, "learning_rate": 5e-06, "loss": 0.448, "step": 460 }, { "epoch": 0.2651622002820874, "grad_norm": 0.48164716949150344, "learning_rate": 5e-06, "loss": 0.4502, "step": 470 }, { "epoch": 0.27080394922425954, "grad_norm": 0.5145780661011983, "learning_rate": 5e-06, "loss": 0.4572, "step": 480 }, { "epoch": 0.2764456981664316, "grad_norm": 0.5207011942621447, "learning_rate": 5e-06, "loss": 0.4616, "step": 490 }, { "epoch": 0.2820874471086037, "grad_norm": 0.4935510238242219, "learning_rate": 5e-06, "loss": 0.4659, "step": 500 }, { "epoch": 0.28772919605077574, "grad_norm": 0.5079417454565434, "learning_rate": 5e-06, "loss": 0.4582, "step": 510 }, { "epoch": 0.2933709449929478, "grad_norm": 0.5314525886288128, "learning_rate": 5e-06, "loss": 0.4628, "step": 520 }, { "epoch": 0.29901269393511987, "grad_norm": 0.5053611809759164, "learning_rate": 5e-06, "loss": 0.4478, "step": 530 }, { "epoch": 0.304654442877292, "grad_norm": 0.4794058313719111, "learning_rate": 5e-06, "loss": 0.4451, "step": 540 }, { "epoch": 0.31029619181946405, "grad_norm": 0.48888977415025386, "learning_rate": 5e-06, "loss": 0.4702, "step": 550 }, { "epoch": 0.3159379407616361, "grad_norm": 0.4849302874069741, "learning_rate": 5e-06, "loss": 0.4561, "step": 560 }, { "epoch": 0.3215796897038082, "grad_norm": 0.47972377135392075, "learning_rate": 5e-06, "loss": 0.4457, "step": 570 }, { "epoch": 0.32722143864598024, "grad_norm": 0.4869264334442687, "learning_rate": 5e-06, "loss": 0.4498, "step": 580 }, { "epoch": 0.3328631875881523, "grad_norm": 0.5030426273166695, "learning_rate": 5e-06, "loss": 0.4698, "step": 590 }, { "epoch": 0.3385049365303244, "grad_norm": 0.4792385544239688, "learning_rate": 5e-06, "loss": 0.4524, "step": 600 }, { "epoch": 0.3441466854724965, "grad_norm": 0.4757776685745222, "learning_rate": 5e-06, "loss": 0.4481, "step": 610 }, { "epoch": 0.34978843441466856, "grad_norm": 0.5141080166869366, "learning_rate": 5e-06, "loss": 0.4522, "step": 620 }, { "epoch": 0.3554301833568406, "grad_norm": 0.521030094448152, "learning_rate": 5e-06, "loss": 0.4529, "step": 630 }, { "epoch": 0.3610719322990127, "grad_norm": 0.49616684223591123, "learning_rate": 5e-06, "loss": 0.4585, "step": 640 }, { "epoch": 0.36671368124118475, "grad_norm": 0.5224973873990862, "learning_rate": 5e-06, "loss": 0.4531, "step": 650 }, { "epoch": 0.3723554301833568, "grad_norm": 0.46606976454004667, "learning_rate": 5e-06, "loss": 0.4499, "step": 660 }, { "epoch": 0.37799717912552894, "grad_norm": 0.4631578950745994, "learning_rate": 5e-06, "loss": 0.4591, "step": 670 }, { "epoch": 0.383638928067701, "grad_norm": 0.463696350712983, "learning_rate": 5e-06, "loss": 0.4617, "step": 680 }, { "epoch": 0.38928067700987307, "grad_norm": 0.49700726007271695, "learning_rate": 5e-06, "loss": 0.4419, "step": 690 }, { "epoch": 0.39492242595204513, "grad_norm": 0.5047528462302425, "learning_rate": 5e-06, "loss": 0.4546, "step": 700 }, { "epoch": 0.4005641748942172, "grad_norm": 0.4881338305694489, "learning_rate": 5e-06, "loss": 0.4415, "step": 710 }, { "epoch": 0.40620592383638926, "grad_norm": 0.4950088901214604, "learning_rate": 5e-06, "loss": 0.4467, "step": 720 }, { "epoch": 0.4118476727785614, "grad_norm": 0.48800943523969437, "learning_rate": 5e-06, "loss": 0.4617, "step": 730 }, { "epoch": 0.41748942172073344, "grad_norm": 0.4761347013711521, "learning_rate": 5e-06, "loss": 0.4455, "step": 740 }, { "epoch": 0.4231311706629055, "grad_norm": 0.4811571918715123, "learning_rate": 5e-06, "loss": 0.4752, "step": 750 }, { "epoch": 0.4287729196050776, "grad_norm": 0.4785173629798188, "learning_rate": 5e-06, "loss": 0.4312, "step": 760 }, { "epoch": 0.43441466854724964, "grad_norm": 0.499757446583109, "learning_rate": 5e-06, "loss": 0.4522, "step": 770 }, { "epoch": 0.4400564174894217, "grad_norm": 0.5007042680003394, "learning_rate": 5e-06, "loss": 0.4547, "step": 780 }, { "epoch": 0.44569816643159377, "grad_norm": 0.4832215616215704, "learning_rate": 5e-06, "loss": 0.4328, "step": 790 }, { "epoch": 0.4513399153737659, "grad_norm": 0.4556785539804432, "learning_rate": 5e-06, "loss": 0.4526, "step": 800 }, { "epoch": 0.45698166431593795, "grad_norm": 0.4583262040596829, "learning_rate": 5e-06, "loss": 0.4543, "step": 810 }, { "epoch": 0.46262341325811, "grad_norm": 0.47568673401701195, "learning_rate": 5e-06, "loss": 0.4489, "step": 820 }, { "epoch": 0.4682651622002821, "grad_norm": 0.5099408600605224, "learning_rate": 5e-06, "loss": 0.4635, "step": 830 }, { "epoch": 0.47390691114245415, "grad_norm": 0.48286512485005056, "learning_rate": 5e-06, "loss": 0.4414, "step": 840 }, { "epoch": 0.4795486600846262, "grad_norm": 0.4662493732462359, "learning_rate": 5e-06, "loss": 0.4702, "step": 850 }, { "epoch": 0.48519040902679833, "grad_norm": 0.4688957523663092, "learning_rate": 5e-06, "loss": 0.452, "step": 860 }, { "epoch": 0.4908321579689704, "grad_norm": 0.5020197782038716, "learning_rate": 5e-06, "loss": 0.4401, "step": 870 }, { "epoch": 0.49647390691114246, "grad_norm": 0.5293932494261749, "learning_rate": 5e-06, "loss": 0.4594, "step": 880 }, { "epoch": 0.5021156558533145, "grad_norm": 0.5071632389201434, "learning_rate": 5e-06, "loss": 0.4445, "step": 890 }, { "epoch": 0.5077574047954866, "grad_norm": 0.4878558607219793, "learning_rate": 5e-06, "loss": 0.4419, "step": 900 }, { "epoch": 0.5133991537376587, "grad_norm": 0.463852809384172, "learning_rate": 5e-06, "loss": 0.4476, "step": 910 }, { "epoch": 0.5190409026798307, "grad_norm": 0.4552138421616773, "learning_rate": 5e-06, "loss": 0.4477, "step": 920 }, { "epoch": 0.5246826516220028, "grad_norm": 0.4451067953129034, "learning_rate": 5e-06, "loss": 0.4602, "step": 930 }, { "epoch": 0.5303244005641748, "grad_norm": 0.4892614287238877, "learning_rate": 5e-06, "loss": 0.4463, "step": 940 }, { "epoch": 0.535966149506347, "grad_norm": 0.5203275452886463, "learning_rate": 5e-06, "loss": 0.4481, "step": 950 }, { "epoch": 0.5416078984485191, "grad_norm": 0.4993491544629748, "learning_rate": 5e-06, "loss": 0.4547, "step": 960 }, { "epoch": 0.5472496473906912, "grad_norm": 0.4530550836722553, "learning_rate": 5e-06, "loss": 0.4342, "step": 970 }, { "epoch": 0.5528913963328632, "grad_norm": 0.47205399956664723, "learning_rate": 5e-06, "loss": 0.4391, "step": 980 }, { "epoch": 0.5585331452750353, "grad_norm": 0.5288042301017725, "learning_rate": 5e-06, "loss": 0.4477, "step": 990 }, { "epoch": 0.5641748942172073, "grad_norm": 0.5024574538810612, "learning_rate": 5e-06, "loss": 0.4644, "step": 1000 }, { "epoch": 0.5698166431593794, "grad_norm": 0.46444873871572295, "learning_rate": 5e-06, "loss": 0.442, "step": 1010 }, { "epoch": 0.5754583921015515, "grad_norm": 0.48362451913417076, "learning_rate": 5e-06, "loss": 0.4363, "step": 1020 }, { "epoch": 0.5811001410437235, "grad_norm": 0.48683768680972256, "learning_rate": 5e-06, "loss": 0.4523, "step": 1030 }, { "epoch": 0.5867418899858956, "grad_norm": 0.4753500530255471, "learning_rate": 5e-06, "loss": 0.4452, "step": 1040 }, { "epoch": 0.5923836389280677, "grad_norm": 0.494982125586109, "learning_rate": 5e-06, "loss": 0.4504, "step": 1050 }, { "epoch": 0.5980253878702397, "grad_norm": 0.4658594939623635, "learning_rate": 5e-06, "loss": 0.4414, "step": 1060 }, { "epoch": 0.6036671368124118, "grad_norm": 0.4576854005593855, "learning_rate": 5e-06, "loss": 0.4336, "step": 1070 }, { "epoch": 0.609308885754584, "grad_norm": 0.47667736492718527, "learning_rate": 5e-06, "loss": 0.4446, "step": 1080 }, { "epoch": 0.614950634696756, "grad_norm": 0.49049704641298675, "learning_rate": 5e-06, "loss": 0.452, "step": 1090 }, { "epoch": 0.6205923836389281, "grad_norm": 0.47137093184915657, "learning_rate": 5e-06, "loss": 0.4756, "step": 1100 }, { "epoch": 0.6262341325811002, "grad_norm": 0.48219417585137514, "learning_rate": 5e-06, "loss": 0.4427, "step": 1110 }, { "epoch": 0.6318758815232722, "grad_norm": 0.453987324341643, "learning_rate": 5e-06, "loss": 0.4323, "step": 1120 }, { "epoch": 0.6375176304654443, "grad_norm": 0.5092221096631693, "learning_rate": 5e-06, "loss": 0.4488, "step": 1130 }, { "epoch": 0.6431593794076164, "grad_norm": 0.5005212580369779, "learning_rate": 5e-06, "loss": 0.4611, "step": 1140 }, { "epoch": 0.6488011283497884, "grad_norm": 0.5259263069942747, "learning_rate": 5e-06, "loss": 0.4438, "step": 1150 }, { "epoch": 0.6544428772919605, "grad_norm": 0.4915487101147822, "learning_rate": 5e-06, "loss": 0.4452, "step": 1160 }, { "epoch": 0.6600846262341326, "grad_norm": 0.4636364534332318, "learning_rate": 5e-06, "loss": 0.4397, "step": 1170 }, { "epoch": 0.6657263751763046, "grad_norm": 0.4698556111548417, "learning_rate": 5e-06, "loss": 0.4417, "step": 1180 }, { "epoch": 0.6713681241184767, "grad_norm": 0.5329347792411113, "learning_rate": 5e-06, "loss": 0.4539, "step": 1190 }, { "epoch": 0.6770098730606487, "grad_norm": 0.5126424852624655, "learning_rate": 5e-06, "loss": 0.4429, "step": 1200 }, { "epoch": 0.6826516220028209, "grad_norm": 0.4600428689947934, "learning_rate": 5e-06, "loss": 0.4481, "step": 1210 }, { "epoch": 0.688293370944993, "grad_norm": 0.4918232014874478, "learning_rate": 5e-06, "loss": 0.44, "step": 1220 }, { "epoch": 0.693935119887165, "grad_norm": 0.5091072490058565, "learning_rate": 5e-06, "loss": 0.443, "step": 1230 }, { "epoch": 0.6995768688293371, "grad_norm": 0.5086478162333048, "learning_rate": 5e-06, "loss": 0.4439, "step": 1240 }, { "epoch": 0.7052186177715092, "grad_norm": 0.47954449181032316, "learning_rate": 5e-06, "loss": 0.4252, "step": 1250 }, { "epoch": 0.7108603667136812, "grad_norm": 0.46596459050514427, "learning_rate": 5e-06, "loss": 0.4482, "step": 1260 }, { "epoch": 0.7165021156558533, "grad_norm": 0.46248125242410526, "learning_rate": 5e-06, "loss": 0.4402, "step": 1270 }, { "epoch": 0.7221438645980254, "grad_norm": 0.49235084627255177, "learning_rate": 5e-06, "loss": 0.4368, "step": 1280 }, { "epoch": 0.7277856135401974, "grad_norm": 0.4864015478165713, "learning_rate": 5e-06, "loss": 0.4577, "step": 1290 }, { "epoch": 0.7334273624823695, "grad_norm": 0.5066841927831519, "learning_rate": 5e-06, "loss": 0.4527, "step": 1300 }, { "epoch": 0.7390691114245416, "grad_norm": 0.4767296270599191, "learning_rate": 5e-06, "loss": 0.4421, "step": 1310 }, { "epoch": 0.7447108603667136, "grad_norm": 0.4770443766109164, "learning_rate": 5e-06, "loss": 0.4548, "step": 1320 }, { "epoch": 0.7503526093088858, "grad_norm": 0.4792819993673282, "learning_rate": 5e-06, "loss": 0.4349, "step": 1330 }, { "epoch": 0.7559943582510579, "grad_norm": 0.48987632661924885, "learning_rate": 5e-06, "loss": 0.4378, "step": 1340 }, { "epoch": 0.7616361071932299, "grad_norm": 0.4896409271912306, "learning_rate": 5e-06, "loss": 0.4408, "step": 1350 }, { "epoch": 0.767277856135402, "grad_norm": 0.5370347277468178, "learning_rate": 5e-06, "loss": 0.4456, "step": 1360 }, { "epoch": 0.7729196050775741, "grad_norm": 0.5032968949454037, "learning_rate": 5e-06, "loss": 0.4473, "step": 1370 }, { "epoch": 0.7785613540197461, "grad_norm": 0.48685319139056404, "learning_rate": 5e-06, "loss": 0.4434, "step": 1380 }, { "epoch": 0.7842031029619182, "grad_norm": 0.49748304716726394, "learning_rate": 5e-06, "loss": 0.4296, "step": 1390 }, { "epoch": 0.7898448519040903, "grad_norm": 0.48733408476356, "learning_rate": 5e-06, "loss": 0.4447, "step": 1400 }, { "epoch": 0.7954866008462623, "grad_norm": 0.5053450525255075, "learning_rate": 5e-06, "loss": 0.437, "step": 1410 }, { "epoch": 0.8011283497884344, "grad_norm": 0.5051373461963963, "learning_rate": 5e-06, "loss": 0.4517, "step": 1420 }, { "epoch": 0.8067700987306065, "grad_norm": 0.5031702066693102, "learning_rate": 5e-06, "loss": 0.4458, "step": 1430 }, { "epoch": 0.8124118476727785, "grad_norm": 0.5185876273657516, "learning_rate": 5e-06, "loss": 0.4542, "step": 1440 }, { "epoch": 0.8180535966149506, "grad_norm": 0.49124261927260193, "learning_rate": 5e-06, "loss": 0.4405, "step": 1450 }, { "epoch": 0.8236953455571228, "grad_norm": 0.49751086570325753, "learning_rate": 5e-06, "loss": 0.4357, "step": 1460 }, { "epoch": 0.8293370944992948, "grad_norm": 0.4707406079652606, "learning_rate": 5e-06, "loss": 0.4404, "step": 1470 }, { "epoch": 0.8349788434414669, "grad_norm": 0.4611322469678821, "learning_rate": 5e-06, "loss": 0.4291, "step": 1480 }, { "epoch": 0.840620592383639, "grad_norm": 0.46796161325249325, "learning_rate": 5e-06, "loss": 0.4446, "step": 1490 }, { "epoch": 0.846262341325811, "grad_norm": 0.5039127016141375, "learning_rate": 5e-06, "loss": 0.442, "step": 1500 }, { "epoch": 0.8519040902679831, "grad_norm": 0.4882929849367327, "learning_rate": 5e-06, "loss": 0.4427, "step": 1510 }, { "epoch": 0.8575458392101551, "grad_norm": 0.46485028595629135, "learning_rate": 5e-06, "loss": 0.4493, "step": 1520 }, { "epoch": 0.8631875881523272, "grad_norm": 0.5261718908487378, "learning_rate": 5e-06, "loss": 0.4425, "step": 1530 }, { "epoch": 0.8688293370944993, "grad_norm": 0.5030638236696873, "learning_rate": 5e-06, "loss": 0.453, "step": 1540 }, { "epoch": 0.8744710860366713, "grad_norm": 0.47397319099408175, "learning_rate": 5e-06, "loss": 0.4667, "step": 1550 }, { "epoch": 0.8801128349788434, "grad_norm": 0.45947257613776293, "learning_rate": 5e-06, "loss": 0.4408, "step": 1560 }, { "epoch": 0.8857545839210155, "grad_norm": 0.4886106451240436, "learning_rate": 5e-06, "loss": 0.4323, "step": 1570 }, { "epoch": 0.8913963328631875, "grad_norm": 0.46842609789012146, "learning_rate": 5e-06, "loss": 0.4385, "step": 1580 }, { "epoch": 0.8970380818053597, "grad_norm": 0.49975332721542237, "learning_rate": 5e-06, "loss": 0.4398, "step": 1590 }, { "epoch": 0.9026798307475318, "grad_norm": 0.48527328135804326, "learning_rate": 5e-06, "loss": 0.4484, "step": 1600 }, { "epoch": 0.9083215796897038, "grad_norm": 0.49172287584389185, "learning_rate": 5e-06, "loss": 0.4463, "step": 1610 }, { "epoch": 0.9139633286318759, "grad_norm": 0.508732362088126, "learning_rate": 5e-06, "loss": 0.4395, "step": 1620 }, { "epoch": 0.919605077574048, "grad_norm": 0.47225307145651074, "learning_rate": 5e-06, "loss": 0.4548, "step": 1630 }, { "epoch": 0.92524682651622, "grad_norm": 0.46028374293695373, "learning_rate": 5e-06, "loss": 0.4402, "step": 1640 }, { "epoch": 0.9308885754583921, "grad_norm": 0.4887795142703319, "learning_rate": 5e-06, "loss": 0.4524, "step": 1650 }, { "epoch": 0.9365303244005642, "grad_norm": 0.48414776958913036, "learning_rate": 5e-06, "loss": 0.4388, "step": 1660 }, { "epoch": 0.9421720733427362, "grad_norm": 0.47408507089480434, "learning_rate": 5e-06, "loss": 0.4373, "step": 1670 }, { "epoch": 0.9478138222849083, "grad_norm": 0.4755919436355295, "learning_rate": 5e-06, "loss": 0.4521, "step": 1680 }, { "epoch": 0.9534555712270804, "grad_norm": 0.48600199903202446, "learning_rate": 5e-06, "loss": 0.4387, "step": 1690 }, { "epoch": 0.9590973201692524, "grad_norm": 0.4826408864245463, "learning_rate": 5e-06, "loss": 0.4474, "step": 1700 }, { "epoch": 0.9647390691114246, "grad_norm": 0.5360459005214712, "learning_rate": 5e-06, "loss": 0.4402, "step": 1710 }, { "epoch": 0.9703808180535967, "grad_norm": 0.5267429044967258, "learning_rate": 5e-06, "loss": 0.448, "step": 1720 }, { "epoch": 0.9760225669957687, "grad_norm": 0.487975885463895, "learning_rate": 5e-06, "loss": 0.4527, "step": 1730 }, { "epoch": 0.9816643159379408, "grad_norm": 0.4656913505732415, "learning_rate": 5e-06, "loss": 0.4458, "step": 1740 }, { "epoch": 0.9873060648801129, "grad_norm": 0.48356320486134374, "learning_rate": 5e-06, "loss": 0.4565, "step": 1750 }, { "epoch": 0.9929478138222849, "grad_norm": 0.5106136347337831, "learning_rate": 5e-06, "loss": 0.4481, "step": 1760 }, { "epoch": 0.998589562764457, "grad_norm": 0.481310325218027, "learning_rate": 5e-06, "loss": 0.4318, "step": 1770 }, { "epoch": 0.9997179125528914, "eval_loss": 0.4379998743534088, "eval_runtime": 445.9549, "eval_samples_per_second": 26.77, "eval_steps_per_second": 0.419, "step": 1772 }, { "epoch": 1.004231311706629, "grad_norm": 0.5560347360599398, "learning_rate": 5e-06, "loss": 0.4228, "step": 1780 }, { "epoch": 1.0098730606488011, "grad_norm": 0.43343321808918617, "learning_rate": 5e-06, "loss": 0.3888, "step": 1790 }, { "epoch": 1.0155148095909732, "grad_norm": 0.49193958815688976, "learning_rate": 5e-06, "loss": 0.4081, "step": 1800 }, { "epoch": 1.0211565585331452, "grad_norm": 0.44261196562739774, "learning_rate": 5e-06, "loss": 0.4083, "step": 1810 }, { "epoch": 1.0267983074753173, "grad_norm": 0.48715391428811605, "learning_rate": 5e-06, "loss": 0.403, "step": 1820 }, { "epoch": 1.0324400564174894, "grad_norm": 0.45617321287848667, "learning_rate": 5e-06, "loss": 0.3984, "step": 1830 }, { "epoch": 1.0380818053596614, "grad_norm": 0.46951380049994146, "learning_rate": 5e-06, "loss": 0.3908, "step": 1840 }, { "epoch": 1.0437235543018335, "grad_norm": 0.4606776177243496, "learning_rate": 5e-06, "loss": 0.3944, "step": 1850 }, { "epoch": 1.0493653032440056, "grad_norm": 0.46717676409843034, "learning_rate": 5e-06, "loss": 0.3888, "step": 1860 }, { "epoch": 1.0550070521861776, "grad_norm": 0.4602516664423018, "learning_rate": 5e-06, "loss": 0.3936, "step": 1870 }, { "epoch": 1.0606488011283497, "grad_norm": 0.42788829282622504, "learning_rate": 5e-06, "loss": 0.3881, "step": 1880 }, { "epoch": 1.0662905500705218, "grad_norm": 0.45508688226916866, "learning_rate": 5e-06, "loss": 0.3997, "step": 1890 }, { "epoch": 1.071932299012694, "grad_norm": 0.45167507963707426, "learning_rate": 5e-06, "loss": 0.3945, "step": 1900 }, { "epoch": 1.077574047954866, "grad_norm": 0.4638857492654454, "learning_rate": 5e-06, "loss": 0.393, "step": 1910 }, { "epoch": 1.0832157968970382, "grad_norm": 0.4565491666336401, "learning_rate": 5e-06, "loss": 0.3905, "step": 1920 }, { "epoch": 1.0888575458392102, "grad_norm": 0.4468567209212458, "learning_rate": 5e-06, "loss": 0.3945, "step": 1930 }, { "epoch": 1.0944992947813823, "grad_norm": 0.4451125923550269, "learning_rate": 5e-06, "loss": 0.3888, "step": 1940 }, { "epoch": 1.1001410437235544, "grad_norm": 0.4700452714699321, "learning_rate": 5e-06, "loss": 0.3889, "step": 1950 }, { "epoch": 1.1057827926657264, "grad_norm": 0.4650898761163617, "learning_rate": 5e-06, "loss": 0.3883, "step": 1960 }, { "epoch": 1.1114245416078985, "grad_norm": 0.4707832390036078, "learning_rate": 5e-06, "loss": 0.4015, "step": 1970 }, { "epoch": 1.1170662905500706, "grad_norm": 0.4743179184075093, "learning_rate": 5e-06, "loss": 0.391, "step": 1980 }, { "epoch": 1.1227080394922426, "grad_norm": 0.4823902906366835, "learning_rate": 5e-06, "loss": 0.3933, "step": 1990 }, { "epoch": 1.1283497884344147, "grad_norm": 0.4929422166855442, "learning_rate": 5e-06, "loss": 0.4033, "step": 2000 }, { "epoch": 1.1339915373765868, "grad_norm": 0.46931415950586963, "learning_rate": 5e-06, "loss": 0.3962, "step": 2010 }, { "epoch": 1.1396332863187588, "grad_norm": 0.4716793144119691, "learning_rate": 5e-06, "loss": 0.3843, "step": 2020 }, { "epoch": 1.1452750352609309, "grad_norm": 0.46214625180030394, "learning_rate": 5e-06, "loss": 0.3896, "step": 2030 }, { "epoch": 1.150916784203103, "grad_norm": 0.4478869441665965, "learning_rate": 5e-06, "loss": 0.3903, "step": 2040 }, { "epoch": 1.156558533145275, "grad_norm": 0.47404105806443103, "learning_rate": 5e-06, "loss": 0.3923, "step": 2050 }, { "epoch": 1.162200282087447, "grad_norm": 0.4815826404229114, "learning_rate": 5e-06, "loss": 0.4137, "step": 2060 }, { "epoch": 1.1678420310296191, "grad_norm": 0.47653645240601855, "learning_rate": 5e-06, "loss": 0.3952, "step": 2070 }, { "epoch": 1.1734837799717912, "grad_norm": 0.49644988829819037, "learning_rate": 5e-06, "loss": 0.3981, "step": 2080 }, { "epoch": 1.1791255289139633, "grad_norm": 0.46657331353149667, "learning_rate": 5e-06, "loss": 0.397, "step": 2090 }, { "epoch": 1.1847672778561353, "grad_norm": 0.4713649930891489, "learning_rate": 5e-06, "loss": 0.3987, "step": 2100 }, { "epoch": 1.1904090267983074, "grad_norm": 0.4988957090347967, "learning_rate": 5e-06, "loss": 0.3913, "step": 2110 }, { "epoch": 1.1960507757404795, "grad_norm": 0.4644112960714002, "learning_rate": 5e-06, "loss": 0.4001, "step": 2120 }, { "epoch": 1.2016925246826515, "grad_norm": 0.4707333720355816, "learning_rate": 5e-06, "loss": 0.3968, "step": 2130 }, { "epoch": 1.2073342736248236, "grad_norm": 0.47608463008729884, "learning_rate": 5e-06, "loss": 0.4022, "step": 2140 }, { "epoch": 1.2129760225669957, "grad_norm": 0.45996551421943505, "learning_rate": 5e-06, "loss": 0.4083, "step": 2150 }, { "epoch": 1.2186177715091677, "grad_norm": 0.45343813144247014, "learning_rate": 5e-06, "loss": 0.379, "step": 2160 }, { "epoch": 1.22425952045134, "grad_norm": 0.44526573844484096, "learning_rate": 5e-06, "loss": 0.397, "step": 2170 }, { "epoch": 1.229901269393512, "grad_norm": 0.48779112035480804, "learning_rate": 5e-06, "loss": 0.3988, "step": 2180 }, { "epoch": 1.2355430183356841, "grad_norm": 0.4487329859304139, "learning_rate": 5e-06, "loss": 0.3802, "step": 2190 }, { "epoch": 1.2411847672778562, "grad_norm": 0.47886286342692885, "learning_rate": 5e-06, "loss": 0.3966, "step": 2200 }, { "epoch": 1.2468265162200283, "grad_norm": 0.45776874778870136, "learning_rate": 5e-06, "loss": 0.3955, "step": 2210 }, { "epoch": 1.2524682651622003, "grad_norm": 0.47257007534396295, "learning_rate": 5e-06, "loss": 0.3888, "step": 2220 }, { "epoch": 1.2581100141043724, "grad_norm": 0.46751284003891047, "learning_rate": 5e-06, "loss": 0.3969, "step": 2230 }, { "epoch": 1.2637517630465445, "grad_norm": 0.4661158831574023, "learning_rate": 5e-06, "loss": 0.4088, "step": 2240 }, { "epoch": 1.2693935119887165, "grad_norm": 0.4394915987852524, "learning_rate": 5e-06, "loss": 0.3935, "step": 2250 }, { "epoch": 1.2750352609308886, "grad_norm": 0.45334151132727485, "learning_rate": 5e-06, "loss": 0.3944, "step": 2260 }, { "epoch": 1.2806770098730607, "grad_norm": 0.5078200971616262, "learning_rate": 5e-06, "loss": 0.3905, "step": 2270 }, { "epoch": 1.2863187588152327, "grad_norm": 0.4713106460600115, "learning_rate": 5e-06, "loss": 0.3955, "step": 2280 }, { "epoch": 1.2919605077574048, "grad_norm": 0.4635282807772546, "learning_rate": 5e-06, "loss": 0.4013, "step": 2290 }, { "epoch": 1.2976022566995769, "grad_norm": 0.48334074568481694, "learning_rate": 5e-06, "loss": 0.4005, "step": 2300 }, { "epoch": 1.303244005641749, "grad_norm": 0.48456675280641903, "learning_rate": 5e-06, "loss": 0.3933, "step": 2310 }, { "epoch": 1.308885754583921, "grad_norm": 0.46200542060106936, "learning_rate": 5e-06, "loss": 0.3835, "step": 2320 }, { "epoch": 1.314527503526093, "grad_norm": 0.4815654441432598, "learning_rate": 5e-06, "loss": 0.4045, "step": 2330 }, { "epoch": 1.320169252468265, "grad_norm": 0.48826822709991896, "learning_rate": 5e-06, "loss": 0.4009, "step": 2340 }, { "epoch": 1.3258110014104372, "grad_norm": 0.4716440781629598, "learning_rate": 5e-06, "loss": 0.3938, "step": 2350 }, { "epoch": 1.3314527503526092, "grad_norm": 0.4602369354038975, "learning_rate": 5e-06, "loss": 0.3928, "step": 2360 }, { "epoch": 1.3370944992947813, "grad_norm": 0.49648382398328583, "learning_rate": 5e-06, "loss": 0.4033, "step": 2370 }, { "epoch": 1.3427362482369536, "grad_norm": 0.46739455409641245, "learning_rate": 5e-06, "loss": 0.3972, "step": 2380 }, { "epoch": 1.3483779971791257, "grad_norm": 0.443323801929617, "learning_rate": 5e-06, "loss": 0.3968, "step": 2390 }, { "epoch": 1.3540197461212977, "grad_norm": 0.4539331304987661, "learning_rate": 5e-06, "loss": 0.3924, "step": 2400 }, { "epoch": 1.3596614950634698, "grad_norm": 0.47781406684365874, "learning_rate": 5e-06, "loss": 0.4117, "step": 2410 }, { "epoch": 1.3653032440056418, "grad_norm": 0.45421508901943547, "learning_rate": 5e-06, "loss": 0.39, "step": 2420 }, { "epoch": 1.370944992947814, "grad_norm": 0.4630026357905712, "learning_rate": 5e-06, "loss": 0.4, "step": 2430 }, { "epoch": 1.376586741889986, "grad_norm": 0.4523702465470941, "learning_rate": 5e-06, "loss": 0.4077, "step": 2440 }, { "epoch": 1.382228490832158, "grad_norm": 0.45782219911496075, "learning_rate": 5e-06, "loss": 0.3959, "step": 2450 }, { "epoch": 1.38787023977433, "grad_norm": 0.5117871130526895, "learning_rate": 5e-06, "loss": 0.4016, "step": 2460 }, { "epoch": 1.3935119887165022, "grad_norm": 0.5315155467795695, "learning_rate": 5e-06, "loss": 0.3973, "step": 2470 }, { "epoch": 1.3991537376586742, "grad_norm": 0.46939820177172814, "learning_rate": 5e-06, "loss": 0.3942, "step": 2480 }, { "epoch": 1.4047954866008463, "grad_norm": 0.4543328677593474, "learning_rate": 5e-06, "loss": 0.3998, "step": 2490 }, { "epoch": 1.4104372355430184, "grad_norm": 0.4496789872069011, "learning_rate": 5e-06, "loss": 0.3963, "step": 2500 }, { "epoch": 1.4160789844851904, "grad_norm": 0.471782994083793, "learning_rate": 5e-06, "loss": 0.3885, "step": 2510 }, { "epoch": 1.4217207334273625, "grad_norm": 0.4693960167487946, "learning_rate": 5e-06, "loss": 0.3917, "step": 2520 }, { "epoch": 1.4273624823695346, "grad_norm": 0.4538271312417869, "learning_rate": 5e-06, "loss": 0.3967, "step": 2530 }, { "epoch": 1.4330042313117066, "grad_norm": 0.44186919531244484, "learning_rate": 5e-06, "loss": 0.395, "step": 2540 }, { "epoch": 1.4386459802538787, "grad_norm": 0.4687203668363411, "learning_rate": 5e-06, "loss": 0.3981, "step": 2550 }, { "epoch": 1.4442877291960508, "grad_norm": 0.4587106074858165, "learning_rate": 5e-06, "loss": 0.3925, "step": 2560 }, { "epoch": 1.4499294781382228, "grad_norm": 0.4572379861998615, "learning_rate": 5e-06, "loss": 0.4025, "step": 2570 }, { "epoch": 1.4555712270803949, "grad_norm": 0.4718970269023812, "learning_rate": 5e-06, "loss": 0.3943, "step": 2580 }, { "epoch": 1.461212976022567, "grad_norm": 0.5041632079079171, "learning_rate": 5e-06, "loss": 0.4082, "step": 2590 }, { "epoch": 1.466854724964739, "grad_norm": 0.4499939474661196, "learning_rate": 5e-06, "loss": 0.4006, "step": 2600 }, { "epoch": 1.472496473906911, "grad_norm": 0.44298819801342293, "learning_rate": 5e-06, "loss": 0.4043, "step": 2610 }, { "epoch": 1.4781382228490831, "grad_norm": 0.4779382761973526, "learning_rate": 5e-06, "loss": 0.4056, "step": 2620 }, { "epoch": 1.4837799717912552, "grad_norm": 0.47692712159702133, "learning_rate": 5e-06, "loss": 0.3907, "step": 2630 }, { "epoch": 1.4894217207334273, "grad_norm": 0.46307783252557594, "learning_rate": 5e-06, "loss": 0.3959, "step": 2640 }, { "epoch": 1.4950634696755993, "grad_norm": 0.4606066509600136, "learning_rate": 5e-06, "loss": 0.3934, "step": 2650 }, { "epoch": 1.5007052186177714, "grad_norm": 0.47324134946913055, "learning_rate": 5e-06, "loss": 0.4035, "step": 2660 }, { "epoch": 1.5063469675599435, "grad_norm": 0.46378369582677753, "learning_rate": 5e-06, "loss": 0.3993, "step": 2670 }, { "epoch": 1.5119887165021155, "grad_norm": 0.446704021535711, "learning_rate": 5e-06, "loss": 0.3918, "step": 2680 }, { "epoch": 1.5176304654442876, "grad_norm": 0.4914697095674317, "learning_rate": 5e-06, "loss": 0.3909, "step": 2690 }, { "epoch": 1.5232722143864597, "grad_norm": 0.45218861250501363, "learning_rate": 5e-06, "loss": 0.3891, "step": 2700 }, { "epoch": 1.5289139633286317, "grad_norm": 0.47296389203246264, "learning_rate": 5e-06, "loss": 0.4063, "step": 2710 }, { "epoch": 1.5345557122708038, "grad_norm": 0.4565578982324502, "learning_rate": 5e-06, "loss": 0.4004, "step": 2720 }, { "epoch": 1.540197461212976, "grad_norm": 0.4682402541291805, "learning_rate": 5e-06, "loss": 0.3926, "step": 2730 }, { "epoch": 1.5458392101551481, "grad_norm": 0.461279370651597, "learning_rate": 5e-06, "loss": 0.3896, "step": 2740 }, { "epoch": 1.5514809590973202, "grad_norm": 0.48043898076393116, "learning_rate": 5e-06, "loss": 0.3999, "step": 2750 }, { "epoch": 1.5571227080394923, "grad_norm": 0.4940569464772548, "learning_rate": 5e-06, "loss": 0.4077, "step": 2760 }, { "epoch": 1.5627644569816643, "grad_norm": 0.507591402814353, "learning_rate": 5e-06, "loss": 0.4009, "step": 2770 }, { "epoch": 1.5684062059238364, "grad_norm": 0.4706877768766689, "learning_rate": 5e-06, "loss": 0.3924, "step": 2780 }, { "epoch": 1.5740479548660085, "grad_norm": 0.47676776366340334, "learning_rate": 5e-06, "loss": 0.3877, "step": 2790 }, { "epoch": 1.5796897038081805, "grad_norm": 0.5182129400923178, "learning_rate": 5e-06, "loss": 0.4036, "step": 2800 }, { "epoch": 1.5853314527503526, "grad_norm": 0.48116837090131204, "learning_rate": 5e-06, "loss": 0.3975, "step": 2810 }, { "epoch": 1.5909732016925247, "grad_norm": 0.4275386122917158, "learning_rate": 5e-06, "loss": 0.3918, "step": 2820 }, { "epoch": 1.5966149506346967, "grad_norm": 0.4558363289192502, "learning_rate": 5e-06, "loss": 0.3882, "step": 2830 }, { "epoch": 1.6022566995768688, "grad_norm": 0.46362140028375165, "learning_rate": 5e-06, "loss": 0.3986, "step": 2840 }, { "epoch": 1.607898448519041, "grad_norm": 0.46282031581127986, "learning_rate": 5e-06, "loss": 0.4039, "step": 2850 }, { "epoch": 1.6135401974612131, "grad_norm": 0.46441758640717906, "learning_rate": 5e-06, "loss": 0.4025, "step": 2860 }, { "epoch": 1.6191819464033852, "grad_norm": 0.46074665854229274, "learning_rate": 5e-06, "loss": 0.3905, "step": 2870 }, { "epoch": 1.6248236953455573, "grad_norm": 0.4777266277891572, "learning_rate": 5e-06, "loss": 0.4042, "step": 2880 }, { "epoch": 1.6304654442877293, "grad_norm": 0.45553733285190573, "learning_rate": 5e-06, "loss": 0.3962, "step": 2890 }, { "epoch": 1.6361071932299014, "grad_norm": 0.47178767330297033, "learning_rate": 5e-06, "loss": 0.3892, "step": 2900 }, { "epoch": 1.6417489421720735, "grad_norm": 0.47359171054395643, "learning_rate": 5e-06, "loss": 0.3881, "step": 2910 }, { "epoch": 1.6473906911142455, "grad_norm": 0.46603715950910696, "learning_rate": 5e-06, "loss": 0.394, "step": 2920 }, { "epoch": 1.6530324400564176, "grad_norm": 0.4558346803274527, "learning_rate": 5e-06, "loss": 0.3972, "step": 2930 }, { "epoch": 1.6586741889985896, "grad_norm": 0.479027856317277, "learning_rate": 5e-06, "loss": 0.394, "step": 2940 }, { "epoch": 1.6643159379407617, "grad_norm": 0.4643135506673433, "learning_rate": 5e-06, "loss": 0.3862, "step": 2950 }, { "epoch": 1.6699576868829338, "grad_norm": 0.4619833964547844, "learning_rate": 5e-06, "loss": 0.4016, "step": 2960 }, { "epoch": 1.6755994358251058, "grad_norm": 0.44851503998801856, "learning_rate": 5e-06, "loss": 0.4095, "step": 2970 }, { "epoch": 1.681241184767278, "grad_norm": 0.4467522582124666, "learning_rate": 5e-06, "loss": 0.3999, "step": 2980 }, { "epoch": 1.68688293370945, "grad_norm": 0.472063710369713, "learning_rate": 5e-06, "loss": 0.3939, "step": 2990 }, { "epoch": 1.692524682651622, "grad_norm": 0.48009995546325723, "learning_rate": 5e-06, "loss": 0.4183, "step": 3000 }, { "epoch": 1.698166431593794, "grad_norm": 0.44768147031308736, "learning_rate": 5e-06, "loss": 0.3983, "step": 3010 }, { "epoch": 1.7038081805359662, "grad_norm": 0.476978526816067, "learning_rate": 5e-06, "loss": 0.3951, "step": 3020 }, { "epoch": 1.7094499294781382, "grad_norm": 0.4696465520344116, "learning_rate": 5e-06, "loss": 0.3961, "step": 3030 }, { "epoch": 1.7150916784203103, "grad_norm": 0.4576648300580514, "learning_rate": 5e-06, "loss": 0.3942, "step": 3040 }, { "epoch": 1.7207334273624824, "grad_norm": 0.4731142162264191, "learning_rate": 5e-06, "loss": 0.3946, "step": 3050 }, { "epoch": 1.7263751763046544, "grad_norm": 0.4756579121422872, "learning_rate": 5e-06, "loss": 0.3925, "step": 3060 }, { "epoch": 1.7320169252468265, "grad_norm": 0.46052906775460756, "learning_rate": 5e-06, "loss": 0.3959, "step": 3070 }, { "epoch": 1.7376586741889986, "grad_norm": 0.43627250154239816, "learning_rate": 5e-06, "loss": 0.4012, "step": 3080 }, { "epoch": 1.7433004231311706, "grad_norm": 0.4820135003105483, "learning_rate": 5e-06, "loss": 0.3881, "step": 3090 }, { "epoch": 1.7489421720733427, "grad_norm": 0.4778018041594859, "learning_rate": 5e-06, "loss": 0.4, "step": 3100 }, { "epoch": 1.7545839210155147, "grad_norm": 0.4884888164729651, "learning_rate": 5e-06, "loss": 0.3891, "step": 3110 }, { "epoch": 1.7602256699576868, "grad_norm": 0.4475592439449893, "learning_rate": 5e-06, "loss": 0.3926, "step": 3120 }, { "epoch": 1.7658674188998589, "grad_norm": 0.47654858360039826, "learning_rate": 5e-06, "loss": 0.419, "step": 3130 }, { "epoch": 1.771509167842031, "grad_norm": 0.4555878766506712, "learning_rate": 5e-06, "loss": 0.3962, "step": 3140 }, { "epoch": 1.777150916784203, "grad_norm": 0.46471151818843526, "learning_rate": 5e-06, "loss": 0.3956, "step": 3150 }, { "epoch": 1.782792665726375, "grad_norm": 0.47449169536040453, "learning_rate": 5e-06, "loss": 0.4006, "step": 3160 }, { "epoch": 1.7884344146685471, "grad_norm": 0.4602783138679876, "learning_rate": 5e-06, "loss": 0.3917, "step": 3170 }, { "epoch": 1.7940761636107192, "grad_norm": 0.472002669632583, "learning_rate": 5e-06, "loss": 0.4085, "step": 3180 }, { "epoch": 1.7997179125528913, "grad_norm": 0.4675920743304564, "learning_rate": 5e-06, "loss": 0.4003, "step": 3190 }, { "epoch": 1.8053596614950633, "grad_norm": 0.4615694339610234, "learning_rate": 5e-06, "loss": 0.3955, "step": 3200 }, { "epoch": 1.8110014104372354, "grad_norm": 0.4628092696805375, "learning_rate": 5e-06, "loss": 0.4016, "step": 3210 }, { "epoch": 1.8166431593794075, "grad_norm": 0.5254614489605429, "learning_rate": 5e-06, "loss": 0.4033, "step": 3220 }, { "epoch": 1.8222849083215797, "grad_norm": 0.44928160369491393, "learning_rate": 5e-06, "loss": 0.3927, "step": 3230 }, { "epoch": 1.8279266572637518, "grad_norm": 0.4956917940016818, "learning_rate": 5e-06, "loss": 0.3963, "step": 3240 }, { "epoch": 1.8335684062059239, "grad_norm": 0.45361998350750077, "learning_rate": 5e-06, "loss": 0.3983, "step": 3250 }, { "epoch": 1.839210155148096, "grad_norm": 0.4515003704595137, "learning_rate": 5e-06, "loss": 0.3963, "step": 3260 }, { "epoch": 1.844851904090268, "grad_norm": 0.4556275342458607, "learning_rate": 5e-06, "loss": 0.398, "step": 3270 }, { "epoch": 1.85049365303244, "grad_norm": 0.46890202055080116, "learning_rate": 5e-06, "loss": 0.3804, "step": 3280 }, { "epoch": 1.8561354019746121, "grad_norm": 0.4567048420478033, "learning_rate": 5e-06, "loss": 0.3866, "step": 3290 }, { "epoch": 1.8617771509167842, "grad_norm": 0.46010370992720745, "learning_rate": 5e-06, "loss": 0.4071, "step": 3300 }, { "epoch": 1.8674188998589563, "grad_norm": 0.46330780688841133, "learning_rate": 5e-06, "loss": 0.4007, "step": 3310 }, { "epoch": 1.8730606488011283, "grad_norm": 0.467515061321271, "learning_rate": 5e-06, "loss": 0.3958, "step": 3320 }, { "epoch": 1.8787023977433004, "grad_norm": 0.4478417663578568, "learning_rate": 5e-06, "loss": 0.3965, "step": 3330 }, { "epoch": 1.8843441466854725, "grad_norm": 0.46131439541886865, "learning_rate": 5e-06, "loss": 0.3972, "step": 3340 }, { "epoch": 1.8899858956276445, "grad_norm": 0.4649363714764279, "learning_rate": 5e-06, "loss": 0.3943, "step": 3350 }, { "epoch": 1.8956276445698168, "grad_norm": 0.46303194795992636, "learning_rate": 5e-06, "loss": 0.391, "step": 3360 }, { "epoch": 1.9012693935119889, "grad_norm": 0.42878941947013166, "learning_rate": 5e-06, "loss": 0.4039, "step": 3370 }, { "epoch": 1.906911142454161, "grad_norm": 0.4725709423896906, "learning_rate": 5e-06, "loss": 0.3988, "step": 3380 }, { "epoch": 1.912552891396333, "grad_norm": 0.47671368663777564, "learning_rate": 5e-06, "loss": 0.3884, "step": 3390 }, { "epoch": 1.918194640338505, "grad_norm": 0.46668210559071105, "learning_rate": 5e-06, "loss": 0.4066, "step": 3400 }, { "epoch": 1.9238363892806771, "grad_norm": 0.4572223756340763, "learning_rate": 5e-06, "loss": 0.3973, "step": 3410 }, { "epoch": 1.9294781382228492, "grad_norm": 0.45902290859441564, "learning_rate": 5e-06, "loss": 0.3929, "step": 3420 }, { "epoch": 1.9351198871650213, "grad_norm": 0.458044072628747, "learning_rate": 5e-06, "loss": 0.3857, "step": 3430 }, { "epoch": 1.9407616361071933, "grad_norm": 0.4693435523479085, "learning_rate": 5e-06, "loss": 0.3904, "step": 3440 }, { "epoch": 1.9464033850493654, "grad_norm": 0.4525757784211967, "learning_rate": 5e-06, "loss": 0.3901, "step": 3450 }, { "epoch": 1.9520451339915375, "grad_norm": 0.4527753194974229, "learning_rate": 5e-06, "loss": 0.3883, "step": 3460 }, { "epoch": 1.9576868829337095, "grad_norm": 0.45775257683357495, "learning_rate": 5e-06, "loss": 0.3933, "step": 3470 }, { "epoch": 1.9633286318758816, "grad_norm": 0.4513845604140946, "learning_rate": 5e-06, "loss": 0.4084, "step": 3480 }, { "epoch": 1.9689703808180536, "grad_norm": 0.488458649525053, "learning_rate": 5e-06, "loss": 0.3829, "step": 3490 }, { "epoch": 1.9746121297602257, "grad_norm": 0.446090731775206, "learning_rate": 5e-06, "loss": 0.3924, "step": 3500 }, { "epoch": 1.9802538787023978, "grad_norm": 0.4709438262355249, "learning_rate": 5e-06, "loss": 0.3894, "step": 3510 }, { "epoch": 1.9858956276445698, "grad_norm": 0.4870735211701005, "learning_rate": 5e-06, "loss": 0.4027, "step": 3520 }, { "epoch": 1.991537376586742, "grad_norm": 0.4803646418996235, "learning_rate": 5e-06, "loss": 0.3859, "step": 3530 }, { "epoch": 1.997179125528914, "grad_norm": 0.4703422698968033, "learning_rate": 5e-06, "loss": 0.3944, "step": 3540 }, { "epoch": 2.0, "eval_loss": 0.43374887108802795, "eval_runtime": 448.6077, "eval_samples_per_second": 26.611, "eval_steps_per_second": 0.417, "step": 3545 }, { "epoch": 2.002820874471086, "grad_norm": 0.4413473757815756, "learning_rate": 5e-06, "loss": 0.3928, "step": 3550 }, { "epoch": 2.008462623413258, "grad_norm": 0.4522342894282536, "learning_rate": 5e-06, "loss": 0.3547, "step": 3560 }, { "epoch": 2.01410437235543, "grad_norm": 0.4290168433856863, "learning_rate": 5e-06, "loss": 0.356, "step": 3570 }, { "epoch": 2.0197461212976022, "grad_norm": 0.48082235439755094, "learning_rate": 5e-06, "loss": 0.3528, "step": 3580 }, { "epoch": 2.0253878702397743, "grad_norm": 0.45722766485772276, "learning_rate": 5e-06, "loss": 0.3442, "step": 3590 }, { "epoch": 2.0310296191819464, "grad_norm": 0.45747195342514013, "learning_rate": 5e-06, "loss": 0.3432, "step": 3600 }, { "epoch": 2.0366713681241184, "grad_norm": 0.4358767354369319, "learning_rate": 5e-06, "loss": 0.3466, "step": 3610 }, { "epoch": 2.0423131170662905, "grad_norm": 0.45049478931307607, "learning_rate": 5e-06, "loss": 0.3504, "step": 3620 }, { "epoch": 2.0479548660084625, "grad_norm": 0.4563642590969261, "learning_rate": 5e-06, "loss": 0.3406, "step": 3630 }, { "epoch": 2.0535966149506346, "grad_norm": 0.4312356627894684, "learning_rate": 5e-06, "loss": 0.3431, "step": 3640 }, { "epoch": 2.0592383638928067, "grad_norm": 0.47281064022880187, "learning_rate": 5e-06, "loss": 0.3492, "step": 3650 }, { "epoch": 2.0648801128349787, "grad_norm": 0.4435651181299306, "learning_rate": 5e-06, "loss": 0.3411, "step": 3660 }, { "epoch": 2.070521861777151, "grad_norm": 0.4694545955857111, "learning_rate": 5e-06, "loss": 0.3476, "step": 3670 }, { "epoch": 2.076163610719323, "grad_norm": 0.4094871096575717, "learning_rate": 5e-06, "loss": 0.3446, "step": 3680 }, { "epoch": 2.081805359661495, "grad_norm": 0.4407127764565404, "learning_rate": 5e-06, "loss": 0.3511, "step": 3690 }, { "epoch": 2.087447108603667, "grad_norm": 0.46130441782721465, "learning_rate": 5e-06, "loss": 0.3701, "step": 3700 }, { "epoch": 2.093088857545839, "grad_norm": 0.48247411322370654, "learning_rate": 5e-06, "loss": 0.3563, "step": 3710 }, { "epoch": 2.098730606488011, "grad_norm": 0.4219187636429636, "learning_rate": 5e-06, "loss": 0.3471, "step": 3720 }, { "epoch": 2.104372355430183, "grad_norm": 0.4611444707906527, "learning_rate": 5e-06, "loss": 0.3567, "step": 3730 }, { "epoch": 2.1100141043723553, "grad_norm": 0.4553725011658897, "learning_rate": 5e-06, "loss": 0.3529, "step": 3740 }, { "epoch": 2.1156558533145273, "grad_norm": 0.4464002785245552, "learning_rate": 5e-06, "loss": 0.3501, "step": 3750 }, { "epoch": 2.1212976022566994, "grad_norm": 0.4300513716532807, "learning_rate": 5e-06, "loss": 0.3498, "step": 3760 }, { "epoch": 2.1269393511988715, "grad_norm": 0.456511976198015, "learning_rate": 5e-06, "loss": 0.3519, "step": 3770 }, { "epoch": 2.1325811001410435, "grad_norm": 0.4233132317201342, "learning_rate": 5e-06, "loss": 0.3476, "step": 3780 }, { "epoch": 2.138222849083216, "grad_norm": 0.471119966312247, "learning_rate": 5e-06, "loss": 0.3519, "step": 3790 }, { "epoch": 2.143864598025388, "grad_norm": 0.4633272783360576, "learning_rate": 5e-06, "loss": 0.3564, "step": 3800 }, { "epoch": 2.14950634696756, "grad_norm": 0.42965266973567434, "learning_rate": 5e-06, "loss": 0.3518, "step": 3810 }, { "epoch": 2.155148095909732, "grad_norm": 0.4298353082026474, "learning_rate": 5e-06, "loss": 0.3445, "step": 3820 }, { "epoch": 2.1607898448519043, "grad_norm": 0.47739223858312535, "learning_rate": 5e-06, "loss": 0.3445, "step": 3830 }, { "epoch": 2.1664315937940763, "grad_norm": 0.4532966358832045, "learning_rate": 5e-06, "loss": 0.356, "step": 3840 }, { "epoch": 2.1720733427362484, "grad_norm": 0.4410221778245472, "learning_rate": 5e-06, "loss": 0.3662, "step": 3850 }, { "epoch": 2.1777150916784205, "grad_norm": 0.4263307654776881, "learning_rate": 5e-06, "loss": 0.3467, "step": 3860 }, { "epoch": 2.1833568406205925, "grad_norm": 0.45149035680132815, "learning_rate": 5e-06, "loss": 0.3448, "step": 3870 }, { "epoch": 2.1889985895627646, "grad_norm": 0.4529321307975293, "learning_rate": 5e-06, "loss": 0.3477, "step": 3880 }, { "epoch": 2.1946403385049367, "grad_norm": 0.4607065783291932, "learning_rate": 5e-06, "loss": 0.3495, "step": 3890 }, { "epoch": 2.2002820874471087, "grad_norm": 0.4602518910582201, "learning_rate": 5e-06, "loss": 0.3487, "step": 3900 }, { "epoch": 2.205923836389281, "grad_norm": 0.43692806647452487, "learning_rate": 5e-06, "loss": 0.3524, "step": 3910 }, { "epoch": 2.211565585331453, "grad_norm": 0.4552717555185162, "learning_rate": 5e-06, "loss": 0.3471, "step": 3920 }, { "epoch": 2.217207334273625, "grad_norm": 0.4525591489683545, "learning_rate": 5e-06, "loss": 0.3592, "step": 3930 }, { "epoch": 2.222849083215797, "grad_norm": 0.4404336083861297, "learning_rate": 5e-06, "loss": 0.3557, "step": 3940 }, { "epoch": 2.228490832157969, "grad_norm": 0.45114848417343256, "learning_rate": 5e-06, "loss": 0.3423, "step": 3950 }, { "epoch": 2.234132581100141, "grad_norm": 0.4370952273186252, "learning_rate": 5e-06, "loss": 0.3546, "step": 3960 }, { "epoch": 2.239774330042313, "grad_norm": 0.4435799605060227, "learning_rate": 5e-06, "loss": 0.3567, "step": 3970 }, { "epoch": 2.2454160789844853, "grad_norm": 0.4915642595318201, "learning_rate": 5e-06, "loss": 0.3584, "step": 3980 }, { "epoch": 2.2510578279266573, "grad_norm": 0.44114555771160074, "learning_rate": 5e-06, "loss": 0.3486, "step": 3990 }, { "epoch": 2.2566995768688294, "grad_norm": 0.4749861176607326, "learning_rate": 5e-06, "loss": 0.3636, "step": 4000 }, { "epoch": 2.2623413258110014, "grad_norm": 0.46753231180049154, "learning_rate": 5e-06, "loss": 0.3537, "step": 4010 }, { "epoch": 2.2679830747531735, "grad_norm": 0.4419176647270738, "learning_rate": 5e-06, "loss": 0.3501, "step": 4020 }, { "epoch": 2.2736248236953456, "grad_norm": 0.4587766070955877, "learning_rate": 5e-06, "loss": 0.3541, "step": 4030 }, { "epoch": 2.2792665726375176, "grad_norm": 0.43530289551944434, "learning_rate": 5e-06, "loss": 0.351, "step": 4040 }, { "epoch": 2.2849083215796897, "grad_norm": 0.4631459072773793, "learning_rate": 5e-06, "loss": 0.3485, "step": 4050 }, { "epoch": 2.2905500705218618, "grad_norm": 0.47022508312977196, "learning_rate": 5e-06, "loss": 0.3502, "step": 4060 }, { "epoch": 2.296191819464034, "grad_norm": 0.4505813037738865, "learning_rate": 5e-06, "loss": 0.3651, "step": 4070 }, { "epoch": 2.301833568406206, "grad_norm": 0.4486825542866407, "learning_rate": 5e-06, "loss": 0.3544, "step": 4080 }, { "epoch": 2.307475317348378, "grad_norm": 0.4665999075970455, "learning_rate": 5e-06, "loss": 0.3685, "step": 4090 }, { "epoch": 2.31311706629055, "grad_norm": 0.4888288507127307, "learning_rate": 5e-06, "loss": 0.3619, "step": 4100 }, { "epoch": 2.318758815232722, "grad_norm": 0.41270795857689285, "learning_rate": 5e-06, "loss": 0.352, "step": 4110 }, { "epoch": 2.324400564174894, "grad_norm": 0.4419259104601605, "learning_rate": 5e-06, "loss": 0.3383, "step": 4120 }, { "epoch": 2.330042313117066, "grad_norm": 0.46603521309981116, "learning_rate": 5e-06, "loss": 0.3536, "step": 4130 }, { "epoch": 2.3356840620592383, "grad_norm": 0.47937141599717065, "learning_rate": 5e-06, "loss": 0.3563, "step": 4140 }, { "epoch": 2.3413258110014104, "grad_norm": 0.44765567183470945, "learning_rate": 5e-06, "loss": 0.3467, "step": 4150 }, { "epoch": 2.3469675599435824, "grad_norm": 0.4457046351361799, "learning_rate": 5e-06, "loss": 0.3503, "step": 4160 }, { "epoch": 2.3526093088857545, "grad_norm": 0.44307837034261943, "learning_rate": 5e-06, "loss": 0.352, "step": 4170 }, { "epoch": 2.3582510578279265, "grad_norm": 0.4267219846723022, "learning_rate": 5e-06, "loss": 0.3774, "step": 4180 }, { "epoch": 2.3638928067700986, "grad_norm": 0.48122908327097114, "learning_rate": 5e-06, "loss": 0.3598, "step": 4190 }, { "epoch": 2.3695345557122707, "grad_norm": 0.45331508297626466, "learning_rate": 5e-06, "loss": 0.362, "step": 4200 }, { "epoch": 2.3751763046544427, "grad_norm": 0.4594196615052227, "learning_rate": 5e-06, "loss": 0.354, "step": 4210 }, { "epoch": 2.380818053596615, "grad_norm": 0.4495058567180949, "learning_rate": 5e-06, "loss": 0.3602, "step": 4220 }, { "epoch": 2.386459802538787, "grad_norm": 0.4676232528999999, "learning_rate": 5e-06, "loss": 0.3516, "step": 4230 }, { "epoch": 2.392101551480959, "grad_norm": 0.4663506464819916, "learning_rate": 5e-06, "loss": 0.3489, "step": 4240 }, { "epoch": 2.397743300423131, "grad_norm": 0.4377174836018769, "learning_rate": 5e-06, "loss": 0.3566, "step": 4250 }, { "epoch": 2.403385049365303, "grad_norm": 0.4410880459267694, "learning_rate": 5e-06, "loss": 0.355, "step": 4260 }, { "epoch": 2.409026798307475, "grad_norm": 0.4416031717769207, "learning_rate": 5e-06, "loss": 0.3606, "step": 4270 }, { "epoch": 2.414668547249647, "grad_norm": 0.46431459859880975, "learning_rate": 5e-06, "loss": 0.3551, "step": 4280 }, { "epoch": 2.4203102961918193, "grad_norm": 0.4603756553070287, "learning_rate": 5e-06, "loss": 0.3521, "step": 4290 }, { "epoch": 2.4259520451339913, "grad_norm": 0.43937667290163923, "learning_rate": 5e-06, "loss": 0.3736, "step": 4300 }, { "epoch": 2.4315937940761634, "grad_norm": 0.4446699867853186, "learning_rate": 5e-06, "loss": 0.3467, "step": 4310 }, { "epoch": 2.4372355430183354, "grad_norm": 0.43626545583793597, "learning_rate": 5e-06, "loss": 0.348, "step": 4320 }, { "epoch": 2.4428772919605075, "grad_norm": 0.49173390105039966, "learning_rate": 5e-06, "loss": 0.3521, "step": 4330 }, { "epoch": 2.44851904090268, "grad_norm": 0.45995716373861045, "learning_rate": 5e-06, "loss": 0.3456, "step": 4340 }, { "epoch": 2.454160789844852, "grad_norm": 0.44249790330543903, "learning_rate": 5e-06, "loss": 0.3433, "step": 4350 }, { "epoch": 2.459802538787024, "grad_norm": 0.45560126813520535, "learning_rate": 5e-06, "loss": 0.349, "step": 4360 }, { "epoch": 2.465444287729196, "grad_norm": 0.4594734244394021, "learning_rate": 5e-06, "loss": 0.3578, "step": 4370 }, { "epoch": 2.4710860366713683, "grad_norm": 0.4572577458846818, "learning_rate": 5e-06, "loss": 0.3485, "step": 4380 }, { "epoch": 2.4767277856135403, "grad_norm": 0.44219315631814177, "learning_rate": 5e-06, "loss": 0.3471, "step": 4390 }, { "epoch": 2.4823695345557124, "grad_norm": 0.42240807095659477, "learning_rate": 5e-06, "loss": 0.3562, "step": 4400 }, { "epoch": 2.4880112834978845, "grad_norm": 0.47229751050541774, "learning_rate": 5e-06, "loss": 0.3482, "step": 4410 }, { "epoch": 2.4936530324400565, "grad_norm": 0.4458066836074724, "learning_rate": 5e-06, "loss": 0.355, "step": 4420 }, { "epoch": 2.4992947813822286, "grad_norm": 0.4503226109681392, "learning_rate": 5e-06, "loss": 0.3538, "step": 4430 }, { "epoch": 2.5049365303244007, "grad_norm": 0.44110608535592855, "learning_rate": 5e-06, "loss": 0.3747, "step": 4440 }, { "epoch": 2.5105782792665727, "grad_norm": 0.5109212717715426, "learning_rate": 5e-06, "loss": 0.3519, "step": 4450 }, { "epoch": 2.516220028208745, "grad_norm": 0.4249918016919024, "learning_rate": 5e-06, "loss": 0.342, "step": 4460 }, { "epoch": 2.521861777150917, "grad_norm": 0.5086305630978165, "learning_rate": 5e-06, "loss": 0.3607, "step": 4470 }, { "epoch": 2.527503526093089, "grad_norm": 0.4358509107515122, "learning_rate": 5e-06, "loss": 0.3468, "step": 4480 }, { "epoch": 2.533145275035261, "grad_norm": 0.4539559547925107, "learning_rate": 5e-06, "loss": 0.3443, "step": 4490 }, { "epoch": 2.538787023977433, "grad_norm": 0.4653862436948121, "learning_rate": 5e-06, "loss": 0.3524, "step": 4500 }, { "epoch": 2.544428772919605, "grad_norm": 0.41711716794524994, "learning_rate": 5e-06, "loss": 0.3479, "step": 4510 }, { "epoch": 2.550070521861777, "grad_norm": 0.45795092567053497, "learning_rate": 5e-06, "loss": 0.3695, "step": 4520 }, { "epoch": 2.5557122708039492, "grad_norm": 0.4724307476878103, "learning_rate": 5e-06, "loss": 0.353, "step": 4530 }, { "epoch": 2.5613540197461213, "grad_norm": 0.4667580239368319, "learning_rate": 5e-06, "loss": 0.3539, "step": 4540 }, { "epoch": 2.5669957686882934, "grad_norm": 0.4475375918113466, "learning_rate": 5e-06, "loss": 0.3598, "step": 4550 }, { "epoch": 2.5726375176304654, "grad_norm": 0.480757840067183, "learning_rate": 5e-06, "loss": 0.3485, "step": 4560 }, { "epoch": 2.5782792665726375, "grad_norm": 0.4276733499528922, "learning_rate": 5e-06, "loss": 0.3479, "step": 4570 }, { "epoch": 2.5839210155148096, "grad_norm": 0.43537614271812025, "learning_rate": 5e-06, "loss": 0.3543, "step": 4580 }, { "epoch": 2.5895627644569816, "grad_norm": 0.42394662487032214, "learning_rate": 5e-06, "loss": 0.3475, "step": 4590 }, { "epoch": 2.5952045133991537, "grad_norm": 0.45439257995617655, "learning_rate": 5e-06, "loss": 0.3484, "step": 4600 }, { "epoch": 2.6008462623413258, "grad_norm": 0.4463381033101569, "learning_rate": 5e-06, "loss": 0.3478, "step": 4610 }, { "epoch": 2.606488011283498, "grad_norm": 0.4651753425049505, "learning_rate": 5e-06, "loss": 0.3532, "step": 4620 }, { "epoch": 2.61212976022567, "grad_norm": 0.4725584824422778, "learning_rate": 5e-06, "loss": 0.3667, "step": 4630 }, { "epoch": 2.617771509167842, "grad_norm": 0.4496062316974007, "learning_rate": 5e-06, "loss": 0.3566, "step": 4640 }, { "epoch": 2.623413258110014, "grad_norm": 0.4301211716374985, "learning_rate": 5e-06, "loss": 0.3466, "step": 4650 }, { "epoch": 2.629055007052186, "grad_norm": 0.4567935039875112, "learning_rate": 5e-06, "loss": 0.3532, "step": 4660 }, { "epoch": 2.634696755994358, "grad_norm": 0.45514691870247576, "learning_rate": 5e-06, "loss": 0.352, "step": 4670 }, { "epoch": 2.64033850493653, "grad_norm": 0.4435768402675874, "learning_rate": 5e-06, "loss": 0.3479, "step": 4680 }, { "epoch": 2.6459802538787023, "grad_norm": 0.43799237086382287, "learning_rate": 5e-06, "loss": 0.3399, "step": 4690 }, { "epoch": 2.6516220028208743, "grad_norm": 0.45347330937833646, "learning_rate": 5e-06, "loss": 0.3496, "step": 4700 }, { "epoch": 2.6572637517630464, "grad_norm": 0.45449617328134695, "learning_rate": 5e-06, "loss": 0.3698, "step": 4710 }, { "epoch": 2.6629055007052185, "grad_norm": 0.45514167950119666, "learning_rate": 5e-06, "loss": 0.3557, "step": 4720 }, { "epoch": 2.6685472496473905, "grad_norm": 0.45124610082620425, "learning_rate": 5e-06, "loss": 0.3555, "step": 4730 }, { "epoch": 2.6741889985895626, "grad_norm": 0.45506617549803663, "learning_rate": 5e-06, "loss": 0.3533, "step": 4740 }, { "epoch": 2.679830747531735, "grad_norm": 0.4497891236146143, "learning_rate": 5e-06, "loss": 0.3593, "step": 4750 }, { "epoch": 2.685472496473907, "grad_norm": 0.43730277262363093, "learning_rate": 5e-06, "loss": 0.35, "step": 4760 }, { "epoch": 2.6911142454160792, "grad_norm": 0.4453843880728269, "learning_rate": 5e-06, "loss": 0.3508, "step": 4770 }, { "epoch": 2.6967559943582513, "grad_norm": 0.4551381875534027, "learning_rate": 5e-06, "loss": 0.3659, "step": 4780 }, { "epoch": 2.7023977433004234, "grad_norm": 0.44693198155600794, "learning_rate": 5e-06, "loss": 0.3488, "step": 4790 }, { "epoch": 2.7080394922425954, "grad_norm": 0.4725411086517588, "learning_rate": 5e-06, "loss": 0.3622, "step": 4800 }, { "epoch": 2.7136812411847675, "grad_norm": 0.45435090794835215, "learning_rate": 5e-06, "loss": 0.3659, "step": 4810 }, { "epoch": 2.7193229901269396, "grad_norm": 0.40891902748686465, "learning_rate": 5e-06, "loss": 0.3521, "step": 4820 }, { "epoch": 2.7249647390691116, "grad_norm": 0.4588622825344602, "learning_rate": 5e-06, "loss": 0.3609, "step": 4830 }, { "epoch": 2.7306064880112837, "grad_norm": 0.47220524310687695, "learning_rate": 5e-06, "loss": 0.3608, "step": 4840 }, { "epoch": 2.7362482369534558, "grad_norm": 0.4813382330408875, "learning_rate": 5e-06, "loss": 0.3516, "step": 4850 }, { "epoch": 2.741889985895628, "grad_norm": 0.44851106014638686, "learning_rate": 5e-06, "loss": 0.3575, "step": 4860 }, { "epoch": 2.7475317348378, "grad_norm": 0.4648873406447677, "learning_rate": 5e-06, "loss": 0.3508, "step": 4870 }, { "epoch": 2.753173483779972, "grad_norm": 0.44332878174865403, "learning_rate": 5e-06, "loss": 0.3548, "step": 4880 }, { "epoch": 2.758815232722144, "grad_norm": 0.44016959273981, "learning_rate": 5e-06, "loss": 0.3518, "step": 4890 }, { "epoch": 2.764456981664316, "grad_norm": 0.42469643512821914, "learning_rate": 5e-06, "loss": 0.3501, "step": 4900 }, { "epoch": 2.770098730606488, "grad_norm": 0.44842384575861166, "learning_rate": 5e-06, "loss": 0.3452, "step": 4910 }, { "epoch": 2.77574047954866, "grad_norm": 0.44453086608294007, "learning_rate": 5e-06, "loss": 0.3576, "step": 4920 }, { "epoch": 2.7813822284908323, "grad_norm": 0.4613767704732174, "learning_rate": 5e-06, "loss": 0.3669, "step": 4930 }, { "epoch": 2.7870239774330043, "grad_norm": 0.42157749246627113, "learning_rate": 5e-06, "loss": 0.3558, "step": 4940 }, { "epoch": 2.7926657263751764, "grad_norm": 0.44623021177861155, "learning_rate": 5e-06, "loss": 0.353, "step": 4950 }, { "epoch": 2.7983074753173485, "grad_norm": 0.44511445391899146, "learning_rate": 5e-06, "loss": 0.3575, "step": 4960 }, { "epoch": 2.8039492242595205, "grad_norm": 0.4496517977205029, "learning_rate": 5e-06, "loss": 0.376, "step": 4970 }, { "epoch": 2.8095909732016926, "grad_norm": 0.4568581481429044, "learning_rate": 5e-06, "loss": 0.3552, "step": 4980 }, { "epoch": 2.8152327221438647, "grad_norm": 0.45872415735621647, "learning_rate": 5e-06, "loss": 0.3538, "step": 4990 }, { "epoch": 2.8208744710860367, "grad_norm": 0.43280090040022784, "learning_rate": 5e-06, "loss": 0.3596, "step": 5000 }, { "epoch": 2.826516220028209, "grad_norm": 0.4271253356285509, "learning_rate": 5e-06, "loss": 0.3589, "step": 5010 }, { "epoch": 2.832157968970381, "grad_norm": 0.45509701773858097, "learning_rate": 5e-06, "loss": 0.3717, "step": 5020 }, { "epoch": 2.837799717912553, "grad_norm": 0.43287288682215924, "learning_rate": 5e-06, "loss": 0.3573, "step": 5030 }, { "epoch": 2.843441466854725, "grad_norm": 0.4688529933224419, "learning_rate": 5e-06, "loss": 0.3477, "step": 5040 }, { "epoch": 2.849083215796897, "grad_norm": 0.4331993042860941, "learning_rate": 5e-06, "loss": 0.3514, "step": 5050 }, { "epoch": 2.854724964739069, "grad_norm": 0.47629494492943353, "learning_rate": 5e-06, "loss": 0.3457, "step": 5060 }, { "epoch": 2.860366713681241, "grad_norm": 0.4547175713111894, "learning_rate": 5e-06, "loss": 0.3616, "step": 5070 }, { "epoch": 2.8660084626234132, "grad_norm": 0.4697185774932994, "learning_rate": 5e-06, "loss": 0.3527, "step": 5080 }, { "epoch": 2.8716502115655853, "grad_norm": 0.46979390495300094, "learning_rate": 5e-06, "loss": 0.367, "step": 5090 }, { "epoch": 2.8772919605077574, "grad_norm": 0.4779125028298598, "learning_rate": 5e-06, "loss": 0.3511, "step": 5100 }, { "epoch": 2.8829337094499294, "grad_norm": 0.4974784539605145, "learning_rate": 5e-06, "loss": 0.3623, "step": 5110 }, { "epoch": 2.8885754583921015, "grad_norm": 0.4614842753048295, "learning_rate": 5e-06, "loss": 0.3495, "step": 5120 }, { "epoch": 2.8942172073342736, "grad_norm": 0.43741541412768414, "learning_rate": 5e-06, "loss": 0.3566, "step": 5130 }, { "epoch": 2.8998589562764456, "grad_norm": 0.4611139730639956, "learning_rate": 5e-06, "loss": 0.357, "step": 5140 }, { "epoch": 2.9055007052186177, "grad_norm": 0.4584393192245279, "learning_rate": 5e-06, "loss": 0.3559, "step": 5150 }, { "epoch": 2.9111424541607898, "grad_norm": 0.4605897500358934, "learning_rate": 5e-06, "loss": 0.3599, "step": 5160 }, { "epoch": 2.916784203102962, "grad_norm": 0.5047737206876777, "learning_rate": 5e-06, "loss": 0.3554, "step": 5170 }, { "epoch": 2.922425952045134, "grad_norm": 0.43957877748790186, "learning_rate": 5e-06, "loss": 0.3553, "step": 5180 }, { "epoch": 2.928067700987306, "grad_norm": 0.41934448745808994, "learning_rate": 5e-06, "loss": 0.3537, "step": 5190 }, { "epoch": 2.933709449929478, "grad_norm": 0.4497013017770954, "learning_rate": 5e-06, "loss": 0.3591, "step": 5200 }, { "epoch": 2.93935119887165, "grad_norm": 0.46915975111439107, "learning_rate": 5e-06, "loss": 0.3561, "step": 5210 }, { "epoch": 2.944992947813822, "grad_norm": 0.4428104855895761, "learning_rate": 5e-06, "loss": 0.3633, "step": 5220 }, { "epoch": 2.950634696755994, "grad_norm": 0.448532360201155, "learning_rate": 5e-06, "loss": 0.3496, "step": 5230 }, { "epoch": 2.9562764456981663, "grad_norm": 0.47532539519127587, "learning_rate": 5e-06, "loss": 0.3484, "step": 5240 }, { "epoch": 2.9619181946403383, "grad_norm": 0.43655270107253413, "learning_rate": 5e-06, "loss": 0.3735, "step": 5250 }, { "epoch": 2.9675599435825104, "grad_norm": 0.4654091728547412, "learning_rate": 5e-06, "loss": 0.3517, "step": 5260 }, { "epoch": 2.9732016925246825, "grad_norm": 0.48276086071545776, "learning_rate": 5e-06, "loss": 0.358, "step": 5270 }, { "epoch": 2.9788434414668545, "grad_norm": 0.4497726059890603, "learning_rate": 5e-06, "loss": 0.3743, "step": 5280 }, { "epoch": 2.9844851904090266, "grad_norm": 0.42161219193763577, "learning_rate": 5e-06, "loss": 0.3519, "step": 5290 }, { "epoch": 2.9901269393511987, "grad_norm": 0.4593665569282975, "learning_rate": 5e-06, "loss": 0.3473, "step": 5300 }, { "epoch": 2.9957686882933707, "grad_norm": 0.4432358435800667, "learning_rate": 5e-06, "loss": 0.3623, "step": 5310 }, { "epoch": 2.9991537376586743, "eval_loss": 0.440873384475708, "eval_runtime": 444.0657, "eval_samples_per_second": 26.883, "eval_steps_per_second": 0.421, "step": 5316 }, { "epoch": 2.9991537376586743, "step": 5316, "total_flos": 2786674505416704.0, "train_loss": 0.4018145801655057, "train_runtime": 71328.9114, "train_samples_per_second": 9.54, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 5316, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2786674505416704.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }