{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9991537376586743,
  "eval_steps": 500,
  "global_step": 5316,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.005641748942172073, "grad_norm": 0.8860281773045255, "learning_rate": 5e-06, "loss": 0.5781, "step": 10 },
    { "epoch": 0.011283497884344146, "grad_norm": 0.9275466531112635, "learning_rate": 5e-06, "loss": 0.5493, "step": 20 },
    { "epoch": 0.01692524682651622, "grad_norm": 0.9785057374089036, "learning_rate": 5e-06, "loss": 0.5144, "step": 30 },
    { "epoch": 0.022566995768688293, "grad_norm": 0.9445832962940219, "learning_rate": 5e-06, "loss": 0.5132, "step": 40 },
    { "epoch": 0.028208744710860368, "grad_norm": 0.8794347209187481, "learning_rate": 5e-06, "loss": 0.4958, "step": 50 },
    { "epoch": 0.03385049365303244, "grad_norm": 0.8768425701561404, "learning_rate": 5e-06, "loss": 0.5032, "step": 60 },
    { "epoch": 0.039492242595204514, "grad_norm": 0.7246974546492897, "learning_rate": 5e-06, "loss": 0.4798, "step": 70 },
    { "epoch": 0.045133991537376586, "grad_norm": 0.554012169044732, "learning_rate": 5e-06, "loss": 0.4888, "step": 80 },
    { "epoch": 0.05077574047954866, "grad_norm": 0.5618297520336772, "learning_rate": 5e-06, "loss": 0.4811, "step": 90 },
    { "epoch": 0.056417489421720736, "grad_norm": 0.5213657220782494, "learning_rate": 5e-06, "loss": 0.468, "step": 100 },
    { "epoch": 0.06205923836389281, "grad_norm": 0.509393805640559, "learning_rate": 5e-06, "loss": 0.4829, "step": 110 },
    { "epoch": 0.06770098730606489, "grad_norm": 0.543532182255718, "learning_rate": 5e-06, "loss": 0.4737, "step": 120 },
    { "epoch": 0.07334273624823695, "grad_norm": 0.5016209370031858, "learning_rate": 5e-06, "loss": 0.4787, "step": 130 },
    { "epoch": 0.07898448519040903, "grad_norm": 0.5209031521531445, "learning_rate": 5e-06, "loss": 0.4848, "step": 140 },
    { "epoch": 0.0846262341325811, "grad_norm": 0.47470655842824117, "learning_rate": 5e-06, "loss": 0.4776, "step": 150 },
    { "epoch": 0.09026798307475317, "grad_norm": 0.5098693274003744, "learning_rate": 5e-06, "loss": 0.4765, "step": 160 },
    { "epoch": 0.09590973201692525, "grad_norm": 0.5148841472543283, "learning_rate": 5e-06, "loss": 0.4744, "step": 170 },
    { "epoch": 0.10155148095909731, "grad_norm": 0.5303116836505042, "learning_rate": 5e-06, "loss": 0.4789, "step": 180 },
    { "epoch": 0.1071932299012694, "grad_norm": 0.5035762913816794, "learning_rate": 5e-06, "loss": 0.4763, "step": 190 },
    { "epoch": 0.11283497884344147, "grad_norm": 0.5096690168519262, "learning_rate": 5e-06, "loss": 0.4687, "step": 200 },
    { "epoch": 0.11847672778561354, "grad_norm": 0.50737809697083, "learning_rate": 5e-06, "loss": 0.4615, "step": 210 },
    { "epoch": 0.12411847672778561, "grad_norm": 0.4961749979329462, "learning_rate": 5e-06, "loss": 0.4713, "step": 220 },
    { "epoch": 0.12976022566995768, "grad_norm": 0.47944422939763603, "learning_rate": 5e-06, "loss": 0.475, "step": 230 },
    { "epoch": 0.13540197461212977, "grad_norm": 0.49717123601985425, "learning_rate": 5e-06, "loss": 0.4739, "step": 240 },
    { "epoch": 0.14104372355430184, "grad_norm": 0.47611353329941747, "learning_rate": 5e-06, "loss": 0.4649, "step": 250 },
    { "epoch": 0.1466854724964739, "grad_norm": 0.4894294603649133, "learning_rate": 5e-06, "loss": 0.4917, "step": 260 },
    { "epoch": 0.152327221438646, "grad_norm": 0.48373950578804115, "learning_rate": 5e-06, "loss": 0.4493, "step": 270 },
    { "epoch": 0.15796897038081806, "grad_norm": 0.522789579136924, "learning_rate": 5e-06, "loss": 0.4664, "step": 280 },
    { "epoch": 0.16361071932299012, "grad_norm": 0.4789421152666509, "learning_rate": 5e-06, "loss": 0.459, "step": 290 },
    { "epoch": 0.1692524682651622, "grad_norm": 0.5012557865235248, "learning_rate": 5e-06, "loss": 0.4552, "step": 300 },
    { "epoch": 0.17489421720733428, "grad_norm": 0.4624585048654735, "learning_rate": 5e-06, "loss": 0.4596, "step": 310 },
    { "epoch": 0.18053596614950634, "grad_norm": 0.47502867235282875, "learning_rate": 5e-06, "loss": 0.4782, "step": 320 },
    { "epoch": 0.1861777150916784, "grad_norm": 0.48965795082153923, "learning_rate": 5e-06, "loss": 0.4645, "step": 330 },
    { "epoch": 0.1918194640338505, "grad_norm": 0.48242874081163245, "learning_rate": 5e-06, "loss": 0.4647, "step": 340 },
    { "epoch": 0.19746121297602257, "grad_norm": 0.5121876755000992, "learning_rate": 5e-06, "loss": 0.4665, "step": 350 },
    { "epoch": 0.20310296191819463, "grad_norm": 0.49770668244489025, "learning_rate": 5e-06, "loss": 0.4707, "step": 360 },
    { "epoch": 0.20874471086036672, "grad_norm": 0.5159796946721876, "learning_rate": 5e-06, "loss": 0.4626, "step": 370 },
    { "epoch": 0.2143864598025388, "grad_norm": 0.48687862998047, "learning_rate": 5e-06, "loss": 0.4605, "step": 380 },
    { "epoch": 0.22002820874471085, "grad_norm": 0.5040230352920434, "learning_rate": 5e-06, "loss": 0.4582, "step": 390 },
    { "epoch": 0.22566995768688294, "grad_norm": 0.4841608785680818, "learning_rate": 5e-06, "loss": 0.4655, "step": 400 },
    { "epoch": 0.231311706629055, "grad_norm": 0.49039176099743137, "learning_rate": 5e-06, "loss": 0.4495, "step": 410 },
    { "epoch": 0.23695345557122707, "grad_norm": 0.5040217827748269, "learning_rate": 5e-06, "loss": 0.4529, "step": 420 },
    { "epoch": 0.24259520451339917, "grad_norm": 0.47120041333093315, "learning_rate": 5e-06, "loss": 0.4569, "step": 430 },
    { "epoch": 0.24823695345557123, "grad_norm": 0.4890234120450319, "learning_rate": 5e-06, "loss": 0.4613, "step": 440 },
    { "epoch": 0.2538787023977433, "grad_norm": 0.48217359915393404, "learning_rate": 5e-06, "loss": 0.4472, "step": 450 },
    { "epoch": 0.25952045133991536, "grad_norm": 0.467804639174959, "learning_rate": 5e-06, "loss": 0.448, "step": 460 },
    { "epoch": 0.2651622002820874, "grad_norm": 0.48164716949150344, "learning_rate": 5e-06, "loss": 0.4502, "step": 470 },
    { "epoch": 0.27080394922425954, "grad_norm": 0.5145780661011983, "learning_rate": 5e-06, "loss": 0.4572, "step": 480 },
    { "epoch": 0.2764456981664316, "grad_norm": 0.5207011942621447, "learning_rate": 5e-06, "loss": 0.4616, "step": 490 },
    { "epoch": 0.2820874471086037, "grad_norm": 0.4935510238242219, "learning_rate": 5e-06, "loss": 0.4659, "step": 500 },
    { "epoch": 0.28772919605077574, "grad_norm": 0.5079417454565434, "learning_rate": 5e-06, "loss": 0.4582, "step": 510 },
    { "epoch": 0.2933709449929478, "grad_norm": 0.5314525886288128, "learning_rate": 5e-06, "loss": 0.4628, "step": 520 },
    { "epoch": 0.29901269393511987, "grad_norm": 0.5053611809759164, "learning_rate": 5e-06, "loss": 0.4478, "step": 530 },
    { "epoch": 0.304654442877292, "grad_norm": 0.4794058313719111, "learning_rate": 5e-06, "loss": 0.4451, "step": 540 },
    { "epoch": 0.31029619181946405, "grad_norm": 0.48888977415025386, "learning_rate": 5e-06, "loss": 0.4702, "step": 550 },
    { "epoch": 0.3159379407616361, "grad_norm": 0.4849302874069741, "learning_rate": 5e-06, "loss": 0.4561, "step": 560 },
    { "epoch": 0.3215796897038082, "grad_norm": 0.47972377135392075, "learning_rate": 5e-06, "loss": 0.4457, "step": 570 },
    { "epoch": 0.32722143864598024, "grad_norm": 0.4869264334442687, "learning_rate": 5e-06, "loss": 0.4498, "step": 580 },
    { "epoch": 0.3328631875881523, "grad_norm": 0.5030426273166695, "learning_rate": 5e-06, "loss": 0.4698, "step": 590 },
    { "epoch": 0.3385049365303244, "grad_norm": 0.4792385544239688, "learning_rate": 5e-06, "loss": 0.4524, "step": 600 },
    { "epoch": 0.3441466854724965, "grad_norm": 0.4757776685745222, "learning_rate": 5e-06, "loss": 0.4481, "step": 610 },
    { "epoch": 0.34978843441466856, "grad_norm": 0.5141080166869366, "learning_rate": 5e-06, "loss": 0.4522, "step": 620 },
    { "epoch": 0.3554301833568406, "grad_norm": 0.521030094448152, "learning_rate": 5e-06, "loss": 0.4529, "step": 630 },
    { "epoch": 0.3610719322990127, "grad_norm": 0.49616684223591123, "learning_rate": 5e-06, "loss": 0.4585, "step": 640 },
    { "epoch": 0.36671368124118475, "grad_norm": 0.5224973873990862, "learning_rate": 5e-06, "loss": 0.4531, "step": 650 },
    { "epoch": 0.3723554301833568, "grad_norm": 0.46606976454004667, "learning_rate": 5e-06, "loss": 0.4499, "step": 660 },
    { "epoch": 0.37799717912552894, "grad_norm": 0.4631578950745994, "learning_rate": 5e-06, "loss": 0.4591, "step": 670 },
    { "epoch": 0.383638928067701, "grad_norm": 0.463696350712983, "learning_rate": 5e-06, "loss": 0.4617, "step": 680 },
    { "epoch": 0.38928067700987307, "grad_norm": 0.49700726007271695, "learning_rate": 5e-06, "loss": 0.4419, "step": 690 },
    { "epoch": 0.39492242595204513, "grad_norm": 0.5047528462302425, "learning_rate": 5e-06, "loss": 0.4546, "step": 700 },
    { "epoch": 0.4005641748942172, "grad_norm": 0.4881338305694489, "learning_rate": 5e-06, "loss": 0.4415, "step": 710 },
    { "epoch": 0.40620592383638926, "grad_norm": 0.4950088901214604, "learning_rate": 5e-06, "loss": 0.4467, "step": 720 },
    { "epoch": 0.4118476727785614, "grad_norm": 0.48800943523969437, "learning_rate": 5e-06, "loss": 0.4617, "step": 730 },
    { "epoch": 0.41748942172073344, "grad_norm": 0.4761347013711521, "learning_rate": 5e-06, "loss": 0.4455, "step": 740 },
    { "epoch": 0.4231311706629055, "grad_norm": 0.4811571918715123, "learning_rate": 5e-06, "loss": 0.4752, "step": 750 },
    { "epoch": 0.4287729196050776, "grad_norm": 0.4785173629798188, "learning_rate": 5e-06, "loss": 0.4312, "step": 760 },
    { "epoch": 0.43441466854724964, "grad_norm": 0.499757446583109, "learning_rate": 5e-06, "loss": 0.4522, "step": 770 },
    { "epoch": 0.4400564174894217, "grad_norm": 0.5007042680003394, "learning_rate": 5e-06, "loss": 0.4547, "step": 780 },
    { "epoch": 0.44569816643159377, "grad_norm": 0.4832215616215704, "learning_rate": 5e-06, "loss": 0.4328, "step": 790 },
    { "epoch": 0.4513399153737659, "grad_norm": 0.4556785539804432, "learning_rate": 5e-06, "loss": 0.4526, "step": 800 },
    { "epoch": 0.45698166431593795, "grad_norm": 0.4583262040596829, "learning_rate": 5e-06, "loss": 0.4543, "step": 810 },
    { "epoch": 0.46262341325811, "grad_norm": 0.47568673401701195, "learning_rate": 5e-06, "loss": 0.4489, "step": 820 },
    { "epoch": 0.4682651622002821, "grad_norm": 0.5099408600605224, "learning_rate": 5e-06, "loss": 0.4635, "step": 830 },
    { "epoch": 0.47390691114245415, "grad_norm": 0.48286512485005056, "learning_rate": 5e-06, "loss": 0.4414, "step": 840 },
    { "epoch": 0.4795486600846262, "grad_norm": 0.4662493732462359, "learning_rate": 5e-06, "loss": 0.4702, "step": 850 },
    { "epoch": 0.48519040902679833, "grad_norm": 0.4688957523663092, "learning_rate": 5e-06, "loss": 0.452, "step": 860 },
    { "epoch": 0.4908321579689704, "grad_norm": 0.5020197782038716, "learning_rate": 5e-06, "loss": 0.4401, "step": 870 },
    { "epoch": 0.49647390691114246, "grad_norm": 0.5293932494261749, "learning_rate": 5e-06, "loss": 0.4594, "step": 880 },
    { "epoch": 0.5021156558533145, "grad_norm": 0.5071632389201434, "learning_rate": 5e-06, "loss": 0.4445, "step": 890 },
    { "epoch": 0.5077574047954866, "grad_norm": 0.4878558607219793, "learning_rate": 5e-06, "loss": 0.4419, "step": 900 },
    { "epoch": 0.5133991537376587, "grad_norm": 0.463852809384172, "learning_rate": 5e-06, "loss": 0.4476, "step": 910 },
    { "epoch": 0.5190409026798307, "grad_norm": 0.4552138421616773, "learning_rate": 5e-06, "loss": 0.4477, "step": 920 },
    { "epoch": 0.5246826516220028, "grad_norm": 0.4451067953129034, "learning_rate": 5e-06, "loss": 0.4602, "step": 930 },
    { "epoch": 0.5303244005641748, "grad_norm": 0.4892614287238877, "learning_rate": 5e-06, "loss": 0.4463, "step": 940 },
    { "epoch": 0.535966149506347, "grad_norm": 0.5203275452886463, "learning_rate": 5e-06, "loss": 0.4481, "step": 950 },
    { "epoch": 0.5416078984485191, "grad_norm": 0.4993491544629748, "learning_rate": 5e-06, "loss": 0.4547, "step": 960 },
    { "epoch": 0.5472496473906912, "grad_norm": 0.4530550836722553, "learning_rate": 5e-06, "loss": 0.4342, "step": 970 },
    { "epoch": 0.5528913963328632, "grad_norm": 0.47205399956664723, "learning_rate": 5e-06, "loss": 0.4391, "step": 980 },
    { "epoch": 0.5585331452750353, "grad_norm": 0.5288042301017725, "learning_rate": 5e-06, "loss": 0.4477, "step": 990 },
    { "epoch": 0.5641748942172073, "grad_norm": 0.5024574538810612, "learning_rate": 5e-06, "loss": 0.4644, "step": 1000 },
    { "epoch": 0.5698166431593794, "grad_norm": 0.46444873871572295, "learning_rate": 5e-06, "loss": 0.442, "step": 1010 },
    { "epoch": 0.5754583921015515, "grad_norm": 0.48362451913417076, "learning_rate": 5e-06, "loss": 0.4363, "step": 1020 },
    { "epoch": 0.5811001410437235, "grad_norm": 0.48683768680972256, "learning_rate": 5e-06, "loss": 0.4523, "step": 1030 },
    { "epoch": 0.5867418899858956, "grad_norm": 0.4753500530255471, "learning_rate": 5e-06, "loss": 0.4452, "step": 1040 },
    { "epoch": 0.5923836389280677, "grad_norm": 0.494982125586109, "learning_rate": 5e-06, "loss": 0.4504, "step": 1050 },
    { "epoch": 0.5980253878702397, "grad_norm": 0.4658594939623635, "learning_rate": 5e-06, "loss": 0.4414, "step": 1060 },
    { "epoch": 0.6036671368124118, "grad_norm": 0.4576854005593855, "learning_rate": 5e-06, "loss": 0.4336, "step": 1070 },
    { "epoch": 0.609308885754584, "grad_norm": 0.47667736492718527, "learning_rate": 5e-06, "loss": 0.4446, "step": 1080 },
    { "epoch": 0.614950634696756, "grad_norm": 0.49049704641298675, "learning_rate": 5e-06, "loss": 0.452, "step": 1090 },
    { "epoch": 0.6205923836389281, "grad_norm": 0.47137093184915657, "learning_rate": 5e-06, "loss": 0.4756, "step": 1100 },
    { "epoch": 0.6262341325811002, "grad_norm": 0.48219417585137514, "learning_rate": 5e-06, "loss": 0.4427, "step": 1110 },
    { "epoch": 0.6318758815232722, "grad_norm": 0.453987324341643, "learning_rate": 5e-06, "loss": 0.4323, "step": 1120 },
    { "epoch": 0.6375176304654443, "grad_norm": 0.5092221096631693, "learning_rate": 5e-06, "loss": 0.4488, "step": 1130 },
    { "epoch": 0.6431593794076164, "grad_norm": 0.5005212580369779, "learning_rate": 5e-06, "loss": 0.4611, "step": 1140 },
    { "epoch": 0.6488011283497884, "grad_norm": 0.5259263069942747, "learning_rate": 5e-06, "loss": 0.4438, "step": 1150 },
    { "epoch": 0.6544428772919605, "grad_norm": 0.4915487101147822, "learning_rate": 5e-06, "loss": 0.4452, "step": 1160 },
    { "epoch": 0.6600846262341326, "grad_norm": 0.4636364534332318, "learning_rate": 5e-06, "loss": 0.4397, "step": 1170 },
    { "epoch": 0.6657263751763046, "grad_norm": 0.4698556111548417, "learning_rate": 5e-06, "loss": 0.4417, "step": 1180 },
    { "epoch": 0.6713681241184767, "grad_norm": 0.5329347792411113, "learning_rate": 5e-06, "loss": 0.4539, "step": 1190 },
    { "epoch": 0.6770098730606487, "grad_norm": 0.5126424852624655, "learning_rate": 5e-06, "loss": 0.4429, "step": 1200 },
    { "epoch": 0.6826516220028209, "grad_norm": 0.4600428689947934, "learning_rate": 5e-06, "loss": 0.4481, "step": 1210 },
    { "epoch": 0.688293370944993, "grad_norm": 0.4918232014874478, "learning_rate": 5e-06, "loss": 0.44, "step": 1220 },
    { "epoch": 0.693935119887165, "grad_norm": 0.5091072490058565, "learning_rate": 5e-06, "loss": 0.443, "step": 1230 },
    { "epoch": 0.6995768688293371, "grad_norm": 0.5086478162333048, "learning_rate": 5e-06, "loss": 0.4439, "step": 1240 },
    { "epoch": 0.7052186177715092, "grad_norm": 0.47954449181032316, "learning_rate": 5e-06, "loss": 0.4252, "step": 1250 },
    { "epoch": 0.7108603667136812, "grad_norm": 0.46596459050514427, "learning_rate": 5e-06, "loss": 0.4482, "step": 1260 },
    { "epoch": 0.7165021156558533, "grad_norm": 0.46248125242410526, "learning_rate": 5e-06, "loss": 0.4402, "step": 1270 },
    { "epoch": 0.7221438645980254, "grad_norm": 0.49235084627255177, "learning_rate": 5e-06, "loss": 0.4368, "step": 1280 },
    { "epoch": 0.7277856135401974, "grad_norm": 0.4864015478165713, "learning_rate": 5e-06, "loss": 0.4577, "step": 1290 },
    { "epoch": 0.7334273624823695, "grad_norm": 0.5066841927831519, "learning_rate": 5e-06, "loss": 0.4527, "step": 1300 },
    { "epoch": 0.7390691114245416, "grad_norm": 0.4767296270599191, "learning_rate": 5e-06, "loss": 0.4421, "step": 1310 },
    { "epoch": 0.7447108603667136, "grad_norm": 0.4770443766109164, "learning_rate": 5e-06, "loss": 0.4548, "step": 1320 },
    { "epoch": 0.7503526093088858, "grad_norm": 0.4792819993673282, "learning_rate": 5e-06, "loss": 0.4349, "step": 1330 },
    { "epoch": 0.7559943582510579, "grad_norm": 0.48987632661924885, "learning_rate": 5e-06, "loss": 0.4378, "step": 1340 },
    { "epoch": 0.7616361071932299, "grad_norm": 0.4896409271912306, "learning_rate": 5e-06, "loss": 0.4408, "step": 1350 },
    { "epoch": 0.767277856135402, "grad_norm": 0.5370347277468178, "learning_rate": 5e-06, "loss": 0.4456, "step": 1360 },
    { "epoch": 0.7729196050775741, "grad_norm": 0.5032968949454037, "learning_rate": 5e-06, "loss": 0.4473, "step": 1370 },
    { "epoch": 0.7785613540197461, "grad_norm": 0.48685319139056404, "learning_rate": 5e-06, "loss": 0.4434, "step": 1380 },
    { "epoch": 0.7842031029619182, "grad_norm": 0.49748304716726394, "learning_rate": 5e-06, "loss": 0.4296, "step": 1390 },
    { "epoch": 0.7898448519040903, "grad_norm": 0.48733408476356, "learning_rate": 5e-06, "loss": 0.4447, "step": 1400 },
    { "epoch": 0.7954866008462623, "grad_norm": 0.5053450525255075, "learning_rate": 5e-06, "loss": 0.437, "step": 1410 },
    { "epoch": 0.8011283497884344, "grad_norm": 0.5051373461963963, "learning_rate": 5e-06, "loss": 0.4517, "step": 1420 },
    { "epoch": 0.8067700987306065, "grad_norm": 0.5031702066693102, "learning_rate": 5e-06, "loss": 0.4458, "step": 1430 },
    { "epoch": 0.8124118476727785, "grad_norm": 0.5185876273657516, "learning_rate": 5e-06, "loss": 0.4542, "step": 1440 },
    { "epoch": 0.8180535966149506, "grad_norm": 0.49124261927260193, "learning_rate": 5e-06, "loss": 0.4405, "step": 1450 },
    { "epoch": 0.8236953455571228, "grad_norm": 0.49751086570325753, "learning_rate": 5e-06, "loss": 0.4357, "step": 1460 },
    { "epoch": 0.8293370944992948, "grad_norm": 0.4707406079652606, "learning_rate": 5e-06, "loss": 0.4404, "step": 1470 },
    { "epoch": 0.8349788434414669, "grad_norm": 0.4611322469678821, "learning_rate": 5e-06, "loss": 0.4291, "step": 1480 },
    { "epoch": 0.840620592383639, "grad_norm": 0.46796161325249325, "learning_rate": 5e-06, "loss": 0.4446, "step": 1490 },
    { "epoch": 0.846262341325811, "grad_norm": 0.5039127016141375, "learning_rate": 5e-06, "loss": 0.442, "step": 1500 },
    { "epoch": 0.8519040902679831, "grad_norm": 0.4882929849367327, "learning_rate": 5e-06, "loss": 0.4427, "step": 1510 },
    { "epoch": 0.8575458392101551, "grad_norm": 0.46485028595629135, "learning_rate": 5e-06, "loss": 0.4493, "step": 1520 },
    { "epoch": 0.8631875881523272, "grad_norm": 0.5261718908487378, "learning_rate": 5e-06, "loss": 0.4425, "step": 1530 },
    { "epoch": 0.8688293370944993, "grad_norm": 0.5030638236696873, "learning_rate": 5e-06, "loss": 0.453, "step": 1540 },
    { "epoch": 0.8744710860366713, "grad_norm": 0.47397319099408175, "learning_rate": 5e-06, "loss": 0.4667, "step": 1550 },
    { "epoch": 0.8801128349788434, "grad_norm": 0.45947257613776293, "learning_rate": 5e-06, "loss": 0.4408, "step": 1560 },
    { "epoch": 0.8857545839210155, "grad_norm": 0.4886106451240436, "learning_rate": 5e-06, "loss": 0.4323, "step": 1570 },
    { "epoch": 0.8913963328631875, "grad_norm": 0.46842609789012146, "learning_rate": 5e-06, "loss": 0.4385, "step": 1580 },
    { "epoch": 0.8970380818053597, "grad_norm": 0.49975332721542237, "learning_rate": 5e-06, "loss": 0.4398, "step": 1590 },
    { "epoch": 0.9026798307475318, "grad_norm": 0.48527328135804326, "learning_rate": 5e-06, "loss": 0.4484, "step": 1600 },
    { "epoch": 0.9083215796897038, "grad_norm": 0.49172287584389185, "learning_rate": 5e-06, "loss": 0.4463, "step": 1610 },
    { "epoch": 0.9139633286318759, "grad_norm": 0.508732362088126, "learning_rate": 5e-06, "loss": 0.4395, "step": 1620 },
    { "epoch": 0.919605077574048, "grad_norm": 0.47225307145651074, "learning_rate": 5e-06, "loss": 0.4548, "step": 1630 },
    { "epoch": 0.92524682651622, "grad_norm": 0.46028374293695373, "learning_rate": 5e-06, "loss": 0.4402, "step": 1640 },
    { "epoch": 0.9308885754583921, "grad_norm": 0.4887795142703319, "learning_rate": 5e-06, "loss": 0.4524, "step": 1650 },
    { "epoch": 0.9365303244005642, "grad_norm": 0.48414776958913036, "learning_rate": 5e-06, "loss": 0.4388, "step": 1660 },
    { "epoch": 0.9421720733427362, "grad_norm": 0.47408507089480434, "learning_rate": 5e-06, "loss": 0.4373, "step": 1670 },
    { "epoch": 0.9478138222849083, "grad_norm": 0.4755919436355295, "learning_rate": 5e-06, "loss": 0.4521, "step": 1680 },
    { "epoch": 0.9534555712270804, "grad_norm": 0.48600199903202446, "learning_rate": 5e-06, "loss": 0.4387, "step": 1690 },
    { "epoch": 0.9590973201692524, "grad_norm": 0.4826408864245463, "learning_rate": 5e-06, "loss": 0.4474, "step": 1700 },
    { "epoch": 0.9647390691114246, "grad_norm": 0.5360459005214712, "learning_rate": 5e-06, "loss": 0.4402, "step": 1710 },
    { "epoch": 0.9703808180535967, "grad_norm": 0.5267429044967258, "learning_rate": 5e-06, "loss": 0.448, "step": 1720 },
    { "epoch": 0.9760225669957687, "grad_norm": 0.487975885463895, "learning_rate": 5e-06, "loss": 0.4527, "step": 1730 },
    { "epoch": 0.9816643159379408, "grad_norm": 0.4656913505732415, "learning_rate": 5e-06, "loss": 0.4458, "step": 1740 },
    { "epoch": 0.9873060648801129, "grad_norm": 0.48356320486134374, "learning_rate": 5e-06, "loss": 0.4565, "step": 1750 },
    { "epoch": 0.9929478138222849, "grad_norm": 0.5106136347337831, "learning_rate": 5e-06, "loss": 0.4481, "step": 1760 },
    { "epoch": 0.998589562764457, "grad_norm": 0.481310325218027, "learning_rate": 5e-06, "loss": 0.4318, "step": 1770 },
    { "epoch": 0.9997179125528914, "eval_loss": 0.4379998743534088, "eval_runtime": 445.9549, "eval_samples_per_second": 26.77, "eval_steps_per_second": 0.419, "step": 1772 },
    { "epoch": 1.004231311706629, "grad_norm": 0.5560347360599398, "learning_rate": 5e-06, "loss": 0.4228, "step": 1780 },
    { "epoch": 1.0098730606488011, "grad_norm": 0.43343321808918617, "learning_rate": 5e-06, "loss": 0.3888, "step": 1790 },
    { "epoch": 1.0155148095909732, "grad_norm": 0.49193958815688976, "learning_rate": 5e-06, "loss": 0.4081, "step": 1800 },
    { "epoch": 1.0211565585331452, "grad_norm": 0.44261196562739774, "learning_rate": 5e-06, "loss": 0.4083, "step": 1810 },
    { "epoch": 1.0267983074753173, "grad_norm": 0.48715391428811605, "learning_rate": 5e-06, "loss": 0.403, "step": 1820 },
    { "epoch": 1.0324400564174894, "grad_norm": 0.45617321287848667, "learning_rate": 5e-06, "loss": 0.3984, "step": 1830 },
    { "epoch": 1.0380818053596614, "grad_norm": 0.46951380049994146, "learning_rate": 5e-06, "loss": 0.3908, "step": 1840 },
    { "epoch": 1.0437235543018335, "grad_norm": 0.4606776177243496, "learning_rate": 5e-06, "loss": 0.3944, "step": 1850 },
    { "epoch": 1.0493653032440056, "grad_norm": 0.46717676409843034, "learning_rate": 5e-06, "loss": 0.3888, "step": 1860 },
    { "epoch": 1.0550070521861776, "grad_norm": 0.4602516664423018, "learning_rate": 5e-06, "loss": 0.3936, "step": 1870 },
    { "epoch": 1.0606488011283497, "grad_norm": 0.42788829282622504, "learning_rate": 5e-06, "loss": 0.3881, "step": 1880 },
    { "epoch": 1.0662905500705218, "grad_norm": 0.45508688226916866, "learning_rate": 5e-06, "loss": 0.3997, "step": 1890 },
    { "epoch": 1.071932299012694, "grad_norm": 0.45167507963707426, "learning_rate": 5e-06, "loss": 0.3945, "step": 1900 },
    { "epoch": 1.077574047954866, "grad_norm": 0.4638857492654454, "learning_rate": 5e-06, "loss": 0.393, "step": 1910 },
    { "epoch": 1.0832157968970382, "grad_norm": 0.4565491666336401, "learning_rate": 5e-06, "loss": 0.3905, "step": 1920 },
    { "epoch": 1.0888575458392102, "grad_norm": 0.4468567209212458, "learning_rate": 5e-06, "loss": 0.3945, "step": 1930 },
    { "epoch": 1.0944992947813823, "grad_norm": 0.4451125923550269, "learning_rate": 5e-06, "loss": 0.3888, "step": 1940 },
    { "epoch": 1.1001410437235544, "grad_norm": 0.4700452714699321, "learning_rate": 5e-06, "loss": 0.3889, "step": 1950 },
    { "epoch": 1.1057827926657264, "grad_norm": 0.4650898761163617, "learning_rate": 5e-06, "loss": 0.3883, "step": 1960 },
    { "epoch": 1.1114245416078985, "grad_norm": 0.4707832390036078, "learning_rate": 5e-06, "loss": 0.4015, "step": 1970 },
    { "epoch": 1.1170662905500706, "grad_norm": 0.4743179184075093, "learning_rate": 5e-06, "loss": 0.391, "step": 1980 },
    { "epoch": 1.1227080394922426, "grad_norm": 0.4823902906366835, "learning_rate": 5e-06, "loss": 0.3933, "step": 1990 },
    { "epoch": 1.1283497884344147, "grad_norm": 0.4929422166855442, "learning_rate": 5e-06, "loss": 0.4033, "step": 2000 },
    { "epoch": 1.1339915373765868, "grad_norm": 0.46931415950586963, "learning_rate": 5e-06, "loss": 0.3962, "step": 2010 },
    { "epoch": 1.1396332863187588, "grad_norm": 0.4716793144119691, "learning_rate": 5e-06, "loss": 0.3843, "step": 2020 },
    { "epoch": 1.1452750352609309, "grad_norm": 0.46214625180030394, "learning_rate": 5e-06, "loss": 0.3896, "step": 2030 },
    { "epoch": 1.150916784203103, "grad_norm": 0.4478869441665965, "learning_rate": 5e-06, "loss": 0.3903, "step": 2040 },
    { "epoch": 1.156558533145275, "grad_norm": 0.47404105806443103, "learning_rate": 5e-06, "loss": 0.3923, "step": 2050 },
    { "epoch": 1.162200282087447, "grad_norm": 0.4815826404229114, "learning_rate": 5e-06, "loss": 0.4137, "step": 2060 },
    { "epoch": 1.1678420310296191, "grad_norm": 0.47653645240601855, "learning_rate": 5e-06, "loss": 0.3952, "step": 2070 },
    { "epoch": 1.1734837799717912, "grad_norm": 0.49644988829819037, "learning_rate": 5e-06, "loss": 0.3981, "step": 2080 },
    { "epoch": 1.1791255289139633, "grad_norm": 0.46657331353149667, "learning_rate": 5e-06, "loss": 0.397, "step": 2090 },
    { "epoch": 1.1847672778561353, "grad_norm": 0.4713649930891489, "learning_rate": 5e-06, "loss": 0.3987, "step": 2100 },
    { "epoch": 1.1904090267983074, "grad_norm": 0.4988957090347967, "learning_rate": 5e-06, "loss": 0.3913, "step": 2110 },
    { "epoch": 1.1960507757404795, "grad_norm": 0.4644112960714002, "learning_rate": 5e-06, "loss": 0.4001, "step": 2120 },
    { "epoch": 1.2016925246826515, "grad_norm": 0.4707333720355816, "learning_rate": 5e-06, "loss": 0.3968, "step": 2130 },
    { "epoch": 1.2073342736248236, "grad_norm": 0.47608463008729884, "learning_rate": 5e-06, "loss": 0.4022, "step": 2140 },
    { "epoch": 1.2129760225669957, "grad_norm": 0.45996551421943505, "learning_rate": 5e-06, "loss": 0.4083, "step": 2150 },
    { "epoch": 1.2186177715091677, "grad_norm": 0.45343813144247014, "learning_rate": 5e-06, "loss": 0.379, "step": 2160 },
    { "epoch": 1.22425952045134, "grad_norm": 0.44526573844484096, "learning_rate": 5e-06, "loss": 0.397, "step": 2170 },
    { "epoch": 1.229901269393512, "grad_norm": 0.48779112035480804, "learning_rate": 5e-06, "loss": 0.3988, "step": 2180 },
    { "epoch": 1.2355430183356841, "grad_norm": 0.4487329859304139, "learning_rate": 5e-06, "loss": 0.3802, "step": 2190 },
    { "epoch": 1.2411847672778562, "grad_norm": 0.47886286342692885, "learning_rate": 5e-06, "loss": 0.3966, "step": 2200 },
    { "epoch": 1.2468265162200283, "grad_norm": 0.45776874778870136, "learning_rate": 5e-06, "loss": 0.3955, "step": 2210 },
    { "epoch": 1.2524682651622003, "grad_norm": 0.47257007534396295, "learning_rate": 5e-06, "loss": 0.3888, "step": 2220 },
    { "epoch": 1.2581100141043724, "grad_norm": 0.46751284003891047, "learning_rate": 5e-06, "loss": 0.3969, "step": 2230 },
    { "epoch": 1.2637517630465445, "grad_norm": 0.4661158831574023, "learning_rate": 5e-06, "loss": 0.4088, "step": 2240 },
    { "epoch": 1.2693935119887165, "grad_norm": 0.4394915987852524, "learning_rate": 5e-06, "loss": 0.3935, "step": 2250 },
    { "epoch": 1.2750352609308886, "grad_norm": 0.45334151132727485, "learning_rate": 5e-06, "loss": 0.3944, "step": 2260 },
    { "epoch": 1.2806770098730607, "grad_norm": 0.5078200971616262, "learning_rate": 5e-06, "loss": 0.3905, "step": 2270 },
    { "epoch": 1.2863187588152327, "grad_norm": 0.4713106460600115, "learning_rate": 5e-06, "loss": 0.3955, "step": 2280 },
    { "epoch": 1.2919605077574048, "grad_norm": 0.4635282807772546, "learning_rate": 5e-06, "loss": 0.4013, "step": 2290 },
    { "epoch": 1.2976022566995769, "grad_norm": 0.48334074568481694, "learning_rate": 5e-06, "loss": 0.4005, "step": 2300 },
    { "epoch": 1.303244005641749, "grad_norm": 0.48456675280641903, "learning_rate": 5e-06, "loss": 0.3933, "step": 2310 },
    { "epoch": 1.308885754583921, "grad_norm": 0.46200542060106936, "learning_rate": 5e-06, "loss": 0.3835, "step": 2320 },
    { "epoch": 1.314527503526093, "grad_norm": 0.4815654441432598, "learning_rate": 5e-06, "loss": 0.4045, "step": 2330 },
    { "epoch": 1.320169252468265, "grad_norm": 0.48826822709991896, "learning_rate": 5e-06, "loss": 0.4009, "step": 2340 },
    { "epoch": 1.3258110014104372, "grad_norm": 0.4716440781629598, "learning_rate": 5e-06, "loss": 0.3938, "step": 2350 },
    { "epoch": 1.3314527503526092, "grad_norm": 0.4602369354038975, "learning_rate": 5e-06, "loss": 0.3928, "step": 2360 },
    { "epoch": 1.3370944992947813, "grad_norm": 0.49648382398328583, "learning_rate": 5e-06, "loss": 0.4033, "step": 2370 },
    { "epoch": 1.3427362482369536, "grad_norm": 0.46739455409641245, "learning_rate": 5e-06, "loss": 0.3972, "step": 2380 },
    { "epoch": 1.3483779971791257, "grad_norm": 0.443323801929617, "learning_rate": 5e-06, "loss": 0.3968, "step": 2390 },
    { "epoch": 1.3540197461212977, "grad_norm": 0.4539331304987661, "learning_rate": 5e-06, "loss": 0.3924, "step": 2400 },
    { "epoch": 1.3596614950634698, "grad_norm": 0.47781406684365874, "learning_rate": 5e-06, "loss": 0.4117, "step": 2410 },
    { "epoch": 1.3653032440056418, "grad_norm": 0.45421508901943547, "learning_rate": 5e-06, "loss": 0.39, "step": 2420 },
    { "epoch": 1.370944992947814, "grad_norm": 0.4630026357905712, "learning_rate": 5e-06, "loss": 0.4, "step": 2430 },
    { "epoch": 1.376586741889986, "grad_norm": 0.4523702465470941, "learning_rate": 5e-06, "loss": 0.4077, "step": 2440 },
    { "epoch": 1.382228490832158, "grad_norm": 0.45782219911496075, "learning_rate": 5e-06, "loss": 0.3959, "step": 2450 },
    { "epoch": 1.38787023977433, "grad_norm": 0.5117871130526895, "learning_rate": 5e-06, "loss": 0.4016, "step": 2460 },
    { "epoch": 1.3935119887165022, "grad_norm": 0.5315155467795695, "learning_rate": 5e-06, "loss": 0.3973, "step": 2470 },
    { "epoch": 1.3991537376586742, "grad_norm": 0.46939820177172814, "learning_rate": 5e-06, "loss": 0.3942, "step": 2480 },
    { "epoch": 1.4047954866008463, "grad_norm": 0.4543328677593474, "learning_rate": 5e-06, "loss": 0.3998, "step": 2490 },
    { "epoch": 1.4104372355430184, "grad_norm": 0.4496789872069011, "learning_rate": 5e-06, "loss": 0.3963, "step": 2500 },
    { "epoch": 1.4160789844851904, "grad_norm": 0.471782994083793, "learning_rate": 5e-06, "loss": 0.3885, "step": 2510 },
    { "epoch": 1.4217207334273625, "grad_norm": 0.4693960167487946, "learning_rate": 5e-06, "loss": 0.3917, "step": 2520 },
    { "epoch": 1.4273624823695346, "grad_norm": 0.4538271312417869, "learning_rate": 5e-06, "loss": 0.3967, "step": 2530 },
    { "epoch": 1.4330042313117066, "grad_norm": 0.44186919531244484, "learning_rate": 5e-06, "loss": 0.395, "step": 2540 },
    { "epoch": 1.4386459802538787, "grad_norm": 0.4687203668363411, "learning_rate": 5e-06, "loss": 0.3981, "step": 2550 },
    { "epoch": 1.4442877291960508, "grad_norm": 0.4587106074858165, "learning_rate": 5e-06, "loss": 0.3925, "step": 2560 },
    { "epoch": 1.4499294781382228, "grad_norm": 0.4572379861998615, "learning_rate": 5e-06, "loss": 0.4025, "step": 2570 },
    { "epoch": 1.4555712270803949, "grad_norm": 0.4718970269023812, "learning_rate": 5e-06, "loss": 0.3943, "step": 2580 },
    { "epoch": 1.461212976022567, "grad_norm": 0.5041632079079171, "learning_rate": 5e-06, "loss": 0.4082, "step": 2590 },
    { "epoch": 1.466854724964739, "grad_norm": 0.4499939474661196, "learning_rate": 5e-06, "loss": 0.4006, "step": 2600 },
    { "epoch": 1.472496473906911, "grad_norm": 0.44298819801342293, "learning_rate": 5e-06, "loss": 0.4043, "step": 2610 },
    { "epoch": 1.4781382228490831, "grad_norm": 0.4779382761973526, "learning_rate": 5e-06, "loss": 0.4056, "step": 2620 },
    { "epoch": 1.4837799717912552, "grad_norm": 0.47692712159702133, "learning_rate": 5e-06, "loss": 0.3907, "step": 2630 },
    { "epoch": 1.4894217207334273, "grad_norm": 0.46307783252557594, "learning_rate": 5e-06, "loss": 0.3959, "step": 2640 },
    { "epoch": 1.4950634696755993, "grad_norm": 0.4606066509600136, "learning_rate": 5e-06, "loss": 0.3934, "step": 2650 },
    { "epoch": 1.5007052186177714, "grad_norm": 0.47324134946913055, "learning_rate": 5e-06, "loss": 0.4035, "step": 2660 },
    { "epoch": 1.5063469675599435, "grad_norm": 0.46378369582677753, "learning_rate": 5e-06, "loss": 0.3993, "step": 2670 },
    { "epoch": 1.5119887165021155, "grad_norm": 0.446704021535711, "learning_rate": 5e-06, "loss": 0.3918, "step": 2680 },
    { "epoch": 1.5176304654442876, "grad_norm": 0.4914697095674317, "learning_rate": 5e-06, "loss": 0.3909, "step": 2690 },
    { "epoch": 1.5232722143864597, "grad_norm": 0.45218861250501363, "learning_rate": 5e-06, "loss": 0.3891, "step": 2700 },
    { "epoch": 1.5289139633286317, "grad_norm": 0.47296389203246264, "learning_rate": 5e-06, "loss": 0.4063, "step": 2710 },
    { "epoch": 1.5345557122708038, "grad_norm": 0.4565578982324502, "learning_rate": 5e-06, "loss": 0.4004, "step": 2720 },
    { "epoch": 1.540197461212976, "grad_norm": 0.4682402541291805, "learning_rate": 5e-06, "loss": 0.3926, "step": 2730 },
    { "epoch": 1.5458392101551481, "grad_norm": 0.461279370651597, "learning_rate": 5e-06, "loss": 0.3896, "step": 2740 },
    { "epoch": 1.5514809590973202, "grad_norm": 0.48043898076393116, "learning_rate": 5e-06, "loss": 0.3999, "step": 2750 },
    { "epoch": 1.5571227080394923, "grad_norm": 0.4940569464772548, "learning_rate": 5e-06, "loss": 0.4077, "step": 2760 },
    { "epoch": 1.5627644569816643, "grad_norm": 0.507591402814353, "learning_rate": 5e-06, "loss": 0.4009, "step": 2770 },
    { "epoch": 1.5684062059238364, "grad_norm": 0.4706877768766689, "learning_rate": 5e-06, "loss": 0.3924, "step": 2780 },
    { "epoch": 1.5740479548660085, "grad_norm": 0.47676776366340334, "learning_rate": 5e-06, "loss": 0.3877, "step": 2790 },
    { "epoch": 1.5796897038081805, "grad_norm": 0.5182129400923178, "learning_rate": 5e-06, "loss": 0.4036, "step": 2800 },
    { "epoch": 1.5853314527503526, "grad_norm": 0.48116837090131204, "learning_rate": 5e-06, "loss": 0.3975, "step": 2810 },
    { "epoch": 1.5909732016925247, "grad_norm": 0.4275386122917158, "learning_rate": 5e-06, "loss": 0.3918, "step": 2820 },
    { "epoch": 1.5966149506346967, "grad_norm": 0.4558363289192502, "learning_rate": 5e-06, "loss": 0.3882, "step": 2830 },
    { "epoch": 1.6022566995768688, "grad_norm": 0.46362140028375165, "learning_rate": 5e-06, "loss": 0.3986, "step": 2840 },
    { "epoch": 1.607898448519041, "grad_norm": 0.46282031581127986, "learning_rate": 5e-06, "loss": 0.4039, "step": 2850 },
    { "epoch": 1.6135401974612131, "grad_norm": 0.46441758640717906, "learning_rate": 5e-06, "loss": 0.4025, "step": 2860 },
    { "epoch": 1.6191819464033852, "grad_norm": 0.46074665854229274, "learning_rate": 5e-06, "loss": 0.3905, "step": 2870 },
    { "epoch": 1.6248236953455573, "grad_norm": 0.4777266277891572, "learning_rate": 5e-06, "loss": 0.4042, "step": 2880 },
    { "epoch": 1.6304654442877293, "grad_norm": 0.45553733285190573, "learning_rate": 5e-06, "loss": 0.3962, "step": 2890 },
    { "epoch": 1.6361071932299014, "grad_norm": 0.47178767330297033, "learning_rate": 5e-06, "loss": 0.3892, "step": 2900 },
    { "epoch": 1.6417489421720735, "grad_norm": 0.47359171054395643, "learning_rate": 5e-06, "loss": 0.3881, "step": 2910 },
    { "epoch": 1.6473906911142455, "grad_norm": 0.46603715950910696, "learning_rate": 5e-06, "loss": 0.394, "step": 2920 },
    { "epoch": 1.6530324400564176, "grad_norm": 0.4558346803274527, "learning_rate": 5e-06, "loss": 0.3972, "step": 2930 },
    { "epoch": 1.6586741889985896, "grad_norm": 0.479027856317277, "learning_rate": 5e-06, "loss": 0.394, "step": 2940 },
    { "epoch": 1.6643159379407617, "grad_norm": 0.4643135506673433, "learning_rate": 5e-06, "loss": 0.3862, "step": 2950 },
    { "epoch": 1.6699576868829338, "grad_norm": 0.4619833964547844, "learning_rate": 5e-06, "loss": 0.4016, "step": 2960 },
    { "epoch": 1.6755994358251058, "grad_norm": 0.44851503998801856, "learning_rate": 5e-06, "loss": 0.4095, "step": 2970 },
    { "epoch": 1.681241184767278, "grad_norm": 0.4467522582124666, "learning_rate": 5e-06, "loss": 0.3999, "step": 2980 },
    { "epoch": 1.68688293370945, "grad_norm": 0.472063710369713, "learning_rate": 5e-06, "loss": 0.3939, "step": 2990 },
    { "epoch": 1.692524682651622, "grad_norm": 0.48009995546325723, "learning_rate": 5e-06, "loss": 0.4183, "step": 3000 },
    { "epoch": 1.698166431593794, "grad_norm": 0.44768147031308736, "learning_rate": 5e-06, "loss": 0.3983, "step": 3010 },
    { "epoch": 1.7038081805359662, "grad_norm": 0.476978526816067, "learning_rate": 5e-06, "loss": 0.3951, "step": 3020 },
    { "epoch": 1.7094499294781382, "grad_norm": 0.4696465520344116, "learning_rate": 5e-06, "loss": 0.3961, "step": 3030 },
    { "epoch": 1.7150916784203103, "grad_norm": 0.4576648300580514, "learning_rate": 5e-06, "loss": 0.3942, "step": 3040 },
    { "epoch": 1.7207334273624824, "grad_norm": 0.4731142162264191, "learning_rate": 5e-06, "loss": 0.3946, "step": 3050 },
    { "epoch": 1.7263751763046544, "grad_norm": 0.4756579121422872, "learning_rate": 5e-06, "loss": 0.3925, "step": 3060 },
    { "epoch": 1.7320169252468265, "grad_norm": 0.46052906775460756, "learning_rate": 5e-06, "loss": 0.3959, "step": 3070 },
    { "epoch": 1.7376586741889986, "grad_norm": 0.43627250154239816, "learning_rate": 5e-06, "loss": 0.4012, "step": 3080 },
    { "epoch": 1.7433004231311706, "grad_norm": 0.4820135003105483, "learning_rate": 5e-06, "loss": 0.3881, "step": 3090 },
    { "epoch": 1.7489421720733427, "grad_norm": 0.4778018041594859, "learning_rate": 5e-06, "loss": 0.4, "step": 3100 },
    { "epoch": 1.7545839210155147, "grad_norm": 0.4884888164729651, "learning_rate": 5e-06, "loss": 0.3891, "step": 3110 },
    { "epoch": 1.7602256699576868, "grad_norm": 0.4475592439449893, "learning_rate": 5e-06, "loss": 0.3926, "step": 3120 },
    { "epoch": 1.7658674188998589, "grad_norm": 0.47654858360039826, "learning_rate": 5e-06, "loss": 0.419, "step": 3130 },
    { "epoch": 1.771509167842031, "grad_norm": 0.4555878766506712, "learning_rate": 5e-06, "loss": 0.3962, "step": 3140 },
    { "epoch": 1.777150916784203, "grad_norm": 0.46471151818843526, "learning_rate": 5e-06, "loss": 0.3956, "step": 3150 },
    { "epoch": 1.782792665726375, "grad_norm": 0.47449169536040453, "learning_rate": 5e-06, "loss": 0.4006, "step": 3160 },
    { "epoch": 1.7884344146685471, "grad_norm": 0.4602783138679876, "learning_rate": 5e-06, "loss": 0.3917, "step": 3170 },
    { "epoch": 1.7940761636107192, "grad_norm": 0.472002669632583, "learning_rate": 5e-06, "loss": 0.4085, "step": 3180 },
    { "epoch": 1.7997179125528913, "grad_norm": 0.4675920743304564, "learning_rate": 5e-06, "loss": 0.4003, "step": 3190 },
    { "epoch": 1.8053596614950633, "grad_norm": 0.4615694339610234, "learning_rate": 5e-06, "loss": 0.3955, "step": 3200 },
    { "epoch": 1.8110014104372354, "grad_norm": 0.4628092696805375, "learning_rate": 5e-06, "loss": 0.4016, "step": 3210 },
    { "epoch": 1.8166431593794075, "grad_norm": 0.5254614489605429, "learning_rate": 5e-06, "loss": 0.4033, "step": 3220 },
    { "epoch": 1.8222849083215797, "grad_norm": 0.44928160369491393, "learning_rate": 5e-06, "loss": 0.3927, "step": 3230 },
    { "epoch": 1.8279266572637518, "grad_norm": 0.4956917940016818, "learning_rate": 5e-06, "loss": 0.3963, "step": 3240 },
    { "epoch": 1.8335684062059239, "grad_norm": 0.45361998350750077, "learning_rate": 5e-06, "loss": 0.3983, "step": 3250 },
    { "epoch": 1.839210155148096, "grad_norm": 0.4515003704595137, "learning_rate": 5e-06, "loss": 0.3963, "step": 3260 },
    { "epoch": 1.844851904090268, "grad_norm": 0.4556275342458607, "learning_rate": 5e-06, "loss": 0.398, "step": 3270 },
    { "epoch": 1.85049365303244, "grad_norm": 0.46890202055080116, "learning_rate": 5e-06, "loss": 0.3804, "step": 3280 },
    { "epoch": 1.8561354019746121, "grad_norm": 0.4567048420478033, "learning_rate": 5e-06, "loss": 0.3866, "step": 3290 },
    { "epoch": 1.8617771509167842, "grad_norm": 0.46010370992720745, "learning_rate": 5e-06, "loss": 0.4071, "step": 3300 },
    { "epoch": 1.8674188998589563, "grad_norm": 0.46330780688841133, "learning_rate": 5e-06, "loss": 0.4007, "step": 3310 },
    { "epoch": 1.8730606488011283, "grad_norm": 0.467515061321271, "learning_rate": 5e-06, "loss": 0.3958, "step": 3320 },
    { "epoch": 1.8787023977433004, "grad_norm": 0.4478417663578568, "learning_rate": 5e-06, "loss": 0.3965, "step": 3330 },
    { "epoch": 1.8843441466854725, "grad_norm": 0.46131439541886865, "learning_rate": 5e-06, "loss": 0.3972, "step": 3340 },
    { "epoch": 1.8899858956276445, "grad_norm": 0.4649363714764279, "learning_rate": 5e-06, "loss": 0.3943, "step": 3350 },
    { "epoch": 1.8956276445698168, "grad_norm": 0.46303194795992636, "learning_rate": 5e-06, "loss": 0.391, "step": 3360 },
    { "epoch": 1.9012693935119889, "grad_norm": 0.42878941947013166, "learning_rate": 5e-06, "loss": 0.4039, "step": 3370 },
    { "epoch": 1.906911142454161, "grad_norm": 0.4725709423896906, "learning_rate": 5e-06, "loss": 0.3988, "step": 3380 },
    { "epoch": 1.912552891396333, "grad_norm": 0.47671368663777564, "learning_rate": 5e-06, "loss": 0.3884, "step": 3390 },
    { "epoch": 1.918194640338505, "grad_norm": 0.46668210559071105, "learning_rate": 5e-06, "loss": 0.4066, "step": 3400 },
    { "epoch": 1.9238363892806771, "grad_norm": 0.4572223756340763, "learning_rate": 5e-06, "loss": 0.3973, "step": 3410 },
    { "epoch": 1.9294781382228492, "grad_norm": 0.45902290859441564, "learning_rate": 5e-06, "loss": 0.3929, "step": 3420 },
    { "epoch": 1.9351198871650213, "grad_norm": 0.458044072628747, "learning_rate": 5e-06, "loss": 0.3857, "step": 3430 },
    { "epoch": 1.9407616361071933, "grad_norm": 0.4693435523479085, "learning_rate": 5e-06, "loss": 0.3904, "step": 3440 },
    { "epoch": 1.9464033850493654, "grad_norm": 0.4525757784211967, "learning_rate": 5e-06, "loss": 0.3901, "step": 3450 },
    { "epoch": 1.9520451339915375, "grad_norm": 0.4527753194974229, "learning_rate": 5e-06, "loss": 0.3883, "step": 3460 },
    { "epoch": 1.9576868829337095, "grad_norm": 0.45775257683357495, "learning_rate": 5e-06, "loss": 0.3933, "step": 3470 },
    { "epoch": 1.9633286318758816, "grad_norm": 0.4513845604140946, "learning_rate": 5e-06, "loss": 0.4084, "step": 3480 },
    { "epoch": 1.9689703808180536, "grad_norm": 0.488458649525053, "learning_rate": 5e-06, "loss": 0.3829, "step": 3490 },
    { "epoch": 1.9746121297602257, "grad_norm": 0.446090731775206, "learning_rate": 5e-06, "loss": 0.3924, "step": 3500 },
    { "epoch": 1.9802538787023978, "grad_norm": 0.4709438262355249, "learning_rate": 5e-06, "loss": 0.3894, "step": 3510 },
    { "epoch": 1.9858956276445698, "grad_norm": 0.4870735211701005, "learning_rate": 5e-06, "loss": 0.4027, "step": 3520 },
    { "epoch": 1.991537376586742, "grad_norm": 0.4803646418996235, "learning_rate": 5e-06, "loss": 0.3859, "step": 3530 },
    { "epoch": 1.997179125528914, "grad_norm": 0.4703422698968033, "learning_rate": 5e-06, "loss": 0.3944, "step": 3540 },
    { "epoch": 2.0, "eval_loss": 0.43374887108802795, "eval_runtime": 448.6077, "eval_samples_per_second": 26.611, "eval_steps_per_second": 0.417, "step": 3545 },
    { "epoch": 2.002820874471086, "grad_norm": 0.4413473757815756, "learning_rate": 5e-06, "loss": 0.3928, "step": 3550 },
    { "epoch": 2.008462623413258, "grad_norm": 0.4522342894282536, "learning_rate": 5e-06, "loss": 0.3547, "step": 3560 },
    { "epoch": 2.01410437235543, "grad_norm": 0.4290168433856863, "learning_rate": 5e-06, "loss": 0.356, "step": 3570 },
    { "epoch": 2.0197461212976022, "grad_norm": 0.48082235439755094, "learning_rate": 5e-06, "loss": 0.3528, "step": 3580 },
    { "epoch": 2.0253878702397743, "grad_norm": 0.45722766485772276, "learning_rate": 5e-06, "loss": 0.3442, "step": 3590 },
    { "epoch": 2.0310296191819464, "grad_norm": 0.45747195342514013, "learning_rate": 5e-06, "loss": 0.3432, "step": 3600 },
    { "epoch": 2.0366713681241184, "grad_norm": 0.4358767354369319, "learning_rate": 5e-06, "loss": 0.3466, "step": 3610 },
    { "epoch": 2.0423131170662905, "grad_norm": 0.45049478931307607, "learning_rate": 5e-06, "loss": 0.3504, "step": 3620 },
    { "epoch": 2.0479548660084625, "grad_norm": 0.4563642590969261, "learning_rate": 5e-06, "loss": 0.3406, "step": 3630 },
    { "epoch": 2.0535966149506346, "grad_norm": 0.4312356627894684, "learning_rate": 5e-06, "loss": 0.3431, "step": 3640 },
    { "epoch": 2.0592383638928067, "grad_norm": 0.47281064022880187, "learning_rate": 5e-06, "loss": 0.3492, "step": 3650 },
    { "epoch": 2.0648801128349787, "grad_norm": 0.4435651181299306, "learning_rate": 5e-06, "loss": 0.3411, "step": 3660 },
    { "epoch": 2.070521861777151, "grad_norm": 0.4694545955857111, "learning_rate": 5e-06, "loss": 0.3476, "step": 3670 },
    { "epoch": 2.076163610719323, "grad_norm": 0.4094871096575717, "learning_rate": 5e-06, "loss": 0.3446, "step": 3680 },
    { "epoch": 2.081805359661495, "grad_norm": 0.4407127764565404, "learning_rate": 5e-06, "loss": 0.3511, "step": 3690 },
    { "epoch": 2.087447108603667, "grad_norm": 0.46130441782721465, "learning_rate": 5e-06, "loss": 0.3701, "step": 3700 },
    { "epoch": 2.093088857545839, "grad_norm": 0.48247411322370654, "learning_rate": 5e-06, "loss": 0.3563, "step": 3710 },
    { "epoch": 2.098730606488011, "grad_norm": 0.4219187636429636, "learning_rate": 5e-06, "loss": 0.3471, "step": 3720 },
    { "epoch": 2.104372355430183, "grad_norm": 0.4611444707906527, "learning_rate": 5e-06, "loss": 0.3567, "step": 3730 },
    { "epoch": 2.1100141043723553, "grad_norm": 0.4553725011658897, "learning_rate": 5e-06, "loss": 0.3529, "step": 3740 },
    { "epoch": 2.1156558533145273, "grad_norm": 0.4464002785245552, "learning_rate": 5e-06, "loss": 0.3501, "step": 3750 },
    { "epoch": 2.1212976022566994, "grad_norm": 0.4300513716532807, "learning_rate": 5e-06, "loss": 0.3498, "step": 3760 },
    { "epoch": 2.1269393511988715, "grad_norm": 0.456511976198015, "learning_rate": 5e-06, "loss": 0.3519, "step": 3770 },
    { "epoch": 2.1325811001410435, "grad_norm": 0.4233132317201342, "learning_rate": 5e-06, "loss": 0.3476, "step": 3780 },
    { "epoch": 2.138222849083216, "grad_norm": 0.471119966312247, "learning_rate": 5e-06, "loss": 0.3519, "step": 3790 },
    { "epoch": 2.143864598025388, "grad_norm": 0.4633272783360576, "learning_rate": 5e-06, "loss": 0.3564, "step": 3800 },
    { "epoch": 2.14950634696756, "grad_norm": 0.42965266973567434, "learning_rate": 5e-06, "loss": 0.3518, "step": 3810 },
    { "epoch": 2.155148095909732, "grad_norm": 0.4298353082026474, "learning_rate": 5e-06, "loss": 0.3445, "step": 3820 },
    { "epoch": 2.1607898448519043, "grad_norm": 0.47739223858312535, "learning_rate": 5e-06, "loss": 0.3445, "step": 3830 },
    { "epoch": 2.1664315937940763, "grad_norm": 0.4532966358832045, "learning_rate": 5e-06, "loss": 0.356, "step": 3840 },
    { "epoch": 2.1720733427362484, "grad_norm": 0.4410221778245472, "learning_rate": 5e-06, "loss": 0.3662, "step": 3850 },
    { "epoch": 2.1777150916784205, "grad_norm": 0.4263307654776881, "learning_rate": 5e-06, "loss": 0.3467, "step": 3860 },
    { "epoch": 2.1833568406205925, "grad_norm": 0.45149035680132815, "learning_rate": 5e-06, "loss": 0.3448, "step": 3870 },
    { "epoch": 2.1889985895627646, "grad_norm": 0.4529321307975293, "learning_rate": 5e-06, "loss": 0.3477, "step": 3880 },
    { "epoch": 2.1946403385049367, "grad_norm": 0.4607065783291932, "learning_rate": 5e-06, "loss": 0.3495, "step": 3890 },
    { "epoch": 2.2002820874471087, "grad_norm": 0.4602518910582201, "learning_rate": 5e-06, "loss": 0.3487, "step": 3900 },
    { "epoch": 2.205923836389281, "grad_norm": 0.43692806647452487, "learning_rate": 5e-06, "loss": 0.3524, "step": 3910 },
    { "epoch": 2.211565585331453, "grad_norm": 0.4552717555185162, "learning_rate": 5e-06, "loss": 0.3471, "step": 3920 },
    { "epoch": 2.217207334273625, "grad_norm": 0.4525591489683545, "learning_rate": 5e-06, "loss": 0.3592, "step": 3930 },
    { "epoch": 2.222849083215797, "grad_norm": 0.4404336083861297, "learning_rate": 5e-06, "loss": 0.3557, "step": 3940 },
    { "epoch": 2.228490832157969, "grad_norm": 0.45114848417343256, "learning_rate": 5e-06, "loss": 0.3423, "step": 3950 },
    { "epoch": 2.234132581100141, "grad_norm": 0.4370952273186252, "learning_rate": 5e-06, "loss": 0.3546, "step": 3960 },
    { "epoch": 2.239774330042313, "grad_norm": 0.4435799605060227, "learning_rate": 5e-06, "loss": 0.3567, "step": 3970 },
    { "epoch": 2.2454160789844853, "grad_norm": 0.4915642595318201, "learning_rate": 5e-06, "loss": 0.3584, "step": 3980 },
    { "epoch": 2.2510578279266573, "grad_norm": 0.44114555771160074, "learning_rate": 5e-06, "loss": 0.3486, "step": 3990 },
    { "epoch": 2.2566995768688294, "grad_norm": 0.4749861176607326, "learning_rate": 5e-06, "loss": 0.3636, "step": 4000 },
    { "epoch": 2.2623413258110014, "grad_norm": 0.46753231180049154, "learning_rate": 5e-06, "loss": 0.3537, "step": 4010 },
    { "epoch": 2.2679830747531735, "grad_norm": 0.4419176647270738, "learning_rate": 5e-06, "loss": 0.3501, "step": 4020 },
    { "epoch": 2.2736248236953456, "grad_norm": 0.4587766070955877, "learning_rate": 5e-06, "loss": 0.3541, "step": 4030 },
    { "epoch": 2.2792665726375176, "grad_norm": 0.43530289551944434, "learning_rate": 5e-06, "loss": 0.351, "step": 4040 },
{ |
|
"epoch": 2.2849083215796897, |
|
"grad_norm": 0.4631459072773793, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3485, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.2905500705218618, |
|
"grad_norm": 0.47022508312977196, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3502, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.296191819464034, |
|
"grad_norm": 0.4505813037738865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3651, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.301833568406206, |
|
"grad_norm": 0.4486825542866407, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3544, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.307475317348378, |
|
"grad_norm": 0.4665999075970455, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3685, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.31311706629055, |
|
"grad_norm": 0.4888288507127307, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3619, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.318758815232722, |
|
"grad_norm": 0.41270795857689285, |
|
"learning_rate": 5e-06, |
|
"loss": 0.352, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.324400564174894, |
|
"grad_norm": 0.4419259104601605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3383, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.330042313117066, |
|
"grad_norm": 0.46603521309981116, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3536, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.3356840620592383, |
|
"grad_norm": 0.47937141599717065, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3563, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.3413258110014104, |
|
"grad_norm": 0.44765567183470945, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3467, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.3469675599435824, |
|
"grad_norm": 0.4457046351361799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3503, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.3526093088857545, |
|
"grad_norm": 0.44307837034261943, |
|
"learning_rate": 5e-06, |
|
"loss": 0.352, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.3582510578279265, |
|
"grad_norm": 0.4267219846723022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3774, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.3638928067700986, |
|
"grad_norm": 0.48122908327097114, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3598, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.3695345557122707, |
|
"grad_norm": 0.45331508297626466, |
|
"learning_rate": 5e-06, |
|
"loss": 0.362, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.3751763046544427, |
|
"grad_norm": 0.4594196615052227, |
|
"learning_rate": 5e-06, |
|
"loss": 0.354, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.380818053596615, |
|
"grad_norm": 0.4495058567180949, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3602, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.386459802538787, |
|
"grad_norm": 0.4676232528999999, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3516, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.392101551480959, |
|
"grad_norm": 0.4663506464819916, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3489, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.397743300423131, |
|
"grad_norm": 0.4377174836018769, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3566, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.403385049365303, |
|
"grad_norm": 0.4410880459267694, |
|
"learning_rate": 5e-06, |
|
"loss": 0.355, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.409026798307475, |
|
"grad_norm": 0.4416031717769207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3606, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.414668547249647, |
|
"grad_norm": 0.46431459859880975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3551, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.4203102961918193, |
|
"grad_norm": 0.4603756553070287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3521, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.4259520451339913, |
|
"grad_norm": 0.43937667290163923, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3736, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.4315937940761634, |
|
"grad_norm": 0.4446699867853186, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3467, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.4372355430183354, |
|
"grad_norm": 0.43626545583793597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.348, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.4428772919605075, |
|
"grad_norm": 0.49173390105039966, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3521, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.44851904090268, |
|
"grad_norm": 0.45995716373861045, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3456, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.454160789844852, |
|
"grad_norm": 0.44249790330543903, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3433, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.459802538787024, |
|
"grad_norm": 0.45560126813520535, |
|
"learning_rate": 5e-06, |
|
"loss": 0.349, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.465444287729196, |
|
"grad_norm": 0.4594734244394021, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3578, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.4710860366713683, |
|
"grad_norm": 0.4572577458846818, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3485, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.4767277856135403, |
|
"grad_norm": 0.44219315631814177, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3471, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 2.4823695345557124, |
|
"grad_norm": 0.42240807095659477, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3562, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.4880112834978845, |
|
"grad_norm": 0.47229751050541774, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3482, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 2.4936530324400565, |
|
"grad_norm": 0.4458066836074724, |
|
"learning_rate": 5e-06, |
|
"loss": 0.355, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 2.4992947813822286, |
|
"grad_norm": 0.4503226109681392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3538, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 2.5049365303244007, |
|
"grad_norm": 0.44110608535592855, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3747, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 2.5105782792665727, |
|
"grad_norm": 0.5109212717715426, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3519, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.516220028208745, |
|
"grad_norm": 0.4249918016919024, |
|
"learning_rate": 5e-06, |
|
"loss": 0.342, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 2.521861777150917, |
|
"grad_norm": 0.5086305630978165, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3607, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 2.527503526093089, |
|
"grad_norm": 0.4358509107515122, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3468, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 2.533145275035261, |
|
"grad_norm": 0.4539559547925107, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3443, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 2.538787023977433, |
|
"grad_norm": 0.4653862436948121, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3524, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.544428772919605, |
|
"grad_norm": 0.41711716794524994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3479, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 2.550070521861777, |
|
"grad_norm": 0.45795092567053497, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3695, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 2.5557122708039492, |
|
"grad_norm": 0.4724307476878103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.353, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 2.5613540197461213, |
|
"grad_norm": 0.4667580239368319, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3539, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 2.5669957686882934, |
|
"grad_norm": 0.4475375918113466, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3598, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.5726375176304654, |
|
"grad_norm": 0.480757840067183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3485, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 2.5782792665726375, |
|
"grad_norm": 0.4276733499528922, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3479, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 2.5839210155148096, |
|
"grad_norm": 0.43537614271812025, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3543, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 2.5895627644569816, |
|
"grad_norm": 0.42394662487032214, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3475, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 2.5952045133991537, |
|
"grad_norm": 0.45439257995617655, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3484, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.6008462623413258, |
|
"grad_norm": 0.4463381033101569, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3478, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 2.606488011283498, |
|
"grad_norm": 0.4651753425049505, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3532, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 2.61212976022567, |
|
"grad_norm": 0.4725584824422778, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3667, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 2.617771509167842, |
|
"grad_norm": 0.4496062316974007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3566, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 2.623413258110014, |
|
"grad_norm": 0.4301211716374985, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3466, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.629055007052186, |
|
"grad_norm": 0.4567935039875112, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3532, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 2.634696755994358, |
|
"grad_norm": 0.45514691870247576, |
|
"learning_rate": 5e-06, |
|
"loss": 0.352, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 2.64033850493653, |
|
"grad_norm": 0.4435768402675874, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3479, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 2.6459802538787023, |
|
"grad_norm": 0.43799237086382287, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3399, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 2.6516220028208743, |
|
"grad_norm": 0.45347330937833646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3496, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.6572637517630464, |
|
"grad_norm": 0.45449617328134695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3698, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 2.6629055007052185, |
|
"grad_norm": 0.45514167950119666, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3557, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 2.6685472496473905, |
|
"grad_norm": 0.45124610082620425, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3555, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 2.6741889985895626, |
|
"grad_norm": 0.45506617549803663, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3533, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 2.679830747531735, |
|
"grad_norm": 0.4497891236146143, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3593, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.685472496473907, |
|
"grad_norm": 0.43730277262363093, |
|
"learning_rate": 5e-06, |
|
"loss": 0.35, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 2.6911142454160792, |
|
"grad_norm": 0.4453843880728269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3508, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 2.6967559943582513, |
|
"grad_norm": 0.4551381875534027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3659, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 2.7023977433004234, |
|
"grad_norm": 0.44693198155600794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3488, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 2.7080394922425954, |
|
"grad_norm": 0.4725411086517588, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3622, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.7136812411847675, |
|
"grad_norm": 0.45435090794835215, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3659, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 2.7193229901269396, |
|
"grad_norm": 0.40891902748686465, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3521, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 2.7249647390691116, |
|
"grad_norm": 0.4588622825344602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3609, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 2.7306064880112837, |
|
"grad_norm": 0.47220524310687695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3608, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 2.7362482369534558, |
|
"grad_norm": 0.4813382330408875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3516, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.741889985895628, |
|
"grad_norm": 0.44851106014638686, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3575, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 2.7475317348378, |
|
"grad_norm": 0.4648873406447677, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3508, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 2.753173483779972, |
|
"grad_norm": 0.44332878174865403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3548, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 2.758815232722144, |
|
"grad_norm": 0.44016959273981, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3518, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 2.764456981664316, |
|
"grad_norm": 0.42469643512821914, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3501, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.770098730606488, |
|
"grad_norm": 0.44842384575861166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3452, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.77574047954866, |
|
"grad_norm": 0.44453086608294007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3576, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.7813822284908323, |
|
"grad_norm": 0.4613767704732174, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3669, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.7870239774330043, |
|
"grad_norm": 0.42157749246627113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3558, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.7926657263751764, |
|
"grad_norm": 0.44623021177861155, |
|
"learning_rate": 5e-06, |
|
"loss": 0.353, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.7983074753173485, |
|
"grad_norm": 0.44511445391899146, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3575, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.8039492242595205, |
|
"grad_norm": 0.4496517977205029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.376, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.8095909732016926, |
|
"grad_norm": 0.4568581481429044, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3552, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.8152327221438647, |
|
"grad_norm": 0.45872415735621647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3538, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.8208744710860367, |
|
"grad_norm": 0.43280090040022784, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3596, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.826516220028209, |
|
"grad_norm": 0.4271253356285509, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3589, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.832157968970381, |
|
"grad_norm": 0.45509701773858097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3717, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.837799717912553, |
|
"grad_norm": 0.43287288682215924, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3573, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.843441466854725, |
|
"grad_norm": 0.4688529933224419, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3477, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.849083215796897, |
|
"grad_norm": 0.4331993042860941, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3514, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.854724964739069, |
|
"grad_norm": 0.47629494492943353, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3457, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.860366713681241, |
|
"grad_norm": 0.4547175713111894, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3616, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.8660084626234132, |
|
"grad_norm": 0.4697185774932994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3527, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.8716502115655853, |
|
"grad_norm": 0.46979390495300094, |
|
"learning_rate": 5e-06, |
|
"loss": 0.367, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.8772919605077574, |
|
"grad_norm": 0.4779125028298598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3511, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.8829337094499294, |
|
"grad_norm": 0.4974784539605145, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3623, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.8885754583921015, |
|
"grad_norm": 0.4614842753048295, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3495, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.8942172073342736, |
|
"grad_norm": 0.43741541412768414, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3566, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.8998589562764456, |
|
"grad_norm": 0.4611139730639956, |
|
"learning_rate": 5e-06, |
|
"loss": 0.357, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.9055007052186177, |
|
"grad_norm": 0.4584393192245279, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3559, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.9111424541607898, |
|
"grad_norm": 0.4605897500358934, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3599, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.916784203102962, |
|
"grad_norm": 0.5047737206876777, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3554, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.922425952045134, |
|
"grad_norm": 0.43957877748790186, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3553, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.928067700987306, |
|
"grad_norm": 0.41934448745808994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3537, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.933709449929478, |
|
"grad_norm": 0.4497013017770954, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3591, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.93935119887165, |
|
"grad_norm": 0.46915975111439107, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3561, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.944992947813822, |
|
"grad_norm": 0.4428104855895761, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3633, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.950634696755994, |
|
"grad_norm": 0.448532360201155, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3496, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 2.9562764456981663, |
|
"grad_norm": 0.47532539519127587, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3484, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 2.9619181946403383, |
|
"grad_norm": 0.43655270107253413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3735, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.9675599435825104, |
|
"grad_norm": 0.4654091728547412, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3517, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 2.9732016925246825, |
|
"grad_norm": 0.48276086071545776, |
|
"learning_rate": 5e-06, |
|
"loss": 0.358, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 2.9788434414668545, |
|
"grad_norm": 0.4497726059890603, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3743, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 2.9844851904090266, |
|
"grad_norm": 0.42161219193763577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3519, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 2.9901269393511987, |
|
"grad_norm": 0.4593665569282975, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3473, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.9957686882933707, |
|
"grad_norm": 0.4432358435800667, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3623, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 2.9991537376586743, |
|
"eval_loss": 0.440873384475708, |
|
"eval_runtime": 444.0657, |
|
"eval_samples_per_second": 26.883, |
|
"eval_steps_per_second": 0.421, |
|
"step": 5316 |
|
}, |
|
{ |
|
"epoch": 2.9991537376586743, |
|
"step": 5316, |
|
"total_flos": 2786674505416704.0, |
|
"train_loss": 0.4018145801655057, |
|
"train_runtime": 71328.9114, |
|
"train_samples_per_second": 9.54, |
|
"train_steps_per_second": 0.075 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5316, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2786674505416704.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|