{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9991537376586743,
"eval_steps": 500,
"global_step": 5316,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005641748942172073,
"grad_norm": 0.8860281773045255,
"learning_rate": 5e-06,
"loss": 0.5781,
"step": 10
},
{
"epoch": 0.011283497884344146,
"grad_norm": 0.9275466531112635,
"learning_rate": 5e-06,
"loss": 0.5493,
"step": 20
},
{
"epoch": 0.01692524682651622,
"grad_norm": 0.9785057374089036,
"learning_rate": 5e-06,
"loss": 0.5144,
"step": 30
},
{
"epoch": 0.022566995768688293,
"grad_norm": 0.9445832962940219,
"learning_rate": 5e-06,
"loss": 0.5132,
"step": 40
},
{
"epoch": 0.028208744710860368,
"grad_norm": 0.8794347209187481,
"learning_rate": 5e-06,
"loss": 0.4958,
"step": 50
},
{
"epoch": 0.03385049365303244,
"grad_norm": 0.8768425701561404,
"learning_rate": 5e-06,
"loss": 0.5032,
"step": 60
},
{
"epoch": 0.039492242595204514,
"grad_norm": 0.7246974546492897,
"learning_rate": 5e-06,
"loss": 0.4798,
"step": 70
},
{
"epoch": 0.045133991537376586,
"grad_norm": 0.554012169044732,
"learning_rate": 5e-06,
"loss": 0.4888,
"step": 80
},
{
"epoch": 0.05077574047954866,
"grad_norm": 0.5618297520336772,
"learning_rate": 5e-06,
"loss": 0.4811,
"step": 90
},
{
"epoch": 0.056417489421720736,
"grad_norm": 0.5213657220782494,
"learning_rate": 5e-06,
"loss": 0.468,
"step": 100
},
{
"epoch": 0.06205923836389281,
"grad_norm": 0.509393805640559,
"learning_rate": 5e-06,
"loss": 0.4829,
"step": 110
},
{
"epoch": 0.06770098730606489,
"grad_norm": 0.543532182255718,
"learning_rate": 5e-06,
"loss": 0.4737,
"step": 120
},
{
"epoch": 0.07334273624823695,
"grad_norm": 0.5016209370031858,
"learning_rate": 5e-06,
"loss": 0.4787,
"step": 130
},
{
"epoch": 0.07898448519040903,
"grad_norm": 0.5209031521531445,
"learning_rate": 5e-06,
"loss": 0.4848,
"step": 140
},
{
"epoch": 0.0846262341325811,
"grad_norm": 0.47470655842824117,
"learning_rate": 5e-06,
"loss": 0.4776,
"step": 150
},
{
"epoch": 0.09026798307475317,
"grad_norm": 0.5098693274003744,
"learning_rate": 5e-06,
"loss": 0.4765,
"step": 160
},
{
"epoch": 0.09590973201692525,
"grad_norm": 0.5148841472543283,
"learning_rate": 5e-06,
"loss": 0.4744,
"step": 170
},
{
"epoch": 0.10155148095909731,
"grad_norm": 0.5303116836505042,
"learning_rate": 5e-06,
"loss": 0.4789,
"step": 180
},
{
"epoch": 0.1071932299012694,
"grad_norm": 0.5035762913816794,
"learning_rate": 5e-06,
"loss": 0.4763,
"step": 190
},
{
"epoch": 0.11283497884344147,
"grad_norm": 0.5096690168519262,
"learning_rate": 5e-06,
"loss": 0.4687,
"step": 200
},
{
"epoch": 0.11847672778561354,
"grad_norm": 0.50737809697083,
"learning_rate": 5e-06,
"loss": 0.4615,
"step": 210
},
{
"epoch": 0.12411847672778561,
"grad_norm": 0.4961749979329462,
"learning_rate": 5e-06,
"loss": 0.4713,
"step": 220
},
{
"epoch": 0.12976022566995768,
"grad_norm": 0.47944422939763603,
"learning_rate": 5e-06,
"loss": 0.475,
"step": 230
},
{
"epoch": 0.13540197461212977,
"grad_norm": 0.49717123601985425,
"learning_rate": 5e-06,
"loss": 0.4739,
"step": 240
},
{
"epoch": 0.14104372355430184,
"grad_norm": 0.47611353329941747,
"learning_rate": 5e-06,
"loss": 0.4649,
"step": 250
},
{
"epoch": 0.1466854724964739,
"grad_norm": 0.4894294603649133,
"learning_rate": 5e-06,
"loss": 0.4917,
"step": 260
},
{
"epoch": 0.152327221438646,
"grad_norm": 0.48373950578804115,
"learning_rate": 5e-06,
"loss": 0.4493,
"step": 270
},
{
"epoch": 0.15796897038081806,
"grad_norm": 0.522789579136924,
"learning_rate": 5e-06,
"loss": 0.4664,
"step": 280
},
{
"epoch": 0.16361071932299012,
"grad_norm": 0.4789421152666509,
"learning_rate": 5e-06,
"loss": 0.459,
"step": 290
},
{
"epoch": 0.1692524682651622,
"grad_norm": 0.5012557865235248,
"learning_rate": 5e-06,
"loss": 0.4552,
"step": 300
},
{
"epoch": 0.17489421720733428,
"grad_norm": 0.4624585048654735,
"learning_rate": 5e-06,
"loss": 0.4596,
"step": 310
},
{
"epoch": 0.18053596614950634,
"grad_norm": 0.47502867235282875,
"learning_rate": 5e-06,
"loss": 0.4782,
"step": 320
},
{
"epoch": 0.1861777150916784,
"grad_norm": 0.48965795082153923,
"learning_rate": 5e-06,
"loss": 0.4645,
"step": 330
},
{
"epoch": 0.1918194640338505,
"grad_norm": 0.48242874081163245,
"learning_rate": 5e-06,
"loss": 0.4647,
"step": 340
},
{
"epoch": 0.19746121297602257,
"grad_norm": 0.5121876755000992,
"learning_rate": 5e-06,
"loss": 0.4665,
"step": 350
},
{
"epoch": 0.20310296191819463,
"grad_norm": 0.49770668244489025,
"learning_rate": 5e-06,
"loss": 0.4707,
"step": 360
},
{
"epoch": 0.20874471086036672,
"grad_norm": 0.5159796946721876,
"learning_rate": 5e-06,
"loss": 0.4626,
"step": 370
},
{
"epoch": 0.2143864598025388,
"grad_norm": 0.48687862998047,
"learning_rate": 5e-06,
"loss": 0.4605,
"step": 380
},
{
"epoch": 0.22002820874471085,
"grad_norm": 0.5040230352920434,
"learning_rate": 5e-06,
"loss": 0.4582,
"step": 390
},
{
"epoch": 0.22566995768688294,
"grad_norm": 0.4841608785680818,
"learning_rate": 5e-06,
"loss": 0.4655,
"step": 400
},
{
"epoch": 0.231311706629055,
"grad_norm": 0.49039176099743137,
"learning_rate": 5e-06,
"loss": 0.4495,
"step": 410
},
{
"epoch": 0.23695345557122707,
"grad_norm": 0.5040217827748269,
"learning_rate": 5e-06,
"loss": 0.4529,
"step": 420
},
{
"epoch": 0.24259520451339917,
"grad_norm": 0.47120041333093315,
"learning_rate": 5e-06,
"loss": 0.4569,
"step": 430
},
{
"epoch": 0.24823695345557123,
"grad_norm": 0.4890234120450319,
"learning_rate": 5e-06,
"loss": 0.4613,
"step": 440
},
{
"epoch": 0.2538787023977433,
"grad_norm": 0.48217359915393404,
"learning_rate": 5e-06,
"loss": 0.4472,
"step": 450
},
{
"epoch": 0.25952045133991536,
"grad_norm": 0.467804639174959,
"learning_rate": 5e-06,
"loss": 0.448,
"step": 460
},
{
"epoch": 0.2651622002820874,
"grad_norm": 0.48164716949150344,
"learning_rate": 5e-06,
"loss": 0.4502,
"step": 470
},
{
"epoch": 0.27080394922425954,
"grad_norm": 0.5145780661011983,
"learning_rate": 5e-06,
"loss": 0.4572,
"step": 480
},
{
"epoch": 0.2764456981664316,
"grad_norm": 0.5207011942621447,
"learning_rate": 5e-06,
"loss": 0.4616,
"step": 490
},
{
"epoch": 0.2820874471086037,
"grad_norm": 0.4935510238242219,
"learning_rate": 5e-06,
"loss": 0.4659,
"step": 500
},
{
"epoch": 0.28772919605077574,
"grad_norm": 0.5079417454565434,
"learning_rate": 5e-06,
"loss": 0.4582,
"step": 510
},
{
"epoch": 0.2933709449929478,
"grad_norm": 0.5314525886288128,
"learning_rate": 5e-06,
"loss": 0.4628,
"step": 520
},
{
"epoch": 0.29901269393511987,
"grad_norm": 0.5053611809759164,
"learning_rate": 5e-06,
"loss": 0.4478,
"step": 530
},
{
"epoch": 0.304654442877292,
"grad_norm": 0.4794058313719111,
"learning_rate": 5e-06,
"loss": 0.4451,
"step": 540
},
{
"epoch": 0.31029619181946405,
"grad_norm": 0.48888977415025386,
"learning_rate": 5e-06,
"loss": 0.4702,
"step": 550
},
{
"epoch": 0.3159379407616361,
"grad_norm": 0.4849302874069741,
"learning_rate": 5e-06,
"loss": 0.4561,
"step": 560
},
{
"epoch": 0.3215796897038082,
"grad_norm": 0.47972377135392075,
"learning_rate": 5e-06,
"loss": 0.4457,
"step": 570
},
{
"epoch": 0.32722143864598024,
"grad_norm": 0.4869264334442687,
"learning_rate": 5e-06,
"loss": 0.4498,
"step": 580
},
{
"epoch": 0.3328631875881523,
"grad_norm": 0.5030426273166695,
"learning_rate": 5e-06,
"loss": 0.4698,
"step": 590
},
{
"epoch": 0.3385049365303244,
"grad_norm": 0.4792385544239688,
"learning_rate": 5e-06,
"loss": 0.4524,
"step": 600
},
{
"epoch": 0.3441466854724965,
"grad_norm": 0.4757776685745222,
"learning_rate": 5e-06,
"loss": 0.4481,
"step": 610
},
{
"epoch": 0.34978843441466856,
"grad_norm": 0.5141080166869366,
"learning_rate": 5e-06,
"loss": 0.4522,
"step": 620
},
{
"epoch": 0.3554301833568406,
"grad_norm": 0.521030094448152,
"learning_rate": 5e-06,
"loss": 0.4529,
"step": 630
},
{
"epoch": 0.3610719322990127,
"grad_norm": 0.49616684223591123,
"learning_rate": 5e-06,
"loss": 0.4585,
"step": 640
},
{
"epoch": 0.36671368124118475,
"grad_norm": 0.5224973873990862,
"learning_rate": 5e-06,
"loss": 0.4531,
"step": 650
},
{
"epoch": 0.3723554301833568,
"grad_norm": 0.46606976454004667,
"learning_rate": 5e-06,
"loss": 0.4499,
"step": 660
},
{
"epoch": 0.37799717912552894,
"grad_norm": 0.4631578950745994,
"learning_rate": 5e-06,
"loss": 0.4591,
"step": 670
},
{
"epoch": 0.383638928067701,
"grad_norm": 0.463696350712983,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 680
},
{
"epoch": 0.38928067700987307,
"grad_norm": 0.49700726007271695,
"learning_rate": 5e-06,
"loss": 0.4419,
"step": 690
},
{
"epoch": 0.39492242595204513,
"grad_norm": 0.5047528462302425,
"learning_rate": 5e-06,
"loss": 0.4546,
"step": 700
},
{
"epoch": 0.4005641748942172,
"grad_norm": 0.4881338305694489,
"learning_rate": 5e-06,
"loss": 0.4415,
"step": 710
},
{
"epoch": 0.40620592383638926,
"grad_norm": 0.4950088901214604,
"learning_rate": 5e-06,
"loss": 0.4467,
"step": 720
},
{
"epoch": 0.4118476727785614,
"grad_norm": 0.48800943523969437,
"learning_rate": 5e-06,
"loss": 0.4617,
"step": 730
},
{
"epoch": 0.41748942172073344,
"grad_norm": 0.4761347013711521,
"learning_rate": 5e-06,
"loss": 0.4455,
"step": 740
},
{
"epoch": 0.4231311706629055,
"grad_norm": 0.4811571918715123,
"learning_rate": 5e-06,
"loss": 0.4752,
"step": 750
},
{
"epoch": 0.4287729196050776,
"grad_norm": 0.4785173629798188,
"learning_rate": 5e-06,
"loss": 0.4312,
"step": 760
},
{
"epoch": 0.43441466854724964,
"grad_norm": 0.499757446583109,
"learning_rate": 5e-06,
"loss": 0.4522,
"step": 770
},
{
"epoch": 0.4400564174894217,
"grad_norm": 0.5007042680003394,
"learning_rate": 5e-06,
"loss": 0.4547,
"step": 780
},
{
"epoch": 0.44569816643159377,
"grad_norm": 0.4832215616215704,
"learning_rate": 5e-06,
"loss": 0.4328,
"step": 790
},
{
"epoch": 0.4513399153737659,
"grad_norm": 0.4556785539804432,
"learning_rate": 5e-06,
"loss": 0.4526,
"step": 800
},
{
"epoch": 0.45698166431593795,
"grad_norm": 0.4583262040596829,
"learning_rate": 5e-06,
"loss": 0.4543,
"step": 810
},
{
"epoch": 0.46262341325811,
"grad_norm": 0.47568673401701195,
"learning_rate": 5e-06,
"loss": 0.4489,
"step": 820
},
{
"epoch": 0.4682651622002821,
"grad_norm": 0.5099408600605224,
"learning_rate": 5e-06,
"loss": 0.4635,
"step": 830
},
{
"epoch": 0.47390691114245415,
"grad_norm": 0.48286512485005056,
"learning_rate": 5e-06,
"loss": 0.4414,
"step": 840
},
{
"epoch": 0.4795486600846262,
"grad_norm": 0.4662493732462359,
"learning_rate": 5e-06,
"loss": 0.4702,
"step": 850
},
{
"epoch": 0.48519040902679833,
"grad_norm": 0.4688957523663092,
"learning_rate": 5e-06,
"loss": 0.452,
"step": 860
},
{
"epoch": 0.4908321579689704,
"grad_norm": 0.5020197782038716,
"learning_rate": 5e-06,
"loss": 0.4401,
"step": 870
},
{
"epoch": 0.49647390691114246,
"grad_norm": 0.5293932494261749,
"learning_rate": 5e-06,
"loss": 0.4594,
"step": 880
},
{
"epoch": 0.5021156558533145,
"grad_norm": 0.5071632389201434,
"learning_rate": 5e-06,
"loss": 0.4445,
"step": 890
},
{
"epoch": 0.5077574047954866,
"grad_norm": 0.4878558607219793,
"learning_rate": 5e-06,
"loss": 0.4419,
"step": 900
},
{
"epoch": 0.5133991537376587,
"grad_norm": 0.463852809384172,
"learning_rate": 5e-06,
"loss": 0.4476,
"step": 910
},
{
"epoch": 0.5190409026798307,
"grad_norm": 0.4552138421616773,
"learning_rate": 5e-06,
"loss": 0.4477,
"step": 920
},
{
"epoch": 0.5246826516220028,
"grad_norm": 0.4451067953129034,
"learning_rate": 5e-06,
"loss": 0.4602,
"step": 930
},
{
"epoch": 0.5303244005641748,
"grad_norm": 0.4892614287238877,
"learning_rate": 5e-06,
"loss": 0.4463,
"step": 940
},
{
"epoch": 0.535966149506347,
"grad_norm": 0.5203275452886463,
"learning_rate": 5e-06,
"loss": 0.4481,
"step": 950
},
{
"epoch": 0.5416078984485191,
"grad_norm": 0.4993491544629748,
"learning_rate": 5e-06,
"loss": 0.4547,
"step": 960
},
{
"epoch": 0.5472496473906912,
"grad_norm": 0.4530550836722553,
"learning_rate": 5e-06,
"loss": 0.4342,
"step": 970
},
{
"epoch": 0.5528913963328632,
"grad_norm": 0.47205399956664723,
"learning_rate": 5e-06,
"loss": 0.4391,
"step": 980
},
{
"epoch": 0.5585331452750353,
"grad_norm": 0.5288042301017725,
"learning_rate": 5e-06,
"loss": 0.4477,
"step": 990
},
{
"epoch": 0.5641748942172073,
"grad_norm": 0.5024574538810612,
"learning_rate": 5e-06,
"loss": 0.4644,
"step": 1000
},
{
"epoch": 0.5698166431593794,
"grad_norm": 0.46444873871572295,
"learning_rate": 5e-06,
"loss": 0.442,
"step": 1010
},
{
"epoch": 0.5754583921015515,
"grad_norm": 0.48362451913417076,
"learning_rate": 5e-06,
"loss": 0.4363,
"step": 1020
},
{
"epoch": 0.5811001410437235,
"grad_norm": 0.48683768680972256,
"learning_rate": 5e-06,
"loss": 0.4523,
"step": 1030
},
{
"epoch": 0.5867418899858956,
"grad_norm": 0.4753500530255471,
"learning_rate": 5e-06,
"loss": 0.4452,
"step": 1040
},
{
"epoch": 0.5923836389280677,
"grad_norm": 0.494982125586109,
"learning_rate": 5e-06,
"loss": 0.4504,
"step": 1050
},
{
"epoch": 0.5980253878702397,
"grad_norm": 0.4658594939623635,
"learning_rate": 5e-06,
"loss": 0.4414,
"step": 1060
},
{
"epoch": 0.6036671368124118,
"grad_norm": 0.4576854005593855,
"learning_rate": 5e-06,
"loss": 0.4336,
"step": 1070
},
{
"epoch": 0.609308885754584,
"grad_norm": 0.47667736492718527,
"learning_rate": 5e-06,
"loss": 0.4446,
"step": 1080
},
{
"epoch": 0.614950634696756,
"grad_norm": 0.49049704641298675,
"learning_rate": 5e-06,
"loss": 0.452,
"step": 1090
},
{
"epoch": 0.6205923836389281,
"grad_norm": 0.47137093184915657,
"learning_rate": 5e-06,
"loss": 0.4756,
"step": 1100
},
{
"epoch": 0.6262341325811002,
"grad_norm": 0.48219417585137514,
"learning_rate": 5e-06,
"loss": 0.4427,
"step": 1110
},
{
"epoch": 0.6318758815232722,
"grad_norm": 0.453987324341643,
"learning_rate": 5e-06,
"loss": 0.4323,
"step": 1120
},
{
"epoch": 0.6375176304654443,
"grad_norm": 0.5092221096631693,
"learning_rate": 5e-06,
"loss": 0.4488,
"step": 1130
},
{
"epoch": 0.6431593794076164,
"grad_norm": 0.5005212580369779,
"learning_rate": 5e-06,
"loss": 0.4611,
"step": 1140
},
{
"epoch": 0.6488011283497884,
"grad_norm": 0.5259263069942747,
"learning_rate": 5e-06,
"loss": 0.4438,
"step": 1150
},
{
"epoch": 0.6544428772919605,
"grad_norm": 0.4915487101147822,
"learning_rate": 5e-06,
"loss": 0.4452,
"step": 1160
},
{
"epoch": 0.6600846262341326,
"grad_norm": 0.4636364534332318,
"learning_rate": 5e-06,
"loss": 0.4397,
"step": 1170
},
{
"epoch": 0.6657263751763046,
"grad_norm": 0.4698556111548417,
"learning_rate": 5e-06,
"loss": 0.4417,
"step": 1180
},
{
"epoch": 0.6713681241184767,
"grad_norm": 0.5329347792411113,
"learning_rate": 5e-06,
"loss": 0.4539,
"step": 1190
},
{
"epoch": 0.6770098730606487,
"grad_norm": 0.5126424852624655,
"learning_rate": 5e-06,
"loss": 0.4429,
"step": 1200
},
{
"epoch": 0.6826516220028209,
"grad_norm": 0.4600428689947934,
"learning_rate": 5e-06,
"loss": 0.4481,
"step": 1210
},
{
"epoch": 0.688293370944993,
"grad_norm": 0.4918232014874478,
"learning_rate": 5e-06,
"loss": 0.44,
"step": 1220
},
{
"epoch": 0.693935119887165,
"grad_norm": 0.5091072490058565,
"learning_rate": 5e-06,
"loss": 0.443,
"step": 1230
},
{
"epoch": 0.6995768688293371,
"grad_norm": 0.5086478162333048,
"learning_rate": 5e-06,
"loss": 0.4439,
"step": 1240
},
{
"epoch": 0.7052186177715092,
"grad_norm": 0.47954449181032316,
"learning_rate": 5e-06,
"loss": 0.4252,
"step": 1250
},
{
"epoch": 0.7108603667136812,
"grad_norm": 0.46596459050514427,
"learning_rate": 5e-06,
"loss": 0.4482,
"step": 1260
},
{
"epoch": 0.7165021156558533,
"grad_norm": 0.46248125242410526,
"learning_rate": 5e-06,
"loss": 0.4402,
"step": 1270
},
{
"epoch": 0.7221438645980254,
"grad_norm": 0.49235084627255177,
"learning_rate": 5e-06,
"loss": 0.4368,
"step": 1280
},
{
"epoch": 0.7277856135401974,
"grad_norm": 0.4864015478165713,
"learning_rate": 5e-06,
"loss": 0.4577,
"step": 1290
},
{
"epoch": 0.7334273624823695,
"grad_norm": 0.5066841927831519,
"learning_rate": 5e-06,
"loss": 0.4527,
"step": 1300
},
{
"epoch": 0.7390691114245416,
"grad_norm": 0.4767296270599191,
"learning_rate": 5e-06,
"loss": 0.4421,
"step": 1310
},
{
"epoch": 0.7447108603667136,
"grad_norm": 0.4770443766109164,
"learning_rate": 5e-06,
"loss": 0.4548,
"step": 1320
},
{
"epoch": 0.7503526093088858,
"grad_norm": 0.4792819993673282,
"learning_rate": 5e-06,
"loss": 0.4349,
"step": 1330
},
{
"epoch": 0.7559943582510579,
"grad_norm": 0.48987632661924885,
"learning_rate": 5e-06,
"loss": 0.4378,
"step": 1340
},
{
"epoch": 0.7616361071932299,
"grad_norm": 0.4896409271912306,
"learning_rate": 5e-06,
"loss": 0.4408,
"step": 1350
},
{
"epoch": 0.767277856135402,
"grad_norm": 0.5370347277468178,
"learning_rate": 5e-06,
"loss": 0.4456,
"step": 1360
},
{
"epoch": 0.7729196050775741,
"grad_norm": 0.5032968949454037,
"learning_rate": 5e-06,
"loss": 0.4473,
"step": 1370
},
{
"epoch": 0.7785613540197461,
"grad_norm": 0.48685319139056404,
"learning_rate": 5e-06,
"loss": 0.4434,
"step": 1380
},
{
"epoch": 0.7842031029619182,
"grad_norm": 0.49748304716726394,
"learning_rate": 5e-06,
"loss": 0.4296,
"step": 1390
},
{
"epoch": 0.7898448519040903,
"grad_norm": 0.48733408476356,
"learning_rate": 5e-06,
"loss": 0.4447,
"step": 1400
},
{
"epoch": 0.7954866008462623,
"grad_norm": 0.5053450525255075,
"learning_rate": 5e-06,
"loss": 0.437,
"step": 1410
},
{
"epoch": 0.8011283497884344,
"grad_norm": 0.5051373461963963,
"learning_rate": 5e-06,
"loss": 0.4517,
"step": 1420
},
{
"epoch": 0.8067700987306065,
"grad_norm": 0.5031702066693102,
"learning_rate": 5e-06,
"loss": 0.4458,
"step": 1430
},
{
"epoch": 0.8124118476727785,
"grad_norm": 0.5185876273657516,
"learning_rate": 5e-06,
"loss": 0.4542,
"step": 1440
},
{
"epoch": 0.8180535966149506,
"grad_norm": 0.49124261927260193,
"learning_rate": 5e-06,
"loss": 0.4405,
"step": 1450
},
{
"epoch": 0.8236953455571228,
"grad_norm": 0.49751086570325753,
"learning_rate": 5e-06,
"loss": 0.4357,
"step": 1460
},
{
"epoch": 0.8293370944992948,
"grad_norm": 0.4707406079652606,
"learning_rate": 5e-06,
"loss": 0.4404,
"step": 1470
},
{
"epoch": 0.8349788434414669,
"grad_norm": 0.4611322469678821,
"learning_rate": 5e-06,
"loss": 0.4291,
"step": 1480
},
{
"epoch": 0.840620592383639,
"grad_norm": 0.46796161325249325,
"learning_rate": 5e-06,
"loss": 0.4446,
"step": 1490
},
{
"epoch": 0.846262341325811,
"grad_norm": 0.5039127016141375,
"learning_rate": 5e-06,
"loss": 0.442,
"step": 1500
},
{
"epoch": 0.8519040902679831,
"grad_norm": 0.4882929849367327,
"learning_rate": 5e-06,
"loss": 0.4427,
"step": 1510
},
{
"epoch": 0.8575458392101551,
"grad_norm": 0.46485028595629135,
"learning_rate": 5e-06,
"loss": 0.4493,
"step": 1520
},
{
"epoch": 0.8631875881523272,
"grad_norm": 0.5261718908487378,
"learning_rate": 5e-06,
"loss": 0.4425,
"step": 1530
},
{
"epoch": 0.8688293370944993,
"grad_norm": 0.5030638236696873,
"learning_rate": 5e-06,
"loss": 0.453,
"step": 1540
},
{
"epoch": 0.8744710860366713,
"grad_norm": 0.47397319099408175,
"learning_rate": 5e-06,
"loss": 0.4667,
"step": 1550
},
{
"epoch": 0.8801128349788434,
"grad_norm": 0.45947257613776293,
"learning_rate": 5e-06,
"loss": 0.4408,
"step": 1560
},
{
"epoch": 0.8857545839210155,
"grad_norm": 0.4886106451240436,
"learning_rate": 5e-06,
"loss": 0.4323,
"step": 1570
},
{
"epoch": 0.8913963328631875,
"grad_norm": 0.46842609789012146,
"learning_rate": 5e-06,
"loss": 0.4385,
"step": 1580
},
{
"epoch": 0.8970380818053597,
"grad_norm": 0.49975332721542237,
"learning_rate": 5e-06,
"loss": 0.4398,
"step": 1590
},
{
"epoch": 0.9026798307475318,
"grad_norm": 0.48527328135804326,
"learning_rate": 5e-06,
"loss": 0.4484,
"step": 1600
},
{
"epoch": 0.9083215796897038,
"grad_norm": 0.49172287584389185,
"learning_rate": 5e-06,
"loss": 0.4463,
"step": 1610
},
{
"epoch": 0.9139633286318759,
"grad_norm": 0.508732362088126,
"learning_rate": 5e-06,
"loss": 0.4395,
"step": 1620
},
{
"epoch": 0.919605077574048,
"grad_norm": 0.47225307145651074,
"learning_rate": 5e-06,
"loss": 0.4548,
"step": 1630
},
{
"epoch": 0.92524682651622,
"grad_norm": 0.46028374293695373,
"learning_rate": 5e-06,
"loss": 0.4402,
"step": 1640
},
{
"epoch": 0.9308885754583921,
"grad_norm": 0.4887795142703319,
"learning_rate": 5e-06,
"loss": 0.4524,
"step": 1650
},
{
"epoch": 0.9365303244005642,
"grad_norm": 0.48414776958913036,
"learning_rate": 5e-06,
"loss": 0.4388,
"step": 1660
},
{
"epoch": 0.9421720733427362,
"grad_norm": 0.47408507089480434,
"learning_rate": 5e-06,
"loss": 0.4373,
"step": 1670
},
{
"epoch": 0.9478138222849083,
"grad_norm": 0.4755919436355295,
"learning_rate": 5e-06,
"loss": 0.4521,
"step": 1680
},
{
"epoch": 0.9534555712270804,
"grad_norm": 0.48600199903202446,
"learning_rate": 5e-06,
"loss": 0.4387,
"step": 1690
},
{
"epoch": 0.9590973201692524,
"grad_norm": 0.4826408864245463,
"learning_rate": 5e-06,
"loss": 0.4474,
"step": 1700
},
{
"epoch": 0.9647390691114246,
"grad_norm": 0.5360459005214712,
"learning_rate": 5e-06,
"loss": 0.4402,
"step": 1710
},
{
"epoch": 0.9703808180535967,
"grad_norm": 0.5267429044967258,
"learning_rate": 5e-06,
"loss": 0.448,
"step": 1720
},
{
"epoch": 0.9760225669957687,
"grad_norm": 0.487975885463895,
"learning_rate": 5e-06,
"loss": 0.4527,
"step": 1730
},
{
"epoch": 0.9816643159379408,
"grad_norm": 0.4656913505732415,
"learning_rate": 5e-06,
"loss": 0.4458,
"step": 1740
},
{
"epoch": 0.9873060648801129,
"grad_norm": 0.48356320486134374,
"learning_rate": 5e-06,
"loss": 0.4565,
"step": 1750
},
{
"epoch": 0.9929478138222849,
"grad_norm": 0.5106136347337831,
"learning_rate": 5e-06,
"loss": 0.4481,
"step": 1760
},
{
"epoch": 0.998589562764457,
"grad_norm": 0.481310325218027,
"learning_rate": 5e-06,
"loss": 0.4318,
"step": 1770
},
{
"epoch": 0.9997179125528914,
"eval_loss": 0.4379998743534088,
"eval_runtime": 445.9549,
"eval_samples_per_second": 26.77,
"eval_steps_per_second": 0.419,
"step": 1772
},
{
"epoch": 1.004231311706629,
"grad_norm": 0.5560347360599398,
"learning_rate": 5e-06,
"loss": 0.4228,
"step": 1780
},
{
"epoch": 1.0098730606488011,
"grad_norm": 0.43343321808918617,
"learning_rate": 5e-06,
"loss": 0.3888,
"step": 1790
},
{
"epoch": 1.0155148095909732,
"grad_norm": 0.49193958815688976,
"learning_rate": 5e-06,
"loss": 0.4081,
"step": 1800
},
{
"epoch": 1.0211565585331452,
"grad_norm": 0.44261196562739774,
"learning_rate": 5e-06,
"loss": 0.4083,
"step": 1810
},
{
"epoch": 1.0267983074753173,
"grad_norm": 0.48715391428811605,
"learning_rate": 5e-06,
"loss": 0.403,
"step": 1820
},
{
"epoch": 1.0324400564174894,
"grad_norm": 0.45617321287848667,
"learning_rate": 5e-06,
"loss": 0.3984,
"step": 1830
},
{
"epoch": 1.0380818053596614,
"grad_norm": 0.46951380049994146,
"learning_rate": 5e-06,
"loss": 0.3908,
"step": 1840
},
{
"epoch": 1.0437235543018335,
"grad_norm": 0.4606776177243496,
"learning_rate": 5e-06,
"loss": 0.3944,
"step": 1850
},
{
"epoch": 1.0493653032440056,
"grad_norm": 0.46717676409843034,
"learning_rate": 5e-06,
"loss": 0.3888,
"step": 1860
},
{
"epoch": 1.0550070521861776,
"grad_norm": 0.4602516664423018,
"learning_rate": 5e-06,
"loss": 0.3936,
"step": 1870
},
{
"epoch": 1.0606488011283497,
"grad_norm": 0.42788829282622504,
"learning_rate": 5e-06,
"loss": 0.3881,
"step": 1880
},
{
"epoch": 1.0662905500705218,
"grad_norm": 0.45508688226916866,
"learning_rate": 5e-06,
"loss": 0.3997,
"step": 1890
},
{
"epoch": 1.071932299012694,
"grad_norm": 0.45167507963707426,
"learning_rate": 5e-06,
"loss": 0.3945,
"step": 1900
},
{
"epoch": 1.077574047954866,
"grad_norm": 0.4638857492654454,
"learning_rate": 5e-06,
"loss": 0.393,
"step": 1910
},
{
"epoch": 1.0832157968970382,
"grad_norm": 0.4565491666336401,
"learning_rate": 5e-06,
"loss": 0.3905,
"step": 1920
},
{
"epoch": 1.0888575458392102,
"grad_norm": 0.4468567209212458,
"learning_rate": 5e-06,
"loss": 0.3945,
"step": 1930
},
{
"epoch": 1.0944992947813823,
"grad_norm": 0.4451125923550269,
"learning_rate": 5e-06,
"loss": 0.3888,
"step": 1940
},
{
"epoch": 1.1001410437235544,
"grad_norm": 0.4700452714699321,
"learning_rate": 5e-06,
"loss": 0.3889,
"step": 1950
},
{
"epoch": 1.1057827926657264,
"grad_norm": 0.4650898761163617,
"learning_rate": 5e-06,
"loss": 0.3883,
"step": 1960
},
{
"epoch": 1.1114245416078985,
"grad_norm": 0.4707832390036078,
"learning_rate": 5e-06,
"loss": 0.4015,
"step": 1970
},
{
"epoch": 1.1170662905500706,
"grad_norm": 0.4743179184075093,
"learning_rate": 5e-06,
"loss": 0.391,
"step": 1980
},
{
"epoch": 1.1227080394922426,
"grad_norm": 0.4823902906366835,
"learning_rate": 5e-06,
"loss": 0.3933,
"step": 1990
},
{
"epoch": 1.1283497884344147,
"grad_norm": 0.4929422166855442,
"learning_rate": 5e-06,
"loss": 0.4033,
"step": 2000
},
{
"epoch": 1.1339915373765868,
"grad_norm": 0.46931415950586963,
"learning_rate": 5e-06,
"loss": 0.3962,
"step": 2010
},
{
"epoch": 1.1396332863187588,
"grad_norm": 0.4716793144119691,
"learning_rate": 5e-06,
"loss": 0.3843,
"step": 2020
},
{
"epoch": 1.1452750352609309,
"grad_norm": 0.46214625180030394,
"learning_rate": 5e-06,
"loss": 0.3896,
"step": 2030
},
{
"epoch": 1.150916784203103,
"grad_norm": 0.4478869441665965,
"learning_rate": 5e-06,
"loss": 0.3903,
"step": 2040
},
{
"epoch": 1.156558533145275,
"grad_norm": 0.47404105806443103,
"learning_rate": 5e-06,
"loss": 0.3923,
"step": 2050
},
{
"epoch": 1.162200282087447,
"grad_norm": 0.4815826404229114,
"learning_rate": 5e-06,
"loss": 0.4137,
"step": 2060
},
{
"epoch": 1.1678420310296191,
"grad_norm": 0.47653645240601855,
"learning_rate": 5e-06,
"loss": 0.3952,
"step": 2070
},
{
"epoch": 1.1734837799717912,
"grad_norm": 0.49644988829819037,
"learning_rate": 5e-06,
"loss": 0.3981,
"step": 2080
},
{
"epoch": 1.1791255289139633,
"grad_norm": 0.46657331353149667,
"learning_rate": 5e-06,
"loss": 0.397,
"step": 2090
},
{
"epoch": 1.1847672778561353,
"grad_norm": 0.4713649930891489,
"learning_rate": 5e-06,
"loss": 0.3987,
"step": 2100
},
{
"epoch": 1.1904090267983074,
"grad_norm": 0.4988957090347967,
"learning_rate": 5e-06,
"loss": 0.3913,
"step": 2110
},
{
"epoch": 1.1960507757404795,
"grad_norm": 0.4644112960714002,
"learning_rate": 5e-06,
"loss": 0.4001,
"step": 2120
},
{
"epoch": 1.2016925246826515,
"grad_norm": 0.4707333720355816,
"learning_rate": 5e-06,
"loss": 0.3968,
"step": 2130
},
{
"epoch": 1.2073342736248236,
"grad_norm": 0.47608463008729884,
"learning_rate": 5e-06,
"loss": 0.4022,
"step": 2140
},
{
"epoch": 1.2129760225669957,
"grad_norm": 0.45996551421943505,
"learning_rate": 5e-06,
"loss": 0.4083,
"step": 2150
},
{
"epoch": 1.2186177715091677,
"grad_norm": 0.45343813144247014,
"learning_rate": 5e-06,
"loss": 0.379,
"step": 2160
},
{
"epoch": 1.22425952045134,
"grad_norm": 0.44526573844484096,
"learning_rate": 5e-06,
"loss": 0.397,
"step": 2170
},
{
"epoch": 1.229901269393512,
"grad_norm": 0.48779112035480804,
"learning_rate": 5e-06,
"loss": 0.3988,
"step": 2180
},
{
"epoch": 1.2355430183356841,
"grad_norm": 0.4487329859304139,
"learning_rate": 5e-06,
"loss": 0.3802,
"step": 2190
},
{
"epoch": 1.2411847672778562,
"grad_norm": 0.47886286342692885,
"learning_rate": 5e-06,
"loss": 0.3966,
"step": 2200
},
{
"epoch": 1.2468265162200283,
"grad_norm": 0.45776874778870136,
"learning_rate": 5e-06,
"loss": 0.3955,
"step": 2210
},
{
"epoch": 1.2524682651622003,
"grad_norm": 0.47257007534396295,
"learning_rate": 5e-06,
"loss": 0.3888,
"step": 2220
},
{
"epoch": 1.2581100141043724,
"grad_norm": 0.46751284003891047,
"learning_rate": 5e-06,
"loss": 0.3969,
"step": 2230
},
{
"epoch": 1.2637517630465445,
"grad_norm": 0.4661158831574023,
"learning_rate": 5e-06,
"loss": 0.4088,
"step": 2240
},
{
"epoch": 1.2693935119887165,
"grad_norm": 0.4394915987852524,
"learning_rate": 5e-06,
"loss": 0.3935,
"step": 2250
},
{
"epoch": 1.2750352609308886,
"grad_norm": 0.45334151132727485,
"learning_rate": 5e-06,
"loss": 0.3944,
"step": 2260
},
{
"epoch": 1.2806770098730607,
"grad_norm": 0.5078200971616262,
"learning_rate": 5e-06,
"loss": 0.3905,
"step": 2270
},
{
"epoch": 1.2863187588152327,
"grad_norm": 0.4713106460600115,
"learning_rate": 5e-06,
"loss": 0.3955,
"step": 2280
},
{
"epoch": 1.2919605077574048,
"grad_norm": 0.4635282807772546,
"learning_rate": 5e-06,
"loss": 0.4013,
"step": 2290
},
{
"epoch": 1.2976022566995769,
"grad_norm": 0.48334074568481694,
"learning_rate": 5e-06,
"loss": 0.4005,
"step": 2300
},
{
"epoch": 1.303244005641749,
"grad_norm": 0.48456675280641903,
"learning_rate": 5e-06,
"loss": 0.3933,
"step": 2310
},
{
"epoch": 1.308885754583921,
"grad_norm": 0.46200542060106936,
"learning_rate": 5e-06,
"loss": 0.3835,
"step": 2320
},
{
"epoch": 1.314527503526093,
"grad_norm": 0.4815654441432598,
"learning_rate": 5e-06,
"loss": 0.4045,
"step": 2330
},
{
"epoch": 1.320169252468265,
"grad_norm": 0.48826822709991896,
"learning_rate": 5e-06,
"loss": 0.4009,
"step": 2340
},
{
"epoch": 1.3258110014104372,
"grad_norm": 0.4716440781629598,
"learning_rate": 5e-06,
"loss": 0.3938,
"step": 2350
},
{
"epoch": 1.3314527503526092,
"grad_norm": 0.4602369354038975,
"learning_rate": 5e-06,
"loss": 0.3928,
"step": 2360
},
{
"epoch": 1.3370944992947813,
"grad_norm": 0.49648382398328583,
"learning_rate": 5e-06,
"loss": 0.4033,
"step": 2370
},
{
"epoch": 1.3427362482369536,
"grad_norm": 0.46739455409641245,
"learning_rate": 5e-06,
"loss": 0.3972,
"step": 2380
},
{
"epoch": 1.3483779971791257,
"grad_norm": 0.443323801929617,
"learning_rate": 5e-06,
"loss": 0.3968,
"step": 2390
},
{
"epoch": 1.3540197461212977,
"grad_norm": 0.4539331304987661,
"learning_rate": 5e-06,
"loss": 0.3924,
"step": 2400
},
{
"epoch": 1.3596614950634698,
"grad_norm": 0.47781406684365874,
"learning_rate": 5e-06,
"loss": 0.4117,
"step": 2410
},
{
"epoch": 1.3653032440056418,
"grad_norm": 0.45421508901943547,
"learning_rate": 5e-06,
"loss": 0.39,
"step": 2420
},
{
"epoch": 1.370944992947814,
"grad_norm": 0.4630026357905712,
"learning_rate": 5e-06,
"loss": 0.4,
"step": 2430
},
{
"epoch": 1.376586741889986,
"grad_norm": 0.4523702465470941,
"learning_rate": 5e-06,
"loss": 0.4077,
"step": 2440
},
{
"epoch": 1.382228490832158,
"grad_norm": 0.45782219911496075,
"learning_rate": 5e-06,
"loss": 0.3959,
"step": 2450
},
{
"epoch": 1.38787023977433,
"grad_norm": 0.5117871130526895,
"learning_rate": 5e-06,
"loss": 0.4016,
"step": 2460
},
{
"epoch": 1.3935119887165022,
"grad_norm": 0.5315155467795695,
"learning_rate": 5e-06,
"loss": 0.3973,
"step": 2470
},
{
"epoch": 1.3991537376586742,
"grad_norm": 0.46939820177172814,
"learning_rate": 5e-06,
"loss": 0.3942,
"step": 2480
},
{
"epoch": 1.4047954866008463,
"grad_norm": 0.4543328677593474,
"learning_rate": 5e-06,
"loss": 0.3998,
"step": 2490
},
{
"epoch": 1.4104372355430184,
"grad_norm": 0.4496789872069011,
"learning_rate": 5e-06,
"loss": 0.3963,
"step": 2500
},
{
"epoch": 1.4160789844851904,
"grad_norm": 0.471782994083793,
"learning_rate": 5e-06,
"loss": 0.3885,
"step": 2510
},
{
"epoch": 1.4217207334273625,
"grad_norm": 0.4693960167487946,
"learning_rate": 5e-06,
"loss": 0.3917,
"step": 2520
},
{
"epoch": 1.4273624823695346,
"grad_norm": 0.4538271312417869,
"learning_rate": 5e-06,
"loss": 0.3967,
"step": 2530
},
{
"epoch": 1.4330042313117066,
"grad_norm": 0.44186919531244484,
"learning_rate": 5e-06,
"loss": 0.395,
"step": 2540
},
{
"epoch": 1.4386459802538787,
"grad_norm": 0.4687203668363411,
"learning_rate": 5e-06,
"loss": 0.3981,
"step": 2550
},
{
"epoch": 1.4442877291960508,
"grad_norm": 0.4587106074858165,
"learning_rate": 5e-06,
"loss": 0.3925,
"step": 2560
},
{
"epoch": 1.4499294781382228,
"grad_norm": 0.4572379861998615,
"learning_rate": 5e-06,
"loss": 0.4025,
"step": 2570
},
{
"epoch": 1.4555712270803949,
"grad_norm": 0.4718970269023812,
"learning_rate": 5e-06,
"loss": 0.3943,
"step": 2580
},
{
"epoch": 1.461212976022567,
"grad_norm": 0.5041632079079171,
"learning_rate": 5e-06,
"loss": 0.4082,
"step": 2590
},
{
"epoch": 1.466854724964739,
"grad_norm": 0.4499939474661196,
"learning_rate": 5e-06,
"loss": 0.4006,
"step": 2600
},
{
"epoch": 1.472496473906911,
"grad_norm": 0.44298819801342293,
"learning_rate": 5e-06,
"loss": 0.4043,
"step": 2610
},
{
"epoch": 1.4781382228490831,
"grad_norm": 0.4779382761973526,
"learning_rate": 5e-06,
"loss": 0.4056,
"step": 2620
},
{
"epoch": 1.4837799717912552,
"grad_norm": 0.47692712159702133,
"learning_rate": 5e-06,
"loss": 0.3907,
"step": 2630
},
{
"epoch": 1.4894217207334273,
"grad_norm": 0.46307783252557594,
"learning_rate": 5e-06,
"loss": 0.3959,
"step": 2640
},
{
"epoch": 1.4950634696755993,
"grad_norm": 0.4606066509600136,
"learning_rate": 5e-06,
"loss": 0.3934,
"step": 2650
},
{
"epoch": 1.5007052186177714,
"grad_norm": 0.47324134946913055,
"learning_rate": 5e-06,
"loss": 0.4035,
"step": 2660
},
{
"epoch": 1.5063469675599435,
"grad_norm": 0.46378369582677753,
"learning_rate": 5e-06,
"loss": 0.3993,
"step": 2670
},
{
"epoch": 1.5119887165021155,
"grad_norm": 0.446704021535711,
"learning_rate": 5e-06,
"loss": 0.3918,
"step": 2680
},
{
"epoch": 1.5176304654442876,
"grad_norm": 0.4914697095674317,
"learning_rate": 5e-06,
"loss": 0.3909,
"step": 2690
},
{
"epoch": 1.5232722143864597,
"grad_norm": 0.45218861250501363,
"learning_rate": 5e-06,
"loss": 0.3891,
"step": 2700
},
{
"epoch": 1.5289139633286317,
"grad_norm": 0.47296389203246264,
"learning_rate": 5e-06,
"loss": 0.4063,
"step": 2710
},
{
"epoch": 1.5345557122708038,
"grad_norm": 0.4565578982324502,
"learning_rate": 5e-06,
"loss": 0.4004,
"step": 2720
},
{
"epoch": 1.540197461212976,
"grad_norm": 0.4682402541291805,
"learning_rate": 5e-06,
"loss": 0.3926,
"step": 2730
},
{
"epoch": 1.5458392101551481,
"grad_norm": 0.461279370651597,
"learning_rate": 5e-06,
"loss": 0.3896,
"step": 2740
},
{
"epoch": 1.5514809590973202,
"grad_norm": 0.48043898076393116,
"learning_rate": 5e-06,
"loss": 0.3999,
"step": 2750
},
{
"epoch": 1.5571227080394923,
"grad_norm": 0.4940569464772548,
"learning_rate": 5e-06,
"loss": 0.4077,
"step": 2760
},
{
"epoch": 1.5627644569816643,
"grad_norm": 0.507591402814353,
"learning_rate": 5e-06,
"loss": 0.4009,
"step": 2770
},
{
"epoch": 1.5684062059238364,
"grad_norm": 0.4706877768766689,
"learning_rate": 5e-06,
"loss": 0.3924,
"step": 2780
},
{
"epoch": 1.5740479548660085,
"grad_norm": 0.47676776366340334,
"learning_rate": 5e-06,
"loss": 0.3877,
"step": 2790
},
{
"epoch": 1.5796897038081805,
"grad_norm": 0.5182129400923178,
"learning_rate": 5e-06,
"loss": 0.4036,
"step": 2800
},
{
"epoch": 1.5853314527503526,
"grad_norm": 0.48116837090131204,
"learning_rate": 5e-06,
"loss": 0.3975,
"step": 2810
},
{
"epoch": 1.5909732016925247,
"grad_norm": 0.4275386122917158,
"learning_rate": 5e-06,
"loss": 0.3918,
"step": 2820
},
{
"epoch": 1.5966149506346967,
"grad_norm": 0.4558363289192502,
"learning_rate": 5e-06,
"loss": 0.3882,
"step": 2830
},
{
"epoch": 1.6022566995768688,
"grad_norm": 0.46362140028375165,
"learning_rate": 5e-06,
"loss": 0.3986,
"step": 2840
},
{
"epoch": 1.607898448519041,
"grad_norm": 0.46282031581127986,
"learning_rate": 5e-06,
"loss": 0.4039,
"step": 2850
},
{
"epoch": 1.6135401974612131,
"grad_norm": 0.46441758640717906,
"learning_rate": 5e-06,
"loss": 0.4025,
"step": 2860
},
{
"epoch": 1.6191819464033852,
"grad_norm": 0.46074665854229274,
"learning_rate": 5e-06,
"loss": 0.3905,
"step": 2870
},
{
"epoch": 1.6248236953455573,
"grad_norm": 0.4777266277891572,
"learning_rate": 5e-06,
"loss": 0.4042,
"step": 2880
},
{
"epoch": 1.6304654442877293,
"grad_norm": 0.45553733285190573,
"learning_rate": 5e-06,
"loss": 0.3962,
"step": 2890
},
{
"epoch": 1.6361071932299014,
"grad_norm": 0.47178767330297033,
"learning_rate": 5e-06,
"loss": 0.3892,
"step": 2900
},
{
"epoch": 1.6417489421720735,
"grad_norm": 0.47359171054395643,
"learning_rate": 5e-06,
"loss": 0.3881,
"step": 2910
},
{
"epoch": 1.6473906911142455,
"grad_norm": 0.46603715950910696,
"learning_rate": 5e-06,
"loss": 0.394,
"step": 2920
},
{
"epoch": 1.6530324400564176,
"grad_norm": 0.4558346803274527,
"learning_rate": 5e-06,
"loss": 0.3972,
"step": 2930
},
{
"epoch": 1.6586741889985896,
"grad_norm": 0.479027856317277,
"learning_rate": 5e-06,
"loss": 0.394,
"step": 2940
},
{
"epoch": 1.6643159379407617,
"grad_norm": 0.4643135506673433,
"learning_rate": 5e-06,
"loss": 0.3862,
"step": 2950
},
{
"epoch": 1.6699576868829338,
"grad_norm": 0.4619833964547844,
"learning_rate": 5e-06,
"loss": 0.4016,
"step": 2960
},
{
"epoch": 1.6755994358251058,
"grad_norm": 0.44851503998801856,
"learning_rate": 5e-06,
"loss": 0.4095,
"step": 2970
},
{
"epoch": 1.681241184767278,
"grad_norm": 0.4467522582124666,
"learning_rate": 5e-06,
"loss": 0.3999,
"step": 2980
},
{
"epoch": 1.68688293370945,
"grad_norm": 0.472063710369713,
"learning_rate": 5e-06,
"loss": 0.3939,
"step": 2990
},
{
"epoch": 1.692524682651622,
"grad_norm": 0.48009995546325723,
"learning_rate": 5e-06,
"loss": 0.4183,
"step": 3000
},
{
"epoch": 1.698166431593794,
"grad_norm": 0.44768147031308736,
"learning_rate": 5e-06,
"loss": 0.3983,
"step": 3010
},
{
"epoch": 1.7038081805359662,
"grad_norm": 0.476978526816067,
"learning_rate": 5e-06,
"loss": 0.3951,
"step": 3020
},
{
"epoch": 1.7094499294781382,
"grad_norm": 0.4696465520344116,
"learning_rate": 5e-06,
"loss": 0.3961,
"step": 3030
},
{
"epoch": 1.7150916784203103,
"grad_norm": 0.4576648300580514,
"learning_rate": 5e-06,
"loss": 0.3942,
"step": 3040
},
{
"epoch": 1.7207334273624824,
"grad_norm": 0.4731142162264191,
"learning_rate": 5e-06,
"loss": 0.3946,
"step": 3050
},
{
"epoch": 1.7263751763046544,
"grad_norm": 0.4756579121422872,
"learning_rate": 5e-06,
"loss": 0.3925,
"step": 3060
},
{
"epoch": 1.7320169252468265,
"grad_norm": 0.46052906775460756,
"learning_rate": 5e-06,
"loss": 0.3959,
"step": 3070
},
{
"epoch": 1.7376586741889986,
"grad_norm": 0.43627250154239816,
"learning_rate": 5e-06,
"loss": 0.4012,
"step": 3080
},
{
"epoch": 1.7433004231311706,
"grad_norm": 0.4820135003105483,
"learning_rate": 5e-06,
"loss": 0.3881,
"step": 3090
},
{
"epoch": 1.7489421720733427,
"grad_norm": 0.4778018041594859,
"learning_rate": 5e-06,
"loss": 0.4,
"step": 3100
},
{
"epoch": 1.7545839210155147,
"grad_norm": 0.4884888164729651,
"learning_rate": 5e-06,
"loss": 0.3891,
"step": 3110
},
{
"epoch": 1.7602256699576868,
"grad_norm": 0.4475592439449893,
"learning_rate": 5e-06,
"loss": 0.3926,
"step": 3120
},
{
"epoch": 1.7658674188998589,
"grad_norm": 0.47654858360039826,
"learning_rate": 5e-06,
"loss": 0.419,
"step": 3130
},
{
"epoch": 1.771509167842031,
"grad_norm": 0.4555878766506712,
"learning_rate": 5e-06,
"loss": 0.3962,
"step": 3140
},
{
"epoch": 1.777150916784203,
"grad_norm": 0.46471151818843526,
"learning_rate": 5e-06,
"loss": 0.3956,
"step": 3150
},
{
"epoch": 1.782792665726375,
"grad_norm": 0.47449169536040453,
"learning_rate": 5e-06,
"loss": 0.4006,
"step": 3160
},
{
"epoch": 1.7884344146685471,
"grad_norm": 0.4602783138679876,
"learning_rate": 5e-06,
"loss": 0.3917,
"step": 3170
},
{
"epoch": 1.7940761636107192,
"grad_norm": 0.472002669632583,
"learning_rate": 5e-06,
"loss": 0.4085,
"step": 3180
},
{
"epoch": 1.7997179125528913,
"grad_norm": 0.4675920743304564,
"learning_rate": 5e-06,
"loss": 0.4003,
"step": 3190
},
{
"epoch": 1.8053596614950633,
"grad_norm": 0.4615694339610234,
"learning_rate": 5e-06,
"loss": 0.3955,
"step": 3200
},
{
"epoch": 1.8110014104372354,
"grad_norm": 0.4628092696805375,
"learning_rate": 5e-06,
"loss": 0.4016,
"step": 3210
},
{
"epoch": 1.8166431593794075,
"grad_norm": 0.5254614489605429,
"learning_rate": 5e-06,
"loss": 0.4033,
"step": 3220
},
{
"epoch": 1.8222849083215797,
"grad_norm": 0.44928160369491393,
"learning_rate": 5e-06,
"loss": 0.3927,
"step": 3230
},
{
"epoch": 1.8279266572637518,
"grad_norm": 0.4956917940016818,
"learning_rate": 5e-06,
"loss": 0.3963,
"step": 3240
},
{
"epoch": 1.8335684062059239,
"grad_norm": 0.45361998350750077,
"learning_rate": 5e-06,
"loss": 0.3983,
"step": 3250
},
{
"epoch": 1.839210155148096,
"grad_norm": 0.4515003704595137,
"learning_rate": 5e-06,
"loss": 0.3963,
"step": 3260
},
{
"epoch": 1.844851904090268,
"grad_norm": 0.4556275342458607,
"learning_rate": 5e-06,
"loss": 0.398,
"step": 3270
},
{
"epoch": 1.85049365303244,
"grad_norm": 0.46890202055080116,
"learning_rate": 5e-06,
"loss": 0.3804,
"step": 3280
},
{
"epoch": 1.8561354019746121,
"grad_norm": 0.4567048420478033,
"learning_rate": 5e-06,
"loss": 0.3866,
"step": 3290
},
{
"epoch": 1.8617771509167842,
"grad_norm": 0.46010370992720745,
"learning_rate": 5e-06,
"loss": 0.4071,
"step": 3300
},
{
"epoch": 1.8674188998589563,
"grad_norm": 0.46330780688841133,
"learning_rate": 5e-06,
"loss": 0.4007,
"step": 3310
},
{
"epoch": 1.8730606488011283,
"grad_norm": 0.467515061321271,
"learning_rate": 5e-06,
"loss": 0.3958,
"step": 3320
},
{
"epoch": 1.8787023977433004,
"grad_norm": 0.4478417663578568,
"learning_rate": 5e-06,
"loss": 0.3965,
"step": 3330
},
{
"epoch": 1.8843441466854725,
"grad_norm": 0.46131439541886865,
"learning_rate": 5e-06,
"loss": 0.3972,
"step": 3340
},
{
"epoch": 1.8899858956276445,
"grad_norm": 0.4649363714764279,
"learning_rate": 5e-06,
"loss": 0.3943,
"step": 3350
},
{
"epoch": 1.8956276445698168,
"grad_norm": 0.46303194795992636,
"learning_rate": 5e-06,
"loss": 0.391,
"step": 3360
},
{
"epoch": 1.9012693935119889,
"grad_norm": 0.42878941947013166,
"learning_rate": 5e-06,
"loss": 0.4039,
"step": 3370
},
{
"epoch": 1.906911142454161,
"grad_norm": 0.4725709423896906,
"learning_rate": 5e-06,
"loss": 0.3988,
"step": 3380
},
{
"epoch": 1.912552891396333,
"grad_norm": 0.47671368663777564,
"learning_rate": 5e-06,
"loss": 0.3884,
"step": 3390
},
{
"epoch": 1.918194640338505,
"grad_norm": 0.46668210559071105,
"learning_rate": 5e-06,
"loss": 0.4066,
"step": 3400
},
{
"epoch": 1.9238363892806771,
"grad_norm": 0.4572223756340763,
"learning_rate": 5e-06,
"loss": 0.3973,
"step": 3410
},
{
"epoch": 1.9294781382228492,
"grad_norm": 0.45902290859441564,
"learning_rate": 5e-06,
"loss": 0.3929,
"step": 3420
},
{
"epoch": 1.9351198871650213,
"grad_norm": 0.458044072628747,
"learning_rate": 5e-06,
"loss": 0.3857,
"step": 3430
},
{
"epoch": 1.9407616361071933,
"grad_norm": 0.4693435523479085,
"learning_rate": 5e-06,
"loss": 0.3904,
"step": 3440
},
{
"epoch": 1.9464033850493654,
"grad_norm": 0.4525757784211967,
"learning_rate": 5e-06,
"loss": 0.3901,
"step": 3450
},
{
"epoch": 1.9520451339915375,
"grad_norm": 0.4527753194974229,
"learning_rate": 5e-06,
"loss": 0.3883,
"step": 3460
},
{
"epoch": 1.9576868829337095,
"grad_norm": 0.45775257683357495,
"learning_rate": 5e-06,
"loss": 0.3933,
"step": 3470
},
{
"epoch": 1.9633286318758816,
"grad_norm": 0.4513845604140946,
"learning_rate": 5e-06,
"loss": 0.4084,
"step": 3480
},
{
"epoch": 1.9689703808180536,
"grad_norm": 0.488458649525053,
"learning_rate": 5e-06,
"loss": 0.3829,
"step": 3490
},
{
"epoch": 1.9746121297602257,
"grad_norm": 0.446090731775206,
"learning_rate": 5e-06,
"loss": 0.3924,
"step": 3500
},
{
"epoch": 1.9802538787023978,
"grad_norm": 0.4709438262355249,
"learning_rate": 5e-06,
"loss": 0.3894,
"step": 3510
},
{
"epoch": 1.9858956276445698,
"grad_norm": 0.4870735211701005,
"learning_rate": 5e-06,
"loss": 0.4027,
"step": 3520
},
{
"epoch": 1.991537376586742,
"grad_norm": 0.4803646418996235,
"learning_rate": 5e-06,
"loss": 0.3859,
"step": 3530
},
{
"epoch": 1.997179125528914,
"grad_norm": 0.4703422698968033,
"learning_rate": 5e-06,
"loss": 0.3944,
"step": 3540
},
{
"epoch": 2.0,
"eval_loss": 0.43374887108802795,
"eval_runtime": 448.6077,
"eval_samples_per_second": 26.611,
"eval_steps_per_second": 0.417,
"step": 3545
},
{
"epoch": 2.002820874471086,
"grad_norm": 0.4413473757815756,
"learning_rate": 5e-06,
"loss": 0.3928,
"step": 3550
},
{
"epoch": 2.008462623413258,
"grad_norm": 0.4522342894282536,
"learning_rate": 5e-06,
"loss": 0.3547,
"step": 3560
},
{
"epoch": 2.01410437235543,
"grad_norm": 0.4290168433856863,
"learning_rate": 5e-06,
"loss": 0.356,
"step": 3570
},
{
"epoch": 2.0197461212976022,
"grad_norm": 0.48082235439755094,
"learning_rate": 5e-06,
"loss": 0.3528,
"step": 3580
},
{
"epoch": 2.0253878702397743,
"grad_norm": 0.45722766485772276,
"learning_rate": 5e-06,
"loss": 0.3442,
"step": 3590
},
{
"epoch": 2.0310296191819464,
"grad_norm": 0.45747195342514013,
"learning_rate": 5e-06,
"loss": 0.3432,
"step": 3600
},
{
"epoch": 2.0366713681241184,
"grad_norm": 0.4358767354369319,
"learning_rate": 5e-06,
"loss": 0.3466,
"step": 3610
},
{
"epoch": 2.0423131170662905,
"grad_norm": 0.45049478931307607,
"learning_rate": 5e-06,
"loss": 0.3504,
"step": 3620
},
{
"epoch": 2.0479548660084625,
"grad_norm": 0.4563642590969261,
"learning_rate": 5e-06,
"loss": 0.3406,
"step": 3630
},
{
"epoch": 2.0535966149506346,
"grad_norm": 0.4312356627894684,
"learning_rate": 5e-06,
"loss": 0.3431,
"step": 3640
},
{
"epoch": 2.0592383638928067,
"grad_norm": 0.47281064022880187,
"learning_rate": 5e-06,
"loss": 0.3492,
"step": 3650
},
{
"epoch": 2.0648801128349787,
"grad_norm": 0.4435651181299306,
"learning_rate": 5e-06,
"loss": 0.3411,
"step": 3660
},
{
"epoch": 2.070521861777151,
"grad_norm": 0.4694545955857111,
"learning_rate": 5e-06,
"loss": 0.3476,
"step": 3670
},
{
"epoch": 2.076163610719323,
"grad_norm": 0.4094871096575717,
"learning_rate": 5e-06,
"loss": 0.3446,
"step": 3680
},
{
"epoch": 2.081805359661495,
"grad_norm": 0.4407127764565404,
"learning_rate": 5e-06,
"loss": 0.3511,
"step": 3690
},
{
"epoch": 2.087447108603667,
"grad_norm": 0.46130441782721465,
"learning_rate": 5e-06,
"loss": 0.3701,
"step": 3700
},
{
"epoch": 2.093088857545839,
"grad_norm": 0.48247411322370654,
"learning_rate": 5e-06,
"loss": 0.3563,
"step": 3710
},
{
"epoch": 2.098730606488011,
"grad_norm": 0.4219187636429636,
"learning_rate": 5e-06,
"loss": 0.3471,
"step": 3720
},
{
"epoch": 2.104372355430183,
"grad_norm": 0.4611444707906527,
"learning_rate": 5e-06,
"loss": 0.3567,
"step": 3730
},
{
"epoch": 2.1100141043723553,
"grad_norm": 0.4553725011658897,
"learning_rate": 5e-06,
"loss": 0.3529,
"step": 3740
},
{
"epoch": 2.1156558533145273,
"grad_norm": 0.4464002785245552,
"learning_rate": 5e-06,
"loss": 0.3501,
"step": 3750
},
{
"epoch": 2.1212976022566994,
"grad_norm": 0.4300513716532807,
"learning_rate": 5e-06,
"loss": 0.3498,
"step": 3760
},
{
"epoch": 2.1269393511988715,
"grad_norm": 0.456511976198015,
"learning_rate": 5e-06,
"loss": 0.3519,
"step": 3770
},
{
"epoch": 2.1325811001410435,
"grad_norm": 0.4233132317201342,
"learning_rate": 5e-06,
"loss": 0.3476,
"step": 3780
},
{
"epoch": 2.138222849083216,
"grad_norm": 0.471119966312247,
"learning_rate": 5e-06,
"loss": 0.3519,
"step": 3790
},
{
"epoch": 2.143864598025388,
"grad_norm": 0.4633272783360576,
"learning_rate": 5e-06,
"loss": 0.3564,
"step": 3800
},
{
"epoch": 2.14950634696756,
"grad_norm": 0.42965266973567434,
"learning_rate": 5e-06,
"loss": 0.3518,
"step": 3810
},
{
"epoch": 2.155148095909732,
"grad_norm": 0.4298353082026474,
"learning_rate": 5e-06,
"loss": 0.3445,
"step": 3820
},
{
"epoch": 2.1607898448519043,
"grad_norm": 0.47739223858312535,
"learning_rate": 5e-06,
"loss": 0.3445,
"step": 3830
},
{
"epoch": 2.1664315937940763,
"grad_norm": 0.4532966358832045,
"learning_rate": 5e-06,
"loss": 0.356,
"step": 3840
},
{
"epoch": 2.1720733427362484,
"grad_norm": 0.4410221778245472,
"learning_rate": 5e-06,
"loss": 0.3662,
"step": 3850
},
{
"epoch": 2.1777150916784205,
"grad_norm": 0.4263307654776881,
"learning_rate": 5e-06,
"loss": 0.3467,
"step": 3860
},
{
"epoch": 2.1833568406205925,
"grad_norm": 0.45149035680132815,
"learning_rate": 5e-06,
"loss": 0.3448,
"step": 3870
},
{
"epoch": 2.1889985895627646,
"grad_norm": 0.4529321307975293,
"learning_rate": 5e-06,
"loss": 0.3477,
"step": 3880
},
{
"epoch": 2.1946403385049367,
"grad_norm": 0.4607065783291932,
"learning_rate": 5e-06,
"loss": 0.3495,
"step": 3890
},
{
"epoch": 2.2002820874471087,
"grad_norm": 0.4602518910582201,
"learning_rate": 5e-06,
"loss": 0.3487,
"step": 3900
},
{
"epoch": 2.205923836389281,
"grad_norm": 0.43692806647452487,
"learning_rate": 5e-06,
"loss": 0.3524,
"step": 3910
},
{
"epoch": 2.211565585331453,
"grad_norm": 0.4552717555185162,
"learning_rate": 5e-06,
"loss": 0.3471,
"step": 3920
},
{
"epoch": 2.217207334273625,
"grad_norm": 0.4525591489683545,
"learning_rate": 5e-06,
"loss": 0.3592,
"step": 3930
},
{
"epoch": 2.222849083215797,
"grad_norm": 0.4404336083861297,
"learning_rate": 5e-06,
"loss": 0.3557,
"step": 3940
},
{
"epoch": 2.228490832157969,
"grad_norm": 0.45114848417343256,
"learning_rate": 5e-06,
"loss": 0.3423,
"step": 3950
},
{
"epoch": 2.234132581100141,
"grad_norm": 0.4370952273186252,
"learning_rate": 5e-06,
"loss": 0.3546,
"step": 3960
},
{
"epoch": 2.239774330042313,
"grad_norm": 0.4435799605060227,
"learning_rate": 5e-06,
"loss": 0.3567,
"step": 3970
},
{
"epoch": 2.2454160789844853,
"grad_norm": 0.4915642595318201,
"learning_rate": 5e-06,
"loss": 0.3584,
"step": 3980
},
{
"epoch": 2.2510578279266573,
"grad_norm": 0.44114555771160074,
"learning_rate": 5e-06,
"loss": 0.3486,
"step": 3990
},
{
"epoch": 2.2566995768688294,
"grad_norm": 0.4749861176607326,
"learning_rate": 5e-06,
"loss": 0.3636,
"step": 4000
},
{
"epoch": 2.2623413258110014,
"grad_norm": 0.46753231180049154,
"learning_rate": 5e-06,
"loss": 0.3537,
"step": 4010
},
{
"epoch": 2.2679830747531735,
"grad_norm": 0.4419176647270738,
"learning_rate": 5e-06,
"loss": 0.3501,
"step": 4020
},
{
"epoch": 2.2736248236953456,
"grad_norm": 0.4587766070955877,
"learning_rate": 5e-06,
"loss": 0.3541,
"step": 4030
},
{
"epoch": 2.2792665726375176,
"grad_norm": 0.43530289551944434,
"learning_rate": 5e-06,
"loss": 0.351,
"step": 4040
},
{
"epoch": 2.2849083215796897,
"grad_norm": 0.4631459072773793,
"learning_rate": 5e-06,
"loss": 0.3485,
"step": 4050
},
{
"epoch": 2.2905500705218618,
"grad_norm": 0.47022508312977196,
"learning_rate": 5e-06,
"loss": 0.3502,
"step": 4060
},
{
"epoch": 2.296191819464034,
"grad_norm": 0.4505813037738865,
"learning_rate": 5e-06,
"loss": 0.3651,
"step": 4070
},
{
"epoch": 2.301833568406206,
"grad_norm": 0.4486825542866407,
"learning_rate": 5e-06,
"loss": 0.3544,
"step": 4080
},
{
"epoch": 2.307475317348378,
"grad_norm": 0.4665999075970455,
"learning_rate": 5e-06,
"loss": 0.3685,
"step": 4090
},
{
"epoch": 2.31311706629055,
"grad_norm": 0.4888288507127307,
"learning_rate": 5e-06,
"loss": 0.3619,
"step": 4100
},
{
"epoch": 2.318758815232722,
"grad_norm": 0.41270795857689285,
"learning_rate": 5e-06,
"loss": 0.352,
"step": 4110
},
{
"epoch": 2.324400564174894,
"grad_norm": 0.4419259104601605,
"learning_rate": 5e-06,
"loss": 0.3383,
"step": 4120
},
{
"epoch": 2.330042313117066,
"grad_norm": 0.46603521309981116,
"learning_rate": 5e-06,
"loss": 0.3536,
"step": 4130
},
{
"epoch": 2.3356840620592383,
"grad_norm": 0.47937141599717065,
"learning_rate": 5e-06,
"loss": 0.3563,
"step": 4140
},
{
"epoch": 2.3413258110014104,
"grad_norm": 0.44765567183470945,
"learning_rate": 5e-06,
"loss": 0.3467,
"step": 4150
},
{
"epoch": 2.3469675599435824,
"grad_norm": 0.4457046351361799,
"learning_rate": 5e-06,
"loss": 0.3503,
"step": 4160
},
{
"epoch": 2.3526093088857545,
"grad_norm": 0.44307837034261943,
"learning_rate": 5e-06,
"loss": 0.352,
"step": 4170
},
{
"epoch": 2.3582510578279265,
"grad_norm": 0.4267219846723022,
"learning_rate": 5e-06,
"loss": 0.3774,
"step": 4180
},
{
"epoch": 2.3638928067700986,
"grad_norm": 0.48122908327097114,
"learning_rate": 5e-06,
"loss": 0.3598,
"step": 4190
},
{
"epoch": 2.3695345557122707,
"grad_norm": 0.45331508297626466,
"learning_rate": 5e-06,
"loss": 0.362,
"step": 4200
},
{
"epoch": 2.3751763046544427,
"grad_norm": 0.4594196615052227,
"learning_rate": 5e-06,
"loss": 0.354,
"step": 4210
},
{
"epoch": 2.380818053596615,
"grad_norm": 0.4495058567180949,
"learning_rate": 5e-06,
"loss": 0.3602,
"step": 4220
},
{
"epoch": 2.386459802538787,
"grad_norm": 0.4676232528999999,
"learning_rate": 5e-06,
"loss": 0.3516,
"step": 4230
},
{
"epoch": 2.392101551480959,
"grad_norm": 0.4663506464819916,
"learning_rate": 5e-06,
"loss": 0.3489,
"step": 4240
},
{
"epoch": 2.397743300423131,
"grad_norm": 0.4377174836018769,
"learning_rate": 5e-06,
"loss": 0.3566,
"step": 4250
},
{
"epoch": 2.403385049365303,
"grad_norm": 0.4410880459267694,
"learning_rate": 5e-06,
"loss": 0.355,
"step": 4260
},
{
"epoch": 2.409026798307475,
"grad_norm": 0.4416031717769207,
"learning_rate": 5e-06,
"loss": 0.3606,
"step": 4270
},
{
"epoch": 2.414668547249647,
"grad_norm": 0.46431459859880975,
"learning_rate": 5e-06,
"loss": 0.3551,
"step": 4280
},
{
"epoch": 2.4203102961918193,
"grad_norm": 0.4603756553070287,
"learning_rate": 5e-06,
"loss": 0.3521,
"step": 4290
},
{
"epoch": 2.4259520451339913,
"grad_norm": 0.43937667290163923,
"learning_rate": 5e-06,
"loss": 0.3736,
"step": 4300
},
{
"epoch": 2.4315937940761634,
"grad_norm": 0.4446699867853186,
"learning_rate": 5e-06,
"loss": 0.3467,
"step": 4310
},
{
"epoch": 2.4372355430183354,
"grad_norm": 0.43626545583793597,
"learning_rate": 5e-06,
"loss": 0.348,
"step": 4320
},
{
"epoch": 2.4428772919605075,
"grad_norm": 0.49173390105039966,
"learning_rate": 5e-06,
"loss": 0.3521,
"step": 4330
},
{
"epoch": 2.44851904090268,
"grad_norm": 0.45995716373861045,
"learning_rate": 5e-06,
"loss": 0.3456,
"step": 4340
},
{
"epoch": 2.454160789844852,
"grad_norm": 0.44249790330543903,
"learning_rate": 5e-06,
"loss": 0.3433,
"step": 4350
},
{
"epoch": 2.459802538787024,
"grad_norm": 0.45560126813520535,
"learning_rate": 5e-06,
"loss": 0.349,
"step": 4360
},
{
"epoch": 2.465444287729196,
"grad_norm": 0.4594734244394021,
"learning_rate": 5e-06,
"loss": 0.3578,
"step": 4370
},
{
"epoch": 2.4710860366713683,
"grad_norm": 0.4572577458846818,
"learning_rate": 5e-06,
"loss": 0.3485,
"step": 4380
},
{
"epoch": 2.4767277856135403,
"grad_norm": 0.44219315631814177,
"learning_rate": 5e-06,
"loss": 0.3471,
"step": 4390
},
{
"epoch": 2.4823695345557124,
"grad_norm": 0.42240807095659477,
"learning_rate": 5e-06,
"loss": 0.3562,
"step": 4400
},
{
"epoch": 2.4880112834978845,
"grad_norm": 0.47229751050541774,
"learning_rate": 5e-06,
"loss": 0.3482,
"step": 4410
},
{
"epoch": 2.4936530324400565,
"grad_norm": 0.4458066836074724,
"learning_rate": 5e-06,
"loss": 0.355,
"step": 4420
},
{
"epoch": 2.4992947813822286,
"grad_norm": 0.4503226109681392,
"learning_rate": 5e-06,
"loss": 0.3538,
"step": 4430
},
{
"epoch": 2.5049365303244007,
"grad_norm": 0.44110608535592855,
"learning_rate": 5e-06,
"loss": 0.3747,
"step": 4440
},
{
"epoch": 2.5105782792665727,
"grad_norm": 0.5109212717715426,
"learning_rate": 5e-06,
"loss": 0.3519,
"step": 4450
},
{
"epoch": 2.516220028208745,
"grad_norm": 0.4249918016919024,
"learning_rate": 5e-06,
"loss": 0.342,
"step": 4460
},
{
"epoch": 2.521861777150917,
"grad_norm": 0.5086305630978165,
"learning_rate": 5e-06,
"loss": 0.3607,
"step": 4470
},
{
"epoch": 2.527503526093089,
"grad_norm": 0.4358509107515122,
"learning_rate": 5e-06,
"loss": 0.3468,
"step": 4480
},
{
"epoch": 2.533145275035261,
"grad_norm": 0.4539559547925107,
"learning_rate": 5e-06,
"loss": 0.3443,
"step": 4490
},
{
"epoch": 2.538787023977433,
"grad_norm": 0.4653862436948121,
"learning_rate": 5e-06,
"loss": 0.3524,
"step": 4500
},
{
"epoch": 2.544428772919605,
"grad_norm": 0.41711716794524994,
"learning_rate": 5e-06,
"loss": 0.3479,
"step": 4510
},
{
"epoch": 2.550070521861777,
"grad_norm": 0.45795092567053497,
"learning_rate": 5e-06,
"loss": 0.3695,
"step": 4520
},
{
"epoch": 2.5557122708039492,
"grad_norm": 0.4724307476878103,
"learning_rate": 5e-06,
"loss": 0.353,
"step": 4530
},
{
"epoch": 2.5613540197461213,
"grad_norm": 0.4667580239368319,
"learning_rate": 5e-06,
"loss": 0.3539,
"step": 4540
},
{
"epoch": 2.5669957686882934,
"grad_norm": 0.4475375918113466,
"learning_rate": 5e-06,
"loss": 0.3598,
"step": 4550
},
{
"epoch": 2.5726375176304654,
"grad_norm": 0.480757840067183,
"learning_rate": 5e-06,
"loss": 0.3485,
"step": 4560
},
{
"epoch": 2.5782792665726375,
"grad_norm": 0.4276733499528922,
"learning_rate": 5e-06,
"loss": 0.3479,
"step": 4570
},
{
"epoch": 2.5839210155148096,
"grad_norm": 0.43537614271812025,
"learning_rate": 5e-06,
"loss": 0.3543,
"step": 4580
},
{
"epoch": 2.5895627644569816,
"grad_norm": 0.42394662487032214,
"learning_rate": 5e-06,
"loss": 0.3475,
"step": 4590
},
{
"epoch": 2.5952045133991537,
"grad_norm": 0.45439257995617655,
"learning_rate": 5e-06,
"loss": 0.3484,
"step": 4600
},
{
"epoch": 2.6008462623413258,
"grad_norm": 0.4463381033101569,
"learning_rate": 5e-06,
"loss": 0.3478,
"step": 4610
},
{
"epoch": 2.606488011283498,
"grad_norm": 0.4651753425049505,
"learning_rate": 5e-06,
"loss": 0.3532,
"step": 4620
},
{
"epoch": 2.61212976022567,
"grad_norm": 0.4725584824422778,
"learning_rate": 5e-06,
"loss": 0.3667,
"step": 4630
},
{
"epoch": 2.617771509167842,
"grad_norm": 0.4496062316974007,
"learning_rate": 5e-06,
"loss": 0.3566,
"step": 4640
},
{
"epoch": 2.623413258110014,
"grad_norm": 0.4301211716374985,
"learning_rate": 5e-06,
"loss": 0.3466,
"step": 4650
},
{
"epoch": 2.629055007052186,
"grad_norm": 0.4567935039875112,
"learning_rate": 5e-06,
"loss": 0.3532,
"step": 4660
},
{
"epoch": 2.634696755994358,
"grad_norm": 0.45514691870247576,
"learning_rate": 5e-06,
"loss": 0.352,
"step": 4670
},
{
"epoch": 2.64033850493653,
"grad_norm": 0.4435768402675874,
"learning_rate": 5e-06,
"loss": 0.3479,
"step": 4680
},
{
"epoch": 2.6459802538787023,
"grad_norm": 0.43799237086382287,
"learning_rate": 5e-06,
"loss": 0.3399,
"step": 4690
},
{
"epoch": 2.6516220028208743,
"grad_norm": 0.45347330937833646,
"learning_rate": 5e-06,
"loss": 0.3496,
"step": 4700
},
{
"epoch": 2.6572637517630464,
"grad_norm": 0.45449617328134695,
"learning_rate": 5e-06,
"loss": 0.3698,
"step": 4710
},
{
"epoch": 2.6629055007052185,
"grad_norm": 0.45514167950119666,
"learning_rate": 5e-06,
"loss": 0.3557,
"step": 4720
},
{
"epoch": 2.6685472496473905,
"grad_norm": 0.45124610082620425,
"learning_rate": 5e-06,
"loss": 0.3555,
"step": 4730
},
{
"epoch": 2.6741889985895626,
"grad_norm": 0.45506617549803663,
"learning_rate": 5e-06,
"loss": 0.3533,
"step": 4740
},
{
"epoch": 2.679830747531735,
"grad_norm": 0.4497891236146143,
"learning_rate": 5e-06,
"loss": 0.3593,
"step": 4750
},
{
"epoch": 2.685472496473907,
"grad_norm": 0.43730277262363093,
"learning_rate": 5e-06,
"loss": 0.35,
"step": 4760
},
{
"epoch": 2.6911142454160792,
"grad_norm": 0.4453843880728269,
"learning_rate": 5e-06,
"loss": 0.3508,
"step": 4770
},
{
"epoch": 2.6967559943582513,
"grad_norm": 0.4551381875534027,
"learning_rate": 5e-06,
"loss": 0.3659,
"step": 4780
},
{
"epoch": 2.7023977433004234,
"grad_norm": 0.44693198155600794,
"learning_rate": 5e-06,
"loss": 0.3488,
"step": 4790
},
{
"epoch": 2.7080394922425954,
"grad_norm": 0.4725411086517588,
"learning_rate": 5e-06,
"loss": 0.3622,
"step": 4800
},
{
"epoch": 2.7136812411847675,
"grad_norm": 0.45435090794835215,
"learning_rate": 5e-06,
"loss": 0.3659,
"step": 4810
},
{
"epoch": 2.7193229901269396,
"grad_norm": 0.40891902748686465,
"learning_rate": 5e-06,
"loss": 0.3521,
"step": 4820
},
{
"epoch": 2.7249647390691116,
"grad_norm": 0.4588622825344602,
"learning_rate": 5e-06,
"loss": 0.3609,
"step": 4830
},
{
"epoch": 2.7306064880112837,
"grad_norm": 0.47220524310687695,
"learning_rate": 5e-06,
"loss": 0.3608,
"step": 4840
},
{
"epoch": 2.7362482369534558,
"grad_norm": 0.4813382330408875,
"learning_rate": 5e-06,
"loss": 0.3516,
"step": 4850
},
{
"epoch": 2.741889985895628,
"grad_norm": 0.44851106014638686,
"learning_rate": 5e-06,
"loss": 0.3575,
"step": 4860
},
{
"epoch": 2.7475317348378,
"grad_norm": 0.4648873406447677,
"learning_rate": 5e-06,
"loss": 0.3508,
"step": 4870
},
{
"epoch": 2.753173483779972,
"grad_norm": 0.44332878174865403,
"learning_rate": 5e-06,
"loss": 0.3548,
"step": 4880
},
{
"epoch": 2.758815232722144,
"grad_norm": 0.44016959273981,
"learning_rate": 5e-06,
"loss": 0.3518,
"step": 4890
},
{
"epoch": 2.764456981664316,
"grad_norm": 0.42469643512821914,
"learning_rate": 5e-06,
"loss": 0.3501,
"step": 4900
},
{
"epoch": 2.770098730606488,
"grad_norm": 0.44842384575861166,
"learning_rate": 5e-06,
"loss": 0.3452,
"step": 4910
},
{
"epoch": 2.77574047954866,
"grad_norm": 0.44453086608294007,
"learning_rate": 5e-06,
"loss": 0.3576,
"step": 4920
},
{
"epoch": 2.7813822284908323,
"grad_norm": 0.4613767704732174,
"learning_rate": 5e-06,
"loss": 0.3669,
"step": 4930
},
{
"epoch": 2.7870239774330043,
"grad_norm": 0.42157749246627113,
"learning_rate": 5e-06,
"loss": 0.3558,
"step": 4940
},
{
"epoch": 2.7926657263751764,
"grad_norm": 0.44623021177861155,
"learning_rate": 5e-06,
"loss": 0.353,
"step": 4950
},
{
"epoch": 2.7983074753173485,
"grad_norm": 0.44511445391899146,
"learning_rate": 5e-06,
"loss": 0.3575,
"step": 4960
},
{
"epoch": 2.8039492242595205,
"grad_norm": 0.4496517977205029,
"learning_rate": 5e-06,
"loss": 0.376,
"step": 4970
},
{
"epoch": 2.8095909732016926,
"grad_norm": 0.4568581481429044,
"learning_rate": 5e-06,
"loss": 0.3552,
"step": 4980
},
{
"epoch": 2.8152327221438647,
"grad_norm": 0.45872415735621647,
"learning_rate": 5e-06,
"loss": 0.3538,
"step": 4990
},
{
"epoch": 2.8208744710860367,
"grad_norm": 0.43280090040022784,
"learning_rate": 5e-06,
"loss": 0.3596,
"step": 5000
},
{
"epoch": 2.826516220028209,
"grad_norm": 0.4271253356285509,
"learning_rate": 5e-06,
"loss": 0.3589,
"step": 5010
},
{
"epoch": 2.832157968970381,
"grad_norm": 0.45509701773858097,
"learning_rate": 5e-06,
"loss": 0.3717,
"step": 5020
},
{
"epoch": 2.837799717912553,
"grad_norm": 0.43287288682215924,
"learning_rate": 5e-06,
"loss": 0.3573,
"step": 5030
},
{
"epoch": 2.843441466854725,
"grad_norm": 0.4688529933224419,
"learning_rate": 5e-06,
"loss": 0.3477,
"step": 5040
},
{
"epoch": 2.849083215796897,
"grad_norm": 0.4331993042860941,
"learning_rate": 5e-06,
"loss": 0.3514,
"step": 5050
},
{
"epoch": 2.854724964739069,
"grad_norm": 0.47629494492943353,
"learning_rate": 5e-06,
"loss": 0.3457,
"step": 5060
},
{
"epoch": 2.860366713681241,
"grad_norm": 0.4547175713111894,
"learning_rate": 5e-06,
"loss": 0.3616,
"step": 5070
},
{
"epoch": 2.8660084626234132,
"grad_norm": 0.4697185774932994,
"learning_rate": 5e-06,
"loss": 0.3527,
"step": 5080
},
{
"epoch": 2.8716502115655853,
"grad_norm": 0.46979390495300094,
"learning_rate": 5e-06,
"loss": 0.367,
"step": 5090
},
{
"epoch": 2.8772919605077574,
"grad_norm": 0.4779125028298598,
"learning_rate": 5e-06,
"loss": 0.3511,
"step": 5100
},
{
"epoch": 2.8829337094499294,
"grad_norm": 0.4974784539605145,
"learning_rate": 5e-06,
"loss": 0.3623,
"step": 5110
},
{
"epoch": 2.8885754583921015,
"grad_norm": 0.4614842753048295,
"learning_rate": 5e-06,
"loss": 0.3495,
"step": 5120
},
{
"epoch": 2.8942172073342736,
"grad_norm": 0.43741541412768414,
"learning_rate": 5e-06,
"loss": 0.3566,
"step": 5130
},
{
"epoch": 2.8998589562764456,
"grad_norm": 0.4611139730639956,
"learning_rate": 5e-06,
"loss": 0.357,
"step": 5140
},
{
"epoch": 2.9055007052186177,
"grad_norm": 0.4584393192245279,
"learning_rate": 5e-06,
"loss": 0.3559,
"step": 5150
},
{
"epoch": 2.9111424541607898,
"grad_norm": 0.4605897500358934,
"learning_rate": 5e-06,
"loss": 0.3599,
"step": 5160
},
{
"epoch": 2.916784203102962,
"grad_norm": 0.5047737206876777,
"learning_rate": 5e-06,
"loss": 0.3554,
"step": 5170
},
{
"epoch": 2.922425952045134,
"grad_norm": 0.43957877748790186,
"learning_rate": 5e-06,
"loss": 0.3553,
"step": 5180
},
{
"epoch": 2.928067700987306,
"grad_norm": 0.41934448745808994,
"learning_rate": 5e-06,
"loss": 0.3537,
"step": 5190
},
{
"epoch": 2.933709449929478,
"grad_norm": 0.4497013017770954,
"learning_rate": 5e-06,
"loss": 0.3591,
"step": 5200
},
{
"epoch": 2.93935119887165,
"grad_norm": 0.46915975111439107,
"learning_rate": 5e-06,
"loss": 0.3561,
"step": 5210
},
{
"epoch": 2.944992947813822,
"grad_norm": 0.4428104855895761,
"learning_rate": 5e-06,
"loss": 0.3633,
"step": 5220
},
{
"epoch": 2.950634696755994,
"grad_norm": 0.448532360201155,
"learning_rate": 5e-06,
"loss": 0.3496,
"step": 5230
},
{
"epoch": 2.9562764456981663,
"grad_norm": 0.47532539519127587,
"learning_rate": 5e-06,
"loss": 0.3484,
"step": 5240
},
{
"epoch": 2.9619181946403383,
"grad_norm": 0.43655270107253413,
"learning_rate": 5e-06,
"loss": 0.3735,
"step": 5250
},
{
"epoch": 2.9675599435825104,
"grad_norm": 0.4654091728547412,
"learning_rate": 5e-06,
"loss": 0.3517,
"step": 5260
},
{
"epoch": 2.9732016925246825,
"grad_norm": 0.48276086071545776,
"learning_rate": 5e-06,
"loss": 0.358,
"step": 5270
},
{
"epoch": 2.9788434414668545,
"grad_norm": 0.4497726059890603,
"learning_rate": 5e-06,
"loss": 0.3743,
"step": 5280
},
{
"epoch": 2.9844851904090266,
"grad_norm": 0.42161219193763577,
"learning_rate": 5e-06,
"loss": 0.3519,
"step": 5290
},
{
"epoch": 2.9901269393511987,
"grad_norm": 0.4593665569282975,
"learning_rate": 5e-06,
"loss": 0.3473,
"step": 5300
},
{
"epoch": 2.9957686882933707,
"grad_norm": 0.4432358435800667,
"learning_rate": 5e-06,
"loss": 0.3623,
"step": 5310
},
{
"epoch": 2.9991537376586743,
"eval_loss": 0.440873384475708,
"eval_runtime": 444.0657,
"eval_samples_per_second": 26.883,
"eval_steps_per_second": 0.421,
"step": 5316
},
{
"epoch": 2.9991537376586743,
"step": 5316,
"total_flos": 2786674505416704.0,
"train_loss": 0.4018145801655057,
"train_runtime": 71328.9114,
"train_samples_per_second": 9.54,
"train_steps_per_second": 0.075
}
],
"logging_steps": 10,
"max_steps": 5316,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2786674505416704.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}