oh_scale_x4_compute_equal / trainer_state.json
sedrickkeh's picture
End of training
9f61e75 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.99849454271735,
"eval_steps": 500,
"global_step": 3984,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0100363818843307,
"grad_norm": 33.33474786117092,
"learning_rate": 5e-06,
"loss": 1.0592,
"step": 10
},
{
"epoch": 0.0200727637686614,
"grad_norm": 1.3624139034064995,
"learning_rate": 5e-06,
"loss": 0.9442,
"step": 20
},
{
"epoch": 0.030109145652992095,
"grad_norm": 1.4119904374590468,
"learning_rate": 5e-06,
"loss": 0.8939,
"step": 30
},
{
"epoch": 0.0401455275373228,
"grad_norm": 1.528739282183849,
"learning_rate": 5e-06,
"loss": 0.8733,
"step": 40
},
{
"epoch": 0.050181909421653494,
"grad_norm": 1.0731508202908417,
"learning_rate": 5e-06,
"loss": 0.8605,
"step": 50
},
{
"epoch": 0.06021829130598419,
"grad_norm": 0.9332984692607159,
"learning_rate": 5e-06,
"loss": 0.8442,
"step": 60
},
{
"epoch": 0.07025467319031489,
"grad_norm": 0.872377007810567,
"learning_rate": 5e-06,
"loss": 0.8326,
"step": 70
},
{
"epoch": 0.0802910550746456,
"grad_norm": 1.1369570969633511,
"learning_rate": 5e-06,
"loss": 0.8255,
"step": 80
},
{
"epoch": 0.09032743695897628,
"grad_norm": 1.2039587178247435,
"learning_rate": 5e-06,
"loss": 0.8131,
"step": 90
},
{
"epoch": 0.10036381884330699,
"grad_norm": 0.7786189445202707,
"learning_rate": 5e-06,
"loss": 0.8089,
"step": 100
},
{
"epoch": 0.11040020072763769,
"grad_norm": 0.8167899418786717,
"learning_rate": 5e-06,
"loss": 0.8015,
"step": 110
},
{
"epoch": 0.12043658261196838,
"grad_norm": 0.7700801877494686,
"learning_rate": 5e-06,
"loss": 0.7984,
"step": 120
},
{
"epoch": 0.13047296449629908,
"grad_norm": 0.7344316347793653,
"learning_rate": 5e-06,
"loss": 0.8001,
"step": 130
},
{
"epoch": 0.14050934638062978,
"grad_norm": 0.8770050061566579,
"learning_rate": 5e-06,
"loss": 0.7932,
"step": 140
},
{
"epoch": 0.1505457282649605,
"grad_norm": 0.996787607804233,
"learning_rate": 5e-06,
"loss": 0.7958,
"step": 150
},
{
"epoch": 0.1605821101492912,
"grad_norm": 0.6341583113229519,
"learning_rate": 5e-06,
"loss": 0.7933,
"step": 160
},
{
"epoch": 0.17061849203362187,
"grad_norm": 0.7057153333092503,
"learning_rate": 5e-06,
"loss": 0.7864,
"step": 170
},
{
"epoch": 0.18065487391795257,
"grad_norm": 0.5875164626639859,
"learning_rate": 5e-06,
"loss": 0.7864,
"step": 180
},
{
"epoch": 0.19069125580228327,
"grad_norm": 0.696758643929393,
"learning_rate": 5e-06,
"loss": 0.7851,
"step": 190
},
{
"epoch": 0.20072763768661397,
"grad_norm": 0.6720306069577169,
"learning_rate": 5e-06,
"loss": 0.782,
"step": 200
},
{
"epoch": 0.21076401957094468,
"grad_norm": 0.7205930987361605,
"learning_rate": 5e-06,
"loss": 0.7802,
"step": 210
},
{
"epoch": 0.22080040145527538,
"grad_norm": 0.7263170071853633,
"learning_rate": 5e-06,
"loss": 0.7766,
"step": 220
},
{
"epoch": 0.23083678333960608,
"grad_norm": 0.6222177030840054,
"learning_rate": 5e-06,
"loss": 0.7811,
"step": 230
},
{
"epoch": 0.24087316522393676,
"grad_norm": 0.7933389948657629,
"learning_rate": 5e-06,
"loss": 0.7785,
"step": 240
},
{
"epoch": 0.25090954710826746,
"grad_norm": 0.677033021877631,
"learning_rate": 5e-06,
"loss": 0.774,
"step": 250
},
{
"epoch": 0.26094592899259816,
"grad_norm": 1.1592201237448847,
"learning_rate": 5e-06,
"loss": 0.7685,
"step": 260
},
{
"epoch": 0.27098231087692887,
"grad_norm": 0.8219803131042634,
"learning_rate": 5e-06,
"loss": 0.7733,
"step": 270
},
{
"epoch": 0.28101869276125957,
"grad_norm": 0.8862071608668877,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 280
},
{
"epoch": 0.2910550746455903,
"grad_norm": 0.7936049147378009,
"learning_rate": 5e-06,
"loss": 0.7732,
"step": 290
},
{
"epoch": 0.301091456529921,
"grad_norm": 1.0061666072778264,
"learning_rate": 5e-06,
"loss": 0.7694,
"step": 300
},
{
"epoch": 0.3111278384142517,
"grad_norm": 0.8032266175827166,
"learning_rate": 5e-06,
"loss": 0.7667,
"step": 310
},
{
"epoch": 0.3211642202985824,
"grad_norm": 0.6353992902115431,
"learning_rate": 5e-06,
"loss": 0.7651,
"step": 320
},
{
"epoch": 0.3312006021829131,
"grad_norm": 0.771841795504432,
"learning_rate": 5e-06,
"loss": 0.7628,
"step": 330
},
{
"epoch": 0.34123698406724373,
"grad_norm": 0.6845658986967137,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 340
},
{
"epoch": 0.35127336595157443,
"grad_norm": 0.5670233457983912,
"learning_rate": 5e-06,
"loss": 0.7634,
"step": 350
},
{
"epoch": 0.36130974783590514,
"grad_norm": 0.6478342801580839,
"learning_rate": 5e-06,
"loss": 0.7629,
"step": 360
},
{
"epoch": 0.37134612972023584,
"grad_norm": 0.6993852271462582,
"learning_rate": 5e-06,
"loss": 0.7633,
"step": 370
},
{
"epoch": 0.38138251160456654,
"grad_norm": 0.8808430833699983,
"learning_rate": 5e-06,
"loss": 0.7689,
"step": 380
},
{
"epoch": 0.39141889348889725,
"grad_norm": 0.8240632882958299,
"learning_rate": 5e-06,
"loss": 0.7595,
"step": 390
},
{
"epoch": 0.40145527537322795,
"grad_norm": 0.6461166154976471,
"learning_rate": 5e-06,
"loss": 0.758,
"step": 400
},
{
"epoch": 0.41149165725755865,
"grad_norm": 0.6015328676217312,
"learning_rate": 5e-06,
"loss": 0.7615,
"step": 410
},
{
"epoch": 0.42152803914188935,
"grad_norm": 0.6844312177243449,
"learning_rate": 5e-06,
"loss": 0.7604,
"step": 420
},
{
"epoch": 0.43156442102622006,
"grad_norm": 0.5903552392115103,
"learning_rate": 5e-06,
"loss": 0.762,
"step": 430
},
{
"epoch": 0.44160080291055076,
"grad_norm": 0.6188002518762006,
"learning_rate": 5e-06,
"loss": 0.7608,
"step": 440
},
{
"epoch": 0.45163718479488146,
"grad_norm": 0.5972767152570548,
"learning_rate": 5e-06,
"loss": 0.7498,
"step": 450
},
{
"epoch": 0.46167356667921217,
"grad_norm": 0.8557407016909138,
"learning_rate": 5e-06,
"loss": 0.7557,
"step": 460
},
{
"epoch": 0.47170994856354287,
"grad_norm": 0.5850889857419986,
"learning_rate": 5e-06,
"loss": 0.7506,
"step": 470
},
{
"epoch": 0.4817463304478735,
"grad_norm": 0.6245054223045923,
"learning_rate": 5e-06,
"loss": 0.7577,
"step": 480
},
{
"epoch": 0.4917827123322042,
"grad_norm": 0.8464030388596748,
"learning_rate": 5e-06,
"loss": 0.7524,
"step": 490
},
{
"epoch": 0.5018190942165349,
"grad_norm": 0.5975779064483824,
"learning_rate": 5e-06,
"loss": 0.7542,
"step": 500
},
{
"epoch": 0.5118554761008657,
"grad_norm": 0.5557828388284775,
"learning_rate": 5e-06,
"loss": 0.7562,
"step": 510
},
{
"epoch": 0.5218918579851963,
"grad_norm": 0.5041527087050589,
"learning_rate": 5e-06,
"loss": 0.7505,
"step": 520
},
{
"epoch": 0.5319282398695271,
"grad_norm": 0.606175554740277,
"learning_rate": 5e-06,
"loss": 0.7488,
"step": 530
},
{
"epoch": 0.5419646217538577,
"grad_norm": 0.6932655836595772,
"learning_rate": 5e-06,
"loss": 0.7459,
"step": 540
},
{
"epoch": 0.5520010036381884,
"grad_norm": 0.6342619574494085,
"learning_rate": 5e-06,
"loss": 0.748,
"step": 550
},
{
"epoch": 0.5620373855225191,
"grad_norm": 0.7273685186875936,
"learning_rate": 5e-06,
"loss": 0.7514,
"step": 560
},
{
"epoch": 0.5720737674068498,
"grad_norm": 0.6175109686722693,
"learning_rate": 5e-06,
"loss": 0.7487,
"step": 570
},
{
"epoch": 0.5821101492911805,
"grad_norm": 0.5906521692541239,
"learning_rate": 5e-06,
"loss": 0.7444,
"step": 580
},
{
"epoch": 0.5921465311755112,
"grad_norm": 0.5770083684995156,
"learning_rate": 5e-06,
"loss": 0.7463,
"step": 590
},
{
"epoch": 0.602182913059842,
"grad_norm": 0.5611205193626524,
"learning_rate": 5e-06,
"loss": 0.7501,
"step": 600
},
{
"epoch": 0.6122192949441726,
"grad_norm": 0.5626260973998957,
"learning_rate": 5e-06,
"loss": 0.7476,
"step": 610
},
{
"epoch": 0.6222556768285034,
"grad_norm": 0.5359369641368554,
"learning_rate": 5e-06,
"loss": 0.7438,
"step": 620
},
{
"epoch": 0.632292058712834,
"grad_norm": 0.5986655577379103,
"learning_rate": 5e-06,
"loss": 0.7486,
"step": 630
},
{
"epoch": 0.6423284405971648,
"grad_norm": 0.6386177918361722,
"learning_rate": 5e-06,
"loss": 0.7485,
"step": 640
},
{
"epoch": 0.6523648224814954,
"grad_norm": 0.5978315440058417,
"learning_rate": 5e-06,
"loss": 0.7433,
"step": 650
},
{
"epoch": 0.6624012043658262,
"grad_norm": 0.6036098551107931,
"learning_rate": 5e-06,
"loss": 0.7455,
"step": 660
},
{
"epoch": 0.6724375862501568,
"grad_norm": 0.5636202992429957,
"learning_rate": 5e-06,
"loss": 0.7466,
"step": 670
},
{
"epoch": 0.6824739681344875,
"grad_norm": 0.702719868075261,
"learning_rate": 5e-06,
"loss": 0.7423,
"step": 680
},
{
"epoch": 0.6925103500188182,
"grad_norm": 0.5833095780838476,
"learning_rate": 5e-06,
"loss": 0.7398,
"step": 690
},
{
"epoch": 0.7025467319031489,
"grad_norm": 0.6308085334601381,
"learning_rate": 5e-06,
"loss": 0.7429,
"step": 700
},
{
"epoch": 0.7125831137874796,
"grad_norm": 0.5487784356318717,
"learning_rate": 5e-06,
"loss": 0.7411,
"step": 710
},
{
"epoch": 0.7226194956718103,
"grad_norm": 0.5865151577104253,
"learning_rate": 5e-06,
"loss": 0.7428,
"step": 720
},
{
"epoch": 0.732655877556141,
"grad_norm": 0.6872870216284236,
"learning_rate": 5e-06,
"loss": 0.7431,
"step": 730
},
{
"epoch": 0.7426922594404717,
"grad_norm": 0.7183251325599549,
"learning_rate": 5e-06,
"loss": 0.7423,
"step": 740
},
{
"epoch": 0.7527286413248024,
"grad_norm": 0.7454754048807214,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 750
},
{
"epoch": 0.7627650232091331,
"grad_norm": 0.5531190685288089,
"learning_rate": 5e-06,
"loss": 0.743,
"step": 760
},
{
"epoch": 0.7728014050934638,
"grad_norm": 0.5534601111291695,
"learning_rate": 5e-06,
"loss": 0.741,
"step": 770
},
{
"epoch": 0.7828377869777945,
"grad_norm": 0.5724229278027059,
"learning_rate": 5e-06,
"loss": 0.742,
"step": 780
},
{
"epoch": 0.7928741688621253,
"grad_norm": 0.5444775257450564,
"learning_rate": 5e-06,
"loss": 0.7375,
"step": 790
},
{
"epoch": 0.8029105507464559,
"grad_norm": 0.5172792176589391,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 800
},
{
"epoch": 0.8129469326307867,
"grad_norm": 0.5788117817392925,
"learning_rate": 5e-06,
"loss": 0.7376,
"step": 810
},
{
"epoch": 0.8229833145151173,
"grad_norm": 0.5261510224866858,
"learning_rate": 5e-06,
"loss": 0.739,
"step": 820
},
{
"epoch": 0.833019696399448,
"grad_norm": 0.5280046534368359,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 830
},
{
"epoch": 0.8430560782837787,
"grad_norm": 0.5641435748973659,
"learning_rate": 5e-06,
"loss": 0.7349,
"step": 840
},
{
"epoch": 0.8530924601681094,
"grad_norm": 0.557571684728077,
"learning_rate": 5e-06,
"loss": 0.7365,
"step": 850
},
{
"epoch": 0.8631288420524401,
"grad_norm": 0.6296144133240885,
"learning_rate": 5e-06,
"loss": 0.7373,
"step": 860
},
{
"epoch": 0.8731652239367708,
"grad_norm": 0.5966607955551202,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 870
},
{
"epoch": 0.8832016058211015,
"grad_norm": 0.5563914474354427,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 880
},
{
"epoch": 0.8932379877054322,
"grad_norm": 0.536620041860774,
"learning_rate": 5e-06,
"loss": 0.7338,
"step": 890
},
{
"epoch": 0.9032743695897629,
"grad_norm": 0.5387289422962349,
"learning_rate": 5e-06,
"loss": 0.7372,
"step": 900
},
{
"epoch": 0.9133107514740936,
"grad_norm": 0.6013347978585226,
"learning_rate": 5e-06,
"loss": 0.7337,
"step": 910
},
{
"epoch": 0.9233471333584243,
"grad_norm": 0.583886043468759,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 920
},
{
"epoch": 0.933383515242755,
"grad_norm": 0.5382696935521819,
"learning_rate": 5e-06,
"loss": 0.7368,
"step": 930
},
{
"epoch": 0.9434198971270857,
"grad_norm": 0.5653997036590966,
"learning_rate": 5e-06,
"loss": 0.7345,
"step": 940
},
{
"epoch": 0.9534562790114164,
"grad_norm": 0.6567632931315763,
"learning_rate": 5e-06,
"loss": 0.7304,
"step": 950
},
{
"epoch": 0.963492660895747,
"grad_norm": 0.6418916205752738,
"learning_rate": 5e-06,
"loss": 0.7301,
"step": 960
},
{
"epoch": 0.9735290427800778,
"grad_norm": 0.6530499607318279,
"learning_rate": 5e-06,
"loss": 0.7387,
"step": 970
},
{
"epoch": 0.9835654246644084,
"grad_norm": 0.5551081098759132,
"learning_rate": 5e-06,
"loss": 0.7333,
"step": 980
},
{
"epoch": 0.9936018065487392,
"grad_norm": 0.6077836691860066,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 990
},
{
"epoch": 0.9996236356793377,
"eval_loss": 0.7306540608406067,
"eval_runtime": 710.0823,
"eval_samples_per_second": 37.811,
"eval_steps_per_second": 0.591,
"step": 996
},
{
"epoch": 1.0036381884330698,
"grad_norm": 0.7501884083987931,
"learning_rate": 5e-06,
"loss": 0.7631,
"step": 1000
},
{
"epoch": 1.0136745703174006,
"grad_norm": 0.6377477928653016,
"learning_rate": 5e-06,
"loss": 0.6949,
"step": 1010
},
{
"epoch": 1.0237109522017314,
"grad_norm": 0.5049710429815633,
"learning_rate": 5e-06,
"loss": 0.6968,
"step": 1020
},
{
"epoch": 1.033747334086062,
"grad_norm": 0.6837967725702919,
"learning_rate": 5e-06,
"loss": 0.6905,
"step": 1030
},
{
"epoch": 1.0437837159703927,
"grad_norm": 0.5356223402685194,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 1040
},
{
"epoch": 1.0538200978547234,
"grad_norm": 0.8370676577254016,
"learning_rate": 5e-06,
"loss": 0.6901,
"step": 1050
},
{
"epoch": 1.0638564797390542,
"grad_norm": 0.6105636596448625,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 1060
},
{
"epoch": 1.0738928616233847,
"grad_norm": 0.5927877282574158,
"learning_rate": 5e-06,
"loss": 0.6979,
"step": 1070
},
{
"epoch": 1.0839292435077155,
"grad_norm": 0.540102358596148,
"learning_rate": 5e-06,
"loss": 0.6958,
"step": 1080
},
{
"epoch": 1.0939656253920462,
"grad_norm": 0.59126294445945,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 1090
},
{
"epoch": 1.1040020072763768,
"grad_norm": 0.6608825970633089,
"learning_rate": 5e-06,
"loss": 0.6937,
"step": 1100
},
{
"epoch": 1.1140383891607075,
"grad_norm": 0.5164144607050836,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1110
},
{
"epoch": 1.1240747710450383,
"grad_norm": 0.5814527717684509,
"learning_rate": 5e-06,
"loss": 0.6972,
"step": 1120
},
{
"epoch": 1.134111152929369,
"grad_norm": 0.8029370829094067,
"learning_rate": 5e-06,
"loss": 0.6911,
"step": 1130
},
{
"epoch": 1.1441475348136996,
"grad_norm": 0.5560867302991094,
"learning_rate": 5e-06,
"loss": 0.6934,
"step": 1140
},
{
"epoch": 1.1541839166980303,
"grad_norm": 0.5562556075209857,
"learning_rate": 5e-06,
"loss": 0.6966,
"step": 1150
},
{
"epoch": 1.164220298582361,
"grad_norm": 0.5466240170135728,
"learning_rate": 5e-06,
"loss": 0.6905,
"step": 1160
},
{
"epoch": 1.1742566804666918,
"grad_norm": 0.6181691352555871,
"learning_rate": 5e-06,
"loss": 0.6949,
"step": 1170
},
{
"epoch": 1.1842930623510224,
"grad_norm": 0.5816875585540926,
"learning_rate": 5e-06,
"loss": 0.6982,
"step": 1180
},
{
"epoch": 1.1943294442353531,
"grad_norm": 0.6682656613454141,
"learning_rate": 5e-06,
"loss": 0.6975,
"step": 1190
},
{
"epoch": 1.204365826119684,
"grad_norm": 0.5227039044223272,
"learning_rate": 5e-06,
"loss": 0.6938,
"step": 1200
},
{
"epoch": 1.2144022080040147,
"grad_norm": 0.5934796942110975,
"learning_rate": 5e-06,
"loss": 0.6928,
"step": 1210
},
{
"epoch": 1.2244385898883452,
"grad_norm": 0.7797726771495517,
"learning_rate": 5e-06,
"loss": 0.6966,
"step": 1220
},
{
"epoch": 1.234474971772676,
"grad_norm": 0.529604849325101,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 1230
},
{
"epoch": 1.2445113536570067,
"grad_norm": 0.6023982359465729,
"learning_rate": 5e-06,
"loss": 0.6945,
"step": 1240
},
{
"epoch": 1.2545477355413372,
"grad_norm": 0.5560385701612578,
"learning_rate": 5e-06,
"loss": 0.6944,
"step": 1250
},
{
"epoch": 1.264584117425668,
"grad_norm": 0.5283876353062209,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 1260
},
{
"epoch": 1.2746204993099988,
"grad_norm": 0.673569525320333,
"learning_rate": 5e-06,
"loss": 0.6957,
"step": 1270
},
{
"epoch": 1.2846568811943295,
"grad_norm": 0.5956349315336121,
"learning_rate": 5e-06,
"loss": 0.6919,
"step": 1280
},
{
"epoch": 1.29469326307866,
"grad_norm": 0.5672304692349656,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 1290
},
{
"epoch": 1.3047296449629908,
"grad_norm": 0.5877274033488099,
"learning_rate": 5e-06,
"loss": 0.6962,
"step": 1300
},
{
"epoch": 1.3147660268473216,
"grad_norm": 0.6060436152781087,
"learning_rate": 5e-06,
"loss": 0.6972,
"step": 1310
},
{
"epoch": 1.3248024087316521,
"grad_norm": 0.566578140937884,
"learning_rate": 5e-06,
"loss": 0.6929,
"step": 1320
},
{
"epoch": 1.3348387906159829,
"grad_norm": 0.5294558357936279,
"learning_rate": 5e-06,
"loss": 0.6879,
"step": 1330
},
{
"epoch": 1.3448751725003136,
"grad_norm": 0.6237162250745834,
"learning_rate": 5e-06,
"loss": 0.6963,
"step": 1340
},
{
"epoch": 1.3549115543846444,
"grad_norm": 0.5430804692697989,
"learning_rate": 5e-06,
"loss": 0.6879,
"step": 1350
},
{
"epoch": 1.3649479362689751,
"grad_norm": 0.7283462410509066,
"learning_rate": 5e-06,
"loss": 0.6976,
"step": 1360
},
{
"epoch": 1.3749843181533057,
"grad_norm": 0.6917759789030041,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 1370
},
{
"epoch": 1.3850207000376364,
"grad_norm": 0.5567789537021975,
"learning_rate": 5e-06,
"loss": 0.6948,
"step": 1380
},
{
"epoch": 1.3950570819219672,
"grad_norm": 0.5454605064190993,
"learning_rate": 5e-06,
"loss": 0.6954,
"step": 1390
},
{
"epoch": 1.4050934638062977,
"grad_norm": 0.5900124519339845,
"learning_rate": 5e-06,
"loss": 0.6921,
"step": 1400
},
{
"epoch": 1.4151298456906285,
"grad_norm": 0.5690895340205371,
"learning_rate": 5e-06,
"loss": 0.6907,
"step": 1410
},
{
"epoch": 1.4251662275749593,
"grad_norm": 0.5672995415143901,
"learning_rate": 5e-06,
"loss": 0.6924,
"step": 1420
},
{
"epoch": 1.43520260945929,
"grad_norm": 0.5311142120372314,
"learning_rate": 5e-06,
"loss": 0.696,
"step": 1430
},
{
"epoch": 1.4452389913436205,
"grad_norm": 0.5999039652092255,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 1440
},
{
"epoch": 1.4552753732279513,
"grad_norm": 0.5933508701370325,
"learning_rate": 5e-06,
"loss": 0.6921,
"step": 1450
},
{
"epoch": 1.465311755112282,
"grad_norm": 0.5165015340634761,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 1460
},
{
"epoch": 1.4753481369966126,
"grad_norm": 0.542825456215961,
"learning_rate": 5e-06,
"loss": 0.6924,
"step": 1470
},
{
"epoch": 1.4853845188809434,
"grad_norm": 0.5173942736182007,
"learning_rate": 5e-06,
"loss": 0.6889,
"step": 1480
},
{
"epoch": 1.4954209007652741,
"grad_norm": 0.5444379147874959,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 1490
},
{
"epoch": 1.5054572826496049,
"grad_norm": 0.5186245031758688,
"learning_rate": 5e-06,
"loss": 0.6955,
"step": 1500
},
{
"epoch": 1.5154936645339356,
"grad_norm": 0.5273924158023976,
"learning_rate": 5e-06,
"loss": 0.6971,
"step": 1510
},
{
"epoch": 1.5255300464182662,
"grad_norm": 0.5676022244683365,
"learning_rate": 5e-06,
"loss": 0.695,
"step": 1520
},
{
"epoch": 1.535566428302597,
"grad_norm": 0.6271761201828187,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1530
},
{
"epoch": 1.5456028101869275,
"grad_norm": 0.6124159982732021,
"learning_rate": 5e-06,
"loss": 0.6911,
"step": 1540
},
{
"epoch": 1.5556391920712582,
"grad_norm": 0.6862261314401948,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 1550
},
{
"epoch": 1.565675573955589,
"grad_norm": 0.6275955880807454,
"learning_rate": 5e-06,
"loss": 0.6936,
"step": 1560
},
{
"epoch": 1.5757119558399197,
"grad_norm": 0.6289883161946416,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 1570
},
{
"epoch": 1.5857483377242505,
"grad_norm": 0.580105480957792,
"learning_rate": 5e-06,
"loss": 0.693,
"step": 1580
},
{
"epoch": 1.5957847196085813,
"grad_norm": 0.5560879464825351,
"learning_rate": 5e-06,
"loss": 0.692,
"step": 1590
},
{
"epoch": 1.6058211014929118,
"grad_norm": 0.58969515178897,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 1600
},
{
"epoch": 1.6158574833772426,
"grad_norm": 0.626694720233302,
"learning_rate": 5e-06,
"loss": 0.6867,
"step": 1610
},
{
"epoch": 1.625893865261573,
"grad_norm": 0.5549455637786119,
"learning_rate": 5e-06,
"loss": 0.6897,
"step": 1620
},
{
"epoch": 1.6359302471459038,
"grad_norm": 0.5938553058669086,
"learning_rate": 5e-06,
"loss": 0.6921,
"step": 1630
},
{
"epoch": 1.6459666290302346,
"grad_norm": 0.5753903253177306,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 1640
},
{
"epoch": 1.6560030109145654,
"grad_norm": 0.5191405989431156,
"learning_rate": 5e-06,
"loss": 0.69,
"step": 1650
},
{
"epoch": 1.6660393927988961,
"grad_norm": 0.5561359520494403,
"learning_rate": 5e-06,
"loss": 0.6986,
"step": 1660
},
{
"epoch": 1.6760757746832267,
"grad_norm": 0.6203674148098636,
"learning_rate": 5e-06,
"loss": 0.6908,
"step": 1670
},
{
"epoch": 1.6861121565675574,
"grad_norm": 0.6125267283499654,
"learning_rate": 5e-06,
"loss": 0.6914,
"step": 1680
},
{
"epoch": 1.696148538451888,
"grad_norm": 0.6670104437553405,
"learning_rate": 5e-06,
"loss": 0.6921,
"step": 1690
},
{
"epoch": 1.7061849203362187,
"grad_norm": 0.6164770721352887,
"learning_rate": 5e-06,
"loss": 0.6906,
"step": 1700
},
{
"epoch": 1.7162213022205495,
"grad_norm": 0.5419884156579401,
"learning_rate": 5e-06,
"loss": 0.6943,
"step": 1710
},
{
"epoch": 1.7262576841048802,
"grad_norm": 0.5735849533542166,
"learning_rate": 5e-06,
"loss": 0.69,
"step": 1720
},
{
"epoch": 1.736294065989211,
"grad_norm": 0.5297233454269522,
"learning_rate": 5e-06,
"loss": 0.6892,
"step": 1730
},
{
"epoch": 1.7463304478735417,
"grad_norm": 0.5393616139715296,
"learning_rate": 5e-06,
"loss": 0.6915,
"step": 1740
},
{
"epoch": 1.7563668297578723,
"grad_norm": 0.5643636572625912,
"learning_rate": 5e-06,
"loss": 0.6946,
"step": 1750
},
{
"epoch": 1.7664032116422028,
"grad_norm": 0.5884196950446513,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 1760
},
{
"epoch": 1.7764395935265336,
"grad_norm": 0.5236101003130557,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 1770
},
{
"epoch": 1.7864759754108643,
"grad_norm": 0.5035132462272621,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 1780
},
{
"epoch": 1.796512357295195,
"grad_norm": 0.503023782309955,
"learning_rate": 5e-06,
"loss": 0.697,
"step": 1790
},
{
"epoch": 1.8065487391795259,
"grad_norm": 0.6043844268122752,
"learning_rate": 5e-06,
"loss": 0.6939,
"step": 1800
},
{
"epoch": 1.8165851210638566,
"grad_norm": 0.5232601740320375,
"learning_rate": 5e-06,
"loss": 0.6904,
"step": 1810
},
{
"epoch": 1.8266215029481871,
"grad_norm": 0.5370830872283877,
"learning_rate": 5e-06,
"loss": 0.6911,
"step": 1820
},
{
"epoch": 1.836657884832518,
"grad_norm": 0.6459700462195024,
"learning_rate": 5e-06,
"loss": 0.6882,
"step": 1830
},
{
"epoch": 1.8466942667168484,
"grad_norm": 0.582019267718641,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 1840
},
{
"epoch": 1.8567306486011792,
"grad_norm": 0.5870687122095531,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 1850
},
{
"epoch": 1.86676703048551,
"grad_norm": 0.5078692726109669,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 1860
},
{
"epoch": 1.8768034123698407,
"grad_norm": 0.5040649883447891,
"learning_rate": 5e-06,
"loss": 0.6881,
"step": 1870
},
{
"epoch": 1.8868397942541715,
"grad_norm": 0.4995959267347782,
"learning_rate": 5e-06,
"loss": 0.6862,
"step": 1880
},
{
"epoch": 1.8968761761385022,
"grad_norm": 0.5881241901658035,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1890
},
{
"epoch": 1.9069125580228328,
"grad_norm": 0.5588343821948456,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 1900
},
{
"epoch": 1.9169489399071633,
"grad_norm": 0.4870751715972866,
"learning_rate": 5e-06,
"loss": 0.6872,
"step": 1910
},
{
"epoch": 1.926985321791494,
"grad_norm": 0.5735655591316463,
"learning_rate": 5e-06,
"loss": 0.6947,
"step": 1920
},
{
"epoch": 1.9370217036758248,
"grad_norm": 0.6085871436334919,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 1930
},
{
"epoch": 1.9470580855601556,
"grad_norm": 0.6006262589490265,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 1940
},
{
"epoch": 1.9570944674444863,
"grad_norm": 0.6341474482034111,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 1950
},
{
"epoch": 1.967130849328817,
"grad_norm": 0.5362807629078432,
"learning_rate": 5e-06,
"loss": 0.6873,
"step": 1960
},
{
"epoch": 1.9771672312131476,
"grad_norm": 0.4940952174490118,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 1970
},
{
"epoch": 1.9872036130974784,
"grad_norm": 0.5068555383392377,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 1980
},
{
"epoch": 1.997239994981809,
"grad_norm": 0.5115531434616939,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 1990
},
{
"epoch": 1.999247271358675,
"eval_loss": 0.7167317867279053,
"eval_runtime": 681.5165,
"eval_samples_per_second": 39.396,
"eval_steps_per_second": 0.616,
"step": 1992
},
{
"epoch": 2.0072763768661397,
"grad_norm": 0.7464092668086445,
"learning_rate": 5e-06,
"loss": 0.7021,
"step": 2000
},
{
"epoch": 2.0173127587504704,
"grad_norm": 0.6285168143241044,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 2010
},
{
"epoch": 2.027349140634801,
"grad_norm": 0.6153973121048567,
"learning_rate": 5e-06,
"loss": 0.6468,
"step": 2020
},
{
"epoch": 2.037385522519132,
"grad_norm": 0.5873302210415512,
"learning_rate": 5e-06,
"loss": 0.6466,
"step": 2030
},
{
"epoch": 2.0474219044034627,
"grad_norm": 0.6320791509681155,
"learning_rate": 5e-06,
"loss": 0.645,
"step": 2040
},
{
"epoch": 2.057458286287793,
"grad_norm": 0.7492672540101699,
"learning_rate": 5e-06,
"loss": 0.6484,
"step": 2050
},
{
"epoch": 2.067494668172124,
"grad_norm": 0.6049252570937941,
"learning_rate": 5e-06,
"loss": 0.6479,
"step": 2060
},
{
"epoch": 2.0775310500564546,
"grad_norm": 0.5658472447863507,
"learning_rate": 5e-06,
"loss": 0.646,
"step": 2070
},
{
"epoch": 2.0875674319407853,
"grad_norm": 0.5516870136698201,
"learning_rate": 5e-06,
"loss": 0.6492,
"step": 2080
},
{
"epoch": 2.097603813825116,
"grad_norm": 0.7029586116565276,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 2090
},
{
"epoch": 2.107640195709447,
"grad_norm": 0.5501781567288918,
"learning_rate": 5e-06,
"loss": 0.6468,
"step": 2100
},
{
"epoch": 2.1176765775937776,
"grad_norm": 0.5732859435604044,
"learning_rate": 5e-06,
"loss": 0.6468,
"step": 2110
},
{
"epoch": 2.1277129594781083,
"grad_norm": 0.6097094231081762,
"learning_rate": 5e-06,
"loss": 0.645,
"step": 2120
},
{
"epoch": 2.1377493413624387,
"grad_norm": 0.6168934436640728,
"learning_rate": 5e-06,
"loss": 0.6495,
"step": 2130
},
{
"epoch": 2.1477857232467694,
"grad_norm": 0.5791030315614372,
"learning_rate": 5e-06,
"loss": 0.6494,
"step": 2140
},
{
"epoch": 2.1578221051311,
"grad_norm": 0.5248289813751731,
"learning_rate": 5e-06,
"loss": 0.6502,
"step": 2150
},
{
"epoch": 2.167858487015431,
"grad_norm": 0.5339345181224099,
"learning_rate": 5e-06,
"loss": 0.6469,
"step": 2160
},
{
"epoch": 2.1778948688997617,
"grad_norm": 0.5503349261990413,
"learning_rate": 5e-06,
"loss": 0.6445,
"step": 2170
},
{
"epoch": 2.1879312507840925,
"grad_norm": 0.5464800148102408,
"learning_rate": 5e-06,
"loss": 0.6506,
"step": 2180
},
{
"epoch": 2.197967632668423,
"grad_norm": 0.5247947975936182,
"learning_rate": 5e-06,
"loss": 0.6527,
"step": 2190
},
{
"epoch": 2.2080040145527535,
"grad_norm": 0.5606999830634875,
"learning_rate": 5e-06,
"loss": 0.6493,
"step": 2200
},
{
"epoch": 2.2180403964370843,
"grad_norm": 0.5331049457160831,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 2210
},
{
"epoch": 2.228076778321415,
"grad_norm": 0.5988937144831796,
"learning_rate": 5e-06,
"loss": 0.6491,
"step": 2220
},
{
"epoch": 2.238113160205746,
"grad_norm": 0.5493199529484449,
"learning_rate": 5e-06,
"loss": 0.6497,
"step": 2230
},
{
"epoch": 2.2481495420900766,
"grad_norm": 0.5391276137070077,
"learning_rate": 5e-06,
"loss": 0.6479,
"step": 2240
},
{
"epoch": 2.2581859239744073,
"grad_norm": 0.5468874755147779,
"learning_rate": 5e-06,
"loss": 0.6481,
"step": 2250
},
{
"epoch": 2.268222305858738,
"grad_norm": 0.6698860969319781,
"learning_rate": 5e-06,
"loss": 0.6496,
"step": 2260
},
{
"epoch": 2.278258687743069,
"grad_norm": 0.5534091734548499,
"learning_rate": 5e-06,
"loss": 0.6495,
"step": 2270
},
{
"epoch": 2.288295069627399,
"grad_norm": 0.5251666746216365,
"learning_rate": 5e-06,
"loss": 0.6488,
"step": 2280
},
{
"epoch": 2.29833145151173,
"grad_norm": 0.6041634029692523,
"learning_rate": 5e-06,
"loss": 0.6463,
"step": 2290
},
{
"epoch": 2.3083678333960607,
"grad_norm": 0.6700682402301694,
"learning_rate": 5e-06,
"loss": 0.6512,
"step": 2300
},
{
"epoch": 2.3184042152803914,
"grad_norm": 0.5507167873654453,
"learning_rate": 5e-06,
"loss": 0.6531,
"step": 2310
},
{
"epoch": 2.328440597164722,
"grad_norm": 0.7110576199813733,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 2320
},
{
"epoch": 2.338476979049053,
"grad_norm": 0.547011003517586,
"learning_rate": 5e-06,
"loss": 0.6496,
"step": 2330
},
{
"epoch": 2.3485133609333837,
"grad_norm": 0.5277892887616297,
"learning_rate": 5e-06,
"loss": 0.6478,
"step": 2340
},
{
"epoch": 2.358549742817714,
"grad_norm": 0.5418890015733175,
"learning_rate": 5e-06,
"loss": 0.6474,
"step": 2350
},
{
"epoch": 2.3685861247020448,
"grad_norm": 0.5453644592494074,
"learning_rate": 5e-06,
"loss": 0.6513,
"step": 2360
},
{
"epoch": 2.3786225065863755,
"grad_norm": 0.5570648512157045,
"learning_rate": 5e-06,
"loss": 0.6549,
"step": 2370
},
{
"epoch": 2.3886588884707063,
"grad_norm": 0.6533123491533362,
"learning_rate": 5e-06,
"loss": 0.6517,
"step": 2380
},
{
"epoch": 2.398695270355037,
"grad_norm": 0.5514173498092635,
"learning_rate": 5e-06,
"loss": 0.6479,
"step": 2390
},
{
"epoch": 2.408731652239368,
"grad_norm": 0.6169109034186734,
"learning_rate": 5e-06,
"loss": 0.6497,
"step": 2400
},
{
"epoch": 2.4187680341236986,
"grad_norm": 0.6092701719038681,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 2410
},
{
"epoch": 2.4288044160080293,
"grad_norm": 0.5995748884635175,
"learning_rate": 5e-06,
"loss": 0.653,
"step": 2420
},
{
"epoch": 2.4388407978923596,
"grad_norm": 0.5733910522734972,
"learning_rate": 5e-06,
"loss": 0.6482,
"step": 2430
},
{
"epoch": 2.4488771797766904,
"grad_norm": 0.5191638101081872,
"learning_rate": 5e-06,
"loss": 0.656,
"step": 2440
},
{
"epoch": 2.458913561661021,
"grad_norm": 0.8798002303204917,
"learning_rate": 5e-06,
"loss": 0.6501,
"step": 2450
},
{
"epoch": 2.468949943545352,
"grad_norm": 0.5055430681257954,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 2460
},
{
"epoch": 2.4789863254296827,
"grad_norm": 0.5305005320670955,
"learning_rate": 5e-06,
"loss": 0.6579,
"step": 2470
},
{
"epoch": 2.4890227073140134,
"grad_norm": 0.6181982492315344,
"learning_rate": 5e-06,
"loss": 0.6473,
"step": 2480
},
{
"epoch": 2.499059089198344,
"grad_norm": 0.6215467401287779,
"learning_rate": 5e-06,
"loss": 0.6511,
"step": 2490
},
{
"epoch": 2.5090954710826745,
"grad_norm": 0.5729138634014542,
"learning_rate": 5e-06,
"loss": 0.6577,
"step": 2500
},
{
"epoch": 2.5191318529670053,
"grad_norm": 0.5393679833546277,
"learning_rate": 5e-06,
"loss": 0.6536,
"step": 2510
},
{
"epoch": 2.529168234851336,
"grad_norm": 0.534381658436043,
"learning_rate": 5e-06,
"loss": 0.6489,
"step": 2520
},
{
"epoch": 2.5392046167356668,
"grad_norm": 0.539600655245499,
"learning_rate": 5e-06,
"loss": 0.6495,
"step": 2530
},
{
"epoch": 2.5492409986199975,
"grad_norm": 0.6226376194292436,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 2540
},
{
"epoch": 2.5592773805043283,
"grad_norm": 0.5865717492190782,
"learning_rate": 5e-06,
"loss": 0.6541,
"step": 2550
},
{
"epoch": 2.569313762388659,
"grad_norm": 0.6438993538240664,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 2560
},
{
"epoch": 2.57935014427299,
"grad_norm": 0.7518127666950809,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 2570
},
{
"epoch": 2.58938652615732,
"grad_norm": 0.586864178840436,
"learning_rate": 5e-06,
"loss": 0.6616,
"step": 2580
},
{
"epoch": 2.599422908041651,
"grad_norm": 0.5941413788184908,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 2590
},
{
"epoch": 2.6094592899259816,
"grad_norm": 0.5451550588012898,
"learning_rate": 5e-06,
"loss": 0.6544,
"step": 2600
},
{
"epoch": 2.6194956718103124,
"grad_norm": 0.5147403638954634,
"learning_rate": 5e-06,
"loss": 0.6539,
"step": 2610
},
{
"epoch": 2.629532053694643,
"grad_norm": 0.5467938780782444,
"learning_rate": 5e-06,
"loss": 0.6475,
"step": 2620
},
{
"epoch": 2.639568435578974,
"grad_norm": 0.5458940947855774,
"learning_rate": 5e-06,
"loss": 0.6501,
"step": 2630
},
{
"epoch": 2.6496048174633042,
"grad_norm": 0.600288340260203,
"learning_rate": 5e-06,
"loss": 0.6533,
"step": 2640
},
{
"epoch": 2.659641199347635,
"grad_norm": 0.5329802788249394,
"learning_rate": 5e-06,
"loss": 0.6539,
"step": 2650
},
{
"epoch": 2.6696775812319657,
"grad_norm": 0.5494428968278945,
"learning_rate": 5e-06,
"loss": 0.6485,
"step": 2660
},
{
"epoch": 2.6797139631162965,
"grad_norm": 0.5583817948296362,
"learning_rate": 5e-06,
"loss": 0.6522,
"step": 2670
},
{
"epoch": 2.6897503450006273,
"grad_norm": 0.5760005041925667,
"learning_rate": 5e-06,
"loss": 0.6513,
"step": 2680
},
{
"epoch": 2.699786726884958,
"grad_norm": 0.6094121074922789,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 2690
},
{
"epoch": 2.709823108769289,
"grad_norm": 0.6168874365624798,
"learning_rate": 5e-06,
"loss": 0.6532,
"step": 2700
},
{
"epoch": 2.7198594906536195,
"grad_norm": 0.5204111827775895,
"learning_rate": 5e-06,
"loss": 0.652,
"step": 2710
},
{
"epoch": 2.7298958725379503,
"grad_norm": 0.5414555391536807,
"learning_rate": 5e-06,
"loss": 0.6529,
"step": 2720
},
{
"epoch": 2.7399322544222806,
"grad_norm": 0.5343532711007503,
"learning_rate": 5e-06,
"loss": 0.6551,
"step": 2730
},
{
"epoch": 2.7499686363066114,
"grad_norm": 0.5730034177518993,
"learning_rate": 5e-06,
"loss": 0.6559,
"step": 2740
},
{
"epoch": 2.760005018190942,
"grad_norm": 0.5413799739509437,
"learning_rate": 5e-06,
"loss": 0.6536,
"step": 2750
},
{
"epoch": 2.770041400075273,
"grad_norm": 0.58185222058302,
"learning_rate": 5e-06,
"loss": 0.6607,
"step": 2760
},
{
"epoch": 2.7800777819596036,
"grad_norm": 0.5659358615987367,
"learning_rate": 5e-06,
"loss": 0.6532,
"step": 2770
},
{
"epoch": 2.7901141638439344,
"grad_norm": 0.5269963486852614,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 2780
},
{
"epoch": 2.8001505457282647,
"grad_norm": 0.6056482620803397,
"learning_rate": 5e-06,
"loss": 0.6552,
"step": 2790
},
{
"epoch": 2.8101869276125955,
"grad_norm": 1.0590493469402826,
"learning_rate": 5e-06,
"loss": 0.6556,
"step": 2800
},
{
"epoch": 2.8202233094969262,
"grad_norm": 0.8284099367079102,
"learning_rate": 5e-06,
"loss": 0.6549,
"step": 2810
},
{
"epoch": 2.830259691381257,
"grad_norm": 0.6501574839000807,
"learning_rate": 5e-06,
"loss": 0.6507,
"step": 2820
},
{
"epoch": 2.8402960732655878,
"grad_norm": 0.5410870388612636,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 2830
},
{
"epoch": 2.8503324551499185,
"grad_norm": 0.6510662758185398,
"learning_rate": 5e-06,
"loss": 0.6519,
"step": 2840
},
{
"epoch": 2.8603688370342493,
"grad_norm": 0.5238204299865007,
"learning_rate": 5e-06,
"loss": 0.6593,
"step": 2850
},
{
"epoch": 2.87040521891858,
"grad_norm": 0.5397683523247623,
"learning_rate": 5e-06,
"loss": 0.6526,
"step": 2860
},
{
"epoch": 2.880441600802911,
"grad_norm": 0.5558371306126749,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 2870
},
{
"epoch": 2.890477982687241,
"grad_norm": 0.5084123704706736,
"learning_rate": 5e-06,
"loss": 0.655,
"step": 2880
},
{
"epoch": 2.900514364571572,
"grad_norm": 0.5363806596935408,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 2890
},
{
"epoch": 2.9105507464559026,
"grad_norm": 0.526536425381128,
"learning_rate": 5e-06,
"loss": 0.6549,
"step": 2900
},
{
"epoch": 2.9205871283402334,
"grad_norm": 0.5720711163344511,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 2910
},
{
"epoch": 2.930623510224564,
"grad_norm": 0.5168344329750222,
"learning_rate": 5e-06,
"loss": 0.6505,
"step": 2920
},
{
"epoch": 2.940659892108895,
"grad_norm": 0.5068041805158231,
"learning_rate": 5e-06,
"loss": 0.6523,
"step": 2930
},
{
"epoch": 2.950696273993225,
"grad_norm": 0.5854150052744146,
"learning_rate": 5e-06,
"loss": 0.6562,
"step": 2940
},
{
"epoch": 2.960732655877556,
"grad_norm": 0.6196831772444017,
"learning_rate": 5e-06,
"loss": 0.6542,
"step": 2950
},
{
"epoch": 2.9707690377618867,
"grad_norm": 0.5314563776407271,
"learning_rate": 5e-06,
"loss": 0.6561,
"step": 2960
},
{
"epoch": 2.9808054196462175,
"grad_norm": 0.5039099479336567,
"learning_rate": 5e-06,
"loss": 0.6563,
"step": 2970
},
{
"epoch": 2.9908418015305482,
"grad_norm": 0.5313007683420622,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 2980
},
{
"epoch": 2.999874545226446,
"eval_loss": 0.7146658897399902,
"eval_runtime": 674.3767,
"eval_samples_per_second": 39.813,
"eval_steps_per_second": 0.623,
"step": 2989
},
{
"epoch": 3.000878183414879,
"grad_norm": 1.159167907299322,
"learning_rate": 5e-06,
"loss": 0.6898,
"step": 2990
},
{
"epoch": 3.0109145652992098,
"grad_norm": 0.7818114221168871,
"learning_rate": 5e-06,
"loss": 0.6112,
"step": 3000
},
{
"epoch": 3.0209509471835405,
"grad_norm": 0.7106384774672879,
"learning_rate": 5e-06,
"loss": 0.6059,
"step": 3010
},
{
"epoch": 3.030987329067871,
"grad_norm": 0.640003891116449,
"learning_rate": 5e-06,
"loss": 0.6085,
"step": 3020
},
{
"epoch": 3.0410237109522016,
"grad_norm": 0.6066326657235696,
"learning_rate": 5e-06,
"loss": 0.6077,
"step": 3030
},
{
"epoch": 3.0510600928365323,
"grad_norm": 0.6071064742090702,
"learning_rate": 5e-06,
"loss": 0.6116,
"step": 3040
},
{
"epoch": 3.061096474720863,
"grad_norm": 0.693281501487692,
"learning_rate": 5e-06,
"loss": 0.6052,
"step": 3050
},
{
"epoch": 3.071132856605194,
"grad_norm": 0.7693025911684378,
"learning_rate": 5e-06,
"loss": 0.6095,
"step": 3060
},
{
"epoch": 3.0811692384895246,
"grad_norm": 0.6184427233437038,
"learning_rate": 5e-06,
"loss": 0.6069,
"step": 3070
},
{
"epoch": 3.0912056203738554,
"grad_norm": 0.634870226376911,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 3080
},
{
"epoch": 3.101242002258186,
"grad_norm": 0.5947920487741215,
"learning_rate": 5e-06,
"loss": 0.6092,
"step": 3090
},
{
"epoch": 3.1112783841425165,
"grad_norm": 0.5475506951891964,
"learning_rate": 5e-06,
"loss": 0.6066,
"step": 3100
},
{
"epoch": 3.121314766026847,
"grad_norm": 0.5786846254856872,
"learning_rate": 5e-06,
"loss": 0.6077,
"step": 3110
},
{
"epoch": 3.131351147911178,
"grad_norm": 0.5837921428316006,
"learning_rate": 5e-06,
"loss": 0.6125,
"step": 3120
},
{
"epoch": 3.1413875297955087,
"grad_norm": 0.6388660075166559,
"learning_rate": 5e-06,
"loss": 0.6073,
"step": 3130
},
{
"epoch": 3.1514239116798395,
"grad_norm": 0.6247319700614546,
"learning_rate": 5e-06,
"loss": 0.6129,
"step": 3140
},
{
"epoch": 3.1614602935641702,
"grad_norm": 0.7220969862146115,
"learning_rate": 5e-06,
"loss": 0.6096,
"step": 3150
},
{
"epoch": 3.171496675448501,
"grad_norm": 0.5966143252277392,
"learning_rate": 5e-06,
"loss": 0.6108,
"step": 3160
},
{
"epoch": 3.1815330573328313,
"grad_norm": 0.5806668148525886,
"learning_rate": 5e-06,
"loss": 0.611,
"step": 3170
},
{
"epoch": 3.191569439217162,
"grad_norm": 0.5847564401984537,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 3180
},
{
"epoch": 3.201605821101493,
"grad_norm": 0.5685073324759383,
"learning_rate": 5e-06,
"loss": 0.6162,
"step": 3190
},
{
"epoch": 3.2116422029858236,
"grad_norm": 0.5806892343391038,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 3200
},
{
"epoch": 3.2216785848701543,
"grad_norm": 0.5629335787755336,
"learning_rate": 5e-06,
"loss": 0.6122,
"step": 3210
},
{
"epoch": 3.231714966754485,
"grad_norm": 0.6104998235017857,
"learning_rate": 5e-06,
"loss": 0.6128,
"step": 3220
},
{
"epoch": 3.241751348638816,
"grad_norm": 0.670576007712542,
"learning_rate": 5e-06,
"loss": 0.6122,
"step": 3230
},
{
"epoch": 3.251787730523146,
"grad_norm": 0.6597487550561909,
"learning_rate": 5e-06,
"loss": 0.6135,
"step": 3240
},
{
"epoch": 3.261824112407477,
"grad_norm": 0.5645378989833628,
"learning_rate": 5e-06,
"loss": 0.6149,
"step": 3250
},
{
"epoch": 3.2718604942918077,
"grad_norm": 0.5939861646065504,
"learning_rate": 5e-06,
"loss": 0.6097,
"step": 3260
},
{
"epoch": 3.2818968761761385,
"grad_norm": 0.6160406690736504,
"learning_rate": 5e-06,
"loss": 0.6086,
"step": 3270
},
{
"epoch": 3.291933258060469,
"grad_norm": 0.5487825374465094,
"learning_rate": 5e-06,
"loss": 0.6144,
"step": 3280
},
{
"epoch": 3.3019696399448,
"grad_norm": 0.6520181865316601,
"learning_rate": 5e-06,
"loss": 0.6183,
"step": 3290
},
{
"epoch": 3.3120060218291307,
"grad_norm": 0.5977506986605584,
"learning_rate": 5e-06,
"loss": 0.6147,
"step": 3300
},
{
"epoch": 3.3220424037134615,
"grad_norm": 0.6484133892242163,
"learning_rate": 5e-06,
"loss": 0.6151,
"step": 3310
},
{
"epoch": 3.332078785597792,
"grad_norm": 0.5970543245993525,
"learning_rate": 5e-06,
"loss": 0.6108,
"step": 3320
},
{
"epoch": 3.3421151674821226,
"grad_norm": 0.6116862845869632,
"learning_rate": 5e-06,
"loss": 0.6166,
"step": 3330
},
{
"epoch": 3.3521515493664533,
"grad_norm": 0.5580458755366267,
"learning_rate": 5e-06,
"loss": 0.6177,
"step": 3340
},
{
"epoch": 3.362187931250784,
"grad_norm": 0.7040073547476862,
"learning_rate": 5e-06,
"loss": 0.6181,
"step": 3350
},
{
"epoch": 3.372224313135115,
"grad_norm": 0.652807816857214,
"learning_rate": 5e-06,
"loss": 0.6186,
"step": 3360
},
{
"epoch": 3.3822606950194456,
"grad_norm": 0.6106384378999347,
"learning_rate": 5e-06,
"loss": 0.6189,
"step": 3370
},
{
"epoch": 3.3922970769037764,
"grad_norm": 0.6920160744092827,
"learning_rate": 5e-06,
"loss": 0.617,
"step": 3380
},
{
"epoch": 3.4023334587881067,
"grad_norm": 0.5814853765567533,
"learning_rate": 5e-06,
"loss": 0.6118,
"step": 3390
},
{
"epoch": 3.4123698406724374,
"grad_norm": 0.5389887117603109,
"learning_rate": 5e-06,
"loss": 0.6126,
"step": 3400
},
{
"epoch": 3.422406222556768,
"grad_norm": 0.582316325799389,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 3410
},
{
"epoch": 3.432442604441099,
"grad_norm": 0.5612761289810537,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 3420
},
{
"epoch": 3.4424789863254297,
"grad_norm": 0.5315307027152637,
"learning_rate": 5e-06,
"loss": 0.6126,
"step": 3430
},
{
"epoch": 3.4525153682097605,
"grad_norm": 0.5632095541748001,
"learning_rate": 5e-06,
"loss": 0.6139,
"step": 3440
},
{
"epoch": 3.462551750094091,
"grad_norm": 0.5777253428377956,
"learning_rate": 5e-06,
"loss": 0.6153,
"step": 3450
},
{
"epoch": 3.472588131978422,
"grad_norm": 0.5802066633079221,
"learning_rate": 5e-06,
"loss": 0.6183,
"step": 3460
},
{
"epoch": 3.4826245138627523,
"grad_norm": 0.6858510179050318,
"learning_rate": 5e-06,
"loss": 0.6105,
"step": 3470
},
{
"epoch": 3.492660895747083,
"grad_norm": 0.6150923461042579,
"learning_rate": 5e-06,
"loss": 0.6123,
"step": 3480
},
{
"epoch": 3.502697277631414,
"grad_norm": 0.6850358509214438,
"learning_rate": 5e-06,
"loss": 0.6176,
"step": 3490
},
{
"epoch": 3.5127336595157446,
"grad_norm": 0.6221194287714066,
"learning_rate": 5e-06,
"loss": 0.6135,
"step": 3500
},
{
"epoch": 3.5227700414000753,
"grad_norm": 0.6337555357747637,
"learning_rate": 5e-06,
"loss": 0.6176,
"step": 3510
},
{
"epoch": 3.532806423284406,
"grad_norm": 0.5696342404252304,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 3520
},
{
"epoch": 3.5428428051687364,
"grad_norm": 0.5192096724412292,
"learning_rate": 5e-06,
"loss": 0.6169,
"step": 3530
},
{
"epoch": 3.552879187053067,
"grad_norm": 0.6461636488212382,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 3540
},
{
"epoch": 3.562915568937398,
"grad_norm": 0.5204792269879596,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 3550
},
{
"epoch": 3.5729519508217287,
"grad_norm": 0.5799708864875738,
"learning_rate": 5e-06,
"loss": 0.6188,
"step": 3560
},
{
"epoch": 3.5829883327060594,
"grad_norm": 0.5463250823773549,
"learning_rate": 5e-06,
"loss": 0.6176,
"step": 3570
},
{
"epoch": 3.59302471459039,
"grad_norm": 0.6314712032266755,
"learning_rate": 5e-06,
"loss": 0.6207,
"step": 3580
},
{
"epoch": 3.603061096474721,
"grad_norm": 0.6479660409480549,
"learning_rate": 5e-06,
"loss": 0.6164,
"step": 3590
},
{
"epoch": 3.6130974783590517,
"grad_norm": 0.6300826657668005,
"learning_rate": 5e-06,
"loss": 0.6161,
"step": 3600
},
{
"epoch": 3.6231338602433825,
"grad_norm": 0.6595382686169196,
"learning_rate": 5e-06,
"loss": 0.6206,
"step": 3610
},
{
"epoch": 3.6331702421277132,
"grad_norm": 0.5652565303110992,
"learning_rate": 5e-06,
"loss": 0.6196,
"step": 3620
},
{
"epoch": 3.6432066240120435,
"grad_norm": 0.579203582642288,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 3630
},
{
"epoch": 3.6532430058963743,
"grad_norm": 0.5437849615714947,
"learning_rate": 5e-06,
"loss": 0.6183,
"step": 3640
},
{
"epoch": 3.663279387780705,
"grad_norm": 0.5787593937881179,
"learning_rate": 5e-06,
"loss": 0.6167,
"step": 3650
},
{
"epoch": 3.673315769665036,
"grad_norm": 0.5811897003512283,
"learning_rate": 5e-06,
"loss": 0.6152,
"step": 3660
},
{
"epoch": 3.6833521515493666,
"grad_norm": 0.5178334072146914,
"learning_rate": 5e-06,
"loss": 0.612,
"step": 3670
},
{
"epoch": 3.693388533433697,
"grad_norm": 0.6269969593175169,
"learning_rate": 5e-06,
"loss": 0.616,
"step": 3680
},
{
"epoch": 3.7034249153180276,
"grad_norm": 0.6398231596326978,
"learning_rate": 5e-06,
"loss": 0.62,
"step": 3690
},
{
"epoch": 3.7134612972023584,
"grad_norm": 0.5787553408157999,
"learning_rate": 5e-06,
"loss": 0.6199,
"step": 3700
},
{
"epoch": 3.723497679086689,
"grad_norm": 0.5592510334692263,
"learning_rate": 5e-06,
"loss": 0.6172,
"step": 3710
},
{
"epoch": 3.73353406097102,
"grad_norm": 0.5911520225126639,
"learning_rate": 5e-06,
"loss": 0.6206,
"step": 3720
},
{
"epoch": 3.7435704428553507,
"grad_norm": 0.5509577144961972,
"learning_rate": 5e-06,
"loss": 0.6162,
"step": 3730
},
{
"epoch": 3.7536068247396814,
"grad_norm": 0.6230171740270747,
"learning_rate": 5e-06,
"loss": 0.6211,
"step": 3740
},
{
"epoch": 3.763643206624012,
"grad_norm": 0.606064062911815,
"learning_rate": 5e-06,
"loss": 0.6169,
"step": 3750
},
{
"epoch": 3.773679588508343,
"grad_norm": 0.5251378729385787,
"learning_rate": 5e-06,
"loss": 0.6142,
"step": 3760
},
{
"epoch": 3.7837159703926737,
"grad_norm": 0.5313902122829314,
"learning_rate": 5e-06,
"loss": 0.6171,
"step": 3770
},
{
"epoch": 3.793752352277004,
"grad_norm": 0.5738855269066635,
"learning_rate": 5e-06,
"loss": 0.614,
"step": 3780
},
{
"epoch": 3.803788734161335,
"grad_norm": 0.6094017649137001,
"learning_rate": 5e-06,
"loss": 0.6184,
"step": 3790
},
{
"epoch": 3.8138251160456655,
"grad_norm": 0.577775104243207,
"learning_rate": 5e-06,
"loss": 0.6147,
"step": 3800
},
{
"epoch": 3.8238614979299963,
"grad_norm": 0.5896473350091869,
"learning_rate": 5e-06,
"loss": 0.6191,
"step": 3810
},
{
"epoch": 3.833897879814327,
"grad_norm": 0.6197381864983481,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 3820
},
{
"epoch": 3.8439342616986574,
"grad_norm": 0.6385235404417998,
"learning_rate": 5e-06,
"loss": 0.6184,
"step": 3830
},
{
"epoch": 3.853970643582988,
"grad_norm": 0.5605836986977404,
"learning_rate": 5e-06,
"loss": 0.6184,
"step": 3840
},
{
"epoch": 3.864007025467319,
"grad_norm": 0.5314662217673221,
"learning_rate": 5e-06,
"loss": 0.6175,
"step": 3850
},
{
"epoch": 3.8740434073516496,
"grad_norm": 0.5617746957894754,
"learning_rate": 5e-06,
"loss": 0.6168,
"step": 3860
},
{
"epoch": 3.8840797892359804,
"grad_norm": 0.6130682994037887,
"learning_rate": 5e-06,
"loss": 0.6213,
"step": 3870
},
{
"epoch": 3.894116171120311,
"grad_norm": 0.6330660373549564,
"learning_rate": 5e-06,
"loss": 0.6237,
"step": 3880
},
{
"epoch": 3.904152553004642,
"grad_norm": 0.5757814490358608,
"learning_rate": 5e-06,
"loss": 0.6199,
"step": 3890
},
{
"epoch": 3.9141889348889727,
"grad_norm": 0.5686860241059948,
"learning_rate": 5e-06,
"loss": 0.6166,
"step": 3900
},
{
"epoch": 3.9242253167733034,
"grad_norm": 0.577591496190582,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 3910
},
{
"epoch": 3.934261698657634,
"grad_norm": 0.5589041470451204,
"learning_rate": 5e-06,
"loss": 0.619,
"step": 3920
},
{
"epoch": 3.9442980805419645,
"grad_norm": 0.6004802459840047,
"learning_rate": 5e-06,
"loss": 0.6148,
"step": 3930
},
{
"epoch": 3.9543344624262953,
"grad_norm": 0.6153349703188992,
"learning_rate": 5e-06,
"loss": 0.6166,
"step": 3940
},
{
"epoch": 3.964370844310626,
"grad_norm": 0.5218178205884076,
"learning_rate": 5e-06,
"loss": 0.619,
"step": 3950
},
{
"epoch": 3.974407226194957,
"grad_norm": 0.547168472398349,
"learning_rate": 5e-06,
"loss": 0.6194,
"step": 3960
},
{
"epoch": 3.9844436080792875,
"grad_norm": 0.5359836200059497,
"learning_rate": 5e-06,
"loss": 0.6225,
"step": 3970
},
{
"epoch": 3.994479989963618,
"grad_norm": 0.7180200697231374,
"learning_rate": 5e-06,
"loss": 0.6204,
"step": 3980
},
{
"epoch": 3.99849454271735,
"eval_loss": 0.7222821116447449,
"eval_runtime": 679.219,
"eval_samples_per_second": 39.529,
"eval_steps_per_second": 0.618,
"step": 3984
},
{
"epoch": 3.99849454271735,
"step": 3984,
"total_flos": 6673139006177280.0,
"train_loss": 0.6817261522194468,
"train_runtime": 130344.1738,
"train_samples_per_second": 15.654,
"train_steps_per_second": 0.031
}
],
"logging_steps": 10,
"max_steps": 3984,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6673139006177280.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}