{
"best_metric": 1.2186033725738525,
"best_model_checkpoint": "./output5/checkpoint-2600",
"epoch": 9.965635738831615,
"eval_steps": 50,
"global_step": 2900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03436426116838488,
"grad_norm": 0.69921875,
"learning_rate": 1e-05,
"loss": 2.2132,
"step": 10
},
{
"epoch": 0.06872852233676977,
"grad_norm": 0.84375,
"learning_rate": 1e-05,
"loss": 1.4922,
"step": 20
},
{
"epoch": 0.10309278350515463,
"grad_norm": 1.1171875,
"learning_rate": 1e-05,
"loss": 1.5456,
"step": 30
},
{
"epoch": 0.13745704467353953,
"grad_norm": 1.4609375,
"learning_rate": 1e-05,
"loss": 1.3655,
"step": 40
},
{
"epoch": 0.1718213058419244,
"grad_norm": 2.6875,
"learning_rate": 1e-05,
"loss": 1.5387,
"step": 50
},
{
"epoch": 0.1718213058419244,
"eval_loss": 1.578947901725769,
"eval_runtime": 80.1848,
"eval_samples_per_second": 6.236,
"eval_steps_per_second": 0.399,
"step": 50
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.71484375,
"learning_rate": 1e-05,
"loss": 1.9347,
"step": 60
},
{
"epoch": 0.24054982817869416,
"grad_norm": 0.7578125,
"learning_rate": 1e-05,
"loss": 1.3243,
"step": 70
},
{
"epoch": 0.27491408934707906,
"grad_norm": 0.890625,
"learning_rate": 1e-05,
"loss": 1.2335,
"step": 80
},
{
"epoch": 0.30927835051546393,
"grad_norm": 1.3515625,
"learning_rate": 1e-05,
"loss": 1.3811,
"step": 90
},
{
"epoch": 0.3436426116838488,
"grad_norm": 2.046875,
"learning_rate": 1e-05,
"loss": 1.4447,
"step": 100
},
{
"epoch": 0.3436426116838488,
"eval_loss": 1.5564889907836914,
"eval_runtime": 79.6461,
"eval_samples_per_second": 6.278,
"eval_steps_per_second": 0.402,
"step": 100
},
{
"epoch": 0.37800687285223367,
"grad_norm": 0.671875,
"learning_rate": 1e-05,
"loss": 2.0252,
"step": 110
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.82421875,
"learning_rate": 1e-05,
"loss": 1.3097,
"step": 120
},
{
"epoch": 0.44673539518900346,
"grad_norm": 0.8515625,
"learning_rate": 1e-05,
"loss": 1.1997,
"step": 130
},
{
"epoch": 0.48109965635738833,
"grad_norm": 0.99609375,
"learning_rate": 1e-05,
"loss": 1.1678,
"step": 140
},
{
"epoch": 0.5154639175257731,
"grad_norm": 2.546875,
"learning_rate": 1e-05,
"loss": 1.5449,
"step": 150
},
{
"epoch": 0.5154639175257731,
"eval_loss": 1.5245628356933594,
"eval_runtime": 79.8147,
"eval_samples_per_second": 6.265,
"eval_steps_per_second": 0.401,
"step": 150
},
{
"epoch": 0.5498281786941581,
"grad_norm": 0.62109375,
"learning_rate": 1e-05,
"loss": 2.0199,
"step": 160
},
{
"epoch": 0.584192439862543,
"grad_norm": 0.9609375,
"learning_rate": 1e-05,
"loss": 1.4356,
"step": 170
},
{
"epoch": 0.6185567010309279,
"grad_norm": 1.171875,
"learning_rate": 1e-05,
"loss": 1.192,
"step": 180
},
{
"epoch": 0.6529209621993127,
"grad_norm": 1.171875,
"learning_rate": 1e-05,
"loss": 1.212,
"step": 190
},
{
"epoch": 0.6872852233676976,
"grad_norm": 1.4296875,
"learning_rate": 1e-05,
"loss": 1.357,
"step": 200
},
{
"epoch": 0.6872852233676976,
"eval_loss": 1.4709516763687134,
"eval_runtime": 80.6225,
"eval_samples_per_second": 6.202,
"eval_steps_per_second": 0.397,
"step": 200
},
{
"epoch": 0.7216494845360825,
"grad_norm": 0.72265625,
"learning_rate": 1e-05,
"loss": 1.9052,
"step": 210
},
{
"epoch": 0.7560137457044673,
"grad_norm": 0.69140625,
"learning_rate": 1e-05,
"loss": 1.3356,
"step": 220
},
{
"epoch": 0.7903780068728522,
"grad_norm": 0.91796875,
"learning_rate": 1e-05,
"loss": 1.2412,
"step": 230
},
{
"epoch": 0.8247422680412371,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 1.3001,
"step": 240
},
{
"epoch": 0.8591065292096219,
"grad_norm": 1.3984375,
"learning_rate": 1e-05,
"loss": 1.0416,
"step": 250
},
{
"epoch": 0.8591065292096219,
"eval_loss": 1.4416706562042236,
"eval_runtime": 80.1066,
"eval_samples_per_second": 6.242,
"eval_steps_per_second": 0.399,
"step": 250
},
{
"epoch": 0.8934707903780069,
"grad_norm": 0.68359375,
"learning_rate": 1e-05,
"loss": 1.9173,
"step": 260
},
{
"epoch": 0.9278350515463918,
"grad_norm": 0.796875,
"learning_rate": 1e-05,
"loss": 1.1558,
"step": 270
},
{
"epoch": 0.9621993127147767,
"grad_norm": 1.3125,
"learning_rate": 1e-05,
"loss": 1.0802,
"step": 280
},
{
"epoch": 0.9965635738831615,
"grad_norm": 1.6328125,
"learning_rate": 1e-05,
"loss": 1.1607,
"step": 290
},
{
"epoch": 1.0309278350515463,
"grad_norm": 0.72265625,
"learning_rate": 1e-05,
"loss": 1.9346,
"step": 300
},
{
"epoch": 1.0309278350515463,
"eval_loss": 1.4207162857055664,
"eval_runtime": 80.3244,
"eval_samples_per_second": 6.225,
"eval_steps_per_second": 0.398,
"step": 300
},
{
"epoch": 1.0652920962199313,
"grad_norm": 0.70703125,
"learning_rate": 1e-05,
"loss": 1.317,
"step": 310
},
{
"epoch": 1.0996563573883162,
"grad_norm": 0.84765625,
"learning_rate": 1e-05,
"loss": 1.1657,
"step": 320
},
{
"epoch": 1.134020618556701,
"grad_norm": 1.4296875,
"learning_rate": 1e-05,
"loss": 1.1752,
"step": 330
},
{
"epoch": 1.168384879725086,
"grad_norm": 1.5859375,
"learning_rate": 1e-05,
"loss": 1.1167,
"step": 340
},
{
"epoch": 1.2027491408934707,
"grad_norm": 0.73046875,
"learning_rate": 1e-05,
"loss": 1.8479,
"step": 350
},
{
"epoch": 1.2027491408934707,
"eval_loss": 1.4646198749542236,
"eval_runtime": 80.4124,
"eval_samples_per_second": 6.218,
"eval_steps_per_second": 0.398,
"step": 350
},
{
"epoch": 1.2371134020618557,
"grad_norm": 0.59765625,
"learning_rate": 1e-05,
"loss": 1.2725,
"step": 360
},
{
"epoch": 1.2714776632302405,
"grad_norm": 1.015625,
"learning_rate": 1e-05,
"loss": 1.0993,
"step": 370
},
{
"epoch": 1.3058419243986255,
"grad_norm": 1.0390625,
"learning_rate": 1e-05,
"loss": 0.9937,
"step": 380
},
{
"epoch": 1.3402061855670104,
"grad_norm": 1.2890625,
"learning_rate": 1e-05,
"loss": 1.1734,
"step": 390
},
{
"epoch": 1.3745704467353952,
"grad_norm": 0.484375,
"learning_rate": 1e-05,
"loss": 2.099,
"step": 400
},
{
"epoch": 1.3745704467353952,
"eval_loss": 1.3869779109954834,
"eval_runtime": 81.9537,
"eval_samples_per_second": 6.101,
"eval_steps_per_second": 0.39,
"step": 400
},
{
"epoch": 1.40893470790378,
"grad_norm": 0.74609375,
"learning_rate": 1e-05,
"loss": 1.284,
"step": 410
},
{
"epoch": 1.443298969072165,
"grad_norm": 0.6484375,
"learning_rate": 1e-05,
"loss": 1.1155,
"step": 420
},
{
"epoch": 1.47766323024055,
"grad_norm": 1.0625,
"learning_rate": 1e-05,
"loss": 1.0428,
"step": 430
},
{
"epoch": 1.5120274914089347,
"grad_norm": 1.5625,
"learning_rate": 1e-05,
"loss": 1.1593,
"step": 440
},
{
"epoch": 1.5463917525773194,
"grad_norm": 0.59375,
"learning_rate": 1e-05,
"loss": 1.9615,
"step": 450
},
{
"epoch": 1.5463917525773194,
"eval_loss": 1.3642898797988892,
"eval_runtime": 80.3381,
"eval_samples_per_second": 6.224,
"eval_steps_per_second": 0.398,
"step": 450
},
{
"epoch": 1.5807560137457046,
"grad_norm": 0.80078125,
"learning_rate": 1e-05,
"loss": 1.3177,
"step": 460
},
{
"epoch": 1.6151202749140894,
"grad_norm": 0.82421875,
"learning_rate": 1e-05,
"loss": 1.1826,
"step": 470
},
{
"epoch": 1.6494845360824741,
"grad_norm": 1.109375,
"learning_rate": 1e-05,
"loss": 1.2282,
"step": 480
},
{
"epoch": 1.6838487972508591,
"grad_norm": 1.6953125,
"learning_rate": 1e-05,
"loss": 1.0857,
"step": 490
},
{
"epoch": 1.718213058419244,
"grad_norm": 0.50390625,
"learning_rate": 1e-05,
"loss": 1.8553,
"step": 500
},
{
"epoch": 1.718213058419244,
"eval_loss": 1.3780114650726318,
"eval_runtime": 79.3369,
"eval_samples_per_second": 6.302,
"eval_steps_per_second": 0.403,
"step": 500
},
{
"epoch": 1.7525773195876289,
"grad_norm": 0.79296875,
"learning_rate": 1e-05,
"loss": 1.2868,
"step": 510
},
{
"epoch": 1.7869415807560136,
"grad_norm": 0.8515625,
"learning_rate": 1e-05,
"loss": 1.1694,
"step": 520
},
{
"epoch": 1.8213058419243986,
"grad_norm": 1.3671875,
"learning_rate": 1e-05,
"loss": 1.175,
"step": 530
},
{
"epoch": 1.8556701030927836,
"grad_norm": 1.9140625,
"learning_rate": 1e-05,
"loss": 1.0767,
"step": 540
},
{
"epoch": 1.8900343642611683,
"grad_norm": 0.59375,
"learning_rate": 1e-05,
"loss": 1.9291,
"step": 550
},
{
"epoch": 1.8900343642611683,
"eval_loss": 1.3614013195037842,
"eval_runtime": 80.21,
"eval_samples_per_second": 6.234,
"eval_steps_per_second": 0.399,
"step": 550
},
{
"epoch": 1.9243986254295533,
"grad_norm": 0.86328125,
"learning_rate": 1e-05,
"loss": 1.1665,
"step": 560
},
{
"epoch": 1.9587628865979383,
"grad_norm": 1.0546875,
"learning_rate": 1e-05,
"loss": 1.2012,
"step": 570
},
{
"epoch": 1.993127147766323,
"grad_norm": 1.40625,
"learning_rate": 1e-05,
"loss": 1.1156,
"step": 580
},
{
"epoch": 2.027491408934708,
"grad_norm": 0.462890625,
"learning_rate": 1e-05,
"loss": 1.8829,
"step": 590
},
{
"epoch": 2.0618556701030926,
"grad_norm": 0.61328125,
"learning_rate": 1e-05,
"loss": 1.3168,
"step": 600
},
{
"epoch": 2.0618556701030926,
"eval_loss": 1.3548240661621094,
"eval_runtime": 80.342,
"eval_samples_per_second": 6.223,
"eval_steps_per_second": 0.398,
"step": 600
},
{
"epoch": 2.0962199312714778,
"grad_norm": 0.9140625,
"learning_rate": 1e-05,
"loss": 1.1702,
"step": 610
},
{
"epoch": 2.1305841924398625,
"grad_norm": 1.2265625,
"learning_rate": 1e-05,
"loss": 1.1369,
"step": 620
},
{
"epoch": 2.1649484536082473,
"grad_norm": 1.484375,
"learning_rate": 1e-05,
"loss": 1.0219,
"step": 630
},
{
"epoch": 2.1993127147766325,
"grad_norm": 0.5234375,
"learning_rate": 1e-05,
"loss": 1.8862,
"step": 640
},
{
"epoch": 2.2336769759450172,
"grad_norm": 0.6640625,
"learning_rate": 1e-05,
"loss": 1.2818,
"step": 650
},
{
"epoch": 2.2336769759450172,
"eval_loss": 1.3763458728790283,
"eval_runtime": 80.2731,
"eval_samples_per_second": 6.229,
"eval_steps_per_second": 0.399,
"step": 650
},
{
"epoch": 2.268041237113402,
"grad_norm": 0.640625,
"learning_rate": 1e-05,
"loss": 1.088,
"step": 660
},
{
"epoch": 2.3024054982817868,
"grad_norm": 0.87890625,
"learning_rate": 1e-05,
"loss": 1.1099,
"step": 670
},
{
"epoch": 2.336769759450172,
"grad_norm": 1.265625,
"learning_rate": 1e-05,
"loss": 1.0243,
"step": 680
},
{
"epoch": 2.3711340206185567,
"grad_norm": 0.578125,
"learning_rate": 1e-05,
"loss": 1.9865,
"step": 690
},
{
"epoch": 2.4054982817869415,
"grad_norm": 0.671875,
"learning_rate": 1e-05,
"loss": 1.2409,
"step": 700
},
{
"epoch": 2.4054982817869415,
"eval_loss": 1.3502774238586426,
"eval_runtime": 79.9224,
"eval_samples_per_second": 6.256,
"eval_steps_per_second": 0.4,
"step": 700
},
{
"epoch": 2.4398625429553267,
"grad_norm": 0.73046875,
"learning_rate": 1e-05,
"loss": 1.1074,
"step": 710
},
{
"epoch": 2.4742268041237114,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 0.9593,
"step": 720
},
{
"epoch": 2.508591065292096,
"grad_norm": 1.265625,
"learning_rate": 1e-05,
"loss": 0.9963,
"step": 730
},
{
"epoch": 2.542955326460481,
"grad_norm": 0.61328125,
"learning_rate": 1e-05,
"loss": 1.6973,
"step": 740
},
{
"epoch": 2.5773195876288657,
"grad_norm": 0.890625,
"learning_rate": 1e-05,
"loss": 1.3532,
"step": 750
},
{
"epoch": 2.5773195876288657,
"eval_loss": 1.3376290798187256,
"eval_runtime": 79.8496,
"eval_samples_per_second": 6.262,
"eval_steps_per_second": 0.401,
"step": 750
},
{
"epoch": 2.611683848797251,
"grad_norm": 0.953125,
"learning_rate": 1e-05,
"loss": 1.0937,
"step": 760
},
{
"epoch": 2.6460481099656357,
"grad_norm": 1.0625,
"learning_rate": 1e-05,
"loss": 1.1459,
"step": 770
},
{
"epoch": 2.680412371134021,
"grad_norm": 1.3515625,
"learning_rate": 1e-05,
"loss": 1.0239,
"step": 780
},
{
"epoch": 2.7147766323024056,
"grad_norm": 0.52734375,
"learning_rate": 1e-05,
"loss": 1.8602,
"step": 790
},
{
"epoch": 2.7491408934707904,
"grad_norm": 0.71875,
"learning_rate": 1e-05,
"loss": 1.355,
"step": 800
},
{
"epoch": 2.7491408934707904,
"eval_loss": 1.3292906284332275,
"eval_runtime": 80.1225,
"eval_samples_per_second": 6.24,
"eval_steps_per_second": 0.399,
"step": 800
},
{
"epoch": 2.783505154639175,
"grad_norm": 0.7265625,
"learning_rate": 1e-05,
"loss": 1.0998,
"step": 810
},
{
"epoch": 2.81786941580756,
"grad_norm": 1.078125,
"learning_rate": 1e-05,
"loss": 0.9259,
"step": 820
},
{
"epoch": 2.852233676975945,
"grad_norm": 1.65625,
"learning_rate": 1e-05,
"loss": 0.9241,
"step": 830
},
{
"epoch": 2.88659793814433,
"grad_norm": 0.56640625,
"learning_rate": 1e-05,
"loss": 1.7542,
"step": 840
},
{
"epoch": 2.9209621993127146,
"grad_norm": 0.57421875,
"learning_rate": 1e-05,
"loss": 1.1226,
"step": 850
},
{
"epoch": 2.9209621993127146,
"eval_loss": 1.3402440547943115,
"eval_runtime": 81.6541,
"eval_samples_per_second": 6.123,
"eval_steps_per_second": 0.392,
"step": 850
},
{
"epoch": 2.9553264604811,
"grad_norm": 0.6484375,
"learning_rate": 1e-05,
"loss": 1.0165,
"step": 860
},
{
"epoch": 2.9896907216494846,
"grad_norm": 1.1328125,
"learning_rate": 1e-05,
"loss": 0.9868,
"step": 870
},
{
"epoch": 3.0240549828178693,
"grad_norm": 0.53125,
"learning_rate": 1e-05,
"loss": 1.8808,
"step": 880
},
{
"epoch": 3.058419243986254,
"grad_norm": 0.6171875,
"learning_rate": 1e-05,
"loss": 1.2961,
"step": 890
},
{
"epoch": 3.0927835051546393,
"grad_norm": 0.76953125,
"learning_rate": 1e-05,
"loss": 1.0729,
"step": 900
},
{
"epoch": 3.0927835051546393,
"eval_loss": 1.3085824251174927,
"eval_runtime": 80.7302,
"eval_samples_per_second": 6.193,
"eval_steps_per_second": 0.396,
"step": 900
},
{
"epoch": 3.127147766323024,
"grad_norm": 1.1015625,
"learning_rate": 1e-05,
"loss": 0.903,
"step": 910
},
{
"epoch": 3.161512027491409,
"grad_norm": 1.4921875,
"learning_rate": 1e-05,
"loss": 1.0791,
"step": 920
},
{
"epoch": 3.195876288659794,
"grad_norm": 0.55859375,
"learning_rate": 1e-05,
"loss": 1.8545,
"step": 930
},
{
"epoch": 3.2302405498281788,
"grad_norm": 0.84375,
"learning_rate": 1e-05,
"loss": 1.3069,
"step": 940
},
{
"epoch": 3.2646048109965635,
"grad_norm": 0.8671875,
"learning_rate": 1e-05,
"loss": 1.1804,
"step": 950
},
{
"epoch": 3.2646048109965635,
"eval_loss": 1.3051691055297852,
"eval_runtime": 80.7138,
"eval_samples_per_second": 6.195,
"eval_steps_per_second": 0.396,
"step": 950
},
{
"epoch": 3.2989690721649483,
"grad_norm": 1.3046875,
"learning_rate": 1e-05,
"loss": 1.1198,
"step": 960
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.0546875,
"learning_rate": 1e-05,
"loss": 0.8602,
"step": 970
},
{
"epoch": 3.3676975945017182,
"grad_norm": 0.5078125,
"learning_rate": 1e-05,
"loss": 1.6749,
"step": 980
},
{
"epoch": 3.402061855670103,
"grad_norm": 0.5859375,
"learning_rate": 1e-05,
"loss": 1.2621,
"step": 990
},
{
"epoch": 3.436426116838488,
"grad_norm": 0.71875,
"learning_rate": 1e-05,
"loss": 1.0859,
"step": 1000
},
{
"epoch": 3.436426116838488,
"eval_loss": 1.286286473274231,
"eval_runtime": 80.1366,
"eval_samples_per_second": 6.239,
"eval_steps_per_second": 0.399,
"step": 1000
},
{
"epoch": 3.470790378006873,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 1.065,
"step": 1010
},
{
"epoch": 3.5051546391752577,
"grad_norm": 1.28125,
"learning_rate": 1e-05,
"loss": 0.9891,
"step": 1020
},
{
"epoch": 3.5395189003436425,
"grad_norm": 0.52734375,
"learning_rate": 1e-05,
"loss": 1.7157,
"step": 1030
},
{
"epoch": 3.5738831615120272,
"grad_norm": 0.75,
"learning_rate": 1e-05,
"loss": 1.3258,
"step": 1040
},
{
"epoch": 3.6082474226804124,
"grad_norm": 0.99609375,
"learning_rate": 1e-05,
"loss": 1.0757,
"step": 1050
},
{
"epoch": 3.6082474226804124,
"eval_loss": 1.2840737104415894,
"eval_runtime": 80.3804,
"eval_samples_per_second": 6.22,
"eval_steps_per_second": 0.398,
"step": 1050
},
{
"epoch": 3.642611683848797,
"grad_norm": 0.87890625,
"learning_rate": 1e-05,
"loss": 0.8942,
"step": 1060
},
{
"epoch": 3.6769759450171824,
"grad_norm": 1.2890625,
"learning_rate": 1e-05,
"loss": 0.8901,
"step": 1070
},
{
"epoch": 3.711340206185567,
"grad_norm": 0.59375,
"learning_rate": 1e-05,
"loss": 1.7041,
"step": 1080
},
{
"epoch": 3.745704467353952,
"grad_norm": 0.59375,
"learning_rate": 1e-05,
"loss": 1.1786,
"step": 1090
},
{
"epoch": 3.7800687285223367,
"grad_norm": 0.67578125,
"learning_rate": 1e-05,
"loss": 1.0116,
"step": 1100
},
{
"epoch": 3.7800687285223367,
"eval_loss": 1.277146339416504,
"eval_runtime": 79.1842,
"eval_samples_per_second": 6.314,
"eval_steps_per_second": 0.404,
"step": 1100
},
{
"epoch": 3.8144329896907214,
"grad_norm": 0.7890625,
"learning_rate": 1e-05,
"loss": 1.0424,
"step": 1110
},
{
"epoch": 3.8487972508591066,
"grad_norm": 1.2578125,
"learning_rate": 1e-05,
"loss": 1.0771,
"step": 1120
},
{
"epoch": 3.8831615120274914,
"grad_norm": 0.48046875,
"learning_rate": 1e-05,
"loss": 1.8089,
"step": 1130
},
{
"epoch": 3.917525773195876,
"grad_norm": 0.73046875,
"learning_rate": 1e-05,
"loss": 1.3133,
"step": 1140
},
{
"epoch": 3.9518900343642613,
"grad_norm": 0.78515625,
"learning_rate": 1e-05,
"loss": 0.9659,
"step": 1150
},
{
"epoch": 3.9518900343642613,
"eval_loss": 1.291884183883667,
"eval_runtime": 80.2935,
"eval_samples_per_second": 6.227,
"eval_steps_per_second": 0.399,
"step": 1150
},
{
"epoch": 3.986254295532646,
"grad_norm": 0.875,
"learning_rate": 1e-05,
"loss": 0.9024,
"step": 1160
},
{
"epoch": 4.020618556701031,
"grad_norm": 0.578125,
"learning_rate": 1e-05,
"loss": 1.7178,
"step": 1170
},
{
"epoch": 4.054982817869416,
"grad_norm": 0.53515625,
"learning_rate": 1e-05,
"loss": 1.3409,
"step": 1180
},
{
"epoch": 4.0893470790378,
"grad_norm": 0.828125,
"learning_rate": 1e-05,
"loss": 1.0683,
"step": 1190
},
{
"epoch": 4.123711340206185,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 0.9598,
"step": 1200
},
{
"epoch": 4.123711340206185,
"eval_loss": 1.2872310876846313,
"eval_runtime": 79.8735,
"eval_samples_per_second": 6.26,
"eval_steps_per_second": 0.401,
"step": 1200
},
{
"epoch": 4.158075601374571,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 0.9059,
"step": 1210
},
{
"epoch": 4.1924398625429555,
"grad_norm": 0.53125,
"learning_rate": 1e-05,
"loss": 1.7104,
"step": 1220
},
{
"epoch": 4.22680412371134,
"grad_norm": 0.6484375,
"learning_rate": 1e-05,
"loss": 1.3306,
"step": 1230
},
{
"epoch": 4.261168384879725,
"grad_norm": 0.85546875,
"learning_rate": 1e-05,
"loss": 0.9887,
"step": 1240
},
{
"epoch": 4.29553264604811,
"grad_norm": 0.76171875,
"learning_rate": 1e-05,
"loss": 0.9614,
"step": 1250
},
{
"epoch": 4.29553264604811,
"eval_loss": 1.3099355697631836,
"eval_runtime": 80.6322,
"eval_samples_per_second": 6.201,
"eval_steps_per_second": 0.397,
"step": 1250
},
{
"epoch": 4.329896907216495,
"grad_norm": 0.9140625,
"learning_rate": 1e-05,
"loss": 0.9707,
"step": 1260
},
{
"epoch": 4.364261168384879,
"grad_norm": 0.47265625,
"learning_rate": 1e-05,
"loss": 1.678,
"step": 1270
},
{
"epoch": 4.398625429553265,
"grad_norm": 0.68359375,
"learning_rate": 1e-05,
"loss": 1.4278,
"step": 1280
},
{
"epoch": 4.43298969072165,
"grad_norm": 0.796875,
"learning_rate": 1e-05,
"loss": 1.145,
"step": 1290
},
{
"epoch": 4.4673539518900345,
"grad_norm": 0.95703125,
"learning_rate": 1e-05,
"loss": 1.0103,
"step": 1300
},
{
"epoch": 4.4673539518900345,
"eval_loss": 1.2861213684082031,
"eval_runtime": 80.5496,
"eval_samples_per_second": 6.207,
"eval_steps_per_second": 0.397,
"step": 1300
},
{
"epoch": 4.501718213058419,
"grad_norm": 0.734375,
"learning_rate": 1e-05,
"loss": 0.8367,
"step": 1310
},
{
"epoch": 4.536082474226804,
"grad_norm": 0.58984375,
"learning_rate": 1e-05,
"loss": 1.7564,
"step": 1320
},
{
"epoch": 4.570446735395189,
"grad_norm": 0.71484375,
"learning_rate": 1e-05,
"loss": 1.3221,
"step": 1330
},
{
"epoch": 4.6048109965635735,
"grad_norm": 0.78515625,
"learning_rate": 1e-05,
"loss": 1.0203,
"step": 1340
},
{
"epoch": 4.639175257731958,
"grad_norm": 0.89453125,
"learning_rate": 1e-05,
"loss": 0.9545,
"step": 1350
},
{
"epoch": 4.639175257731958,
"eval_loss": 1.255950927734375,
"eval_runtime": 79.395,
"eval_samples_per_second": 6.298,
"eval_steps_per_second": 0.403,
"step": 1350
},
{
"epoch": 4.673539518900344,
"grad_norm": 1.0703125,
"learning_rate": 1e-05,
"loss": 0.9984,
"step": 1360
},
{
"epoch": 4.707903780068729,
"grad_norm": 0.55078125,
"learning_rate": 1e-05,
"loss": 1.6831,
"step": 1370
},
{
"epoch": 4.742268041237113,
"grad_norm": 0.7578125,
"learning_rate": 1e-05,
"loss": 1.2777,
"step": 1380
},
{
"epoch": 4.776632302405498,
"grad_norm": 0.7734375,
"learning_rate": 1e-05,
"loss": 1.1472,
"step": 1390
},
{
"epoch": 4.810996563573883,
"grad_norm": 0.71484375,
"learning_rate": 1e-05,
"loss": 0.9402,
"step": 1400
},
{
"epoch": 4.810996563573883,
"eval_loss": 1.2922863960266113,
"eval_runtime": 80.5804,
"eval_samples_per_second": 6.205,
"eval_steps_per_second": 0.397,
"step": 1400
},
{
"epoch": 4.845360824742268,
"grad_norm": 0.8671875,
"learning_rate": 1e-05,
"loss": 0.8348,
"step": 1410
},
{
"epoch": 4.879725085910653,
"grad_norm": 0.5625,
"learning_rate": 1e-05,
"loss": 1.6422,
"step": 1420
},
{
"epoch": 4.914089347079038,
"grad_norm": 0.88671875,
"learning_rate": 1e-05,
"loss": 1.1908,
"step": 1430
},
{
"epoch": 4.948453608247423,
"grad_norm": 0.75390625,
"learning_rate": 1e-05,
"loss": 0.9643,
"step": 1440
},
{
"epoch": 4.982817869415808,
"grad_norm": 1.1953125,
"learning_rate": 1e-05,
"loss": 0.9506,
"step": 1450
},
{
"epoch": 4.982817869415808,
"eval_loss": 1.3255817890167236,
"eval_runtime": 81.1295,
"eval_samples_per_second": 6.163,
"eval_steps_per_second": 0.394,
"step": 1450
},
{
"epoch": 5.017182130584192,
"grad_norm": 0.52734375,
"learning_rate": 1e-05,
"loss": 1.5737,
"step": 1460
},
{
"epoch": 5.051546391752577,
"grad_norm": 0.68359375,
"learning_rate": 1e-05,
"loss": 1.3278,
"step": 1470
},
{
"epoch": 5.085910652920962,
"grad_norm": 0.71484375,
"learning_rate": 1e-05,
"loss": 1.0812,
"step": 1480
},
{
"epoch": 5.120274914089347,
"grad_norm": 0.90234375,
"learning_rate": 1e-05,
"loss": 0.9006,
"step": 1490
},
{
"epoch": 5.154639175257732,
"grad_norm": 0.8828125,
"learning_rate": 1e-05,
"loss": 0.9406,
"step": 1500
},
{
"epoch": 5.154639175257732,
"eval_loss": 1.252558708190918,
"eval_runtime": 80.1608,
"eval_samples_per_second": 6.237,
"eval_steps_per_second": 0.399,
"step": 1500
},
{
"epoch": 5.189003436426117,
"grad_norm": 0.494140625,
"learning_rate": 1e-05,
"loss": 1.5607,
"step": 1510
},
{
"epoch": 5.223367697594502,
"grad_norm": 0.59375,
"learning_rate": 1e-05,
"loss": 1.3299,
"step": 1520
},
{
"epoch": 5.257731958762887,
"grad_norm": 0.66796875,
"learning_rate": 1e-05,
"loss": 1.1667,
"step": 1530
},
{
"epoch": 5.292096219931271,
"grad_norm": 0.66796875,
"learning_rate": 1e-05,
"loss": 0.9309,
"step": 1540
},
{
"epoch": 5.326460481099656,
"grad_norm": 1.15625,
"learning_rate": 1e-05,
"loss": 0.8841,
"step": 1550
},
{
"epoch": 5.326460481099656,
"eval_loss": 1.2698159217834473,
"eval_runtime": 78.8603,
"eval_samples_per_second": 6.34,
"eval_steps_per_second": 0.406,
"step": 1550
},
{
"epoch": 5.360824742268041,
"grad_norm": 0.51171875,
"learning_rate": 1e-05,
"loss": 1.6537,
"step": 1560
},
{
"epoch": 5.3951890034364265,
"grad_norm": 0.65234375,
"learning_rate": 1e-05,
"loss": 1.3609,
"step": 1570
},
{
"epoch": 5.429553264604811,
"grad_norm": 0.7265625,
"learning_rate": 1e-05,
"loss": 1.0217,
"step": 1580
},
{
"epoch": 5.463917525773196,
"grad_norm": 0.87890625,
"learning_rate": 1e-05,
"loss": 0.9493,
"step": 1590
},
{
"epoch": 5.498281786941581,
"grad_norm": 1.0234375,
"learning_rate": 1e-05,
"loss": 0.9035,
"step": 1600
},
{
"epoch": 5.498281786941581,
"eval_loss": 1.273864984512329,
"eval_runtime": 81.3392,
"eval_samples_per_second": 6.147,
"eval_steps_per_second": 0.393,
"step": 1600
},
{
"epoch": 5.5326460481099655,
"grad_norm": 0.482421875,
"learning_rate": 1e-05,
"loss": 1.5479,
"step": 1610
},
{
"epoch": 5.56701030927835,
"grad_norm": 0.640625,
"learning_rate": 1e-05,
"loss": 1.4661,
"step": 1620
},
{
"epoch": 5.601374570446735,
"grad_norm": 0.7421875,
"learning_rate": 1e-05,
"loss": 1.068,
"step": 1630
},
{
"epoch": 5.63573883161512,
"grad_norm": 0.78515625,
"learning_rate": 1e-05,
"loss": 0.961,
"step": 1640
},
{
"epoch": 5.670103092783505,
"grad_norm": 0.953125,
"learning_rate": 1e-05,
"loss": 0.7961,
"step": 1650
},
{
"epoch": 5.670103092783505,
"eval_loss": 1.267844557762146,
"eval_runtime": 80.6166,
"eval_samples_per_second": 6.202,
"eval_steps_per_second": 0.397,
"step": 1650
},
{
"epoch": 5.70446735395189,
"grad_norm": 0.5625,
"learning_rate": 1e-05,
"loss": 1.6992,
"step": 1660
},
{
"epoch": 5.738831615120275,
"grad_norm": 0.6484375,
"learning_rate": 1e-05,
"loss": 1.4267,
"step": 1670
},
{
"epoch": 5.77319587628866,
"grad_norm": 1.0078125,
"learning_rate": 1e-05,
"loss": 1.0164,
"step": 1680
},
{
"epoch": 5.8075601374570445,
"grad_norm": 0.859375,
"learning_rate": 1e-05,
"loss": 0.9975,
"step": 1690
},
{
"epoch": 5.841924398625429,
"grad_norm": 0.84375,
"learning_rate": 1e-05,
"loss": 0.8743,
"step": 1700
},
{
"epoch": 5.841924398625429,
"eval_loss": 1.2871294021606445,
"eval_runtime": 80.7109,
"eval_samples_per_second": 6.195,
"eval_steps_per_second": 0.396,
"step": 1700
},
{
"epoch": 5.876288659793815,
"grad_norm": 0.435546875,
"learning_rate": 1e-05,
"loss": 1.5058,
"step": 1710
},
{
"epoch": 5.9106529209622,
"grad_norm": 0.58203125,
"learning_rate": 1e-05,
"loss": 1.2027,
"step": 1720
},
{
"epoch": 5.945017182130584,
"grad_norm": 0.7421875,
"learning_rate": 1e-05,
"loss": 1.0008,
"step": 1730
},
{
"epoch": 5.979381443298969,
"grad_norm": 0.9140625,
"learning_rate": 1e-05,
"loss": 0.975,
"step": 1740
},
{
"epoch": 6.013745704467354,
"grad_norm": 0.5078125,
"learning_rate": 1e-05,
"loss": 1.4782,
"step": 1750
},
{
"epoch": 6.013745704467354,
"eval_loss": 1.2818772792816162,
"eval_runtime": 79.8208,
"eval_samples_per_second": 6.264,
"eval_steps_per_second": 0.401,
"step": 1750
},
{
"epoch": 6.048109965635739,
"grad_norm": 0.6796875,
"learning_rate": 1e-05,
"loss": 1.4131,
"step": 1760
},
{
"epoch": 6.082474226804123,
"grad_norm": 0.69921875,
"learning_rate": 1e-05,
"loss": 1.0934,
"step": 1770
},
{
"epoch": 6.116838487972508,
"grad_norm": 0.7734375,
"learning_rate": 1e-05,
"loss": 0.9576,
"step": 1780
},
{
"epoch": 6.151202749140894,
"grad_norm": 1.1328125,
"learning_rate": 1e-05,
"loss": 0.8559,
"step": 1790
},
{
"epoch": 6.185567010309279,
"grad_norm": 0.63671875,
"learning_rate": 1e-05,
"loss": 1.485,
"step": 1800
},
{
"epoch": 6.185567010309279,
"eval_loss": 1.2550129890441895,
"eval_runtime": 80.5854,
"eval_samples_per_second": 6.205,
"eval_steps_per_second": 0.397,
"step": 1800
},
{
"epoch": 6.219931271477663,
"grad_norm": 0.69140625,
"learning_rate": 1e-05,
"loss": 1.5779,
"step": 1810
},
{
"epoch": 6.254295532646048,
"grad_norm": 0.7421875,
"learning_rate": 1e-05,
"loss": 1.1541,
"step": 1820
},
{
"epoch": 6.288659793814433,
"grad_norm": 0.84765625,
"learning_rate": 1e-05,
"loss": 1.0334,
"step": 1830
},
{
"epoch": 6.323024054982818,
"grad_norm": 1.28125,
"learning_rate": 1e-05,
"loss": 1.0781,
"step": 1840
},
{
"epoch": 6.357388316151202,
"grad_norm": 0.4453125,
"learning_rate": 1e-05,
"loss": 1.4498,
"step": 1850
},
{
"epoch": 6.357388316151202,
"eval_loss": 1.245072603225708,
"eval_runtime": 80.4884,
"eval_samples_per_second": 6.212,
"eval_steps_per_second": 0.398,
"step": 1850
},
{
"epoch": 6.391752577319588,
"grad_norm": 0.66015625,
"learning_rate": 1e-05,
"loss": 1.3156,
"step": 1860
},
{
"epoch": 6.426116838487973,
"grad_norm": 0.79296875,
"learning_rate": 1e-05,
"loss": 0.999,
"step": 1870
},
{
"epoch": 6.4604810996563575,
"grad_norm": 0.86328125,
"learning_rate": 1e-05,
"loss": 1.0062,
"step": 1880
},
{
"epoch": 6.494845360824742,
"grad_norm": 1.1484375,
"learning_rate": 1e-05,
"loss": 0.8547,
"step": 1890
},
{
"epoch": 6.529209621993127,
"grad_norm": 0.458984375,
"learning_rate": 1e-05,
"loss": 1.4964,
"step": 1900
},
{
"epoch": 6.529209621993127,
"eval_loss": 1.2364414930343628,
"eval_runtime": 79.7167,
"eval_samples_per_second": 6.272,
"eval_steps_per_second": 0.401,
"step": 1900
},
{
"epoch": 6.563573883161512,
"grad_norm": 0.63671875,
"learning_rate": 1e-05,
"loss": 1.3453,
"step": 1910
},
{
"epoch": 6.597938144329897,
"grad_norm": 0.953125,
"learning_rate": 1e-05,
"loss": 1.0536,
"step": 1920
},
{
"epoch": 6.632302405498281,
"grad_norm": 0.81640625,
"learning_rate": 1e-05,
"loss": 0.7931,
"step": 1930
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.953125,
"learning_rate": 1e-05,
"loss": 0.9338,
"step": 1940
},
{
"epoch": 6.701030927835052,
"grad_norm": 0.4921875,
"learning_rate": 1e-05,
"loss": 1.5344,
"step": 1950
},
{
"epoch": 6.701030927835052,
"eval_loss": 1.2666130065917969,
"eval_runtime": 80.835,
"eval_samples_per_second": 6.185,
"eval_steps_per_second": 0.396,
"step": 1950
},
{
"epoch": 6.7353951890034365,
"grad_norm": 0.640625,
"learning_rate": 1e-05,
"loss": 1.3881,
"step": 1960
},
{
"epoch": 6.769759450171821,
"grad_norm": 0.640625,
"learning_rate": 1e-05,
"loss": 1.0083,
"step": 1970
},
{
"epoch": 6.804123711340206,
"grad_norm": 0.671875,
"learning_rate": 1e-05,
"loss": 0.9552,
"step": 1980
},
{
"epoch": 6.838487972508591,
"grad_norm": 1.1796875,
"learning_rate": 1e-05,
"loss": 0.7725,
"step": 1990
},
{
"epoch": 6.872852233676976,
"grad_norm": 0.50390625,
"learning_rate": 1e-05,
"loss": 1.3508,
"step": 2000
},
{
"epoch": 6.872852233676976,
"eval_loss": 1.2704153060913086,
"eval_runtime": 80.3278,
"eval_samples_per_second": 6.224,
"eval_steps_per_second": 0.398,
"step": 2000
},
{
"epoch": 6.907216494845361,
"grad_norm": 0.6796875,
"learning_rate": 1e-05,
"loss": 1.3881,
"step": 2010
},
{
"epoch": 6.941580756013746,
"grad_norm": 0.7578125,
"learning_rate": 1e-05,
"loss": 0.9989,
"step": 2020
},
{
"epoch": 6.975945017182131,
"grad_norm": 1.09375,
"learning_rate": 1e-05,
"loss": 0.8806,
"step": 2030
},
{
"epoch": 7.010309278350515,
"grad_norm": 0.482421875,
"learning_rate": 1e-05,
"loss": 1.261,
"step": 2040
},
{
"epoch": 7.0446735395189,
"grad_norm": 0.54296875,
"learning_rate": 1e-05,
"loss": 1.4767,
"step": 2050
},
{
"epoch": 7.0446735395189,
"eval_loss": 1.280444860458374,
"eval_runtime": 79.4244,
"eval_samples_per_second": 6.295,
"eval_steps_per_second": 0.403,
"step": 2050
},
{
"epoch": 7.079037800687285,
"grad_norm": 0.8046875,
"learning_rate": 1e-05,
"loss": 1.0493,
"step": 2060
},
{
"epoch": 7.11340206185567,
"grad_norm": 0.96875,
"learning_rate": 1e-05,
"loss": 0.9817,
"step": 2070
},
{
"epoch": 7.147766323024055,
"grad_norm": 0.953125,
"learning_rate": 1e-05,
"loss": 0.942,
"step": 2080
},
{
"epoch": 7.18213058419244,
"grad_norm": 0.46875,
"learning_rate": 1e-05,
"loss": 1.3796,
"step": 2090
},
{
"epoch": 7.216494845360825,
"grad_norm": 0.61328125,
"learning_rate": 1e-05,
"loss": 1.5174,
"step": 2100
},
{
"epoch": 7.216494845360825,
"eval_loss": 1.2758172750473022,
"eval_runtime": 80.4337,
"eval_samples_per_second": 6.216,
"eval_steps_per_second": 0.398,
"step": 2100
},
{
"epoch": 7.25085910652921,
"grad_norm": 0.8671875,
"learning_rate": 1e-05,
"loss": 1.0615,
"step": 2110
},
{
"epoch": 7.285223367697594,
"grad_norm": 0.8828125,
"learning_rate": 1e-05,
"loss": 0.9334,
"step": 2120
},
{
"epoch": 7.319587628865979,
"grad_norm": 1.03125,
"learning_rate": 1e-05,
"loss": 0.9128,
"step": 2130
},
{
"epoch": 7.353951890034364,
"grad_norm": 0.470703125,
"learning_rate": 1e-05,
"loss": 1.2603,
"step": 2140
},
{
"epoch": 7.3883161512027495,
"grad_norm": 0.6953125,
"learning_rate": 1e-05,
"loss": 1.4707,
"step": 2150
},
{
"epoch": 7.3883161512027495,
"eval_loss": 1.2430615425109863,
"eval_runtime": 79.9744,
"eval_samples_per_second": 6.252,
"eval_steps_per_second": 0.4,
"step": 2150
},
{
"epoch": 7.422680412371134,
"grad_norm": 0.8359375,
"learning_rate": 1e-05,
"loss": 0.9619,
"step": 2160
},
{
"epoch": 7.457044673539519,
"grad_norm": 1.125,
"learning_rate": 1e-05,
"loss": 1.0213,
"step": 2170
},
{
"epoch": 7.491408934707904,
"grad_norm": 0.79296875,
"learning_rate": 1e-05,
"loss": 0.8239,
"step": 2180
},
{
"epoch": 7.525773195876289,
"grad_norm": 0.458984375,
"learning_rate": 1e-05,
"loss": 1.3376,
"step": 2190
},
{
"epoch": 7.560137457044673,
"grad_norm": 0.6328125,
"learning_rate": 1e-05,
"loss": 1.5446,
"step": 2200
},
{
"epoch": 7.560137457044673,
"eval_loss": 1.2588214874267578,
"eval_runtime": 80.0582,
"eval_samples_per_second": 6.245,
"eval_steps_per_second": 0.4,
"step": 2200
},
{
"epoch": 7.594501718213058,
"grad_norm": 0.7421875,
"learning_rate": 1e-05,
"loss": 1.1267,
"step": 2210
},
{
"epoch": 7.628865979381443,
"grad_norm": 0.765625,
"learning_rate": 1e-05,
"loss": 0.9444,
"step": 2220
},
{
"epoch": 7.6632302405498285,
"grad_norm": 1.03125,
"learning_rate": 1e-05,
"loss": 0.7487,
"step": 2230
},
{
"epoch": 7.697594501718213,
"grad_norm": 0.421875,
"learning_rate": 1e-05,
"loss": 1.3165,
"step": 2240
},
{
"epoch": 7.731958762886598,
"grad_norm": 0.65234375,
"learning_rate": 1e-05,
"loss": 1.504,
"step": 2250
},
{
"epoch": 7.731958762886598,
"eval_loss": 1.2653926610946655,
"eval_runtime": 79.9943,
"eval_samples_per_second": 6.25,
"eval_steps_per_second": 0.4,
"step": 2250
},
{
"epoch": 7.766323024054983,
"grad_norm": 0.734375,
"learning_rate": 1e-05,
"loss": 1.1336,
"step": 2260
},
{
"epoch": 7.8006872852233675,
"grad_norm": 0.86328125,
"learning_rate": 1e-05,
"loss": 0.9161,
"step": 2270
},
{
"epoch": 7.835051546391752,
"grad_norm": 0.89453125,
"learning_rate": 1e-05,
"loss": 0.8741,
"step": 2280
},
{
"epoch": 7.869415807560138,
"grad_norm": 0.435546875,
"learning_rate": 1e-05,
"loss": 1.3443,
"step": 2290
},
{
"epoch": 7.903780068728523,
"grad_norm": 0.703125,
"learning_rate": 1e-05,
"loss": 1.3251,
"step": 2300
},
{
"epoch": 7.903780068728523,
"eval_loss": 1.2268811464309692,
"eval_runtime": 80.5707,
"eval_samples_per_second": 6.206,
"eval_steps_per_second": 0.397,
"step": 2300
},
{
"epoch": 7.938144329896907,
"grad_norm": 0.6484375,
"learning_rate": 1e-05,
"loss": 1.0545,
"step": 2310
},
{
"epoch": 7.972508591065292,
"grad_norm": 1.0546875,
"learning_rate": 1e-05,
"loss": 0.7926,
"step": 2320
},
{
"epoch": 8.006872852233677,
"grad_norm": 0.43359375,
"learning_rate": 1e-05,
"loss": 1.1817,
"step": 2330
},
{
"epoch": 8.041237113402062,
"grad_norm": 0.52734375,
"learning_rate": 1e-05,
"loss": 1.6075,
"step": 2340
},
{
"epoch": 8.075601374570446,
"grad_norm": 0.671875,
"learning_rate": 1e-05,
"loss": 1.0504,
"step": 2350
},
{
"epoch": 8.075601374570446,
"eval_loss": 1.2449791431427002,
"eval_runtime": 78.7268,
"eval_samples_per_second": 6.351,
"eval_steps_per_second": 0.406,
"step": 2350
},
{
"epoch": 8.109965635738831,
"grad_norm": 0.8203125,
"learning_rate": 1e-05,
"loss": 0.9824,
"step": 2360
},
{
"epoch": 8.144329896907216,
"grad_norm": 0.9296875,
"learning_rate": 1e-05,
"loss": 0.8892,
"step": 2370
},
{
"epoch": 8.1786941580756,
"grad_norm": 0.400390625,
"learning_rate": 1e-05,
"loss": 1.2176,
"step": 2380
},
{
"epoch": 8.213058419243985,
"grad_norm": 0.51171875,
"learning_rate": 1e-05,
"loss": 1.5713,
"step": 2390
},
{
"epoch": 8.24742268041237,
"grad_norm": 0.8046875,
"learning_rate": 1e-05,
"loss": 1.0251,
"step": 2400
},
{
"epoch": 8.24742268041237,
"eval_loss": 1.2358765602111816,
"eval_runtime": 79.6888,
"eval_samples_per_second": 6.274,
"eval_steps_per_second": 0.402,
"step": 2400
},
{
"epoch": 8.281786941580757,
"grad_norm": 0.80859375,
"learning_rate": 1e-05,
"loss": 0.9868,
"step": 2410
},
{
"epoch": 8.316151202749142,
"grad_norm": 1.0,
"learning_rate": 1e-05,
"loss": 0.8865,
"step": 2420
},
{
"epoch": 8.350515463917526,
"grad_norm": 0.453125,
"learning_rate": 1e-05,
"loss": 1.1553,
"step": 2430
},
{
"epoch": 8.384879725085911,
"grad_norm": 0.53125,
"learning_rate": 1e-05,
"loss": 1.6919,
"step": 2440
},
{
"epoch": 8.419243986254296,
"grad_norm": 0.7421875,
"learning_rate": 1e-05,
"loss": 1.1336,
"step": 2450
},
{
"epoch": 8.419243986254296,
"eval_loss": 1.2419917583465576,
"eval_runtime": 80.2326,
"eval_samples_per_second": 6.232,
"eval_steps_per_second": 0.399,
"step": 2450
},
{
"epoch": 8.45360824742268,
"grad_norm": 0.77734375,
"learning_rate": 1e-05,
"loss": 0.9585,
"step": 2460
},
{
"epoch": 8.487972508591065,
"grad_norm": 0.83984375,
"learning_rate": 1e-05,
"loss": 0.8052,
"step": 2470
},
{
"epoch": 8.52233676975945,
"grad_norm": 0.46875,
"learning_rate": 1e-05,
"loss": 1.1585,
"step": 2480
},
{
"epoch": 8.556701030927835,
"grad_norm": 0.62890625,
"learning_rate": 1e-05,
"loss": 1.4953,
"step": 2490
},
{
"epoch": 8.59106529209622,
"grad_norm": 0.71484375,
"learning_rate": 1e-05,
"loss": 1.082,
"step": 2500
},
{
"epoch": 8.59106529209622,
"eval_loss": 1.226503849029541,
"eval_runtime": 79.806,
"eval_samples_per_second": 6.265,
"eval_steps_per_second": 0.401,
"step": 2500
},
{
"epoch": 8.625429553264604,
"grad_norm": 0.84375,
"learning_rate": 1e-05,
"loss": 1.0459,
"step": 2510
},
{
"epoch": 8.65979381443299,
"grad_norm": 1.1640625,
"learning_rate": 1e-05,
"loss": 0.8447,
"step": 2520
},
{
"epoch": 8.694158075601374,
"grad_norm": 0.431640625,
"learning_rate": 1e-05,
"loss": 1.2142,
"step": 2530
},
{
"epoch": 8.728522336769759,
"grad_norm": 0.69140625,
"learning_rate": 1e-05,
"loss": 1.6356,
"step": 2540
},
{
"epoch": 8.762886597938145,
"grad_norm": 0.7265625,
"learning_rate": 1e-05,
"loss": 1.0244,
"step": 2550
},
{
"epoch": 8.762886597938145,
"eval_loss": 1.23357093334198,
"eval_runtime": 80.1976,
"eval_samples_per_second": 6.235,
"eval_steps_per_second": 0.399,
"step": 2550
},
{
"epoch": 8.79725085910653,
"grad_norm": 0.79296875,
"learning_rate": 1e-05,
"loss": 0.8479,
"step": 2560
},
{
"epoch": 8.831615120274915,
"grad_norm": 0.9921875,
"learning_rate": 1e-05,
"loss": 0.8469,
"step": 2570
},
{
"epoch": 8.8659793814433,
"grad_norm": 0.54296875,
"learning_rate": 1e-05,
"loss": 1.0058,
"step": 2580
},
{
"epoch": 8.900343642611684,
"grad_norm": 0.609375,
"learning_rate": 1e-05,
"loss": 1.4478,
"step": 2590
},
{
"epoch": 8.934707903780069,
"grad_norm": 0.7734375,
"learning_rate": 1e-05,
"loss": 1.0217,
"step": 2600
},
{
"epoch": 8.934707903780069,
"eval_loss": 1.2186033725738525,
"eval_runtime": 79.9235,
"eval_samples_per_second": 6.256,
"eval_steps_per_second": 0.4,
"step": 2600
},
{
"epoch": 8.969072164948454,
"grad_norm": 1.03125,
"learning_rate": 1e-05,
"loss": 0.8485,
"step": 2610
},
{
"epoch": 9.003436426116838,
"grad_norm": 0.412109375,
"learning_rate": 1e-05,
"loss": 1.0749,
"step": 2620
},
{
"epoch": 9.037800687285223,
"grad_norm": 0.515625,
"learning_rate": 1e-05,
"loss": 1.6991,
"step": 2630
},
{
"epoch": 9.072164948453608,
"grad_norm": 0.546875,
"learning_rate": 1e-05,
"loss": 1.0847,
"step": 2640
},
{
"epoch": 9.106529209621993,
"grad_norm": 0.82421875,
"learning_rate": 1e-05,
"loss": 0.9059,
"step": 2650
},
{
"epoch": 9.106529209621993,
"eval_loss": 1.2460401058197021,
"eval_runtime": 79.7952,
"eval_samples_per_second": 6.266,
"eval_steps_per_second": 0.401,
"step": 2650
},
{
"epoch": 9.140893470790378,
"grad_norm": 0.83984375,
"learning_rate": 1e-05,
"loss": 0.9823,
"step": 2660
},
{
"epoch": 9.175257731958762,
"grad_norm": 0.42578125,
"learning_rate": 1e-05,
"loss": 0.9337,
"step": 2670
},
{
"epoch": 9.209621993127147,
"grad_norm": 0.625,
"learning_rate": 1e-05,
"loss": 1.7176,
"step": 2680
},
{
"epoch": 9.243986254295532,
"grad_norm": 0.921875,
"learning_rate": 1e-05,
"loss": 1.1614,
"step": 2690
},
{
"epoch": 9.278350515463918,
"grad_norm": 0.859375,
"learning_rate": 1e-05,
"loss": 0.9592,
"step": 2700
},
{
"epoch": 9.278350515463918,
"eval_loss": 1.2704145908355713,
"eval_runtime": 81.332,
"eval_samples_per_second": 6.148,
"eval_steps_per_second": 0.393,
"step": 2700
},
{
"epoch": 9.312714776632303,
"grad_norm": 0.90234375,
"learning_rate": 1e-05,
"loss": 0.7952,
"step": 2710
},
{
"epoch": 9.347079037800688,
"grad_norm": 0.375,
"learning_rate": 1e-05,
"loss": 1.0608,
"step": 2720
},
{
"epoch": 9.381443298969073,
"grad_norm": 0.546875,
"learning_rate": 1e-05,
"loss": 1.6669,
"step": 2730
},
{
"epoch": 9.415807560137457,
"grad_norm": 0.6796875,
"learning_rate": 1e-05,
"loss": 1.0593,
"step": 2740
},
{
"epoch": 9.450171821305842,
"grad_norm": 0.8359375,
"learning_rate": 1e-05,
"loss": 0.9128,
"step": 2750
},
{
"epoch": 9.450171821305842,
"eval_loss": 1.2561514377593994,
"eval_runtime": 79.3208,
"eval_samples_per_second": 6.304,
"eval_steps_per_second": 0.403,
"step": 2750
},
{
"epoch": 9.484536082474227,
"grad_norm": 0.98046875,
"learning_rate": 1e-05,
"loss": 0.7417,
"step": 2760
},
{
"epoch": 9.518900343642612,
"grad_norm": 0.423828125,
"learning_rate": 1e-05,
"loss": 1.0339,
"step": 2770
},
{
"epoch": 9.553264604810996,
"grad_norm": 0.5546875,
"learning_rate": 1e-05,
"loss": 1.6701,
"step": 2780
},
{
"epoch": 9.587628865979381,
"grad_norm": 0.890625,
"learning_rate": 1e-05,
"loss": 1.1387,
"step": 2790
},
{
"epoch": 9.621993127147766,
"grad_norm": 0.87109375,
"learning_rate": 1e-05,
"loss": 0.9777,
"step": 2800
},
{
"epoch": 9.621993127147766,
"eval_loss": 1.2424900531768799,
"eval_runtime": 79.3914,
"eval_samples_per_second": 6.298,
"eval_steps_per_second": 0.403,
"step": 2800
},
{
"epoch": 9.65635738831615,
"grad_norm": 1.015625,
"learning_rate": 1e-05,
"loss": 0.8611,
"step": 2810
},
{
"epoch": 9.690721649484535,
"grad_norm": 0.416015625,
"learning_rate": 1e-05,
"loss": 0.9828,
"step": 2820
},
{
"epoch": 9.72508591065292,
"grad_norm": 0.515625,
"learning_rate": 1e-05,
"loss": 1.5909,
"step": 2830
},
{
"epoch": 9.759450171821307,
"grad_norm": 0.78515625,
"learning_rate": 1e-05,
"loss": 1.082,
"step": 2840
},
{
"epoch": 9.793814432989691,
"grad_norm": 0.64453125,
"learning_rate": 1e-05,
"loss": 0.8769,
"step": 2850
},
{
"epoch": 9.793814432989691,
"eval_loss": 1.2342780828475952,
"eval_runtime": 80.7675,
"eval_samples_per_second": 6.191,
"eval_steps_per_second": 0.396,
"step": 2850
},
{
"epoch": 9.828178694158076,
"grad_norm": 0.94140625,
"learning_rate": 1e-05,
"loss": 0.8709,
"step": 2860
},
{
"epoch": 9.862542955326461,
"grad_norm": 0.412109375,
"learning_rate": 1e-05,
"loss": 0.9289,
"step": 2870
},
{
"epoch": 9.896907216494846,
"grad_norm": 0.51171875,
"learning_rate": 1e-05,
"loss": 1.6699,
"step": 2880
},
{
"epoch": 9.93127147766323,
"grad_norm": 0.8359375,
"learning_rate": 1e-05,
"loss": 0.9995,
"step": 2890
},
{
"epoch": 9.965635738831615,
"grad_norm": 0.96484375,
"learning_rate": 1e-05,
"loss": 1.0169,
"step": 2900
},
{
"epoch": 9.965635738831615,
"eval_loss": 1.254168152809143,
"eval_runtime": 79.4694,
"eval_samples_per_second": 6.292,
"eval_steps_per_second": 0.403,
"step": 2900
}
],
"logging_steps": 10,
"max_steps": 2910,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.007961726071603e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}