Llama_3.2_1B_Fine-tune_SQuAD / trainer_state.json
Salmamoori's picture
Upload folder using huggingface_hub
55c645f verified
{
"best_metric": Infinity,
"best_model_checkpoint": null,
"epoch": 2.9979908675799085,
"eval_steps": 50,
"global_step": 4104,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0073059360730593605,
"grad_norm": NaN,
"learning_rate": 2e-05,
"loss": 0.0,
"step": 10
},
{
"epoch": 0.014611872146118721,
"grad_norm": NaN,
"learning_rate": 4e-05,
"loss": 0.0,
"step": 20
},
{
"epoch": 0.021917808219178082,
"grad_norm": NaN,
"learning_rate": 6e-05,
"loss": 0.0,
"step": 30
},
{
"epoch": 0.029223744292237442,
"grad_norm": NaN,
"learning_rate": 8e-05,
"loss": 0.0,
"step": 40
},
{
"epoch": 0.0365296803652968,
"grad_norm": NaN,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 50
},
{
"epoch": 0.0365296803652968,
"eval_loss": NaN,
"eval_runtime": 86.8266,
"eval_samples_per_second": 121.737,
"eval_steps_per_second": 7.613,
"step": 50
},
{
"epoch": 0.043835616438356165,
"grad_norm": NaN,
"learning_rate": 9.97533300444006e-05,
"loss": 0.0,
"step": 60
},
{
"epoch": 0.05114155251141553,
"grad_norm": NaN,
"learning_rate": 9.950666008880118e-05,
"loss": 0.0,
"step": 70
},
{
"epoch": 0.058447488584474884,
"grad_norm": NaN,
"learning_rate": 9.925999013320178e-05,
"loss": 0.0,
"step": 80
},
{
"epoch": 0.06575342465753424,
"grad_norm": NaN,
"learning_rate": 9.901332017760238e-05,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.0730593607305936,
"grad_norm": NaN,
"learning_rate": 9.876665022200296e-05,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.0730593607305936,
"eval_loss": NaN,
"eval_runtime": 89.5193,
"eval_samples_per_second": 118.075,
"eval_steps_per_second": 7.384,
"step": 100
},
{
"epoch": 0.08036529680365297,
"grad_norm": NaN,
"learning_rate": 9.851998026640355e-05,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.08767123287671233,
"grad_norm": NaN,
"learning_rate": 9.827331031080415e-05,
"loss": 0.0,
"step": 120
},
{
"epoch": 0.09497716894977169,
"grad_norm": NaN,
"learning_rate": 9.802664035520473e-05,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.10228310502283106,
"grad_norm": NaN,
"learning_rate": 9.777997039960533e-05,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.1095890410958904,
"grad_norm": NaN,
"learning_rate": 9.753330044400593e-05,
"loss": 0.0,
"step": 150
},
{
"epoch": 0.1095890410958904,
"eval_loss": NaN,
"eval_runtime": 89.6606,
"eval_samples_per_second": 117.889,
"eval_steps_per_second": 7.372,
"step": 150
},
{
"epoch": 0.11689497716894977,
"grad_norm": NaN,
"learning_rate": 9.728663048840652e-05,
"loss": 0.0,
"step": 160
},
{
"epoch": 0.12420091324200913,
"grad_norm": NaN,
"learning_rate": 9.70399605328071e-05,
"loss": 0.0,
"step": 170
},
{
"epoch": 0.13150684931506848,
"grad_norm": NaN,
"learning_rate": 9.67932905772077e-05,
"loss": 0.0,
"step": 180
},
{
"epoch": 0.13881278538812786,
"grad_norm": NaN,
"learning_rate": 9.65466206216083e-05,
"loss": 0.0,
"step": 190
},
{
"epoch": 0.1461187214611872,
"grad_norm": NaN,
"learning_rate": 9.629995066600888e-05,
"loss": 0.0,
"step": 200
},
{
"epoch": 0.1461187214611872,
"eval_loss": NaN,
"eval_runtime": 87.4596,
"eval_samples_per_second": 120.856,
"eval_steps_per_second": 7.558,
"step": 200
},
{
"epoch": 0.15342465753424658,
"grad_norm": NaN,
"learning_rate": 9.605328071040948e-05,
"loss": 0.0,
"step": 210
},
{
"epoch": 0.16073059360730593,
"grad_norm": NaN,
"learning_rate": 9.580661075481007e-05,
"loss": 0.0,
"step": 220
},
{
"epoch": 0.1680365296803653,
"grad_norm": NaN,
"learning_rate": 9.555994079921066e-05,
"loss": 0.0,
"step": 230
},
{
"epoch": 0.17534246575342466,
"grad_norm": NaN,
"learning_rate": 9.531327084361125e-05,
"loss": 0.0,
"step": 240
},
{
"epoch": 0.182648401826484,
"grad_norm": NaN,
"learning_rate": 9.506660088801185e-05,
"loss": 0.0,
"step": 250
},
{
"epoch": 0.182648401826484,
"eval_loss": NaN,
"eval_runtime": 87.3071,
"eval_samples_per_second": 121.067,
"eval_steps_per_second": 7.571,
"step": 250
},
{
"epoch": 0.18995433789954339,
"grad_norm": NaN,
"learning_rate": 9.481993093241244e-05,
"loss": 0.0,
"step": 260
},
{
"epoch": 0.19726027397260273,
"grad_norm": NaN,
"learning_rate": 9.457326097681303e-05,
"loss": 0.0,
"step": 270
},
{
"epoch": 0.2045662100456621,
"grad_norm": NaN,
"learning_rate": 9.432659102121362e-05,
"loss": 0.0,
"step": 280
},
{
"epoch": 0.21187214611872146,
"grad_norm": NaN,
"learning_rate": 9.40799210656142e-05,
"loss": 0.0,
"step": 290
},
{
"epoch": 0.2191780821917808,
"grad_norm": NaN,
"learning_rate": 9.38332511100148e-05,
"loss": 0.0,
"step": 300
},
{
"epoch": 0.2191780821917808,
"eval_loss": NaN,
"eval_runtime": 87.4316,
"eval_samples_per_second": 120.895,
"eval_steps_per_second": 7.56,
"step": 300
},
{
"epoch": 0.2264840182648402,
"grad_norm": NaN,
"learning_rate": 9.35865811544154e-05,
"loss": 0.0,
"step": 310
},
{
"epoch": 0.23378995433789954,
"grad_norm": NaN,
"learning_rate": 9.3339911198816e-05,
"loss": 0.0,
"step": 320
},
{
"epoch": 0.2410958904109589,
"grad_norm": NaN,
"learning_rate": 9.309324124321658e-05,
"loss": 0.0,
"step": 330
},
{
"epoch": 0.24840182648401826,
"grad_norm": NaN,
"learning_rate": 9.284657128761717e-05,
"loss": 0.0,
"step": 340
},
{
"epoch": 0.2557077625570776,
"grad_norm": NaN,
"learning_rate": 9.259990133201777e-05,
"loss": 0.0,
"step": 350
},
{
"epoch": 0.2557077625570776,
"eval_loss": NaN,
"eval_runtime": 89.7649,
"eval_samples_per_second": 117.752,
"eval_steps_per_second": 7.364,
"step": 350
},
{
"epoch": 0.26301369863013696,
"grad_norm": NaN,
"learning_rate": 9.235323137641837e-05,
"loss": 0.0,
"step": 360
},
{
"epoch": 0.27031963470319637,
"grad_norm": NaN,
"learning_rate": 9.210656142081895e-05,
"loss": 0.0,
"step": 370
},
{
"epoch": 0.2776255707762557,
"grad_norm": NaN,
"learning_rate": 9.185989146521954e-05,
"loss": 0.0,
"step": 380
},
{
"epoch": 0.28493150684931506,
"grad_norm": NaN,
"learning_rate": 9.161322150962013e-05,
"loss": 0.0,
"step": 390
},
{
"epoch": 0.2922374429223744,
"grad_norm": NaN,
"learning_rate": 9.136655155402072e-05,
"loss": 0.0,
"step": 400
},
{
"epoch": 0.2922374429223744,
"eval_loss": NaN,
"eval_runtime": 89.6739,
"eval_samples_per_second": 117.871,
"eval_steps_per_second": 7.371,
"step": 400
},
{
"epoch": 0.29954337899543376,
"grad_norm": NaN,
"learning_rate": 9.111988159842132e-05,
"loss": 0.0,
"step": 410
},
{
"epoch": 0.30684931506849317,
"grad_norm": NaN,
"learning_rate": 9.087321164282192e-05,
"loss": 0.0,
"step": 420
},
{
"epoch": 0.3141552511415525,
"grad_norm": NaN,
"learning_rate": 9.06265416872225e-05,
"loss": 0.0,
"step": 430
},
{
"epoch": 0.32146118721461187,
"grad_norm": NaN,
"learning_rate": 9.03798717316231e-05,
"loss": 0.0,
"step": 440
},
{
"epoch": 0.3287671232876712,
"grad_norm": NaN,
"learning_rate": 9.013320177602368e-05,
"loss": 0.0,
"step": 450
},
{
"epoch": 0.3287671232876712,
"eval_loss": NaN,
"eval_runtime": 87.7201,
"eval_samples_per_second": 120.497,
"eval_steps_per_second": 7.535,
"step": 450
},
{
"epoch": 0.3360730593607306,
"grad_norm": NaN,
"learning_rate": 8.988653182042427e-05,
"loss": 0.0,
"step": 460
},
{
"epoch": 0.34337899543378997,
"grad_norm": NaN,
"learning_rate": 8.963986186482487e-05,
"loss": 0.0,
"step": 470
},
{
"epoch": 0.3506849315068493,
"grad_norm": NaN,
"learning_rate": 8.939319190922547e-05,
"loss": 0.0,
"step": 480
},
{
"epoch": 0.35799086757990867,
"grad_norm": NaN,
"learning_rate": 8.914652195362605e-05,
"loss": 0.0,
"step": 490
},
{
"epoch": 0.365296803652968,
"grad_norm": NaN,
"learning_rate": 8.889985199802664e-05,
"loss": 0.0,
"step": 500
},
{
"epoch": 0.365296803652968,
"eval_loss": NaN,
"eval_runtime": 87.4748,
"eval_samples_per_second": 120.835,
"eval_steps_per_second": 7.556,
"step": 500
},
{
"epoch": 0.3726027397260274,
"grad_norm": NaN,
"learning_rate": 8.865318204242724e-05,
"loss": 0.0,
"step": 510
},
{
"epoch": 0.37990867579908677,
"grad_norm": NaN,
"learning_rate": 8.840651208682784e-05,
"loss": 0.0,
"step": 520
},
{
"epoch": 0.3872146118721461,
"grad_norm": NaN,
"learning_rate": 8.815984213122842e-05,
"loss": 0.0,
"step": 530
},
{
"epoch": 0.39452054794520547,
"grad_norm": NaN,
"learning_rate": 8.791317217562902e-05,
"loss": 0.0,
"step": 540
},
{
"epoch": 0.4018264840182648,
"grad_norm": NaN,
"learning_rate": 8.76665022200296e-05,
"loss": 0.0,
"step": 550
},
{
"epoch": 0.4018264840182648,
"eval_loss": NaN,
"eval_runtime": 87.3047,
"eval_samples_per_second": 121.07,
"eval_steps_per_second": 7.571,
"step": 550
},
{
"epoch": 0.4091324200913242,
"grad_norm": NaN,
"learning_rate": 8.74198322644302e-05,
"loss": 0.0,
"step": 560
},
{
"epoch": 0.41643835616438357,
"grad_norm": NaN,
"learning_rate": 8.717316230883079e-05,
"loss": 0.0,
"step": 570
},
{
"epoch": 0.4237442922374429,
"grad_norm": NaN,
"learning_rate": 8.692649235323139e-05,
"loss": 0.0,
"step": 580
},
{
"epoch": 0.43105022831050227,
"grad_norm": NaN,
"learning_rate": 8.667982239763197e-05,
"loss": 0.0,
"step": 590
},
{
"epoch": 0.4383561643835616,
"grad_norm": NaN,
"learning_rate": 8.643315244203257e-05,
"loss": 0.0,
"step": 600
},
{
"epoch": 0.4383561643835616,
"eval_loss": NaN,
"eval_runtime": 87.4314,
"eval_samples_per_second": 120.895,
"eval_steps_per_second": 7.56,
"step": 600
},
{
"epoch": 0.445662100456621,
"grad_norm": NaN,
"learning_rate": 8.618648248643315e-05,
"loss": 0.0,
"step": 610
},
{
"epoch": 0.4529680365296804,
"grad_norm": NaN,
"learning_rate": 8.593981253083376e-05,
"loss": 0.0,
"step": 620
},
{
"epoch": 0.4602739726027397,
"grad_norm": NaN,
"learning_rate": 8.569314257523434e-05,
"loss": 0.0,
"step": 630
},
{
"epoch": 0.46757990867579907,
"grad_norm": NaN,
"learning_rate": 8.544647261963494e-05,
"loss": 0.0,
"step": 640
},
{
"epoch": 0.4748858447488584,
"grad_norm": NaN,
"learning_rate": 8.519980266403552e-05,
"loss": 0.0,
"step": 650
},
{
"epoch": 0.4748858447488584,
"eval_loss": NaN,
"eval_runtime": 88.9686,
"eval_samples_per_second": 118.806,
"eval_steps_per_second": 7.43,
"step": 650
},
{
"epoch": 0.4821917808219178,
"grad_norm": NaN,
"learning_rate": 8.495313270843612e-05,
"loss": 0.0,
"step": 660
},
{
"epoch": 0.4894977168949772,
"grad_norm": NaN,
"learning_rate": 8.470646275283671e-05,
"loss": 0.0,
"step": 670
},
{
"epoch": 0.4968036529680365,
"grad_norm": NaN,
"learning_rate": 8.445979279723731e-05,
"loss": 0.0,
"step": 680
},
{
"epoch": 0.5041095890410959,
"grad_norm": NaN,
"learning_rate": 8.421312284163789e-05,
"loss": 0.0,
"step": 690
},
{
"epoch": 0.5114155251141552,
"grad_norm": NaN,
"learning_rate": 8.396645288603849e-05,
"loss": 0.0,
"step": 700
},
{
"epoch": 0.5114155251141552,
"eval_loss": NaN,
"eval_runtime": 89.7675,
"eval_samples_per_second": 117.749,
"eval_steps_per_second": 7.363,
"step": 700
},
{
"epoch": 0.5187214611872146,
"grad_norm": NaN,
"learning_rate": 8.371978293043907e-05,
"loss": 0.0,
"step": 710
},
{
"epoch": 0.5260273972602739,
"grad_norm": NaN,
"learning_rate": 8.347311297483968e-05,
"loss": 0.0,
"step": 720
},
{
"epoch": 0.5333333333333333,
"grad_norm": NaN,
"learning_rate": 8.322644301924026e-05,
"loss": 0.0,
"step": 730
},
{
"epoch": 0.5406392694063927,
"grad_norm": NaN,
"learning_rate": 8.297977306364086e-05,
"loss": 0.0,
"step": 740
},
{
"epoch": 0.547945205479452,
"grad_norm": NaN,
"learning_rate": 8.273310310804144e-05,
"loss": 0.0,
"step": 750
},
{
"epoch": 0.547945205479452,
"eval_loss": NaN,
"eval_runtime": 89.6344,
"eval_samples_per_second": 117.923,
"eval_steps_per_second": 7.374,
"step": 750
},
{
"epoch": 0.5552511415525114,
"grad_norm": NaN,
"learning_rate": 8.248643315244204e-05,
"loss": 0.0,
"step": 760
},
{
"epoch": 0.5625570776255707,
"grad_norm": NaN,
"learning_rate": 8.223976319684262e-05,
"loss": 0.0,
"step": 770
},
{
"epoch": 0.5698630136986301,
"grad_norm": NaN,
"learning_rate": 8.199309324124323e-05,
"loss": 0.0,
"step": 780
},
{
"epoch": 0.5771689497716895,
"grad_norm": NaN,
"learning_rate": 8.174642328564381e-05,
"loss": 0.0,
"step": 790
},
{
"epoch": 0.5844748858447488,
"grad_norm": NaN,
"learning_rate": 8.149975333004441e-05,
"loss": 0.0,
"step": 800
},
{
"epoch": 0.5844748858447488,
"eval_loss": NaN,
"eval_runtime": 87.5041,
"eval_samples_per_second": 120.794,
"eval_steps_per_second": 7.554,
"step": 800
},
{
"epoch": 0.5917808219178082,
"grad_norm": NaN,
"learning_rate": 8.125308337444499e-05,
"loss": 0.0,
"step": 810
},
{
"epoch": 0.5990867579908675,
"grad_norm": NaN,
"learning_rate": 8.100641341884559e-05,
"loss": 0.0,
"step": 820
},
{
"epoch": 0.6063926940639269,
"grad_norm": NaN,
"learning_rate": 8.075974346324618e-05,
"loss": 0.0,
"step": 830
},
{
"epoch": 0.6136986301369863,
"grad_norm": NaN,
"learning_rate": 8.051307350764678e-05,
"loss": 0.0,
"step": 840
},
{
"epoch": 0.6210045662100456,
"grad_norm": NaN,
"learning_rate": 8.026640355204736e-05,
"loss": 0.0,
"step": 850
},
{
"epoch": 0.6210045662100456,
"eval_loss": NaN,
"eval_runtime": 87.6119,
"eval_samples_per_second": 120.646,
"eval_steps_per_second": 7.545,
"step": 850
},
{
"epoch": 0.628310502283105,
"grad_norm": NaN,
"learning_rate": 8.001973359644796e-05,
"loss": 0.0,
"step": 860
},
{
"epoch": 0.6356164383561644,
"grad_norm": NaN,
"learning_rate": 7.977306364084854e-05,
"loss": 0.0,
"step": 870
},
{
"epoch": 0.6429223744292237,
"grad_norm": NaN,
"learning_rate": 7.952639368524915e-05,
"loss": 0.0,
"step": 880
},
{
"epoch": 0.6502283105022831,
"grad_norm": NaN,
"learning_rate": 7.927972372964973e-05,
"loss": 0.0,
"step": 890
},
{
"epoch": 0.6575342465753424,
"grad_norm": NaN,
"learning_rate": 7.903305377405033e-05,
"loss": 0.0,
"step": 900
},
{
"epoch": 0.6575342465753424,
"eval_loss": NaN,
"eval_runtime": 87.2395,
"eval_samples_per_second": 121.161,
"eval_steps_per_second": 7.577,
"step": 900
},
{
"epoch": 0.6648401826484018,
"grad_norm": NaN,
"learning_rate": 7.878638381845091e-05,
"loss": 0.0,
"step": 910
},
{
"epoch": 0.6721461187214612,
"grad_norm": NaN,
"learning_rate": 7.853971386285151e-05,
"loss": 0.0,
"step": 920
},
{
"epoch": 0.6794520547945205,
"grad_norm": NaN,
"learning_rate": 7.829304390725209e-05,
"loss": 0.0,
"step": 930
},
{
"epoch": 0.6867579908675799,
"grad_norm": NaN,
"learning_rate": 7.80463739516527e-05,
"loss": 0.0,
"step": 940
},
{
"epoch": 0.6940639269406392,
"grad_norm": NaN,
"learning_rate": 7.779970399605328e-05,
"loss": 0.0,
"step": 950
},
{
"epoch": 0.6940639269406392,
"eval_loss": NaN,
"eval_runtime": 88.6493,
"eval_samples_per_second": 119.234,
"eval_steps_per_second": 7.456,
"step": 950
},
{
"epoch": 0.7013698630136986,
"grad_norm": NaN,
"learning_rate": 7.755303404045388e-05,
"loss": 0.0,
"step": 960
},
{
"epoch": 0.708675799086758,
"grad_norm": NaN,
"learning_rate": 7.730636408485446e-05,
"loss": 0.0,
"step": 970
},
{
"epoch": 0.7159817351598173,
"grad_norm": NaN,
"learning_rate": 7.705969412925506e-05,
"loss": 0.0,
"step": 980
},
{
"epoch": 0.7232876712328767,
"grad_norm": NaN,
"learning_rate": 7.681302417365566e-05,
"loss": 0.0,
"step": 990
},
{
"epoch": 0.730593607305936,
"grad_norm": NaN,
"learning_rate": 7.656635421805625e-05,
"loss": 0.0,
"step": 1000
},
{
"epoch": 0.730593607305936,
"eval_loss": NaN,
"eval_runtime": 89.7851,
"eval_samples_per_second": 117.726,
"eval_steps_per_second": 7.362,
"step": 1000
},
{
"epoch": 0.7378995433789954,
"grad_norm": NaN,
"learning_rate": 7.631968426245683e-05,
"loss": 0.0,
"step": 1010
},
{
"epoch": 0.7452054794520548,
"grad_norm": NaN,
"learning_rate": 7.607301430685743e-05,
"loss": 0.0,
"step": 1020
},
{
"epoch": 0.7525114155251141,
"grad_norm": NaN,
"learning_rate": 7.582634435125801e-05,
"loss": 0.0,
"step": 1030
},
{
"epoch": 0.7598173515981735,
"grad_norm": NaN,
"learning_rate": 7.557967439565862e-05,
"loss": 0.0,
"step": 1040
},
{
"epoch": 0.7671232876712328,
"grad_norm": NaN,
"learning_rate": 7.53330044400592e-05,
"loss": 0.0,
"step": 1050
},
{
"epoch": 0.7671232876712328,
"eval_loss": NaN,
"eval_runtime": 89.5436,
"eval_samples_per_second": 118.043,
"eval_steps_per_second": 7.382,
"step": 1050
},
{
"epoch": 0.7744292237442922,
"grad_norm": NaN,
"learning_rate": 7.50863344844598e-05,
"loss": 0.0,
"step": 1060
},
{
"epoch": 0.7817351598173516,
"grad_norm": NaN,
"learning_rate": 7.483966452886039e-05,
"loss": 0.0,
"step": 1070
},
{
"epoch": 0.7890410958904109,
"grad_norm": NaN,
"learning_rate": 7.459299457326098e-05,
"loss": 0.0,
"step": 1080
},
{
"epoch": 0.7963470319634703,
"grad_norm": NaN,
"learning_rate": 7.434632461766156e-05,
"loss": 0.0,
"step": 1090
},
{
"epoch": 0.8036529680365296,
"grad_norm": NaN,
"learning_rate": 7.409965466206217e-05,
"loss": 0.0,
"step": 1100
},
{
"epoch": 0.8036529680365296,
"eval_loss": NaN,
"eval_runtime": 87.5031,
"eval_samples_per_second": 120.796,
"eval_steps_per_second": 7.554,
"step": 1100
},
{
"epoch": 0.810958904109589,
"grad_norm": NaN,
"learning_rate": 7.385298470646276e-05,
"loss": 0.0,
"step": 1110
},
{
"epoch": 0.8182648401826484,
"grad_norm": NaN,
"learning_rate": 7.360631475086335e-05,
"loss": 0.0,
"step": 1120
},
{
"epoch": 0.8255707762557077,
"grad_norm": NaN,
"learning_rate": 7.335964479526394e-05,
"loss": 0.0,
"step": 1130
},
{
"epoch": 0.8328767123287671,
"grad_norm": NaN,
"learning_rate": 7.311297483966453e-05,
"loss": 0.0,
"step": 1140
},
{
"epoch": 0.8401826484018264,
"grad_norm": NaN,
"learning_rate": 7.286630488406513e-05,
"loss": 0.0,
"step": 1150
},
{
"epoch": 0.8401826484018264,
"eval_loss": NaN,
"eval_runtime": 87.5455,
"eval_samples_per_second": 120.737,
"eval_steps_per_second": 7.55,
"step": 1150
},
{
"epoch": 0.8474885844748858,
"grad_norm": NaN,
"learning_rate": 7.261963492846572e-05,
"loss": 0.0,
"step": 1160
},
{
"epoch": 0.8547945205479452,
"grad_norm": NaN,
"learning_rate": 7.23729649728663e-05,
"loss": 0.0,
"step": 1170
},
{
"epoch": 0.8621004566210045,
"grad_norm": NaN,
"learning_rate": 7.21262950172669e-05,
"loss": 0.0,
"step": 1180
},
{
"epoch": 0.869406392694064,
"grad_norm": NaN,
"learning_rate": 7.187962506166749e-05,
"loss": 0.0,
"step": 1190
},
{
"epoch": 0.8767123287671232,
"grad_norm": NaN,
"learning_rate": 7.16329551060681e-05,
"loss": 0.0,
"step": 1200
},
{
"epoch": 0.8767123287671232,
"eval_loss": NaN,
"eval_runtime": 87.4965,
"eval_samples_per_second": 120.805,
"eval_steps_per_second": 7.555,
"step": 1200
},
{
"epoch": 0.8840182648401826,
"grad_norm": NaN,
"learning_rate": 7.138628515046868e-05,
"loss": 0.0,
"step": 1210
},
{
"epoch": 0.891324200913242,
"grad_norm": NaN,
"learning_rate": 7.113961519486927e-05,
"loss": 0.0,
"step": 1220
},
{
"epoch": 0.8986301369863013,
"grad_norm": NaN,
"learning_rate": 7.089294523926986e-05,
"loss": 0.0,
"step": 1230
},
{
"epoch": 0.9059360730593607,
"grad_norm": NaN,
"learning_rate": 7.064627528367045e-05,
"loss": 0.0,
"step": 1240
},
{
"epoch": 0.91324200913242,
"grad_norm": NaN,
"learning_rate": 7.039960532807104e-05,
"loss": 0.0,
"step": 1250
},
{
"epoch": 0.91324200913242,
"eval_loss": NaN,
"eval_runtime": 87.5268,
"eval_samples_per_second": 120.763,
"eval_steps_per_second": 7.552,
"step": 1250
},
{
"epoch": 0.9205479452054794,
"grad_norm": NaN,
"learning_rate": 7.015293537247165e-05,
"loss": 0.0,
"step": 1260
},
{
"epoch": 0.9278538812785389,
"grad_norm": NaN,
"learning_rate": 6.990626541687223e-05,
"loss": 0.0,
"step": 1270
},
{
"epoch": 0.9351598173515981,
"grad_norm": NaN,
"learning_rate": 6.965959546127282e-05,
"loss": 0.0,
"step": 1280
},
{
"epoch": 0.9424657534246575,
"grad_norm": NaN,
"learning_rate": 6.941292550567341e-05,
"loss": 0.0,
"step": 1290
},
{
"epoch": 0.9497716894977168,
"grad_norm": NaN,
"learning_rate": 6.9166255550074e-05,
"loss": 0.0,
"step": 1300
},
{
"epoch": 0.9497716894977168,
"eval_loss": NaN,
"eval_runtime": 89.7784,
"eval_samples_per_second": 117.734,
"eval_steps_per_second": 7.363,
"step": 1300
},
{
"epoch": 0.9570776255707762,
"grad_norm": NaN,
"learning_rate": 6.89195855944746e-05,
"loss": 0.0,
"step": 1310
},
{
"epoch": 0.9643835616438357,
"grad_norm": NaN,
"learning_rate": 6.86729156388752e-05,
"loss": 0.0,
"step": 1320
},
{
"epoch": 0.971689497716895,
"grad_norm": NaN,
"learning_rate": 6.842624568327578e-05,
"loss": 0.0,
"step": 1330
},
{
"epoch": 0.9789954337899544,
"grad_norm": NaN,
"learning_rate": 6.817957572767637e-05,
"loss": 0.0,
"step": 1340
},
{
"epoch": 0.9863013698630136,
"grad_norm": NaN,
"learning_rate": 6.793290577207696e-05,
"loss": 0.0,
"step": 1350
},
{
"epoch": 0.9863013698630136,
"eval_loss": NaN,
"eval_runtime": 89.7601,
"eval_samples_per_second": 117.758,
"eval_steps_per_second": 7.364,
"step": 1350
},
{
"epoch": 0.993607305936073,
"grad_norm": NaN,
"learning_rate": 6.768623581647757e-05,
"loss": 0.0,
"step": 1360
},
{
"epoch": 1.0007305936073059,
"grad_norm": NaN,
"learning_rate": 6.743956586087815e-05,
"loss": 0.0,
"step": 1370
},
{
"epoch": 1.0080365296803653,
"grad_norm": NaN,
"learning_rate": 6.719289590527875e-05,
"loss": 0.0,
"step": 1380
},
{
"epoch": 1.0153424657534247,
"grad_norm": NaN,
"learning_rate": 6.694622594967933e-05,
"loss": 0.0,
"step": 1390
},
{
"epoch": 1.022648401826484,
"grad_norm": NaN,
"learning_rate": 6.669955599407992e-05,
"loss": 0.0,
"step": 1400
},
{
"epoch": 1.022648401826484,
"eval_loss": NaN,
"eval_runtime": 87.4612,
"eval_samples_per_second": 120.854,
"eval_steps_per_second": 7.558,
"step": 1400
},
{
"epoch": 1.0299543378995433,
"grad_norm": NaN,
"learning_rate": 6.645288603848051e-05,
"loss": 0.0,
"step": 1410
},
{
"epoch": 1.0372602739726027,
"grad_norm": NaN,
"learning_rate": 6.620621608288112e-05,
"loss": 0.0,
"step": 1420
},
{
"epoch": 1.044566210045662,
"grad_norm": NaN,
"learning_rate": 6.59595461272817e-05,
"loss": 0.0,
"step": 1430
},
{
"epoch": 1.0518721461187215,
"grad_norm": NaN,
"learning_rate": 6.57128761716823e-05,
"loss": 0.0,
"step": 1440
},
{
"epoch": 1.059178082191781,
"grad_norm": NaN,
"learning_rate": 6.546620621608288e-05,
"loss": 0.0,
"step": 1450
},
{
"epoch": 1.059178082191781,
"eval_loss": NaN,
"eval_runtime": 87.6838,
"eval_samples_per_second": 120.547,
"eval_steps_per_second": 7.538,
"step": 1450
},
{
"epoch": 1.0664840182648403,
"grad_norm": NaN,
"learning_rate": 6.521953626048347e-05,
"loss": 0.0,
"step": 1460
},
{
"epoch": 1.0737899543378995,
"grad_norm": NaN,
"learning_rate": 6.497286630488407e-05,
"loss": 0.0,
"step": 1470
},
{
"epoch": 1.0810958904109589,
"grad_norm": NaN,
"learning_rate": 6.472619634928467e-05,
"loss": 0.0,
"step": 1480
},
{
"epoch": 1.0884018264840183,
"grad_norm": NaN,
"learning_rate": 6.447952639368525e-05,
"loss": 0.0,
"step": 1490
},
{
"epoch": 1.0957077625570777,
"grad_norm": NaN,
"learning_rate": 6.423285643808585e-05,
"loss": 0.0,
"step": 1500
},
{
"epoch": 1.0957077625570777,
"eval_loss": NaN,
"eval_runtime": 87.6114,
"eval_samples_per_second": 120.646,
"eval_steps_per_second": 7.545,
"step": 1500
},
{
"epoch": 1.103013698630137,
"grad_norm": NaN,
"learning_rate": 6.398618648248643e-05,
"loss": 0.0,
"step": 1510
},
{
"epoch": 1.1103196347031963,
"grad_norm": NaN,
"learning_rate": 6.373951652688704e-05,
"loss": 0.0,
"step": 1520
},
{
"epoch": 1.1176255707762557,
"grad_norm": NaN,
"learning_rate": 6.349284657128762e-05,
"loss": 0.0,
"step": 1530
},
{
"epoch": 1.124931506849315,
"grad_norm": NaN,
"learning_rate": 6.324617661568822e-05,
"loss": 0.0,
"step": 1540
},
{
"epoch": 1.1322374429223745,
"grad_norm": NaN,
"learning_rate": 6.29995066600888e-05,
"loss": 0.0,
"step": 1550
},
{
"epoch": 1.1322374429223745,
"eval_loss": NaN,
"eval_runtime": 88.5181,
"eval_samples_per_second": 119.411,
"eval_steps_per_second": 7.467,
"step": 1550
},
{
"epoch": 1.139543378995434,
"grad_norm": NaN,
"learning_rate": 6.27528367044894e-05,
"loss": 0.0,
"step": 1560
},
{
"epoch": 1.146849315068493,
"grad_norm": NaN,
"learning_rate": 6.250616674888998e-05,
"loss": 0.0,
"step": 1570
},
{
"epoch": 1.1541552511415525,
"grad_norm": NaN,
"learning_rate": 6.225949679329059e-05,
"loss": 0.0,
"step": 1580
},
{
"epoch": 1.161461187214612,
"grad_norm": NaN,
"learning_rate": 6.201282683769117e-05,
"loss": 0.0,
"step": 1590
},
{
"epoch": 1.1687671232876713,
"grad_norm": NaN,
"learning_rate": 6.176615688209177e-05,
"loss": 0.0,
"step": 1600
},
{
"epoch": 1.1687671232876713,
"eval_loss": NaN,
"eval_runtime": 89.8677,
"eval_samples_per_second": 117.617,
"eval_steps_per_second": 7.355,
"step": 1600
},
{
"epoch": 1.1760730593607307,
"grad_norm": NaN,
"learning_rate": 6.151948692649235e-05,
"loss": 0.0,
"step": 1610
},
{
"epoch": 1.1833789954337899,
"grad_norm": NaN,
"learning_rate": 6.127281697089295e-05,
"loss": 0.0,
"step": 1620
},
{
"epoch": 1.1906849315068493,
"grad_norm": NaN,
"learning_rate": 6.102614701529354e-05,
"loss": 0.0,
"step": 1630
},
{
"epoch": 1.1979908675799087,
"grad_norm": NaN,
"learning_rate": 6.077947705969413e-05,
"loss": 0.0,
"step": 1640
},
{
"epoch": 1.205296803652968,
"grad_norm": NaN,
"learning_rate": 6.053280710409472e-05,
"loss": 0.0,
"step": 1650
},
{
"epoch": 1.205296803652968,
"eval_loss": NaN,
"eval_runtime": 90.1315,
"eval_samples_per_second": 117.273,
"eval_steps_per_second": 7.334,
"step": 1650
},
{
"epoch": 1.2126027397260275,
"grad_norm": NaN,
"learning_rate": 6.028613714849531e-05,
"loss": 0.0,
"step": 1660
},
{
"epoch": 1.2199086757990867,
"grad_norm": NaN,
"learning_rate": 6.003946719289591e-05,
"loss": 0.0,
"step": 1670
},
{
"epoch": 1.227214611872146,
"grad_norm": NaN,
"learning_rate": 5.9792797237296503e-05,
"loss": 0.0,
"step": 1680
},
{
"epoch": 1.2345205479452055,
"grad_norm": NaN,
"learning_rate": 5.954612728169709e-05,
"loss": 0.0,
"step": 1690
},
{
"epoch": 1.241826484018265,
"grad_norm": NaN,
"learning_rate": 5.929945732609768e-05,
"loss": 0.0,
"step": 1700
},
{
"epoch": 1.241826484018265,
"eval_loss": NaN,
"eval_runtime": 87.7312,
"eval_samples_per_second": 120.482,
"eval_steps_per_second": 7.534,
"step": 1700
},
{
"epoch": 1.2491324200913243,
"grad_norm": NaN,
"learning_rate": 5.905278737049827e-05,
"loss": 0.0,
"step": 1710
},
{
"epoch": 1.2564383561643835,
"grad_norm": NaN,
"learning_rate": 5.880611741489887e-05,
"loss": 0.0,
"step": 1720
},
{
"epoch": 1.263744292237443,
"grad_norm": NaN,
"learning_rate": 5.855944745929946e-05,
"loss": 0.0,
"step": 1730
},
{
"epoch": 1.2710502283105023,
"grad_norm": NaN,
"learning_rate": 5.8312777503700054e-05,
"loss": 0.0,
"step": 1740
},
{
"epoch": 1.2783561643835617,
"grad_norm": NaN,
"learning_rate": 5.806610754810064e-05,
"loss": 0.0,
"step": 1750
},
{
"epoch": 1.2783561643835617,
"eval_loss": NaN,
"eval_runtime": 87.6857,
"eval_samples_per_second": 120.544,
"eval_steps_per_second": 7.538,
"step": 1750
},
{
"epoch": 1.285662100456621,
"grad_norm": NaN,
"learning_rate": 5.781943759250123e-05,
"loss": 0.0,
"step": 1760
},
{
"epoch": 1.2929680365296803,
"grad_norm": NaN,
"learning_rate": 5.757276763690183e-05,
"loss": 0.0,
"step": 1770
},
{
"epoch": 1.3002739726027397,
"grad_norm": NaN,
"learning_rate": 5.732609768130242e-05,
"loss": 0.0,
"step": 1780
},
{
"epoch": 1.307579908675799,
"grad_norm": NaN,
"learning_rate": 5.7079427725703014e-05,
"loss": 0.0,
"step": 1790
},
{
"epoch": 1.3148858447488585,
"grad_norm": NaN,
"learning_rate": 5.6832757770103604e-05,
"loss": 0.0,
"step": 1800
},
{
"epoch": 1.3148858447488585,
"eval_loss": NaN,
"eval_runtime": 87.7498,
"eval_samples_per_second": 120.456,
"eval_steps_per_second": 7.533,
"step": 1800
},
{
"epoch": 1.322191780821918,
"grad_norm": NaN,
"learning_rate": 5.658608781450419e-05,
"loss": 0.0,
"step": 1810
},
{
"epoch": 1.329497716894977,
"grad_norm": NaN,
"learning_rate": 5.633941785890479e-05,
"loss": 0.0,
"step": 1820
},
{
"epoch": 1.3368036529680365,
"grad_norm": NaN,
"learning_rate": 5.609274790330538e-05,
"loss": 0.0,
"step": 1830
},
{
"epoch": 1.344109589041096,
"grad_norm": NaN,
"learning_rate": 5.5846077947705975e-05,
"loss": 0.0,
"step": 1840
},
{
"epoch": 1.3514155251141553,
"grad_norm": NaN,
"learning_rate": 5.5599407992106565e-05,
"loss": 0.0,
"step": 1850
},
{
"epoch": 1.3514155251141553,
"eval_loss": NaN,
"eval_runtime": 87.7348,
"eval_samples_per_second": 120.477,
"eval_steps_per_second": 7.534,
"step": 1850
},
{
"epoch": 1.3587214611872147,
"grad_norm": NaN,
"learning_rate": 5.5352738036507154e-05,
"loss": 0.0,
"step": 1860
},
{
"epoch": 1.366027397260274,
"grad_norm": NaN,
"learning_rate": 5.5106068080907743e-05,
"loss": 0.0,
"step": 1870
},
{
"epoch": 1.3733333333333333,
"grad_norm": NaN,
"learning_rate": 5.485939812530834e-05,
"loss": 0.0,
"step": 1880
},
{
"epoch": 1.3806392694063927,
"grad_norm": NaN,
"learning_rate": 5.461272816970893e-05,
"loss": 0.0,
"step": 1890
},
{
"epoch": 1.387945205479452,
"grad_norm": NaN,
"learning_rate": 5.4366058214109525e-05,
"loss": 0.0,
"step": 1900
},
{
"epoch": 1.387945205479452,
"eval_loss": NaN,
"eval_runtime": 87.626,
"eval_samples_per_second": 120.626,
"eval_steps_per_second": 7.543,
"step": 1900
},
{
"epoch": 1.3952511415525115,
"grad_norm": NaN,
"learning_rate": 5.4119388258510115e-05,
"loss": 0.0,
"step": 1910
},
{
"epoch": 1.4025570776255707,
"grad_norm": NaN,
"learning_rate": 5.3872718302910704e-05,
"loss": 0.0,
"step": 1920
},
{
"epoch": 1.40986301369863,
"grad_norm": NaN,
"learning_rate": 5.36260483473113e-05,
"loss": 0.0,
"step": 1930
},
{
"epoch": 1.4171689497716895,
"grad_norm": NaN,
"learning_rate": 5.337937839171189e-05,
"loss": 0.0,
"step": 1940
},
{
"epoch": 1.424474885844749,
"grad_norm": NaN,
"learning_rate": 5.3132708436112486e-05,
"loss": 0.0,
"step": 1950
},
{
"epoch": 1.424474885844749,
"eval_loss": NaN,
"eval_runtime": 89.8105,
"eval_samples_per_second": 117.692,
"eval_steps_per_second": 7.36,
"step": 1950
},
{
"epoch": 1.4317808219178083,
"grad_norm": NaN,
"learning_rate": 5.2886038480513075e-05,
"loss": 0.0,
"step": 1960
},
{
"epoch": 1.4390867579908675,
"grad_norm": NaN,
"learning_rate": 5.2639368524913665e-05,
"loss": 0.0,
"step": 1970
},
{
"epoch": 1.446392694063927,
"grad_norm": NaN,
"learning_rate": 5.239269856931426e-05,
"loss": 0.0,
"step": 1980
},
{
"epoch": 1.4536986301369863,
"grad_norm": NaN,
"learning_rate": 5.214602861371485e-05,
"loss": 0.0,
"step": 1990
},
{
"epoch": 1.4610045662100457,
"grad_norm": NaN,
"learning_rate": 5.189935865811545e-05,
"loss": 0.0,
"step": 2000
},
{
"epoch": 1.4610045662100457,
"eval_loss": NaN,
"eval_runtime": 89.7513,
"eval_samples_per_second": 117.77,
"eval_steps_per_second": 7.365,
"step": 2000
},
{
"epoch": 1.4683105022831051,
"grad_norm": NaN,
"learning_rate": 5.1652688702516036e-05,
"loss": 0.0,
"step": 2010
},
{
"epoch": 1.4756164383561643,
"grad_norm": NaN,
"learning_rate": 5.1406018746916626e-05,
"loss": 0.0,
"step": 2020
},
{
"epoch": 1.4829223744292237,
"grad_norm": NaN,
"learning_rate": 5.115934879131722e-05,
"loss": 0.0,
"step": 2030
},
{
"epoch": 1.490228310502283,
"grad_norm": NaN,
"learning_rate": 5.091267883571781e-05,
"loss": 0.0,
"step": 2040
},
{
"epoch": 1.4975342465753425,
"grad_norm": NaN,
"learning_rate": 5.06660088801184e-05,
"loss": 0.0,
"step": 2050
},
{
"epoch": 1.4975342465753425,
"eval_loss": NaN,
"eval_runtime": 87.5277,
"eval_samples_per_second": 120.762,
"eval_steps_per_second": 7.552,
"step": 2050
},
{
"epoch": 1.504840182648402,
"grad_norm": NaN,
"learning_rate": 5.0419338924519e-05,
"loss": 0.0,
"step": 2060
},
{
"epoch": 1.512146118721461,
"grad_norm": NaN,
"learning_rate": 5.0172668968919586e-05,
"loss": 0.0,
"step": 2070
},
{
"epoch": 1.5194520547945205,
"grad_norm": NaN,
"learning_rate": 4.992599901332018e-05,
"loss": 0.0,
"step": 2080
},
{
"epoch": 1.52675799086758,
"grad_norm": NaN,
"learning_rate": 4.967932905772077e-05,
"loss": 0.0,
"step": 2090
},
{
"epoch": 1.5340639269406393,
"grad_norm": NaN,
"learning_rate": 4.943265910212136e-05,
"loss": 0.0,
"step": 2100
},
{
"epoch": 1.5340639269406393,
"eval_loss": NaN,
"eval_runtime": 87.4849,
"eval_samples_per_second": 120.821,
"eval_steps_per_second": 7.556,
"step": 2100
},
{
"epoch": 1.5413698630136987,
"grad_norm": NaN,
"learning_rate": 4.918598914652196e-05,
"loss": 0.0,
"step": 2110
},
{
"epoch": 1.548675799086758,
"grad_norm": NaN,
"learning_rate": 4.893931919092255e-05,
"loss": 0.0,
"step": 2120
},
{
"epoch": 1.5559817351598173,
"grad_norm": NaN,
"learning_rate": 4.869264923532314e-05,
"loss": 0.0,
"step": 2130
},
{
"epoch": 1.5632876712328767,
"grad_norm": NaN,
"learning_rate": 4.844597927972373e-05,
"loss": 0.0,
"step": 2140
},
{
"epoch": 1.5705936073059361,
"grad_norm": NaN,
"learning_rate": 4.819930932412432e-05,
"loss": 0.0,
"step": 2150
},
{
"epoch": 1.5705936073059361,
"eval_loss": NaN,
"eval_runtime": 87.7089,
"eval_samples_per_second": 120.512,
"eval_steps_per_second": 7.536,
"step": 2150
},
{
"epoch": 1.5778995433789955,
"grad_norm": NaN,
"learning_rate": 4.795263936852492e-05,
"loss": 0.0,
"step": 2160
},
{
"epoch": 1.5852054794520547,
"grad_norm": NaN,
"learning_rate": 4.770596941292551e-05,
"loss": 0.0,
"step": 2170
},
{
"epoch": 1.592511415525114,
"grad_norm": NaN,
"learning_rate": 4.7459299457326104e-05,
"loss": 0.0,
"step": 2180
},
{
"epoch": 1.5998173515981735,
"grad_norm": NaN,
"learning_rate": 4.7212629501726694e-05,
"loss": 0.0,
"step": 2190
},
{
"epoch": 1.607123287671233,
"grad_norm": NaN,
"learning_rate": 4.696595954612728e-05,
"loss": 0.0,
"step": 2200
},
{
"epoch": 1.607123287671233,
"eval_loss": NaN,
"eval_runtime": 87.4106,
"eval_samples_per_second": 120.924,
"eval_steps_per_second": 7.562,
"step": 2200
},
{
"epoch": 1.6144292237442923,
"grad_norm": NaN,
"learning_rate": 4.671928959052788e-05,
"loss": 0.0,
"step": 2210
},
{
"epoch": 1.6217351598173515,
"grad_norm": NaN,
"learning_rate": 4.647261963492847e-05,
"loss": 0.0,
"step": 2220
},
{
"epoch": 1.629041095890411,
"grad_norm": NaN,
"learning_rate": 4.622594967932906e-05,
"loss": 0.0,
"step": 2230
},
{
"epoch": 1.6363470319634703,
"grad_norm": NaN,
"learning_rate": 4.5979279723729654e-05,
"loss": 0.0,
"step": 2240
},
{
"epoch": 1.6436529680365297,
"grad_norm": NaN,
"learning_rate": 4.5732609768130244e-05,
"loss": 0.0,
"step": 2250
},
{
"epoch": 1.6436529680365297,
"eval_loss": NaN,
"eval_runtime": 87.6352,
"eval_samples_per_second": 120.614,
"eval_steps_per_second": 7.543,
"step": 2250
},
{
"epoch": 1.6509589041095891,
"grad_norm": NaN,
"learning_rate": 4.548593981253084e-05,
"loss": 0.0,
"step": 2260
},
{
"epoch": 1.6582648401826483,
"grad_norm": NaN,
"learning_rate": 4.523926985693143e-05,
"loss": 0.0,
"step": 2270
},
{
"epoch": 1.6655707762557077,
"grad_norm": NaN,
"learning_rate": 4.499259990133202e-05,
"loss": 0.0,
"step": 2280
},
{
"epoch": 1.6728767123287671,
"grad_norm": NaN,
"learning_rate": 4.4745929945732615e-05,
"loss": 0.0,
"step": 2290
},
{
"epoch": 1.6801826484018265,
"grad_norm": NaN,
"learning_rate": 4.4499259990133204e-05,
"loss": 0.0,
"step": 2300
},
{
"epoch": 1.6801826484018265,
"eval_loss": NaN,
"eval_runtime": 89.8795,
"eval_samples_per_second": 117.602,
"eval_steps_per_second": 7.354,
"step": 2300
},
{
"epoch": 1.687488584474886,
"grad_norm": NaN,
"learning_rate": 4.42525900345338e-05,
"loss": 0.0,
"step": 2310
},
{
"epoch": 1.694794520547945,
"grad_norm": NaN,
"learning_rate": 4.400592007893439e-05,
"loss": 0.0,
"step": 2320
},
{
"epoch": 1.7021004566210047,
"grad_norm": NaN,
"learning_rate": 4.375925012333498e-05,
"loss": 0.0,
"step": 2330
},
{
"epoch": 1.709406392694064,
"grad_norm": NaN,
"learning_rate": 4.3512580167735576e-05,
"loss": 0.0,
"step": 2340
},
{
"epoch": 1.7167123287671233,
"grad_norm": NaN,
"learning_rate": 4.3265910212136165e-05,
"loss": 0.0,
"step": 2350
},
{
"epoch": 1.7167123287671233,
"eval_loss": NaN,
"eval_runtime": 89.9633,
"eval_samples_per_second": 117.492,
"eval_steps_per_second": 7.347,
"step": 2350
},
{
"epoch": 1.7240182648401827,
"grad_norm": NaN,
"learning_rate": 4.3019240256536755e-05,
"loss": 0.0,
"step": 2360
},
{
"epoch": 1.731324200913242,
"grad_norm": NaN,
"learning_rate": 4.277257030093735e-05,
"loss": 0.0,
"step": 2370
},
{
"epoch": 1.7386301369863015,
"grad_norm": NaN,
"learning_rate": 4.252590034533794e-05,
"loss": 0.0,
"step": 2380
},
{
"epoch": 1.7459360730593607,
"grad_norm": NaN,
"learning_rate": 4.2279230389738537e-05,
"loss": 0.0,
"step": 2390
},
{
"epoch": 1.7532420091324201,
"grad_norm": NaN,
"learning_rate": 4.2032560434139126e-05,
"loss": 0.0,
"step": 2400
},
{
"epoch": 1.7532420091324201,
"eval_loss": NaN,
"eval_runtime": 89.7987,
"eval_samples_per_second": 117.708,
"eval_steps_per_second": 7.361,
"step": 2400
},
{
"epoch": 1.7605479452054795,
"grad_norm": NaN,
"learning_rate": 4.1785890478539715e-05,
"loss": 0.0,
"step": 2410
},
{
"epoch": 1.7678538812785387,
"grad_norm": NaN,
"learning_rate": 4.153922052294031e-05,
"loss": 0.0,
"step": 2420
},
{
"epoch": 1.7751598173515983,
"grad_norm": NaN,
"learning_rate": 4.12925505673409e-05,
"loss": 0.0,
"step": 2430
},
{
"epoch": 1.7824657534246575,
"grad_norm": NaN,
"learning_rate": 4.10458806117415e-05,
"loss": 0.0,
"step": 2440
},
{
"epoch": 1.789771689497717,
"grad_norm": NaN,
"learning_rate": 4.079921065614209e-05,
"loss": 0.0,
"step": 2450
},
{
"epoch": 1.789771689497717,
"eval_loss": NaN,
"eval_runtime": 87.6424,
"eval_samples_per_second": 120.604,
"eval_steps_per_second": 7.542,
"step": 2450
},
{
"epoch": 1.7970776255707763,
"grad_norm": NaN,
"learning_rate": 4.0552540700542676e-05,
"loss": 0.0,
"step": 2460
},
{
"epoch": 1.8043835616438355,
"grad_norm": NaN,
"learning_rate": 4.030587074494327e-05,
"loss": 0.0,
"step": 2470
},
{
"epoch": 1.8116894977168951,
"grad_norm": NaN,
"learning_rate": 4.005920078934386e-05,
"loss": 0.0,
"step": 2480
},
{
"epoch": 1.8189954337899543,
"grad_norm": NaN,
"learning_rate": 3.981253083374445e-05,
"loss": 0.0,
"step": 2490
},
{
"epoch": 1.8263013698630137,
"grad_norm": NaN,
"learning_rate": 3.956586087814505e-05,
"loss": 0.0,
"step": 2500
},
{
"epoch": 1.8263013698630137,
"eval_loss": NaN,
"eval_runtime": 87.5994,
"eval_samples_per_second": 120.663,
"eval_steps_per_second": 7.546,
"step": 2500
},
{
"epoch": 1.8336073059360731,
"grad_norm": NaN,
"learning_rate": 3.931919092254564e-05,
"loss": 0.0,
"step": 2510
},
{
"epoch": 1.8409132420091323,
"grad_norm": NaN,
"learning_rate": 3.907252096694623e-05,
"loss": 0.0,
"step": 2520
},
{
"epoch": 1.848219178082192,
"grad_norm": NaN,
"learning_rate": 3.882585101134682e-05,
"loss": 0.0,
"step": 2530
},
{
"epoch": 1.8555251141552511,
"grad_norm": NaN,
"learning_rate": 3.857918105574741e-05,
"loss": 0.0,
"step": 2540
},
{
"epoch": 1.8628310502283105,
"grad_norm": NaN,
"learning_rate": 3.833251110014801e-05,
"loss": 0.0,
"step": 2550
},
{
"epoch": 1.8628310502283105,
"eval_loss": NaN,
"eval_runtime": 87.4443,
"eval_samples_per_second": 120.877,
"eval_steps_per_second": 7.559,
"step": 2550
},
{
"epoch": 1.87013698630137,
"grad_norm": NaN,
"learning_rate": 3.80858411445486e-05,
"loss": 0.0,
"step": 2560
},
{
"epoch": 1.8774429223744291,
"grad_norm": NaN,
"learning_rate": 3.783917118894919e-05,
"loss": 0.0,
"step": 2570
},
{
"epoch": 1.8847488584474887,
"grad_norm": NaN,
"learning_rate": 3.759250123334978e-05,
"loss": 0.0,
"step": 2580
},
{
"epoch": 1.892054794520548,
"grad_norm": NaN,
"learning_rate": 3.734583127775037e-05,
"loss": 0.0,
"step": 2590
},
{
"epoch": 1.8993607305936073,
"grad_norm": NaN,
"learning_rate": 3.709916132215097e-05,
"loss": 0.0,
"step": 2600
},
{
"epoch": 1.8993607305936073,
"eval_loss": NaN,
"eval_runtime": 87.7012,
"eval_samples_per_second": 120.523,
"eval_steps_per_second": 7.537,
"step": 2600
},
{
"epoch": 1.9066666666666667,
"grad_norm": NaN,
"learning_rate": 3.685249136655156e-05,
"loss": 0.0,
"step": 2610
},
{
"epoch": 1.913972602739726,
"grad_norm": NaN,
"learning_rate": 3.660582141095215e-05,
"loss": 0.0,
"step": 2620
},
{
"epoch": 1.9212785388127855,
"grad_norm": NaN,
"learning_rate": 3.6359151455352744e-05,
"loss": 0.0,
"step": 2630
},
{
"epoch": 1.9285844748858447,
"grad_norm": NaN,
"learning_rate": 3.6112481499753333e-05,
"loss": 0.0,
"step": 2640
},
{
"epoch": 1.9358904109589041,
"grad_norm": NaN,
"learning_rate": 3.586581154415392e-05,
"loss": 0.0,
"step": 2650
},
{
"epoch": 1.9358904109589041,
"eval_loss": NaN,
"eval_runtime": 87.5232,
"eval_samples_per_second": 120.768,
"eval_steps_per_second": 7.552,
"step": 2650
},
{
"epoch": 1.9431963470319635,
"grad_norm": NaN,
"learning_rate": 3.561914158855452e-05,
"loss": 0.0,
"step": 2660
},
{
"epoch": 1.9505022831050227,
"grad_norm": NaN,
"learning_rate": 3.537247163295511e-05,
"loss": 0.0,
"step": 2670
},
{
"epoch": 1.9578082191780823,
"grad_norm": NaN,
"learning_rate": 3.5125801677355705e-05,
"loss": 0.0,
"step": 2680
},
{
"epoch": 1.9651141552511415,
"grad_norm": NaN,
"learning_rate": 3.4879131721756294e-05,
"loss": 0.0,
"step": 2690
},
{
"epoch": 1.972420091324201,
"grad_norm": NaN,
"learning_rate": 3.4632461766156884e-05,
"loss": 0.0,
"step": 2700
},
{
"epoch": 1.972420091324201,
"eval_loss": NaN,
"eval_runtime": 87.6204,
"eval_samples_per_second": 120.634,
"eval_steps_per_second": 7.544,
"step": 2700
},
{
"epoch": 1.9797260273972603,
"grad_norm": NaN,
"learning_rate": 3.438579181055748e-05,
"loss": 0.0,
"step": 2710
},
{
"epoch": 1.9870319634703195,
"grad_norm": NaN,
"learning_rate": 3.413912185495807e-05,
"loss": 0.0,
"step": 2720
},
{
"epoch": 1.9943378995433791,
"grad_norm": NaN,
"learning_rate": 3.389245189935866e-05,
"loss": 0.0,
"step": 2730
},
{
"epoch": 2.0014611872146117,
"grad_norm": NaN,
"learning_rate": 3.3645781943759255e-05,
"loss": 0.0,
"step": 2740
},
{
"epoch": 2.0087671232876714,
"grad_norm": NaN,
"learning_rate": 3.3399111988159844e-05,
"loss": 0.0,
"step": 2750
},
{
"epoch": 2.0087671232876714,
"eval_loss": NaN,
"eval_runtime": 89.786,
"eval_samples_per_second": 117.724,
"eval_steps_per_second": 7.362,
"step": 2750
},
{
"epoch": 2.0160730593607306,
"grad_norm": NaN,
"learning_rate": 3.315244203256044e-05,
"loss": 0.0,
"step": 2760
},
{
"epoch": 2.0233789954337897,
"grad_norm": NaN,
"learning_rate": 3.290577207696103e-05,
"loss": 0.0,
"step": 2770
},
{
"epoch": 2.0306849315068494,
"grad_norm": NaN,
"learning_rate": 3.265910212136162e-05,
"loss": 0.0,
"step": 2780
},
{
"epoch": 2.0379908675799085,
"grad_norm": NaN,
"learning_rate": 3.2412432165762216e-05,
"loss": 0.0,
"step": 2790
},
{
"epoch": 2.045296803652968,
"grad_norm": NaN,
"learning_rate": 3.2165762210162805e-05,
"loss": 0.0,
"step": 2800
},
{
"epoch": 2.045296803652968,
"eval_loss": NaN,
"eval_runtime": 89.6927,
"eval_samples_per_second": 117.847,
"eval_steps_per_second": 7.37,
"step": 2800
},
{
"epoch": 2.0526027397260274,
"grad_norm": NaN,
"learning_rate": 3.1919092254563395e-05,
"loss": 0.0,
"step": 2810
},
{
"epoch": 2.0599086757990865,
"grad_norm": NaN,
"learning_rate": 3.167242229896399e-05,
"loss": 0.0,
"step": 2820
},
{
"epoch": 2.067214611872146,
"grad_norm": NaN,
"learning_rate": 3.142575234336458e-05,
"loss": 0.0,
"step": 2830
},
{
"epoch": 2.0745205479452054,
"grad_norm": NaN,
"learning_rate": 3.1179082387765176e-05,
"loss": 0.0,
"step": 2840
},
{
"epoch": 2.081826484018265,
"grad_norm": NaN,
"learning_rate": 3.0932412432165766e-05,
"loss": 0.0,
"step": 2850
},
{
"epoch": 2.081826484018265,
"eval_loss": NaN,
"eval_runtime": 89.0102,
"eval_samples_per_second": 118.75,
"eval_steps_per_second": 7.426,
"step": 2850
},
{
"epoch": 2.089132420091324,
"grad_norm": NaN,
"learning_rate": 3.0685742476566355e-05,
"loss": 0.0,
"step": 2860
},
{
"epoch": 2.0964383561643833,
"grad_norm": NaN,
"learning_rate": 3.0439072520966948e-05,
"loss": 0.0,
"step": 2870
},
{
"epoch": 2.103744292237443,
"grad_norm": NaN,
"learning_rate": 3.0192402565367538e-05,
"loss": 0.0,
"step": 2880
},
{
"epoch": 2.111050228310502,
"grad_norm": NaN,
"learning_rate": 2.994573260976813e-05,
"loss": 0.0,
"step": 2890
},
{
"epoch": 2.118356164383562,
"grad_norm": NaN,
"learning_rate": 2.9699062654168723e-05,
"loss": 0.0,
"step": 2900
},
{
"epoch": 2.118356164383562,
"eval_loss": NaN,
"eval_runtime": 87.4632,
"eval_samples_per_second": 120.851,
"eval_steps_per_second": 7.557,
"step": 2900
},
{
"epoch": 2.125662100456621,
"grad_norm": NaN,
"learning_rate": 2.9452392698569313e-05,
"loss": 0.0,
"step": 2910
},
{
"epoch": 2.1329680365296806,
"grad_norm": NaN,
"learning_rate": 2.920572274296991e-05,
"loss": 0.0,
"step": 2920
},
{
"epoch": 2.1402739726027398,
"grad_norm": NaN,
"learning_rate": 2.89590527873705e-05,
"loss": 0.0,
"step": 2930
},
{
"epoch": 2.147579908675799,
"grad_norm": NaN,
"learning_rate": 2.871238283177109e-05,
"loss": 0.0,
"step": 2940
},
{
"epoch": 2.1548858447488586,
"grad_norm": NaN,
"learning_rate": 2.8465712876171684e-05,
"loss": 0.0,
"step": 2950
},
{
"epoch": 2.1548858447488586,
"eval_loss": NaN,
"eval_runtime": 87.3497,
"eval_samples_per_second": 121.008,
"eval_steps_per_second": 7.567,
"step": 2950
},
{
"epoch": 2.1621917808219178,
"grad_norm": NaN,
"learning_rate": 2.8219042920572273e-05,
"loss": 0.0,
"step": 2960
},
{
"epoch": 2.169497716894977,
"grad_norm": NaN,
"learning_rate": 2.7972372964972866e-05,
"loss": 0.0,
"step": 2970
},
{
"epoch": 2.1768036529680366,
"grad_norm": NaN,
"learning_rate": 2.772570300937346e-05,
"loss": 0.0,
"step": 2980
},
{
"epoch": 2.1841095890410958,
"grad_norm": NaN,
"learning_rate": 2.7479033053774052e-05,
"loss": 0.0,
"step": 2990
},
{
"epoch": 2.1914155251141554,
"grad_norm": NaN,
"learning_rate": 2.7232363098174645e-05,
"loss": 0.0,
"step": 3000
},
{
"epoch": 2.1914155251141554,
"eval_loss": NaN,
"eval_runtime": 87.4657,
"eval_samples_per_second": 120.847,
"eval_steps_per_second": 7.557,
"step": 3000
},
{
"epoch": 2.1987214611872146,
"grad_norm": NaN,
"learning_rate": 2.6985693142575234e-05,
"loss": 0.0,
"step": 3010
},
{
"epoch": 2.206027397260274,
"grad_norm": NaN,
"learning_rate": 2.6739023186975827e-05,
"loss": 0.0,
"step": 3020
},
{
"epoch": 2.2133333333333334,
"grad_norm": NaN,
"learning_rate": 2.649235323137642e-05,
"loss": 0.0,
"step": 3030
},
{
"epoch": 2.2206392694063926,
"grad_norm": NaN,
"learning_rate": 2.6245683275777013e-05,
"loss": 0.0,
"step": 3040
},
{
"epoch": 2.227945205479452,
"grad_norm": NaN,
"learning_rate": 2.5999013320177602e-05,
"loss": 0.0,
"step": 3050
},
{
"epoch": 2.227945205479452,
"eval_loss": NaN,
"eval_runtime": 87.5429,
"eval_samples_per_second": 120.741,
"eval_steps_per_second": 7.551,
"step": 3050
},
{
"epoch": 2.2352511415525114,
"grad_norm": NaN,
"learning_rate": 2.5752343364578195e-05,
"loss": 0.0,
"step": 3060
},
{
"epoch": 2.2425570776255705,
"grad_norm": NaN,
"learning_rate": 2.5505673408978788e-05,
"loss": 0.0,
"step": 3070
},
{
"epoch": 2.24986301369863,
"grad_norm": NaN,
"learning_rate": 2.525900345337938e-05,
"loss": 0.0,
"step": 3080
},
{
"epoch": 2.2571689497716894,
"grad_norm": NaN,
"learning_rate": 2.501233349777997e-05,
"loss": 0.0,
"step": 3090
},
{
"epoch": 2.264474885844749,
"grad_norm": NaN,
"learning_rate": 2.4765663542180563e-05,
"loss": 0.0,
"step": 3100
},
{
"epoch": 2.264474885844749,
"eval_loss": NaN,
"eval_runtime": 87.4603,
"eval_samples_per_second": 120.855,
"eval_steps_per_second": 7.558,
"step": 3100
},
{
"epoch": 2.271780821917808,
"grad_norm": NaN,
"learning_rate": 2.4518993586581156e-05,
"loss": 0.0,
"step": 3110
},
{
"epoch": 2.279086757990868,
"grad_norm": NaN,
"learning_rate": 2.427232363098175e-05,
"loss": 0.0,
"step": 3120
},
{
"epoch": 2.286392694063927,
"grad_norm": NaN,
"learning_rate": 2.402565367538234e-05,
"loss": 0.0,
"step": 3130
},
{
"epoch": 2.293698630136986,
"grad_norm": NaN,
"learning_rate": 2.377898371978293e-05,
"loss": 0.0,
"step": 3140
},
{
"epoch": 2.301004566210046,
"grad_norm": NaN,
"learning_rate": 2.3532313764183524e-05,
"loss": 0.0,
"step": 3150
},
{
"epoch": 2.301004566210046,
"eval_loss": NaN,
"eval_runtime": 89.2945,
"eval_samples_per_second": 118.372,
"eval_steps_per_second": 7.402,
"step": 3150
},
{
"epoch": 2.308310502283105,
"grad_norm": NaN,
"learning_rate": 2.3285643808584116e-05,
"loss": 0.0,
"step": 3160
},
{
"epoch": 2.315616438356164,
"grad_norm": NaN,
"learning_rate": 2.303897385298471e-05,
"loss": 0.0,
"step": 3170
},
{
"epoch": 2.322922374429224,
"grad_norm": NaN,
"learning_rate": 2.27923038973853e-05,
"loss": 0.0,
"step": 3180
},
{
"epoch": 2.330228310502283,
"grad_norm": NaN,
"learning_rate": 2.254563394178589e-05,
"loss": 0.0,
"step": 3190
},
{
"epoch": 2.3375342465753426,
"grad_norm": NaN,
"learning_rate": 2.2298963986186484e-05,
"loss": 0.0,
"step": 3200
},
{
"epoch": 2.3375342465753426,
"eval_loss": NaN,
"eval_runtime": 89.6497,
"eval_samples_per_second": 117.903,
"eval_steps_per_second": 7.373,
"step": 3200
},
{
"epoch": 2.3448401826484018,
"grad_norm": NaN,
"learning_rate": 2.2052294030587077e-05,
"loss": 0.0,
"step": 3210
},
{
"epoch": 2.3521461187214614,
"grad_norm": NaN,
"learning_rate": 2.180562407498767e-05,
"loss": 0.0,
"step": 3220
},
{
"epoch": 2.3594520547945206,
"grad_norm": NaN,
"learning_rate": 2.155895411938826e-05,
"loss": 0.0,
"step": 3230
},
{
"epoch": 2.3667579908675798,
"grad_norm": NaN,
"learning_rate": 2.1312284163788852e-05,
"loss": 0.0,
"step": 3240
},
{
"epoch": 2.3740639269406394,
"grad_norm": NaN,
"learning_rate": 2.1065614208189445e-05,
"loss": 0.0,
"step": 3250
},
{
"epoch": 2.3740639269406394,
"eval_loss": NaN,
"eval_runtime": 89.7652,
"eval_samples_per_second": 117.752,
"eval_steps_per_second": 7.364,
"step": 3250
},
{
"epoch": 2.3813698630136986,
"grad_norm": NaN,
"learning_rate": 2.0818944252590038e-05,
"loss": 0.0,
"step": 3260
},
{
"epoch": 2.3886757990867578,
"grad_norm": NaN,
"learning_rate": 2.0572274296990627e-05,
"loss": 0.0,
"step": 3270
},
{
"epoch": 2.3959817351598174,
"grad_norm": NaN,
"learning_rate": 2.032560434139122e-05,
"loss": 0.0,
"step": 3280
},
{
"epoch": 2.4032876712328766,
"grad_norm": NaN,
"learning_rate": 2.0078934385791813e-05,
"loss": 0.0,
"step": 3290
},
{
"epoch": 2.410593607305936,
"grad_norm": NaN,
"learning_rate": 1.9832264430192406e-05,
"loss": 0.0,
"step": 3300
},
{
"epoch": 2.410593607305936,
"eval_loss": NaN,
"eval_runtime": 88.5138,
"eval_samples_per_second": 119.416,
"eval_steps_per_second": 7.468,
"step": 3300
},
{
"epoch": 2.4178995433789954,
"grad_norm": NaN,
"learning_rate": 1.9585594474592995e-05,
"loss": 0.0,
"step": 3310
},
{
"epoch": 2.425205479452055,
"grad_norm": NaN,
"learning_rate": 1.9338924518993588e-05,
"loss": 0.0,
"step": 3320
},
{
"epoch": 2.432511415525114,
"grad_norm": NaN,
"learning_rate": 1.909225456339418e-05,
"loss": 0.0,
"step": 3330
},
{
"epoch": 2.4398173515981734,
"grad_norm": NaN,
"learning_rate": 1.8845584607794774e-05,
"loss": 0.0,
"step": 3340
},
{
"epoch": 2.447123287671233,
"grad_norm": NaN,
"learning_rate": 1.8598914652195363e-05,
"loss": 0.0,
"step": 3350
},
{
"epoch": 2.447123287671233,
"eval_loss": NaN,
"eval_runtime": 87.334,
"eval_samples_per_second": 121.03,
"eval_steps_per_second": 7.569,
"step": 3350
},
{
"epoch": 2.454429223744292,
"grad_norm": NaN,
"learning_rate": 1.8352244696595956e-05,
"loss": 0.0,
"step": 3360
},
{
"epoch": 2.4617351598173514,
"grad_norm": NaN,
"learning_rate": 1.810557474099655e-05,
"loss": 0.0,
"step": 3370
},
{
"epoch": 2.469041095890411,
"grad_norm": NaN,
"learning_rate": 1.785890478539714e-05,
"loss": 0.0,
"step": 3380
},
{
"epoch": 2.47634703196347,
"grad_norm": NaN,
"learning_rate": 1.761223482979773e-05,
"loss": 0.0,
"step": 3390
},
{
"epoch": 2.48365296803653,
"grad_norm": NaN,
"learning_rate": 1.7365564874198324e-05,
"loss": 0.0,
"step": 3400
},
{
"epoch": 2.48365296803653,
"eval_loss": NaN,
"eval_runtime": 87.2333,
"eval_samples_per_second": 121.169,
"eval_steps_per_second": 7.577,
"step": 3400
},
{
"epoch": 2.490958904109589,
"grad_norm": NaN,
"learning_rate": 1.7118894918598917e-05,
"loss": 0.0,
"step": 3410
},
{
"epoch": 2.4982648401826486,
"grad_norm": NaN,
"learning_rate": 1.687222496299951e-05,
"loss": 0.0,
"step": 3420
},
{
"epoch": 2.505570776255708,
"grad_norm": NaN,
"learning_rate": 1.66255550074001e-05,
"loss": 0.0,
"step": 3430
},
{
"epoch": 2.512876712328767,
"grad_norm": NaN,
"learning_rate": 1.6378885051800692e-05,
"loss": 0.0,
"step": 3440
},
{
"epoch": 2.5201826484018266,
"grad_norm": NaN,
"learning_rate": 1.6132215096201285e-05,
"loss": 0.0,
"step": 3450
},
{
"epoch": 2.5201826484018266,
"eval_loss": NaN,
"eval_runtime": 87.3114,
"eval_samples_per_second": 121.061,
"eval_steps_per_second": 7.571,
"step": 3450
},
{
"epoch": 2.527488584474886,
"grad_norm": NaN,
"learning_rate": 1.5885545140601878e-05,
"loss": 0.0,
"step": 3460
},
{
"epoch": 2.534794520547945,
"grad_norm": NaN,
"learning_rate": 1.5638875185002467e-05,
"loss": 0.0,
"step": 3470
},
{
"epoch": 2.5421004566210046,
"grad_norm": NaN,
"learning_rate": 1.539220522940306e-05,
"loss": 0.0,
"step": 3480
},
{
"epoch": 2.5494063926940638,
"grad_norm": NaN,
"learning_rate": 1.5145535273803651e-05,
"loss": 0.0,
"step": 3490
},
{
"epoch": 2.5567123287671234,
"grad_norm": NaN,
"learning_rate": 1.4898865318204244e-05,
"loss": 0.0,
"step": 3500
},
{
"epoch": 2.5567123287671234,
"eval_loss": NaN,
"eval_runtime": 87.2842,
"eval_samples_per_second": 121.099,
"eval_steps_per_second": 7.573,
"step": 3500
},
{
"epoch": 2.5640182648401826,
"grad_norm": NaN,
"learning_rate": 1.4652195362604835e-05,
"loss": 0.0,
"step": 3510
},
{
"epoch": 2.571324200913242,
"grad_norm": NaN,
"learning_rate": 1.4405525407005426e-05,
"loss": 0.0,
"step": 3520
},
{
"epoch": 2.5786301369863014,
"grad_norm": NaN,
"learning_rate": 1.4158855451406019e-05,
"loss": 0.0,
"step": 3530
},
{
"epoch": 2.5859360730593606,
"grad_norm": NaN,
"learning_rate": 1.3912185495806612e-05,
"loss": 0.0,
"step": 3540
},
{
"epoch": 2.59324200913242,
"grad_norm": NaN,
"learning_rate": 1.3665515540207203e-05,
"loss": 0.0,
"step": 3550
},
{
"epoch": 2.59324200913242,
"eval_loss": NaN,
"eval_runtime": 87.1762,
"eval_samples_per_second": 121.249,
"eval_steps_per_second": 7.582,
"step": 3550
},
{
"epoch": 2.6005479452054794,
"grad_norm": NaN,
"learning_rate": 1.3418845584607796e-05,
"loss": 0.0,
"step": 3560
},
{
"epoch": 2.6078538812785386,
"grad_norm": NaN,
"learning_rate": 1.3172175629008387e-05,
"loss": 0.0,
"step": 3570
},
{
"epoch": 2.615159817351598,
"grad_norm": NaN,
"learning_rate": 1.292550567340898e-05,
"loss": 0.0,
"step": 3580
},
{
"epoch": 2.6224657534246574,
"grad_norm": NaN,
"learning_rate": 1.267883571780957e-05,
"loss": 0.0,
"step": 3590
},
{
"epoch": 2.629771689497717,
"grad_norm": NaN,
"learning_rate": 1.2432165762210164e-05,
"loss": 0.0,
"step": 3600
},
{
"epoch": 2.629771689497717,
"eval_loss": NaN,
"eval_runtime": 89.0981,
"eval_samples_per_second": 118.633,
"eval_steps_per_second": 7.419,
"step": 3600
},
{
"epoch": 2.637077625570776,
"grad_norm": NaN,
"learning_rate": 1.2185495806610755e-05,
"loss": 0.0,
"step": 3610
},
{
"epoch": 2.644383561643836,
"grad_norm": NaN,
"learning_rate": 1.1938825851011348e-05,
"loss": 0.0,
"step": 3620
},
{
"epoch": 2.651689497716895,
"grad_norm": NaN,
"learning_rate": 1.169215589541194e-05,
"loss": 0.0,
"step": 3630
},
{
"epoch": 2.658995433789954,
"grad_norm": NaN,
"learning_rate": 1.1445485939812531e-05,
"loss": 0.0,
"step": 3640
},
{
"epoch": 2.666301369863014,
"grad_norm": NaN,
"learning_rate": 1.1198815984213124e-05,
"loss": 0.0,
"step": 3650
},
{
"epoch": 2.666301369863014,
"eval_loss": NaN,
"eval_runtime": 89.3812,
"eval_samples_per_second": 118.258,
"eval_steps_per_second": 7.395,
"step": 3650
},
{
"epoch": 2.673607305936073,
"grad_norm": NaN,
"learning_rate": 1.0952146028613715e-05,
"loss": 0.0,
"step": 3660
},
{
"epoch": 2.680913242009132,
"grad_norm": NaN,
"learning_rate": 1.0705476073014308e-05,
"loss": 0.0,
"step": 3670
},
{
"epoch": 2.688219178082192,
"grad_norm": NaN,
"learning_rate": 1.04588061174149e-05,
"loss": 0.0,
"step": 3680
},
{
"epoch": 2.695525114155251,
"grad_norm": NaN,
"learning_rate": 1.0212136161815492e-05,
"loss": 0.0,
"step": 3690
},
{
"epoch": 2.7028310502283106,
"grad_norm": NaN,
"learning_rate": 9.965466206216083e-06,
"loss": 0.0,
"step": 3700
},
{
"epoch": 2.7028310502283106,
"eval_loss": NaN,
"eval_runtime": 89.3655,
"eval_samples_per_second": 118.278,
"eval_steps_per_second": 7.397,
"step": 3700
},
{
"epoch": 2.71013698630137,
"grad_norm": NaN,
"learning_rate": 9.718796250616676e-06,
"loss": 0.0,
"step": 3710
},
{
"epoch": 2.7174429223744294,
"grad_norm": NaN,
"learning_rate": 9.472126295017267e-06,
"loss": 0.0,
"step": 3720
},
{
"epoch": 2.7247488584474886,
"grad_norm": NaN,
"learning_rate": 9.22545633941786e-06,
"loss": 0.0,
"step": 3730
},
{
"epoch": 2.732054794520548,
"grad_norm": NaN,
"learning_rate": 8.978786383818451e-06,
"loss": 0.0,
"step": 3740
},
{
"epoch": 2.7393607305936074,
"grad_norm": NaN,
"learning_rate": 8.732116428219044e-06,
"loss": 0.0,
"step": 3750
},
{
"epoch": 2.7393607305936074,
"eval_loss": NaN,
"eval_runtime": 86.8786,
"eval_samples_per_second": 121.664,
"eval_steps_per_second": 7.608,
"step": 3750
},
{
"epoch": 2.7466666666666666,
"grad_norm": NaN,
"learning_rate": 8.485446472619635e-06,
"loss": 0.0,
"step": 3760
},
{
"epoch": 2.7539726027397258,
"grad_norm": NaN,
"learning_rate": 8.238776517020228e-06,
"loss": 0.0,
"step": 3770
},
{
"epoch": 2.7612785388127854,
"grad_norm": NaN,
"learning_rate": 7.99210656142082e-06,
"loss": 0.0,
"step": 3780
},
{
"epoch": 2.768584474885845,
"grad_norm": NaN,
"learning_rate": 7.745436605821412e-06,
"loss": 0.0,
"step": 3790
},
{
"epoch": 2.775890410958904,
"grad_norm": NaN,
"learning_rate": 7.498766650222003e-06,
"loss": 0.0,
"step": 3800
},
{
"epoch": 2.775890410958904,
"eval_loss": NaN,
"eval_runtime": 87.0328,
"eval_samples_per_second": 121.449,
"eval_steps_per_second": 7.595,
"step": 3800
},
{
"epoch": 2.7831963470319634,
"grad_norm": NaN,
"learning_rate": 7.252096694622595e-06,
"loss": 0.0,
"step": 3810
},
{
"epoch": 2.790502283105023,
"grad_norm": NaN,
"learning_rate": 7.005426739023187e-06,
"loss": 0.0,
"step": 3820
},
{
"epoch": 2.797808219178082,
"grad_norm": NaN,
"learning_rate": 6.758756783423779e-06,
"loss": 0.0,
"step": 3830
},
{
"epoch": 2.8051141552511414,
"grad_norm": NaN,
"learning_rate": 6.512086827824371e-06,
"loss": 0.0,
"step": 3840
},
{
"epoch": 2.812420091324201,
"grad_norm": NaN,
"learning_rate": 6.265416872224963e-06,
"loss": 0.0,
"step": 3850
},
{
"epoch": 2.812420091324201,
"eval_loss": NaN,
"eval_runtime": 86.9228,
"eval_samples_per_second": 121.602,
"eval_steps_per_second": 7.604,
"step": 3850
},
{
"epoch": 2.81972602739726,
"grad_norm": NaN,
"learning_rate": 6.018746916625555e-06,
"loss": 0.0,
"step": 3860
},
{
"epoch": 2.8270319634703194,
"grad_norm": NaN,
"learning_rate": 5.772076961026148e-06,
"loss": 0.0,
"step": 3870
},
{
"epoch": 2.834337899543379,
"grad_norm": NaN,
"learning_rate": 5.52540700542674e-06,
"loss": 0.0,
"step": 3880
},
{
"epoch": 2.8416438356164386,
"grad_norm": NaN,
"learning_rate": 5.278737049827332e-06,
"loss": 0.0,
"step": 3890
},
{
"epoch": 2.848949771689498,
"grad_norm": NaN,
"learning_rate": 5.032067094227924e-06,
"loss": 0.0,
"step": 3900
},
{
"epoch": 2.848949771689498,
"eval_loss": NaN,
"eval_runtime": 86.8846,
"eval_samples_per_second": 121.656,
"eval_steps_per_second": 7.608,
"step": 3900
},
{
"epoch": 2.856255707762557,
"grad_norm": NaN,
"learning_rate": 4.785397138628516e-06,
"loss": 0.0,
"step": 3910
},
{
"epoch": 2.8635616438356166,
"grad_norm": NaN,
"learning_rate": 4.538727183029108e-06,
"loss": 0.0,
"step": 3920
},
{
"epoch": 2.870867579908676,
"grad_norm": NaN,
"learning_rate": 4.2920572274297e-06,
"loss": 0.0,
"step": 3930
},
{
"epoch": 2.878173515981735,
"grad_norm": NaN,
"learning_rate": 4.045387271830292e-06,
"loss": 0.0,
"step": 3940
},
{
"epoch": 2.8854794520547946,
"grad_norm": NaN,
"learning_rate": 3.7987173162308833e-06,
"loss": 0.0,
"step": 3950
},
{
"epoch": 2.8854794520547946,
"eval_loss": NaN,
"eval_runtime": 87.2057,
"eval_samples_per_second": 121.208,
"eval_steps_per_second": 7.58,
"step": 3950
},
{
"epoch": 2.892785388127854,
"grad_norm": NaN,
"learning_rate": 3.5520473606314752e-06,
"loss": 0.0,
"step": 3960
},
{
"epoch": 2.900091324200913,
"grad_norm": NaN,
"learning_rate": 3.3053774050320672e-06,
"loss": 0.0,
"step": 3970
},
{
"epoch": 2.9073972602739726,
"grad_norm": NaN,
"learning_rate": 3.058707449432659e-06,
"loss": 0.0,
"step": 3980
},
{
"epoch": 2.9147031963470322,
"grad_norm": NaN,
"learning_rate": 2.812037493833251e-06,
"loss": 0.0,
"step": 3990
},
{
"epoch": 2.9220091324200914,
"grad_norm": NaN,
"learning_rate": 2.5653675382338436e-06,
"loss": 0.0,
"step": 4000
},
{
"epoch": 2.9220091324200914,
"eval_loss": NaN,
"eval_runtime": 78.7041,
"eval_samples_per_second": 134.301,
"eval_steps_per_second": 8.399,
"step": 4000
},
{
"epoch": 2.9293150684931506,
"grad_norm": NaN,
"learning_rate": 2.3186975826344356e-06,
"loss": 0.0,
"step": 4010
},
{
"epoch": 2.9366210045662102,
"grad_norm": NaN,
"learning_rate": 2.0720276270350275e-06,
"loss": 0.0,
"step": 4020
},
{
"epoch": 2.9439269406392694,
"grad_norm": NaN,
"learning_rate": 1.8253576714356193e-06,
"loss": 0.0,
"step": 4030
},
{
"epoch": 2.9512328767123286,
"grad_norm": NaN,
"learning_rate": 1.5786877158362113e-06,
"loss": 0.0,
"step": 4040
},
{
"epoch": 2.958538812785388,
"grad_norm": NaN,
"learning_rate": 1.3320177602368033e-06,
"loss": 0.0,
"step": 4050
},
{
"epoch": 2.958538812785388,
"eval_loss": NaN,
"eval_runtime": 75.4374,
"eval_samples_per_second": 140.116,
"eval_steps_per_second": 8.762,
"step": 4050
},
{
"epoch": 2.9658447488584474,
"grad_norm": NaN,
"learning_rate": 1.0853478046373952e-06,
"loss": 0.0,
"step": 4060
},
{
"epoch": 2.9731506849315066,
"grad_norm": NaN,
"learning_rate": 8.386778490379872e-07,
"loss": 0.0,
"step": 4070
},
{
"epoch": 2.980456621004566,
"grad_norm": NaN,
"learning_rate": 5.920078934385792e-07,
"loss": 0.0,
"step": 4080
},
{
"epoch": 2.987762557077626,
"grad_norm": NaN,
"learning_rate": 3.4533793783917124e-07,
"loss": 0.0,
"step": 4090
},
{
"epoch": 2.995068493150685,
"grad_norm": NaN,
"learning_rate": 9.86679822397632e-08,
"loss": 0.0,
"step": 4100
},
{
"epoch": 2.995068493150685,
"eval_loss": NaN,
"eval_runtime": 75.3535,
"eval_samples_per_second": 140.272,
"eval_steps_per_second": 8.772,
"step": 4100
}
],
"logging_steps": 10,
"max_steps": 4104,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.893527772009083e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}