{ "best_metric": Infinity, "best_model_checkpoint": null, "epoch": 2.9979908675799085, "eval_steps": 50, "global_step": 4104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0073059360730593605, "grad_norm": NaN, "learning_rate": 2e-05, "loss": 0.0, "step": 10 }, { "epoch": 0.014611872146118721, "grad_norm": NaN, "learning_rate": 4e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.021917808219178082, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 0.0, "step": 30 }, { "epoch": 0.029223744292237442, "grad_norm": NaN, "learning_rate": 8e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.0365296803652968, "grad_norm": NaN, "learning_rate": 0.0001, "loss": 0.0, "step": 50 }, { "epoch": 0.0365296803652968, "eval_loss": NaN, "eval_runtime": 86.8266, "eval_samples_per_second": 121.737, "eval_steps_per_second": 7.613, "step": 50 }, { "epoch": 0.043835616438356165, "grad_norm": NaN, "learning_rate": 9.97533300444006e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.05114155251141553, "grad_norm": NaN, "learning_rate": 9.950666008880118e-05, "loss": 0.0, "step": 70 }, { "epoch": 0.058447488584474884, "grad_norm": NaN, "learning_rate": 9.925999013320178e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.06575342465753424, "grad_norm": NaN, "learning_rate": 9.901332017760238e-05, "loss": 0.0, "step": 90 }, { "epoch": 0.0730593607305936, "grad_norm": NaN, "learning_rate": 9.876665022200296e-05, "loss": 0.0, "step": 100 }, { "epoch": 0.0730593607305936, "eval_loss": NaN, "eval_runtime": 89.5193, "eval_samples_per_second": 118.075, "eval_steps_per_second": 7.384, "step": 100 }, { "epoch": 0.08036529680365297, "grad_norm": NaN, "learning_rate": 9.851998026640355e-05, "loss": 0.0, "step": 110 }, { "epoch": 0.08767123287671233, "grad_norm": NaN, "learning_rate": 9.827331031080415e-05, "loss": 0.0, "step": 120 }, { "epoch": 0.09497716894977169, "grad_norm": NaN, "learning_rate": 9.802664035520473e-05, "loss": 0.0, "step": 130 }, { "epoch": 0.10228310502283106, "grad_norm": NaN, "learning_rate": 9.777997039960533e-05, "loss": 0.0, "step": 140 }, { "epoch": 0.1095890410958904, "grad_norm": NaN, "learning_rate": 9.753330044400593e-05, "loss": 0.0, "step": 150 }, { "epoch": 0.1095890410958904, "eval_loss": NaN, "eval_runtime": 89.6606, "eval_samples_per_second": 117.889, "eval_steps_per_second": 7.372, "step": 150 }, { "epoch": 0.11689497716894977, "grad_norm": NaN, "learning_rate": 9.728663048840652e-05, "loss": 0.0, "step": 160 }, { "epoch": 0.12420091324200913, "grad_norm": NaN, "learning_rate": 9.70399605328071e-05, "loss": 0.0, "step": 170 }, { "epoch": 0.13150684931506848, "grad_norm": NaN, "learning_rate": 9.67932905772077e-05, "loss": 0.0, "step": 180 }, { "epoch": 0.13881278538812786, "grad_norm": NaN, "learning_rate": 9.65466206216083e-05, "loss": 0.0, "step": 190 }, { "epoch": 0.1461187214611872, "grad_norm": NaN, "learning_rate": 9.629995066600888e-05, "loss": 0.0, "step": 200 }, { "epoch": 0.1461187214611872, "eval_loss": NaN, "eval_runtime": 87.4596, "eval_samples_per_second": 120.856, "eval_steps_per_second": 7.558, "step": 200 }, { "epoch": 0.15342465753424658, "grad_norm": NaN, "learning_rate": 9.605328071040948e-05, "loss": 0.0, "step": 210 }, { "epoch": 0.16073059360730593, "grad_norm": NaN, "learning_rate": 9.580661075481007e-05, "loss": 0.0, "step": 220 }, { "epoch": 0.1680365296803653, "grad_norm": NaN, "learning_rate": 9.555994079921066e-05, "loss": 0.0, "step": 230 }, { "epoch": 0.17534246575342466, "grad_norm": NaN, "learning_rate": 9.531327084361125e-05, "loss": 0.0, "step": 240 }, { "epoch": 0.182648401826484, "grad_norm": NaN, "learning_rate": 9.506660088801185e-05, "loss": 0.0, "step": 250 }, { "epoch": 0.182648401826484, "eval_loss": NaN, "eval_runtime": 87.3071, "eval_samples_per_second": 121.067, "eval_steps_per_second": 7.571, "step": 250 }, { "epoch": 0.18995433789954339, "grad_norm": NaN, "learning_rate": 9.481993093241244e-05, "loss": 0.0, "step": 260 }, { "epoch": 0.19726027397260273, "grad_norm": NaN, "learning_rate": 9.457326097681303e-05, "loss": 0.0, "step": 270 }, { "epoch": 0.2045662100456621, "grad_norm": NaN, "learning_rate": 9.432659102121362e-05, "loss": 0.0, "step": 280 }, { "epoch": 0.21187214611872146, "grad_norm": NaN, "learning_rate": 9.40799210656142e-05, "loss": 0.0, "step": 290 }, { "epoch": 0.2191780821917808, "grad_norm": NaN, "learning_rate": 9.38332511100148e-05, "loss": 0.0, "step": 300 }, { "epoch": 0.2191780821917808, "eval_loss": NaN, "eval_runtime": 87.4316, "eval_samples_per_second": 120.895, "eval_steps_per_second": 7.56, "step": 300 }, { "epoch": 0.2264840182648402, "grad_norm": NaN, "learning_rate": 9.35865811544154e-05, "loss": 0.0, "step": 310 }, { "epoch": 0.23378995433789954, "grad_norm": NaN, "learning_rate": 9.3339911198816e-05, "loss": 0.0, "step": 320 }, { "epoch": 0.2410958904109589, "grad_norm": NaN, "learning_rate": 9.309324124321658e-05, "loss": 0.0, "step": 330 }, { "epoch": 0.24840182648401826, "grad_norm": NaN, "learning_rate": 9.284657128761717e-05, "loss": 0.0, "step": 340 }, { "epoch": 0.2557077625570776, "grad_norm": NaN, "learning_rate": 9.259990133201777e-05, "loss": 0.0, "step": 350 }, { "epoch": 0.2557077625570776, "eval_loss": NaN, "eval_runtime": 89.7649, "eval_samples_per_second": 117.752, "eval_steps_per_second": 7.364, "step": 350 }, { "epoch": 0.26301369863013696, "grad_norm": NaN, "learning_rate": 9.235323137641837e-05, "loss": 0.0, "step": 360 }, { "epoch": 0.27031963470319637, "grad_norm": NaN, "learning_rate": 9.210656142081895e-05, "loss": 0.0, "step": 370 }, { "epoch": 0.2776255707762557, "grad_norm": NaN, "learning_rate": 9.185989146521954e-05, "loss": 0.0, "step": 380 }, { "epoch": 0.28493150684931506, "grad_norm": NaN, "learning_rate": 9.161322150962013e-05, "loss": 0.0, "step": 390 }, { "epoch": 0.2922374429223744, "grad_norm": NaN, "learning_rate": 9.136655155402072e-05, "loss": 0.0, "step": 400 }, { "epoch": 0.2922374429223744, "eval_loss": NaN, "eval_runtime": 89.6739, "eval_samples_per_second": 117.871, "eval_steps_per_second": 7.371, "step": 400 }, { "epoch": 0.29954337899543376, "grad_norm": NaN, "learning_rate": 9.111988159842132e-05, "loss": 0.0, "step": 410 }, { "epoch": 0.30684931506849317, "grad_norm": NaN, "learning_rate": 9.087321164282192e-05, "loss": 0.0, "step": 420 }, { "epoch": 0.3141552511415525, "grad_norm": NaN, "learning_rate": 9.06265416872225e-05, "loss": 0.0, "step": 430 }, { "epoch": 0.32146118721461187, "grad_norm": NaN, "learning_rate": 9.03798717316231e-05, "loss": 0.0, "step": 440 }, { "epoch": 0.3287671232876712, "grad_norm": NaN, "learning_rate": 9.013320177602368e-05, "loss": 0.0, "step": 450 }, { "epoch": 0.3287671232876712, "eval_loss": NaN, "eval_runtime": 87.7201, "eval_samples_per_second": 120.497, "eval_steps_per_second": 7.535, "step": 450 }, { "epoch": 0.3360730593607306, "grad_norm": NaN, "learning_rate": 8.988653182042427e-05, "loss": 0.0, "step": 460 }, { "epoch": 0.34337899543378997, "grad_norm": NaN, "learning_rate": 8.963986186482487e-05, "loss": 0.0, "step": 470 }, { "epoch": 0.3506849315068493, "grad_norm": NaN, "learning_rate": 8.939319190922547e-05, "loss": 0.0, "step": 480 }, { "epoch": 0.35799086757990867, "grad_norm": NaN, "learning_rate": 8.914652195362605e-05, "loss": 0.0, "step": 490 }, { "epoch": 0.365296803652968, "grad_norm": NaN, "learning_rate": 8.889985199802664e-05, "loss": 0.0, "step": 500 }, { "epoch": 0.365296803652968, "eval_loss": NaN, "eval_runtime": 87.4748, "eval_samples_per_second": 120.835, "eval_steps_per_second": 7.556, "step": 500 }, { "epoch": 0.3726027397260274, "grad_norm": NaN, "learning_rate": 8.865318204242724e-05, "loss": 0.0, "step": 510 }, { "epoch": 0.37990867579908677, "grad_norm": NaN, "learning_rate": 8.840651208682784e-05, "loss": 0.0, "step": 520 }, { "epoch": 0.3872146118721461, "grad_norm": NaN, "learning_rate": 8.815984213122842e-05, "loss": 0.0, "step": 530 }, { "epoch": 0.39452054794520547, "grad_norm": NaN, "learning_rate": 8.791317217562902e-05, "loss": 0.0, "step": 540 }, { "epoch": 0.4018264840182648, "grad_norm": NaN, "learning_rate": 8.76665022200296e-05, "loss": 0.0, "step": 550 }, { "epoch": 0.4018264840182648, "eval_loss": NaN, "eval_runtime": 87.3047, "eval_samples_per_second": 121.07, "eval_steps_per_second": 7.571, "step": 550 }, { "epoch": 0.4091324200913242, "grad_norm": NaN, "learning_rate": 8.74198322644302e-05, "loss": 0.0, "step": 560 }, { "epoch": 0.41643835616438357, "grad_norm": NaN, "learning_rate": 8.717316230883079e-05, "loss": 0.0, "step": 570 }, { "epoch": 0.4237442922374429, "grad_norm": NaN, "learning_rate": 8.692649235323139e-05, "loss": 0.0, "step": 580 }, { "epoch": 0.43105022831050227, "grad_norm": NaN, "learning_rate": 8.667982239763197e-05, "loss": 0.0, "step": 590 }, { "epoch": 0.4383561643835616, "grad_norm": NaN, "learning_rate": 8.643315244203257e-05, "loss": 0.0, "step": 600 }, { "epoch": 0.4383561643835616, "eval_loss": NaN, "eval_runtime": 87.4314, "eval_samples_per_second": 120.895, "eval_steps_per_second": 7.56, "step": 600 }, { "epoch": 0.445662100456621, "grad_norm": NaN, "learning_rate": 8.618648248643315e-05, "loss": 0.0, "step": 610 }, { "epoch": 0.4529680365296804, "grad_norm": NaN, "learning_rate": 8.593981253083376e-05, "loss": 0.0, "step": 620 }, { "epoch": 0.4602739726027397, "grad_norm": NaN, "learning_rate": 8.569314257523434e-05, "loss": 0.0, "step": 630 }, { "epoch": 0.46757990867579907, "grad_norm": NaN, "learning_rate": 8.544647261963494e-05, "loss": 0.0, "step": 640 }, { "epoch": 0.4748858447488584, "grad_norm": NaN, "learning_rate": 8.519980266403552e-05, "loss": 0.0, "step": 650 }, { "epoch": 0.4748858447488584, "eval_loss": NaN, "eval_runtime": 88.9686, "eval_samples_per_second": 118.806, "eval_steps_per_second": 7.43, "step": 650 }, { "epoch": 0.4821917808219178, "grad_norm": NaN, "learning_rate": 8.495313270843612e-05, "loss": 0.0, "step": 660 }, { "epoch": 0.4894977168949772, "grad_norm": NaN, "learning_rate": 8.470646275283671e-05, "loss": 0.0, "step": 670 }, { "epoch": 0.4968036529680365, "grad_norm": NaN, "learning_rate": 8.445979279723731e-05, "loss": 0.0, "step": 680 }, { "epoch": 0.5041095890410959, "grad_norm": NaN, "learning_rate": 8.421312284163789e-05, "loss": 0.0, "step": 690 }, { "epoch": 0.5114155251141552, "grad_norm": NaN, "learning_rate": 8.396645288603849e-05, "loss": 0.0, "step": 700 }, { "epoch": 0.5114155251141552, "eval_loss": NaN, "eval_runtime": 89.7675, "eval_samples_per_second": 117.749, "eval_steps_per_second": 7.363, "step": 700 }, { "epoch": 0.5187214611872146, "grad_norm": NaN, "learning_rate": 8.371978293043907e-05, "loss": 0.0, "step": 710 }, { "epoch": 0.5260273972602739, "grad_norm": NaN, "learning_rate": 8.347311297483968e-05, "loss": 0.0, "step": 720 }, { "epoch": 0.5333333333333333, "grad_norm": NaN, "learning_rate": 8.322644301924026e-05, "loss": 0.0, "step": 730 }, { "epoch": 0.5406392694063927, "grad_norm": NaN, "learning_rate": 8.297977306364086e-05, "loss": 0.0, "step": 740 }, { "epoch": 0.547945205479452, "grad_norm": NaN, "learning_rate": 8.273310310804144e-05, "loss": 0.0, "step": 750 }, { "epoch": 0.547945205479452, "eval_loss": NaN, "eval_runtime": 89.6344, "eval_samples_per_second": 117.923, "eval_steps_per_second": 7.374, "step": 750 }, { "epoch": 0.5552511415525114, "grad_norm": NaN, "learning_rate": 8.248643315244204e-05, "loss": 0.0, "step": 760 }, { "epoch": 0.5625570776255707, "grad_norm": NaN, "learning_rate": 8.223976319684262e-05, "loss": 0.0, "step": 770 }, { "epoch": 0.5698630136986301, "grad_norm": NaN, "learning_rate": 8.199309324124323e-05, "loss": 0.0, "step": 780 }, { "epoch": 0.5771689497716895, "grad_norm": NaN, "learning_rate": 8.174642328564381e-05, "loss": 0.0, "step": 790 }, { "epoch": 0.5844748858447488, "grad_norm": NaN, "learning_rate": 8.149975333004441e-05, "loss": 0.0, "step": 800 }, { "epoch": 0.5844748858447488, "eval_loss": NaN, "eval_runtime": 87.5041, "eval_samples_per_second": 120.794, "eval_steps_per_second": 7.554, "step": 800 }, { "epoch": 0.5917808219178082, "grad_norm": NaN, "learning_rate": 8.125308337444499e-05, "loss": 0.0, "step": 810 }, { "epoch": 0.5990867579908675, "grad_norm": NaN, "learning_rate": 8.100641341884559e-05, "loss": 0.0, "step": 820 }, { "epoch": 0.6063926940639269, "grad_norm": NaN, "learning_rate": 8.075974346324618e-05, "loss": 0.0, "step": 830 }, { "epoch": 0.6136986301369863, "grad_norm": NaN, "learning_rate": 8.051307350764678e-05, "loss": 0.0, "step": 840 }, { "epoch": 0.6210045662100456, "grad_norm": NaN, "learning_rate": 8.026640355204736e-05, "loss": 0.0, "step": 850 }, { "epoch": 0.6210045662100456, "eval_loss": NaN, "eval_runtime": 87.6119, "eval_samples_per_second": 120.646, "eval_steps_per_second": 7.545, "step": 850 }, { "epoch": 0.628310502283105, "grad_norm": NaN, "learning_rate": 8.001973359644796e-05, "loss": 0.0, "step": 860 }, { "epoch": 0.6356164383561644, "grad_norm": NaN, "learning_rate": 7.977306364084854e-05, "loss": 0.0, "step": 870 }, { "epoch": 0.6429223744292237, "grad_norm": NaN, "learning_rate": 7.952639368524915e-05, "loss": 0.0, "step": 880 }, { "epoch": 0.6502283105022831, "grad_norm": NaN, "learning_rate": 7.927972372964973e-05, "loss": 0.0, "step": 890 }, { "epoch": 0.6575342465753424, "grad_norm": NaN, "learning_rate": 7.903305377405033e-05, "loss": 0.0, "step": 900 }, { "epoch": 0.6575342465753424, "eval_loss": NaN, "eval_runtime": 87.2395, "eval_samples_per_second": 121.161, "eval_steps_per_second": 7.577, "step": 900 }, { "epoch": 0.6648401826484018, "grad_norm": NaN, "learning_rate": 7.878638381845091e-05, "loss": 0.0, "step": 910 }, { "epoch": 0.6721461187214612, "grad_norm": NaN, "learning_rate": 7.853971386285151e-05, "loss": 0.0, "step": 920 }, { "epoch": 0.6794520547945205, "grad_norm": NaN, "learning_rate": 7.829304390725209e-05, "loss": 0.0, "step": 930 }, { "epoch": 0.6867579908675799, "grad_norm": NaN, "learning_rate": 7.80463739516527e-05, "loss": 0.0, "step": 940 }, { "epoch": 0.6940639269406392, "grad_norm": NaN, "learning_rate": 7.779970399605328e-05, "loss": 0.0, "step": 950 }, { "epoch": 0.6940639269406392, "eval_loss": NaN, "eval_runtime": 88.6493, "eval_samples_per_second": 119.234, "eval_steps_per_second": 7.456, "step": 950 }, { "epoch": 0.7013698630136986, "grad_norm": NaN, "learning_rate": 7.755303404045388e-05, "loss": 0.0, "step": 960 }, { "epoch": 0.708675799086758, "grad_norm": NaN, "learning_rate": 7.730636408485446e-05, "loss": 0.0, "step": 970 }, { "epoch": 0.7159817351598173, "grad_norm": NaN, "learning_rate": 7.705969412925506e-05, "loss": 0.0, "step": 980 }, { "epoch": 0.7232876712328767, "grad_norm": NaN, "learning_rate": 7.681302417365566e-05, "loss": 0.0, "step": 990 }, { "epoch": 0.730593607305936, "grad_norm": NaN, "learning_rate": 7.656635421805625e-05, "loss": 0.0, "step": 1000 }, { "epoch": 0.730593607305936, "eval_loss": NaN, "eval_runtime": 89.7851, "eval_samples_per_second": 117.726, "eval_steps_per_second": 7.362, "step": 1000 }, { "epoch": 0.7378995433789954, "grad_norm": NaN, "learning_rate": 7.631968426245683e-05, "loss": 0.0, "step": 1010 }, { "epoch": 0.7452054794520548, "grad_norm": NaN, "learning_rate": 7.607301430685743e-05, "loss": 0.0, "step": 1020 }, { "epoch": 0.7525114155251141, "grad_norm": NaN, "learning_rate": 7.582634435125801e-05, "loss": 0.0, "step": 1030 }, { "epoch": 0.7598173515981735, "grad_norm": NaN, "learning_rate": 7.557967439565862e-05, "loss": 0.0, "step": 1040 }, { "epoch": 0.7671232876712328, "grad_norm": NaN, "learning_rate": 7.53330044400592e-05, "loss": 0.0, "step": 1050 }, { "epoch": 0.7671232876712328, "eval_loss": NaN, "eval_runtime": 89.5436, "eval_samples_per_second": 118.043, "eval_steps_per_second": 7.382, "step": 1050 }, { "epoch": 0.7744292237442922, "grad_norm": NaN, "learning_rate": 7.50863344844598e-05, "loss": 0.0, "step": 1060 }, { "epoch": 0.7817351598173516, "grad_norm": NaN, "learning_rate": 7.483966452886039e-05, "loss": 0.0, "step": 1070 }, { "epoch": 0.7890410958904109, "grad_norm": NaN, "learning_rate": 7.459299457326098e-05, "loss": 0.0, "step": 1080 }, { "epoch": 0.7963470319634703, "grad_norm": NaN, "learning_rate": 7.434632461766156e-05, "loss": 0.0, "step": 1090 }, { "epoch": 0.8036529680365296, "grad_norm": NaN, "learning_rate": 7.409965466206217e-05, "loss": 0.0, "step": 1100 }, { "epoch": 0.8036529680365296, "eval_loss": NaN, "eval_runtime": 87.5031, "eval_samples_per_second": 120.796, "eval_steps_per_second": 7.554, "step": 1100 }, { "epoch": 0.810958904109589, "grad_norm": NaN, "learning_rate": 7.385298470646276e-05, "loss": 0.0, "step": 1110 }, { "epoch": 0.8182648401826484, "grad_norm": NaN, "learning_rate": 7.360631475086335e-05, "loss": 0.0, "step": 1120 }, { "epoch": 0.8255707762557077, "grad_norm": NaN, "learning_rate": 7.335964479526394e-05, "loss": 0.0, "step": 1130 }, { "epoch": 0.8328767123287671, "grad_norm": NaN, "learning_rate": 7.311297483966453e-05, "loss": 0.0, "step": 1140 }, { "epoch": 0.8401826484018264, "grad_norm": NaN, "learning_rate": 7.286630488406513e-05, "loss": 0.0, "step": 1150 }, { "epoch": 0.8401826484018264, "eval_loss": NaN, "eval_runtime": 87.5455, "eval_samples_per_second": 120.737, "eval_steps_per_second": 7.55, "step": 1150 }, { "epoch": 0.8474885844748858, "grad_norm": NaN, "learning_rate": 7.261963492846572e-05, "loss": 0.0, "step": 1160 }, { "epoch": 0.8547945205479452, "grad_norm": NaN, "learning_rate": 7.23729649728663e-05, "loss": 0.0, "step": 1170 }, { "epoch": 0.8621004566210045, "grad_norm": NaN, "learning_rate": 7.21262950172669e-05, "loss": 0.0, "step": 1180 }, { "epoch": 0.869406392694064, "grad_norm": NaN, "learning_rate": 7.187962506166749e-05, "loss": 0.0, "step": 1190 }, { "epoch": 0.8767123287671232, "grad_norm": NaN, "learning_rate": 7.16329551060681e-05, "loss": 0.0, "step": 1200 }, { "epoch": 0.8767123287671232, "eval_loss": NaN, "eval_runtime": 87.4965, "eval_samples_per_second": 120.805, "eval_steps_per_second": 7.555, "step": 1200 }, { "epoch": 0.8840182648401826, "grad_norm": NaN, "learning_rate": 7.138628515046868e-05, "loss": 0.0, "step": 1210 }, { "epoch": 0.891324200913242, "grad_norm": NaN, "learning_rate": 7.113961519486927e-05, "loss": 0.0, "step": 1220 }, { "epoch": 0.8986301369863013, "grad_norm": NaN, "learning_rate": 7.089294523926986e-05, "loss": 0.0, "step": 1230 }, { "epoch": 0.9059360730593607, "grad_norm": NaN, "learning_rate": 7.064627528367045e-05, "loss": 0.0, "step": 1240 }, { "epoch": 0.91324200913242, "grad_norm": NaN, "learning_rate": 7.039960532807104e-05, "loss": 0.0, "step": 1250 }, { "epoch": 0.91324200913242, "eval_loss": NaN, "eval_runtime": 87.5268, "eval_samples_per_second": 120.763, "eval_steps_per_second": 7.552, "step": 1250 }, { "epoch": 0.9205479452054794, "grad_norm": NaN, "learning_rate": 7.015293537247165e-05, "loss": 0.0, "step": 1260 }, { "epoch": 0.9278538812785389, "grad_norm": NaN, "learning_rate": 6.990626541687223e-05, "loss": 0.0, "step": 1270 }, { "epoch": 0.9351598173515981, "grad_norm": NaN, "learning_rate": 6.965959546127282e-05, "loss": 0.0, "step": 1280 }, { "epoch": 0.9424657534246575, "grad_norm": NaN, "learning_rate": 6.941292550567341e-05, "loss": 0.0, "step": 1290 }, { "epoch": 0.9497716894977168, "grad_norm": NaN, "learning_rate": 6.9166255550074e-05, "loss": 0.0, "step": 1300 }, { "epoch": 0.9497716894977168, "eval_loss": NaN, "eval_runtime": 89.7784, "eval_samples_per_second": 117.734, "eval_steps_per_second": 7.363, "step": 1300 }, { "epoch": 0.9570776255707762, "grad_norm": NaN, "learning_rate": 6.89195855944746e-05, "loss": 0.0, "step": 1310 }, { "epoch": 0.9643835616438357, "grad_norm": NaN, "learning_rate": 6.86729156388752e-05, "loss": 0.0, "step": 1320 }, { "epoch": 0.971689497716895, "grad_norm": NaN, "learning_rate": 6.842624568327578e-05, "loss": 0.0, "step": 1330 }, { "epoch": 0.9789954337899544, "grad_norm": NaN, "learning_rate": 6.817957572767637e-05, "loss": 0.0, "step": 1340 }, { "epoch": 0.9863013698630136, "grad_norm": NaN, "learning_rate": 6.793290577207696e-05, "loss": 0.0, "step": 1350 }, { "epoch": 0.9863013698630136, "eval_loss": NaN, "eval_runtime": 89.7601, "eval_samples_per_second": 117.758, "eval_steps_per_second": 7.364, "step": 1350 }, { "epoch": 0.993607305936073, "grad_norm": NaN, "learning_rate": 6.768623581647757e-05, "loss": 0.0, "step": 1360 }, { "epoch": 1.0007305936073059, "grad_norm": NaN, "learning_rate": 6.743956586087815e-05, "loss": 0.0, "step": 1370 }, { "epoch": 1.0080365296803653, "grad_norm": NaN, "learning_rate": 6.719289590527875e-05, "loss": 0.0, "step": 1380 }, { "epoch": 1.0153424657534247, "grad_norm": NaN, "learning_rate": 6.694622594967933e-05, "loss": 0.0, "step": 1390 }, { "epoch": 1.022648401826484, "grad_norm": NaN, "learning_rate": 6.669955599407992e-05, "loss": 0.0, "step": 1400 }, { "epoch": 1.022648401826484, "eval_loss": NaN, "eval_runtime": 87.4612, "eval_samples_per_second": 120.854, "eval_steps_per_second": 7.558, "step": 1400 }, { "epoch": 1.0299543378995433, "grad_norm": NaN, "learning_rate": 6.645288603848051e-05, "loss": 0.0, "step": 1410 }, { "epoch": 1.0372602739726027, "grad_norm": NaN, "learning_rate": 6.620621608288112e-05, "loss": 0.0, "step": 1420 }, { "epoch": 1.044566210045662, "grad_norm": NaN, "learning_rate": 6.59595461272817e-05, "loss": 0.0, "step": 1430 }, { "epoch": 1.0518721461187215, "grad_norm": NaN, "learning_rate": 6.57128761716823e-05, "loss": 0.0, "step": 1440 }, { "epoch": 1.059178082191781, "grad_norm": NaN, "learning_rate": 6.546620621608288e-05, "loss": 0.0, "step": 1450 }, { "epoch": 1.059178082191781, "eval_loss": NaN, "eval_runtime": 87.6838, "eval_samples_per_second": 120.547, "eval_steps_per_second": 7.538, "step": 1450 }, { "epoch": 1.0664840182648403, "grad_norm": NaN, "learning_rate": 6.521953626048347e-05, "loss": 0.0, "step": 1460 }, { "epoch": 1.0737899543378995, "grad_norm": NaN, "learning_rate": 6.497286630488407e-05, "loss": 0.0, "step": 1470 }, { "epoch": 1.0810958904109589, "grad_norm": NaN, "learning_rate": 6.472619634928467e-05, "loss": 0.0, "step": 1480 }, { "epoch": 1.0884018264840183, "grad_norm": NaN, "learning_rate": 6.447952639368525e-05, "loss": 0.0, "step": 1490 }, { "epoch": 1.0957077625570777, "grad_norm": NaN, "learning_rate": 6.423285643808585e-05, "loss": 0.0, "step": 1500 }, { "epoch": 1.0957077625570777, "eval_loss": NaN, "eval_runtime": 87.6114, "eval_samples_per_second": 120.646, "eval_steps_per_second": 7.545, "step": 1500 }, { "epoch": 1.103013698630137, "grad_norm": NaN, "learning_rate": 6.398618648248643e-05, "loss": 0.0, "step": 1510 }, { "epoch": 1.1103196347031963, "grad_norm": NaN, "learning_rate": 6.373951652688704e-05, "loss": 0.0, "step": 1520 }, { "epoch": 1.1176255707762557, "grad_norm": NaN, "learning_rate": 6.349284657128762e-05, "loss": 0.0, "step": 1530 }, { "epoch": 1.124931506849315, "grad_norm": NaN, "learning_rate": 6.324617661568822e-05, "loss": 0.0, "step": 1540 }, { "epoch": 1.1322374429223745, "grad_norm": NaN, "learning_rate": 6.29995066600888e-05, "loss": 0.0, "step": 1550 }, { "epoch": 1.1322374429223745, "eval_loss": NaN, "eval_runtime": 88.5181, "eval_samples_per_second": 119.411, "eval_steps_per_second": 7.467, "step": 1550 }, { "epoch": 1.139543378995434, "grad_norm": NaN, "learning_rate": 6.27528367044894e-05, "loss": 0.0, "step": 1560 }, { "epoch": 1.146849315068493, "grad_norm": NaN, "learning_rate": 6.250616674888998e-05, "loss": 0.0, "step": 1570 }, { "epoch": 1.1541552511415525, "grad_norm": NaN, "learning_rate": 6.225949679329059e-05, "loss": 0.0, "step": 1580 }, { "epoch": 1.161461187214612, "grad_norm": NaN, "learning_rate": 6.201282683769117e-05, "loss": 0.0, "step": 1590 }, { "epoch": 1.1687671232876713, "grad_norm": NaN, "learning_rate": 6.176615688209177e-05, "loss": 0.0, "step": 1600 }, { "epoch": 1.1687671232876713, "eval_loss": NaN, "eval_runtime": 89.8677, "eval_samples_per_second": 117.617, "eval_steps_per_second": 7.355, "step": 1600 }, { "epoch": 1.1760730593607307, "grad_norm": NaN, "learning_rate": 6.151948692649235e-05, "loss": 0.0, "step": 1610 }, { "epoch": 1.1833789954337899, "grad_norm": NaN, "learning_rate": 6.127281697089295e-05, "loss": 0.0, "step": 1620 }, { "epoch": 1.1906849315068493, "grad_norm": NaN, "learning_rate": 6.102614701529354e-05, "loss": 0.0, "step": 1630 }, { "epoch": 1.1979908675799087, "grad_norm": NaN, "learning_rate": 6.077947705969413e-05, "loss": 0.0, "step": 1640 }, { "epoch": 1.205296803652968, "grad_norm": NaN, "learning_rate": 6.053280710409472e-05, "loss": 0.0, "step": 1650 }, { "epoch": 1.205296803652968, "eval_loss": NaN, "eval_runtime": 90.1315, "eval_samples_per_second": 117.273, "eval_steps_per_second": 7.334, "step": 1650 }, { "epoch": 1.2126027397260275, "grad_norm": NaN, "learning_rate": 6.028613714849531e-05, "loss": 0.0, "step": 1660 }, { "epoch": 1.2199086757990867, "grad_norm": NaN, "learning_rate": 6.003946719289591e-05, "loss": 0.0, "step": 1670 }, { "epoch": 1.227214611872146, "grad_norm": NaN, "learning_rate": 5.9792797237296503e-05, "loss": 0.0, "step": 1680 }, { "epoch": 1.2345205479452055, "grad_norm": NaN, "learning_rate": 5.954612728169709e-05, "loss": 0.0, "step": 1690 }, { "epoch": 1.241826484018265, "grad_norm": NaN, "learning_rate": 5.929945732609768e-05, "loss": 0.0, "step": 1700 }, { "epoch": 1.241826484018265, "eval_loss": NaN, "eval_runtime": 87.7312, "eval_samples_per_second": 120.482, "eval_steps_per_second": 7.534, "step": 1700 }, { "epoch": 1.2491324200913243, "grad_norm": NaN, "learning_rate": 5.905278737049827e-05, "loss": 0.0, "step": 1710 }, { "epoch": 1.2564383561643835, "grad_norm": NaN, "learning_rate": 5.880611741489887e-05, "loss": 0.0, "step": 1720 }, { "epoch": 1.263744292237443, "grad_norm": NaN, "learning_rate": 5.855944745929946e-05, "loss": 0.0, "step": 1730 }, { "epoch": 1.2710502283105023, "grad_norm": NaN, "learning_rate": 5.8312777503700054e-05, "loss": 0.0, "step": 1740 }, { "epoch": 1.2783561643835617, "grad_norm": NaN, "learning_rate": 5.806610754810064e-05, "loss": 0.0, "step": 1750 }, { "epoch": 1.2783561643835617, "eval_loss": NaN, "eval_runtime": 87.6857, "eval_samples_per_second": 120.544, "eval_steps_per_second": 7.538, "step": 1750 }, { "epoch": 1.285662100456621, "grad_norm": NaN, "learning_rate": 5.781943759250123e-05, "loss": 0.0, "step": 1760 }, { "epoch": 1.2929680365296803, "grad_norm": NaN, "learning_rate": 5.757276763690183e-05, "loss": 0.0, "step": 1770 }, { "epoch": 1.3002739726027397, "grad_norm": NaN, "learning_rate": 5.732609768130242e-05, "loss": 0.0, "step": 1780 }, { "epoch": 1.307579908675799, "grad_norm": NaN, "learning_rate": 5.7079427725703014e-05, "loss": 0.0, "step": 1790 }, { "epoch": 1.3148858447488585, "grad_norm": NaN, "learning_rate": 5.6832757770103604e-05, "loss": 0.0, "step": 1800 }, { "epoch": 1.3148858447488585, "eval_loss": NaN, "eval_runtime": 87.7498, "eval_samples_per_second": 120.456, "eval_steps_per_second": 7.533, "step": 1800 }, { "epoch": 1.322191780821918, "grad_norm": NaN, "learning_rate": 5.658608781450419e-05, "loss": 0.0, "step": 1810 }, { "epoch": 1.329497716894977, "grad_norm": NaN, "learning_rate": 5.633941785890479e-05, "loss": 0.0, "step": 1820 }, { "epoch": 1.3368036529680365, "grad_norm": NaN, "learning_rate": 5.609274790330538e-05, "loss": 0.0, "step": 1830 }, { "epoch": 1.344109589041096, "grad_norm": NaN, "learning_rate": 5.5846077947705975e-05, "loss": 0.0, "step": 1840 }, { "epoch": 1.3514155251141553, "grad_norm": NaN, "learning_rate": 5.5599407992106565e-05, "loss": 0.0, "step": 1850 }, { "epoch": 1.3514155251141553, "eval_loss": NaN, "eval_runtime": 87.7348, "eval_samples_per_second": 120.477, "eval_steps_per_second": 7.534, "step": 1850 }, { "epoch": 1.3587214611872147, "grad_norm": NaN, "learning_rate": 5.5352738036507154e-05, "loss": 0.0, "step": 1860 }, { "epoch": 1.366027397260274, "grad_norm": NaN, "learning_rate": 5.5106068080907743e-05, "loss": 0.0, "step": 1870 }, { "epoch": 1.3733333333333333, "grad_norm": NaN, "learning_rate": 5.485939812530834e-05, "loss": 0.0, "step": 1880 }, { "epoch": 1.3806392694063927, "grad_norm": NaN, "learning_rate": 5.461272816970893e-05, "loss": 0.0, "step": 1890 }, { "epoch": 1.387945205479452, "grad_norm": NaN, "learning_rate": 5.4366058214109525e-05, "loss": 0.0, "step": 1900 }, { "epoch": 1.387945205479452, "eval_loss": NaN, "eval_runtime": 87.626, "eval_samples_per_second": 120.626, "eval_steps_per_second": 7.543, "step": 1900 }, { "epoch": 1.3952511415525115, "grad_norm": NaN, "learning_rate": 5.4119388258510115e-05, "loss": 0.0, "step": 1910 }, { "epoch": 1.4025570776255707, "grad_norm": NaN, "learning_rate": 5.3872718302910704e-05, "loss": 0.0, "step": 1920 }, { "epoch": 1.40986301369863, "grad_norm": NaN, "learning_rate": 5.36260483473113e-05, "loss": 0.0, "step": 1930 }, { "epoch": 1.4171689497716895, "grad_norm": NaN, "learning_rate": 5.337937839171189e-05, "loss": 0.0, "step": 1940 }, { "epoch": 1.424474885844749, "grad_norm": NaN, "learning_rate": 5.3132708436112486e-05, "loss": 0.0, "step": 1950 }, { "epoch": 1.424474885844749, "eval_loss": NaN, "eval_runtime": 89.8105, "eval_samples_per_second": 117.692, "eval_steps_per_second": 7.36, "step": 1950 }, { "epoch": 1.4317808219178083, "grad_norm": NaN, "learning_rate": 5.2886038480513075e-05, "loss": 0.0, "step": 1960 }, { "epoch": 1.4390867579908675, "grad_norm": NaN, "learning_rate": 5.2639368524913665e-05, "loss": 0.0, "step": 1970 }, { "epoch": 1.446392694063927, "grad_norm": NaN, "learning_rate": 5.239269856931426e-05, "loss": 0.0, "step": 1980 }, { "epoch": 1.4536986301369863, "grad_norm": NaN, "learning_rate": 5.214602861371485e-05, "loss": 0.0, "step": 1990 }, { "epoch": 1.4610045662100457, "grad_norm": NaN, "learning_rate": 5.189935865811545e-05, "loss": 0.0, "step": 2000 }, { "epoch": 1.4610045662100457, "eval_loss": NaN, "eval_runtime": 89.7513, "eval_samples_per_second": 117.77, "eval_steps_per_second": 7.365, "step": 2000 }, { "epoch": 1.4683105022831051, "grad_norm": NaN, "learning_rate": 5.1652688702516036e-05, "loss": 0.0, "step": 2010 }, { "epoch": 1.4756164383561643, "grad_norm": NaN, "learning_rate": 5.1406018746916626e-05, "loss": 0.0, "step": 2020 }, { "epoch": 1.4829223744292237, "grad_norm": NaN, "learning_rate": 5.115934879131722e-05, "loss": 0.0, "step": 2030 }, { "epoch": 1.490228310502283, "grad_norm": NaN, "learning_rate": 5.091267883571781e-05, "loss": 0.0, "step": 2040 }, { "epoch": 1.4975342465753425, "grad_norm": NaN, "learning_rate": 5.06660088801184e-05, "loss": 0.0, "step": 2050 }, { "epoch": 1.4975342465753425, "eval_loss": NaN, "eval_runtime": 87.5277, "eval_samples_per_second": 120.762, "eval_steps_per_second": 7.552, "step": 2050 }, { "epoch": 1.504840182648402, "grad_norm": NaN, "learning_rate": 5.0419338924519e-05, "loss": 0.0, "step": 2060 }, { "epoch": 1.512146118721461, "grad_norm": NaN, "learning_rate": 5.0172668968919586e-05, "loss": 0.0, "step": 2070 }, { "epoch": 1.5194520547945205, "grad_norm": NaN, "learning_rate": 4.992599901332018e-05, "loss": 0.0, "step": 2080 }, { "epoch": 1.52675799086758, "grad_norm": NaN, "learning_rate": 4.967932905772077e-05, "loss": 0.0, "step": 2090 }, { "epoch": 1.5340639269406393, "grad_norm": NaN, "learning_rate": 4.943265910212136e-05, "loss": 0.0, "step": 2100 }, { "epoch": 1.5340639269406393, "eval_loss": NaN, "eval_runtime": 87.4849, "eval_samples_per_second": 120.821, "eval_steps_per_second": 7.556, "step": 2100 }, { "epoch": 1.5413698630136987, "grad_norm": NaN, "learning_rate": 4.918598914652196e-05, "loss": 0.0, "step": 2110 }, { "epoch": 1.548675799086758, "grad_norm": NaN, "learning_rate": 4.893931919092255e-05, "loss": 0.0, "step": 2120 }, { "epoch": 1.5559817351598173, "grad_norm": NaN, "learning_rate": 4.869264923532314e-05, "loss": 0.0, "step": 2130 }, { "epoch": 1.5632876712328767, "grad_norm": NaN, "learning_rate": 4.844597927972373e-05, "loss": 0.0, "step": 2140 }, { "epoch": 1.5705936073059361, "grad_norm": NaN, "learning_rate": 4.819930932412432e-05, "loss": 0.0, "step": 2150 }, { "epoch": 1.5705936073059361, "eval_loss": NaN, "eval_runtime": 87.7089, "eval_samples_per_second": 120.512, "eval_steps_per_second": 7.536, "step": 2150 }, { "epoch": 1.5778995433789955, "grad_norm": NaN, "learning_rate": 4.795263936852492e-05, "loss": 0.0, "step": 2160 }, { "epoch": 1.5852054794520547, "grad_norm": NaN, "learning_rate": 4.770596941292551e-05, "loss": 0.0, "step": 2170 }, { "epoch": 1.592511415525114, "grad_norm": NaN, "learning_rate": 4.7459299457326104e-05, "loss": 0.0, "step": 2180 }, { "epoch": 1.5998173515981735, "grad_norm": NaN, "learning_rate": 4.7212629501726694e-05, "loss": 0.0, "step": 2190 }, { "epoch": 1.607123287671233, "grad_norm": NaN, "learning_rate": 4.696595954612728e-05, "loss": 0.0, "step": 2200 }, { "epoch": 1.607123287671233, "eval_loss": NaN, "eval_runtime": 87.4106, "eval_samples_per_second": 120.924, "eval_steps_per_second": 7.562, "step": 2200 }, { "epoch": 1.6144292237442923, "grad_norm": NaN, "learning_rate": 4.671928959052788e-05, "loss": 0.0, "step": 2210 }, { "epoch": 1.6217351598173515, "grad_norm": NaN, "learning_rate": 4.647261963492847e-05, "loss": 0.0, "step": 2220 }, { "epoch": 1.629041095890411, "grad_norm": NaN, "learning_rate": 4.622594967932906e-05, "loss": 0.0, "step": 2230 }, { "epoch": 1.6363470319634703, "grad_norm": NaN, "learning_rate": 4.5979279723729654e-05, "loss": 0.0, "step": 2240 }, { "epoch": 1.6436529680365297, "grad_norm": NaN, "learning_rate": 4.5732609768130244e-05, "loss": 0.0, "step": 2250 }, { "epoch": 1.6436529680365297, "eval_loss": NaN, "eval_runtime": 87.6352, "eval_samples_per_second": 120.614, "eval_steps_per_second": 7.543, "step": 2250 }, { "epoch": 1.6509589041095891, "grad_norm": NaN, "learning_rate": 4.548593981253084e-05, "loss": 0.0, "step": 2260 }, { "epoch": 1.6582648401826483, "grad_norm": NaN, "learning_rate": 4.523926985693143e-05, "loss": 0.0, "step": 2270 }, { "epoch": 1.6655707762557077, "grad_norm": NaN, "learning_rate": 4.499259990133202e-05, "loss": 0.0, "step": 2280 }, { "epoch": 1.6728767123287671, "grad_norm": NaN, "learning_rate": 4.4745929945732615e-05, "loss": 0.0, "step": 2290 }, { "epoch": 1.6801826484018265, "grad_norm": NaN, "learning_rate": 4.4499259990133204e-05, "loss": 0.0, "step": 2300 }, { "epoch": 1.6801826484018265, "eval_loss": NaN, "eval_runtime": 89.8795, "eval_samples_per_second": 117.602, "eval_steps_per_second": 7.354, "step": 2300 }, { "epoch": 1.687488584474886, "grad_norm": NaN, "learning_rate": 4.42525900345338e-05, "loss": 0.0, "step": 2310 }, { "epoch": 1.694794520547945, "grad_norm": NaN, "learning_rate": 4.400592007893439e-05, "loss": 0.0, "step": 2320 }, { "epoch": 1.7021004566210047, "grad_norm": NaN, "learning_rate": 4.375925012333498e-05, "loss": 0.0, "step": 2330 }, { "epoch": 1.709406392694064, "grad_norm": NaN, "learning_rate": 4.3512580167735576e-05, "loss": 0.0, "step": 2340 }, { "epoch": 1.7167123287671233, "grad_norm": NaN, "learning_rate": 4.3265910212136165e-05, "loss": 0.0, "step": 2350 }, { "epoch": 1.7167123287671233, "eval_loss": NaN, "eval_runtime": 89.9633, "eval_samples_per_second": 117.492, "eval_steps_per_second": 7.347, "step": 2350 }, { "epoch": 1.7240182648401827, "grad_norm": NaN, "learning_rate": 4.3019240256536755e-05, "loss": 0.0, "step": 2360 }, { "epoch": 1.731324200913242, "grad_norm": NaN, "learning_rate": 4.277257030093735e-05, "loss": 0.0, "step": 2370 }, { "epoch": 1.7386301369863015, "grad_norm": NaN, "learning_rate": 4.252590034533794e-05, "loss": 0.0, "step": 2380 }, { "epoch": 1.7459360730593607, "grad_norm": NaN, "learning_rate": 4.2279230389738537e-05, "loss": 0.0, "step": 2390 }, { "epoch": 1.7532420091324201, "grad_norm": NaN, "learning_rate": 4.2032560434139126e-05, "loss": 0.0, "step": 2400 }, { "epoch": 1.7532420091324201, "eval_loss": NaN, "eval_runtime": 89.7987, "eval_samples_per_second": 117.708, "eval_steps_per_second": 7.361, "step": 2400 }, { "epoch": 1.7605479452054795, "grad_norm": NaN, "learning_rate": 4.1785890478539715e-05, "loss": 0.0, "step": 2410 }, { "epoch": 1.7678538812785387, "grad_norm": NaN, "learning_rate": 4.153922052294031e-05, "loss": 0.0, "step": 2420 }, { "epoch": 1.7751598173515983, "grad_norm": NaN, "learning_rate": 4.12925505673409e-05, "loss": 0.0, "step": 2430 }, { "epoch": 1.7824657534246575, "grad_norm": NaN, "learning_rate": 4.10458806117415e-05, "loss": 0.0, "step": 2440 }, { "epoch": 1.789771689497717, "grad_norm": NaN, "learning_rate": 4.079921065614209e-05, "loss": 0.0, "step": 2450 }, { "epoch": 1.789771689497717, "eval_loss": NaN, "eval_runtime": 87.6424, "eval_samples_per_second": 120.604, "eval_steps_per_second": 7.542, "step": 2450 }, { "epoch": 1.7970776255707763, "grad_norm": NaN, "learning_rate": 4.0552540700542676e-05, "loss": 0.0, "step": 2460 }, { "epoch": 1.8043835616438355, "grad_norm": NaN, "learning_rate": 4.030587074494327e-05, "loss": 0.0, "step": 2470 }, { "epoch": 1.8116894977168951, "grad_norm": NaN, "learning_rate": 4.005920078934386e-05, "loss": 0.0, "step": 2480 }, { "epoch": 1.8189954337899543, "grad_norm": NaN, "learning_rate": 3.981253083374445e-05, "loss": 0.0, "step": 2490 }, { "epoch": 1.8263013698630137, "grad_norm": NaN, "learning_rate": 3.956586087814505e-05, "loss": 0.0, "step": 2500 }, { "epoch": 1.8263013698630137, "eval_loss": NaN, "eval_runtime": 87.5994, "eval_samples_per_second": 120.663, "eval_steps_per_second": 7.546, "step": 2500 }, { "epoch": 1.8336073059360731, "grad_norm": NaN, "learning_rate": 3.931919092254564e-05, "loss": 0.0, "step": 2510 }, { "epoch": 1.8409132420091323, "grad_norm": NaN, "learning_rate": 3.907252096694623e-05, "loss": 0.0, "step": 2520 }, { "epoch": 1.848219178082192, "grad_norm": NaN, "learning_rate": 3.882585101134682e-05, "loss": 0.0, "step": 2530 }, { "epoch": 1.8555251141552511, "grad_norm": NaN, "learning_rate": 3.857918105574741e-05, "loss": 0.0, "step": 2540 }, { "epoch": 1.8628310502283105, "grad_norm": NaN, "learning_rate": 3.833251110014801e-05, "loss": 0.0, "step": 2550 }, { "epoch": 1.8628310502283105, "eval_loss": NaN, "eval_runtime": 87.4443, "eval_samples_per_second": 120.877, "eval_steps_per_second": 7.559, "step": 2550 }, { "epoch": 1.87013698630137, "grad_norm": NaN, "learning_rate": 3.80858411445486e-05, "loss": 0.0, "step": 2560 }, { "epoch": 1.8774429223744291, "grad_norm": NaN, "learning_rate": 3.783917118894919e-05, "loss": 0.0, "step": 2570 }, { "epoch": 1.8847488584474887, "grad_norm": NaN, "learning_rate": 3.759250123334978e-05, "loss": 0.0, "step": 2580 }, { "epoch": 1.892054794520548, "grad_norm": NaN, "learning_rate": 3.734583127775037e-05, "loss": 0.0, "step": 2590 }, { "epoch": 1.8993607305936073, "grad_norm": NaN, "learning_rate": 3.709916132215097e-05, "loss": 0.0, "step": 2600 }, { "epoch": 1.8993607305936073, "eval_loss": NaN, "eval_runtime": 87.7012, "eval_samples_per_second": 120.523, "eval_steps_per_second": 7.537, "step": 2600 }, { "epoch": 1.9066666666666667, "grad_norm": NaN, "learning_rate": 3.685249136655156e-05, "loss": 0.0, "step": 2610 }, { "epoch": 1.913972602739726, "grad_norm": NaN, "learning_rate": 3.660582141095215e-05, "loss": 0.0, "step": 2620 }, { "epoch": 1.9212785388127855, "grad_norm": NaN, "learning_rate": 3.6359151455352744e-05, "loss": 0.0, "step": 2630 }, { "epoch": 1.9285844748858447, "grad_norm": NaN, "learning_rate": 3.6112481499753333e-05, "loss": 0.0, "step": 2640 }, { "epoch": 1.9358904109589041, "grad_norm": NaN, "learning_rate": 3.586581154415392e-05, "loss": 0.0, "step": 2650 }, { "epoch": 1.9358904109589041, "eval_loss": NaN, "eval_runtime": 87.5232, "eval_samples_per_second": 120.768, "eval_steps_per_second": 7.552, "step": 2650 }, { "epoch": 1.9431963470319635, "grad_norm": NaN, "learning_rate": 3.561914158855452e-05, "loss": 0.0, "step": 2660 }, { "epoch": 1.9505022831050227, "grad_norm": NaN, "learning_rate": 3.537247163295511e-05, "loss": 0.0, "step": 2670 }, { "epoch": 1.9578082191780823, "grad_norm": NaN, "learning_rate": 3.5125801677355705e-05, "loss": 0.0, "step": 2680 }, { "epoch": 1.9651141552511415, "grad_norm": NaN, "learning_rate": 3.4879131721756294e-05, "loss": 0.0, "step": 2690 }, { "epoch": 1.972420091324201, "grad_norm": NaN, "learning_rate": 3.4632461766156884e-05, "loss": 0.0, "step": 2700 }, { "epoch": 1.972420091324201, "eval_loss": NaN, "eval_runtime": 87.6204, "eval_samples_per_second": 120.634, "eval_steps_per_second": 7.544, "step": 2700 }, { "epoch": 1.9797260273972603, "grad_norm": NaN, "learning_rate": 3.438579181055748e-05, "loss": 0.0, "step": 2710 }, { "epoch": 1.9870319634703195, "grad_norm": NaN, "learning_rate": 3.413912185495807e-05, "loss": 0.0, "step": 2720 }, { "epoch": 1.9943378995433791, "grad_norm": NaN, "learning_rate": 3.389245189935866e-05, "loss": 0.0, "step": 2730 }, { "epoch": 2.0014611872146117, "grad_norm": NaN, "learning_rate": 3.3645781943759255e-05, "loss": 0.0, "step": 2740 }, { "epoch": 2.0087671232876714, "grad_norm": NaN, "learning_rate": 3.3399111988159844e-05, "loss": 0.0, "step": 2750 }, { "epoch": 2.0087671232876714, "eval_loss": NaN, "eval_runtime": 89.786, "eval_samples_per_second": 117.724, "eval_steps_per_second": 7.362, "step": 2750 }, { "epoch": 2.0160730593607306, "grad_norm": NaN, "learning_rate": 3.315244203256044e-05, "loss": 0.0, "step": 2760 }, { "epoch": 2.0233789954337897, "grad_norm": NaN, "learning_rate": 3.290577207696103e-05, "loss": 0.0, "step": 2770 }, { "epoch": 2.0306849315068494, "grad_norm": NaN, "learning_rate": 3.265910212136162e-05, "loss": 0.0, "step": 2780 }, { "epoch": 2.0379908675799085, "grad_norm": NaN, "learning_rate": 3.2412432165762216e-05, "loss": 0.0, "step": 2790 }, { "epoch": 2.045296803652968, "grad_norm": NaN, "learning_rate": 3.2165762210162805e-05, "loss": 0.0, "step": 2800 }, { "epoch": 2.045296803652968, "eval_loss": NaN, "eval_runtime": 89.6927, "eval_samples_per_second": 117.847, "eval_steps_per_second": 7.37, "step": 2800 }, { "epoch": 2.0526027397260274, "grad_norm": NaN, "learning_rate": 3.1919092254563395e-05, "loss": 0.0, "step": 2810 }, { "epoch": 2.0599086757990865, "grad_norm": NaN, "learning_rate": 3.167242229896399e-05, "loss": 0.0, "step": 2820 }, { "epoch": 2.067214611872146, "grad_norm": NaN, "learning_rate": 3.142575234336458e-05, "loss": 0.0, "step": 2830 }, { "epoch": 2.0745205479452054, "grad_norm": NaN, "learning_rate": 3.1179082387765176e-05, "loss": 0.0, "step": 2840 }, { "epoch": 2.081826484018265, "grad_norm": NaN, "learning_rate": 3.0932412432165766e-05, "loss": 0.0, "step": 2850 }, { "epoch": 2.081826484018265, "eval_loss": NaN, "eval_runtime": 89.0102, "eval_samples_per_second": 118.75, "eval_steps_per_second": 7.426, "step": 2850 }, { "epoch": 2.089132420091324, "grad_norm": NaN, "learning_rate": 3.0685742476566355e-05, "loss": 0.0, "step": 2860 }, { "epoch": 2.0964383561643833, "grad_norm": NaN, "learning_rate": 3.0439072520966948e-05, "loss": 0.0, "step": 2870 }, { "epoch": 2.103744292237443, "grad_norm": NaN, "learning_rate": 3.0192402565367538e-05, "loss": 0.0, "step": 2880 }, { "epoch": 2.111050228310502, "grad_norm": NaN, "learning_rate": 2.994573260976813e-05, "loss": 0.0, "step": 2890 }, { "epoch": 2.118356164383562, "grad_norm": NaN, "learning_rate": 2.9699062654168723e-05, "loss": 0.0, "step": 2900 }, { "epoch": 2.118356164383562, "eval_loss": NaN, "eval_runtime": 87.4632, "eval_samples_per_second": 120.851, "eval_steps_per_second": 7.557, "step": 2900 }, { "epoch": 2.125662100456621, "grad_norm": NaN, "learning_rate": 2.9452392698569313e-05, "loss": 0.0, "step": 2910 }, { "epoch": 2.1329680365296806, "grad_norm": NaN, "learning_rate": 2.920572274296991e-05, "loss": 0.0, "step": 2920 }, { "epoch": 2.1402739726027398, "grad_norm": NaN, "learning_rate": 2.89590527873705e-05, "loss": 0.0, "step": 2930 }, { "epoch": 2.147579908675799, "grad_norm": NaN, "learning_rate": 2.871238283177109e-05, "loss": 0.0, "step": 2940 }, { "epoch": 2.1548858447488586, "grad_norm": NaN, "learning_rate": 2.8465712876171684e-05, "loss": 0.0, "step": 2950 }, { "epoch": 2.1548858447488586, "eval_loss": NaN, "eval_runtime": 87.3497, "eval_samples_per_second": 121.008, "eval_steps_per_second": 7.567, "step": 2950 }, { "epoch": 2.1621917808219178, "grad_norm": NaN, "learning_rate": 2.8219042920572273e-05, "loss": 0.0, "step": 2960 }, { "epoch": 2.169497716894977, "grad_norm": NaN, "learning_rate": 2.7972372964972866e-05, "loss": 0.0, "step": 2970 }, { "epoch": 2.1768036529680366, "grad_norm": NaN, "learning_rate": 2.772570300937346e-05, "loss": 0.0, "step": 2980 }, { "epoch": 2.1841095890410958, "grad_norm": NaN, "learning_rate": 2.7479033053774052e-05, "loss": 0.0, "step": 2990 }, { "epoch": 2.1914155251141554, "grad_norm": NaN, "learning_rate": 2.7232363098174645e-05, "loss": 0.0, "step": 3000 }, { "epoch": 2.1914155251141554, "eval_loss": NaN, "eval_runtime": 87.4657, "eval_samples_per_second": 120.847, "eval_steps_per_second": 7.557, "step": 3000 }, { "epoch": 2.1987214611872146, "grad_norm": NaN, "learning_rate": 2.6985693142575234e-05, "loss": 0.0, "step": 3010 }, { "epoch": 2.206027397260274, "grad_norm": NaN, "learning_rate": 2.6739023186975827e-05, "loss": 0.0, "step": 3020 }, { "epoch": 2.2133333333333334, "grad_norm": NaN, "learning_rate": 2.649235323137642e-05, "loss": 0.0, "step": 3030 }, { "epoch": 2.2206392694063926, "grad_norm": NaN, "learning_rate": 2.6245683275777013e-05, "loss": 0.0, "step": 3040 }, { "epoch": 2.227945205479452, "grad_norm": NaN, "learning_rate": 2.5999013320177602e-05, "loss": 0.0, "step": 3050 }, { "epoch": 2.227945205479452, "eval_loss": NaN, "eval_runtime": 87.5429, "eval_samples_per_second": 120.741, "eval_steps_per_second": 7.551, "step": 3050 }, { "epoch": 2.2352511415525114, "grad_norm": NaN, "learning_rate": 2.5752343364578195e-05, "loss": 0.0, "step": 3060 }, { "epoch": 2.2425570776255705, "grad_norm": NaN, "learning_rate": 2.5505673408978788e-05, "loss": 0.0, "step": 3070 }, { "epoch": 2.24986301369863, "grad_norm": NaN, "learning_rate": 2.525900345337938e-05, "loss": 0.0, "step": 3080 }, { "epoch": 2.2571689497716894, "grad_norm": NaN, "learning_rate": 2.501233349777997e-05, "loss": 0.0, "step": 3090 }, { "epoch": 2.264474885844749, "grad_norm": NaN, "learning_rate": 2.4765663542180563e-05, "loss": 0.0, "step": 3100 }, { "epoch": 2.264474885844749, "eval_loss": NaN, "eval_runtime": 87.4603, "eval_samples_per_second": 120.855, "eval_steps_per_second": 7.558, "step": 3100 }, { "epoch": 2.271780821917808, "grad_norm": NaN, "learning_rate": 2.4518993586581156e-05, "loss": 0.0, "step": 3110 }, { "epoch": 2.279086757990868, "grad_norm": NaN, "learning_rate": 2.427232363098175e-05, "loss": 0.0, "step": 3120 }, { "epoch": 2.286392694063927, "grad_norm": NaN, "learning_rate": 2.402565367538234e-05, "loss": 0.0, "step": 3130 }, { "epoch": 2.293698630136986, "grad_norm": NaN, "learning_rate": 2.377898371978293e-05, "loss": 0.0, "step": 3140 }, { "epoch": 2.301004566210046, "grad_norm": NaN, "learning_rate": 2.3532313764183524e-05, "loss": 0.0, "step": 3150 }, { "epoch": 2.301004566210046, "eval_loss": NaN, "eval_runtime": 89.2945, "eval_samples_per_second": 118.372, "eval_steps_per_second": 7.402, "step": 3150 }, { "epoch": 2.308310502283105, "grad_norm": NaN, "learning_rate": 2.3285643808584116e-05, "loss": 0.0, "step": 3160 }, { "epoch": 2.315616438356164, "grad_norm": NaN, "learning_rate": 2.303897385298471e-05, "loss": 0.0, "step": 3170 }, { "epoch": 2.322922374429224, "grad_norm": NaN, "learning_rate": 2.27923038973853e-05, "loss": 0.0, "step": 3180 }, { "epoch": 2.330228310502283, "grad_norm": NaN, "learning_rate": 2.254563394178589e-05, "loss": 0.0, "step": 3190 }, { "epoch": 2.3375342465753426, "grad_norm": NaN, "learning_rate": 2.2298963986186484e-05, "loss": 0.0, "step": 3200 }, { "epoch": 2.3375342465753426, "eval_loss": NaN, "eval_runtime": 89.6497, "eval_samples_per_second": 117.903, "eval_steps_per_second": 7.373, "step": 3200 }, { "epoch": 2.3448401826484018, "grad_norm": NaN, "learning_rate": 2.2052294030587077e-05, "loss": 0.0, "step": 3210 }, { "epoch": 2.3521461187214614, "grad_norm": NaN, "learning_rate": 2.180562407498767e-05, "loss": 0.0, "step": 3220 }, { "epoch": 2.3594520547945206, "grad_norm": NaN, "learning_rate": 2.155895411938826e-05, "loss": 0.0, "step": 3230 }, { "epoch": 2.3667579908675798, "grad_norm": NaN, "learning_rate": 2.1312284163788852e-05, "loss": 0.0, "step": 3240 }, { "epoch": 2.3740639269406394, "grad_norm": NaN, "learning_rate": 2.1065614208189445e-05, "loss": 0.0, "step": 3250 }, { "epoch": 2.3740639269406394, "eval_loss": NaN, "eval_runtime": 89.7652, "eval_samples_per_second": 117.752, "eval_steps_per_second": 7.364, "step": 3250 }, { "epoch": 2.3813698630136986, "grad_norm": NaN, "learning_rate": 2.0818944252590038e-05, "loss": 0.0, "step": 3260 }, { "epoch": 2.3886757990867578, "grad_norm": NaN, "learning_rate": 2.0572274296990627e-05, "loss": 0.0, "step": 3270 }, { "epoch": 2.3959817351598174, "grad_norm": NaN, "learning_rate": 2.032560434139122e-05, "loss": 0.0, "step": 3280 }, { "epoch": 2.4032876712328766, "grad_norm": NaN, "learning_rate": 2.0078934385791813e-05, "loss": 0.0, "step": 3290 }, { "epoch": 2.410593607305936, "grad_norm": NaN, "learning_rate": 1.9832264430192406e-05, "loss": 0.0, "step": 3300 }, { "epoch": 2.410593607305936, "eval_loss": NaN, "eval_runtime": 88.5138, "eval_samples_per_second": 119.416, "eval_steps_per_second": 7.468, "step": 3300 }, { "epoch": 2.4178995433789954, "grad_norm": NaN, "learning_rate": 1.9585594474592995e-05, "loss": 0.0, "step": 3310 }, { "epoch": 2.425205479452055, "grad_norm": NaN, "learning_rate": 1.9338924518993588e-05, "loss": 0.0, "step": 3320 }, { "epoch": 2.432511415525114, "grad_norm": NaN, "learning_rate": 1.909225456339418e-05, "loss": 0.0, "step": 3330 }, { "epoch": 2.4398173515981734, "grad_norm": NaN, "learning_rate": 1.8845584607794774e-05, "loss": 0.0, "step": 3340 }, { "epoch": 2.447123287671233, "grad_norm": NaN, "learning_rate": 1.8598914652195363e-05, "loss": 0.0, "step": 3350 }, { "epoch": 2.447123287671233, "eval_loss": NaN, "eval_runtime": 87.334, "eval_samples_per_second": 121.03, "eval_steps_per_second": 7.569, "step": 3350 }, { "epoch": 2.454429223744292, "grad_norm": NaN, "learning_rate": 1.8352244696595956e-05, "loss": 0.0, "step": 3360 }, { "epoch": 2.4617351598173514, "grad_norm": NaN, "learning_rate": 1.810557474099655e-05, "loss": 0.0, "step": 3370 }, { "epoch": 2.469041095890411, "grad_norm": NaN, "learning_rate": 1.785890478539714e-05, "loss": 0.0, "step": 3380 }, { "epoch": 2.47634703196347, "grad_norm": NaN, "learning_rate": 1.761223482979773e-05, "loss": 0.0, "step": 3390 }, { "epoch": 2.48365296803653, "grad_norm": NaN, "learning_rate": 1.7365564874198324e-05, "loss": 0.0, "step": 3400 }, { "epoch": 2.48365296803653, "eval_loss": NaN, "eval_runtime": 87.2333, "eval_samples_per_second": 121.169, "eval_steps_per_second": 7.577, "step": 3400 }, { "epoch": 2.490958904109589, "grad_norm": NaN, "learning_rate": 1.7118894918598917e-05, "loss": 0.0, "step": 3410 }, { "epoch": 2.4982648401826486, "grad_norm": NaN, "learning_rate": 1.687222496299951e-05, "loss": 0.0, "step": 3420 }, { "epoch": 2.505570776255708, "grad_norm": NaN, "learning_rate": 1.66255550074001e-05, "loss": 0.0, "step": 3430 }, { "epoch": 2.512876712328767, "grad_norm": NaN, "learning_rate": 1.6378885051800692e-05, "loss": 0.0, "step": 3440 }, { "epoch": 2.5201826484018266, "grad_norm": NaN, "learning_rate": 1.6132215096201285e-05, "loss": 0.0, "step": 3450 }, { "epoch": 2.5201826484018266, "eval_loss": NaN, "eval_runtime": 87.3114, "eval_samples_per_second": 121.061, "eval_steps_per_second": 7.571, "step": 3450 }, { "epoch": 2.527488584474886, "grad_norm": NaN, "learning_rate": 1.5885545140601878e-05, "loss": 0.0, "step": 3460 }, { "epoch": 2.534794520547945, "grad_norm": NaN, "learning_rate": 1.5638875185002467e-05, "loss": 0.0, "step": 3470 }, { "epoch": 2.5421004566210046, "grad_norm": NaN, "learning_rate": 1.539220522940306e-05, "loss": 0.0, "step": 3480 }, { "epoch": 2.5494063926940638, "grad_norm": NaN, "learning_rate": 1.5145535273803651e-05, "loss": 0.0, "step": 3490 }, { "epoch": 2.5567123287671234, "grad_norm": NaN, "learning_rate": 1.4898865318204244e-05, "loss": 0.0, "step": 3500 }, { "epoch": 2.5567123287671234, "eval_loss": NaN, "eval_runtime": 87.2842, "eval_samples_per_second": 121.099, "eval_steps_per_second": 7.573, "step": 3500 }, { "epoch": 2.5640182648401826, "grad_norm": NaN, "learning_rate": 1.4652195362604835e-05, "loss": 0.0, "step": 3510 }, { "epoch": 2.571324200913242, "grad_norm": NaN, "learning_rate": 1.4405525407005426e-05, "loss": 0.0, "step": 3520 }, { "epoch": 2.5786301369863014, "grad_norm": NaN, "learning_rate": 1.4158855451406019e-05, "loss": 0.0, "step": 3530 }, { "epoch": 2.5859360730593606, "grad_norm": NaN, "learning_rate": 1.3912185495806612e-05, "loss": 0.0, "step": 3540 }, { "epoch": 2.59324200913242, "grad_norm": NaN, "learning_rate": 1.3665515540207203e-05, "loss": 0.0, "step": 3550 }, { "epoch": 2.59324200913242, "eval_loss": NaN, "eval_runtime": 87.1762, "eval_samples_per_second": 121.249, "eval_steps_per_second": 7.582, "step": 3550 }, { "epoch": 2.6005479452054794, "grad_norm": NaN, "learning_rate": 1.3418845584607796e-05, "loss": 0.0, "step": 3560 }, { "epoch": 2.6078538812785386, "grad_norm": NaN, "learning_rate": 1.3172175629008387e-05, "loss": 0.0, "step": 3570 }, { "epoch": 2.615159817351598, "grad_norm": NaN, "learning_rate": 1.292550567340898e-05, "loss": 0.0, "step": 3580 }, { "epoch": 2.6224657534246574, "grad_norm": NaN, "learning_rate": 1.267883571780957e-05, "loss": 0.0, "step": 3590 }, { "epoch": 2.629771689497717, "grad_norm": NaN, "learning_rate": 1.2432165762210164e-05, "loss": 0.0, "step": 3600 }, { "epoch": 2.629771689497717, "eval_loss": NaN, "eval_runtime": 89.0981, "eval_samples_per_second": 118.633, "eval_steps_per_second": 7.419, "step": 3600 }, { "epoch": 2.637077625570776, "grad_norm": NaN, "learning_rate": 1.2185495806610755e-05, "loss": 0.0, "step": 3610 }, { "epoch": 2.644383561643836, "grad_norm": NaN, "learning_rate": 1.1938825851011348e-05, "loss": 0.0, "step": 3620 }, { "epoch": 2.651689497716895, "grad_norm": NaN, "learning_rate": 1.169215589541194e-05, "loss": 0.0, "step": 3630 }, { "epoch": 2.658995433789954, "grad_norm": NaN, "learning_rate": 1.1445485939812531e-05, "loss": 0.0, "step": 3640 }, { "epoch": 2.666301369863014, "grad_norm": NaN, "learning_rate": 1.1198815984213124e-05, "loss": 0.0, "step": 3650 }, { "epoch": 2.666301369863014, "eval_loss": NaN, "eval_runtime": 89.3812, "eval_samples_per_second": 118.258, "eval_steps_per_second": 7.395, "step": 3650 }, { "epoch": 2.673607305936073, "grad_norm": NaN, "learning_rate": 1.0952146028613715e-05, "loss": 0.0, "step": 3660 }, { "epoch": 2.680913242009132, "grad_norm": NaN, "learning_rate": 1.0705476073014308e-05, "loss": 0.0, "step": 3670 }, { "epoch": 2.688219178082192, "grad_norm": NaN, "learning_rate": 1.04588061174149e-05, "loss": 0.0, "step": 3680 }, { "epoch": 2.695525114155251, "grad_norm": NaN, "learning_rate": 1.0212136161815492e-05, "loss": 0.0, "step": 3690 }, { "epoch": 2.7028310502283106, "grad_norm": NaN, "learning_rate": 9.965466206216083e-06, "loss": 0.0, "step": 3700 }, { "epoch": 2.7028310502283106, "eval_loss": NaN, "eval_runtime": 89.3655, "eval_samples_per_second": 118.278, "eval_steps_per_second": 7.397, "step": 3700 }, { "epoch": 2.71013698630137, "grad_norm": NaN, "learning_rate": 9.718796250616676e-06, "loss": 0.0, "step": 3710 }, { "epoch": 2.7174429223744294, "grad_norm": NaN, "learning_rate": 9.472126295017267e-06, "loss": 0.0, "step": 3720 }, { "epoch": 2.7247488584474886, "grad_norm": NaN, "learning_rate": 9.22545633941786e-06, "loss": 0.0, "step": 3730 }, { "epoch": 2.732054794520548, "grad_norm": NaN, "learning_rate": 8.978786383818451e-06, "loss": 0.0, "step": 3740 }, { "epoch": 2.7393607305936074, "grad_norm": NaN, "learning_rate": 8.732116428219044e-06, "loss": 0.0, "step": 3750 }, { "epoch": 2.7393607305936074, "eval_loss": NaN, "eval_runtime": 86.8786, "eval_samples_per_second": 121.664, "eval_steps_per_second": 7.608, "step": 3750 }, { "epoch": 2.7466666666666666, "grad_norm": NaN, "learning_rate": 8.485446472619635e-06, "loss": 0.0, "step": 3760 }, { "epoch": 2.7539726027397258, "grad_norm": NaN, "learning_rate": 8.238776517020228e-06, "loss": 0.0, "step": 3770 }, { "epoch": 2.7612785388127854, "grad_norm": NaN, "learning_rate": 7.99210656142082e-06, "loss": 0.0, "step": 3780 }, { "epoch": 2.768584474885845, "grad_norm": NaN, "learning_rate": 7.745436605821412e-06, "loss": 0.0, "step": 3790 }, { "epoch": 2.775890410958904, "grad_norm": NaN, "learning_rate": 7.498766650222003e-06, "loss": 0.0, "step": 3800 }, { "epoch": 2.775890410958904, "eval_loss": NaN, "eval_runtime": 87.0328, "eval_samples_per_second": 121.449, "eval_steps_per_second": 7.595, "step": 3800 }, { "epoch": 2.7831963470319634, "grad_norm": NaN, "learning_rate": 7.252096694622595e-06, "loss": 0.0, "step": 3810 }, { "epoch": 2.790502283105023, "grad_norm": NaN, "learning_rate": 7.005426739023187e-06, "loss": 0.0, "step": 3820 }, { "epoch": 2.797808219178082, "grad_norm": NaN, "learning_rate": 6.758756783423779e-06, "loss": 0.0, "step": 3830 }, { "epoch": 2.8051141552511414, "grad_norm": NaN, "learning_rate": 6.512086827824371e-06, "loss": 0.0, "step": 3840 }, { "epoch": 2.812420091324201, "grad_norm": NaN, "learning_rate": 6.265416872224963e-06, "loss": 0.0, "step": 3850 }, { "epoch": 2.812420091324201, "eval_loss": NaN, "eval_runtime": 86.9228, "eval_samples_per_second": 121.602, "eval_steps_per_second": 7.604, "step": 3850 }, { "epoch": 2.81972602739726, "grad_norm": NaN, "learning_rate": 6.018746916625555e-06, "loss": 0.0, "step": 3860 }, { "epoch": 2.8270319634703194, "grad_norm": NaN, "learning_rate": 5.772076961026148e-06, "loss": 0.0, "step": 3870 }, { "epoch": 2.834337899543379, "grad_norm": NaN, "learning_rate": 5.52540700542674e-06, "loss": 0.0, "step": 3880 }, { "epoch": 2.8416438356164386, "grad_norm": NaN, "learning_rate": 5.278737049827332e-06, "loss": 0.0, "step": 3890 }, { "epoch": 2.848949771689498, "grad_norm": NaN, "learning_rate": 5.032067094227924e-06, "loss": 0.0, "step": 3900 }, { "epoch": 2.848949771689498, "eval_loss": NaN, "eval_runtime": 86.8846, "eval_samples_per_second": 121.656, "eval_steps_per_second": 7.608, "step": 3900 }, { "epoch": 2.856255707762557, "grad_norm": NaN, "learning_rate": 4.785397138628516e-06, "loss": 0.0, "step": 3910 }, { "epoch": 2.8635616438356166, "grad_norm": NaN, "learning_rate": 4.538727183029108e-06, "loss": 0.0, "step": 3920 }, { "epoch": 2.870867579908676, "grad_norm": NaN, "learning_rate": 4.2920572274297e-06, "loss": 0.0, "step": 3930 }, { "epoch": 2.878173515981735, "grad_norm": NaN, "learning_rate": 4.045387271830292e-06, "loss": 0.0, "step": 3940 }, { "epoch": 2.8854794520547946, "grad_norm": NaN, "learning_rate": 3.7987173162308833e-06, "loss": 0.0, "step": 3950 }, { "epoch": 2.8854794520547946, "eval_loss": NaN, "eval_runtime": 87.2057, "eval_samples_per_second": 121.208, "eval_steps_per_second": 7.58, "step": 3950 }, { "epoch": 2.892785388127854, "grad_norm": NaN, "learning_rate": 3.5520473606314752e-06, "loss": 0.0, "step": 3960 }, { "epoch": 2.900091324200913, "grad_norm": NaN, "learning_rate": 3.3053774050320672e-06, "loss": 0.0, "step": 3970 }, { "epoch": 2.9073972602739726, "grad_norm": NaN, "learning_rate": 3.058707449432659e-06, "loss": 0.0, "step": 3980 }, { "epoch": 2.9147031963470322, "grad_norm": NaN, "learning_rate": 2.812037493833251e-06, "loss": 0.0, "step": 3990 }, { "epoch": 2.9220091324200914, "grad_norm": NaN, "learning_rate": 2.5653675382338436e-06, "loss": 0.0, "step": 4000 }, { "epoch": 2.9220091324200914, "eval_loss": NaN, "eval_runtime": 78.7041, "eval_samples_per_second": 134.301, "eval_steps_per_second": 8.399, "step": 4000 }, { "epoch": 2.9293150684931506, "grad_norm": NaN, "learning_rate": 2.3186975826344356e-06, "loss": 0.0, "step": 4010 }, { "epoch": 2.9366210045662102, "grad_norm": NaN, "learning_rate": 2.0720276270350275e-06, "loss": 0.0, "step": 4020 }, { "epoch": 2.9439269406392694, "grad_norm": NaN, "learning_rate": 1.8253576714356193e-06, "loss": 0.0, "step": 4030 }, { "epoch": 2.9512328767123286, "grad_norm": NaN, "learning_rate": 1.5786877158362113e-06, "loss": 0.0, "step": 4040 }, { "epoch": 2.958538812785388, "grad_norm": NaN, "learning_rate": 1.3320177602368033e-06, "loss": 0.0, "step": 4050 }, { "epoch": 2.958538812785388, "eval_loss": NaN, "eval_runtime": 75.4374, "eval_samples_per_second": 140.116, "eval_steps_per_second": 8.762, "step": 4050 }, { "epoch": 2.9658447488584474, "grad_norm": NaN, "learning_rate": 1.0853478046373952e-06, "loss": 0.0, "step": 4060 }, { "epoch": 2.9731506849315066, "grad_norm": NaN, "learning_rate": 8.386778490379872e-07, "loss": 0.0, "step": 4070 }, { "epoch": 2.980456621004566, "grad_norm": NaN, "learning_rate": 5.920078934385792e-07, "loss": 0.0, "step": 4080 }, { "epoch": 2.987762557077626, "grad_norm": NaN, "learning_rate": 3.4533793783917124e-07, "loss": 0.0, "step": 4090 }, { "epoch": 2.995068493150685, "grad_norm": NaN, "learning_rate": 9.86679822397632e-08, "loss": 0.0, "step": 4100 }, { "epoch": 2.995068493150685, "eval_loss": NaN, "eval_runtime": 75.3535, "eval_samples_per_second": 140.272, "eval_steps_per_second": 8.772, "step": 4100 } ], "logging_steps": 10, "max_steps": 4104, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.893527772009083e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }