|
{ |
|
"best_metric": null,
|
"best_model_checkpoint": null, |
|
"epoch": 2.9979908675799085, |
|
"eval_steps": 50, |
|
"global_step": 4104, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0073059360730593605, |
|
"grad_norm": null,
|
"learning_rate": 2e-05, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.014611872146118721, |
|
"grad_norm": null,
|
"learning_rate": 4e-05, |
|
"loss": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.021917808219178082, |
|
"grad_norm": null,
|
"learning_rate": 6e-05, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.029223744292237442, |
|
"grad_norm": null,
|
"learning_rate": 8e-05, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0365296803652968, |
|
"grad_norm": null,
|
"learning_rate": 0.0001, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0365296803652968, |
|
"eval_loss": null,
|
"eval_runtime": 86.8266, |
|
"eval_samples_per_second": 121.737, |
|
"eval_steps_per_second": 7.613, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.043835616438356165, |
|
"grad_norm": null,
|
"learning_rate": 9.97533300444006e-05, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05114155251141553, |
|
"grad_norm": null,
|
"learning_rate": 9.950666008880118e-05, |
|
"loss": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.058447488584474884, |
|
"grad_norm": null,
|
"learning_rate": 9.925999013320178e-05, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06575342465753424, |
|
"grad_norm": null,
|
"learning_rate": 9.901332017760238e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0730593607305936, |
|
"grad_norm": null,
|
"learning_rate": 9.876665022200296e-05, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0730593607305936, |
|
"eval_loss": null,
|
"eval_runtime": 89.5193, |
|
"eval_samples_per_second": 118.075, |
|
"eval_steps_per_second": 7.384, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08036529680365297, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.851998026640355e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08767123287671233, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.827331031080415e-05, |
|
"loss": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09497716894977169, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.802664035520473e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.10228310502283106, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.777997039960533e-05, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1095890410958904, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.753330044400593e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1095890410958904, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.6606, |
|
"eval_samples_per_second": 117.889, |
|
"eval_steps_per_second": 7.372, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11689497716894977, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.728663048840652e-05, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12420091324200913, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.70399605328071e-05, |
|
"loss": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.13150684931506848, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.67932905772077e-05, |
|
"loss": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13881278538812786, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.65466206216083e-05, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1461187214611872, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.629995066600888e-05, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1461187214611872, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4596, |
|
"eval_samples_per_second": 120.856, |
|
"eval_steps_per_second": 7.558, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15342465753424658, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.605328071040948e-05, |
|
"loss": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.16073059360730593, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.580661075481007e-05, |
|
"loss": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1680365296803653, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.555994079921066e-05, |
|
"loss": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17534246575342466, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.531327084361125e-05, |
|
"loss": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.182648401826484, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.506660088801185e-05, |
|
"loss": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.182648401826484, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.3071, |
|
"eval_samples_per_second": 121.067, |
|
"eval_steps_per_second": 7.571, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18995433789954339, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.481993093241244e-05, |
|
"loss": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19726027397260273, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.457326097681303e-05, |
|
"loss": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2045662100456621, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.432659102121362e-05, |
|
"loss": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.21187214611872146, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.40799210656142e-05, |
|
"loss": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2191780821917808, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.38332511100148e-05, |
|
"loss": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2191780821917808, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4316, |
|
"eval_samples_per_second": 120.895, |
|
"eval_steps_per_second": 7.56, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2264840182648402, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.35865811544154e-05, |
|
"loss": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23378995433789954, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.3339911198816e-05, |
|
"loss": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2410958904109589, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.309324124321658e-05, |
|
"loss": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24840182648401826, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.284657128761717e-05, |
|
"loss": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2557077625570776, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.259990133201777e-05, |
|
"loss": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2557077625570776, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7649, |
|
"eval_samples_per_second": 117.752, |
|
"eval_steps_per_second": 7.364, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.26301369863013696, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.235323137641837e-05, |
|
"loss": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.27031963470319637, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.210656142081895e-05, |
|
"loss": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2776255707762557, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.185989146521954e-05, |
|
"loss": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.28493150684931506, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.161322150962013e-05, |
|
"loss": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2922374429223744, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.136655155402072e-05, |
|
"loss": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2922374429223744, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.6739, |
|
"eval_samples_per_second": 117.871, |
|
"eval_steps_per_second": 7.371, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29954337899543376, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.111988159842132e-05, |
|
"loss": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.30684931506849317, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.087321164282192e-05, |
|
"loss": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3141552511415525, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.06265416872225e-05, |
|
"loss": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.32146118721461187, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.03798717316231e-05, |
|
"loss": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3287671232876712, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.013320177602368e-05, |
|
"loss": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3287671232876712, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7201, |
|
"eval_samples_per_second": 120.497, |
|
"eval_steps_per_second": 7.535, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3360730593607306, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.988653182042427e-05, |
|
"loss": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.34337899543378997, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.963986186482487e-05, |
|
"loss": 0.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3506849315068493, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.939319190922547e-05, |
|
"loss": 0.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35799086757990867, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.914652195362605e-05, |
|
"loss": 0.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.365296803652968, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.889985199802664e-05, |
|
"loss": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.365296803652968, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4748, |
|
"eval_samples_per_second": 120.835, |
|
"eval_steps_per_second": 7.556, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3726027397260274, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.865318204242724e-05, |
|
"loss": 0.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.37990867579908677, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.840651208682784e-05, |
|
"loss": 0.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3872146118721461, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.815984213122842e-05, |
|
"loss": 0.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.39452054794520547, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.791317217562902e-05, |
|
"loss": 0.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.4018264840182648, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.76665022200296e-05, |
|
"loss": 0.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4018264840182648, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.3047, |
|
"eval_samples_per_second": 121.07, |
|
"eval_steps_per_second": 7.571, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.4091324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.74198322644302e-05, |
|
"loss": 0.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.41643835616438357, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.717316230883079e-05, |
|
"loss": 0.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.4237442922374429, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.692649235323139e-05, |
|
"loss": 0.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.43105022831050227, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.667982239763197e-05, |
|
"loss": 0.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4383561643835616, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.643315244203257e-05, |
|
"loss": 0.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4383561643835616, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4314, |
|
"eval_samples_per_second": 120.895, |
|
"eval_steps_per_second": 7.56, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.445662100456621, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.618648248643315e-05, |
|
"loss": 0.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.4529680365296804, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.593981253083376e-05, |
|
"loss": 0.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.4602739726027397, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.569314257523434e-05, |
|
"loss": 0.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.46757990867579907, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.544647261963494e-05, |
|
"loss": 0.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4748858447488584, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.519980266403552e-05, |
|
"loss": 0.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4748858447488584, |
|
"eval_loss": NaN, |
|
"eval_runtime": 88.9686, |
|
"eval_samples_per_second": 118.806, |
|
"eval_steps_per_second": 7.43, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4821917808219178, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.495313270843612e-05, |
|
"loss": 0.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4894977168949772, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.470646275283671e-05, |
|
"loss": 0.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.4968036529680365, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.445979279723731e-05, |
|
"loss": 0.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5041095890410959, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.421312284163789e-05, |
|
"loss": 0.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5114155251141552, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.396645288603849e-05, |
|
"loss": 0.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5114155251141552, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7675, |
|
"eval_samples_per_second": 117.749, |
|
"eval_steps_per_second": 7.363, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5187214611872146, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.371978293043907e-05, |
|
"loss": 0.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5260273972602739, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.347311297483968e-05, |
|
"loss": 0.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.322644301924026e-05, |
|
"loss": 0.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5406392694063927, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.297977306364086e-05, |
|
"loss": 0.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.547945205479452, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.273310310804144e-05, |
|
"loss": 0.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.547945205479452, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.6344, |
|
"eval_samples_per_second": 117.923, |
|
"eval_steps_per_second": 7.374, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5552511415525114, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.248643315244204e-05, |
|
"loss": 0.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5625570776255707, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.223976319684262e-05, |
|
"loss": 0.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5698630136986301, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.199309324124323e-05, |
|
"loss": 0.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5771689497716895, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.174642328564381e-05, |
|
"loss": 0.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5844748858447488, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.149975333004441e-05, |
|
"loss": 0.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5844748858447488, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5041, |
|
"eval_samples_per_second": 120.794, |
|
"eval_steps_per_second": 7.554, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5917808219178082, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.125308337444499e-05, |
|
"loss": 0.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5990867579908675, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.100641341884559e-05, |
|
"loss": 0.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6063926940639269, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.075974346324618e-05, |
|
"loss": 0.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6136986301369863, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.051307350764678e-05, |
|
"loss": 0.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6210045662100456, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.026640355204736e-05, |
|
"loss": 0.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6210045662100456, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6119, |
|
"eval_samples_per_second": 120.646, |
|
"eval_steps_per_second": 7.545, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.628310502283105, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.001973359644796e-05, |
|
"loss": 0.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6356164383561644, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.977306364084854e-05, |
|
"loss": 0.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6429223744292237, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.952639368524915e-05, |
|
"loss": 0.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6502283105022831, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.927972372964973e-05, |
|
"loss": 0.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6575342465753424, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.903305377405033e-05, |
|
"loss": 0.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6575342465753424, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.2395, |
|
"eval_samples_per_second": 121.161, |
|
"eval_steps_per_second": 7.577, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6648401826484018, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.878638381845091e-05, |
|
"loss": 0.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6721461187214612, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.853971386285151e-05, |
|
"loss": 0.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6794520547945205, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.829304390725209e-05, |
|
"loss": 0.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6867579908675799, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.80463739516527e-05, |
|
"loss": 0.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6940639269406392, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.779970399605328e-05, |
|
"loss": 0.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6940639269406392, |
|
"eval_loss": NaN, |
|
"eval_runtime": 88.6493, |
|
"eval_samples_per_second": 119.234, |
|
"eval_steps_per_second": 7.456, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7013698630136986, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.755303404045388e-05, |
|
"loss": 0.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.708675799086758, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.730636408485446e-05, |
|
"loss": 0.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7159817351598173, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.705969412925506e-05, |
|
"loss": 0.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7232876712328767, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.681302417365566e-05, |
|
"loss": 0.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.730593607305936, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.656635421805625e-05, |
|
"loss": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.730593607305936, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7851, |
|
"eval_samples_per_second": 117.726, |
|
"eval_steps_per_second": 7.362, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7378995433789954, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.631968426245683e-05, |
|
"loss": 0.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7452054794520548, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.607301430685743e-05, |
|
"loss": 0.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7525114155251141, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.582634435125801e-05, |
|
"loss": 0.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7598173515981735, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.557967439565862e-05, |
|
"loss": 0.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7671232876712328, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.53330044400592e-05, |
|
"loss": 0.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7671232876712328, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.5436, |
|
"eval_samples_per_second": 118.043, |
|
"eval_steps_per_second": 7.382, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7744292237442922, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.50863344844598e-05, |
|
"loss": 0.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7817351598173516, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.483966452886039e-05, |
|
"loss": 0.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7890410958904109, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.459299457326098e-05, |
|
"loss": 0.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7963470319634703, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.434632461766156e-05, |
|
"loss": 0.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8036529680365296, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.409965466206217e-05, |
|
"loss": 0.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8036529680365296, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5031, |
|
"eval_samples_per_second": 120.796, |
|
"eval_steps_per_second": 7.554, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.810958904109589, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.385298470646276e-05, |
|
"loss": 0.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.8182648401826484, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.360631475086335e-05, |
|
"loss": 0.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.8255707762557077, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.335964479526394e-05, |
|
"loss": 0.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.8328767123287671, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.311297483966453e-05, |
|
"loss": 0.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.8401826484018264, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.286630488406513e-05, |
|
"loss": 0.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8401826484018264, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5455, |
|
"eval_samples_per_second": 120.737, |
|
"eval_steps_per_second": 7.55, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8474885844748858, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.261963492846572e-05, |
|
"loss": 0.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8547945205479452, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.23729649728663e-05, |
|
"loss": 0.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8621004566210045, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.21262950172669e-05, |
|
"loss": 0.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.869406392694064, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.187962506166749e-05, |
|
"loss": 0.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8767123287671232, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.16329551060681e-05, |
|
"loss": 0.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8767123287671232, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4965, |
|
"eval_samples_per_second": 120.805, |
|
"eval_steps_per_second": 7.555, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8840182648401826, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.138628515046868e-05, |
|
"loss": 0.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.891324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.113961519486927e-05, |
|
"loss": 0.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8986301369863013, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.089294523926986e-05, |
|
"loss": 0.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.9059360730593607, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.064627528367045e-05, |
|
"loss": 0.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.91324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.039960532807104e-05, |
|
"loss": 0.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.91324200913242, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5268, |
|
"eval_samples_per_second": 120.763, |
|
"eval_steps_per_second": 7.552, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.9205479452054794, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.015293537247165e-05, |
|
"loss": 0.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9278538812785389, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.990626541687223e-05, |
|
"loss": 0.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.9351598173515981, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.965959546127282e-05, |
|
"loss": 0.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.9424657534246575, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.941292550567341e-05, |
|
"loss": 0.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.9497716894977168, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.9166255550074e-05, |
|
"loss": 0.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9497716894977168, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7784, |
|
"eval_samples_per_second": 117.734, |
|
"eval_steps_per_second": 7.363, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9570776255707762, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.89195855944746e-05, |
|
"loss": 0.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.9643835616438357, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.86729156388752e-05, |
|
"loss": 0.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.971689497716895, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.842624568327578e-05, |
|
"loss": 0.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9789954337899544, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.817957572767637e-05, |
|
"loss": 0.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9863013698630136, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.793290577207696e-05, |
|
"loss": 0.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9863013698630136, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7601, |
|
"eval_samples_per_second": 117.758, |
|
"eval_steps_per_second": 7.364, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.993607305936073, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.768623581647757e-05, |
|
"loss": 0.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.0007305936073059, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.743956586087815e-05, |
|
"loss": 0.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.0080365296803653, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.719289590527875e-05, |
|
"loss": 0.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.0153424657534247, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.694622594967933e-05, |
|
"loss": 0.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.022648401826484, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.669955599407992e-05, |
|
"loss": 0.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.022648401826484, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4612, |
|
"eval_samples_per_second": 120.854, |
|
"eval_steps_per_second": 7.558, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0299543378995433, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.645288603848051e-05, |
|
"loss": 0.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.0372602739726027, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.620621608288112e-05, |
|
"loss": 0.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.044566210045662, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.59595461272817e-05, |
|
"loss": 0.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.0518721461187215, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.57128761716823e-05, |
|
"loss": 0.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.059178082191781, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.546620621608288e-05, |
|
"loss": 0.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.059178082191781, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6838, |
|
"eval_samples_per_second": 120.547, |
|
"eval_steps_per_second": 7.538, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.0664840182648403, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.521953626048347e-05, |
|
"loss": 0.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.0737899543378995, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.497286630488407e-05, |
|
"loss": 0.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.0810958904109589, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.472619634928467e-05, |
|
"loss": 0.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.0884018264840183, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.447952639368525e-05, |
|
"loss": 0.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.0957077625570777, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.423285643808585e-05, |
|
"loss": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0957077625570777, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6114, |
|
"eval_samples_per_second": 120.646, |
|
"eval_steps_per_second": 7.545, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.103013698630137, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.398618648248643e-05, |
|
"loss": 0.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.1103196347031963, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.373951652688704e-05, |
|
"loss": 0.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.1176255707762557, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.349284657128762e-05, |
|
"loss": 0.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.124931506849315, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.324617661568822e-05, |
|
"loss": 0.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.1322374429223745, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.29995066600888e-05, |
|
"loss": 0.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.1322374429223745, |
|
"eval_loss": NaN, |
|
"eval_runtime": 88.5181, |
|
"eval_samples_per_second": 119.411, |
|
"eval_steps_per_second": 7.467, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.139543378995434, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.27528367044894e-05, |
|
"loss": 0.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.146849315068493, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.250616674888998e-05, |
|
"loss": 0.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.1541552511415525, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.225949679329059e-05, |
|
"loss": 0.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.161461187214612, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.201282683769117e-05, |
|
"loss": 0.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.1687671232876713, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.176615688209177e-05, |
|
"loss": 0.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1687671232876713, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.8677, |
|
"eval_samples_per_second": 117.617, |
|
"eval_steps_per_second": 7.355, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1760730593607307, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.151948692649235e-05, |
|
"loss": 0.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.1833789954337899, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.127281697089295e-05, |
|
"loss": 0.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.1906849315068493, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.102614701529354e-05, |
|
"loss": 0.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.1979908675799087, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.077947705969413e-05, |
|
"loss": 0.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.205296803652968, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.053280710409472e-05, |
|
"loss": 0.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.205296803652968, |
|
"eval_loss": NaN, |
|
"eval_runtime": 90.1315, |
|
"eval_samples_per_second": 117.273, |
|
"eval_steps_per_second": 7.334, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.2126027397260275, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.028613714849531e-05, |
|
"loss": 0.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.2199086757990867, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.003946719289591e-05, |
|
"loss": 0.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.227214611872146, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9792797237296503e-05, |
|
"loss": 0.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.2345205479452055, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.954612728169709e-05, |
|
"loss": 0.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.241826484018265, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.929945732609768e-05, |
|
"loss": 0.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.241826484018265, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7312, |
|
"eval_samples_per_second": 120.482, |
|
"eval_steps_per_second": 7.534, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2491324200913243, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.905278737049827e-05, |
|
"loss": 0.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.2564383561643835, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.880611741489887e-05, |
|
"loss": 0.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.263744292237443, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.855944745929946e-05, |
|
"loss": 0.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.2710502283105023, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.8312777503700054e-05, |
|
"loss": 0.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.2783561643835617, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.806610754810064e-05, |
|
"loss": 0.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.2783561643835617, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6857, |
|
"eval_samples_per_second": 120.544, |
|
"eval_steps_per_second": 7.538, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.285662100456621, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.781943759250123e-05, |
|
"loss": 0.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.2929680365296803, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.757276763690183e-05, |
|
"loss": 0.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.3002739726027397, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.732609768130242e-05, |
|
"loss": 0.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.307579908675799, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.7079427725703014e-05, |
|
"loss": 0.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.3148858447488585, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6832757770103604e-05, |
|
"loss": 0.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.3148858447488585, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7498, |
|
"eval_samples_per_second": 120.456, |
|
"eval_steps_per_second": 7.533, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.322191780821918, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.658608781450419e-05, |
|
"loss": 0.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.329497716894977, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.633941785890479e-05, |
|
"loss": 0.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.3368036529680365, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.609274790330538e-05, |
|
"loss": 0.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.344109589041096, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5846077947705975e-05, |
|
"loss": 0.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.3514155251141553, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5599407992106565e-05, |
|
"loss": 0.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3514155251141553, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7348, |
|
"eval_samples_per_second": 120.477, |
|
"eval_steps_per_second": 7.534, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3587214611872147, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5352738036507154e-05, |
|
"loss": 0.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.366027397260274, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5106068080907743e-05, |
|
"loss": 0.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.3733333333333333, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.485939812530834e-05, |
|
"loss": 0.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.3806392694063927, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.461272816970893e-05, |
|
"loss": 0.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.387945205479452, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.4366058214109525e-05, |
|
"loss": 0.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.387945205479452, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.626, |
|
"eval_samples_per_second": 120.626, |
|
"eval_steps_per_second": 7.543, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.3952511415525115, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.4119388258510115e-05, |
|
"loss": 0.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.4025570776255707, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.3872718302910704e-05, |
|
"loss": 0.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.40986301369863, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.36260483473113e-05, |
|
"loss": 0.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.4171689497716895, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.337937839171189e-05, |
|
"loss": 0.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.424474885844749, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.3132708436112486e-05, |
|
"loss": 0.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.424474885844749, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.8105, |
|
"eval_samples_per_second": 117.692, |
|
"eval_steps_per_second": 7.36, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.4317808219178083, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.2886038480513075e-05, |
|
"loss": 0.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.4390867579908675, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.2639368524913665e-05, |
|
"loss": 0.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.446392694063927, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.239269856931426e-05, |
|
"loss": 0.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.4536986301369863, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.214602861371485e-05, |
|
"loss": 0.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.4610045662100457, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.189935865811545e-05, |
|
"loss": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4610045662100457, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7513, |
|
"eval_samples_per_second": 117.77, |
|
"eval_steps_per_second": 7.365, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4683105022831051, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.1652688702516036e-05, |
|
"loss": 0.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.4756164383561643, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.1406018746916626e-05, |
|
"loss": 0.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.4829223744292237, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.115934879131722e-05, |
|
"loss": 0.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.490228310502283, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.091267883571781e-05, |
|
"loss": 0.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.4975342465753425, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.06660088801184e-05, |
|
"loss": 0.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.4975342465753425, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5277, |
|
"eval_samples_per_second": 120.762, |
|
"eval_steps_per_second": 7.552, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.504840182648402, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.0419338924519e-05, |
|
"loss": 0.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.512146118721461, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.0172668968919586e-05, |
|
"loss": 0.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.5194520547945205, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.992599901332018e-05, |
|
"loss": 0.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.52675799086758, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.967932905772077e-05, |
|
"loss": 0.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.5340639269406393, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.943265910212136e-05, |
|
"loss": 0.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.5340639269406393, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4849, |
|
"eval_samples_per_second": 120.821, |
|
"eval_steps_per_second": 7.556, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.5413698630136987, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.918598914652196e-05, |
|
"loss": 0.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.548675799086758, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.893931919092255e-05, |
|
"loss": 0.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.5559817351598173, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.869264923532314e-05, |
|
"loss": 0.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.5632876712328767, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.844597927972373e-05, |
|
"loss": 0.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.5705936073059361, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.819930932412432e-05, |
|
"loss": 0.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.5705936073059361, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7089, |
|
"eval_samples_per_second": 120.512, |
|
"eval_steps_per_second": 7.536, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.5778995433789955, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.795263936852492e-05, |
|
"loss": 0.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.5852054794520547, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.770596941292551e-05, |
|
"loss": 0.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.592511415525114, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.7459299457326104e-05, |
|
"loss": 0.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.5998173515981735, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.7212629501726694e-05, |
|
"loss": 0.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.607123287671233, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.696595954612728e-05, |
|
"loss": 0.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.607123287671233, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4106, |
|
"eval_samples_per_second": 120.924, |
|
"eval_steps_per_second": 7.562, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.6144292237442923, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.671928959052788e-05, |
|
"loss": 0.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.6217351598173515, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.647261963492847e-05, |
|
"loss": 0.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.629041095890411, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.622594967932906e-05, |
|
"loss": 0.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.6363470319634703, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.5979279723729654e-05, |
|
"loss": 0.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.6436529680365297, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.5732609768130244e-05, |
|
"loss": 0.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.6436529680365297, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6352, |
|
"eval_samples_per_second": 120.614, |
|
"eval_steps_per_second": 7.543, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.6509589041095891, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.548593981253084e-05, |
|
"loss": 0.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.6582648401826483, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.523926985693143e-05, |
|
"loss": 0.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.6655707762557077, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.499259990133202e-05, |
|
"loss": 0.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.6728767123287671, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4745929945732615e-05, |
|
"loss": 0.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.6801826484018265, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4499259990133204e-05, |
|
"loss": 0.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.6801826484018265, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.8795, |
|
"eval_samples_per_second": 117.602, |
|
"eval_steps_per_second": 7.354, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.687488584474886, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.42525900345338e-05, |
|
"loss": 0.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.694794520547945, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.400592007893439e-05, |
|
"loss": 0.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.7021004566210047, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.375925012333498e-05, |
|
"loss": 0.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.709406392694064, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.3512580167735576e-05, |
|
"loss": 0.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.7167123287671233, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.3265910212136165e-05, |
|
"loss": 0.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.7167123287671233, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.9633, |
|
"eval_samples_per_second": 117.492, |
|
"eval_steps_per_second": 7.347, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.7240182648401827, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.3019240256536755e-05, |
|
"loss": 0.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.731324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.277257030093735e-05, |
|
"loss": 0.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.7386301369863015, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.252590034533794e-05, |
|
"loss": 0.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.7459360730593607, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2279230389738537e-05, |
|
"loss": 0.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.7532420091324201, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2032560434139126e-05, |
|
"loss": 0.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.7532420091324201, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7987, |
|
"eval_samples_per_second": 117.708, |
|
"eval_steps_per_second": 7.361, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.7605479452054795, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.1785890478539715e-05, |
|
"loss": 0.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.7678538812785387, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.153922052294031e-05, |
|
"loss": 0.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.7751598173515983, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.12925505673409e-05, |
|
"loss": 0.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.7824657534246575, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.10458806117415e-05, |
|
"loss": 0.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.789771689497717, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.079921065614209e-05, |
|
"loss": 0.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.789771689497717, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6424, |
|
"eval_samples_per_second": 120.604, |
|
"eval_steps_per_second": 7.542, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.7970776255707763, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.0552540700542676e-05, |
|
"loss": 0.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.8043835616438355, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.030587074494327e-05, |
|
"loss": 0.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.8116894977168951, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.005920078934386e-05, |
|
"loss": 0.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.8189954337899543, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.981253083374445e-05, |
|
"loss": 0.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.8263013698630137, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.956586087814505e-05, |
|
"loss": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.8263013698630137, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5994, |
|
"eval_samples_per_second": 120.663, |
|
"eval_steps_per_second": 7.546, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.8336073059360731, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.931919092254564e-05, |
|
"loss": 0.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.8409132420091323, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.907252096694623e-05, |
|
"loss": 0.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.848219178082192, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.882585101134682e-05, |
|
"loss": 0.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.8555251141552511, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.857918105574741e-05, |
|
"loss": 0.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.8628310502283105, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.833251110014801e-05, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.8628310502283105, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4443, |
|
"eval_samples_per_second": 120.877, |
|
"eval_steps_per_second": 7.559, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.87013698630137, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.80858411445486e-05, |
|
"loss": 0.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.8774429223744291, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.783917118894919e-05, |
|
"loss": 0.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.8847488584474887, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.759250123334978e-05, |
|
"loss": 0.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.892054794520548, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.734583127775037e-05, |
|
"loss": 0.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.8993607305936073, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.709916132215097e-05, |
|
"loss": 0.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.8993607305936073, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.7012, |
|
"eval_samples_per_second": 120.523, |
|
"eval_steps_per_second": 7.537, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.9066666666666667, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.685249136655156e-05, |
|
"loss": 0.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.913972602739726, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.660582141095215e-05, |
|
"loss": 0.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.9212785388127855, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6359151455352744e-05, |
|
"loss": 0.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.9285844748858447, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6112481499753333e-05, |
|
"loss": 0.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.9358904109589041, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.586581154415392e-05, |
|
"loss": 0.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.9358904109589041, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5232, |
|
"eval_samples_per_second": 120.768, |
|
"eval_steps_per_second": 7.552, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.9431963470319635, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.561914158855452e-05, |
|
"loss": 0.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.9505022831050227, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.537247163295511e-05, |
|
"loss": 0.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.9578082191780823, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5125801677355705e-05, |
|
"loss": 0.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.9651141552511415, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.4879131721756294e-05, |
|
"loss": 0.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.972420091324201, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.4632461766156884e-05, |
|
"loss": 0.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.972420091324201, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.6204, |
|
"eval_samples_per_second": 120.634, |
|
"eval_steps_per_second": 7.544, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.9797260273972603, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.438579181055748e-05, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.9870319634703195, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.413912185495807e-05, |
|
"loss": 0.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.9943378995433791, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.389245189935866e-05, |
|
"loss": 0.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.0014611872146117, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3645781943759255e-05, |
|
"loss": 0.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.0087671232876714, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3399111988159844e-05, |
|
"loss": 0.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.0087671232876714, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.786, |
|
"eval_samples_per_second": 117.724, |
|
"eval_steps_per_second": 7.362, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.0160730593607306, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.315244203256044e-05, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.0233789954337897, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.290577207696103e-05, |
|
"loss": 0.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.0306849315068494, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.265910212136162e-05, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.0379908675799085, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2412432165762216e-05, |
|
"loss": 0.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.045296803652968, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.2165762210162805e-05, |
|
"loss": 0.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.045296803652968, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.6927, |
|
"eval_samples_per_second": 117.847, |
|
"eval_steps_per_second": 7.37, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.0526027397260274, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1919092254563395e-05, |
|
"loss": 0.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.0599086757990865, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.167242229896399e-05, |
|
"loss": 0.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.067214611872146, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.142575234336458e-05, |
|
"loss": 0.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.0745205479452054, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1179082387765176e-05, |
|
"loss": 0.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.081826484018265, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0932412432165766e-05, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.081826484018265, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.0102, |
|
"eval_samples_per_second": 118.75, |
|
"eval_steps_per_second": 7.426, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.089132420091324, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0685742476566355e-05, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.0964383561643833, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0439072520966948e-05, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.103744292237443, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0192402565367538e-05, |
|
"loss": 0.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.111050228310502, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.994573260976813e-05, |
|
"loss": 0.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.118356164383562, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9699062654168723e-05, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.118356164383562, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4632, |
|
"eval_samples_per_second": 120.851, |
|
"eval_steps_per_second": 7.557, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.125662100456621, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9452392698569313e-05, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.1329680365296806, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.920572274296991e-05, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.1402739726027398, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.89590527873705e-05, |
|
"loss": 0.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.147579908675799, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.871238283177109e-05, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.1548858447488586, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8465712876171684e-05, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.1548858447488586, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.3497, |
|
"eval_samples_per_second": 121.008, |
|
"eval_steps_per_second": 7.567, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.1621917808219178, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8219042920572273e-05, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.169497716894977, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7972372964972866e-05, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.1768036529680366, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.772570300937346e-05, |
|
"loss": 0.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.1841095890410958, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7479033053774052e-05, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.1914155251141554, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7232363098174645e-05, |
|
"loss": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1914155251141554, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4657, |
|
"eval_samples_per_second": 120.847, |
|
"eval_steps_per_second": 7.557, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.1987214611872146, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6985693142575234e-05, |
|
"loss": 0.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.206027397260274, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6739023186975827e-05, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.2133333333333334, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.649235323137642e-05, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.2206392694063926, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6245683275777013e-05, |
|
"loss": 0.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.227945205479452, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5999013320177602e-05, |
|
"loss": 0.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.227945205479452, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.5429, |
|
"eval_samples_per_second": 120.741, |
|
"eval_steps_per_second": 7.551, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.2352511415525114, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5752343364578195e-05, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.2425570776255705, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5505673408978788e-05, |
|
"loss": 0.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.24986301369863, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.525900345337938e-05, |
|
"loss": 0.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.2571689497716894, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.501233349777997e-05, |
|
"loss": 0.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.264474885844749, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4765663542180563e-05, |
|
"loss": 0.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.264474885844749, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.4603, |
|
"eval_samples_per_second": 120.855, |
|
"eval_steps_per_second": 7.558, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.271780821917808, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4518993586581156e-05, |
|
"loss": 0.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.279086757990868, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.427232363098175e-05, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.286392694063927, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.402565367538234e-05, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.293698630136986, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.377898371978293e-05, |
|
"loss": 0.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.301004566210046, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3532313764183524e-05, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.301004566210046, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.2945, |
|
"eval_samples_per_second": 118.372, |
|
"eval_steps_per_second": 7.402, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.308310502283105, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3285643808584116e-05, |
|
"loss": 0.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.315616438356164, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.303897385298471e-05, |
|
"loss": 0.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.322922374429224, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.27923038973853e-05, |
|
"loss": 0.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.330228310502283, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.254563394178589e-05, |
|
"loss": 0.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.3375342465753426, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2298963986186484e-05, |
|
"loss": 0.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.3375342465753426, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.6497, |
|
"eval_samples_per_second": 117.903, |
|
"eval_steps_per_second": 7.373, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.3448401826484018, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2052294030587077e-05, |
|
"loss": 0.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.3521461187214614, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.180562407498767e-05, |
|
"loss": 0.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.3594520547945206, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.155895411938826e-05, |
|
"loss": 0.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.3667579908675798, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1312284163788852e-05, |
|
"loss": 0.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.3740639269406394, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1065614208189445e-05, |
|
"loss": 0.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.3740639269406394, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.7652, |
|
"eval_samples_per_second": 117.752, |
|
"eval_steps_per_second": 7.364, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.3813698630136986, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0818944252590038e-05, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.3886757990867578, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0572274296990627e-05, |
|
"loss": 0.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.3959817351598174, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.032560434139122e-05, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.4032876712328766, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0078934385791813e-05, |
|
"loss": 0.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.410593607305936, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9832264430192406e-05, |
|
"loss": 0.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.410593607305936, |
|
"eval_loss": NaN, |
|
"eval_runtime": 88.5138, |
|
"eval_samples_per_second": 119.416, |
|
"eval_steps_per_second": 7.468, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.4178995433789954, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9585594474592995e-05, |
|
"loss": 0.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.425205479452055, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9338924518993588e-05, |
|
"loss": 0.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.432511415525114, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.909225456339418e-05, |
|
"loss": 0.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.4398173515981734, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8845584607794774e-05, |
|
"loss": 0.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.447123287671233, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8598914652195363e-05, |
|
"loss": 0.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.447123287671233, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.334, |
|
"eval_samples_per_second": 121.03, |
|
"eval_steps_per_second": 7.569, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.454429223744292, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8352244696595956e-05, |
|
"loss": 0.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.4617351598173514, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.810557474099655e-05, |
|
"loss": 0.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.469041095890411, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.785890478539714e-05, |
|
"loss": 0.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.47634703196347, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.761223482979773e-05, |
|
"loss": 0.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.48365296803653, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7365564874198324e-05, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.48365296803653, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.2333, |
|
"eval_samples_per_second": 121.169, |
|
"eval_steps_per_second": 7.577, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.490958904109589, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7118894918598917e-05, |
|
"loss": 0.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.4982648401826486, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.687222496299951e-05, |
|
"loss": 0.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.505570776255708, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.66255550074001e-05, |
|
"loss": 0.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.512876712328767, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6378885051800692e-05, |
|
"loss": 0.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.5201826484018266, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.6132215096201285e-05, |
|
"loss": 0.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.5201826484018266, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.3114, |
|
"eval_samples_per_second": 121.061, |
|
"eval_steps_per_second": 7.571, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.527488584474886, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5885545140601878e-05, |
|
"loss": 0.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.534794520547945, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5638875185002467e-05, |
|
"loss": 0.0, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.5421004566210046, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.539220522940306e-05, |
|
"loss": 0.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.5494063926940638, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5145535273803651e-05, |
|
"loss": 0.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.5567123287671234, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4898865318204244e-05, |
|
"loss": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.5567123287671234, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.2842, |
|
"eval_samples_per_second": 121.099, |
|
"eval_steps_per_second": 7.573, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.5640182648401826, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4652195362604835e-05, |
|
"loss": 0.0, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.571324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4405525407005426e-05, |
|
"loss": 0.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.5786301369863014, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4158855451406019e-05, |
|
"loss": 0.0, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.5859360730593606, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3912185495806612e-05, |
|
"loss": 0.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.59324200913242, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3665515540207203e-05, |
|
"loss": 0.0, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.59324200913242, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.1762, |
|
"eval_samples_per_second": 121.249, |
|
"eval_steps_per_second": 7.582, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.6005479452054794, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3418845584607796e-05, |
|
"loss": 0.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.6078538812785386, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3172175629008387e-05, |
|
"loss": 0.0, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.615159817351598, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.292550567340898e-05, |
|
"loss": 0.0, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.6224657534246574, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.267883571780957e-05, |
|
"loss": 0.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.629771689497717, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2432165762210164e-05, |
|
"loss": 0.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.629771689497717, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.0981, |
|
"eval_samples_per_second": 118.633, |
|
"eval_steps_per_second": 7.419, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.637077625570776, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2185495806610755e-05, |
|
"loss": 0.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.644383561643836, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1938825851011348e-05, |
|
"loss": 0.0, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.651689497716895, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.169215589541194e-05, |
|
"loss": 0.0, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.658995433789954, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1445485939812531e-05, |
|
"loss": 0.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.666301369863014, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1198815984213124e-05, |
|
"loss": 0.0, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.666301369863014, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.3812, |
|
"eval_samples_per_second": 118.258, |
|
"eval_steps_per_second": 7.395, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.673607305936073, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0952146028613715e-05, |
|
"loss": 0.0, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.680913242009132, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0705476073014308e-05, |
|
"loss": 0.0, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.688219178082192, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.04588061174149e-05, |
|
"loss": 0.0, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.695525114155251, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0212136161815492e-05, |
|
"loss": 0.0, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.7028310502283106, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.965466206216083e-06, |
|
"loss": 0.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.7028310502283106, |
|
"eval_loss": NaN, |
|
"eval_runtime": 89.3655, |
|
"eval_samples_per_second": 118.278, |
|
"eval_steps_per_second": 7.397, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.71013698630137, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.718796250616676e-06, |
|
"loss": 0.0, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.7174429223744294, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.472126295017267e-06, |
|
"loss": 0.0, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.7247488584474886, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.22545633941786e-06, |
|
"loss": 0.0, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.732054794520548, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.978786383818451e-06, |
|
"loss": 0.0, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.7393607305936074, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.732116428219044e-06, |
|
"loss": 0.0, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.7393607305936074, |
|
"eval_loss": NaN, |
|
"eval_runtime": 86.8786, |
|
"eval_samples_per_second": 121.664, |
|
"eval_steps_per_second": 7.608, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.7466666666666666, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.485446472619635e-06, |
|
"loss": 0.0, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.7539726027397258, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.238776517020228e-06, |
|
"loss": 0.0, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.7612785388127854, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.99210656142082e-06, |
|
"loss": 0.0, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.768584474885845, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.745436605821412e-06, |
|
"loss": 0.0, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.775890410958904, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.498766650222003e-06, |
|
"loss": 0.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.775890410958904, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.0328, |
|
"eval_samples_per_second": 121.449, |
|
"eval_steps_per_second": 7.595, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.7831963470319634, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.252096694622595e-06, |
|
"loss": 0.0, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.790502283105023, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.005426739023187e-06, |
|
"loss": 0.0, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.797808219178082, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.758756783423779e-06, |
|
"loss": 0.0, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.8051141552511414, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.512086827824371e-06, |
|
"loss": 0.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.812420091324201, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.265416872224963e-06, |
|
"loss": 0.0, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.812420091324201, |
|
"eval_loss": NaN, |
|
"eval_runtime": 86.9228, |
|
"eval_samples_per_second": 121.602, |
|
"eval_steps_per_second": 7.604, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.81972602739726, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.018746916625555e-06, |
|
"loss": 0.0, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.8270319634703194, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.772076961026148e-06, |
|
"loss": 0.0, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.834337899543379, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.52540700542674e-06, |
|
"loss": 0.0, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.8416438356164386, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.278737049827332e-06, |
|
"loss": 0.0, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.848949771689498, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.032067094227924e-06, |
|
"loss": 0.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.848949771689498, |
|
"eval_loss": NaN, |
|
"eval_runtime": 86.8846, |
|
"eval_samples_per_second": 121.656, |
|
"eval_steps_per_second": 7.608, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.856255707762557, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.785397138628516e-06, |
|
"loss": 0.0, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.8635616438356166, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.538727183029108e-06, |
|
"loss": 0.0, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.870867579908676, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2920572274297e-06, |
|
"loss": 0.0, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.878173515981735, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.045387271830292e-06, |
|
"loss": 0.0, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.8854794520547946, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7987173162308833e-06, |
|
"loss": 0.0, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.8854794520547946, |
|
"eval_loss": NaN, |
|
"eval_runtime": 87.2057, |
|
"eval_samples_per_second": 121.208, |
|
"eval_steps_per_second": 7.58, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.892785388127854, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5520473606314752e-06, |
|
"loss": 0.0, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.900091324200913, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.3053774050320672e-06, |
|
"loss": 0.0, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.9073972602739726, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.058707449432659e-06, |
|
"loss": 0.0, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.9147031963470322, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.812037493833251e-06, |
|
"loss": 0.0, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.9220091324200914, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5653675382338436e-06, |
|
"loss": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.9220091324200914, |
|
"eval_loss": NaN, |
|
"eval_runtime": 78.7041, |
|
"eval_samples_per_second": 134.301, |
|
"eval_steps_per_second": 8.399, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.9293150684931506, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3186975826344356e-06, |
|
"loss": 0.0, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.9366210045662102, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0720276270350275e-06, |
|
"loss": 0.0, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.9439269406392694, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8253576714356193e-06, |
|
"loss": 0.0, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.9512328767123286, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5786877158362113e-06, |
|
"loss": 0.0, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.958538812785388, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3320177602368033e-06, |
|
"loss": 0.0, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.958538812785388, |
|
"eval_loss": NaN, |
|
"eval_runtime": 75.4374, |
|
"eval_samples_per_second": 140.116, |
|
"eval_steps_per_second": 8.762, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.9658447488584474, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0853478046373952e-06, |
|
"loss": 0.0, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.9731506849315066, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.386778490379872e-07, |
|
"loss": 0.0, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.980456621004566, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.920078934385792e-07, |
|
"loss": 0.0, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.987762557077626, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.4533793783917124e-07, |
|
"loss": 0.0, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.995068493150685, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.86679822397632e-08, |
|
"loss": 0.0, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.995068493150685, |
|
"eval_loss": NaN, |
|
"eval_runtime": 75.3535, |
|
"eval_samples_per_second": 140.272, |
|
"eval_steps_per_second": 8.772, |
|
"step": 4100 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4104, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.893527772009083e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|