{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 30.0,
  "eval_steps": 500,
  "global_step": 387090,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.038750678136867396,
      "grad_norm": 7.044970512390137,
      "learning_rate": 1.9922498643726265e-05,
      "loss": 5.0771,
      "step": 500
    },
    {
      "epoch": 0.07750135627373479,
      "grad_norm": 6.129168510437012,
      "learning_rate": 1.9844997287452532e-05,
      "loss": 4.5935,
      "step": 1000
    },
    {
      "epoch": 0.11625203441060218,
      "grad_norm": 6.968145847320557,
      "learning_rate": 1.9767495931178796e-05,
      "loss": 4.3872,
      "step": 1500
    },
    {
      "epoch": 0.15500271254746958,
      "grad_norm": 6.270899295806885,
      "learning_rate": 1.9689994574905063e-05,
      "loss": 4.2507,
      "step": 2000
    },
    {
      "epoch": 0.19375339068433697,
      "grad_norm": 6.4452104568481445,
      "learning_rate": 1.9612493218631326e-05,
      "loss": 4.135,
      "step": 2500
    },
    {
      "epoch": 0.23250406882120436,
      "grad_norm": 6.564201354980469,
      "learning_rate": 1.9534991862357593e-05,
      "loss": 4.0704,
      "step": 3000
    },
    {
      "epoch": 0.2712547469580718,
      "grad_norm": 6.716593265533447,
      "learning_rate": 1.9457490506083857e-05,
      "loss": 3.9814,
      "step": 3500
    },
    {
      "epoch": 0.31000542509493917,
      "grad_norm": 6.578341484069824,
      "learning_rate": 1.9379989149810124e-05,
      "loss": 3.9078,
      "step": 4000
    },
    {
      "epoch": 0.34875610323180656,
      "grad_norm": 6.6163482666015625,
      "learning_rate": 1.9302487793536387e-05,
      "loss": 3.858,
      "step": 4500
    },
    {
      "epoch": 0.38750678136867395,
      "grad_norm": 5.90346097946167,
      "learning_rate": 1.9224986437262654e-05,
      "loss": 3.8036,
      "step": 5000
    },
    {
      "epoch": 0.42625745950554134,
      "grad_norm": 5.8487467765808105,
      "learning_rate": 1.9147485080988918e-05,
      "loss": 3.7518,
      "step": 5500
    },
    {
      "epoch": 0.4650081376424087,
      "grad_norm": 6.573217391967773,
      "learning_rate": 1.9069983724715185e-05,
      "loss": 3.7164,
      "step": 6000
    },
    {
      "epoch": 0.5037588157792762,
      "grad_norm": 5.970231056213379,
      "learning_rate": 1.899248236844145e-05,
      "loss": 3.673,
      "step": 6500
    },
    {
      "epoch": 0.5425094939161436,
      "grad_norm": 6.683649063110352,
      "learning_rate": 1.8914981012167715e-05,
      "loss": 3.6405,
      "step": 7000
    },
    {
      "epoch": 0.581260172053011,
      "grad_norm": 6.538488864898682,
      "learning_rate": 1.883747965589398e-05,
      "loss": 3.6133,
      "step": 7500
    },
    {
      "epoch": 0.6200108501898783,
      "grad_norm": 6.528162479400635,
      "learning_rate": 1.8759978299620246e-05,
      "loss": 3.5754,
      "step": 8000
    },
    {
      "epoch": 0.6587615283267457,
      "grad_norm": 6.43408203125,
      "learning_rate": 1.868247694334651e-05,
      "loss": 3.5548,
      "step": 8500
    },
    {
      "epoch": 0.6975122064636131,
      "grad_norm": 6.131889820098877,
      "learning_rate": 1.8604975587072776e-05,
      "loss": 3.5228,
      "step": 9000
    },
    {
      "epoch": 0.7362628846004805,
      "grad_norm": 6.320891857147217,
      "learning_rate": 1.852747423079904e-05,
      "loss": 3.4999,
      "step": 9500
    },
    {
      "epoch": 0.7750135627373479,
      "grad_norm": 6.105418682098389,
      "learning_rate": 1.8449972874525307e-05,
      "loss": 3.5071,
      "step": 10000
    },
    {
      "epoch": 0.8137642408742153,
      "grad_norm": 6.774458885192871,
      "learning_rate": 1.837247151825157e-05,
      "loss": 3.4616,
      "step": 10500
    },
    {
      "epoch": 0.8525149190110827,
      "grad_norm": 6.263659477233887,
      "learning_rate": 1.8294970161977838e-05,
      "loss": 3.4499,
      "step": 11000
    },
    {
      "epoch": 0.8912655971479501,
      "grad_norm": 6.58251428604126,
      "learning_rate": 1.82174688057041e-05,
      "loss": 3.4157,
      "step": 11500
    },
    {
      "epoch": 0.9300162752848175,
      "grad_norm": 6.030143737792969,
      "learning_rate": 1.8139967449430368e-05,
      "loss": 3.3842,
      "step": 12000
    },
    {
      "epoch": 0.9687669534216848,
      "grad_norm": 6.361506462097168,
      "learning_rate": 1.806246609315663e-05,
      "loss": 3.3707,
      "step": 12500
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.2508351802825928,
      "eval_runtime": 267.1986,
      "eval_samples_per_second": 772.702,
      "eval_steps_per_second": 12.077,
      "step": 12903
    },
    {
      "epoch": 1.0075176315585523,
      "grad_norm": 6.418643474578857,
      "learning_rate": 1.79849647368829e-05,
      "loss": 3.3673,
      "step": 13000
    },
    {
      "epoch": 1.0462683096954197,
      "grad_norm": 6.310774326324463,
      "learning_rate": 1.7907463380609162e-05,
      "loss": 3.3276,
      "step": 13500
    },
    {
      "epoch": 1.0850189878322871,
      "grad_norm": 6.517366409301758,
      "learning_rate": 1.782996202433543e-05,
      "loss": 3.3288,
      "step": 14000
    },
    {
      "epoch": 1.1237696659691545,
      "grad_norm": 6.407958984375,
      "learning_rate": 1.7752460668061693e-05,
      "loss": 3.3003,
      "step": 14500
    },
    {
      "epoch": 1.162520344106022,
      "grad_norm": 6.145129203796387,
      "learning_rate": 1.767495931178796e-05,
      "loss": 3.2694,
      "step": 15000
    },
    {
      "epoch": 1.2012710222428893,
      "grad_norm": 6.586604118347168,
      "learning_rate": 1.7597457955514223e-05,
      "loss": 3.2627,
      "step": 15500
    },
    {
      "epoch": 1.2400217003797567,
      "grad_norm": 6.122056007385254,
      "learning_rate": 1.751995659924049e-05,
      "loss": 3.2631,
      "step": 16000
    },
    {
      "epoch": 1.278772378516624,
      "grad_norm": 6.545727252960205,
      "learning_rate": 1.7442455242966754e-05,
      "loss": 3.2324,
      "step": 16500
    },
    {
      "epoch": 1.3175230566534915,
      "grad_norm": 6.427816390991211,
      "learning_rate": 1.7364953886693017e-05,
      "loss": 3.227,
      "step": 17000
    },
    {
      "epoch": 1.3562737347903588,
      "grad_norm": 6.253689765930176,
      "learning_rate": 1.7287452530419284e-05,
      "loss": 3.2099,
      "step": 17500
    },
    {
      "epoch": 1.3950244129272262,
      "grad_norm": 6.5702080726623535,
      "learning_rate": 1.7209951174145548e-05,
      "loss": 3.2102,
      "step": 18000
    },
    {
      "epoch": 1.4337750910640936,
      "grad_norm": 6.4822564125061035,
      "learning_rate": 1.7132449817871815e-05,
      "loss": 3.1935,
      "step": 18500
    },
    {
      "epoch": 1.472525769200961,
      "grad_norm": 6.524315357208252,
      "learning_rate": 1.705494846159808e-05,
      "loss": 3.1955,
      "step": 19000
    },
    {
      "epoch": 1.5112764473378284,
      "grad_norm": 6.302344799041748,
      "learning_rate": 1.6977447105324345e-05,
      "loss": 3.1726,
      "step": 19500
    },
    {
      "epoch": 1.5500271254746958,
      "grad_norm": 5.837028503417969,
      "learning_rate": 1.689994574905061e-05,
      "loss": 3.1277,
      "step": 20000
    },
    {
      "epoch": 1.5887778036115632,
      "grad_norm": 6.489377975463867,
      "learning_rate": 1.6822444392776876e-05,
      "loss": 3.1414,
      "step": 20500
    },
    {
      "epoch": 1.6275284817484306,
      "grad_norm": 6.543872833251953,
      "learning_rate": 1.674494303650314e-05,
      "loss": 3.104,
      "step": 21000
    },
    {
      "epoch": 1.666279159885298,
      "grad_norm": 6.05628776550293,
      "learning_rate": 1.6667441680229406e-05,
      "loss": 3.1459,
      "step": 21500
    },
    {
      "epoch": 1.7050298380221653,
      "grad_norm": 6.027078151702881,
      "learning_rate": 1.658994032395567e-05,
      "loss": 3.0963,
      "step": 22000
    },
    {
      "epoch": 1.7437805161590327,
      "grad_norm": 6.577582359313965,
      "learning_rate": 1.6512438967681937e-05,
      "loss": 3.118,
      "step": 22500
    },
    {
      "epoch": 1.7825311942959001,
      "grad_norm": 5.9164204597473145,
      "learning_rate": 1.64349376114082e-05,
      "loss": 3.0928,
      "step": 23000
    },
    {
      "epoch": 1.8212818724327677,
      "grad_norm": 6.155348300933838,
      "learning_rate": 1.6357436255134468e-05,
      "loss": 3.0885,
      "step": 23500
    },
    {
      "epoch": 1.8600325505696351,
      "grad_norm": 6.302849769592285,
      "learning_rate": 1.627993489886073e-05,
      "loss": 3.0741,
      "step": 24000
    },
    {
      "epoch": 1.8987832287065025,
      "grad_norm": 6.140907287597656,
      "learning_rate": 1.6202433542586998e-05,
      "loss": 3.0633,
      "step": 24500
    },
    {
      "epoch": 1.93753390684337,
      "grad_norm": 5.85639762878418,
      "learning_rate": 1.612493218631326e-05,
      "loss": 3.0401,
      "step": 25000
    },
    {
      "epoch": 1.9762845849802373,
      "grad_norm": 6.558920383453369,
      "learning_rate": 1.604743083003953e-05,
      "loss": 3.05,
      "step": 25500
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.9521713256835938,
      "eval_runtime": 258.4886,
      "eval_samples_per_second": 798.739,
      "eval_steps_per_second": 12.484,
      "step": 25806
    },
    {
      "epoch": 2.0150352631171047,
      "grad_norm": 6.003655433654785,
      "learning_rate": 1.5969929473765792e-05,
      "loss": 3.0292,
      "step": 26000
    },
    {
      "epoch": 2.053785941253972,
      "grad_norm": 6.43280029296875,
      "learning_rate": 1.589242811749206e-05,
      "loss": 3.0205,
      "step": 26500
    },
    {
      "epoch": 2.0925366193908395,
      "grad_norm": 6.051511287689209,
      "learning_rate": 1.5814926761218323e-05,
      "loss": 3.0152,
      "step": 27000
    },
    {
      "epoch": 2.131287297527707,
      "grad_norm": 7.381418704986572,
      "learning_rate": 1.573742540494459e-05,
      "loss": 3.0067,
      "step": 27500
    },
    {
      "epoch": 2.1700379756645742,
      "grad_norm": 6.032004356384277,
      "learning_rate": 1.5659924048670853e-05,
      "loss": 2.9821,
      "step": 28000
    },
    {
      "epoch": 2.2087886538014416,
      "grad_norm": 6.481622695922852,
      "learning_rate": 1.558242269239712e-05,
      "loss": 2.9824,
      "step": 28500
    },
    {
      "epoch": 2.247539331938309,
      "grad_norm": 5.934979438781738,
      "learning_rate": 1.5504921336123384e-05,
      "loss": 2.9708,
      "step": 29000
    },
    {
      "epoch": 2.2862900100751764,
      "grad_norm": 7.498392581939697,
      "learning_rate": 1.542741997984965e-05,
      "loss": 2.9836,
      "step": 29500
    },
    {
      "epoch": 2.325040688212044,
      "grad_norm": 6.350077152252197,
      "learning_rate": 1.5349918623575914e-05,
      "loss": 2.9608,
      "step": 30000
    },
    {
      "epoch": 2.363791366348911,
      "grad_norm": 5.6795783042907715,
      "learning_rate": 1.527241726730218e-05,
      "loss": 2.9551,
      "step": 30500
    },
    {
      "epoch": 2.4025420444857786,
      "grad_norm": 6.395376682281494,
      "learning_rate": 1.5194915911028445e-05,
      "loss": 2.9551,
      "step": 31000
    },
    {
      "epoch": 2.441292722622646,
      "grad_norm": 6.238061904907227,
      "learning_rate": 1.511741455475471e-05,
      "loss": 2.9527,
      "step": 31500
    },
    {
      "epoch": 2.4800434007595134,
      "grad_norm": 6.641284465789795,
      "learning_rate": 1.5039913198480975e-05,
      "loss": 2.9444,
      "step": 32000
    },
    {
      "epoch": 2.5187940788963807,
      "grad_norm": 6.30321741104126,
      "learning_rate": 1.496241184220724e-05,
      "loss": 2.9346,
      "step": 32500
    },
    {
      "epoch": 2.557544757033248,
      "grad_norm": 8.681157112121582,
      "learning_rate": 1.4884910485933506e-05,
      "loss": 2.9204,
      "step": 33000
    },
    {
      "epoch": 2.5962954351701155,
      "grad_norm": 6.423407077789307,
      "learning_rate": 1.4807409129659771e-05,
      "loss": 2.9066,
      "step": 33500
    },
    {
      "epoch": 2.635046113306983,
      "grad_norm": 6.697604179382324,
      "learning_rate": 1.4729907773386036e-05,
      "loss": 2.9079,
      "step": 34000
    },
    {
      "epoch": 2.6737967914438503,
      "grad_norm": 6.646244049072266,
      "learning_rate": 1.4652406417112302e-05,
      "loss": 2.9148,
      "step": 34500
    },
    {
      "epoch": 2.7125474695807177,
      "grad_norm": 6.80411958694458,
      "learning_rate": 1.4574905060838567e-05,
      "loss": 2.9005,
      "step": 35000
    },
    {
      "epoch": 2.751298147717585,
      "grad_norm": 6.345988750457764,
      "learning_rate": 1.4497403704564832e-05,
      "loss": 2.888,
      "step": 35500
    },
    {
      "epoch": 2.7900488258544525,
      "grad_norm": 5.965686798095703,
      "learning_rate": 1.4419902348291098e-05,
      "loss": 2.8845,
      "step": 36000
    },
    {
      "epoch": 2.82879950399132,
      "grad_norm": 6.068357944488525,
      "learning_rate": 1.4342400992017363e-05,
      "loss": 2.8835,
      "step": 36500
    },
    {
      "epoch": 2.8675501821281872,
      "grad_norm": 5.874370098114014,
      "learning_rate": 1.4264899635743628e-05,
      "loss": 2.8904,
      "step": 37000
    },
    {
      "epoch": 2.9063008602650546,
      "grad_norm": 6.0566935539245605,
      "learning_rate": 1.4187398279469893e-05,
      "loss": 2.8823,
      "step": 37500
    },
    {
      "epoch": 2.945051538401922,
      "grad_norm": 6.21787691116333,
      "learning_rate": 1.4109896923196159e-05,
      "loss": 2.867,
      "step": 38000
    },
    {
      "epoch": 2.9838022165387894,
      "grad_norm": 6.055897235870361,
      "learning_rate": 1.4032395566922424e-05,
      "loss": 2.867,
      "step": 38500
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.775702953338623,
      "eval_runtime": 259.0101,
      "eval_samples_per_second": 797.131,
      "eval_steps_per_second": 12.459,
      "step": 38709
    },
    {
      "epoch": 3.022552894675657,
      "grad_norm": 5.503760814666748,
      "learning_rate": 1.3954894210648689e-05,
      "loss": 2.8428,
      "step": 39000
    },
    {
      "epoch": 3.061303572812524,
      "grad_norm": 6.250561714172363,
      "learning_rate": 1.3877392854374954e-05,
      "loss": 2.842,
      "step": 39500
    },
    {
      "epoch": 3.1000542509493916,
      "grad_norm": 6.394408226013184,
      "learning_rate": 1.379989149810122e-05,
      "loss": 2.8368,
      "step": 40000
    },
    {
      "epoch": 3.138804929086259,
      "grad_norm": 5.7096428871154785,
      "learning_rate": 1.3722390141827483e-05,
      "loss": 2.8253,
      "step": 40500
    },
    {
      "epoch": 3.1775556072231264,
      "grad_norm": 6.807374000549316,
      "learning_rate": 1.3644888785553749e-05,
      "loss": 2.821,
      "step": 41000
    },
    {
      "epoch": 3.2163062853599937,
      "grad_norm": 6.367000102996826,
      "learning_rate": 1.3567387429280014e-05,
      "loss": 2.8302,
      "step": 41500
    },
    {
      "epoch": 3.255056963496861,
      "grad_norm": 6.30033540725708,
      "learning_rate": 1.3489886073006279e-05,
      "loss": 2.8191,
      "step": 42000
    },
    {
      "epoch": 3.2938076416337285,
      "grad_norm": 7.257653713226318,
      "learning_rate": 1.3412384716732544e-05,
      "loss": 2.8196,
      "step": 42500
    },
    {
      "epoch": 3.332558319770596,
      "grad_norm": 7.1162109375,
      "learning_rate": 1.333488336045881e-05,
      "loss": 2.817,
      "step": 43000
    },
    {
      "epoch": 3.3713089979074633,
      "grad_norm": 6.336881160736084,
      "learning_rate": 1.3257382004185075e-05,
      "loss": 2.8064,
      "step": 43500
    },
    {
      "epoch": 3.4100596760443307,
      "grad_norm": 6.641462326049805,
      "learning_rate": 1.317988064791134e-05,
      "loss": 2.8035,
      "step": 44000
    },
    {
      "epoch": 3.448810354181198,
      "grad_norm": 6.033754348754883,
      "learning_rate": 1.3102379291637605e-05,
      "loss": 2.7976,
      "step": 44500
    },
    {
      "epoch": 3.4875610323180655,
      "grad_norm": 6.544773101806641,
      "learning_rate": 1.302487793536387e-05,
      "loss": 2.8048,
      "step": 45000
    },
    {
      "epoch": 3.526311710454933,
      "grad_norm": 6.382020950317383,
      "learning_rate": 1.2947376579090136e-05,
      "loss": 2.7982,
      "step": 45500
    },
    {
      "epoch": 3.5650623885918002,
      "grad_norm": 6.194632053375244,
      "learning_rate": 1.2869875222816401e-05,
      "loss": 2.7749,
      "step": 46000
    },
    {
      "epoch": 3.6038130667286676,
      "grad_norm": 6.429641246795654,
      "learning_rate": 1.2792373866542665e-05,
      "loss": 2.7853,
      "step": 46500
    },
    {
      "epoch": 3.642563744865535,
      "grad_norm": 6.209822177886963,
      "learning_rate": 1.271487251026893e-05,
      "loss": 2.7841,
      "step": 47000
    },
    {
      "epoch": 3.6813144230024024,
      "grad_norm": 6.935910701751709,
      "learning_rate": 1.2637371153995195e-05,
      "loss": 2.7681,
      "step": 47500
    },
    {
      "epoch": 3.72006510113927,
      "grad_norm": 7.021639347076416,
      "learning_rate": 1.255986979772146e-05,
      "loss": 2.7658,
      "step": 48000
    },
    {
      "epoch": 3.758815779276137,
      "grad_norm": 6.242121696472168,
      "learning_rate": 1.2482368441447726e-05,
      "loss": 2.7698,
      "step": 48500
    },
    {
      "epoch": 3.7975664574130046,
      "grad_norm": 6.123905658721924,
      "learning_rate": 1.2404867085173991e-05,
      "loss": 2.7711,
      "step": 49000
    },
    {
      "epoch": 3.836317135549872,
      "grad_norm": 6.735771179199219,
      "learning_rate": 1.2327365728900256e-05,
      "loss": 2.726,
      "step": 49500
    },
    {
      "epoch": 3.8750678136867394,
      "grad_norm": 6.921602725982666,
      "learning_rate": 1.2249864372626522e-05,
      "loss": 2.7545,
      "step": 50000
    },
    {
      "epoch": 3.9138184918236067,
      "grad_norm": 6.343456745147705,
      "learning_rate": 1.2172363016352787e-05,
      "loss": 2.7474,
      "step": 50500
    },
    {
      "epoch": 3.9525691699604746,
      "grad_norm": 6.30169677734375,
      "learning_rate": 1.2094861660079052e-05,
      "loss": 2.7467,
      "step": 51000
    },
    {
      "epoch": 3.9913198480973415,
      "grad_norm": 6.6629767417907715,
      "learning_rate": 1.2017360303805317e-05,
      "loss": 2.7475,
      "step": 51500
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.6640822887420654,
      "eval_runtime": 260.2494,
      "eval_samples_per_second": 793.335,
      "eval_steps_per_second": 12.4,
      "step": 51612
    },
    {
      "epoch": 4.030070526234209,
      "grad_norm": 6.397671222686768,
      "learning_rate": 1.1939858947531581e-05,
      "loss": 2.7311,
      "step": 52000
    },
    {
      "epoch": 4.068821204371076,
      "grad_norm": 6.374961853027344,
      "learning_rate": 1.1862357591257846e-05,
      "loss": 2.7119,
      "step": 52500
    },
    {
      "epoch": 4.107571882507944,
      "grad_norm": 5.920938968658447,
      "learning_rate": 1.1784856234984112e-05,
      "loss": 2.7217,
      "step": 53000
    },
    {
      "epoch": 4.146322560644811,
      "grad_norm": 6.377143859863281,
      "learning_rate": 1.1707354878710377e-05,
      "loss": 2.7044,
      "step": 53500
    },
    {
      "epoch": 4.185073238781679,
      "grad_norm": 7.047250270843506,
      "learning_rate": 1.1629853522436642e-05,
      "loss": 2.7213,
      "step": 54000
    },
    {
      "epoch": 4.223823916918546,
      "grad_norm": 6.682352066040039,
      "learning_rate": 1.1552352166162907e-05,
      "loss": 2.7025,
      "step": 54500
    },
    {
      "epoch": 4.262574595055414,
      "grad_norm": 6.547230243682861,
      "learning_rate": 1.1474850809889173e-05,
      "loss": 2.7068,
      "step": 55000
    },
    {
      "epoch": 4.301325273192281,
      "grad_norm": 6.038912296295166,
      "learning_rate": 1.1397349453615438e-05,
      "loss": 2.7061,
      "step": 55500
    },
    {
      "epoch": 4.3400759513291485,
      "grad_norm": 6.072612762451172,
      "learning_rate": 1.1319848097341703e-05,
      "loss": 2.7037,
      "step": 56000
    },
    {
      "epoch": 4.378826629466015,
      "grad_norm": 5.6306281089782715,
      "learning_rate": 1.1242346741067968e-05,
      "loss": 2.6999,
      "step": 56500
    },
    {
      "epoch": 4.417577307602883,
      "grad_norm": 6.18297004699707,
      "learning_rate": 1.1164845384794234e-05,
      "loss": 2.6974,
      "step": 57000
    },
    {
      "epoch": 4.45632798573975,
      "grad_norm": 6.371115207672119,
      "learning_rate": 1.1087344028520499e-05,
      "loss": 2.6918,
      "step": 57500
    },
    {
      "epoch": 4.495078663876618,
      "grad_norm": 6.444944381713867,
      "learning_rate": 1.1009842672246764e-05,
      "loss": 2.6874,
      "step": 58000
    },
    {
      "epoch": 4.533829342013485,
      "grad_norm": 6.176960468292236,
      "learning_rate": 1.093234131597303e-05,
      "loss": 2.68,
      "step": 58500
    },
    {
      "epoch": 4.572580020150353,
      "grad_norm": 6.731847763061523,
      "learning_rate": 1.0854839959699295e-05,
      "loss": 2.6919,
      "step": 59000
    },
    {
      "epoch": 4.61133069828722,
      "grad_norm": 7.826213836669922,
      "learning_rate": 1.077733860342556e-05,
      "loss": 2.6824,
      "step": 59500
    },
    {
      "epoch": 4.650081376424088,
      "grad_norm": 7.052020072937012,
      "learning_rate": 1.0699837247151825e-05,
      "loss": 2.6616,
      "step": 60000
    },
    {
      "epoch": 4.6888320545609545,
      "grad_norm": 5.36915922164917,
      "learning_rate": 1.062233589087809e-05,
      "loss": 2.667,
      "step": 60500
    },
    {
      "epoch": 4.727582732697822,
      "grad_norm": 6.491717338562012,
      "learning_rate": 1.0544834534604356e-05,
      "loss": 2.6896,
      "step": 61000
    },
    {
      "epoch": 4.766333410834689,
      "grad_norm": 7.702902793884277,
      "learning_rate": 1.0467333178330621e-05,
      "loss": 2.6712,
      "step": 61500
    },
    {
      "epoch": 4.805084088971557,
      "grad_norm": 6.359930992126465,
      "learning_rate": 1.0389831822056886e-05,
      "loss": 2.6704,
      "step": 62000
    },
    {
      "epoch": 4.843834767108424,
      "grad_norm": 6.2874531745910645,
      "learning_rate": 1.0312330465783152e-05,
      "loss": 2.6757,
      "step": 62500
    },
    {
      "epoch": 4.882585445245292,
      "grad_norm": 6.827906131744385,
      "learning_rate": 1.0234829109509417e-05,
      "loss": 2.6567,
      "step": 63000
    },
    {
      "epoch": 4.921336123382159,
      "grad_norm": 6.620416164398193,
      "learning_rate": 1.0157327753235682e-05,
      "loss": 2.6615,
      "step": 63500
    },
    {
      "epoch": 4.960086801519027,
      "grad_norm": 6.6219162940979,
      "learning_rate": 1.0079826396961947e-05,
      "loss": 2.657,
      "step": 64000
    },
    {
      "epoch": 4.998837479655894,
      "grad_norm": 6.214903831481934,
      "learning_rate": 1.0002325040688213e-05,
      "loss": 2.6549,
      "step": 64500
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.578911066055298,
      "eval_runtime": 265.1883,
      "eval_samples_per_second": 778.56,
      "eval_steps_per_second": 12.169,
      "step": 64515
    },
    {
      "epoch": 5.0375881577927615,
      "grad_norm": 6.627685546875,
      "learning_rate": 9.924823684414478e-06,
      "loss": 2.6203,
      "step": 65000
    },
    {
      "epoch": 5.076338835929628,
      "grad_norm": 6.23040771484375,
      "learning_rate": 9.847322328140743e-06,
      "loss": 2.6349,
      "step": 65500
    },
    {
      "epoch": 5.115089514066496,
      "grad_norm": 6.667369365692139,
      "learning_rate": 9.769820971867009e-06,
      "loss": 2.647,
      "step": 66000
    },
    {
      "epoch": 5.153840192203363,
      "grad_norm": 6.694558620452881,
      "learning_rate": 9.692319615593274e-06,
      "loss": 2.6214,
      "step": 66500
    },
    {
      "epoch": 5.192590870340231,
      "grad_norm": 6.280242443084717,
      "learning_rate": 9.614818259319539e-06,
      "loss": 2.6206,
      "step": 67000
    },
    {
      "epoch": 5.231341548477098,
      "grad_norm": 6.660119533538818,
      "learning_rate": 9.537316903045804e-06,
      "loss": 2.6307,
      "step": 67500
    },
    {
      "epoch": 5.270092226613966,
      "grad_norm": 6.439652919769287,
      "learning_rate": 9.45981554677207e-06,
      "loss": 2.6431,
      "step": 68000
    },
    {
      "epoch": 5.308842904750833,
      "grad_norm": 6.055843830108643,
      "learning_rate": 9.382314190498335e-06,
      "loss": 2.6144,
      "step": 68500
    },
    {
      "epoch": 5.347593582887701,
      "grad_norm": 6.519714832305908,
      "learning_rate": 9.3048128342246e-06,
      "loss": 2.6056,
      "step": 69000
    },
    {
      "epoch": 5.3863442610245675,
      "grad_norm": 6.72304630279541,
      "learning_rate": 9.227311477950864e-06,
      "loss": 2.623,
      "step": 69500
    },
    {
      "epoch": 5.425094939161435,
      "grad_norm": 7.048790454864502,
      "learning_rate": 9.149810121677129e-06,
      "loss": 2.6043,
      "step": 70000
    },
    {
      "epoch": 5.463845617298302,
      "grad_norm": 6.654219627380371,
      "learning_rate": 9.072308765403394e-06,
      "loss": 2.6135,
      "step": 70500
    },
    {
      "epoch": 5.50259629543517,
      "grad_norm": 5.948112487792969,
      "learning_rate": 8.99480740912966e-06,
      "loss": 2.6295,
      "step": 71000
    },
    {
      "epoch": 5.541346973572038,
      "grad_norm": 7.8044328689575195,
      "learning_rate": 8.917306052855925e-06,
      "loss": 2.6104,
      "step": 71500
    },
    {
      "epoch": 5.580097651708905,
      "grad_norm": 6.743612766265869,
      "learning_rate": 8.83980469658219e-06,
      "loss": 2.6216,
      "step": 72000
    },
    {
      "epoch": 5.618848329845772,
      "grad_norm": 6.346240043640137,
      "learning_rate": 8.762303340308455e-06,
      "loss": 2.6238,
      "step": 72500
    },
    {
      "epoch": 5.65759900798264,
      "grad_norm": 6.496920108795166,
      "learning_rate": 8.68480198403472e-06,
      "loss": 2.6334,
      "step": 73000
    },
    {
      "epoch": 5.6963496861195075,
      "grad_norm": 6.356810569763184,
      "learning_rate": 8.607300627760986e-06,
      "loss": 2.5995,
      "step": 73500
    },
    {
      "epoch": 5.7351003642563745,
      "grad_norm": 6.226792812347412,
      "learning_rate": 8.529799271487251e-06,
      "loss": 2.5974,
      "step": 74000
    },
    {
      "epoch": 5.773851042393241,
      "grad_norm": 6.6555962562561035,
      "learning_rate": 8.452297915213516e-06,
      "loss": 2.6285,
      "step": 74500
    },
    {
      "epoch": 5.812601720530109,
      "grad_norm": 6.32110595703125,
      "learning_rate": 8.374796558939782e-06,
      "loss": 2.6035,
      "step": 75000
    },
    {
      "epoch": 5.851352398666977,
      "grad_norm": 6.651345252990723,
      "learning_rate": 8.297295202666047e-06,
      "loss": 2.5886,
      "step": 75500
    },
    {
      "epoch": 5.890103076803844,
      "grad_norm": 6.736583232879639,
      "learning_rate": 8.219793846392312e-06,
      "loss": 2.5903,
      "step": 76000
    },
    {
      "epoch": 5.928853754940711,
      "grad_norm": 6.635737895965576,
      "learning_rate": 8.142292490118577e-06,
      "loss": 2.597,
      "step": 76500
    },
    {
      "epoch": 5.967604433077579,
      "grad_norm": 6.3186492919921875,
      "learning_rate": 8.064791133844843e-06,
      "loss": 2.5732,
      "step": 77000
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.5146169662475586,
      "eval_runtime": 259.2569,
      "eval_samples_per_second": 796.372,
      "eval_steps_per_second": 12.447,
      "step": 77418
    },
    {
      "epoch": 6.006355111214447,
      "grad_norm": 6.408041000366211,
      "learning_rate": 7.987289777571108e-06,
      "loss": 2.5742,
      "step": 77500
    },
    {
      "epoch": 6.045105789351314,
      "grad_norm": 6.398166656494141,
      "learning_rate": 7.909788421297373e-06,
      "loss": 2.5829,
      "step": 78000
    },
    {
      "epoch": 6.083856467488181,
      "grad_norm": 6.89434289932251,
      "learning_rate": 7.832287065023639e-06,
      "loss": 2.58,
      "step": 78500
    },
    {
      "epoch": 6.122607145625048,
      "grad_norm": 5.935701847076416,
      "learning_rate": 7.754785708749904e-06,
      "loss": 2.5853,
      "step": 79000
    },
    {
      "epoch": 6.161357823761916,
      "grad_norm": 7.224461555480957,
      "learning_rate": 7.677284352476169e-06,
      "loss": 2.5597,
      "step": 79500
    },
    {
      "epoch": 6.200108501898783,
      "grad_norm": 6.59751033782959,
      "learning_rate": 7.5997829962024335e-06,
      "loss": 2.5821,
      "step": 80000
    },
    {
      "epoch": 6.238859180035651,
      "grad_norm": 6.414103031158447,
      "learning_rate": 7.522281639928699e-06,
      "loss": 2.5542,
      "step": 80500
    },
    {
      "epoch": 6.277609858172518,
      "grad_norm": 6.270075798034668,
      "learning_rate": 7.444780283654964e-06,
      "loss": 2.5735,
      "step": 81000
    },
    {
      "epoch": 6.316360536309386,
      "grad_norm": 6.3846306800842285,
      "learning_rate": 7.367278927381229e-06,
      "loss": 2.5563,
      "step": 81500
    },
    {
      "epoch": 6.355111214446253,
      "grad_norm": 6.725887298583984,
      "learning_rate": 7.2897775711074945e-06,
      "loss": 2.5582,
      "step": 82000
    },
    {
      "epoch": 6.3938618925831205,
      "grad_norm": 6.913090229034424,
      "learning_rate": 7.21227621483376e-06,
      "loss": 2.5681,
      "step": 82500
    },
    {
      "epoch": 6.4326125707199875,
      "grad_norm": 6.630814075469971,
      "learning_rate": 7.134774858560025e-06,
      "loss": 2.5493,
      "step": 83000
    },
    {
      "epoch": 6.471363248856855,
      "grad_norm": 7.482264518737793,
      "learning_rate": 7.05727350228629e-06,
      "loss": 2.5672,
      "step": 83500
    },
    {
      "epoch": 6.510113926993722,
      "grad_norm": 5.896800518035889,
      "learning_rate": 6.979772146012556e-06,
      "loss": 2.5563,
      "step": 84000
    },
    {
      "epoch": 6.54886460513059,
      "grad_norm": 6.603734016418457,
      "learning_rate": 6.902270789738821e-06,
      "loss": 2.5358,
      "step": 84500
    },
    {
      "epoch": 6.587615283267457,
      "grad_norm": 6.386889457702637,
      "learning_rate": 6.824769433465086e-06,
      "loss": 2.5449,
      "step": 85000
    },
    {
      "epoch": 6.626365961404325,
      "grad_norm": 6.661931037902832,
      "learning_rate": 6.747268077191351e-06,
      "loss": 2.5405,
      "step": 85500
    },
    {
      "epoch": 6.665116639541192,
      "grad_norm": 6.331045627593994,
      "learning_rate": 6.669766720917617e-06,
      "loss": 2.5419,
      "step": 86000
    },
    {
      "epoch": 6.70386731767806,
      "grad_norm": 7.050119400024414,
      "learning_rate": 6.592265364643882e-06,
      "loss": 2.5196,
      "step": 86500
    },
    {
      "epoch": 6.742617995814927,
      "grad_norm": 6.065616130828857,
      "learning_rate": 6.514764008370147e-06,
      "loss": 2.539,
      "step": 87000
    },
    {
      "epoch": 6.781368673951794,
      "grad_norm": 5.768097877502441,
      "learning_rate": 6.4372626520964125e-06,
      "loss": 2.5245,
      "step": 87500
    },
    {
      "epoch": 6.820119352088661,
      "grad_norm": 6.785781383514404,
      "learning_rate": 6.359761295822677e-06,
      "loss": 2.5473,
      "step": 88000
    },
    {
      "epoch": 6.858870030225529,
      "grad_norm": 6.658846855163574,
      "learning_rate": 6.282259939548942e-06,
      "loss": 2.5385,
      "step": 88500
    },
    {
      "epoch": 6.897620708362396,
      "grad_norm": 5.932773590087891,
      "learning_rate": 6.2047585832752074e-06,
      "loss": 2.528,
      "step": 89000
    },
    {
      "epoch": 6.936371386499264,
      "grad_norm": 6.457767963409424,
      "learning_rate": 6.127257227001473e-06,
      "loss": 2.5327,
      "step": 89500
    },
    {
      "epoch": 6.975122064636131,
      "grad_norm": 6.143023490905762,
      "learning_rate": 6.049755870727738e-06,
      "loss": 2.5352,
      "step": 90000
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.4585013389587402,
      "eval_runtime": 258.9573,
      "eval_samples_per_second": 797.294,
      "eval_steps_per_second": 12.462,
      "step": 90321
    },
    {
      "epoch": 7.013872742772999,
      "grad_norm": 6.153046607971191,
      "learning_rate": 5.972254514454003e-06,
      "loss": 2.5315,
      "step": 90500
    },
    {
      "epoch": 7.052623420909866,
      "grad_norm": 7.131119728088379,
      "learning_rate": 5.8947531581802685e-06,
      "loss": 2.5431,
      "step": 91000
    },
    {
      "epoch": 7.0913740990467335,
      "grad_norm": 6.677100658416748,
      "learning_rate": 5.817251801906534e-06,
      "loss": 2.5204,
      "step": 91500
    },
    {
      "epoch": 7.1301247771836005,
      "grad_norm": 6.799976348876953,
      "learning_rate": 5.739750445632799e-06,
      "loss": 2.5221,
      "step": 92000
    },
    {
      "epoch": 7.168875455320468,
      "grad_norm": 6.515171051025391,
      "learning_rate": 5.662249089359064e-06,
      "loss": 2.5222,
      "step": 92500
    },
    {
      "epoch": 7.207626133457335,
      "grad_norm": 7.057505130767822,
      "learning_rate": 5.58474773308533e-06,
      "loss": 2.5262,
      "step": 93000
    },
    {
      "epoch": 7.246376811594203,
      "grad_norm": 5.927343368530273,
      "learning_rate": 5.507246376811595e-06,
      "loss": 2.5272,
      "step": 93500
    },
    {
      "epoch": 7.28512748973107,
      "grad_norm": 6.7214155197143555,
      "learning_rate": 5.42974502053786e-06,
      "loss": 2.5195,
      "step": 94000
    },
    {
      "epoch": 7.323878167867938,
      "grad_norm": 6.162799835205078,
      "learning_rate": 5.352243664264125e-06,
      "loss": 2.5117,
      "step": 94500
    },
    {
      "epoch": 7.362628846004805,
      "grad_norm": 6.725783824920654,
      "learning_rate": 5.274742307990391e-06,
      "loss": 2.522,
      "step": 95000
    },
    {
      "epoch": 7.401379524141673,
      "grad_norm": 5.721879959106445,
      "learning_rate": 5.197240951716656e-06,
      "loss": 2.5047,
      "step": 95500
    },
    {
      "epoch": 7.44013020227854,
      "grad_norm": 7.531757354736328,
      "learning_rate": 5.11973959544292e-06,
      "loss": 2.4981,
      "step": 96000
    },
    {
      "epoch": 7.478880880415407,
      "grad_norm": 6.200819492340088,
      "learning_rate": 5.042238239169186e-06,
      "loss": 2.5016,
      "step": 96500
    },
    {
      "epoch": 7.517631558552274,
      "grad_norm": 6.8695597648620605,
      "learning_rate": 4.964736882895451e-06,
      "loss": 2.5085,
      "step": 97000
    },
    {
      "epoch": 7.556382236689142,
      "grad_norm": 6.3883843421936035,
      "learning_rate": 4.887235526621716e-06,
      "loss": 2.5092,
      "step": 97500
    },
    {
      "epoch": 7.595132914826009,
      "grad_norm": 6.085172653198242,
      "learning_rate": 4.809734170347981e-06,
      "loss": 2.4957,
      "step": 98000
    },
    {
      "epoch": 7.633883592962877,
      "grad_norm": 6.23600435256958,
      "learning_rate": 4.732232814074247e-06,
      "loss": 2.4876,
      "step": 98500
    },
    {
      "epoch": 7.672634271099744,
      "grad_norm": 6.483453750610352,
      "learning_rate": 4.654731457800512e-06,
      "loss": 2.5029,
      "step": 99000
    },
    {
      "epoch": 7.711384949236612,
      "grad_norm": 6.627302646636963,
      "learning_rate": 4.577230101526777e-06,
      "loss": 2.4989,
      "step": 99500
    },
    {
      "epoch": 7.750135627373479,
      "grad_norm": 7.044070243835449,
      "learning_rate": 4.4997287452530425e-06,
      "loss": 2.5085,
      "step": 100000
    },
    {
      "epoch": 7.7888863055103466,
      "grad_norm": 5.986552715301514,
      "learning_rate": 4.422227388979308e-06,
      "loss": 2.4842,
      "step": 100500
    },
    {
      "epoch": 7.8276369836472135,
      "grad_norm": 6.3408708572387695,
      "learning_rate": 4.344726032705573e-06,
      "loss": 2.4973,
      "step": 101000
    },
    {
      "epoch": 7.866387661784081,
      "grad_norm": 6.100359916687012,
      "learning_rate": 4.267224676431838e-06,
      "loss": 2.5111,
      "step": 101500
    },
    {
      "epoch": 7.905138339920948,
      "grad_norm": 6.7454833984375,
      "learning_rate": 4.1897233201581036e-06,
      "loss": 2.4766,
      "step": 102000
    },
    {
      "epoch": 7.943889018057816,
      "grad_norm": 6.790141582489014,
      "learning_rate": 4.112221963884369e-06,
      "loss": 2.4788,
      "step": 102500
    },
    {
      "epoch": 7.982639696194683,
      "grad_norm": 6.926203727722168,
      "learning_rate": 4.034720607610634e-06,
      "loss": 2.4875,
      "step": 103000
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.435317277908325,
      "eval_runtime": 258.5225,
      "eval_samples_per_second": 798.634,
      "eval_steps_per_second": 12.482,
      "step": 103224
    },
    {
      "epoch": 8.02139037433155,
      "grad_norm": 6.832672119140625,
      "learning_rate": 3.957219251336899e-06,
      "loss": 2.4812,
      "step": 103500
    },
    {
      "epoch": 8.060141052468419,
      "grad_norm": 6.771292209625244,
      "learning_rate": 3.879717895063164e-06,
      "loss": 2.4945,
      "step": 104000
    },
    {
      "epoch": 8.098891730605285,
      "grad_norm": 6.624267101287842,
      "learning_rate": 3.802216538789429e-06,
      "loss": 2.4813,
      "step": 104500
    },
    {
      "epoch": 8.137642408742153,
      "grad_norm": 6.566524028778076,
      "learning_rate": 3.724715182515694e-06,
      "loss": 2.5087,
      "step": 105000
    },
    {
      "epoch": 8.17639308687902,
      "grad_norm": 6.612277507781982,
      "learning_rate": 3.647213826241959e-06,
      "loss": 2.481,
      "step": 105500
    },
    {
      "epoch": 8.215143765015888,
      "grad_norm": 6.12284517288208,
      "learning_rate": 3.5697124699682244e-06,
      "loss": 2.4825,
      "step": 106000
    },
    {
      "epoch": 8.253894443152754,
      "grad_norm": 6.495052814483643,
      "learning_rate": 3.4922111136944897e-06,
      "loss": 2.4883,
      "step": 106500
    },
    {
      "epoch": 8.292645121289622,
      "grad_norm": 7.689423561096191,
      "learning_rate": 3.414709757420755e-06,
      "loss": 2.4857,
      "step": 107000
    },
    {
      "epoch": 8.33139579942649,
      "grad_norm": 6.188397407531738,
      "learning_rate": 3.3372084011470202e-06,
      "loss": 2.4788,
      "step": 107500
    },
    {
      "epoch": 8.370146477563358,
      "grad_norm": 6.282194137573242,
      "learning_rate": 3.2597070448732855e-06,
      "loss": 2.4856,
      "step": 108000
    },
    {
      "epoch": 8.408897155700224,
      "grad_norm": 6.457098007202148,
      "learning_rate": 3.1822056885995508e-06,
      "loss": 2.4623,
      "step": 108500
    },
    {
      "epoch": 8.447647833837092,
      "grad_norm": 7.726540565490723,
      "learning_rate": 3.1047043323258156e-06,
      "loss": 2.4671,
      "step": 109000
    },
    {
      "epoch": 8.48639851197396,
      "grad_norm": 6.308920383453369,
      "learning_rate": 3.027202976052081e-06,
      "loss": 2.4808,
      "step": 109500
    },
    {
      "epoch": 8.525149190110827,
      "grad_norm": 6.501667499542236,
      "learning_rate": 2.949701619778346e-06,
      "loss": 2.4736,
      "step": 110000
    },
    {
      "epoch": 8.563899868247695,
      "grad_norm": 7.358393669128418,
      "learning_rate": 2.8722002635046114e-06,
      "loss": 2.4697,
      "step": 110500
    },
    {
      "epoch": 8.602650546384561,
      "grad_norm": 6.261012554168701,
      "learning_rate": 2.7946989072308767e-06,
      "loss": 2.4631,
      "step": 111000
    },
    {
      "epoch": 8.64140122452143,
      "grad_norm": 6.515717029571533,
      "learning_rate": 2.717197550957142e-06,
      "loss": 2.4915,
      "step": 111500
    },
    {
      "epoch": 8.680151902658297,
      "grad_norm": 6.8307600021362305,
      "learning_rate": 2.6396961946834072e-06,
      "loss": 2.48,
      "step": 112000
    },
    {
      "epoch": 8.718902580795163,
      "grad_norm": 6.784819602966309,
      "learning_rate": 2.5621948384096725e-06,
      "loss": 2.4748,
      "step": 112500
    },
    {
      "epoch": 8.75765325893203,
      "grad_norm": 7.1304473876953125,
      "learning_rate": 2.4846934821359373e-06,
      "loss": 2.4723,
      "step": 113000
    },
    {
      "epoch": 8.796403937068899,
      "grad_norm": 6.297511100769043,
      "learning_rate": 2.4071921258622026e-06,
      "loss": 2.463,
      "step": 113500
    },
    {
      "epoch": 8.835154615205767,
      "grad_norm": 6.689960479736328,
      "learning_rate": 2.329690769588468e-06,
      "loss": 2.4621,
      "step": 114000
    },
    {
      "epoch": 8.873905293342634,
      "grad_norm": 6.450560569763184,
      "learning_rate": 2.252189413314733e-06,
      "loss": 2.4559,
      "step": 114500
    },
    {
      "epoch": 8.9126559714795,
      "grad_norm": 6.459935665130615,
      "learning_rate": 2.1746880570409984e-06,
      "loss": 2.4646,
      "step": 115000
    },
    {
      "epoch": 8.951406649616368,
      "grad_norm": 6.182426452636719,
      "learning_rate": 2.0971867007672637e-06,
      "loss": 2.4665,
      "step": 115500
    },
    {
      "epoch": 8.990157327753236,
      "grad_norm": 7.122648239135742,
      "learning_rate": 2.019685344493529e-06,
      "loss": 2.475,
      "step": 116000
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.406507968902588,
      "eval_runtime": 258.9009,
      "eval_samples_per_second": 797.467,
      "eval_steps_per_second": 12.464,
      "step": 116127
    },
    {
      "epoch": 9.028908005890104,
      "grad_norm": 7.267585754394531,
      "learning_rate": 1.942183988219794e-06,
      "loss": 2.447,
      "step": 116500
    },
    {
      "epoch": 9.06765868402697,
      "grad_norm": 6.2447991371154785,
      "learning_rate": 1.8646826319460593e-06,
      "loss": 2.4609,
      "step": 117000
    },
    {
      "epoch": 9.106409362163838,
      "grad_norm": 6.521481037139893,
      "learning_rate": 1.7871812756723245e-06,
      "loss": 2.4418,
      "step": 117500
    },
    {
      "epoch": 9.145160040300706,
      "grad_norm": 6.647397041320801,
      "learning_rate": 1.7096799193985896e-06,
      "loss": 2.4665,
      "step": 118000
    },
    {
      "epoch": 9.183910718437573,
      "grad_norm": 6.247033596038818,
      "learning_rate": 1.6321785631248548e-06,
      "loss": 2.4647,
      "step": 118500
    },
    {
      "epoch": 9.22266139657444,
      "grad_norm": 6.595357894897461,
      "learning_rate": 1.5546772068511201e-06,
      "loss": 2.4705,
      "step": 119000
    },
    {
      "epoch": 9.261412074711307,
      "grad_norm": 8.117677688598633,
      "learning_rate": 1.4771758505773854e-06,
      "loss": 2.4629,
      "step": 119500
    },
    {
      "epoch": 9.300162752848175,
      "grad_norm": 6.991618633270264,
      "learning_rate": 1.3996744943036504e-06,
      "loss": 2.4498,
      "step": 120000
    },
    {
      "epoch": 9.338913430985043,
      "grad_norm": 6.236393451690674,
      "learning_rate": 1.3221731380299157e-06,
      "loss": 2.467,
      "step": 120500
    },
    {
      "epoch": 9.377664109121909,
      "grad_norm": 6.595478534698486,
      "learning_rate": 1.2446717817561808e-06,
      "loss": 2.4547,
      "step": 121000
    },
    {
      "epoch": 9.416414787258777,
      "grad_norm": 7.194475173950195,
      "learning_rate": 1.167170425482446e-06,
      "loss": 2.4669,
      "step": 121500
    },
    {
      "epoch": 9.455165465395645,
      "grad_norm": 6.341099262237549,
      "learning_rate": 1.0896690692087113e-06,
      "loss": 2.4661,
      "step": 122000
    },
    {
      "epoch": 9.493916143532513,
      "grad_norm": 7.257521629333496,
      "learning_rate": 1.0121677129349766e-06,
      "loss": 2.4629,
      "step": 122500
    },
    {
      "epoch": 9.532666821669379,
      "grad_norm": 6.399875164031982,
      "learning_rate": 9.346663566612417e-07,
      "loss": 2.4555,
      "step": 123000
    },
    {
      "epoch": 9.571417499806246,
      "grad_norm": 7.292248249053955,
      "learning_rate": 8.571650003875069e-07,
      "loss": 2.4646,
      "step": 123500
    },
    {
      "epoch": 9.610168177943114,
      "grad_norm": 6.8132548332214355,
      "learning_rate": 7.79663644113772e-07,
      "loss": 2.4521,
      "step": 124000
    },
    {
      "epoch": 9.648918856079982,
      "grad_norm": 6.302210330963135,
      "learning_rate": 7.021622878400372e-07,
      "loss": 2.451,
      "step": 124500
    },
    {
      "epoch": 9.687669534216848,
      "grad_norm": 6.902337551116943,
      "learning_rate": 6.246609315663025e-07,
      "loss": 2.4515,
      "step": 125000
    },
    {
      "epoch": 9.726420212353716,
      "grad_norm": 6.4049296379089355,
      "learning_rate": 5.471595752925676e-07,
      "loss": 2.454,
      "step": 125500
    },
    {
      "epoch": 9.765170890490584,
      "grad_norm": 7.109240531921387,
      "learning_rate": 4.6965821901883286e-07,
      "loss": 2.4379,
      "step": 126000
    },
    {
      "epoch": 9.803921568627452,
      "grad_norm": 6.1289873123168945,
      "learning_rate": 3.921568627450981e-07,
      "loss": 2.4438,
      "step": 126500
    },
    {
      "epoch": 9.842672246764318,
      "grad_norm": 6.873955726623535,
      "learning_rate": 3.146555064713633e-07,
      "loss": 2.4526,
      "step": 127000
    },
    {
      "epoch": 9.881422924901186,
      "grad_norm": 6.842904090881348,
      "learning_rate": 2.3715415019762845e-07,
      "loss": 2.4471,
      "step": 127500
    },
    {
      "epoch": 9.920173603038053,
      "grad_norm": 9.636740684509277,
      "learning_rate": 1.5965279392389367e-07,
      "loss": 2.4469,
      "step": 128000
    },
    {
      "epoch": 9.958924281174921,
      "grad_norm": 6.161515235900879,
      "learning_rate": 8.21514376501589e-08,
      "loss": 2.4608,
      "step": 128500
    },
    {
      "epoch": 9.997674959311787,
      "grad_norm": 6.582516193389893,
      "learning_rate": 4.6500813764240875e-09,
      "loss": 2.4411,
      "step": 129000
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.3977291584014893,
      "eval_runtime": 258.9982,
      "eval_samples_per_second": 797.168,
      "eval_steps_per_second": 12.46,
      "step": 129030
    },
    {
      "epoch": 10.036425637448655,
      "grad_norm": 6.090233325958252,
      "learning_rate": 9.963574362551346e-06,
      "loss": 2.4784,
      "step": 129500
    },
    {
      "epoch": 10.075176315585523,
      "grad_norm": 6.285606384277344,
      "learning_rate": 9.924823684414478e-06,
      "loss": 2.4657,
      "step": 130000
    },
    {
      "epoch": 10.11392699372239,
      "grad_norm": 5.937399864196777,
      "learning_rate": 9.886073006277611e-06,
      "loss": 2.4869,
      "step": 130500
    },
    {
      "epoch": 10.152677671859257,
      "grad_norm": 7.235742568969727,
      "learning_rate": 9.847322328140743e-06,
      "loss": 2.4726,
      "step": 131000
    },
    {
      "epoch": 10.191428349996125,
      "grad_norm": 6.6334028244018555,
      "learning_rate": 9.808571650003877e-06,
      "loss": 2.472,
      "step": 131500
    },
    {
      "epoch": 10.230179028132993,
      "grad_norm": 7.366402626037598,
      "learning_rate": 9.769820971867009e-06,
      "loss": 2.4887,
      "step": 132000
    },
    {
      "epoch": 10.26892970626986,
      "grad_norm": 6.17592716217041,
      "learning_rate": 9.731070293730142e-06,
      "loss": 2.4854,
      "step": 132500
    },
    {
      "epoch": 10.307680384406726,
      "grad_norm": 6.376716613769531,
      "learning_rate": 9.692319615593274e-06,
      "loss": 2.486,
      "step": 133000
    },
    {
      "epoch": 10.346431062543594,
      "grad_norm": 6.293849945068359,
      "learning_rate": 9.653568937456407e-06,
      "loss": 2.4707,
      "step": 133500
    },
    {
      "epoch": 10.385181740680462,
      "grad_norm": 6.606166839599609,
      "learning_rate": 9.614818259319539e-06,
      "loss": 2.4704,
      "step": 134000
    },
    {
      "epoch": 10.42393241881733,
      "grad_norm": 6.805929660797119,
      "learning_rate": 9.576067581182673e-06,
      "loss": 2.4727,
      "step": 134500
    },
    {
      "epoch": 10.462683096954196,
      "grad_norm": 6.598349571228027,
      "learning_rate": 9.537316903045804e-06,
      "loss": 2.4825,
      "step": 135000
    },
    {
      "epoch": 10.501433775091064,
      "grad_norm": 5.807904243469238,
      "learning_rate": 9.498566224908938e-06,
      "loss": 2.4721,
      "step": 135500
    },
    {
      "epoch": 10.540184453227932,
      "grad_norm": 6.681980609893799,
      "learning_rate": 9.45981554677207e-06,
      "loss": 2.4764,
      "step": 136000
    },
    {
      "epoch": 10.5789351313648,
      "grad_norm": 6.540719032287598,
      "learning_rate": 9.421064868635203e-06,
      "loss": 2.4545,
      "step": 136500
    },
    {
      "epoch": 10.617685809501666,
      "grad_norm": 6.627035140991211,
      "learning_rate": 9.382314190498335e-06,
      "loss": 2.4778,
      "step": 137000
    },
    {
      "epoch": 10.656436487638533,
      "grad_norm": 6.348284721374512,
      "learning_rate": 9.343563512361468e-06,
      "loss": 2.4597,
      "step": 137500
    },
    {
      "epoch": 10.695187165775401,
      "grad_norm": 6.790314197540283,
      "learning_rate": 9.3048128342246e-06,
      "loss": 2.471,
      "step": 138000
    },
    {
      "epoch": 10.733937843912269,
      "grad_norm": 6.8181233406066895,
      "learning_rate": 9.266062156087732e-06,
      "loss": 2.4571,
      "step": 138500
    },
    {
      "epoch": 10.772688522049135,
      "grad_norm": 6.593683242797852,
      "learning_rate": 9.227311477950864e-06,
      "loss": 2.4843,
      "step": 139000
    },
    {
      "epoch": 10.811439200186003,
      "grad_norm": 6.600128650665283,
      "learning_rate": 9.188560799813997e-06,
      "loss": 2.464,
      "step": 139500
    },
    {
      "epoch": 10.85018987832287,
      "grad_norm": 6.368162631988525,
      "learning_rate": 9.149810121677129e-06,
      "loss": 2.4598,
      "step": 140000
    },
    {
      "epoch": 10.888940556459739,
      "grad_norm": 6.5435943603515625,
      "learning_rate": 9.111059443540262e-06,
      "loss": 2.4704,
      "step": 140500
    },
    {
      "epoch": 10.927691234596605,
      "grad_norm": 6.06011962890625,
      "learning_rate": 9.072308765403394e-06,
      "loss": 2.4514,
      "step": 141000
    },
    {
      "epoch": 10.966441912733472,
      "grad_norm": 7.2288689613342285,
      "learning_rate": 9.033558087266528e-06,
      "loss": 2.4521,
      "step": 141500
    },
    {
      "epoch": 11.0,
      "eval_loss": 2.3912322521209717,
      "eval_runtime": 258.9953,
      "eval_samples_per_second": 797.176,
      "eval_steps_per_second": 12.46,
      "step": 141933
    },
    {
      "epoch": 11.00519259087034,
      "grad_norm": 6.698403358459473,
      "learning_rate": 8.99480740912966e-06,
      "loss": 2.4457,
      "step": 142000
    },
    {
      "epoch": 11.043943269007208,
      "grad_norm": 6.455236911773682,
      "learning_rate": 8.956056730992793e-06,
      "loss": 2.4507,
      "step": 142500
    },
    {
      "epoch": 11.082693947144074,
      "grad_norm": 6.590576648712158,
      "learning_rate": 8.917306052855925e-06,
      "loss": 2.4256,
      "step": 143000
    },
    {
      "epoch": 11.121444625280942,
      "grad_norm": 6.957404136657715,
      "learning_rate": 8.878555374719058e-06,
      "loss": 2.4549,
      "step": 143500
    },
    {
      "epoch": 11.16019530341781,
      "grad_norm": 6.926699161529541,
      "learning_rate": 8.83980469658219e-06,
      "loss": 2.4499,
      "step": 144000
    },
    {
      "epoch": 11.198945981554678,
      "grad_norm": 6.484086036682129,
      "learning_rate": 8.801054018445324e-06,
      "loss": 2.4443,
      "step": 144500
    },
    {
      "epoch": 11.237696659691544,
      "grad_norm": 6.107706069946289,
      "learning_rate": 8.762303340308455e-06,
      "loss": 2.4459,
      "step": 145000
    },
    {
      "epoch": 11.276447337828412,
      "grad_norm": 7.301278591156006,
      "learning_rate": 8.723552662171589e-06,
      "loss": 2.4463,
      "step": 145500
    },
    {
      "epoch": 11.31519801596528,
      "grad_norm": 6.378045082092285,
      "learning_rate": 8.68480198403472e-06,
      "loss": 2.4494,
      "step": 146000
    },
    {
      "epoch": 11.353948694102147,
      "grad_norm": 6.803300857543945,
      "learning_rate": 8.646051305897854e-06,
      "loss": 2.4235,
      "step": 146500
    },
    {
      "epoch": 11.392699372239015,
      "grad_norm": 6.401794910430908,
      "learning_rate": 8.607300627760986e-06,
      "loss": 2.4353,
      "step": 147000
    },
    {
      "epoch": 11.431450050375881,
      "grad_norm": 6.455550193786621,
      "learning_rate": 8.56854994962412e-06,
      "loss": 2.4306,
      "step": 147500
    },
    {
      "epoch": 11.470200728512749,
      "grad_norm": 6.416442394256592,
      "learning_rate": 8.529799271487251e-06,
      "loss": 2.4143,
      "step": 148000
    },
    {
      "epoch": 11.508951406649617,
      "grad_norm": 6.768812656402588,
      "learning_rate": 8.491048593350385e-06,
      "loss": 2.4184,
      "step": 148500
    },
    {
      "epoch": 11.547702084786483,
      "grad_norm": 6.085323810577393,
      "learning_rate": 8.452297915213516e-06,
      "loss": 2.4318,
      "step": 149000
    },
    {
      "epoch": 11.58645276292335,
      "grad_norm": 6.181857585906982,
      "learning_rate": 8.41354723707665e-06,
      "loss": 2.4348,
      "step": 149500
    },
    {
      "epoch": 11.625203441060219,
      "grad_norm": 6.558756351470947,
      "learning_rate": 8.374796558939782e-06,
      "loss": 2.413,
      "step": 150000
    },
    {
      "epoch": 11.663954119197086,
      "grad_norm": 6.249685287475586,
      "learning_rate": 8.336045880802915e-06,
      "loss": 2.4271,
      "step": 150500
    },
    {
      "epoch": 11.702704797333954,
      "grad_norm": 6.789103984832764,
      "learning_rate": 8.297295202666047e-06,
      "loss": 2.4226,
      "step": 151000
    },
    {
      "epoch": 11.74145547547082,
      "grad_norm": 6.4289140701293945,
      "learning_rate": 8.25854452452918e-06,
      "loss": 2.4184,
      "step": 151500
    },
    {
      "epoch": 11.780206153607688,
      "grad_norm": 6.098612308502197,
      "learning_rate": 8.219793846392312e-06,
      "loss": 2.4132,
      "step": 152000
    },
    {
      "epoch": 11.818956831744556,
      "grad_norm": 6.500378608703613,
      "learning_rate": 8.181043168255444e-06,
      "loss": 2.4184,
      "step": 152500
    },
    {
      "epoch": 11.857707509881424,
      "grad_norm": 6.583259105682373,
      "learning_rate": 8.142292490118577e-06,
      "loss": 2.4259,
      "step": 153000
    },
    {
      "epoch": 11.89645818801829,
      "grad_norm": 6.7018303871154785,
      "learning_rate": 8.10354181198171e-06,
      "loss": 2.4185,
      "step": 153500
    },
    {
      "epoch": 11.935208866155158,
      "grad_norm": 6.679374694824219,
      "learning_rate": 8.064791133844843e-06,
      "loss": 2.4078,
      "step": 154000
    },
    {
      "epoch": 11.973959544292025,
      "grad_norm": 6.576003551483154,
      "learning_rate": 8.026040455707974e-06,
      "loss": 2.4212,
      "step": 154500
    },
    {
      "epoch": 12.0,
      "eval_loss": 2.3491039276123047,
      "eval_runtime": 260.4232,
      "eval_samples_per_second": 792.806,
      "eval_steps_per_second": 12.391,
      "step": 154836
    },
    {
      "epoch": 12.012710222428893,
      "grad_norm": 6.768045902252197,
      "learning_rate": 7.987289777571108e-06,
      "loss": 2.399,
      "step": 155000
    },
    {
      "epoch": 12.05146090056576,
      "grad_norm": 6.445169925689697,
      "learning_rate": 7.94853909943424e-06,
      "loss": 2.4055,
      "step": 155500
    },
    {
      "epoch": 12.090211578702627,
      "grad_norm": 6.684764385223389,
      "learning_rate": 7.909788421297373e-06,
      "loss": 2.3979,
      "step": 156000
    },
    {
      "epoch": 12.128962256839495,
      "grad_norm": 7.150822162628174,
      "learning_rate": 7.871037743160505e-06,
      "loss": 2.4091,
      "step": 156500
    },
    {
      "epoch": 12.167712934976363,
      "grad_norm": 6.7067131996154785,
      "learning_rate": 7.832287065023639e-06,
      "loss": 2.4057,
      "step": 157000
    },
    {
      "epoch": 12.206463613113229,
      "grad_norm": 6.288236141204834,
      "learning_rate": 7.79353638688677e-06,
      "loss": 2.4024,
      "step": 157500
    },
    {
      "epoch": 12.245214291250097,
      "grad_norm": 6.532754898071289,
      "learning_rate": 7.754785708749904e-06,
      "loss": 2.4119,
      "step": 158000
    },
    {
      "epoch": 12.283964969386965,
      "grad_norm": 6.437507629394531,
      "learning_rate": 7.716035030613036e-06,
      "loss": 2.4048,
      "step": 158500
    },
    {
      "epoch": 12.322715647523832,
      "grad_norm": 6.648064136505127,
      "learning_rate": 7.677284352476169e-06,
      "loss": 2.3954,
      "step": 159000
    },
    {
      "epoch": 12.361466325660698,
      "grad_norm": 6.406070232391357,
      "learning_rate": 7.6385336743393e-06,
      "loss": 2.4069,
      "step": 159500
    },
    {
      "epoch": 12.400217003797566,
      "grad_norm": 6.75925350189209,
      "learning_rate": 7.5997829962024335e-06,
      "loss": 2.3803,
      "step": 160000
    },
    {
      "epoch": 12.438967681934434,
      "grad_norm": 7.390876770019531,
      "learning_rate": 7.561032318065566e-06,
      "loss": 2.3952,
      "step": 160500
    },
    {
      "epoch": 12.477718360071302,
      "grad_norm": 6.584438800811768,
      "learning_rate": 7.522281639928699e-06,
      "loss": 2.3921,
      "step": 161000
    },
    {
      "epoch": 12.516469038208168,
      "grad_norm": 6.7814040184021,
      "learning_rate": 7.483530961791831e-06,
      "loss": 2.4035,
      "step": 161500
    },
    {
      "epoch": 12.555219716345036,
      "grad_norm": 6.544926166534424,
      "learning_rate": 7.444780283654964e-06,
      "loss": 2.3855,
      "step": 162000
    },
    {
      "epoch": 12.593970394481904,
      "grad_norm": 6.649155139923096,
      "learning_rate": 7.406029605518097e-06,
      "loss": 2.3884,
      "step": 162500
    },
    {
      "epoch": 12.632721072618772,
      "grad_norm": 6.128752708435059,
      "learning_rate": 7.367278927381229e-06,
      "loss": 2.3915,
      "step": 163000
    },
    {
      "epoch": 12.671471750755638,
      "grad_norm": 6.694360733032227,
      "learning_rate": 7.328528249244362e-06,
      "loss": 2.4065,
      "step": 163500
    },
    {
      "epoch": 12.710222428892505,
      "grad_norm": 6.9979963302612305,
      "learning_rate": 7.2897775711074945e-06,
      "loss": 2.3816,
      "step": 164000
    },
    {
      "epoch": 12.748973107029373,
      "grad_norm": 6.7657294273376465,
      "learning_rate": 7.251026892970627e-06,
      "loss": 2.385,
      "step": 164500
    },
    {
      "epoch": 12.787723785166241,
      "grad_norm": 7.142265796661377,
      "learning_rate": 7.21227621483376e-06,
      "loss": 2.3809,
      "step": 165000
    },
    {
      "epoch": 12.826474463303107,
      "grad_norm": 6.2213134765625,
      "learning_rate": 7.1735255366968924e-06,
      "loss": 2.3883,
      "step": 165500
    },
    {
      "epoch": 12.865225141439975,
      "grad_norm": 6.274342060089111,
      "learning_rate": 7.134774858560025e-06,
      "loss": 2.3838,
      "step": 166000
    },
    {
      "epoch": 12.903975819576843,
      "grad_norm": 6.5893049240112305,
      "learning_rate": 7.096024180423158e-06,
      "loss": 2.3832,
      "step": 166500
    },
    {
      "epoch": 12.94272649771371,
      "grad_norm": 6.229060173034668,
      "learning_rate": 7.05727350228629e-06,
      "loss": 2.3839,
      "step": 167000
    },
    {
      "epoch": 12.981477175850577,
      "grad_norm": 7.251420497894287,
      "learning_rate": 7.018522824149423e-06,
      "loss": 2.3838,
      "step": 167500
    },
    {
      "epoch": 13.0,
      "eval_loss": 2.3215689659118652,
      "eval_runtime": 259.7568,
      "eval_samples_per_second": 794.84,
      "eval_steps_per_second": 12.423,
      "step": 167739
    },
{ |
|
"epoch": 13.020227853987445, |
|
"grad_norm": 5.944735050201416, |
|
"learning_rate": 6.979772146012556e-06, |
|
"loss": 2.3687, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 13.058978532124312, |
|
"grad_norm": 6.25685977935791, |
|
"learning_rate": 6.941021467875688e-06, |
|
"loss": 2.3761, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 13.09772921026118, |
|
"grad_norm": 6.244680881500244, |
|
"learning_rate": 6.902270789738821e-06, |
|
"loss": 2.3463, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 13.136479888398046, |
|
"grad_norm": 6.370804309844971, |
|
"learning_rate": 6.8635201116019535e-06, |
|
"loss": 2.3597, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 13.175230566534914, |
|
"grad_norm": 6.249234676361084, |
|
"learning_rate": 6.824769433465086e-06, |
|
"loss": 2.3679, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 13.213981244671782, |
|
"grad_norm": 6.973300933837891, |
|
"learning_rate": 6.786018755328219e-06, |
|
"loss": 2.3669, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 13.25273192280865, |
|
"grad_norm": 7.319492816925049, |
|
"learning_rate": 6.747268077191351e-06, |
|
"loss": 2.3528, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 13.291482600945516, |
|
"grad_norm": 6.924526214599609, |
|
"learning_rate": 6.708517399054484e-06, |
|
"loss": 2.3662, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 13.330233279082384, |
|
"grad_norm": 6.761091709136963, |
|
"learning_rate": 6.669766720917617e-06, |
|
"loss": 2.3608, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 13.368983957219251, |
|
"grad_norm": 6.105197429656982, |
|
"learning_rate": 6.631016042780749e-06, |
|
"loss": 2.3536, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 13.40773463535612, |
|
"grad_norm": 6.724457740783691, |
|
"learning_rate": 6.592265364643882e-06, |
|
"loss": 2.3682, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 13.446485313492985, |
|
"grad_norm": 6.62090539932251, |
|
"learning_rate": 6.553514686507015e-06, |
|
"loss": 2.3549, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 13.485235991629853, |
|
"grad_norm": 6.862425327301025, |
|
"learning_rate": 6.514764008370147e-06, |
|
"loss": 2.3475, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 13.523986669766721, |
|
"grad_norm": 6.164032936096191, |
|
"learning_rate": 6.47601333023328e-06, |
|
"loss": 2.3625, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 13.562737347903589, |
|
"grad_norm": 7.522220134735107, |
|
"learning_rate": 6.4372626520964125e-06, |
|
"loss": 2.3676, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 13.601488026040455, |
|
"grad_norm": 6.564206600189209, |
|
"learning_rate": 6.398511973959545e-06, |
|
"loss": 2.3606, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 13.640238704177323, |
|
"grad_norm": 6.069074630737305, |
|
"learning_rate": 6.359761295822677e-06, |
|
"loss": 2.3644, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 13.67898938231419, |
|
"grad_norm": 6.570771217346191, |
|
"learning_rate": 6.3210106176858095e-06, |
|
"loss": 2.3711, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 13.717740060451058, |
|
"grad_norm": 6.1281609535217285, |
|
"learning_rate": 6.282259939548942e-06, |
|
"loss": 2.348, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 13.756490738587924, |
|
"grad_norm": 6.176905632019043, |
|
"learning_rate": 6.243509261412075e-06, |
|
"loss": 2.379, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 13.795241416724792, |
|
"grad_norm": 7.890781402587891, |
|
"learning_rate": 6.2047585832752074e-06, |
|
"loss": 2.365, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 13.83399209486166, |
|
"grad_norm": 6.160940647125244, |
|
"learning_rate": 6.16600790513834e-06, |
|
"loss": 2.3391, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 13.872742772998528, |
|
"grad_norm": 6.732828617095947, |
|
"learning_rate": 6.127257227001473e-06, |
|
"loss": 2.355, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 13.911493451135394, |
|
"grad_norm": 6.500529766082764, |
|
"learning_rate": 6.088506548864605e-06, |
|
"loss": 2.3512, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 13.950244129272262, |
|
"grad_norm": 7.362790584564209, |
|
"learning_rate": 6.049755870727738e-06, |
|
"loss": 2.3654, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 13.98899480740913, |
|
"grad_norm": 7.070291519165039, |
|
"learning_rate": 6.011005192590871e-06, |
|
"loss": 2.3444, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 2.2924630641937256, |
|
"eval_runtime": 259.3076, |
|
"eval_samples_per_second": 796.217, |
|
"eval_steps_per_second": 12.445, |
|
"step": 180642 |
|
}, |
|
{ |
|
"epoch": 14.027745485545998, |
|
"grad_norm": 7.284486293792725, |
|
"learning_rate": 5.972254514454003e-06, |
|
"loss": 2.3296, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 14.066496163682864, |
|
"grad_norm": 7.636621952056885, |
|
"learning_rate": 5.933503836317136e-06, |
|
"loss": 2.3314, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 14.105246841819731, |
|
"grad_norm": 6.692602634429932, |
|
"learning_rate": 5.8947531581802685e-06, |
|
"loss": 2.3363, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 14.1439975199566, |
|
"grad_norm": 6.751750469207764, |
|
"learning_rate": 5.856002480043401e-06, |
|
"loss": 2.3174, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 14.182748198093467, |
|
"grad_norm": 7.041817665100098, |
|
"learning_rate": 5.817251801906534e-06, |
|
"loss": 2.3295, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 14.221498876230335, |
|
"grad_norm": 7.414912700653076, |
|
"learning_rate": 5.778501123769666e-06, |
|
"loss": 2.3386, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 14.260249554367201, |
|
"grad_norm": 7.009491920471191, |
|
"learning_rate": 5.739750445632799e-06, |
|
"loss": 2.3282, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 14.299000232504069, |
|
"grad_norm": 6.77699089050293, |
|
"learning_rate": 5.700999767495932e-06, |
|
"loss": 2.3323, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 14.337750910640937, |
|
"grad_norm": 6.922458171844482, |
|
"learning_rate": 5.662249089359064e-06, |
|
"loss": 2.3545, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 14.376501588777803, |
|
"grad_norm": 7.635495185852051, |
|
"learning_rate": 5.623498411222197e-06, |
|
"loss": 2.3429, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 14.41525226691467, |
|
"grad_norm": 6.657200813293457, |
|
"learning_rate": 5.58474773308533e-06, |
|
"loss": 2.3371, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 14.454002945051538, |
|
"grad_norm": 6.328368663787842, |
|
"learning_rate": 5.545997054948462e-06, |
|
"loss": 2.3225, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 14.492753623188406, |
|
"grad_norm": 6.7084503173828125, |
|
"learning_rate": 5.507246376811595e-06, |
|
"loss": 2.3141, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 14.531504301325274, |
|
"grad_norm": 6.23046875, |
|
"learning_rate": 5.4684956986747275e-06, |
|
"loss": 2.3387, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 14.57025497946214, |
|
"grad_norm": 6.53918981552124, |
|
"learning_rate": 5.42974502053786e-06, |
|
"loss": 2.3355, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 14.609005657599008, |
|
"grad_norm": 6.816432952880859, |
|
"learning_rate": 5.390994342400993e-06, |
|
"loss": 2.3409, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 14.647756335735876, |
|
"grad_norm": 6.9504475593566895, |
|
"learning_rate": 5.352243664264125e-06, |
|
"loss": 2.3274, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 14.686507013872744, |
|
"grad_norm": 7.058226585388184, |
|
"learning_rate": 5.313492986127258e-06, |
|
"loss": 2.3295, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 14.72525769200961, |
|
"grad_norm": 6.337547302246094, |
|
"learning_rate": 5.274742307990391e-06, |
|
"loss": 2.316, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 14.764008370146477, |
|
"grad_norm": 7.420670032501221, |
|
"learning_rate": 5.235991629853523e-06, |
|
"loss": 2.3313, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 14.802759048283345, |
|
"grad_norm": 6.559388637542725, |
|
"learning_rate": 5.197240951716656e-06, |
|
"loss": 2.3368, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 14.841509726420213, |
|
"grad_norm": 6.416265487670898, |
|
"learning_rate": 5.1584902735797886e-06, |
|
"loss": 2.3139, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 14.88026040455708, |
|
"grad_norm": 6.204991817474365, |
|
"learning_rate": 5.11973959544292e-06, |
|
"loss": 2.3209, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 14.919011082693947, |
|
"grad_norm": 7.657558441162109, |
|
"learning_rate": 5.080988917306053e-06, |
|
"loss": 2.3346, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 14.957761760830815, |
|
"grad_norm": 6.812448024749756, |
|
"learning_rate": 5.042238239169186e-06, |
|
"loss": 2.3226, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 14.996512438967683, |
|
"grad_norm": 5.866453170776367, |
|
"learning_rate": 5.003487561032318e-06, |
|
"loss": 2.3034, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 2.2758021354675293, |
|
"eval_runtime": 268.9287, |
|
"eval_samples_per_second": 767.731, |
|
"eval_steps_per_second": 11.999, |
|
"step": 193545 |
|
}, |
|
{ |
|
"epoch": 15.035263117104549, |
|
"grad_norm": 6.998913288116455, |
|
"learning_rate": 4.964736882895451e-06, |
|
"loss": 2.3103, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 15.074013795241417, |
|
"grad_norm": 7.022980213165283, |
|
"learning_rate": 4.9259862047585835e-06, |
|
"loss": 2.3121, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 15.112764473378284, |
|
"grad_norm": 6.3553056716918945, |
|
"learning_rate": 4.887235526621716e-06, |
|
"loss": 2.325, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 15.151515151515152, |
|
"grad_norm": 7.574887752532959, |
|
"learning_rate": 4.848484848484849e-06, |
|
"loss": 2.3128, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 15.190265829652018, |
|
"grad_norm": 6.3977556228637695, |
|
"learning_rate": 4.809734170347981e-06, |
|
"loss": 2.3058, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 15.229016507788886, |
|
"grad_norm": 6.198862552642822, |
|
"learning_rate": 4.770983492211114e-06, |
|
"loss": 2.3111, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 15.267767185925754, |
|
"grad_norm": 7.1892499923706055, |
|
"learning_rate": 4.732232814074247e-06, |
|
"loss": 2.3181, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 15.306517864062622, |
|
"grad_norm": 6.773824214935303, |
|
"learning_rate": 4.693482135937379e-06, |
|
"loss": 2.3158, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 15.345268542199488, |
|
"grad_norm": 6.595972537994385, |
|
"learning_rate": 4.654731457800512e-06, |
|
"loss": 2.2989, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 15.384019220336356, |
|
"grad_norm": 7.397641658782959, |
|
"learning_rate": 4.615980779663645e-06, |
|
"loss": 2.3143, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 15.422769898473224, |
|
"grad_norm": 7.2511820793151855, |
|
"learning_rate": 4.577230101526777e-06, |
|
"loss": 2.3077, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 15.461520576610091, |
|
"grad_norm": 6.52310848236084, |
|
"learning_rate": 4.53847942338991e-06, |
|
"loss": 2.3062, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 15.500271254746957, |
|
"grad_norm": 6.681788921356201, |
|
"learning_rate": 4.4997287452530425e-06, |
|
"loss": 2.3078, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 15.539021932883825, |
|
"grad_norm": 7.010565280914307, |
|
"learning_rate": 4.460978067116175e-06, |
|
"loss": 2.3031, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 15.577772611020693, |
|
"grad_norm": 7.412187576293945, |
|
"learning_rate": 4.422227388979308e-06, |
|
"loss": 2.3029, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 15.616523289157561, |
|
"grad_norm": 6.586581707000732, |
|
"learning_rate": 4.38347671084244e-06, |
|
"loss": 2.3092, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 15.655273967294427, |
|
"grad_norm": 6.430338382720947, |
|
"learning_rate": 4.344726032705573e-06, |
|
"loss": 2.2972, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 15.694024645431295, |
|
"grad_norm": 6.151809215545654, |
|
"learning_rate": 4.305975354568706e-06, |
|
"loss": 2.2972, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 15.732775323568163, |
|
"grad_norm": 7.195096492767334, |
|
"learning_rate": 4.267224676431838e-06, |
|
"loss": 2.3045, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 15.77152600170503, |
|
"grad_norm": 6.76158332824707, |
|
"learning_rate": 4.228473998294971e-06, |
|
"loss": 2.2995, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 15.810276679841897, |
|
"grad_norm": 6.710601329803467, |
|
"learning_rate": 4.1897233201581036e-06, |
|
"loss": 2.3045, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 15.849027357978764, |
|
"grad_norm": 6.813743591308594, |
|
"learning_rate": 4.150972642021236e-06, |
|
"loss": 2.3114, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 15.887778036115632, |
|
"grad_norm": 7.168315410614014, |
|
"learning_rate": 4.112221963884369e-06, |
|
"loss": 2.2995, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 15.9265287142525, |
|
"grad_norm": 6.606774806976318, |
|
"learning_rate": 4.0734712857475015e-06, |
|
"loss": 2.3023, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 15.965279392389366, |
|
"grad_norm": 6.852230548858643, |
|
"learning_rate": 4.034720607610634e-06, |
|
"loss": 2.311, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 2.252058982849121, |
|
"eval_runtime": 272.2097, |
|
"eval_samples_per_second": 758.478, |
|
"eval_steps_per_second": 11.855, |
|
"step": 206448 |
|
}, |
|
{ |
|
"epoch": 16.004030070526234, |
|
"grad_norm": 7.245954990386963, |
|
"learning_rate": 3.995969929473767e-06, |
|
"loss": 2.2896, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 16.0427807486631, |
|
"grad_norm": 6.094116687774658, |
|
"learning_rate": 3.957219251336899e-06, |
|
"loss": 2.2999, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 16.08153142679997, |
|
"grad_norm": 6.302695274353027, |
|
"learning_rate": 3.918468573200031e-06, |
|
"loss": 2.3017, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 16.120282104936837, |
|
"grad_norm": 6.800222873687744, |
|
"learning_rate": 3.879717895063164e-06, |
|
"loss": 2.2876, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 16.159032783073705, |
|
"grad_norm": 7.139950752258301, |
|
"learning_rate": 3.840967216926296e-06, |
|
"loss": 2.2975, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 16.19778346121057, |
|
"grad_norm": 6.805322170257568, |
|
"learning_rate": 3.802216538789429e-06, |
|
"loss": 2.2994, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 16.236534139347437, |
|
"grad_norm": 6.6877336502075195, |
|
"learning_rate": 3.7634658606525617e-06, |
|
"loss": 2.277, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 16.275284817484305, |
|
"grad_norm": 6.0831193923950195, |
|
"learning_rate": 3.724715182515694e-06, |
|
"loss": 2.3029, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 16.314035495621173, |
|
"grad_norm": 6.021631240844727, |
|
"learning_rate": 3.6859645043788265e-06, |
|
"loss": 2.2959, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 16.35278617375804, |
|
"grad_norm": 7.072383403778076, |
|
"learning_rate": 3.647213826241959e-06, |
|
"loss": 2.2709, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 16.39153685189491, |
|
"grad_norm": 6.0719404220581055, |
|
"learning_rate": 3.608463148105092e-06, |
|
"loss": 2.2952, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 16.430287530031777, |
|
"grad_norm": 6.733717441558838, |
|
"learning_rate": 3.5697124699682244e-06, |
|
"loss": 2.2984, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 16.469038208168644, |
|
"grad_norm": 7.269532203674316, |
|
"learning_rate": 3.530961791831357e-06, |
|
"loss": 2.2855, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 16.50778888630551, |
|
"grad_norm": 7.440357208251953, |
|
"learning_rate": 3.4922111136944897e-06, |
|
"loss": 2.2747, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 16.546539564442377, |
|
"grad_norm": 7.448116302490234, |
|
"learning_rate": 3.4534604355576223e-06, |
|
"loss": 2.2933, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 16.585290242579244, |
|
"grad_norm": 6.202878475189209, |
|
"learning_rate": 3.414709757420755e-06, |
|
"loss": 2.2963, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 16.624040920716112, |
|
"grad_norm": 7.019168376922607, |
|
"learning_rate": 3.3759590792838876e-06, |
|
"loss": 2.2667, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 16.66279159885298, |
|
"grad_norm": 6.448665142059326, |
|
"learning_rate": 3.3372084011470202e-06, |
|
"loss": 2.2905, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 16.701542276989848, |
|
"grad_norm": 6.160965442657471, |
|
"learning_rate": 3.298457723010153e-06, |
|
"loss": 2.2854, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 16.740292955126716, |
|
"grad_norm": 6.956637859344482, |
|
"learning_rate": 3.2597070448732855e-06, |
|
"loss": 2.2944, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 16.779043633263584, |
|
"grad_norm": 6.935131549835205, |
|
"learning_rate": 3.220956366736418e-06, |
|
"loss": 2.2795, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 16.817794311400448, |
|
"grad_norm": 6.656859397888184, |
|
"learning_rate": 3.1822056885995508e-06, |
|
"loss": 2.2872, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 16.856544989537316, |
|
"grad_norm": 6.204549312591553, |
|
"learning_rate": 3.1434550104626834e-06, |
|
"loss": 2.2832, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 16.895295667674183, |
|
"grad_norm": 6.77413272857666, |
|
"learning_rate": 3.1047043323258156e-06, |
|
"loss": 2.2719, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 16.93404634581105, |
|
"grad_norm": 6.447382926940918, |
|
"learning_rate": 3.0659536541889482e-06, |
|
"loss": 2.2702, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 16.97279702394792, |
|
"grad_norm": 7.396761894226074, |
|
"learning_rate": 3.027202976052081e-06, |
|
"loss": 2.2813, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 2.2362165451049805, |
|
"eval_runtime": 266.8391, |
|
"eval_samples_per_second": 773.743, |
|
"eval_steps_per_second": 12.093, |
|
"step": 219351 |
|
}, |
|
{ |
|
"epoch": 17.011547702084787, |
|
"grad_norm": 6.575385093688965, |
|
"learning_rate": 2.9884522979152135e-06, |
|
"loss": 2.2747, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 17.050298380221655, |
|
"grad_norm": 7.168444633483887, |
|
"learning_rate": 2.949701619778346e-06, |
|
"loss": 2.2868, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 17.089049058358523, |
|
"grad_norm": 7.069167613983154, |
|
"learning_rate": 2.9109509416414788e-06, |
|
"loss": 2.2838, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 17.12779973649539, |
|
"grad_norm": 6.792834758758545, |
|
"learning_rate": 2.8722002635046114e-06, |
|
"loss": 2.2836, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 17.166550414632255, |
|
"grad_norm": 6.546488285064697, |
|
"learning_rate": 2.833449585367744e-06, |
|
"loss": 2.2733, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 17.205301092769123, |
|
"grad_norm": 6.293231010437012, |
|
"learning_rate": 2.7946989072308767e-06, |
|
"loss": 2.2688, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 17.24405177090599, |
|
"grad_norm": 6.560914039611816, |
|
"learning_rate": 2.7559482290940093e-06, |
|
"loss": 2.2787, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 17.28280244904286, |
|
"grad_norm": 6.571765422821045, |
|
"learning_rate": 2.717197550957142e-06, |
|
"loss": 2.2801, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 17.321553127179726, |
|
"grad_norm": 7.396661281585693, |
|
"learning_rate": 2.6784468728202746e-06, |
|
"loss": 2.28, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 17.360303805316594, |
|
"grad_norm": 6.239862442016602, |
|
"learning_rate": 2.6396961946834072e-06, |
|
"loss": 2.2743, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 17.39905448345346, |
|
"grad_norm": 6.766594886779785, |
|
"learning_rate": 2.60094551654654e-06, |
|
"loss": 2.2456, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 17.43780516159033, |
|
"grad_norm": 6.488914966583252, |
|
"learning_rate": 2.5621948384096725e-06, |
|
"loss": 2.2666, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 17.476555839727194, |
|
"grad_norm": 6.036900043487549, |
|
"learning_rate": 2.523444160272805e-06, |
|
"loss": 2.2577, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 17.51530651786406, |
|
"grad_norm": 6.977652549743652, |
|
"learning_rate": 2.4846934821359373e-06, |
|
"loss": 2.2657, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 17.55405719600093, |
|
"grad_norm": 6.468418121337891, |
|
"learning_rate": 2.44594280399907e-06, |
|
"loss": 2.2737, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 17.592807874137797, |
|
"grad_norm": 6.7042646408081055, |
|
"learning_rate": 2.4071921258622026e-06, |
|
"loss": 2.2685, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 17.631558552274665, |
|
"grad_norm": 6.591056823730469, |
|
"learning_rate": 2.3684414477253352e-06, |
|
"loss": 2.2836, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 17.670309230411533, |
|
"grad_norm": 7.078721523284912, |
|
"learning_rate": 2.329690769588468e-06, |
|
"loss": 2.2754, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 17.7090599085484, |
|
"grad_norm": 6.701901435852051, |
|
"learning_rate": 2.2909400914516005e-06, |
|
"loss": 2.2494, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 17.74781058668527, |
|
"grad_norm": 6.622567176818848, |
|
"learning_rate": 2.252189413314733e-06, |
|
"loss": 2.2689, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 17.786561264822133, |
|
"grad_norm": 6.573280334472656, |
|
"learning_rate": 2.2134387351778658e-06, |
|
"loss": 2.271, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 17.825311942959, |
|
"grad_norm": 6.9067206382751465, |
|
"learning_rate": 2.1746880570409984e-06, |
|
"loss": 2.2573, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 17.86406262109587, |
|
"grad_norm": 6.601592063903809, |
|
"learning_rate": 2.135937378904131e-06, |
|
"loss": 2.2743, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 17.902813299232736, |
|
"grad_norm": 6.949497699737549, |
|
"learning_rate": 2.0971867007672637e-06, |
|
"loss": 2.2644, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 17.941563977369604, |
|
"grad_norm": 5.614126205444336, |
|
"learning_rate": 2.0584360226303963e-06, |
|
"loss": 2.2608, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 17.980314655506472, |
|
"grad_norm": 6.880855560302734, |
|
"learning_rate": 2.019685344493529e-06, |
|
"loss": 2.2862, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 2.229489326477051, |
|
"eval_runtime": 270.0811, |
|
"eval_samples_per_second": 764.456, |
|
"eval_steps_per_second": 11.948, |
|
"step": 232254 |
|
}, |
|
{ |
|
"epoch": 18.01906533364334, |
|
"grad_norm": 6.630836486816406, |
|
"learning_rate": 1.9809346663566616e-06, |
|
"loss": 2.2632, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 18.057816011780208, |
|
"grad_norm": 6.50869607925415, |
|
"learning_rate": 1.942183988219794e-06, |
|
"loss": 2.2584, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 18.096566689917072, |
|
"grad_norm": 6.81369161605835, |
|
"learning_rate": 1.9034333100829266e-06, |
|
"loss": 2.2599, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 18.13531736805394, |
|
"grad_norm": 6.202197074890137, |
|
"learning_rate": 1.8646826319460593e-06, |
|
"loss": 2.2532, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 18.174068046190808, |
|
"grad_norm": 6.907183647155762, |
|
"learning_rate": 1.8259319538091919e-06, |
|
"loss": 2.2553, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 18.212818724327676, |
|
"grad_norm": 7.445714473724365, |
|
"learning_rate": 1.7871812756723245e-06, |
|
"loss": 2.2586, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 18.251569402464543, |
|
"grad_norm": 6.844184398651123, |
|
"learning_rate": 1.7484305975354572e-06, |
|
"loss": 2.2502, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 18.29032008060141, |
|
"grad_norm": 6.495091438293457, |
|
"learning_rate": 1.7096799193985896e-06, |
|
"loss": 2.2703, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 18.32907075873828, |
|
"grad_norm": 6.848631858825684, |
|
"learning_rate": 1.6709292412617222e-06, |
|
"loss": 2.2494, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 18.367821436875147, |
|
"grad_norm": 6.527080535888672, |
|
"learning_rate": 1.6321785631248548e-06, |
|
"loss": 2.2676, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 18.40657211501201, |
|
"grad_norm": 6.402927875518799, |
|
"learning_rate": 1.5934278849879875e-06, |
|
"loss": 2.256, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 18.44532279314888, |
|
"grad_norm": 6.720060348510742, |
|
"learning_rate": 1.5546772068511201e-06, |
|
"loss": 2.256, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 18.484073471285747, |
|
"grad_norm": 6.392049312591553, |
|
"learning_rate": 1.5159265287142528e-06, |
|
"loss": 2.272, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 18.522824149422615, |
|
"grad_norm": 6.625200271606445, |
|
"learning_rate": 1.4771758505773854e-06, |
|
"loss": 2.2561, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 18.561574827559483, |
|
"grad_norm": 6.451653003692627, |
|
"learning_rate": 1.438425172440518e-06, |
|
"loss": 2.2518, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 18.60032550569635, |
|
"grad_norm": 6.246822357177734, |
|
"learning_rate": 1.3996744943036504e-06, |
|
"loss": 2.2541, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 18.639076183833218, |
|
"grad_norm": 6.265354156494141, |
|
"learning_rate": 1.360923816166783e-06, |
|
"loss": 2.2546, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 18.677826861970086, |
|
"grad_norm": 6.439133644104004, |
|
"learning_rate": 1.3221731380299157e-06, |
|
"loss": 2.2583, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 18.71657754010695, |
|
"grad_norm": 6.528525352478027, |
|
"learning_rate": 1.2834224598930483e-06, |
|
"loss": 2.2467, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 18.755328218243818, |
|
"grad_norm": 7.4315900802612305, |
|
"learning_rate": 1.2446717817561808e-06, |
|
"loss": 2.2585, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 18.794078896380686, |
|
"grad_norm": 7.4202141761779785, |
|
"learning_rate": 1.2059211036193134e-06, |
|
"loss": 2.2637, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 18.832829574517554, |
|
"grad_norm": 6.3204145431518555, |
|
"learning_rate": 1.167170425482446e-06, |
|
"loss": 2.264, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 18.87158025265442, |
|
"grad_norm": 6.220766067504883, |
|
"learning_rate": 1.1284197473455787e-06, |
|
"loss": 2.2705, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 18.91033093079129, |
|
"grad_norm": 6.558001518249512, |
|
"learning_rate": 1.0896690692087113e-06, |
|
"loss": 2.2632, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 18.949081608928157, |
|
"grad_norm": 6.786870956420898, |
|
"learning_rate": 1.050918391071844e-06, |
|
"loss": 2.2441, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 18.987832287065025, |
|
"grad_norm": 6.955057621002197, |
|
"learning_rate": 1.0121677129349766e-06, |
|
"loss": 2.2503, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 2.2231059074401855, |
|
"eval_runtime": 272.187, |
|
"eval_samples_per_second": 758.541, |
|
"eval_steps_per_second": 11.856, |
|
"step": 245157 |
|
}, |
|
{ |
|
"epoch": 19.02658296520189, |
|
"grad_norm": 6.136529922485352, |
|
"learning_rate": 9.73417034798109e-07, |
|
"loss": 2.2528, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 19.065333643338757, |
|
"grad_norm": 7.144802093505859, |
|
"learning_rate": 9.346663566612417e-07, |
|
"loss": 2.248, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 19.104084321475625, |
|
"grad_norm": 5.582034111022949, |
|
"learning_rate": 8.959156785243743e-07, |
|
"loss": 2.2513, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 19.142834999612493, |
|
"grad_norm": 6.747804164886475, |
|
"learning_rate": 8.571650003875069e-07, |
|
"loss": 2.2647, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 19.18158567774936, |
|
"grad_norm": 6.1470417976379395, |
|
"learning_rate": 8.184143222506395e-07, |
|
"loss": 2.2548, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 19.22033635588623, |
|
"grad_norm": 6.574125289916992, |
|
"learning_rate": 7.79663644113772e-07, |
|
"loss": 2.2714, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 19.259087034023096, |
|
"grad_norm": 6.6587982177734375, |
|
"learning_rate": 7.409129659769046e-07, |
|
"loss": 2.2491, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 19.297837712159964, |
|
"grad_norm": 6.578282356262207, |
|
"learning_rate": 7.021622878400372e-07, |
|
"loss": 2.2483, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 19.33658839029683, |
|
"grad_norm": 6.449355602264404, |
|
"learning_rate": 6.634116097031699e-07, |
|
"loss": 2.2558, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 19.375339068433696, |
|
"grad_norm": 5.921240329742432, |
|
"learning_rate": 6.246609315663025e-07, |
|
"loss": 2.2428, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 19.414089746570564, |
|
"grad_norm": 6.655218124389648, |
|
"learning_rate": 5.859102534294351e-07, |
|
"loss": 2.2616, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 19.452840424707432, |
|
"grad_norm": 6.733659744262695, |
|
"learning_rate": 5.471595752925676e-07, |
|
"loss": 2.2481, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 19.4915911028443, |
|
"grad_norm": 6.9586968421936035, |
|
"learning_rate": 5.084088971557003e-07, |
|
"loss": 2.2495, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 19.530341780981168, |
|
"grad_norm": 6.441699028015137, |
|
"learning_rate": 4.6965821901883286e-07, |
|
"loss": 2.2456, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 19.569092459118036, |
|
"grad_norm": 6.126708984375, |
|
"learning_rate": 4.3090754088196544e-07, |
|
"loss": 2.2561, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 19.607843137254903, |
|
"grad_norm": 6.69553279876709, |
|
"learning_rate": 3.921568627450981e-07, |
|
"loss": 2.2435, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 19.646593815391768, |
|
"grad_norm": 7.468321800231934, |
|
"learning_rate": 3.5340618460823066e-07, |
|
"loss": 2.2517, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 19.685344493528635, |
|
"grad_norm": 6.724938869476318, |
|
"learning_rate": 3.146555064713633e-07, |
|
"loss": 2.2307, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 19.724095171665503, |
|
"grad_norm": 6.407966613769531, |
|
"learning_rate": 2.7590482833449587e-07, |
|
"loss": 2.2469, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 19.76284584980237, |
|
"grad_norm": 6.521556377410889, |
|
"learning_rate": 2.3715415019762845e-07, |
|
"loss": 2.2266, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 19.80159652793924, |
|
"grad_norm": 6.066943645477295, |
|
"learning_rate": 1.9840347206076106e-07, |
|
"loss": 2.2448, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 19.840347206076107, |
|
"grad_norm": 6.806808948516846, |
|
"learning_rate": 1.5965279392389367e-07, |
|
"loss": 2.2355, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 19.879097884212975, |
|
"grad_norm": 6.567816734313965, |
|
"learning_rate": 1.2090211578702627e-07, |
|
"loss": 2.2581, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 19.917848562349842, |
|
"grad_norm": 7.037693977355957, |
|
"learning_rate": 8.21514376501589e-08, |
|
"loss": 2.2558, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 19.95659924048671, |
|
"grad_norm": 7.10353422164917, |
|
"learning_rate": 4.340075951329148e-08, |
|
"loss": 2.2513, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 19.995349918623575, |
|
"grad_norm": 7.407341003417969, |
|
"learning_rate": 4.6500813764240875e-09, |
|
"loss": 2.2448, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 2.211674213409424, |
|
"eval_runtime": 270.2294, |
|
"eval_samples_per_second": 764.036, |
|
"eval_steps_per_second": 11.942, |
|
"step": 258060 |
|
}, |
|
{ |
|
"epoch": 20.034100596760442, |
|
"grad_norm": 6.319613456726074, |
|
"learning_rate": 6.643932935493038e-06, |
|
"loss": 2.2464, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 20.07285127489731, |
|
"grad_norm": 10.947772026062012, |
|
"learning_rate": 6.61809915006846e-06, |
|
"loss": 2.2717, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 20.111601953034178, |
|
"grad_norm": 6.688451290130615, |
|
"learning_rate": 6.592265364643882e-06, |
|
"loss": 2.246, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 20.150352631171046, |
|
"grad_norm": 7.084783554077148, |
|
"learning_rate": 6.566431579219303e-06, |
|
"loss": 2.2547, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 20.189103309307914, |
|
"grad_norm": 7.182523250579834, |
|
"learning_rate": 6.540597793794725e-06, |
|
"loss": 2.2673, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 20.22785398744478, |
|
"grad_norm": 6.572226524353027, |
|
"learning_rate": 6.514764008370147e-06, |
|
"loss": 2.2696, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 20.26660466558165, |
|
"grad_norm": 6.861509323120117, |
|
"learning_rate": 6.488930222945569e-06, |
|
"loss": 2.2602, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 20.305355343718514, |
|
"grad_norm": 7.068969249725342, |
|
"learning_rate": 6.46309643752099e-06, |
|
"loss": 2.2736, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 20.34410602185538, |
|
"grad_norm": 6.5293660163879395, |
|
"learning_rate": 6.4372626520964125e-06, |
|
"loss": 2.2698, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 20.38285669999225, |
|
"grad_norm": 6.285311698913574, |
|
"learning_rate": 6.411428866671834e-06, |
|
"loss": 2.2671, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 20.421607378129117, |
|
"grad_norm": 6.466723918914795, |
|
"learning_rate": 6.3855950812472554e-06, |
|
"loss": 2.267, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 20.460358056265985, |
|
"grad_norm": 7.045479774475098, |
|
"learning_rate": 6.359761295822677e-06, |
|
"loss": 2.2499, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 20.499108734402853, |
|
"grad_norm": 7.05580472946167, |
|
"learning_rate": 6.333927510398099e-06, |
|
"loss": 2.2793, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 20.53785941253972, |
|
"grad_norm": 7.213685035705566, |
|
"learning_rate": 6.308093724973521e-06, |
|
"loss": 2.2519, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 20.57661009067659, |
|
"grad_norm": 6.6378984451293945, |
|
"learning_rate": 6.282259939548942e-06, |
|
"loss": 2.2699, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 20.615360768813453, |
|
"grad_norm": 6.8442463874816895, |
|
"learning_rate": 6.2564261541243645e-06, |
|
"loss": 2.2697, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 20.65411144695032, |
|
"grad_norm": 7.099138259887695, |
|
"learning_rate": 6.230592368699786e-06, |
|
"loss": 2.2622, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 20.69286212508719, |
|
"grad_norm": 6.572378635406494, |
|
"learning_rate": 6.2047585832752074e-06, |
|
"loss": 2.2709, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 20.731612803224056, |
|
"grad_norm": 6.410079479217529, |
|
"learning_rate": 6.17892479785063e-06, |
|
"loss": 2.2599, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 20.770363481360924, |
|
"grad_norm": 7.154236316680908, |
|
"learning_rate": 6.153091012426051e-06, |
|
"loss": 2.2654, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 20.809114159497792, |
|
"grad_norm": 7.05757999420166, |
|
"learning_rate": 6.127257227001473e-06, |
|
"loss": 2.2673, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 20.84786483763466, |
|
"grad_norm": 7.457660675048828, |
|
"learning_rate": 6.101423441576895e-06, |
|
"loss": 2.2534, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 20.886615515771528, |
|
"grad_norm": 6.697342872619629, |
|
"learning_rate": 6.0755896561523165e-06, |
|
"loss": 2.2721, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 20.925366193908392, |
|
"grad_norm": 6.83280611038208, |
|
"learning_rate": 6.049755870727738e-06, |
|
"loss": 2.276, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 20.96411687204526, |
|
"grad_norm": 6.298649311065674, |
|
"learning_rate": 6.02392208530316e-06, |
|
"loss": 2.265, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 2.2148919105529785, |
|
"eval_runtime": 275.6952, |
|
"eval_samples_per_second": 748.889, |
|
"eval_steps_per_second": 11.705, |
|
"step": 270963 |
|
}, |
|
{ |
|
"epoch": 21.002867550182128, |
|
"grad_norm": 6.698548316955566, |
|
"learning_rate": 5.998088299878582e-06, |
|
"loss": 2.2576, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 21.041618228318995, |
|
"grad_norm": 6.784346103668213, |
|
"learning_rate": 5.972254514454003e-06, |
|
"loss": 2.2398, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 21.080368906455863, |
|
"grad_norm": 7.072300910949707, |
|
"learning_rate": 5.946420729029425e-06, |
|
"loss": 2.2557, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 21.11911958459273, |
|
"grad_norm": 6.624369144439697, |
|
"learning_rate": 5.920586943604847e-06, |
|
"loss": 2.2337, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 21.1578702627296, |
|
"grad_norm": 6.317164897918701, |
|
"learning_rate": 5.8947531581802685e-06, |
|
"loss": 2.2534, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 21.196620940866467, |
|
"grad_norm": 6.728669166564941, |
|
"learning_rate": 5.86891937275569e-06, |
|
"loss": 2.2505, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 21.23537161900333, |
|
"grad_norm": 6.596154689788818, |
|
"learning_rate": 5.843085587331112e-06, |
|
"loss": 2.253, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 21.2741222971402, |
|
"grad_norm": 6.471163749694824, |
|
"learning_rate": 5.817251801906534e-06, |
|
"loss": 2.2556, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 21.312872975277067, |
|
"grad_norm": 6.29288911819458, |
|
"learning_rate": 5.791418016481955e-06, |
|
"loss": 2.2567, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 21.351623653413935, |
|
"grad_norm": 7.078927040100098, |
|
"learning_rate": 5.7655842310573776e-06, |
|
"loss": 2.2294, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 21.390374331550802, |
|
"grad_norm": 6.867557525634766, |
|
"learning_rate": 5.739750445632799e-06, |
|
"loss": 2.2574, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 21.42912500968767, |
|
"grad_norm": 6.830238342285156, |
|
"learning_rate": 5.7139166602082205e-06, |
|
"loss": 2.2794, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 21.467875687824538, |
|
"grad_norm": 6.694831371307373, |
|
"learning_rate": 5.688082874783643e-06, |
|
"loss": 2.253, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 21.506626365961406, |
|
"grad_norm": 7.064994812011719, |
|
"learning_rate": 5.662249089359064e-06, |
|
"loss": 2.2435, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 21.54537704409827, |
|
"grad_norm": 6.832572937011719, |
|
"learning_rate": 5.636415303934486e-06, |
|
"loss": 2.2478, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 21.584127722235138, |
|
"grad_norm": 7.045238494873047, |
|
"learning_rate": 5.610581518509908e-06, |
|
"loss": 2.2434, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 21.622878400372006, |
|
"grad_norm": 6.720279216766357, |
|
"learning_rate": 5.58474773308533e-06, |
|
"loss": 2.238, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 21.661629078508874, |
|
"grad_norm": 7.401440143585205, |
|
"learning_rate": 5.558913947660751e-06, |
|
"loss": 2.2461, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 21.70037975664574, |
|
"grad_norm": 6.497147560119629, |
|
"learning_rate": 5.5330801622361725e-06, |
|
"loss": 2.2339, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 6.529776096343994, |
|
"learning_rate": 5.507246376811595e-06, |
|
"loss": 2.2501, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 21.777881112919477, |
|
"grad_norm": 6.42600679397583, |
|
"learning_rate": 5.481412591387016e-06, |
|
"loss": 2.235, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 21.816631791056345, |
|
"grad_norm": 6.715229034423828, |
|
"learning_rate": 5.455578805962438e-06, |
|
"loss": 2.2401, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 21.85538246919321, |
|
"grad_norm": 6.575899124145508, |
|
"learning_rate": 5.42974502053786e-06, |
|
"loss": 2.2576, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 21.894133147330077, |
|
"grad_norm": 5.999971866607666, |
|
"learning_rate": 5.403911235113282e-06, |
|
"loss": 2.2379, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 21.932883825466945, |
|
"grad_norm": 6.936278343200684, |
|
"learning_rate": 5.378077449688703e-06, |
|
"loss": 2.2534, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 21.971634503603813, |
|
"grad_norm": 6.040930271148682, |
|
"learning_rate": 5.352243664264125e-06, |
|
"loss": 2.2391, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 2.1943371295928955, |
|
"eval_runtime": 268.1318, |
|
"eval_samples_per_second": 770.013, |
|
"eval_steps_per_second": 12.035, |
|
"step": 283866 |
|
}, |
|
{ |
|
"epoch": 22.01038518174068, |
|
"grad_norm": 6.7548747062683105, |
|
"learning_rate": 5.326409878839547e-06, |
|
"loss": 2.2428, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 22.04913585987755, |
|
"grad_norm": 7.0850749015808105, |
|
"learning_rate": 5.300576093414968e-06, |
|
"loss": 2.2273, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 22.087886538014416, |
|
"grad_norm": 6.658077239990234, |
|
"learning_rate": 5.274742307990391e-06, |
|
"loss": 2.2214, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 22.126637216151284, |
|
"grad_norm": 7.19653844833374, |
|
"learning_rate": 5.248908522565812e-06, |
|
"loss": 2.2273, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 22.16538789428815, |
|
"grad_norm": 7.094461441040039, |
|
"learning_rate": 5.223074737141234e-06, |
|
"loss": 2.2359, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 22.204138572425016, |
|
"grad_norm": 7.156402587890625, |
|
"learning_rate": 5.197240951716656e-06, |
|
"loss": 2.1969, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 22.242889250561884, |
|
"grad_norm": 6.595995903015137, |
|
"learning_rate": 5.171407166292077e-06, |
|
"loss": 2.2223, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 22.281639928698752, |
|
"grad_norm": 7.04496955871582, |
|
"learning_rate": 5.145573380867499e-06, |
|
"loss": 2.2343, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 22.32039060683562, |
|
"grad_norm": 7.146208763122559, |
|
"learning_rate": 5.11973959544292e-06, |
|
"loss": 2.2338, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 22.359141284972488, |
|
"grad_norm": 6.4659576416015625, |
|
"learning_rate": 5.093905810018343e-06, |
|
"loss": 2.2273, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 22.397891963109355, |
|
"grad_norm": 6.372287750244141, |
|
"learning_rate": 5.068072024593764e-06, |
|
"loss": 2.2247, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 22.436642641246223, |
|
"grad_norm": 7.088085174560547, |
|
"learning_rate": 5.042238239169186e-06, |
|
"loss": 2.2474, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 22.475393319383087, |
|
"grad_norm": 6.911520004272461, |
|
"learning_rate": 5.016404453744608e-06, |
|
"loss": 2.2356, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 22.514143997519955, |
|
"grad_norm": 7.5756611824035645, |
|
"learning_rate": 4.990570668320029e-06, |
|
"loss": 2.2297, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 22.552894675656823, |
|
"grad_norm": 6.587701320648193, |
|
"learning_rate": 4.964736882895451e-06, |
|
"loss": 2.2245, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 22.59164535379369, |
|
"grad_norm": 5.8870849609375, |
|
"learning_rate": 4.938903097470873e-06, |
|
"loss": 2.229, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 22.63039603193056, |
|
"grad_norm": 6.882173538208008, |
|
"learning_rate": 4.913069312046295e-06, |
|
"loss": 2.2254, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 22.669146710067427, |
|
"grad_norm": 6.710127830505371, |
|
"learning_rate": 4.887235526621716e-06, |
|
"loss": 2.223, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 22.707897388204294, |
|
"grad_norm": 6.753304481506348, |
|
"learning_rate": 4.8614017411971385e-06, |
|
"loss": 2.2311, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 22.746648066341162, |
|
"grad_norm": 6.02184534072876, |
|
"learning_rate": 4.83556795577256e-06, |
|
"loss": 2.2198, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 22.78539874447803, |
|
"grad_norm": 7.022054195404053, |
|
"learning_rate": 4.809734170347981e-06, |
|
"loss": 2.2275, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 22.824149422614894, |
|
"grad_norm": 7.951735019683838, |
|
"learning_rate": 4.783900384923404e-06, |
|
"loss": 2.2314, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 22.862900100751762, |
|
"grad_norm": 5.854333877563477, |
|
"learning_rate": 4.758066599498825e-06, |
|
"loss": 2.2172, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 22.90165077888863, |
|
"grad_norm": 6.547132968902588, |
|
"learning_rate": 4.732232814074247e-06, |
|
"loss": 2.226, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 22.940401457025498, |
|
"grad_norm": 6.535789966583252, |
|
"learning_rate": 4.706399028649668e-06, |
|
"loss": 2.2299, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 22.979152135162366, |
|
"grad_norm": 7.285912036895752, |
|
"learning_rate": 4.6805652432250905e-06, |
|
"loss": 2.2239, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 2.185373067855835, |
|
"eval_runtime": 265.4806, |
|
"eval_samples_per_second": 777.703, |
|
"eval_steps_per_second": 12.155, |
|
"step": 296769 |
|
}, |
|
{ |
|
"epoch": 23.017902813299234, |
|
"grad_norm": 6.972716808319092, |
|
"learning_rate": 4.654731457800512e-06, |
|
"loss": 2.2193, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 23.0566534914361, |
|
"grad_norm": 6.841848373413086, |
|
"learning_rate": 4.628897672375933e-06, |
|
"loss": 2.2139, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 23.09540416957297, |
|
"grad_norm": 6.285813331604004, |
|
"learning_rate": 4.603063886951356e-06, |
|
"loss": 2.2092, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 23.134154847709834, |
|
"grad_norm": 6.615530967712402, |
|
"learning_rate": 4.577230101526777e-06, |
|
"loss": 2.2141, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 23.1729055258467, |
|
"grad_norm": 6.762087821960449, |
|
"learning_rate": 4.551396316102199e-06, |
|
"loss": 2.1944, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 23.21165620398357, |
|
"grad_norm": 7.053805351257324, |
|
"learning_rate": 4.525562530677621e-06, |
|
"loss": 2.2129, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 23.250406882120437, |
|
"grad_norm": 7.14516544342041, |
|
"learning_rate": 4.4997287452530425e-06, |
|
"loss": 2.2038, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 23.289157560257305, |
|
"grad_norm": 6.8478803634643555, |
|
"learning_rate": 4.473894959828464e-06, |
|
"loss": 2.2166, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 23.327908238394173, |
|
"grad_norm": 6.808053970336914, |
|
"learning_rate": 4.448061174403886e-06, |
|
"loss": 2.224, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 23.36665891653104, |
|
"grad_norm": 7.149857521057129, |
|
"learning_rate": 4.422227388979308e-06, |
|
"loss": 2.2081, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 23.40540959466791, |
|
"grad_norm": 6.334920406341553, |
|
"learning_rate": 4.396393603554729e-06, |
|
"loss": 2.2217, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 23.444160272804773, |
|
"grad_norm": 7.154323577880859, |
|
"learning_rate": 4.3705598181301515e-06, |
|
"loss": 2.2129, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 23.48291095094164, |
|
"grad_norm": 7.202456474304199, |
|
"learning_rate": 4.344726032705573e-06, |
|
"loss": 2.2019, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 23.52166162907851, |
|
"grad_norm": 6.832441806793213, |
|
"learning_rate": 4.3188922472809945e-06, |
|
"loss": 2.214, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 23.560412307215376, |
|
"grad_norm": 6.258272647857666, |
|
"learning_rate": 4.293058461856417e-06, |
|
"loss": 2.21, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 23.599162985352244, |
|
"grad_norm": 6.8391194343566895, |
|
"learning_rate": 4.267224676431838e-06, |
|
"loss": 2.2106, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 23.637913663489112, |
|
"grad_norm": 6.621433734893799, |
|
"learning_rate": 4.24139089100726e-06, |
|
"loss": 2.2219, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 23.67666434162598, |
|
"grad_norm": 6.718801498413086, |
|
"learning_rate": 4.215557105582681e-06, |
|
"loss": 2.2215, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 23.715415019762847, |
|
"grad_norm": 7.0543622970581055, |
|
"learning_rate": 4.1897233201581036e-06, |
|
"loss": 2.2182, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 23.75416569789971, |
|
"grad_norm": 7.598169326782227, |
|
"learning_rate": 4.163889534733525e-06, |
|
"loss": 2.2218, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 23.79291637603658, |
|
"grad_norm": 6.874271392822266, |
|
"learning_rate": 4.1380557493089465e-06, |
|
"loss": 2.2061, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 23.831667054173447, |
|
"grad_norm": 6.820863723754883, |
|
"learning_rate": 4.112221963884369e-06, |
|
"loss": 2.2166, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 23.870417732310315, |
|
"grad_norm": 7.149729251861572, |
|
"learning_rate": 4.08638817845979e-06, |
|
"loss": 2.2089, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 23.909168410447183, |
|
"grad_norm": 6.278995990753174, |
|
"learning_rate": 4.060554393035212e-06, |
|
"loss": 2.2163, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 23.94791908858405, |
|
"grad_norm": 7.162642002105713, |
|
"learning_rate": 4.034720607610634e-06, |
|
"loss": 2.2222, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 23.98666976672092, |
|
"grad_norm": 6.67965841293335, |
|
"learning_rate": 4.0088868221860556e-06, |
|
"loss": 2.1965, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 2.1702771186828613, |
|
"eval_runtime": 264.5615, |
|
"eval_samples_per_second": 780.405, |
|
"eval_steps_per_second": 12.198, |
|
"step": 309672 |
|
}, |
|
{ |
|
"epoch": 24.025420444857787, |
|
"grad_norm": 6.355005264282227, |
|
"learning_rate": 3.983053036761477e-06, |
|
"loss": 2.1935, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 24.06417112299465, |
|
"grad_norm": 6.339087963104248, |
|
"learning_rate": 3.957219251336899e-06, |
|
"loss": 2.2022, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 24.10292180113152, |
|
"grad_norm": 6.386953353881836, |
|
"learning_rate": 3.931385465912321e-06, |
|
"loss": 2.1941, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 24.141672479268387, |
|
"grad_norm": 6.9508376121521, |
|
"learning_rate": 3.905551680487742e-06, |
|
"loss": 2.1991, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 24.180423157405254, |
|
"grad_norm": 7.1515727043151855, |
|
"learning_rate": 3.879717895063164e-06, |
|
"loss": 2.2118, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 24.219173835542122, |
|
"grad_norm": 6.807953357696533, |
|
"learning_rate": 3.853884109638585e-06, |
|
"loss": 2.2158, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 24.25792451367899, |
|
"grad_norm": 7.41762638092041, |
|
"learning_rate": 3.828050324214007e-06, |
|
"loss": 2.1948, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 24.296675191815858, |
|
"grad_norm": 7.462344646453857, |
|
"learning_rate": 3.802216538789429e-06, |
|
"loss": 2.2061, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 24.335425869952726, |
|
"grad_norm": 6.6912384033203125, |
|
"learning_rate": 3.7763827533648505e-06, |
|
"loss": 2.1932, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 24.37417654808959, |
|
"grad_norm": 6.79492712020874, |
|
"learning_rate": 3.7505489679402724e-06, |
|
"loss": 2.193, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 24.412927226226458, |
|
"grad_norm": 6.873208522796631, |
|
"learning_rate": 3.724715182515694e-06, |
|
"loss": 2.1756, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 24.451677904363326, |
|
"grad_norm": 6.520395278930664, |
|
"learning_rate": 3.6988813970911158e-06, |
|
"loss": 2.2019, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 24.490428582500193, |
|
"grad_norm": 7.425100326538086, |
|
"learning_rate": 3.6730476116665377e-06, |
|
"loss": 2.1933, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 24.52917926063706, |
|
"grad_norm": 6.990531921386719, |
|
"learning_rate": 3.647213826241959e-06, |
|
"loss": 2.1953, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 24.56792993877393, |
|
"grad_norm": 6.99529504776001, |
|
"learning_rate": 3.621380040817381e-06, |
|
"loss": 2.1668, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 24.606680616910797, |
|
"grad_norm": 7.046565532684326, |
|
"learning_rate": 3.595546255392803e-06, |
|
"loss": 2.2185, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 24.645431295047665, |
|
"grad_norm": 7.261152744293213, |
|
"learning_rate": 3.5697124699682244e-06, |
|
"loss": 2.1776, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 24.68418197318453, |
|
"grad_norm": 7.088150978088379, |
|
"learning_rate": 3.5438786845436463e-06, |
|
"loss": 2.1939, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 24.722932651321397, |
|
"grad_norm": 7.677366733551025, |
|
"learning_rate": 3.518044899119068e-06, |
|
"loss": 2.1916, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 24.761683329458265, |
|
"grad_norm": 7.108632564544678, |
|
"learning_rate": 3.4922111136944897e-06, |
|
"loss": 2.1851, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 24.800434007595133, |
|
"grad_norm": 7.283915996551514, |
|
"learning_rate": 3.4663773282699116e-06, |
|
"loss": 2.2015, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 24.839184685732, |
|
"grad_norm": 7.392533779144287, |
|
"learning_rate": 3.440543542845333e-06, |
|
"loss": 2.1915, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 24.87793536386887, |
|
"grad_norm": 6.849175453186035, |
|
"learning_rate": 3.414709757420755e-06, |
|
"loss": 2.1931, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 24.916686042005736, |
|
"grad_norm": 6.42083740234375, |
|
"learning_rate": 3.388875971996177e-06, |
|
"loss": 2.198, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 24.955436720142604, |
|
"grad_norm": 6.040030002593994, |
|
"learning_rate": 3.3630421865715983e-06, |
|
"loss": 2.1802, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 24.99418739827947, |
|
"grad_norm": 7.585995674133301, |
|
"learning_rate": 3.3372084011470202e-06, |
|
"loss": 2.1946, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 2.159193992614746, |
|
"eval_runtime": 270.695, |
|
"eval_samples_per_second": 762.722, |
|
"eval_steps_per_second": 11.921, |
|
"step": 322575 |
|
}, |
|
{ |
|
"epoch": 25.032938076416336, |
|
"grad_norm": 7.309504985809326, |
|
"learning_rate": 3.3113746157224417e-06, |
|
"loss": 2.1854, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 25.071688754553204, |
|
"grad_norm": 6.450008869171143, |
|
"learning_rate": 3.2855408302978636e-06, |
|
"loss": 2.1827, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 25.11043943269007, |
|
"grad_norm": 6.82379674911499, |
|
"learning_rate": 3.2597070448732855e-06, |
|
"loss": 2.1838, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 25.14919011082694, |
|
"grad_norm": 7.034087657928467, |
|
"learning_rate": 3.233873259448707e-06, |
|
"loss": 2.1618, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 25.187940788963807, |
|
"grad_norm": 7.005911827087402, |
|
"learning_rate": 3.208039474024129e-06, |
|
"loss": 2.1912, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 25.226691467100675, |
|
"grad_norm": 6.7085394859313965, |
|
"learning_rate": 3.1822056885995508e-06, |
|
"loss": 2.1795, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 25.265442145237543, |
|
"grad_norm": 6.773245334625244, |
|
"learning_rate": 3.1563719031749722e-06, |
|
"loss": 2.1965, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 25.304192823374407, |
|
"grad_norm": 6.718632698059082, |
|
"learning_rate": 3.130538117750394e-06, |
|
"loss": 2.1976, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 25.342943501511275, |
|
"grad_norm": 8.191710472106934, |
|
"learning_rate": 3.1047043323258156e-06, |
|
"loss": 2.1762, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 25.381694179648143, |
|
"grad_norm": 7.172983169555664, |
|
"learning_rate": 3.0788705469012375e-06, |
|
"loss": 2.1703, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 25.42044485778501, |
|
"grad_norm": 6.283721446990967, |
|
"learning_rate": 3.0530367614766594e-06, |
|
"loss": 2.1692, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 25.45919553592188, |
|
"grad_norm": 6.850103855133057, |
|
"learning_rate": 3.027202976052081e-06, |
|
"loss": 2.1914, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 25.497946214058747, |
|
"grad_norm": 6.31437873840332, |
|
"learning_rate": 3.0013691906275028e-06, |
|
"loss": 2.1692, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 25.536696892195614, |
|
"grad_norm": 6.947432994842529, |
|
"learning_rate": 2.9755354052029247e-06, |
|
"loss": 2.1848, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 25.575447570332482, |
|
"grad_norm": 6.133412837982178, |
|
"learning_rate": 2.949701619778346e-06, |
|
"loss": 2.1827, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 25.61419824846935, |
|
"grad_norm": 7.019827365875244, |
|
"learning_rate": 2.923867834353768e-06, |
|
"loss": 2.1654, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 25.652948926606214, |
|
"grad_norm": 7.326742172241211, |
|
"learning_rate": 2.8980340489291895e-06, |
|
"loss": 2.1929, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 25.691699604743082, |
|
"grad_norm": 7.231571674346924, |
|
"learning_rate": 2.8722002635046114e-06, |
|
"loss": 2.1913, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 25.73045028287995, |
|
"grad_norm": 7.050189971923828, |
|
"learning_rate": 2.8463664780800333e-06, |
|
"loss": 2.191, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 25.769200961016818, |
|
"grad_norm": 6.654092311859131, |
|
"learning_rate": 2.8205326926554548e-06, |
|
"loss": 2.1871, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 25.807951639153686, |
|
"grad_norm": 7.114500522613525, |
|
"learning_rate": 2.7946989072308767e-06, |
|
"loss": 2.1842, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 25.846702317290553, |
|
"grad_norm": 6.987917900085449, |
|
"learning_rate": 2.7688651218062986e-06, |
|
"loss": 2.1782, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 25.88545299542742, |
|
"grad_norm": 6.479386806488037, |
|
"learning_rate": 2.74303133638172e-06, |
|
"loss": 2.1904, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 25.924203673564286, |
|
"grad_norm": 6.597611904144287, |
|
"learning_rate": 2.717197550957142e-06, |
|
"loss": 2.1782, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 25.962954351701153, |
|
"grad_norm": 7.492031097412109, |
|
"learning_rate": 2.6913637655325634e-06, |
|
"loss": 2.1976, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 2.144183874130249, |
|
"eval_runtime": 268.6578, |
|
"eval_samples_per_second": 768.505, |
|
"eval_steps_per_second": 12.012, |
|
"step": 335478 |
|
}, |
|
{ |
|
"epoch": 26.00170502983802, |
|
"grad_norm": 7.5874552726745605, |
|
"learning_rate": 2.6655299801079853e-06, |
|
"loss": 2.1755, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 26.04045570797489, |
|
"grad_norm": 7.499856948852539, |
|
"learning_rate": 2.6396961946834072e-06, |
|
"loss": 2.1885, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 26.079206386111757, |
|
"grad_norm": 7.2821946144104, |
|
"learning_rate": 2.6138624092588287e-06, |
|
"loss": 2.1782, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 26.117957064248625, |
|
"grad_norm": 7.0137834548950195, |
|
"learning_rate": 2.5880286238342506e-06, |
|
"loss": 2.1688, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 26.156707742385493, |
|
"grad_norm": 6.468008518218994, |
|
"learning_rate": 2.5621948384096725e-06, |
|
"loss": 2.1735, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 26.19545842052236, |
|
"grad_norm": 6.922983169555664, |
|
"learning_rate": 2.536361052985094e-06, |
|
"loss": 2.1643, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 26.23420909865923, |
|
"grad_norm": 6.963326454162598, |
|
"learning_rate": 2.510527267560516e-06, |
|
"loss": 2.1569, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 26.272959776796093, |
|
"grad_norm": 6.4791579246521, |
|
"learning_rate": 2.4846934821359373e-06, |
|
"loss": 2.1816, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 26.31171045493296, |
|
"grad_norm": 7.289137840270996, |
|
"learning_rate": 2.4588596967113592e-06, |
|
"loss": 2.1628, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 26.350461133069828, |
|
"grad_norm": 7.020922660827637, |
|
"learning_rate": 2.433025911286781e-06, |
|
"loss": 2.1608, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 26.389211811206696, |
|
"grad_norm": 6.522220134735107, |
|
"learning_rate": 2.4071921258622026e-06, |
|
"loss": 2.1736, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 26.427962489343564, |
|
"grad_norm": 7.149320602416992, |
|
"learning_rate": 2.3813583404376245e-06, |
|
"loss": 2.1761, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 26.46671316748043, |
|
"grad_norm": 7.04742431640625, |
|
"learning_rate": 2.3555245550130464e-06, |
|
"loss": 2.168, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 26.5054638456173, |
|
"grad_norm": 7.135145664215088, |
|
"learning_rate": 2.329690769588468e-06, |
|
"loss": 2.1928, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 26.544214523754167, |
|
"grad_norm": 7.492802619934082, |
|
"learning_rate": 2.3038569841638898e-06, |
|
"loss": 2.1764, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 26.58296520189103, |
|
"grad_norm": 6.618491172790527, |
|
"learning_rate": 2.2780231987393112e-06, |
|
"loss": 2.1768, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 26.6217158800279, |
|
"grad_norm": 6.808167457580566, |
|
"learning_rate": 2.252189413314733e-06, |
|
"loss": 2.1623, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 26.660466558164767, |
|
"grad_norm": 6.65431022644043, |
|
"learning_rate": 2.226355627890155e-06, |
|
"loss": 2.1658, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 26.699217236301635, |
|
"grad_norm": 7.762594699859619, |
|
"learning_rate": 2.2005218424655765e-06, |
|
"loss": 2.1794, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 26.737967914438503, |
|
"grad_norm": 6.6927056312561035, |
|
"learning_rate": 2.1746880570409984e-06, |
|
"loss": 2.1624, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 26.77671859257537, |
|
"grad_norm": 6.606927394866943, |
|
"learning_rate": 2.1488542716164203e-06, |
|
"loss": 2.1741, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 26.81546927071224, |
|
"grad_norm": 6.104671955108643, |
|
"learning_rate": 2.1230204861918418e-06, |
|
"loss": 2.1716, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 26.854219948849106, |
|
"grad_norm": 5.965663433074951, |
|
"learning_rate": 2.0971867007672637e-06, |
|
"loss": 2.1674, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 26.89297062698597, |
|
"grad_norm": 6.041355133056641, |
|
"learning_rate": 2.071352915342685e-06, |
|
"loss": 2.181, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 26.93172130512284, |
|
"grad_norm": 7.279519557952881, |
|
"learning_rate": 2.045519129918107e-06, |
|
"loss": 2.1661, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 26.970471983259706, |
|
"grad_norm": 6.790727615356445, |
|
"learning_rate": 2.019685344493529e-06, |
|
"loss": 2.1658, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 2.137254238128662, |
|
"eval_runtime": 268.8992, |
|
"eval_samples_per_second": 767.815, |
|
"eval_steps_per_second": 12.001, |
|
"step": 348381 |
|
}, |
|
{ |
|
"epoch": 27.009222661396574, |
|
"grad_norm": 6.905515193939209, |
|
"learning_rate": 1.9938515590689504e-06, |
|
"loss": 2.1569, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 27.047973339533442, |
|
"grad_norm": 6.515853404998779, |
|
"learning_rate": 1.9680177736443723e-06, |
|
"loss": 2.1713, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 27.08672401767031, |
|
"grad_norm": 6.981870651245117, |
|
"learning_rate": 1.942183988219794e-06, |
|
"loss": 2.1745, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 27.125474695807178, |
|
"grad_norm": 6.35358190536499, |
|
"learning_rate": 1.9163502027952157e-06, |
|
"loss": 2.1644, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 27.164225373944046, |
|
"grad_norm": 7.149428844451904, |
|
"learning_rate": 1.8905164173706376e-06, |
|
"loss": 2.1816, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 27.20297605208091, |
|
"grad_norm": 7.136536121368408, |
|
"learning_rate": 1.8646826319460593e-06, |
|
"loss": 2.1562, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 27.241726730217778, |
|
"grad_norm": 6.473196506500244, |
|
"learning_rate": 1.838848846521481e-06, |
|
"loss": 2.167, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 27.280477408354646, |
|
"grad_norm": 6.8429694175720215, |
|
"learning_rate": 1.8130150610969026e-06, |
|
"loss": 2.1587, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 27.319228086491513, |
|
"grad_norm": 6.667392253875732, |
|
"learning_rate": 1.7871812756723245e-06, |
|
"loss": 2.1575, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 27.35797876462838, |
|
"grad_norm": 7.551825046539307, |
|
"learning_rate": 1.7613474902477462e-06, |
|
"loss": 2.1567, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 27.39672944276525, |
|
"grad_norm": 7.393056392669678, |
|
"learning_rate": 1.735513704823168e-06, |
|
"loss": 2.163, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 27.435480120902117, |
|
"grad_norm": 6.7227678298950195, |
|
"learning_rate": 1.7096799193985896e-06, |
|
"loss": 2.1678, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 27.474230799038985, |
|
"grad_norm": 6.587380409240723, |
|
"learning_rate": 1.6838461339740115e-06, |
|
"loss": 2.1611, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 27.51298147717585, |
|
"grad_norm": 7.290678977966309, |
|
"learning_rate": 1.6580123485494332e-06, |
|
"loss": 2.1555, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 27.551732155312717, |
|
"grad_norm": 6.52154016494751, |
|
"learning_rate": 1.6321785631248548e-06, |
|
"loss": 2.1487, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 27.590482833449585, |
|
"grad_norm": 6.613160610198975, |
|
"learning_rate": 1.6063447777002765e-06, |
|
"loss": 2.1599, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 27.629233511586452, |
|
"grad_norm": 7.148532390594482, |
|
"learning_rate": 1.5805109922756984e-06, |
|
"loss": 2.1731, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 27.66798418972332, |
|
"grad_norm": 6.29647159576416, |
|
"learning_rate": 1.5546772068511201e-06, |
|
"loss": 2.1641, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 27.706734867860188, |
|
"grad_norm": 6.647765636444092, |
|
"learning_rate": 1.5288434214265418e-06, |
|
"loss": 2.1756, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 27.745485545997056, |
|
"grad_norm": 6.541094779968262, |
|
"learning_rate": 1.5030096360019635e-06, |
|
"loss": 2.1584, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 27.784236224133924, |
|
"grad_norm": 7.08396053314209, |
|
"learning_rate": 1.4771758505773854e-06, |
|
"loss": 2.1551, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 27.822986902270788, |
|
"grad_norm": 6.8339643478393555, |
|
"learning_rate": 1.451342065152807e-06, |
|
"loss": 2.1575, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 27.861737580407656, |
|
"grad_norm": 6.175314903259277, |
|
"learning_rate": 1.4255082797282288e-06, |
|
"loss": 2.1312, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 27.900488258544524, |
|
"grad_norm": 6.25184965133667, |
|
"learning_rate": 1.3996744943036504e-06, |
|
"loss": 2.1509, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 27.93923893668139, |
|
"grad_norm": 7.08027982711792, |
|
"learning_rate": 1.3738407088790723e-06, |
|
"loss": 2.159, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 27.97798961481826, |
|
"grad_norm": 6.8008880615234375, |
|
"learning_rate": 1.348006923454494e-06, |
|
"loss": 2.1634, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 2.130622386932373, |
|
"eval_runtime": 269.6844, |
|
"eval_samples_per_second": 765.58, |
|
"eval_steps_per_second": 11.966, |
|
"step": 361284 |
|
}, |
|
{ |
|
"epoch": 28.016740292955127, |
|
"grad_norm": 7.46795654296875, |
|
"learning_rate": 1.3221731380299157e-06, |
|
"loss": 2.1363, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 28.055490971091995, |
|
"grad_norm": 7.271740436553955, |
|
"learning_rate": 1.2963393526053374e-06, |
|
"loss": 2.1604, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 28.094241649228863, |
|
"grad_norm": 6.692265510559082, |
|
"learning_rate": 1.2705055671807593e-06, |
|
"loss": 2.1596, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 28.132992327365727, |
|
"grad_norm": 6.122591018676758, |
|
"learning_rate": 1.2446717817561808e-06, |
|
"loss": 2.1524, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 28.171743005502595, |
|
"grad_norm": 6.683858394622803, |
|
"learning_rate": 1.2188379963316027e-06, |
|
"loss": 2.1598, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 28.210493683639463, |
|
"grad_norm": 6.768929958343506, |
|
"learning_rate": 1.1930042109070243e-06, |
|
"loss": 2.1515, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 28.24924436177633, |
|
"grad_norm": 6.956704139709473, |
|
"learning_rate": 1.167170425482446e-06, |
|
"loss": 2.1552, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 28.2879950399132, |
|
"grad_norm": 6.655780792236328, |
|
"learning_rate": 1.1413366400578677e-06, |
|
"loss": 2.1551, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 28.326745718050066, |
|
"grad_norm": 7.394413471221924, |
|
"learning_rate": 1.1155028546332896e-06, |
|
"loss": 2.1465, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 28.365496396186934, |
|
"grad_norm": 7.250267505645752, |
|
"learning_rate": 1.0896690692087113e-06, |
|
"loss": 2.1729, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 28.404247074323802, |
|
"grad_norm": 6.102252960205078, |
|
"learning_rate": 1.063835283784133e-06, |
|
"loss": 2.1556, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 28.44299775246067, |
|
"grad_norm": 6.5598297119140625, |
|
"learning_rate": 1.0380014983595547e-06, |
|
"loss": 2.1473, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 28.481748430597534, |
|
"grad_norm": 7.368846416473389, |
|
"learning_rate": 1.0121677129349766e-06, |
|
"loss": 2.1552, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 28.520499108734402, |
|
"grad_norm": 6.635545253753662, |
|
"learning_rate": 9.863339275103983e-07, |
|
"loss": 2.1584, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 28.55924978687127, |
|
"grad_norm": 6.502518177032471, |
|
"learning_rate": 9.6050014208582e-07, |
|
"loss": 2.1669, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 28.598000465008138, |
|
"grad_norm": 7.150147914886475, |
|
"learning_rate": 9.346663566612417e-07, |
|
"loss": 2.158, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 28.636751143145005, |
|
"grad_norm": 6.391610622406006, |
|
"learning_rate": 9.088325712366634e-07, |
|
"loss": 2.1464, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 28.675501821281873, |
|
"grad_norm": 6.436591625213623, |
|
"learning_rate": 8.829987858120852e-07, |
|
"loss": 2.1438, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 28.71425249941874, |
|
"grad_norm": 6.646981716156006, |
|
"learning_rate": 8.571650003875069e-07, |
|
"loss": 2.1507, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 28.753003177555605, |
|
"grad_norm": 6.943175792694092, |
|
"learning_rate": 8.313312149629287e-07, |
|
"loss": 2.1483, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 28.791753855692473, |
|
"grad_norm": 6.345837116241455, |
|
"learning_rate": 8.054974295383504e-07, |
|
"loss": 2.1662, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 28.83050453382934, |
|
"grad_norm": 6.562370300292969, |
|
"learning_rate": 7.79663644113772e-07, |
|
"loss": 2.1554, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 28.86925521196621, |
|
"grad_norm": 6.556326866149902, |
|
"learning_rate": 7.538298586891937e-07, |
|
"loss": 2.1448, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 28.908005890103077, |
|
"grad_norm": 6.487407684326172, |
|
"learning_rate": 7.279960732646154e-07, |
|
"loss": 2.1425, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 28.946756568239945, |
|
"grad_norm": 7.614674091339111, |
|
"learning_rate": 7.021622878400372e-07, |
|
"loss": 2.1565, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 28.985507246376812, |
|
"grad_norm": 6.897189140319824, |
|
"learning_rate": 6.763285024154589e-07, |
|
"loss": 2.1438, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 2.1226651668548584, |
|
"eval_runtime": 266.9511, |
|
"eval_samples_per_second": 773.419, |
|
"eval_steps_per_second": 12.088, |
|
"step": 374187 |
|
}, |
|
{ |
|
"epoch": 29.02425792451368, |
|
"grad_norm": 6.869750499725342, |
|
"learning_rate": 6.504947169908807e-07, |
|
"loss": 2.138, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 29.063008602650548, |
|
"grad_norm": 7.1249589920043945, |
|
"learning_rate": 6.246609315663025e-07, |
|
"loss": 2.1527, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 29.101759280787412, |
|
"grad_norm": 7.201192378997803, |
|
"learning_rate": 5.988271461417243e-07, |
|
"loss": 2.1517, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 29.14050995892428, |
|
"grad_norm": 6.720222473144531, |
|
"learning_rate": 5.729933607171459e-07, |
|
"loss": 2.1526, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 29.179260637061148, |
|
"grad_norm": 6.9030866622924805, |
|
"learning_rate": 5.471595752925676e-07, |
|
"loss": 2.1532, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 29.218011315198016, |
|
"grad_norm": 5.900801181793213, |
|
"learning_rate": 5.213257898679893e-07, |
|
"loss": 2.1534, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 29.256761993334884, |
|
"grad_norm": 6.259501934051514, |
|
"learning_rate": 4.954920044434111e-07, |
|
"loss": 2.1621, |
|
"step": 377500 |
|
}, |
|
{ |
|
"epoch": 29.29551267147175, |
|
"grad_norm": 6.566405296325684, |
|
"learning_rate": 4.6965821901883286e-07, |
|
"loss": 2.1621, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 29.33426334960862, |
|
"grad_norm": 6.553793430328369, |
|
"learning_rate": 4.438244335942546e-07, |
|
"loss": 2.1631, |
|
"step": 378500 |
|
}, |
|
{ |
|
"epoch": 29.373014027745487, |
|
"grad_norm": 6.773620128631592, |
|
"learning_rate": 4.1799064816967634e-07, |
|
"loss": 2.1556, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 29.41176470588235, |
|
"grad_norm": 6.494615077972412, |
|
"learning_rate": 3.921568627450981e-07, |
|
"loss": 2.1554, |
|
"step": 379500 |
|
}, |
|
{ |
|
"epoch": 29.45051538401922, |
|
"grad_norm": 7.172949314117432, |
|
"learning_rate": 3.663230773205198e-07, |
|
"loss": 2.1494, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 29.489266062156087, |
|
"grad_norm": 6.8991241455078125, |
|
"learning_rate": 3.4048929189594155e-07, |
|
"loss": 2.1406, |
|
"step": 380500 |
|
}, |
|
{ |
|
"epoch": 29.528016740292955, |
|
"grad_norm": 7.046799182891846, |
|
"learning_rate": 3.146555064713633e-07, |
|
"loss": 2.1493, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 29.566767418429823, |
|
"grad_norm": 6.826701641082764, |
|
"learning_rate": 2.8882172104678503e-07, |
|
"loss": 2.1459, |
|
"step": 381500 |
|
}, |
|
{ |
|
"epoch": 29.60551809656669, |
|
"grad_norm": 6.649389743804932, |
|
"learning_rate": 2.6298793562220677e-07, |
|
"loss": 2.1593, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 29.64426877470356, |
|
"grad_norm": 6.10260009765625, |
|
"learning_rate": 2.3715415019762845e-07, |
|
"loss": 2.151, |
|
"step": 382500 |
|
}, |
|
{ |
|
"epoch": 29.683019452840426, |
|
"grad_norm": 6.9101128578186035, |
|
"learning_rate": 2.113203647730502e-07, |
|
"loss": 2.1625, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 29.72177013097729, |
|
"grad_norm": 6.671387672424316, |
|
"learning_rate": 1.8548657934847193e-07, |
|
"loss": 2.1496, |
|
"step": 383500 |
|
}, |
|
{ |
|
"epoch": 29.76052080911416, |
|
"grad_norm": 7.705864429473877, |
|
"learning_rate": 1.5965279392389367e-07, |
|
"loss": 2.1496, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 29.799271487251026, |
|
"grad_norm": 6.319827079772949, |
|
"learning_rate": 1.338190084993154e-07, |
|
"loss": 2.149, |
|
"step": 384500 |
|
}, |
|
{ |
|
"epoch": 29.838022165387894, |
|
"grad_norm": 6.850592613220215, |
|
"learning_rate": 1.0798522307473716e-07, |
|
"loss": 2.1511, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 29.876772843524762, |
|
"grad_norm": 7.166690826416016, |
|
"learning_rate": 8.21514376501589e-08, |
|
"loss": 2.1492, |
|
"step": 385500 |
|
}, |
|
{ |
|
"epoch": 29.91552352166163, |
|
"grad_norm": 6.324465274810791, |
|
"learning_rate": 5.631765222558062e-08, |
|
"loss": 2.1523, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 29.954274199798498, |
|
"grad_norm": 7.214087009429932, |
|
"learning_rate": 3.048386680100235e-08, |
|
"loss": 2.1628, |
|
"step": 386500 |
|
}, |
|
{ |
|
"epoch": 29.993024877935365, |
|
"grad_norm": 6.88369607925415, |
|
"learning_rate": 4.6500813764240875e-09, |
|
"loss": 2.144, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 2.125218629837036, |
|
"eval_runtime": 267.0237, |
|
"eval_samples_per_second": 773.209, |
|
"eval_steps_per_second": 12.085, |
|
"step": 387090 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 387090, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.706577784666885e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|