{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.954337899543379,
  "eval_steps": 500,
  "global_step": 1090,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0091324200913242,
      "grad_norm": 382.0,
      "learning_rate": 1.8348623853211011e-06,
      "loss": 46.9033,
      "step": 1
    },
    {
      "epoch": 0.045662100456621,
      "grad_norm": 318.0,
      "learning_rate": 9.174311926605506e-06,
      "loss": 46.3618,
      "step": 5
    },
    {
      "epoch": 0.091324200913242,
      "grad_norm": 139.0,
      "learning_rate": 1.834862385321101e-05,
      "loss": 39.3883,
      "step": 10
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 52.25,
      "learning_rate": 2.7522935779816515e-05,
      "loss": 29.4216,
      "step": 15
    },
    {
      "epoch": 0.182648401826484,
      "grad_norm": 20.25,
      "learning_rate": 3.669724770642202e-05,
      "loss": 24.7169,
      "step": 20
    },
    {
      "epoch": 0.228310502283105,
      "grad_norm": 10.0,
      "learning_rate": 4.587155963302753e-05,
      "loss": 21.4244,
      "step": 25
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 5.0,
      "learning_rate": 5.504587155963303e-05,
      "loss": 19.7804,
      "step": 30
    },
    {
      "epoch": 0.319634703196347,
      "grad_norm": 4.6875,
      "learning_rate": 6.422018348623854e-05,
      "loss": 19.1075,
      "step": 35
    },
    {
      "epoch": 0.365296803652968,
      "grad_norm": 8.5,
      "learning_rate": 7.339449541284404e-05,
      "loss": 18.037,
      "step": 40
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 18.75,
      "learning_rate": 8.256880733944955e-05,
      "loss": 17.2652,
      "step": 45
    },
    {
      "epoch": 0.45662100456621,
      "grad_norm": 29.875,
      "learning_rate": 9.174311926605506e-05,
      "loss": 14.4775,
      "step": 50
    },
    {
      "epoch": 0.502283105022831,
      "grad_norm": 37.75,
      "learning_rate": 0.00010091743119266055,
      "loss": 9.6302,
      "step": 55
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 8.75,
      "learning_rate": 0.00011009174311926606,
      "loss": 4.0499,
      "step": 60
    },
    {
      "epoch": 0.593607305936073,
      "grad_norm": 7.03125,
      "learning_rate": 0.00011926605504587157,
      "loss": 2.4784,
      "step": 65
    },
    {
      "epoch": 0.639269406392694,
      "grad_norm": 2.546875,
      "learning_rate": 0.00012844036697247707,
      "loss": 2.0967,
      "step": 70
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 0.83984375,
      "learning_rate": 0.00013761467889908258,
      "loss": 1.8211,
      "step": 75
    },
    {
      "epoch": 0.730593607305936,
      "grad_norm": 1.5546875,
      "learning_rate": 0.0001467889908256881,
      "loss": 1.6757,
      "step": 80
    },
    {
      "epoch": 0.776255707762557,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0001559633027522936,
      "loss": 1.5691,
      "step": 85
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.96484375,
      "learning_rate": 0.0001651376146788991,
      "loss": 1.4872,
      "step": 90
    },
    {
      "epoch": 0.867579908675799,
      "grad_norm": 0.90234375,
      "learning_rate": 0.00017431192660550458,
      "loss": 1.4228,
      "step": 95
    },
    {
      "epoch": 0.91324200913242,
      "grad_norm": 2.546875,
      "learning_rate": 0.00018348623853211012,
      "loss": 1.3778,
      "step": 100
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0001926605504587156,
      "loss": 1.3554,
      "step": 105
    },
    {
      "epoch": 0.9954337899543378,
      "eval_loss": 2.645094871520996,
      "eval_runtime": 0.2786,
      "eval_samples_per_second": 35.888,
      "eval_steps_per_second": 3.589,
      "step": 109
    },
    {
      "epoch": 1.004566210045662,
      "grad_norm": 0.9296875,
      "learning_rate": 0.00019999948721966259,
      "loss": 1.326,
      "step": 110
    },
    {
      "epoch": 1.0502283105022832,
      "grad_norm": 1.015625,
      "learning_rate": 0.00019998154046002822,
      "loss": 1.291,
      "step": 115
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 1.0625,
      "learning_rate": 0.0001999379599421534,
      "loss": 1.2721,
      "step": 120
    },
    {
      "epoch": 1.1415525114155252,
      "grad_norm": 1.5546875,
      "learning_rate": 0.00019986875683942535,
      "loss": 1.2479,
      "step": 125
    },
    {
      "epoch": 1.187214611872146,
      "grad_norm": 3.375,
      "learning_rate": 0.00019977394889447524,
      "loss": 1.2491,
      "step": 130
    },
    {
      "epoch": 1.2328767123287672,
      "grad_norm": 1.1171875,
      "learning_rate": 0.00019965356041462955,
      "loss": 1.2212,
      "step": 135
    },
    {
      "epoch": 1.278538812785388,
      "grad_norm": 1.546875,
      "learning_rate": 0.00019950762226567781,
      "loss": 1.2246,
      "step": 140
    },
    {
      "epoch": 1.3242009132420092,
      "grad_norm": 1.984375,
      "learning_rate": 0.00019933617186395917,
      "loss": 1.2387,
      "step": 145
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 2.46875,
      "learning_rate": 0.00019913925316676945,
      "loss": 1.203,
      "step": 150
    },
    {
      "epoch": 1.4155251141552512,
      "grad_norm": 0.93359375,
      "learning_rate": 0.00019891691666109113,
      "loss": 1.1869,
      "step": 155
    },
    {
      "epoch": 1.461187214611872,
      "grad_norm": 2.28125,
      "learning_rate": 0.00019866921935064906,
      "loss": 1.1858,
      "step": 160
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 1.296875,
      "learning_rate": 0.00019839622474129596,
      "loss": 1.1696,
      "step": 165
    },
    {
      "epoch": 1.5525114155251143,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00019809800282473013,
      "loss": 1.1624,
      "step": 170
    },
    {
      "epoch": 1.5981735159817352,
      "grad_norm": 1.734375,
      "learning_rate": 0.0001977746300605507,
      "loss": 1.1494,
      "step": 175
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 1.203125,
      "learning_rate": 0.00019742618935665476,
      "loss": 1.1314,
      "step": 180
    },
    {
      "epoch": 1.6894977168949772,
      "grad_norm": 2.078125,
      "learning_rate": 0.00019705277004798073,
      "loss": 1.1407,
      "step": 185
    },
    {
      "epoch": 1.7351598173515983,
      "grad_norm": 2.0625,
      "learning_rate": 0.0001966544678736044,
      "loss": 1.1287,
      "step": 190
    },
    {
      "epoch": 1.7808219178082192,
      "grad_norm": 0.91796875,
      "learning_rate": 0.00019623138495219292,
      "loss": 1.1407,
      "step": 195
    },
    {
      "epoch": 1.82648401826484,
      "grad_norm": 12.0,
      "learning_rate": 0.00019578362975582292,
      "loss": 1.1151,
      "step": 200
    },
    {
      "epoch": 1.8721461187214612,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00019531131708217005,
      "loss": 1.1221,
      "step": 205
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.90625,
      "learning_rate": 0.0001948145680250766,
      "loss": 1.0982,
      "step": 210
    },
    {
      "epoch": 1.9634703196347032,
      "grad_norm": 0.8984375,
      "learning_rate": 0.00019429350994350483,
      "loss": 1.0898,
      "step": 215
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.508340358734131,
      "eval_runtime": 0.2456,
      "eval_samples_per_second": 40.72,
      "eval_steps_per_second": 4.072,
      "step": 219
    },
    {
      "epoch": 2.009132420091324,
      "grad_norm": 2.59375,
      "learning_rate": 0.00019374827642888398,
      "loss": 1.1068,
      "step": 220
    },
    {
      "epoch": 2.0547945205479454,
      "grad_norm": 4.09375,
      "learning_rate": 0.0001931790072708596,
      "loss": 1.0932,
      "step": 225
    },
    {
      "epoch": 2.1004566210045663,
      "grad_norm": 1.5,
      "learning_rate": 0.00019258584842145343,
      "loss": 1.1001,
      "step": 230
    },
    {
      "epoch": 2.146118721461187,
      "grad_norm": 11.25,
      "learning_rate": 0.00019196895195764362,
      "loss": 1.1001,
      "step": 235
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 1.9375,
      "learning_rate": 0.0001913284760423745,
      "loss": 1.1046,
      "step": 240
    },
    {
      "epoch": 2.237442922374429,
      "grad_norm": 2.828125,
      "learning_rate": 0.00019066458488400584,
      "loss": 1.0795,
      "step": 245
    },
    {
      "epoch": 2.2831050228310503,
      "grad_norm": 1.1953125,
      "learning_rate": 0.00018997744869421246,
      "loss": 1.0767,
      "step": 250
    },
    {
      "epoch": 2.328767123287671,
      "grad_norm": 4.875,
      "learning_rate": 0.00018926724364434446,
      "loss": 1.059,
      "step": 255
    },
    {
      "epoch": 2.374429223744292,
      "grad_norm": 3.171875,
      "learning_rate": 0.0001885341518202595,
      "loss": 1.0695,
      "step": 260
    },
    {
      "epoch": 2.4200913242009134,
      "grad_norm": 0.734375,
      "learning_rate": 0.00018777836117563892,
      "loss": 1.0709,
      "step": 265
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 7.59375,
      "learning_rate": 0.00018700006548379898,
      "loss": 1.0677,
      "step": 270
    },
    {
      "epoch": 2.5114155251141552,
      "grad_norm": 0.984375,
      "learning_rate": 0.0001861994642880105,
      "loss": 1.0693,
      "step": 275
    },
    {
      "epoch": 2.557077625570776,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00018537676285033887,
      "loss": 1.0508,
      "step": 280
    },
    {
      "epoch": 2.602739726027397,
      "grad_norm": 0.578125,
      "learning_rate": 0.0001845321720990181,
      "loss": 1.0449,
      "step": 285
    },
    {
      "epoch": 2.6484018264840183,
      "grad_norm": 1.015625,
      "learning_rate": 0.00018366590857437184,
      "loss": 1.0562,
      "step": 290
    },
    {
      "epoch": 2.6940639269406392,
      "grad_norm": 1.734375,
      "learning_rate": 0.00018277819437329576,
      "loss": 1.0428,
      "step": 295
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 1.53125,
      "learning_rate": 0.00018186925709231532,
      "loss": 1.0321,
      "step": 300
    },
    {
      "epoch": 2.7853881278538815,
      "grad_norm": 1.0703125,
      "learning_rate": 0.0001809393297692334,
      "loss": 1.0253,
      "step": 305
    },
    {
      "epoch": 2.8310502283105023,
      "grad_norm": 3.1875,
      "learning_rate": 0.0001799886508233829,
      "loss": 1.0377,
      "step": 310
    },
    {
      "epoch": 2.8767123287671232,
      "grad_norm": 3.484375,
      "learning_rate": 0.0001790174639944997,
      "loss": 1.0359,
      "step": 315
    },
    {
      "epoch": 2.922374429223744,
      "grad_norm": 6.09375,
      "learning_rate": 0.00017802601828023138,
      "loss": 1.0428,
      "step": 320
    },
    {
      "epoch": 2.968036529680365,
      "grad_norm": 3.03125,
      "learning_rate": 0.00017701456787229804,
      "loss": 1.0434,
      "step": 325
    },
    {
      "epoch": 2.9954337899543377,
      "eval_loss": 2.480058193206787,
      "eval_runtime": 0.2581,
      "eval_samples_per_second": 38.741,
      "eval_steps_per_second": 3.874,
      "step": 328
    },
    {
      "epoch": 3.0136986301369864,
      "grad_norm": 7.625,
      "learning_rate": 0.0001759833720913214,
      "loss": 1.0302,
      "step": 330
    },
    {
      "epoch": 3.0593607305936072,
      "grad_norm": 1.7890625,
      "learning_rate": 0.00017493269532033883,
      "loss": 1.0273,
      "step": 335
    },
    {
      "epoch": 3.105022831050228,
      "grad_norm": 1.71875,
      "learning_rate": 0.0001738628069370195,
      "loss": 1.0212,
      "step": 340
    },
    {
      "epoch": 3.1506849315068495,
      "grad_norm": 1.3515625,
      "learning_rate": 0.00017277398124460023,
      "loss": 1.013,
      "step": 345
    },
    {
      "epoch": 3.1963470319634704,
      "grad_norm": 2.390625,
      "learning_rate": 0.000171666497401558,
      "loss": 1.0077,
      "step": 350
    },
    {
      "epoch": 3.2420091324200913,
      "grad_norm": 0.921875,
      "learning_rate": 0.0001705406393500381,
      "loss": 1.0111,
      "step": 355
    },
    {
      "epoch": 3.287671232876712,
      "grad_norm": 1.1875,
      "learning_rate": 0.00016939669574305566,
      "loss": 1.0047,
      "step": 360
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.97265625,
      "learning_rate": 0.0001682349598704892,
      "loss": 0.9977,
      "step": 365
    },
    {
      "epoch": 3.3789954337899544,
      "grad_norm": 1.09375,
      "learning_rate": 0.00016705572958388576,
      "loss": 0.9914,
      "step": 370
    },
    {
      "epoch": 3.4246575342465753,
      "grad_norm": 1.046875,
      "learning_rate": 0.00016585930722009601,
      "loss": 1.0012,
      "step": 375
    },
    {
      "epoch": 3.470319634703196,
      "grad_norm": 1.7421875,
      "learning_rate": 0.00016464599952375998,
      "loss": 0.9888,
      "step": 380
    },
    {
      "epoch": 3.5159817351598175,
      "grad_norm": 0.73828125,
      "learning_rate": 0.000163416117568662,
      "loss": 1.0036,
      "step": 385
    },
    {
      "epoch": 3.5616438356164384,
      "grad_norm": 2.515625,
      "learning_rate": 0.0001621699766779763,
      "loss": 0.9963,
      "step": 390
    },
    {
      "epoch": 3.6073059360730593,
      "grad_norm": 1.203125,
      "learning_rate": 0.00016090789634342278,
      "loss": 0.9955,
      "step": 395
    },
    {
      "epoch": 3.65296803652968,
      "grad_norm": 1.53125,
      "learning_rate": 0.00015963020014335438,
      "loss": 0.9953,
      "step": 400
    },
    {
      "epoch": 3.6986301369863015,
      "grad_norm": 2.015625,
      "learning_rate": 0.0001583372156597961,
      "loss": 0.9959,
      "step": 405
    },
    {
      "epoch": 3.7442922374429224,
      "grad_norm": 1.75,
      "learning_rate": 0.00015702927439445826,
      "loss": 0.9906,
      "step": 410
    },
    {
      "epoch": 3.7899543378995433,
      "grad_norm": 1.4453125,
      "learning_rate": 0.00015570671168374438,
      "loss": 0.9849,
      "step": 415
    },
    {
      "epoch": 3.8356164383561646,
      "grad_norm": 2.234375,
      "learning_rate": 0.00015436986661277577,
      "loss": 0.9697,
      "step": 420
    },
    {
      "epoch": 3.8812785388127855,
      "grad_norm": 1.5,
      "learning_rate": 0.0001530190819284555,
      "loss": 0.979,
      "step": 425
    },
    {
      "epoch": 3.9269406392694064,
      "grad_norm": 10.75,
      "learning_rate": 0.00015165470395159313,
      "loss": 0.9715,
      "step": 430
    },
    {
      "epoch": 3.9726027397260273,
      "grad_norm": 2.71875,
      "learning_rate": 0.0001502770824881133,
      "loss": 0.9864,
      "step": 435
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.474334239959717,
      "eval_runtime": 0.2363,
      "eval_samples_per_second": 42.318,
      "eval_steps_per_second": 4.232,
      "step": 438
    },
    {
      "epoch": 4.018264840182648,
      "grad_norm": 1.3046875,
      "learning_rate": 0.00014888657073937076,
      "loss": 0.9764,
      "step": 440
    },
    {
      "epoch": 4.063926940639269,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00014748352521159493,
      "loss": 0.9564,
      "step": 445
    },
    {
      "epoch": 4.109589041095891,
      "grad_norm": 0.7265625,
      "learning_rate": 0.0001460683056244869,
      "loss": 0.9573,
      "step": 450
    },
    {
      "epoch": 4.155251141552512,
      "grad_norm": 11.5625,
      "learning_rate": 0.00014464127481899312,
      "loss": 0.957,
      "step": 455
    },
    {
      "epoch": 4.200913242009133,
      "grad_norm": 0.91796875,
      "learning_rate": 0.00014320279866427796,
      "loss": 0.9596,
      "step": 460
    },
    {
      "epoch": 4.2465753424657535,
      "grad_norm": 2.03125,
      "learning_rate": 0.00014175324596392075,
      "loss": 0.9647,
      "step": 465
    },
    {
      "epoch": 4.292237442922374,
      "grad_norm": 1.359375,
      "learning_rate": 0.00014029298836135988,
      "loss": 0.9632,
      "step": 470
    },
    {
      "epoch": 4.337899543378995,
      "grad_norm": 5.09375,
      "learning_rate": 0.00013882240024460927,
      "loss": 0.9664,
      "step": 475
    },
    {
      "epoch": 4.383561643835616,
      "grad_norm": 2.96875,
      "learning_rate": 0.0001373418586502706,
      "loss": 0.964,
      "step": 480
    },
    {
      "epoch": 4.429223744292237,
      "grad_norm": 1.2421875,
      "learning_rate": 0.0001358517431668672,
      "loss": 0.9531,
      "step": 485
    },
    {
      "epoch": 4.474885844748858,
      "grad_norm": 1.046875,
      "learning_rate": 0.00013435243583752294,
      "loss": 0.958,
      "step": 490
    },
    {
      "epoch": 4.52054794520548,
      "grad_norm": 0.6953125,
      "learning_rate": 0.00013284432106201233,
      "loss": 0.9514,
      "step": 495
    },
    {
      "epoch": 4.566210045662101,
      "grad_norm": 0.7265625,
      "learning_rate": 0.00013132778549820618,
      "loss": 0.9588,
      "step": 500
    },
    {
      "epoch": 4.6118721461187215,
      "grad_norm": 0.93359375,
      "learning_rate": 0.00012980321796293836,
      "loss": 0.9494,
      "step": 505
    },
    {
      "epoch": 4.657534246575342,
      "grad_norm": 0.91796875,
      "learning_rate": 0.00012827100933231905,
      "loss": 0.9508,
      "step": 510
    },
    {
      "epoch": 4.703196347031963,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00012673155244151985,
      "loss": 0.9557,
      "step": 515
    },
    {
      "epoch": 4.748858447488584,
      "grad_norm": 0.75,
      "learning_rate": 0.000125185241984057,
      "loss": 0.9508,
      "step": 520
    },
    {
      "epoch": 4.794520547945205,
      "grad_norm": 0.76953125,
      "learning_rate": 0.00012363247441059776,
      "loss": 0.9562,
      "step": 525
    },
    {
      "epoch": 4.840182648401827,
      "grad_norm": 1.078125,
      "learning_rate": 0.00012207364782731655,
      "loss": 0.9542,
      "step": 530
    },
    {
      "epoch": 4.885844748858448,
      "grad_norm": 2.1875,
      "learning_rate": 0.00012050916189382646,
      "loss": 0.9606,
      "step": 535
    },
    {
      "epoch": 4.931506849315069,
      "grad_norm": 1.84375,
      "learning_rate": 0.00011893941772071249,
      "loss": 0.9424,
      "step": 540
    },
    {
      "epoch": 4.9771689497716896,
      "grad_norm": 1.9375,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.9371,
      "step": 545
    },
    {
      "epoch": 4.995433789954338,
      "eval_loss": 2.485384464263916,
      "eval_runtime": 0.2567,
      "eval_samples_per_second": 38.96,
      "eval_steps_per_second": 3.896,
      "step": 547
    },
    {
      "epoch": 5.0228310502283104,
      "grad_norm": 0.99609375,
      "learning_rate": 0.0001157857657354354,
      "loss": 0.9249,
      "step": 550
    },
    {
      "epoch": 5.068493150684931,
      "grad_norm": 2.53125,
      "learning_rate": 0.00011420266647205231,
      "loss": 0.9271,
      "step": 555
    },
    {
      "epoch": 5.114155251141552,
      "grad_norm": 12.0,
      "learning_rate": 0.00011261592585930576,
      "loss": 0.9329,
      "step": 560
    },
    {
      "epoch": 5.159817351598173,
      "grad_norm": 0.7890625,
      "learning_rate": 0.00011102595071354472,
      "loss": 0.9238,
      "step": 565
    },
    {
      "epoch": 5.205479452054795,
      "grad_norm": 6.3125,
      "learning_rate": 0.00010943314868040364,
      "loss": 0.9134,
      "step": 570
    },
    {
      "epoch": 5.251141552511416,
      "grad_norm": 0.69921875,
      "learning_rate": 0.00010783792813028827,
      "loss": 0.91,
      "step": 575
    },
    {
      "epoch": 5.296803652968037,
      "grad_norm": 4.375,
      "learning_rate": 0.00010624069805367559,
      "loss": 0.9193,
      "step": 580
    },
    {
      "epoch": 5.342465753424658,
      "grad_norm": 0.984375,
      "learning_rate": 0.00010464186795625482,
      "loss": 0.9101,
      "step": 585
    },
    {
      "epoch": 5.3881278538812785,
      "grad_norm": 3.1875,
      "learning_rate": 0.00010304184775393642,
      "loss": 0.9122,
      "step": 590
    },
    {
      "epoch": 5.433789954337899,
      "grad_norm": 0.90625,
      "learning_rate": 0.00010144104766775572,
      "loss": 0.9126,
      "step": 595
    },
    {
      "epoch": 5.47945205479452,
      "grad_norm": 1.171875,
      "learning_rate": 9.983987811869862e-05,
      "loss": 0.9177,
      "step": 600
    },
    {
      "epoch": 5.525114155251142,
      "grad_norm": 0.53515625,
      "learning_rate": 9.823874962247564e-05,
      "loss": 0.9089,
      "step": 605
    },
    {
      "epoch": 5.570776255707763,
      "grad_norm": 1.0078125,
      "learning_rate": 9.663807268427198e-05,
      "loss": 0.9112,
      "step": 610
    },
    {
      "epoch": 5.616438356164384,
      "grad_norm": 0.69140625,
      "learning_rate": 9.503825769350017e-05,
      "loss": 0.9142,
      "step": 615
    },
    {
      "epoch": 5.662100456621005,
      "grad_norm": 0.8359375,
      "learning_rate": 9.343971481858246e-05,
      "loss": 0.9068,
      "step": 620
    },
    {
      "epoch": 5.707762557077626,
      "grad_norm": 0.92578125,
      "learning_rate": 9.184285390178978e-05,
      "loss": 0.9134,
      "step": 625
    },
    {
      "epoch": 5.7534246575342465,
      "grad_norm": 1.890625,
      "learning_rate": 9.024808435416434e-05,
      "loss": 0.9106,
      "step": 630
    },
    {
      "epoch": 5.799086757990867,
      "grad_norm": 0.90234375,
      "learning_rate": 8.865581505055291e-05,
      "loss": 0.9108,
      "step": 635
    },
    {
      "epoch": 5.844748858447488,
      "grad_norm": 1.6015625,
      "learning_rate": 8.706645422477739e-05,
      "loss": 0.9027,
      "step": 640
    },
    {
      "epoch": 5.890410958904109,
      "grad_norm": 4.90625,
      "learning_rate": 8.548040936496989e-05,
      "loss": 0.9217,
      "step": 645
    },
    {
      "epoch": 5.936073059360731,
      "grad_norm": 7.09375,
      "learning_rate": 8.389808710909881e-05,
      "loss": 0.9227,
      "step": 650
    },
    {
      "epoch": 5.981735159817352,
      "grad_norm": 6.625,
      "learning_rate": 8.231989314071317e-05,
      "loss": 0.9157,
      "step": 655
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.46421480178833,
      "eval_runtime": 0.2356,
      "eval_samples_per_second": 42.441,
      "eval_steps_per_second": 4.244,
      "step": 657
    },
    {
      "epoch": 6.027397260273973,
      "grad_norm": 0.7109375,
      "learning_rate": 8.07462320849313e-05,
      "loss": 0.902,
      "step": 660
    },
    {
      "epoch": 6.073059360730594,
      "grad_norm": 1.0703125,
      "learning_rate": 7.917750740470117e-05,
      "loss": 0.8855,
      "step": 665
    },
    {
      "epoch": 6.1187214611872145,
      "grad_norm": 1.25,
      "learning_rate": 7.761412129735852e-05,
      "loss": 0.9014,
      "step": 670
    },
    {
      "epoch": 6.164383561643835,
      "grad_norm": 0.984375,
      "learning_rate": 7.605647459150961e-05,
      "loss": 0.8863,
      "step": 675
    },
    {
      "epoch": 6.210045662100456,
      "grad_norm": 1.296875,
      "learning_rate": 7.450496664426477e-05,
      "loss": 0.8804,
      "step": 680
    },
    {
      "epoch": 6.255707762557078,
      "grad_norm": 1.1171875,
      "learning_rate": 7.295999523884921e-05,
      "loss": 0.8795,
      "step": 685
    },
    {
      "epoch": 6.301369863013699,
      "grad_norm": 0.890625,
      "learning_rate": 7.142195648261747e-05,
      "loss": 0.8855,
      "step": 690
    },
    {
      "epoch": 6.34703196347032,
      "grad_norm": 0.671875,
      "learning_rate": 6.989124470549745e-05,
      "loss": 0.8799,
      "step": 695
    },
    {
      "epoch": 6.392694063926941,
      "grad_norm": 0.796875,
      "learning_rate": 6.83682523588902e-05,
      "loss": 0.8731,
      "step": 700
    },
    {
      "epoch": 6.438356164383562,
      "grad_norm": 2.015625,
      "learning_rate": 6.685336991505122e-05,
      "loss": 0.8818,
      "step": 705
    },
    {
      "epoch": 6.4840182648401825,
      "grad_norm": 1.4140625,
      "learning_rate": 6.534698576697939e-05,
      "loss": 0.8792,
      "step": 710
    },
    {
      "epoch": 6.529680365296803,
      "grad_norm": 0.8125,
      "learning_rate": 6.384948612883873e-05,
      "loss": 0.8713,
      "step": 715
    },
    {
      "epoch": 6.575342465753424,
      "grad_norm": 0.53515625,
      "learning_rate": 6.2361254936939e-05,
      "loss": 0.8762,
      "step": 720
    },
    {
      "epoch": 6.621004566210045,
      "grad_norm": 0.515625,
      "learning_rate": 6.088267375130023e-05,
      "loss": 0.8708,
      "step": 725
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.70703125,
      "learning_rate": 5.941412165782645e-05,
      "loss": 0.8797,
      "step": 730
    },
    {
      "epoch": 6.712328767123288,
      "grad_norm": 1.21875,
      "learning_rate": 5.79559751711138e-05,
      "loss": 0.8634,
      "step": 735
    },
    {
      "epoch": 6.757990867579909,
      "grad_norm": 0.81640625,
      "learning_rate": 5.650860813791785e-05,
      "loss": 0.872,
      "step": 740
    },
    {
      "epoch": 6.80365296803653,
      "grad_norm": 0.625,
      "learning_rate": 5.507239164130501e-05,
      "loss": 0.8661,
      "step": 745
    },
    {
      "epoch": 6.8493150684931505,
      "grad_norm": 0.828125,
      "learning_rate": 5.364769390551225e-05,
      "loss": 0.8744,
      "step": 750
    },
    {
      "epoch": 6.894977168949771,
      "grad_norm": 1.5234375,
      "learning_rate": 5.2234880201540284e-05,
      "loss": 0.8662,
      "step": 755
    },
    {
      "epoch": 6.940639269406392,
      "grad_norm": 0.94140625,
      "learning_rate": 5.0834312753503124e-05,
      "loss": 0.8764,
      "step": 760
    },
    {
      "epoch": 6.986301369863014,
      "grad_norm": 0.4609375,
      "learning_rate": 4.9446350645759885e-05,
      "loss": 0.8657,
      "step": 765
    },
    {
      "epoch": 6.995433789954338,
      "eval_loss": 2.5075883865356445,
      "eval_runtime": 0.2561,
      "eval_samples_per_second": 39.047,
      "eval_steps_per_second": 3.905,
      "step": 766
    },
    {
      "epoch": 7.031963470319635,
      "grad_norm": 0.408203125,
      "learning_rate": 4.807134973085036e-05,
      "loss": 0.8614,
      "step": 770
    },
    {
      "epoch": 7.077625570776256,
      "grad_norm": 0.56640625,
      "learning_rate": 4.6709662538260267e-05,
      "loss": 0.8457,
      "step": 775
    },
    {
      "epoch": 7.123287671232877,
      "grad_norm": 0.447265625,
      "learning_rate": 4.53616381840377e-05,
      "loss": 0.8502,
      "step": 780
    },
    {
      "epoch": 7.168949771689498,
      "grad_norm": 0.64453125,
      "learning_rate": 4.402762228128531e-05,
      "loss": 0.8536,
      "step": 785
    },
    {
      "epoch": 7.2146118721461185,
      "grad_norm": 0.78515625,
      "learning_rate": 4.2707956851550016e-05,
      "loss": 0.8531,
      "step": 790
    },
    {
      "epoch": 7.260273972602739,
      "grad_norm": 0.68359375,
      "learning_rate": 4.140298023713416e-05,
      "loss": 0.8609,
      "step": 795
    },
    {
      "epoch": 7.30593607305936,
      "grad_norm": 0.671875,
      "learning_rate": 4.011302701434937e-05,
      "loss": 0.8529,
      "step": 800
    },
    {
      "epoch": 7.351598173515982,
      "grad_norm": 0.96875,
      "learning_rate": 3.8838427907736476e-05,
      "loss": 0.8566,
      "step": 805
    },
    {
      "epoch": 7.397260273972603,
      "grad_norm": 0.6171875,
      "learning_rate": 3.757950970527249e-05,
      "loss": 0.8508,
      "step": 810
    },
    {
      "epoch": 7.442922374429224,
      "grad_norm": 0.435546875,
      "learning_rate": 3.633659517458736e-05,
      "loss": 0.8513,
      "step": 815
    },
    {
      "epoch": 7.488584474885845,
      "grad_norm": 0.466796875,
      "learning_rate": 3.5110002980210975e-05,
      "loss": 0.8499,
      "step": 820
    },
    {
      "epoch": 7.534246575342466,
      "grad_norm": 0.43359375,
      "learning_rate": 3.3900047601872596e-05,
      "loss": 0.8493,
      "step": 825
    },
    {
      "epoch": 7.579908675799087,
      "grad_norm": 0.6171875,
      "learning_rate": 3.270703925387279e-05,
      "loss": 0.851,
      "step": 830
    },
    {
      "epoch": 7.6255707762557075,
      "grad_norm": 0.5234375,
      "learning_rate": 3.153128380554941e-05,
      "loss": 0.8452,
      "step": 835
    },
    {
      "epoch": 7.671232876712329,
      "grad_norm": 0.43359375,
      "learning_rate": 3.037308270285709e-05,
      "loss": 0.862,
      "step": 840
    },
    {
      "epoch": 7.71689497716895,
      "grad_norm": 0.625,
      "learning_rate": 2.923273289108115e-05,
      "loss": 0.8487,
      "step": 845
    },
    {
      "epoch": 7.762557077625571,
      "grad_norm": 0.39453125,
      "learning_rate": 2.8110526738705344e-05,
      "loss": 0.8516,
      "step": 850
    },
    {
      "epoch": 7.808219178082192,
      "grad_norm": 0.546875,
      "learning_rate": 2.7006751962452882e-05,
      "loss": 0.8541,
      "step": 855
    },
    {
      "epoch": 7.853881278538813,
      "grad_norm": 0.419921875,
      "learning_rate": 2.592169155352031e-05,
      "loss": 0.8486,
      "step": 860
    },
    {
      "epoch": 7.899543378995434,
      "grad_norm": 0.86328125,
      "learning_rate": 2.485562370502279e-05,
      "loss": 0.8402,
      "step": 865
    },
    {
      "epoch": 7.945205479452055,
      "grad_norm": 0.412109375,
      "learning_rate": 2.3808821740669606e-05,
      "loss": 0.8474,
      "step": 870
    },
    {
      "epoch": 7.9908675799086755,
      "grad_norm": 0.40625,
      "learning_rate": 2.2781554044688015e-05,
      "loss": 0.8393,
      "step": 875
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.515906810760498,
      "eval_runtime": 0.2401,
      "eval_samples_per_second": 41.645,
      "eval_steps_per_second": 4.164,
      "step": 876
    },
    {
      "epoch": 8.036529680365296,
      "grad_norm": 0.5390625,
      "learning_rate": 2.1774083993013718e-05,
      "loss": 0.8404,
      "step": 880
    },
    {
      "epoch": 8.082191780821917,
      "grad_norm": 0.5234375,
      "learning_rate": 2.078666988576504e-05,
      "loss": 0.837,
      "step": 885
    },
    {
      "epoch": 8.127853881278538,
      "grad_norm": 0.58203125,
      "learning_rate": 1.9819564881018983e-05,
      "loss": 0.8372,
      "step": 890
    },
    {
      "epoch": 8.173515981735159,
      "grad_norm": 0.486328125,
      "learning_rate": 1.887301692990494e-05,
      "loss": 0.846,
      "step": 895
    },
    {
      "epoch": 8.219178082191782,
      "grad_norm": 0.439453125,
      "learning_rate": 1.7947268713034127e-05,
      "loss": 0.8461,
      "step": 900
    },
    {
      "epoch": 8.264840182648403,
      "grad_norm": 0.4140625,
      "learning_rate": 1.7042557578279626e-05,
      "loss": 0.8373,
      "step": 905
    },
    {
      "epoch": 8.310502283105023,
      "grad_norm": 0.46484375,
      "learning_rate": 1.6159115479924257e-05,
      "loss": 0.8422,
      "step": 910
    },
    {
      "epoch": 8.356164383561644,
      "grad_norm": 0.4453125,
      "learning_rate": 1.529716891919074e-05,
      "loss": 0.8403,
      "step": 915
    },
    {
      "epoch": 8.401826484018265,
      "grad_norm": 0.419921875,
      "learning_rate": 1.4456938886170412e-05,
      "loss": 0.8343,
      "step": 920
    },
    {
      "epoch": 8.447488584474886,
      "grad_norm": 0.43359375,
      "learning_rate": 1.3638640803164516e-05,
      "loss": 0.8355,
      "step": 925
    },
    {
      "epoch": 8.493150684931507,
      "grad_norm": 0.41796875,
      "learning_rate": 1.2842484469453365e-05,
      "loss": 0.841,
      "step": 930
    },
    {
      "epoch": 8.538812785388128,
      "grad_norm": 0.455078125,
      "learning_rate": 1.2068674007506786e-05,
      "loss": 0.8396,
      "step": 935
    },
    {
      "epoch": 8.584474885844749,
      "grad_norm": 0.40625,
      "learning_rate": 1.1317407810650372e-05,
      "loss": 0.8377,
      "step": 940
    },
    {
      "epoch": 8.63013698630137,
      "grad_norm": 0.392578125,
      "learning_rate": 1.058887849220026e-05,
      "loss": 0.8348,
      "step": 945
    },
    {
      "epoch": 8.67579908675799,
      "grad_norm": 0.396484375,
      "learning_rate": 9.883272836080116e-06,
      "loss": 0.8388,
      "step": 950
    },
    {
      "epoch": 8.721461187214611,
      "grad_norm": 0.396484375,
      "learning_rate": 9.200771748932513e-06,
      "loss": 0.8366,
      "step": 955
    },
    {
      "epoch": 8.767123287671232,
      "grad_norm": 0.416015625,
      "learning_rate": 8.541550213737171e-06,
      "loss": 0.8436,
      "step": 960
    },
    {
      "epoch": 8.812785388127853,
      "grad_norm": 0.455078125,
      "learning_rate": 7.905777244947954e-06,
      "loss": 0.8409,
      "step": 965
    },
    {
      "epoch": 8.858447488584474,
      "grad_norm": 0.4140625,
      "learning_rate": 7.293615845160196e-06,
      "loss": 0.8377,
      "step": 970
    },
    {
      "epoch": 8.904109589041095,
      "grad_norm": 0.408203125,
      "learning_rate": 6.705222963319191e-06,
      "loss": 0.8425,
      "step": 975
    },
    {
      "epoch": 8.949771689497716,
      "grad_norm": 0.419921875,
      "learning_rate": 6.140749454480932e-06,
      "loss": 0.8371,
      "step": 980
    },
    {
      "epoch": 8.995433789954339,
      "grad_norm": 0.408203125,
      "learning_rate": 5.6003400411351325e-06,
      "loss": 0.8462,
      "step": 985
    },
    {
      "epoch": 8.995433789954339,
      "eval_loss": 2.518533706665039,
      "eval_runtime": 0.2546,
      "eval_samples_per_second": 39.282,
      "eval_steps_per_second": 3.928,
      "step": 985
    },
    {
      "epoch": 9.04109589041096,
      "grad_norm": 0.392578125,
      "learning_rate": 5.0841332761005e-06,
      "loss": 0.8404,
      "step": 990
    },
    {
      "epoch": 9.08675799086758,
      "grad_norm": 0.416015625,
      "learning_rate": 4.592261507001993e-06,
      "loss": 0.8303,
      "step": 995
    },
    {
      "epoch": 9.132420091324201,
      "grad_norm": 0.42578125,
      "learning_rate": 4.124850842338779e-06,
      "loss": 0.8325,
      "step": 1000
    },
    {
      "epoch": 9.178082191780822,
      "grad_norm": 0.4140625,
      "learning_rate": 3.6820211191520125e-06,
      "loss": 0.8353,
      "step": 1005
    },
    {
      "epoch": 9.223744292237443,
      "grad_norm": 0.419921875,
      "learning_rate": 3.263885872300343e-06,
      "loss": 0.8347,
      "step": 1010
    },
    {
      "epoch": 9.269406392694064,
      "grad_norm": 0.3984375,
      "learning_rate": 2.8705523053513816e-06,
      "loss": 0.8329,
      "step": 1015
    },
    {
      "epoch": 9.315068493150685,
      "grad_norm": 0.421875,
      "learning_rate": 2.502121263096224e-06,
      "loss": 0.8369,
      "step": 1020
    },
    {
      "epoch": 9.360730593607306,
      "grad_norm": 0.419921875,
      "learning_rate": 2.1586872056944428e-06,
      "loss": 0.8324,
      "step": 1025
    },
    {
      "epoch": 9.406392694063927,
      "grad_norm": 0.412109375,
      "learning_rate": 1.840338184455881e-06,
      "loss": 0.8383,
      "step": 1030
    },
    {
      "epoch": 9.452054794520548,
      "grad_norm": 0.400390625,
      "learning_rate": 1.5471558192656777e-06,
      "loss": 0.8315,
      "step": 1035
    },
    {
      "epoch": 9.497716894977168,
      "grad_norm": 0.435546875,
      "learning_rate": 1.2792152776580968e-06,
      "loss": 0.8437,
      "step": 1040
    },
    {
      "epoch": 9.54337899543379,
      "grad_norm": 0.419921875,
      "learning_rate": 1.036585255544764e-06,
      "loss": 0.8418,
      "step": 1045
    },
    {
      "epoch": 9.58904109589041,
      "grad_norm": 0.400390625,
      "learning_rate": 8.193279596020121e-07,
      "loss": 0.8346,
      "step": 1050
    },
    {
      "epoch": 9.634703196347033,
      "grad_norm": 0.40234375,
      "learning_rate": 6.274990913221035e-07,
      "loss": 0.8415,
      "step": 1055
    },
    {
      "epoch": 9.680365296803654,
      "grad_norm": 0.416015625,
      "learning_rate": 4.6114783273213393e-07,
      "loss": 0.8339,
      "step": 1060
    },
    {
      "epoch": 9.726027397260275,
      "grad_norm": 0.439453125,
      "learning_rate": 3.203168337845508e-07,
      "loss": 0.8331,
      "step": 1065
    },
    {
      "epoch": 9.771689497716896,
      "grad_norm": 0.400390625,
      "learning_rate": 2.05042201422323e-07,
      "loss": 0.8458,
      "step": 1070
    },
    {
      "epoch": 9.817351598173516,
      "grad_norm": 0.41015625,
      "learning_rate": 1.1535349032167908e-07,
      "loss": 0.8444,
      "step": 1075
    },
    {
      "epoch": 9.863013698630137,
      "grad_norm": 0.400390625,
      "learning_rate": 5.127369531473525e-08,
      "loss": 0.8486,
      "step": 1080
    },
    {
      "epoch": 9.908675799086758,
      "grad_norm": 0.39453125,
      "learning_rate": 1.2819245493955744e-08,
      "loss": 0.8473,
      "step": 1085
    },
    {
      "epoch": 9.954337899543379,
      "grad_norm": 0.439453125,
      "learning_rate": 0.0,
      "loss": 0.8359,
      "step": 1090
    },
    {
      "epoch": 9.954337899543379,
      "eval_loss": 2.515676259994507,
      "eval_runtime": 0.2343,
      "eval_samples_per_second": 42.683,
      "eval_steps_per_second": 4.268,
      "step": 1090
    },
    {
      "epoch": 9.954337899543379,
      "step": 1090,
      "total_flos": 3.327732991202951e+18,
      "train_loss": 2.136770288659892,
      "train_runtime": 2636.8816,
      "train_samples_per_second": 26.554,
      "train_steps_per_second": 0.413
    }
  ],
  "logging_steps": 5,
  "max_steps": 1090,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "total_flos": 3.327732991202951e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}