|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9676393598144797, |
|
"eval_steps": 1000, |
|
"global_step": 28000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03513641713954428, |
|
"grad_norm": 0.8998513221740723, |
|
"learning_rate": 0.00019768068408105893, |
|
"loss": 1.0077, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.07027283427908856, |
|
"grad_norm": 0.8432224988937378, |
|
"learning_rate": 0.00019533794072859319, |
|
"loss": 0.868, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07027283427908856, |
|
"eval_loss": 0.848849356174469, |
|
"eval_runtime": 335.8512, |
|
"eval_samples_per_second": 37.663, |
|
"eval_steps_per_second": 4.71, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10540925141863285, |
|
"grad_norm": 1.1127365827560425, |
|
"learning_rate": 0.00019299519737612746, |
|
"loss": 0.822, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.14054566855817713, |
|
"grad_norm": 0.7712583541870117, |
|
"learning_rate": 0.00019065245402366172, |
|
"loss": 0.7967, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14054566855817713, |
|
"eval_loss": 0.7973089814186096, |
|
"eval_runtime": 336.112, |
|
"eval_samples_per_second": 37.633, |
|
"eval_steps_per_second": 4.707, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1756820856977214, |
|
"grad_norm": 1.2161240577697754, |
|
"learning_rate": 0.000188309710671196, |
|
"loss": 0.7871, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.2108185028372657, |
|
"grad_norm": 0.806480348110199, |
|
"learning_rate": 0.00018596696731873024, |
|
"loss": 0.7669, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.2108185028372657, |
|
"eval_loss": 0.7698538303375244, |
|
"eval_runtime": 336.2265, |
|
"eval_samples_per_second": 37.62, |
|
"eval_steps_per_second": 4.705, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.24595491997680996, |
|
"grad_norm": 0.7688671350479126, |
|
"learning_rate": 0.0001836242239662645, |
|
"loss": 0.7486, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.28109133711635426, |
|
"grad_norm": 0.7088080644607544, |
|
"learning_rate": 0.00018128148061379877, |
|
"loss": 0.7548, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.28109133711635426, |
|
"eval_loss": 0.7532803416252136, |
|
"eval_runtime": 336.8838, |
|
"eval_samples_per_second": 37.547, |
|
"eval_steps_per_second": 4.696, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.31622775425589855, |
|
"grad_norm": 0.7568556070327759, |
|
"learning_rate": 0.00017893873726133302, |
|
"loss": 0.747, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.3513641713954428, |
|
"grad_norm": 0.7561419606208801, |
|
"learning_rate": 0.0001765959939088673, |
|
"loss": 0.7327, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3513641713954428, |
|
"eval_loss": 0.7386600375175476, |
|
"eval_runtime": 335.7492, |
|
"eval_samples_per_second": 37.674, |
|
"eval_steps_per_second": 4.712, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.3865005885349871, |
|
"grad_norm": 0.829656720161438, |
|
"learning_rate": 0.00017425325055640155, |
|
"loss": 0.7327, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.4216370056745314, |
|
"grad_norm": 0.8936611413955688, |
|
"learning_rate": 0.00017191050720393583, |
|
"loss": 0.718, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4216370056745314, |
|
"eval_loss": 0.7304050326347351, |
|
"eval_runtime": 336.6038, |
|
"eval_samples_per_second": 37.578, |
|
"eval_steps_per_second": 4.7, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4567734228140756, |
|
"grad_norm": 0.79137122631073, |
|
"learning_rate": 0.00016956776385147008, |
|
"loss": 0.7208, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.4919098399536199, |
|
"grad_norm": 0.8772656321525574, |
|
"learning_rate": 0.00016722502049900433, |
|
"loss": 0.705, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.4919098399536199, |
|
"eval_loss": 0.7186248302459717, |
|
"eval_runtime": 336.6708, |
|
"eval_samples_per_second": 37.571, |
|
"eval_steps_per_second": 4.699, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.5270462570931642, |
|
"grad_norm": 0.8973077535629272, |
|
"learning_rate": 0.0001648822771465386, |
|
"loss": 0.7081, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.5621826742327085, |
|
"grad_norm": 0.7941250801086426, |
|
"learning_rate": 0.00016253953379407286, |
|
"loss": 0.7086, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.5621826742327085, |
|
"eval_loss": 0.7134420275688171, |
|
"eval_runtime": 335.4818, |
|
"eval_samples_per_second": 37.704, |
|
"eval_steps_per_second": 4.716, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.5973190913722528, |
|
"grad_norm": 0.5397359132766724, |
|
"learning_rate": 0.00016019679044160714, |
|
"loss": 0.7049, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.6324555085117971, |
|
"grad_norm": 1.6500648260116577, |
|
"learning_rate": 0.0001578540470891414, |
|
"loss": 0.7047, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6324555085117971, |
|
"eval_loss": 0.7038806080818176, |
|
"eval_runtime": 335.4609, |
|
"eval_samples_per_second": 37.706, |
|
"eval_steps_per_second": 4.716, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.6675919256513413, |
|
"grad_norm": 1.0555411577224731, |
|
"learning_rate": 0.00015551130373667567, |
|
"loss": 0.7005, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.7027283427908856, |
|
"grad_norm": 0.8140186071395874, |
|
"learning_rate": 0.00015316856038420992, |
|
"loss": 0.6874, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7027283427908856, |
|
"eval_loss": 0.7008971571922302, |
|
"eval_runtime": 335.6608, |
|
"eval_samples_per_second": 37.684, |
|
"eval_steps_per_second": 4.713, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7378647599304299, |
|
"grad_norm": 1.4687775373458862, |
|
"learning_rate": 0.00015082581703174417, |
|
"loss": 0.6966, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.7730011770699742, |
|
"grad_norm": 0.6557592153549194, |
|
"learning_rate": 0.00014848307367927845, |
|
"loss": 0.7014, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.7730011770699742, |
|
"eval_loss": 0.6960607767105103, |
|
"eval_runtime": 335.7991, |
|
"eval_samples_per_second": 37.668, |
|
"eval_steps_per_second": 4.711, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8081375942095185, |
|
"grad_norm": 0.9453054666519165, |
|
"learning_rate": 0.0001461403303268127, |
|
"loss": 0.6824, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.8432740113490628, |
|
"grad_norm": 0.6535905003547668, |
|
"learning_rate": 0.00014379758697434698, |
|
"loss": 0.6819, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.8432740113490628, |
|
"eval_loss": 0.6929821372032166, |
|
"eval_runtime": 336.6704, |
|
"eval_samples_per_second": 37.571, |
|
"eval_steps_per_second": 4.699, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.8784104284886071, |
|
"grad_norm": 1.1560312509536743, |
|
"learning_rate": 0.00014145484362188123, |
|
"loss": 0.685, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.9135468456281512, |
|
"grad_norm": 1.0653464794158936, |
|
"learning_rate": 0.0001391121002694155, |
|
"loss": 0.6852, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9135468456281512, |
|
"eval_loss": 0.6863986253738403, |
|
"eval_runtime": 336.1833, |
|
"eval_samples_per_second": 37.625, |
|
"eval_steps_per_second": 4.706, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9486832627676955, |
|
"grad_norm": 0.7839105129241943, |
|
"learning_rate": 0.00013676935691694976, |
|
"loss": 0.679, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.9838196799072398, |
|
"grad_norm": 0.9613277316093445, |
|
"learning_rate": 0.00013442661356448404, |
|
"loss": 0.6775, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.9838196799072398, |
|
"eval_loss": 0.6820084452629089, |
|
"eval_runtime": 336.2939, |
|
"eval_samples_per_second": 37.613, |
|
"eval_steps_per_second": 4.704, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.0189560970467841, |
|
"grad_norm": 0.835300624370575, |
|
"learning_rate": 0.0001320838702120183, |
|
"loss": 0.6315, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.0540925141863284, |
|
"grad_norm": 0.7565451264381409, |
|
"learning_rate": 0.00012974112685955254, |
|
"loss": 0.585, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.0540925141863284, |
|
"eval_loss": 0.6878482103347778, |
|
"eval_runtime": 336.2331, |
|
"eval_samples_per_second": 37.62, |
|
"eval_steps_per_second": 4.705, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.0892289313258727, |
|
"grad_norm": 0.9805488586425781, |
|
"learning_rate": 0.0001273983835070868, |
|
"loss": 0.5879, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.124365348465417, |
|
"grad_norm": 0.7680599093437195, |
|
"learning_rate": 0.00012505564015462105, |
|
"loss": 0.5899, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.124365348465417, |
|
"eval_loss": 0.6873815655708313, |
|
"eval_runtime": 335.1602, |
|
"eval_samples_per_second": 37.74, |
|
"eval_steps_per_second": 4.72, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.1595017656049613, |
|
"grad_norm": 0.6148345470428467, |
|
"learning_rate": 0.00012271289680215532, |
|
"loss": 0.587, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.1946381827445056, |
|
"grad_norm": 0.8125023245811462, |
|
"learning_rate": 0.00012037015344968959, |
|
"loss": 0.5807, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.1946381827445056, |
|
"eval_loss": 0.685107409954071, |
|
"eval_runtime": 336.124, |
|
"eval_samples_per_second": 37.632, |
|
"eval_steps_per_second": 4.707, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.22977459988405, |
|
"grad_norm": 0.8291782736778259, |
|
"learning_rate": 0.00011802741009722384, |
|
"loss": 0.5915, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.2649110170235942, |
|
"grad_norm": 1.478590726852417, |
|
"learning_rate": 0.0001156846667447581, |
|
"loss": 0.5843, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.2649110170235942, |
|
"eval_loss": 0.6858633160591125, |
|
"eval_runtime": 336.0404, |
|
"eval_samples_per_second": 37.641, |
|
"eval_steps_per_second": 4.708, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.3000474341631385, |
|
"grad_norm": 0.640578031539917, |
|
"learning_rate": 0.00011334192339229237, |
|
"loss": 0.5905, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.3351838513026828, |
|
"grad_norm": 1.0853077173233032, |
|
"learning_rate": 0.00011099918003982663, |
|
"loss": 0.5857, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.3351838513026828, |
|
"eval_loss": 0.6786811947822571, |
|
"eval_runtime": 336.4389, |
|
"eval_samples_per_second": 37.597, |
|
"eval_steps_per_second": 4.702, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.3703202684422269, |
|
"grad_norm": 0.8859909176826477, |
|
"learning_rate": 0.0001086564366873609, |
|
"loss": 0.5895, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.4054566855817712, |
|
"grad_norm": 0.9398015737533569, |
|
"learning_rate": 0.00010631369333489516, |
|
"loss": 0.5936, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.4054566855817712, |
|
"eval_loss": 0.6774466037750244, |
|
"eval_runtime": 336.3052, |
|
"eval_samples_per_second": 37.612, |
|
"eval_steps_per_second": 4.704, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.4405931027213155, |
|
"grad_norm": 0.786648154258728, |
|
"learning_rate": 0.00010397094998242943, |
|
"loss": 0.5861, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.4757295198608598, |
|
"grad_norm": 0.5784944891929626, |
|
"learning_rate": 0.00010162820662996368, |
|
"loss": 0.5927, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.4757295198608598, |
|
"eval_loss": 0.6745932102203369, |
|
"eval_runtime": 336.7619, |
|
"eval_samples_per_second": 37.561, |
|
"eval_steps_per_second": 4.698, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.510865937000404, |
|
"grad_norm": 0.9801135659217834, |
|
"learning_rate": 9.928546327749796e-05, |
|
"loss": 0.5867, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.5460023541399484, |
|
"grad_norm": 0.9332974553108215, |
|
"learning_rate": 9.694271992503222e-05, |
|
"loss": 0.5899, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.5460023541399484, |
|
"eval_loss": 0.6713685393333435, |
|
"eval_runtime": 336.924, |
|
"eval_samples_per_second": 37.543, |
|
"eval_steps_per_second": 4.695, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.5811387712794926, |
|
"grad_norm": 0.7248233556747437, |
|
"learning_rate": 9.459997657256649e-05, |
|
"loss": 0.5859, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.616275188419037, |
|
"grad_norm": 0.8635448217391968, |
|
"learning_rate": 9.225723322010075e-05, |
|
"loss": 0.5892, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.616275188419037, |
|
"eval_loss": 0.6714141368865967, |
|
"eval_runtime": 335.5916, |
|
"eval_samples_per_second": 37.692, |
|
"eval_steps_per_second": 4.714, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.6514116055585812, |
|
"grad_norm": 0.9144196510314941, |
|
"learning_rate": 8.991448986763502e-05, |
|
"loss": 0.5831, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.6865480226981253, |
|
"grad_norm": 1.1940064430236816, |
|
"learning_rate": 8.757174651516927e-05, |
|
"loss": 0.5862, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.6865480226981253, |
|
"eval_loss": 0.6678937673568726, |
|
"eval_runtime": 336.1373, |
|
"eval_samples_per_second": 37.63, |
|
"eval_steps_per_second": 4.706, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.7216844398376696, |
|
"grad_norm": 0.9592931270599365, |
|
"learning_rate": 8.522900316270352e-05, |
|
"loss": 0.5829, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.756820856977214, |
|
"grad_norm": 0.835950493812561, |
|
"learning_rate": 8.288625981023778e-05, |
|
"loss": 0.5854, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.756820856977214, |
|
"eval_loss": 0.6643121242523193, |
|
"eval_runtime": 336.852, |
|
"eval_samples_per_second": 37.551, |
|
"eval_steps_per_second": 4.696, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.7919572741167582, |
|
"grad_norm": 0.842988908290863, |
|
"learning_rate": 8.054351645777205e-05, |
|
"loss": 0.5819, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.8270936912563025, |
|
"grad_norm": 0.7810894846916199, |
|
"learning_rate": 7.820077310530631e-05, |
|
"loss": 0.5768, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.8270936912563025, |
|
"eval_loss": 0.6629989147186279, |
|
"eval_runtime": 336.525, |
|
"eval_samples_per_second": 37.587, |
|
"eval_steps_per_second": 4.701, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.8622301083958468, |
|
"grad_norm": 1.061445951461792, |
|
"learning_rate": 7.585802975284058e-05, |
|
"loss": 0.5762, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.897366525535391, |
|
"grad_norm": 0.8997741937637329, |
|
"learning_rate": 7.351528640037484e-05, |
|
"loss": 0.5772, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.897366525535391, |
|
"eval_loss": 0.660283625125885, |
|
"eval_runtime": 336.3211, |
|
"eval_samples_per_second": 37.61, |
|
"eval_steps_per_second": 4.704, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.9325029426749354, |
|
"grad_norm": 0.9401558637619019, |
|
"learning_rate": 7.11725430479091e-05, |
|
"loss": 0.581, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.9676393598144797, |
|
"grad_norm": 0.8311142921447754, |
|
"learning_rate": 6.882979969544337e-05, |
|
"loss": 0.5756, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.9676393598144797, |
|
"eval_loss": 0.6574224233627319, |
|
"eval_runtime": 336.1686, |
|
"eval_samples_per_second": 37.627, |
|
"eval_steps_per_second": 4.706, |
|
"step": 28000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 42690, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.30815596860842e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|