llama3.1_translate_8B / trainer_state.json
barbaroo's picture
Upload 11 files
4274fc9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9676393598144797,
"eval_steps": 1000,
"global_step": 28000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03513641713954428,
"grad_norm": 0.8998513221740723,
"learning_rate": 0.00019768068408105893,
"loss": 1.0077,
"step": 500
},
{
"epoch": 0.07027283427908856,
"grad_norm": 0.8432224988937378,
"learning_rate": 0.00019533794072859319,
"loss": 0.868,
"step": 1000
},
{
"epoch": 0.07027283427908856,
"eval_loss": 0.848849356174469,
"eval_runtime": 335.8512,
"eval_samples_per_second": 37.663,
"eval_steps_per_second": 4.71,
"step": 1000
},
{
"epoch": 0.10540925141863285,
"grad_norm": 1.1127365827560425,
"learning_rate": 0.00019299519737612746,
"loss": 0.822,
"step": 1500
},
{
"epoch": 0.14054566855817713,
"grad_norm": 0.7712583541870117,
"learning_rate": 0.00019065245402366172,
"loss": 0.7967,
"step": 2000
},
{
"epoch": 0.14054566855817713,
"eval_loss": 0.7973089814186096,
"eval_runtime": 336.112,
"eval_samples_per_second": 37.633,
"eval_steps_per_second": 4.707,
"step": 2000
},
{
"epoch": 0.1756820856977214,
"grad_norm": 1.2161240577697754,
"learning_rate": 0.000188309710671196,
"loss": 0.7871,
"step": 2500
},
{
"epoch": 0.2108185028372657,
"grad_norm": 0.806480348110199,
"learning_rate": 0.00018596696731873024,
"loss": 0.7669,
"step": 3000
},
{
"epoch": 0.2108185028372657,
"eval_loss": 0.7698538303375244,
"eval_runtime": 336.2265,
"eval_samples_per_second": 37.62,
"eval_steps_per_second": 4.705,
"step": 3000
},
{
"epoch": 0.24595491997680996,
"grad_norm": 0.7688671350479126,
"learning_rate": 0.0001836242239662645,
"loss": 0.7486,
"step": 3500
},
{
"epoch": 0.28109133711635426,
"grad_norm": 0.7088080644607544,
"learning_rate": 0.00018128148061379877,
"loss": 0.7548,
"step": 4000
},
{
"epoch": 0.28109133711635426,
"eval_loss": 0.7532803416252136,
"eval_runtime": 336.8838,
"eval_samples_per_second": 37.547,
"eval_steps_per_second": 4.696,
"step": 4000
},
{
"epoch": 0.31622775425589855,
"grad_norm": 0.7568556070327759,
"learning_rate": 0.00017893873726133302,
"loss": 0.747,
"step": 4500
},
{
"epoch": 0.3513641713954428,
"grad_norm": 0.7561419606208801,
"learning_rate": 0.0001765959939088673,
"loss": 0.7327,
"step": 5000
},
{
"epoch": 0.3513641713954428,
"eval_loss": 0.7386600375175476,
"eval_runtime": 335.7492,
"eval_samples_per_second": 37.674,
"eval_steps_per_second": 4.712,
"step": 5000
},
{
"epoch": 0.3865005885349871,
"grad_norm": 0.829656720161438,
"learning_rate": 0.00017425325055640155,
"loss": 0.7327,
"step": 5500
},
{
"epoch": 0.4216370056745314,
"grad_norm": 0.8936611413955688,
"learning_rate": 0.00017191050720393583,
"loss": 0.718,
"step": 6000
},
{
"epoch": 0.4216370056745314,
"eval_loss": 0.7304050326347351,
"eval_runtime": 336.6038,
"eval_samples_per_second": 37.578,
"eval_steps_per_second": 4.7,
"step": 6000
},
{
"epoch": 0.4567734228140756,
"grad_norm": 0.79137122631073,
"learning_rate": 0.00016956776385147008,
"loss": 0.7208,
"step": 6500
},
{
"epoch": 0.4919098399536199,
"grad_norm": 0.8772656321525574,
"learning_rate": 0.00016722502049900433,
"loss": 0.705,
"step": 7000
},
{
"epoch": 0.4919098399536199,
"eval_loss": 0.7186248302459717,
"eval_runtime": 336.6708,
"eval_samples_per_second": 37.571,
"eval_steps_per_second": 4.699,
"step": 7000
},
{
"epoch": 0.5270462570931642,
"grad_norm": 0.8973077535629272,
"learning_rate": 0.0001648822771465386,
"loss": 0.7081,
"step": 7500
},
{
"epoch": 0.5621826742327085,
"grad_norm": 0.7941250801086426,
"learning_rate": 0.00016253953379407286,
"loss": 0.7086,
"step": 8000
},
{
"epoch": 0.5621826742327085,
"eval_loss": 0.7134420275688171,
"eval_runtime": 335.4818,
"eval_samples_per_second": 37.704,
"eval_steps_per_second": 4.716,
"step": 8000
},
{
"epoch": 0.5973190913722528,
"grad_norm": 0.5397359132766724,
"learning_rate": 0.00016019679044160714,
"loss": 0.7049,
"step": 8500
},
{
"epoch": 0.6324555085117971,
"grad_norm": 1.6500648260116577,
"learning_rate": 0.0001578540470891414,
"loss": 0.7047,
"step": 9000
},
{
"epoch": 0.6324555085117971,
"eval_loss": 0.7038806080818176,
"eval_runtime": 335.4609,
"eval_samples_per_second": 37.706,
"eval_steps_per_second": 4.716,
"step": 9000
},
{
"epoch": 0.6675919256513413,
"grad_norm": 1.0555411577224731,
"learning_rate": 0.00015551130373667567,
"loss": 0.7005,
"step": 9500
},
{
"epoch": 0.7027283427908856,
"grad_norm": 0.8140186071395874,
"learning_rate": 0.00015316856038420992,
"loss": 0.6874,
"step": 10000
},
{
"epoch": 0.7027283427908856,
"eval_loss": 0.7008971571922302,
"eval_runtime": 335.6608,
"eval_samples_per_second": 37.684,
"eval_steps_per_second": 4.713,
"step": 10000
},
{
"epoch": 0.7378647599304299,
"grad_norm": 1.4687775373458862,
"learning_rate": 0.00015082581703174417,
"loss": 0.6966,
"step": 10500
},
{
"epoch": 0.7730011770699742,
"grad_norm": 0.6557592153549194,
"learning_rate": 0.00014848307367927845,
"loss": 0.7014,
"step": 11000
},
{
"epoch": 0.7730011770699742,
"eval_loss": 0.6960607767105103,
"eval_runtime": 335.7991,
"eval_samples_per_second": 37.668,
"eval_steps_per_second": 4.711,
"step": 11000
},
{
"epoch": 0.8081375942095185,
"grad_norm": 0.9453054666519165,
"learning_rate": 0.0001461403303268127,
"loss": 0.6824,
"step": 11500
},
{
"epoch": 0.8432740113490628,
"grad_norm": 0.6535905003547668,
"learning_rate": 0.00014379758697434698,
"loss": 0.6819,
"step": 12000
},
{
"epoch": 0.8432740113490628,
"eval_loss": 0.6929821372032166,
"eval_runtime": 336.6704,
"eval_samples_per_second": 37.571,
"eval_steps_per_second": 4.699,
"step": 12000
},
{
"epoch": 0.8784104284886071,
"grad_norm": 1.1560312509536743,
"learning_rate": 0.00014145484362188123,
"loss": 0.685,
"step": 12500
},
{
"epoch": 0.9135468456281512,
"grad_norm": 1.0653464794158936,
"learning_rate": 0.0001391121002694155,
"loss": 0.6852,
"step": 13000
},
{
"epoch": 0.9135468456281512,
"eval_loss": 0.6863986253738403,
"eval_runtime": 336.1833,
"eval_samples_per_second": 37.625,
"eval_steps_per_second": 4.706,
"step": 13000
},
{
"epoch": 0.9486832627676955,
"grad_norm": 0.7839105129241943,
"learning_rate": 0.00013676935691694976,
"loss": 0.679,
"step": 13500
},
{
"epoch": 0.9838196799072398,
"grad_norm": 0.9613277316093445,
"learning_rate": 0.00013442661356448404,
"loss": 0.6775,
"step": 14000
},
{
"epoch": 0.9838196799072398,
"eval_loss": 0.6820084452629089,
"eval_runtime": 336.2939,
"eval_samples_per_second": 37.613,
"eval_steps_per_second": 4.704,
"step": 14000
},
{
"epoch": 1.0189560970467841,
"grad_norm": 0.835300624370575,
"learning_rate": 0.0001320838702120183,
"loss": 0.6315,
"step": 14500
},
{
"epoch": 1.0540925141863284,
"grad_norm": 0.7565451264381409,
"learning_rate": 0.00012974112685955254,
"loss": 0.585,
"step": 15000
},
{
"epoch": 1.0540925141863284,
"eval_loss": 0.6878482103347778,
"eval_runtime": 336.2331,
"eval_samples_per_second": 37.62,
"eval_steps_per_second": 4.705,
"step": 15000
},
{
"epoch": 1.0892289313258727,
"grad_norm": 0.9805488586425781,
"learning_rate": 0.0001273983835070868,
"loss": 0.5879,
"step": 15500
},
{
"epoch": 1.124365348465417,
"grad_norm": 0.7680599093437195,
"learning_rate": 0.00012505564015462105,
"loss": 0.5899,
"step": 16000
},
{
"epoch": 1.124365348465417,
"eval_loss": 0.6873815655708313,
"eval_runtime": 335.1602,
"eval_samples_per_second": 37.74,
"eval_steps_per_second": 4.72,
"step": 16000
},
{
"epoch": 1.1595017656049613,
"grad_norm": 0.6148345470428467,
"learning_rate": 0.00012271289680215532,
"loss": 0.587,
"step": 16500
},
{
"epoch": 1.1946381827445056,
"grad_norm": 0.8125023245811462,
"learning_rate": 0.00012037015344968959,
"loss": 0.5807,
"step": 17000
},
{
"epoch": 1.1946381827445056,
"eval_loss": 0.685107409954071,
"eval_runtime": 336.124,
"eval_samples_per_second": 37.632,
"eval_steps_per_second": 4.707,
"step": 17000
},
{
"epoch": 1.22977459988405,
"grad_norm": 0.8291782736778259,
"learning_rate": 0.00011802741009722384,
"loss": 0.5915,
"step": 17500
},
{
"epoch": 1.2649110170235942,
"grad_norm": 1.478590726852417,
"learning_rate": 0.0001156846667447581,
"loss": 0.5843,
"step": 18000
},
{
"epoch": 1.2649110170235942,
"eval_loss": 0.6858633160591125,
"eval_runtime": 336.0404,
"eval_samples_per_second": 37.641,
"eval_steps_per_second": 4.708,
"step": 18000
},
{
"epoch": 1.3000474341631385,
"grad_norm": 0.640578031539917,
"learning_rate": 0.00011334192339229237,
"loss": 0.5905,
"step": 18500
},
{
"epoch": 1.3351838513026828,
"grad_norm": 1.0853077173233032,
"learning_rate": 0.00011099918003982663,
"loss": 0.5857,
"step": 19000
},
{
"epoch": 1.3351838513026828,
"eval_loss": 0.6786811947822571,
"eval_runtime": 336.4389,
"eval_samples_per_second": 37.597,
"eval_steps_per_second": 4.702,
"step": 19000
},
{
"epoch": 1.3703202684422269,
"grad_norm": 0.8859909176826477,
"learning_rate": 0.0001086564366873609,
"loss": 0.5895,
"step": 19500
},
{
"epoch": 1.4054566855817712,
"grad_norm": 0.9398015737533569,
"learning_rate": 0.00010631369333489516,
"loss": 0.5936,
"step": 20000
},
{
"epoch": 1.4054566855817712,
"eval_loss": 0.6774466037750244,
"eval_runtime": 336.3052,
"eval_samples_per_second": 37.612,
"eval_steps_per_second": 4.704,
"step": 20000
},
{
"epoch": 1.4405931027213155,
"grad_norm": 0.786648154258728,
"learning_rate": 0.00010397094998242943,
"loss": 0.5861,
"step": 20500
},
{
"epoch": 1.4757295198608598,
"grad_norm": 0.5784944891929626,
"learning_rate": 0.00010162820662996368,
"loss": 0.5927,
"step": 21000
},
{
"epoch": 1.4757295198608598,
"eval_loss": 0.6745932102203369,
"eval_runtime": 336.7619,
"eval_samples_per_second": 37.561,
"eval_steps_per_second": 4.698,
"step": 21000
},
{
"epoch": 1.510865937000404,
"grad_norm": 0.9801135659217834,
"learning_rate": 9.928546327749796e-05,
"loss": 0.5867,
"step": 21500
},
{
"epoch": 1.5460023541399484,
"grad_norm": 0.9332974553108215,
"learning_rate": 9.694271992503222e-05,
"loss": 0.5899,
"step": 22000
},
{
"epoch": 1.5460023541399484,
"eval_loss": 0.6713685393333435,
"eval_runtime": 336.924,
"eval_samples_per_second": 37.543,
"eval_steps_per_second": 4.695,
"step": 22000
},
{
"epoch": 1.5811387712794926,
"grad_norm": 0.7248233556747437,
"learning_rate": 9.459997657256649e-05,
"loss": 0.5859,
"step": 22500
},
{
"epoch": 1.616275188419037,
"grad_norm": 0.8635448217391968,
"learning_rate": 9.225723322010075e-05,
"loss": 0.5892,
"step": 23000
},
{
"epoch": 1.616275188419037,
"eval_loss": 0.6714141368865967,
"eval_runtime": 335.5916,
"eval_samples_per_second": 37.692,
"eval_steps_per_second": 4.714,
"step": 23000
},
{
"epoch": 1.6514116055585812,
"grad_norm": 0.9144196510314941,
"learning_rate": 8.991448986763502e-05,
"loss": 0.5831,
"step": 23500
},
{
"epoch": 1.6865480226981253,
"grad_norm": 1.1940064430236816,
"learning_rate": 8.757174651516927e-05,
"loss": 0.5862,
"step": 24000
},
{
"epoch": 1.6865480226981253,
"eval_loss": 0.6678937673568726,
"eval_runtime": 336.1373,
"eval_samples_per_second": 37.63,
"eval_steps_per_second": 4.706,
"step": 24000
},
{
"epoch": 1.7216844398376696,
"grad_norm": 0.9592931270599365,
"learning_rate": 8.522900316270352e-05,
"loss": 0.5829,
"step": 24500
},
{
"epoch": 1.756820856977214,
"grad_norm": 0.835950493812561,
"learning_rate": 8.288625981023778e-05,
"loss": 0.5854,
"step": 25000
},
{
"epoch": 1.756820856977214,
"eval_loss": 0.6643121242523193,
"eval_runtime": 336.852,
"eval_samples_per_second": 37.551,
"eval_steps_per_second": 4.696,
"step": 25000
},
{
"epoch": 1.7919572741167582,
"grad_norm": 0.842988908290863,
"learning_rate": 8.054351645777205e-05,
"loss": 0.5819,
"step": 25500
},
{
"epoch": 1.8270936912563025,
"grad_norm": 0.7810894846916199,
"learning_rate": 7.820077310530631e-05,
"loss": 0.5768,
"step": 26000
},
{
"epoch": 1.8270936912563025,
"eval_loss": 0.6629989147186279,
"eval_runtime": 336.525,
"eval_samples_per_second": 37.587,
"eval_steps_per_second": 4.701,
"step": 26000
},
{
"epoch": 1.8622301083958468,
"grad_norm": 1.061445951461792,
"learning_rate": 7.585802975284058e-05,
"loss": 0.5762,
"step": 26500
},
{
"epoch": 1.897366525535391,
"grad_norm": 0.8997741937637329,
"learning_rate": 7.351528640037484e-05,
"loss": 0.5772,
"step": 27000
},
{
"epoch": 1.897366525535391,
"eval_loss": 0.660283625125885,
"eval_runtime": 336.3211,
"eval_samples_per_second": 37.61,
"eval_steps_per_second": 4.704,
"step": 27000
},
{
"epoch": 1.9325029426749354,
"grad_norm": 0.9401558637619019,
"learning_rate": 7.11725430479091e-05,
"loss": 0.581,
"step": 27500
},
{
"epoch": 1.9676393598144797,
"grad_norm": 0.8311142921447754,
"learning_rate": 6.882979969544337e-05,
"loss": 0.5756,
"step": 28000
},
{
"epoch": 1.9676393598144797,
"eval_loss": 0.6574224233627319,
"eval_runtime": 336.1686,
"eval_samples_per_second": 37.627,
"eval_steps_per_second": 4.706,
"step": 28000
}
],
"logging_steps": 500,
"max_steps": 42690,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.30815596860842e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}