{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9676393598144797, "eval_steps": 1000, "global_step": 28000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03513641713954428, "grad_norm": 0.8998513221740723, "learning_rate": 0.00019768068408105893, "loss": 1.0077, "step": 500 }, { "epoch": 0.07027283427908856, "grad_norm": 0.8432224988937378, "learning_rate": 0.00019533794072859319, "loss": 0.868, "step": 1000 }, { "epoch": 0.07027283427908856, "eval_loss": 0.848849356174469, "eval_runtime": 335.8512, "eval_samples_per_second": 37.663, "eval_steps_per_second": 4.71, "step": 1000 }, { "epoch": 0.10540925141863285, "grad_norm": 1.1127365827560425, "learning_rate": 0.00019299519737612746, "loss": 0.822, "step": 1500 }, { "epoch": 0.14054566855817713, "grad_norm": 0.7712583541870117, "learning_rate": 0.00019065245402366172, "loss": 0.7967, "step": 2000 }, { "epoch": 0.14054566855817713, "eval_loss": 0.7973089814186096, "eval_runtime": 336.112, "eval_samples_per_second": 37.633, "eval_steps_per_second": 4.707, "step": 2000 }, { "epoch": 0.1756820856977214, "grad_norm": 1.2161240577697754, "learning_rate": 0.000188309710671196, "loss": 0.7871, "step": 2500 }, { "epoch": 0.2108185028372657, "grad_norm": 0.806480348110199, "learning_rate": 0.00018596696731873024, "loss": 0.7669, "step": 3000 }, { "epoch": 0.2108185028372657, "eval_loss": 0.7698538303375244, "eval_runtime": 336.2265, "eval_samples_per_second": 37.62, "eval_steps_per_second": 4.705, "step": 3000 }, { "epoch": 0.24595491997680996, "grad_norm": 0.7688671350479126, "learning_rate": 0.0001836242239662645, "loss": 0.7486, "step": 3500 }, { "epoch": 0.28109133711635426, "grad_norm": 0.7088080644607544, "learning_rate": 0.00018128148061379877, "loss": 0.7548, "step": 4000 }, { "epoch": 0.28109133711635426, "eval_loss": 0.7532803416252136, "eval_runtime": 336.8838, "eval_samples_per_second": 37.547, "eval_steps_per_second": 4.696, "step": 4000 }, { "epoch": 0.31622775425589855, "grad_norm": 0.7568556070327759, "learning_rate": 0.00017893873726133302, "loss": 0.747, "step": 4500 }, { "epoch": 0.3513641713954428, "grad_norm": 0.7561419606208801, "learning_rate": 0.0001765959939088673, "loss": 0.7327, "step": 5000 }, { "epoch": 0.3513641713954428, "eval_loss": 0.7386600375175476, "eval_runtime": 335.7492, "eval_samples_per_second": 37.674, "eval_steps_per_second": 4.712, "step": 5000 }, { "epoch": 0.3865005885349871, "grad_norm": 0.829656720161438, "learning_rate": 0.00017425325055640155, "loss": 0.7327, "step": 5500 }, { "epoch": 0.4216370056745314, "grad_norm": 0.8936611413955688, "learning_rate": 0.00017191050720393583, "loss": 0.718, "step": 6000 }, { "epoch": 0.4216370056745314, "eval_loss": 0.7304050326347351, "eval_runtime": 336.6038, "eval_samples_per_second": 37.578, "eval_steps_per_second": 4.7, "step": 6000 }, { "epoch": 0.4567734228140756, "grad_norm": 0.79137122631073, "learning_rate": 0.00016956776385147008, "loss": 0.7208, "step": 6500 }, { "epoch": 0.4919098399536199, "grad_norm": 0.8772656321525574, "learning_rate": 0.00016722502049900433, "loss": 0.705, "step": 7000 }, { "epoch": 0.4919098399536199, "eval_loss": 0.7186248302459717, "eval_runtime": 336.6708, "eval_samples_per_second": 37.571, "eval_steps_per_second": 4.699, "step": 7000 }, { "epoch": 0.5270462570931642, "grad_norm": 0.8973077535629272, "learning_rate": 0.0001648822771465386, "loss": 0.7081, "step": 7500 }, { "epoch": 0.5621826742327085, "grad_norm": 0.7941250801086426, "learning_rate": 0.00016253953379407286, "loss": 0.7086, "step": 8000 }, { "epoch": 0.5621826742327085, "eval_loss": 0.7134420275688171, "eval_runtime": 335.4818, "eval_samples_per_second": 37.704, "eval_steps_per_second": 4.716, "step": 8000 }, { "epoch": 0.5973190913722528, "grad_norm": 0.5397359132766724, "learning_rate": 0.00016019679044160714, "loss": 0.7049, "step": 8500 }, { "epoch": 0.6324555085117971, "grad_norm": 1.6500648260116577, "learning_rate": 0.0001578540470891414, "loss": 0.7047, "step": 9000 }, { "epoch": 0.6324555085117971, "eval_loss": 0.7038806080818176, "eval_runtime": 335.4609, "eval_samples_per_second": 37.706, "eval_steps_per_second": 4.716, "step": 9000 }, { "epoch": 0.6675919256513413, "grad_norm": 1.0555411577224731, "learning_rate": 0.00015551130373667567, "loss": 0.7005, "step": 9500 }, { "epoch": 0.7027283427908856, "grad_norm": 0.8140186071395874, "learning_rate": 0.00015316856038420992, "loss": 0.6874, "step": 10000 }, { "epoch": 0.7027283427908856, "eval_loss": 0.7008971571922302, "eval_runtime": 335.6608, "eval_samples_per_second": 37.684, "eval_steps_per_second": 4.713, "step": 10000 }, { "epoch": 0.7378647599304299, "grad_norm": 1.4687775373458862, "learning_rate": 0.00015082581703174417, "loss": 0.6966, "step": 10500 }, { "epoch": 0.7730011770699742, "grad_norm": 0.6557592153549194, "learning_rate": 0.00014848307367927845, "loss": 0.7014, "step": 11000 }, { "epoch": 0.7730011770699742, "eval_loss": 0.6960607767105103, "eval_runtime": 335.7991, "eval_samples_per_second": 37.668, "eval_steps_per_second": 4.711, "step": 11000 }, { "epoch": 0.8081375942095185, "grad_norm": 0.9453054666519165, "learning_rate": 0.0001461403303268127, "loss": 0.6824, "step": 11500 }, { "epoch": 0.8432740113490628, "grad_norm": 0.6535905003547668, "learning_rate": 0.00014379758697434698, "loss": 0.6819, "step": 12000 }, { "epoch": 0.8432740113490628, "eval_loss": 0.6929821372032166, "eval_runtime": 336.6704, "eval_samples_per_second": 37.571, "eval_steps_per_second": 4.699, "step": 12000 }, { "epoch": 0.8784104284886071, "grad_norm": 1.1560312509536743, "learning_rate": 0.00014145484362188123, "loss": 0.685, "step": 12500 }, { "epoch": 0.9135468456281512, "grad_norm": 1.0653464794158936, "learning_rate": 0.0001391121002694155, "loss": 0.6852, "step": 13000 }, { "epoch": 0.9135468456281512, "eval_loss": 0.6863986253738403, "eval_runtime": 336.1833, "eval_samples_per_second": 37.625, "eval_steps_per_second": 4.706, "step": 13000 }, { "epoch": 0.9486832627676955, "grad_norm": 0.7839105129241943, "learning_rate": 0.00013676935691694976, "loss": 0.679, "step": 13500 }, { "epoch": 0.9838196799072398, "grad_norm": 0.9613277316093445, "learning_rate": 0.00013442661356448404, "loss": 0.6775, "step": 14000 }, { "epoch": 0.9838196799072398, "eval_loss": 0.6820084452629089, "eval_runtime": 336.2939, "eval_samples_per_second": 37.613, "eval_steps_per_second": 4.704, "step": 14000 }, { "epoch": 1.0189560970467841, "grad_norm": 0.835300624370575, "learning_rate": 0.0001320838702120183, "loss": 0.6315, "step": 14500 }, { "epoch": 1.0540925141863284, "grad_norm": 0.7565451264381409, "learning_rate": 0.00012974112685955254, "loss": 0.585, "step": 15000 }, { "epoch": 1.0540925141863284, "eval_loss": 0.6878482103347778, "eval_runtime": 336.2331, "eval_samples_per_second": 37.62, "eval_steps_per_second": 4.705, "step": 15000 }, { "epoch": 1.0892289313258727, "grad_norm": 0.9805488586425781, "learning_rate": 0.0001273983835070868, "loss": 0.5879, "step": 15500 }, { "epoch": 1.124365348465417, "grad_norm": 0.7680599093437195, "learning_rate": 0.00012505564015462105, "loss": 0.5899, "step": 16000 }, { "epoch": 1.124365348465417, "eval_loss": 0.6873815655708313, "eval_runtime": 335.1602, "eval_samples_per_second": 37.74, "eval_steps_per_second": 4.72, "step": 16000 }, { "epoch": 1.1595017656049613, "grad_norm": 0.6148345470428467, "learning_rate": 0.00012271289680215532, "loss": 0.587, "step": 16500 }, { "epoch": 1.1946381827445056, "grad_norm": 0.8125023245811462, "learning_rate": 0.00012037015344968959, "loss": 0.5807, "step": 17000 }, { "epoch": 1.1946381827445056, "eval_loss": 0.685107409954071, "eval_runtime": 336.124, "eval_samples_per_second": 37.632, "eval_steps_per_second": 4.707, "step": 17000 }, { "epoch": 1.22977459988405, "grad_norm": 0.8291782736778259, "learning_rate": 0.00011802741009722384, "loss": 0.5915, "step": 17500 }, { "epoch": 1.2649110170235942, "grad_norm": 1.478590726852417, "learning_rate": 0.0001156846667447581, "loss": 0.5843, "step": 18000 }, { "epoch": 1.2649110170235942, "eval_loss": 0.6858633160591125, "eval_runtime": 336.0404, "eval_samples_per_second": 37.641, "eval_steps_per_second": 4.708, "step": 18000 }, { "epoch": 1.3000474341631385, "grad_norm": 0.640578031539917, "learning_rate": 0.00011334192339229237, "loss": 0.5905, "step": 18500 }, { "epoch": 1.3351838513026828, "grad_norm": 1.0853077173233032, "learning_rate": 0.00011099918003982663, "loss": 0.5857, "step": 19000 }, { "epoch": 1.3351838513026828, "eval_loss": 0.6786811947822571, "eval_runtime": 336.4389, "eval_samples_per_second": 37.597, "eval_steps_per_second": 4.702, "step": 19000 }, { "epoch": 1.3703202684422269, "grad_norm": 0.8859909176826477, "learning_rate": 0.0001086564366873609, "loss": 0.5895, "step": 19500 }, { "epoch": 1.4054566855817712, "grad_norm": 0.9398015737533569, "learning_rate": 0.00010631369333489516, "loss": 0.5936, "step": 20000 }, { "epoch": 1.4054566855817712, "eval_loss": 0.6774466037750244, "eval_runtime": 336.3052, "eval_samples_per_second": 37.612, "eval_steps_per_second": 4.704, "step": 20000 }, { "epoch": 1.4405931027213155, "grad_norm": 0.786648154258728, "learning_rate": 0.00010397094998242943, "loss": 0.5861, "step": 20500 }, { "epoch": 1.4757295198608598, "grad_norm": 0.5784944891929626, "learning_rate": 0.00010162820662996368, "loss": 0.5927, "step": 21000 }, { "epoch": 1.4757295198608598, "eval_loss": 0.6745932102203369, "eval_runtime": 336.7619, "eval_samples_per_second": 37.561, "eval_steps_per_second": 4.698, "step": 21000 }, { "epoch": 1.510865937000404, "grad_norm": 0.9801135659217834, "learning_rate": 9.928546327749796e-05, "loss": 0.5867, "step": 21500 }, { "epoch": 1.5460023541399484, "grad_norm": 0.9332974553108215, "learning_rate": 9.694271992503222e-05, "loss": 0.5899, "step": 22000 }, { "epoch": 1.5460023541399484, "eval_loss": 0.6713685393333435, "eval_runtime": 336.924, "eval_samples_per_second": 37.543, "eval_steps_per_second": 4.695, "step": 22000 }, { "epoch": 1.5811387712794926, "grad_norm": 0.7248233556747437, "learning_rate": 9.459997657256649e-05, "loss": 0.5859, "step": 22500 }, { "epoch": 1.616275188419037, "grad_norm": 0.8635448217391968, "learning_rate": 9.225723322010075e-05, "loss": 0.5892, "step": 23000 }, { "epoch": 1.616275188419037, "eval_loss": 0.6714141368865967, "eval_runtime": 335.5916, "eval_samples_per_second": 37.692, "eval_steps_per_second": 4.714, "step": 23000 }, { "epoch": 1.6514116055585812, "grad_norm": 0.9144196510314941, "learning_rate": 8.991448986763502e-05, "loss": 0.5831, "step": 23500 }, { "epoch": 1.6865480226981253, "grad_norm": 1.1940064430236816, "learning_rate": 8.757174651516927e-05, "loss": 0.5862, "step": 24000 }, { "epoch": 1.6865480226981253, "eval_loss": 0.6678937673568726, "eval_runtime": 336.1373, "eval_samples_per_second": 37.63, "eval_steps_per_second": 4.706, "step": 24000 }, { "epoch": 1.7216844398376696, "grad_norm": 0.9592931270599365, "learning_rate": 8.522900316270352e-05, "loss": 0.5829, "step": 24500 }, { "epoch": 1.756820856977214, "grad_norm": 0.835950493812561, "learning_rate": 8.288625981023778e-05, "loss": 0.5854, "step": 25000 }, { "epoch": 1.756820856977214, "eval_loss": 0.6643121242523193, "eval_runtime": 336.852, "eval_samples_per_second": 37.551, "eval_steps_per_second": 4.696, "step": 25000 }, { "epoch": 1.7919572741167582, "grad_norm": 0.842988908290863, "learning_rate": 8.054351645777205e-05, "loss": 0.5819, "step": 25500 }, { "epoch": 1.8270936912563025, "grad_norm": 0.7810894846916199, "learning_rate": 7.820077310530631e-05, "loss": 0.5768, "step": 26000 }, { "epoch": 1.8270936912563025, "eval_loss": 0.6629989147186279, "eval_runtime": 336.525, "eval_samples_per_second": 37.587, "eval_steps_per_second": 4.701, "step": 26000 }, { "epoch": 1.8622301083958468, "grad_norm": 1.061445951461792, "learning_rate": 7.585802975284058e-05, "loss": 0.5762, "step": 26500 }, { "epoch": 1.897366525535391, "grad_norm": 0.8997741937637329, "learning_rate": 7.351528640037484e-05, "loss": 0.5772, "step": 27000 }, { "epoch": 1.897366525535391, "eval_loss": 0.660283625125885, "eval_runtime": 336.3211, "eval_samples_per_second": 37.61, "eval_steps_per_second": 4.704, "step": 27000 }, { "epoch": 1.9325029426749354, "grad_norm": 0.9401558637619019, "learning_rate": 7.11725430479091e-05, "loss": 0.581, "step": 27500 }, { "epoch": 1.9676393598144797, "grad_norm": 0.8311142921447754, "learning_rate": 6.882979969544337e-05, "loss": 0.5756, "step": 28000 }, { "epoch": 1.9676393598144797, "eval_loss": 0.6574224233627319, "eval_runtime": 336.1686, "eval_samples_per_second": 37.627, "eval_steps_per_second": 4.706, "step": 28000 } ], "logging_steps": 500, "max_steps": 42690, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.30815596860842e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }