{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0033519553072625, "eval_steps": 14, "global_step": 56, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017877094972067038, "grad_norm": 0.008051837794482708, "learning_rate": 1e-05, "loss": 11.9287, "step": 1 }, { "epoch": 0.017877094972067038, "eval_loss": 11.928812026977539, "eval_runtime": 17.7636, "eval_samples_per_second": 5.348, "eval_steps_per_second": 2.702, "step": 1 }, { "epoch": 0.035754189944134075, "grad_norm": 0.008926309645175934, "learning_rate": 2e-05, "loss": 11.9296, "step": 2 }, { "epoch": 0.053631284916201116, "grad_norm": 0.00831968616694212, "learning_rate": 3e-05, "loss": 11.9287, "step": 3 }, { "epoch": 0.07150837988826815, "grad_norm": 0.007635825779289007, "learning_rate": 4e-05, "loss": 11.9297, "step": 4 }, { "epoch": 0.0893854748603352, "grad_norm": 0.006136827636510134, "learning_rate": 5e-05, "loss": 11.9308, "step": 5 }, { "epoch": 0.10726256983240223, "grad_norm": 0.008952487260103226, "learning_rate": 6e-05, "loss": 11.9298, "step": 6 }, { "epoch": 0.12513966480446928, "grad_norm": 0.008229999803006649, "learning_rate": 7e-05, "loss": 11.9316, "step": 7 }, { "epoch": 0.1430167597765363, "grad_norm": 0.008337481878697872, "learning_rate": 8e-05, "loss": 11.9323, "step": 8 }, { "epoch": 0.16089385474860335, "grad_norm": 0.0077857039868831635, "learning_rate": 9e-05, "loss": 11.9287, "step": 9 }, { "epoch": 0.1787709497206704, "grad_norm": 0.007779798936098814, "learning_rate": 0.0001, "loss": 11.9293, "step": 10 }, { "epoch": 0.19664804469273742, "grad_norm": 0.010903590358793736, "learning_rate": 9.988343845952697e-05, "loss": 11.9297, "step": 11 }, { "epoch": 0.21452513966480447, "grad_norm": 0.008899732492864132, "learning_rate": 9.953429730181653e-05, "loss": 11.929, "step": 12 }, { "epoch": 0.2324022346368715, "grad_norm": 0.008686481043696404, "learning_rate": 9.895420438411616e-05, "loss": 11.9327, "step": 13 }, { "epoch": 0.25027932960893856, "grad_norm": 0.008174674585461617, "learning_rate": 9.814586436738998e-05, "loss": 11.9286, "step": 14 }, { "epoch": 0.25027932960893856, "eval_loss": 11.928715705871582, "eval_runtime": 0.3873, "eval_samples_per_second": 245.313, "eval_steps_per_second": 123.947, "step": 14 }, { "epoch": 0.2681564245810056, "grad_norm": 0.008636604063212872, "learning_rate": 9.711304610594104e-05, "loss": 11.9307, "step": 15 }, { "epoch": 0.2860335195530726, "grad_norm": 0.009689634665846825, "learning_rate": 9.586056507527266e-05, "loss": 11.9291, "step": 16 }, { "epoch": 0.3039106145251397, "grad_norm": 0.007181845605373383, "learning_rate": 9.439426092011875e-05, "loss": 11.9316, "step": 17 }, { "epoch": 0.3217877094972067, "grad_norm": 0.00760689377784729, "learning_rate": 9.272097022732443e-05, "loss": 11.9297, "step": 18 }, { "epoch": 0.3396648044692737, "grad_norm": 0.007029213942587376, "learning_rate": 9.08484946505221e-05, "loss": 11.9308, "step": 19 }, { "epoch": 0.3575418994413408, "grad_norm": 0.009038039483129978, "learning_rate": 8.8785564535221e-05, "loss": 11.931, "step": 20 }, { "epoch": 0.3754189944134078, "grad_norm": 0.0075072660110890865, "learning_rate": 8.654179821390621e-05, "loss": 11.9313, "step": 21 }, { "epoch": 0.39329608938547483, "grad_norm": 0.00974891148507595, "learning_rate": 8.412765716093272e-05, "loss": 11.9313, "step": 22 }, { "epoch": 0.4111731843575419, "grad_norm": 0.0073928870260715485, "learning_rate": 8.155439721630264e-05, "loss": 11.9301, 
"step": 23 }, { "epoch": 0.42905027932960893, "grad_norm": 0.009019100107252598, "learning_rate": 7.883401610574336e-05, "loss": 11.9291, "step": 24 }, { "epoch": 0.44692737430167595, "grad_norm": 0.007918241433799267, "learning_rate": 7.597919750177168e-05, "loss": 11.9313, "step": 25 }, { "epoch": 0.464804469273743, "grad_norm": 0.009383410215377808, "learning_rate": 7.300325188655761e-05, "loss": 11.9298, "step": 26 }, { "epoch": 0.48268156424581005, "grad_norm": 0.008459771983325481, "learning_rate": 6.992005449231208e-05, "loss": 11.9309, "step": 27 }, { "epoch": 0.5005586592178771, "grad_norm": 0.008441867306828499, "learning_rate": 6.674398060854931e-05, "loss": 11.9304, "step": 28 }, { "epoch": 0.5005586592178771, "eval_loss": 11.928580284118652, "eval_runtime": 0.3886, "eval_samples_per_second": 244.488, "eval_steps_per_second": 123.531, "step": 28 }, { "epoch": 0.5184357541899441, "grad_norm": 0.010095364414155483, "learning_rate": 6.348983855785121e-05, "loss": 11.9277, "step": 29 }, { "epoch": 0.5363128491620112, "grad_norm": 0.007604570593684912, "learning_rate": 6.01728006526317e-05, "loss": 11.9298, "step": 30 }, { "epoch": 0.5541899441340782, "grad_norm": 0.0103254783898592, "learning_rate": 5.680833245481234e-05, "loss": 11.9283, "step": 31 }, { "epoch": 0.5720670391061452, "grad_norm": 0.00810755044221878, "learning_rate": 5.341212066823355e-05, "loss": 11.931, "step": 32 }, { "epoch": 0.5899441340782123, "grad_norm": 0.007562727201730013, "learning_rate": 5e-05, "loss": 11.9287, "step": 33 }, { "epoch": 0.6078212290502794, "grad_norm": 0.008853144943714142, "learning_rate": 4.658787933176646e-05, "loss": 11.9304, "step": 34 }, { "epoch": 0.6256983240223464, "grad_norm": 0.009539203718304634, "learning_rate": 4.319166754518768e-05, "loss": 11.93, "step": 35 }, { "epoch": 0.6435754189944134, "grad_norm": 0.009173383004963398, "learning_rate": 3.982719934736832e-05, "loss": 11.9296, "step": 36 }, { "epoch": 0.6614525139664804, "grad_norm": 0.008169720880687237, "learning_rate": 3.651016144214878e-05, "loss": 11.9302, "step": 37 }, { "epoch": 0.6793296089385474, "grad_norm": 0.008827430196106434, "learning_rate": 3.325601939145069e-05, "loss": 11.9295, "step": 38 }, { "epoch": 0.6972067039106146, "grad_norm": 0.010021938011050224, "learning_rate": 3.007994550768793e-05, "loss": 11.9299, "step": 39 }, { "epoch": 0.7150837988826816, "grad_norm": 0.010521038435399532, "learning_rate": 2.6996748113442394e-05, "loss": 11.9308, "step": 40 }, { "epoch": 0.7329608938547486, "grad_norm": 0.009070714004337788, "learning_rate": 2.4020802498228335e-05, "loss": 11.93, "step": 41 }, { "epoch": 0.7508379888268156, "grad_norm": 0.008820487186312675, "learning_rate": 2.1165983894256647e-05, "loss": 11.9279, "step": 42 }, { "epoch": 0.7508379888268156, "eval_loss": 11.928503036499023, "eval_runtime": 0.3804, "eval_samples_per_second": 249.752, "eval_steps_per_second": 126.19, "step": 42 }, { "epoch": 0.7687150837988826, "grad_norm": 0.010876229964196682, "learning_rate": 1.8445602783697374e-05, "loss": 11.9305, "step": 43 }, { "epoch": 0.7865921787709497, "grad_norm": 0.008084769360721111, "learning_rate": 1.5872342839067306e-05, "loss": 11.9299, "step": 44 }, { "epoch": 0.8044692737430168, "grad_norm": 0.009019813500344753, "learning_rate": 1.3458201786093794e-05, "loss": 11.9283, "step": 45 }, { "epoch": 0.8223463687150838, "grad_norm": 0.008098295889794827, "learning_rate": 1.1214435464779006e-05, "loss": 11.9292, "step": 46 }, { "epoch": 0.8402234636871508, "grad_norm": 
0.008133570663630962, "learning_rate": 9.151505349477902e-06, "loss": 11.9289, "step": 47 }, { "epoch": 0.8581005586592179, "grad_norm": 0.012650455348193645, "learning_rate": 7.2790297726755716e-06, "loss": 11.9302, "step": 48 }, { "epoch": 0.8759776536312849, "grad_norm": 0.009691119194030762, "learning_rate": 5.605739079881239e-06, "loss": 11.9307, "step": 49 }, { "epoch": 0.8938547486033519, "grad_norm": 0.009178046137094498, "learning_rate": 4.139434924727359e-06, "loss": 11.9297, "step": 50 }, { "epoch": 0.911731843575419, "grad_norm": 0.008271483704447746, "learning_rate": 2.88695389405898e-06, "loss": 11.9291, "step": 51 }, { "epoch": 0.929608938547486, "grad_norm": 0.00795311015099287, "learning_rate": 1.8541356326100433e-06, "loss": 11.9286, "step": 52 }, { "epoch": 0.9474860335195531, "grad_norm": 0.008445663377642632, "learning_rate": 1.0457956158838544e-06, "loss": 11.932, "step": 53 }, { "epoch": 0.9653631284916201, "grad_norm": 0.009014743380248547, "learning_rate": 4.6570269818346224e-07, "loss": 11.9283, "step": 54 }, { "epoch": 0.9832402234636871, "grad_norm": 0.009628918021917343, "learning_rate": 1.1656154047303691e-07, "loss": 11.9317, "step": 55 }, { "epoch": 1.0033519553072625, "grad_norm": 0.00985956471413374, "learning_rate": 0.0, "loss": 13.9592, "step": 56 }, { "epoch": 1.0033519553072625, "eval_loss": 11.928487777709961, "eval_runtime": 0.3738, "eval_samples_per_second": 254.136, "eval_steps_per_second": 128.405, "step": 56 } ], "logging_steps": 1, "max_steps": 56, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 14, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 26468155392.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }