{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22753128555176336, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022753128555176336, "eval_loss": 0.7868214249610901, "eval_runtime": 22.7904, "eval_samples_per_second": 32.47, "eval_steps_per_second": 4.081, "step": 1 }, { "epoch": 0.006825938566552901, "grad_norm": 0.4706664979457855, "learning_rate": 1.5e-05, "loss": 0.7727, "step": 3 }, { "epoch": 0.013651877133105802, "grad_norm": 0.45865100622177124, "learning_rate": 3e-05, "loss": 0.7437, "step": 6 }, { "epoch": 0.020477815699658702, "grad_norm": 0.45330241322517395, "learning_rate": 4.5e-05, "loss": 0.757, "step": 9 }, { "epoch": 0.020477815699658702, "eval_loss": 0.7322847843170166, "eval_runtime": 22.75, "eval_samples_per_second": 32.527, "eval_steps_per_second": 4.088, "step": 9 }, { "epoch": 0.027303754266211604, "grad_norm": 0.3881157338619232, "learning_rate": 4.993910125649561e-05, "loss": 0.7386, "step": 12 }, { "epoch": 0.034129692832764506, "grad_norm": 0.38104698061943054, "learning_rate": 4.962019382530521e-05, "loss": 0.7012, "step": 15 }, { "epoch": 0.040955631399317405, "grad_norm": 0.44238075613975525, "learning_rate": 4.9031542398457974e-05, "loss": 0.684, "step": 18 }, { "epoch": 0.040955631399317405, "eval_loss": 0.6357378363609314, "eval_runtime": 22.7947, "eval_samples_per_second": 32.464, "eval_steps_per_second": 4.08, "step": 18 }, { "epoch": 0.04778156996587031, "grad_norm": 0.24668432772159576, "learning_rate": 4.817959636416969e-05, "loss": 0.591, "step": 21 }, { "epoch": 0.05460750853242321, "grad_norm": 0.3645310699939728, "learning_rate": 4.707368982147318e-05, "loss": 0.586, "step": 24 }, { "epoch": 0.06143344709897611, "grad_norm": 0.23351751267910004, "learning_rate": 4.572593931387604e-05, "loss": 0.6196, "step": 27 }, { "epoch": 0.06143344709897611, "eval_loss": 0.6290053129196167, "eval_runtime": 22.8178, "eval_samples_per_second": 32.431, "eval_steps_per_second": 4.076, "step": 27 }, { "epoch": 0.06825938566552901, "grad_norm": 0.27666857838630676, "learning_rate": 4.415111107797445e-05, "loss": 0.6233, "step": 30 }, { "epoch": 0.07508532423208192, "grad_norm": 0.18860295414924622, "learning_rate": 4.2366459261474933e-05, "loss": 0.5808, "step": 33 }, { "epoch": 0.08191126279863481, "grad_norm": 0.19348275661468506, "learning_rate": 4.039153688314145e-05, "loss": 0.6348, "step": 36 }, { "epoch": 0.08191126279863481, "eval_loss": 0.6226277351379395, "eval_runtime": 22.8843, "eval_samples_per_second": 32.337, "eval_steps_per_second": 4.064, "step": 36 }, { "epoch": 0.08873720136518772, "grad_norm": 0.19154131412506104, "learning_rate": 3.824798160583012e-05, "loss": 0.6262, "step": 39 }, { "epoch": 0.09556313993174062, "grad_norm": 0.20268936455249786, "learning_rate": 3.5959278669726935e-05, "loss": 0.6444, "step": 42 }, { "epoch": 0.10238907849829351, "grad_norm": 0.21229299902915955, "learning_rate": 3.355050358314172e-05, "loss": 0.6208, "step": 45 }, { "epoch": 0.10238907849829351, "eval_loss": 0.6191264986991882, "eval_runtime": 22.8984, "eval_samples_per_second": 32.317, "eval_steps_per_second": 4.061, "step": 45 }, { "epoch": 0.10921501706484642, "grad_norm": 0.25168129801750183, "learning_rate": 3.104804738999169e-05, "loss": 0.6224, "step": 48 }, { "epoch": 0.11604095563139932, "grad_norm": 0.15763932466506958, "learning_rate": 2.8479327524001636e-05, "loss": 0.6254, "step": 51 }, { "epoch": 0.12286689419795221, "grad_norm": 0.1863393485546112, "learning_rate": 2.587248741756253e-05, "loss": 0.5725, "step": 54 }, { "epoch": 0.12286689419795221, "eval_loss": 0.616894006729126, "eval_runtime": 22.8578, "eval_samples_per_second": 32.374, "eval_steps_per_second": 4.069, "step": 54 }, { "epoch": 0.1296928327645051, "grad_norm": 0.1760629266500473, "learning_rate": 2.3256088156396868e-05, "loss": 0.6303, "step": 57 }, { "epoch": 0.13651877133105803, "grad_norm": 0.19018211960792542, "learning_rate": 2.0658795558326743e-05, "loss": 0.6522, "step": 60 }, { "epoch": 0.14334470989761092, "grad_norm": 0.22535236179828644, "learning_rate": 1.8109066104575023e-05, "loss": 0.651, "step": 63 }, { "epoch": 0.14334470989761092, "eval_loss": 0.6154171228408813, "eval_runtime": 22.8611, "eval_samples_per_second": 32.369, "eval_steps_per_second": 4.068, "step": 63 }, { "epoch": 0.15017064846416384, "grad_norm": 0.1817297488451004, "learning_rate": 1.56348351646022e-05, "loss": 0.5489, "step": 66 }, { "epoch": 0.15699658703071673, "grad_norm": 0.1770661175251007, "learning_rate": 1.3263210930352737e-05, "loss": 0.6358, "step": 69 }, { "epoch": 0.16382252559726962, "grad_norm": 0.19404783844947815, "learning_rate": 1.1020177413231334e-05, "loss": 0.5854, "step": 72 }, { "epoch": 0.16382252559726962, "eval_loss": 0.6145646572113037, "eval_runtime": 22.8744, "eval_samples_per_second": 32.351, "eval_steps_per_second": 4.066, "step": 72 }, { "epoch": 0.17064846416382254, "grad_norm": 0.20881330966949463, "learning_rate": 8.930309757836517e-06, "loss": 0.6257, "step": 75 }, { "epoch": 0.17747440273037543, "grad_norm": 0.192912757396698, "learning_rate": 7.016504991533726e-06, "loss": 0.5348, "step": 78 }, { "epoch": 0.18430034129692832, "grad_norm": 0.19463366270065308, "learning_rate": 5.299731159831953e-06, "loss": 0.6411, "step": 81 }, { "epoch": 0.18430034129692832, "eval_loss": 0.6141290664672852, "eval_runtime": 22.9508, "eval_samples_per_second": 32.243, "eval_steps_per_second": 4.052, "step": 81 }, { "epoch": 0.19112627986348124, "grad_norm": 0.18446682393550873, "learning_rate": 3.798797596089351e-06, "loss": 0.573, "step": 84 }, { "epoch": 0.19795221843003413, "grad_norm": 0.1912892609834671, "learning_rate": 2.5301488425208296e-06, "loss": 0.5757, "step": 87 }, { "epoch": 0.20477815699658702, "grad_norm": 0.1818179190158844, "learning_rate": 1.5076844803522922e-06, "loss": 0.5692, "step": 90 }, { "epoch": 0.20477815699658702, "eval_loss": 0.6138909459114075, "eval_runtime": 22.9124, "eval_samples_per_second": 32.297, "eval_steps_per_second": 4.059, "step": 90 }, { "epoch": 0.21160409556313994, "grad_norm": 0.179690420627594, "learning_rate": 7.426068431000882e-07, "loss": 0.5427, "step": 93 }, { "epoch": 0.21843003412969283, "grad_norm": 0.19969414174556732, "learning_rate": 2.4329828146074095e-07, "loss": 0.6306, "step": 96 }, { "epoch": 0.22525597269624573, "grad_norm": 0.1736162155866623, "learning_rate": 1.522932452260595e-08, "loss": 0.5382, "step": 99 }, { "epoch": 0.22525597269624573, "eval_loss": 0.6138170957565308, "eval_runtime": 22.8743, "eval_samples_per_second": 32.351, "eval_steps_per_second": 4.066, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.030543701429453e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }