{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9814814814814814, "eval_steps": 14, "global_step": 54, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037037037037037035, "grad_norm": 13.693925857543945, "learning_rate": 2.0000000000000002e-07, "loss": 1.4366, "step": 1 }, { "epoch": 0.037037037037037035, "eval_loss": 1.4367035627365112, "eval_runtime": 19.7904, "eval_samples_per_second": 35.27, "eval_steps_per_second": 4.447, "step": 1 }, { "epoch": 0.07407407407407407, "grad_norm": 13.327592849731445, "learning_rate": 4.0000000000000003e-07, "loss": 1.4017, "step": 2 }, { "epoch": 0.1111111111111111, "grad_norm": 13.196478843688965, "learning_rate": 6.000000000000001e-07, "loss": 1.4017, "step": 3 }, { "epoch": 0.14814814814814814, "grad_norm": 13.321550369262695, "learning_rate": 8.000000000000001e-07, "loss": 1.4058, "step": 4 }, { "epoch": 0.18518518518518517, "grad_norm": 12.840996742248535, "learning_rate": 1.0000000000000002e-06, "loss": 1.4149, "step": 5 }, { "epoch": 0.2222222222222222, "grad_norm": 11.752096176147461, "learning_rate": 1.2000000000000002e-06, "loss": 1.3709, "step": 6 }, { "epoch": 0.25925925925925924, "grad_norm": 11.358497619628906, "learning_rate": 1.4000000000000001e-06, "loss": 1.3723, "step": 7 }, { "epoch": 0.2962962962962963, "grad_norm": 7.285672187805176, "learning_rate": 1.6000000000000001e-06, "loss": 1.3024, "step": 8 }, { "epoch": 0.3333333333333333, "grad_norm": 6.211453914642334, "learning_rate": 1.8000000000000001e-06, "loss": 1.3428, "step": 9 }, { "epoch": 0.37037037037037035, "grad_norm": 4.729733467102051, "learning_rate": 2.0000000000000003e-06, "loss": 1.3021, "step": 10 }, { "epoch": 0.4074074074074074, "grad_norm": 5.45022439956665, "learning_rate": 2.2e-06, "loss": 1.2561, "step": 11 }, { "epoch": 0.4444444444444444, "grad_norm": 6.256317138671875, "learning_rate": 2.4000000000000003e-06, "loss": 1.3131, "step": 12 }, { "epoch": 0.48148148148148145, "grad_norm": 5.992193698883057, "learning_rate": 2.6e-06, "loss": 1.2764, "step": 13 }, { "epoch": 0.5185185185185185, "grad_norm": 5.3906707763671875, "learning_rate": 2.8000000000000003e-06, "loss": 1.2883, "step": 14 }, { "epoch": 0.5185185185185185, "eval_loss": 1.2674976587295532, "eval_runtime": 18.8899, "eval_samples_per_second": 36.951, "eval_steps_per_second": 4.659, "step": 14 }, { "epoch": 0.5555555555555556, "grad_norm": 4.330776691436768, "learning_rate": 3e-06, "loss": 1.2553, "step": 15 }, { "epoch": 0.5925925925925926, "grad_norm": 3.870635509490967, "learning_rate": 3.2000000000000003e-06, "loss": 1.2092, "step": 16 }, { "epoch": 0.6296296296296297, "grad_norm": 3.076308012008667, "learning_rate": 3.4000000000000005e-06, "loss": 1.2735, "step": 17 }, { "epoch": 0.6666666666666666, "grad_norm": 2.6835415363311768, "learning_rate": 3.6000000000000003e-06, "loss": 1.2449, "step": 18 }, { "epoch": 0.7037037037037037, "grad_norm": 2.1219379901885986, "learning_rate": 3.8000000000000005e-06, "loss": 1.2051, "step": 19 }, { "epoch": 0.7407407407407407, "grad_norm": 1.8215879201889038, "learning_rate": 4.000000000000001e-06, "loss": 1.171, "step": 20 }, { "epoch": 0.7777777777777778, "grad_norm": 2.0634374618530273, "learning_rate": 4.2000000000000004e-06, "loss": 1.2243, "step": 21 }, { "epoch": 0.8148148148148148, "grad_norm": 1.9009621143341064, "learning_rate": 4.4e-06, "loss": 1.1914, "step": 22 }, { "epoch": 0.8518518518518519, "grad_norm": 1.8763676881790161, "learning_rate": 4.600000000000001e-06, "loss": 1.1752, "step": 23 }, { "epoch": 0.8888888888888888, "grad_norm": 1.8934900760650635, "learning_rate": 4.800000000000001e-06, "loss": 1.1793, "step": 24 }, { "epoch": 0.9259259259259259, "grad_norm": 1.7864941358566284, "learning_rate": 5e-06, "loss": 1.1839, "step": 25 }, { "epoch": 0.9629629629629629, "grad_norm": 1.810880184173584, "learning_rate": 5.2e-06, "loss": 1.1728, "step": 26 }, { "epoch": 1.0, "grad_norm": 1.7052356004714966, "learning_rate": 5.400000000000001e-06, "loss": 1.1623, "step": 27 }, { "epoch": 1.0185185185185186, "grad_norm": 1.6250964403152466, "learning_rate": 5.600000000000001e-06, "loss": 1.1254, "step": 28 }, { "epoch": 1.0185185185185186, "eval_loss": 1.160744071006775, "eval_runtime": 18.8648, "eval_samples_per_second": 37.0, "eval_steps_per_second": 4.665, "step": 28 }, { "epoch": 1.0555555555555556, "grad_norm": 1.8527966737747192, "learning_rate": 5.8e-06, "loss": 1.0638, "step": 29 }, { "epoch": 1.0925925925925926, "grad_norm": 1.7427172660827637, "learning_rate": 6e-06, "loss": 1.0382, "step": 30 }, { "epoch": 1.1296296296296295, "grad_norm": 1.7577691078186035, "learning_rate": 6.200000000000001e-06, "loss": 1.0523, "step": 31 }, { "epoch": 1.1666666666666667, "grad_norm": 1.928122639656067, "learning_rate": 6.4000000000000006e-06, "loss": 1.0711, "step": 32 }, { "epoch": 1.2037037037037037, "grad_norm": 1.7540444135665894, "learning_rate": 6.600000000000001e-06, "loss": 1.0296, "step": 33 }, { "epoch": 1.2407407407407407, "grad_norm": 1.704374074935913, "learning_rate": 6.800000000000001e-06, "loss": 1.0126, "step": 34 }, { "epoch": 1.2777777777777777, "grad_norm": 1.7199629545211792, "learning_rate": 7e-06, "loss": 1.0091, "step": 35 }, { "epoch": 1.3148148148148149, "grad_norm": 1.6979806423187256, "learning_rate": 7.2000000000000005e-06, "loss": 1.0189, "step": 36 }, { "epoch": 1.3518518518518519, "grad_norm": 1.7349421977996826, "learning_rate": 7.4e-06, "loss": 0.9761, "step": 37 }, { "epoch": 1.3888888888888888, "grad_norm": 1.5777740478515625, "learning_rate": 7.600000000000001e-06, "loss": 0.9847, "step": 38 }, { "epoch": 1.425925925925926, "grad_norm": 1.9043402671813965, "learning_rate": 7.800000000000002e-06, "loss": 0.9688, "step": 39 }, { "epoch": 1.462962962962963, "grad_norm": 1.5200198888778687, "learning_rate": 8.000000000000001e-06, "loss": 0.9511, "step": 40 }, { "epoch": 1.5, "grad_norm": 1.7094305753707886, "learning_rate": 8.2e-06, "loss": 0.9597, "step": 41 }, { "epoch": 1.5370370370370372, "grad_norm": 1.7840018272399902, "learning_rate": 8.400000000000001e-06, "loss": 0.9361, "step": 42 }, { "epoch": 1.5370370370370372, "eval_loss": 1.13677179813385, "eval_runtime": 18.8462, "eval_samples_per_second": 37.037, "eval_steps_per_second": 4.669, "step": 42 }, { "epoch": 1.574074074074074, "grad_norm": 1.6459747552871704, "learning_rate": 8.6e-06, "loss": 0.9506, "step": 43 }, { "epoch": 1.6111111111111112, "grad_norm": 1.922658085823059, "learning_rate": 8.8e-06, "loss": 0.9846, "step": 44 }, { "epoch": 1.6481481481481481, "grad_norm": 1.8302316665649414, "learning_rate": 9e-06, "loss": 0.9371, "step": 45 }, { "epoch": 1.6851851851851851, "grad_norm": 1.6393502950668335, "learning_rate": 9.200000000000002e-06, "loss": 0.8898, "step": 46 }, { "epoch": 1.7222222222222223, "grad_norm": 1.9181392192840576, "learning_rate": 9.4e-06, "loss": 0.9555, "step": 47 }, { "epoch": 1.7592592592592593, "grad_norm": 1.7563830614089966, "learning_rate": 9.600000000000001e-06, "loss": 0.943, "step": 48 }, { "epoch": 1.7962962962962963, "grad_norm": 1.8117369413375854, "learning_rate": 9.800000000000001e-06, "loss": 0.9278, "step": 49 }, { "epoch": 1.8333333333333335, "grad_norm": 1.6542695760726929, "learning_rate": 1e-05, "loss": 0.973, "step": 50 }, { "epoch": 1.8703703703703702, "grad_norm": 1.7787063121795654, "learning_rate": 8.535533905932739e-06, "loss": 0.951, "step": 51 }, { "epoch": 1.9074074074074074, "grad_norm": 1.6953744888305664, "learning_rate": 5e-06, "loss": 0.951, "step": 52 }, { "epoch": 1.9444444444444444, "grad_norm": 1.9485039710998535, "learning_rate": 1.4644660940672628e-06, "loss": 0.9492, "step": 53 }, { "epoch": 1.9814814814814814, "grad_norm": 1.531893253326416, "learning_rate": 0.0, "loss": 0.9236, "step": 54 } ], "logging_steps": 1, "max_steps": 54, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 14, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9665250622279516e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }