{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9814814814814814,
  "eval_steps": 14,
  "global_step": 54,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 13.693925857543945,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 1.4366,
      "step": 1
    },
    {
      "epoch": 0.037037037037037035,
      "eval_loss": 1.4367035627365112,
      "eval_runtime": 19.7904,
      "eval_samples_per_second": 35.27,
      "eval_steps_per_second": 4.447,
      "step": 1
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 13.327592849731445,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 1.4017,
      "step": 2
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 13.196478843688965,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.4017,
      "step": 3
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 13.321550369262695,
      "learning_rate": 8.000000000000001e-07,
      "loss": 1.4058,
      "step": 4
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 12.840996742248535,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.4149,
      "step": 5
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 11.752096176147461,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.3709,
      "step": 6
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 11.358497619628906,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.3723,
      "step": 7
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 7.285672187805176,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 1.3024,
      "step": 8
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 6.211453914642334,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.3428,
      "step": 9
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 4.729733467102051,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.3021,
      "step": 10
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 5.45022439956665,
      "learning_rate": 2.2e-06,
      "loss": 1.2561,
      "step": 11
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 6.256317138671875,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.3131,
      "step": 12
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 5.992193698883057,
      "learning_rate": 2.6e-06,
      "loss": 1.2764,
      "step": 13
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 5.3906707763671875,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.2883,
      "step": 14
    },
    {
      "epoch": 0.5185185185185185,
      "eval_loss": 1.2674976587295532,
      "eval_runtime": 18.8899,
      "eval_samples_per_second": 36.951,
      "eval_steps_per_second": 4.659,
      "step": 14
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 4.330776691436768,
      "learning_rate": 3e-06,
      "loss": 1.2553,
      "step": 15
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 3.870635509490967,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 1.2092,
      "step": 16
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 3.076308012008667,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 1.2735,
      "step": 17
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 2.6835415363311768,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 1.2449,
      "step": 18
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 2.1219379901885986,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 1.2051,
      "step": 19
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 1.8215879201889038,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.171,
      "step": 20
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 2.0634374618530273,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 1.2243,
      "step": 21
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 1.9009621143341064,
      "learning_rate": 4.4e-06,
      "loss": 1.1914,
      "step": 22
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 1.8763676881790161,
      "learning_rate": 4.600000000000001e-06,
      "loss": 1.1752,
      "step": 23
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.8934900760650635,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.1793,
      "step": 24
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 1.7864941358566284,
      "learning_rate": 5e-06,
      "loss": 1.1839,
      "step": 25
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 1.810880184173584,
      "learning_rate": 5.2e-06,
      "loss": 1.1728,
      "step": 26
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.7052356004714966,
      "learning_rate": 5.400000000000001e-06,
      "loss": 1.1623,
      "step": 27
    },
    {
      "epoch": 1.0185185185185186,
      "grad_norm": 1.6250964403152466,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.1254,
      "step": 28
    },
    {
      "epoch": 1.0185185185185186,
      "eval_loss": 1.160744071006775,
      "eval_runtime": 18.8648,
      "eval_samples_per_second": 37.0,
      "eval_steps_per_second": 4.665,
      "step": 28
    },
    {
      "epoch": 1.0555555555555556,
      "grad_norm": 1.8527966737747192,
      "learning_rate": 5.8e-06,
      "loss": 1.0638,
      "step": 29
    },
    {
      "epoch": 1.0925925925925926,
      "grad_norm": 1.7427172660827637,
      "learning_rate": 6e-06,
      "loss": 1.0382,
      "step": 30
    },
    {
      "epoch": 1.1296296296296295,
      "grad_norm": 1.7577691078186035,
      "learning_rate": 6.200000000000001e-06,
      "loss": 1.0523,
      "step": 31
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 1.928122639656067,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 1.0711,
      "step": 32
    },
    {
      "epoch": 1.2037037037037037,
      "grad_norm": 1.7540444135665894,
      "learning_rate": 6.600000000000001e-06,
      "loss": 1.0296,
      "step": 33
    },
    {
      "epoch": 1.2407407407407407,
      "grad_norm": 1.704374074935913,
      "learning_rate": 6.800000000000001e-06,
      "loss": 1.0126,
      "step": 34
    },
    {
      "epoch": 1.2777777777777777,
      "grad_norm": 1.7199629545211792,
      "learning_rate": 7e-06,
      "loss": 1.0091,
      "step": 35
    },
    {
      "epoch": 1.3148148148148149,
      "grad_norm": 1.6979806423187256,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 1.0189,
      "step": 36
    },
    {
      "epoch": 1.3518518518518519,
      "grad_norm": 1.7349421977996826,
      "learning_rate": 7.4e-06,
      "loss": 0.9761,
      "step": 37
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.5777740478515625,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.9847,
      "step": 38
    },
    {
      "epoch": 1.425925925925926,
      "grad_norm": 1.9043402671813965,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.9688,
      "step": 39
    },
    {
      "epoch": 1.462962962962963,
      "grad_norm": 1.5200198888778687,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.9511,
      "step": 40
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.7094305753707886,
      "learning_rate": 8.2e-06,
      "loss": 0.9597,
      "step": 41
    },
    {
      "epoch": 1.5370370370370372,
      "grad_norm": 1.7840018272399902,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.9361,
      "step": 42
    },
    {
      "epoch": 1.5370370370370372,
      "eval_loss": 1.13677179813385,
      "eval_runtime": 18.8462,
      "eval_samples_per_second": 37.037,
      "eval_steps_per_second": 4.669,
      "step": 42
    },
    {
      "epoch": 1.574074074074074,
      "grad_norm": 1.6459747552871704,
      "learning_rate": 8.6e-06,
      "loss": 0.9506,
      "step": 43
    },
    {
      "epoch": 1.6111111111111112,
      "grad_norm": 1.922658085823059,
      "learning_rate": 8.8e-06,
      "loss": 0.9846,
      "step": 44
    },
    {
      "epoch": 1.6481481481481481,
      "grad_norm": 1.8302316665649414,
      "learning_rate": 9e-06,
      "loss": 0.9371,
      "step": 45
    },
    {
      "epoch": 1.6851851851851851,
      "grad_norm": 1.6393502950668335,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.8898,
      "step": 46
    },
    {
      "epoch": 1.7222222222222223,
      "grad_norm": 1.9181392192840576,
      "learning_rate": 9.4e-06,
      "loss": 0.9555,
      "step": 47
    },
    {
      "epoch": 1.7592592592592593,
      "grad_norm": 1.7563830614089966,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.943,
      "step": 48
    },
    {
      "epoch": 1.7962962962962963,
      "grad_norm": 1.8117369413375854,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.9278,
      "step": 49
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 1.6542695760726929,
      "learning_rate": 1e-05,
      "loss": 0.973,
      "step": 50
    },
    {
      "epoch": 1.8703703703703702,
      "grad_norm": 1.7787063121795654,
      "learning_rate": 8.535533905932739e-06,
      "loss": 0.951,
      "step": 51
    },
    {
      "epoch": 1.9074074074074074,
      "grad_norm": 1.6953744888305664,
      "learning_rate": 5e-06,
      "loss": 0.951,
      "step": 52
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.9485039710998535,
      "learning_rate": 1.4644660940672628e-06,
      "loss": 0.9492,
      "step": 53
    },
    {
      "epoch": 1.9814814814814814,
      "grad_norm": 1.531893253326416,
      "learning_rate": 0.0,
      "loss": 0.9236,
      "step": 54
    }
  ],
  "logging_steps": 1,
  "max_steps": 54,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 14,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9665250622279516e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}