{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 44343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.76544212164265e-05, "grad_norm": 6.0, "learning_rate": 0.0002999932345578783, "loss": 4.375, "step": 1 }, { "epoch": 0.20296326364927947, "grad_norm": 1.703125, "learning_rate": 0.000279703673635072, "loss": 2.7717, "step": 3000 }, { "epoch": 0.20296326364927947, "eval_loss": 2.3463752269744873, "eval_runtime": 61.2837, "eval_samples_per_second": 1540.458, "eval_steps_per_second": 6.021, "step": 3000 }, { "epoch": 0.40592652729855894, "grad_norm": 0.94921875, "learning_rate": 0.0002594073472701441, "loss": 2.6786, "step": 6000 }, { "epoch": 0.40592652729855894, "eval_loss": 2.3242292404174805, "eval_runtime": 61.2847, "eval_samples_per_second": 1540.433, "eval_steps_per_second": 6.021, "step": 6000 }, { "epoch": 0.6088897909478385, "grad_norm": 0.91015625, "learning_rate": 0.0002391110209052161, "loss": 2.6556, "step": 9000 }, { "epoch": 0.6088897909478385, "eval_loss": 2.3117165565490723, "eval_runtime": 61.1596, "eval_samples_per_second": 1543.584, "eval_steps_per_second": 6.033, "step": 9000 }, { "epoch": 0.8118530545971179, "grad_norm": 0.88671875, "learning_rate": 0.0002188146945402882, "loss": 2.6474, "step": 12000 }, { "epoch": 0.8118530545971179, "eval_loss": 2.30464506149292, "eval_runtime": 61.272, "eval_samples_per_second": 1540.754, "eval_steps_per_second": 6.022, "step": 12000 }, { "epoch": 1.0148163182463974, "grad_norm": 1.21875, "learning_rate": 0.00019851836817536025, "loss": 2.6412, "step": 15000 }, { "epoch": 1.0148163182463974, "eval_loss": 2.302506685256958, "eval_runtime": 61.3123, "eval_samples_per_second": 1539.74, "eval_steps_per_second": 6.018, "step": 15000 }, { "epoch": 1.217779581895677, "grad_norm": 0.83984375, "learning_rate": 0.0001782220418104323, "loss": 2.6352, "step": 18000 }, { "epoch": 1.217779581895677, "eval_loss": 2.299394369125366, "eval_runtime": 61.2885, "eval_samples_per_second": 1540.338, "eval_steps_per_second": 6.021, "step": 18000 }, { "epoch": 1.4207428455449564, "grad_norm": 0.94140625, "learning_rate": 0.00015792571544550436, "loss": 2.6334, "step": 21000 }, { "epoch": 1.4207428455449564, "eval_loss": 2.2973618507385254, "eval_runtime": 61.2764, "eval_samples_per_second": 1540.642, "eval_steps_per_second": 6.022, "step": 21000 }, { "epoch": 1.6237061091942357, "grad_norm": 0.8515625, "learning_rate": 0.0001376293890805764, "loss": 2.6313, "step": 24000 }, { "epoch": 1.6237061091942357, "eval_loss": 2.2973830699920654, "eval_runtime": 61.1563, "eval_samples_per_second": 1543.668, "eval_steps_per_second": 6.034, "step": 24000 }, { "epoch": 1.8266693728435153, "grad_norm": 0.94140625, "learning_rate": 0.00011733306271564845, "loss": 2.629, "step": 27000 }, { "epoch": 1.8266693728435153, "eval_loss": 2.297150135040283, "eval_runtime": 61.2918, "eval_samples_per_second": 1540.254, "eval_steps_per_second": 6.02, "step": 27000 }, { "epoch": 2.029632636492795, "grad_norm": 0.8671875, "learning_rate": 9.703673635072052e-05, "loss": 2.6292, "step": 30000 }, { "epoch": 2.029632636492795, "eval_loss": 2.2963457107543945, "eval_runtime": 61.4261, "eval_samples_per_second": 1536.887, "eval_steps_per_second": 6.007, "step": 30000 }, { "epoch": 2.232595900142074, "grad_norm": 0.82421875, "learning_rate": 7.674040998579256e-05, "loss": 2.6262, "step": 33000 }, { "epoch": 2.232595900142074, "eval_loss": 2.2962822914123535, "eval_runtime": 61.2609, "eval_samples_per_second": 1541.031, "eval_steps_per_second": 6.023, "step": 33000 }, { "epoch": 2.435559163791354, "grad_norm": 0.9921875, "learning_rate": 5.644408362086462e-05, "loss": 2.6296, "step": 36000 }, { "epoch": 2.435559163791354, "eval_loss": 2.2963669300079346, "eval_runtime": 61.2798, "eval_samples_per_second": 1540.557, "eval_steps_per_second": 6.022, "step": 36000 }, { "epoch": 2.638522427440633, "grad_norm": 0.94140625, "learning_rate": 3.614775725593667e-05, "loss": 2.63, "step": 39000 }, { "epoch": 2.638522427440633, "eval_loss": 2.2954142093658447, "eval_runtime": 61.3072, "eval_samples_per_second": 1539.869, "eval_steps_per_second": 6.019, "step": 39000 }, { "epoch": 2.841485691089913, "grad_norm": 0.88671875, "learning_rate": 1.5851430891008727e-05, "loss": 2.6285, "step": 42000 }, { "epoch": 2.841485691089913, "eval_loss": 2.295731782913208, "eval_runtime": 61.1502, "eval_samples_per_second": 1543.823, "eval_steps_per_second": 6.034, "step": 42000 }, { "epoch": 3.0, "step": 44343, "total_flos": 4.6159075948098355e+17, "train_loss": 2.646649062422479, "train_runtime": 22762.4415, "train_samples_per_second": 498.693, "train_steps_per_second": 1.948 } ], "logging_steps": 3000, "max_steps": 44343, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.6159075948098355e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }