{ "best_metric": 0.11026904731988907, "best_model_checkpoint": "outputs/checkpoint-270", "epoch": 4.920273348519363, "eval_steps": 500, "global_step": 270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.36446469248291574, "grad_norm": 3.687648057937622, "learning_rate": 1.8000000000000001e-06, "loss": 0.3728, "step": 20 }, { "epoch": 0.7289293849658315, "grad_norm": 1.6180248260498047, "learning_rate": 3.8000000000000005e-06, "loss": 0.2874, "step": 40 }, { "epoch": 0.9840546697038725, "eval_loss": 0.1868637651205063, "eval_runtime": 47.7396, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 54 }, { "epoch": 1.0933940774487472, "grad_norm": 0.6248186230659485, "learning_rate": 5.8e-06, "loss": 0.1808, "step": 60 }, { "epoch": 1.4578587699316627, "grad_norm": 0.4216831922531128, "learning_rate": 7.800000000000002e-06, "loss": 0.134, "step": 80 }, { "epoch": 1.8223234624145785, "grad_norm": 1.7952409982681274, "learning_rate": 9.800000000000001e-06, "loss": 0.1238, "step": 100 }, { "epoch": 1.9863325740318907, "eval_loss": 0.1356915980577469, "eval_runtime": 47.72, "eval_samples_per_second": 3.521, "eval_steps_per_second": 0.44, "step": 109 }, { "epoch": 2.1867881548974943, "grad_norm": 0.481364369392395, "learning_rate": 9.7259191408041e-06, "loss": 0.1034, "step": 120 }, { "epoch": 2.55125284738041, "grad_norm": 0.6242639422416687, "learning_rate": 8.816991413705515e-06, "loss": 0.0873, "step": 140 }, { "epoch": 2.9157175398633255, "grad_norm": 0.40104907751083374, "learning_rate": 7.392557845506433e-06, "loss": 0.0863, "step": 160 }, { "epoch": 2.988610478359909, "eval_loss": 0.11830399930477142, "eval_runtime": 47.7245, "eval_samples_per_second": 3.52, "eval_steps_per_second": 0.44, "step": 164 }, { "epoch": 3.2801822323462413, "grad_norm": 0.659656822681427, "learning_rate": 5.644996082651018e-06, "loss": 0.0846, "step": 180 }, { "epoch": 3.644646924829157, "grad_norm": 0.5718321800231934, "learning_rate": 3.8103240247869077e-06, "loss": 0.0774, "step": 200 }, { "epoch": 3.990888382687927, "eval_loss": 0.11092903465032578, "eval_runtime": 47.7405, "eval_samples_per_second": 3.519, "eval_steps_per_second": 0.44, "step": 219 }, { "epoch": 4.009111617312073, "grad_norm": 0.6463438272476196, "learning_rate": 2.136324299597474e-06, "loss": 0.075, "step": 220 }, { "epoch": 4.373576309794989, "grad_norm": 0.44310250878334045, "learning_rate": 8.490798459222477e-07, "loss": 0.0736, "step": 240 }, { "epoch": 4.738041002277904, "grad_norm": 0.5197251439094543, "learning_rate": 1.22440160097817e-07, "loss": 0.0693, "step": 260 }, { "epoch": 4.920273348519363, "eval_loss": 0.11026904731988907, "eval_runtime": 47.7998, "eval_samples_per_second": 3.515, "eval_steps_per_second": 0.439, "step": 270 } ], "logging_steps": 20, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.0107505880948736e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }