{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0991348744858884, "eval_steps": 3526, "global_step": 15500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03545596369309318, "grad_norm": 2.768805980682373, "learning_rate": 0.0001964544036306907, "loss": 3.5372, "step": 500 }, { "epoch": 0.07091192738618636, "grad_norm": 2.2298574447631836, "learning_rate": 0.00019290880726138138, "loss": 2.9277, "step": 1000 }, { "epoch": 0.10636789107927953, "grad_norm": 1.8124895095825195, "learning_rate": 0.00018936321089207204, "loss": 2.8216, "step": 1500 }, { "epoch": 0.14182385477237272, "grad_norm": 1.6588631868362427, "learning_rate": 0.00018581761452276274, "loss": 2.7462, "step": 2000 }, { "epoch": 0.1772798184654659, "grad_norm": 1.7708145380020142, "learning_rate": 0.00018227201815345344, "loss": 2.6757, "step": 2500 }, { "epoch": 0.21273578215855907, "grad_norm": 1.6058692932128906, "learning_rate": 0.0001787264217841441, "loss": 2.6299, "step": 3000 }, { "epoch": 0.24819174585165224, "grad_norm": 2.6353378295898438, "learning_rate": 0.00017518082541483478, "loss": 2.57, "step": 3500 }, { "epoch": 0.2500354559636931, "eval_loss": 2.3737287521362305, "eval_runtime": 342.4427, "eval_samples_per_second": 10.297, "eval_steps_per_second": 10.297, "step": 3526 }, { "epoch": 0.28364770954474544, "grad_norm": 2.034212827682495, "learning_rate": 0.00017163522904552547, "loss": 2.5853, "step": 4000 }, { "epoch": 0.3191036732378386, "grad_norm": 3.164950370788574, "learning_rate": 0.00016808963267621617, "loss": 2.4973, "step": 4500 }, { "epoch": 0.3545596369309318, "grad_norm": 1.992343783378601, "learning_rate": 0.0001645440363069068, "loss": 2.5324, "step": 5000 }, { "epoch": 0.39001560062402496, "grad_norm": 2.540403366088867, "learning_rate": 0.0001609984399375975, "loss": 2.4857, "step": 5500 }, { "epoch": 0.42547156431711813, "grad_norm": 5.178898334503174, "learning_rate": 0.0001574528435682882, "loss": 2.47, "step": 6000 }, { "epoch": 0.4609275280102113, "grad_norm": 2.17370867729187, "learning_rate": 0.00015390724719897887, "loss": 2.4353, "step": 6500 }, { "epoch": 0.4963834917033045, "grad_norm": 2.1408374309539795, "learning_rate": 0.00015036165082966954, "loss": 2.4575, "step": 7000 }, { "epoch": 0.5000709119273862, "eval_loss": 2.2161264419555664, "eval_runtime": 347.4584, "eval_samples_per_second": 10.148, "eval_steps_per_second": 10.148, "step": 7052 }, { "epoch": 0.5318394553963977, "grad_norm": 2.286954879760742, "learning_rate": 0.00014681605446036024, "loss": 2.4082, "step": 7500 }, { "epoch": 0.5672954190894909, "grad_norm": 2.3392558097839355, "learning_rate": 0.00014327045809105093, "loss": 2.415, "step": 8000 }, { "epoch": 0.602751382782584, "grad_norm": 2.3896689414978027, "learning_rate": 0.0001397248617217416, "loss": 2.3953, "step": 8500 }, { "epoch": 0.6382073464756772, "grad_norm": 2.069913864135742, "learning_rate": 0.00013617926535243227, "loss": 2.3731, "step": 9000 }, { "epoch": 0.6736633101687703, "grad_norm": 3.516718864440918, "learning_rate": 0.00013263366898312297, "loss": 2.332, "step": 9500 }, { "epoch": 0.7091192738618636, "grad_norm": 2.8783183097839355, "learning_rate": 0.00012908807261381367, "loss": 2.2976, "step": 10000 }, { "epoch": 0.7445752375549567, "grad_norm": 3.2221286296844482, "learning_rate": 0.00012554247624450433, "loss": 2.2903, "step": 10500 }, { "epoch": 0.7501063678910793, "eval_loss": 2.133788824081421, "eval_runtime": 340.207, "eval_samples_per_second": 10.364, "eval_steps_per_second": 10.364, "step": 10578 }, { "epoch": 0.7800312012480499, "grad_norm": 2.0654242038726807, "learning_rate": 0.00012199687987519502, "loss": 2.3177, "step": 11000 }, { "epoch": 0.8154871649411432, "grad_norm": 2.5238709449768066, "learning_rate": 0.0001184512835058857, "loss": 2.2966, "step": 11500 }, { "epoch": 0.8509431286342363, "grad_norm": 2.6544277667999268, "learning_rate": 0.00011490568713657637, "loss": 2.2993, "step": 12000 }, { "epoch": 0.8863990923273295, "grad_norm": 2.4742555618286133, "learning_rate": 0.00011136009076726705, "loss": 2.2583, "step": 12500 }, { "epoch": 0.9218550560204226, "grad_norm": 1.410945177078247, "learning_rate": 0.00010781449439795775, "loss": 2.2652, "step": 13000 }, { "epoch": 0.9573110197135158, "grad_norm": 2.3635599613189697, "learning_rate": 0.00010426889802864843, "loss": 2.3116, "step": 13500 }, { "epoch": 0.992766983406609, "grad_norm": 1.9908702373504639, "learning_rate": 0.0001007233016593391, "loss": 2.2715, "step": 14000 }, { "epoch": 1.0001418238547723, "eval_loss": 2.0845015048980713, "eval_runtime": 347.9194, "eval_samples_per_second": 10.135, "eval_steps_per_second": 10.135, "step": 14104 }, { "epoch": 1.0282229470997022, "grad_norm": 2.456178903579712, "learning_rate": 9.717770529002978e-05, "loss": 2.2921, "step": 14500 }, { "epoch": 1.0636789107927953, "grad_norm": 3.4969823360443115, "learning_rate": 9.363210892072048e-05, "loss": 2.216, "step": 15000 }, { "epoch": 1.0991348744858884, "grad_norm": 1.8081468343734741, "learning_rate": 9.008651255141115e-05, "loss": 2.242, "step": 15500 } ], "logging_steps": 500, "max_steps": 28204, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.7341146652672e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }