{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.6, "eval_steps": 1, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 10.0, "learning_rate": 2.5e-05, "loss": 0.7248, "step": 1 }, { "epoch": 0.16, "eval_accuracy": 0.504, "eval_loss": 0.7169140577316284, "eval_runtime": 1.7694, "eval_samples_per_second": 141.288, "eval_steps_per_second": 3.956, "step": 1 }, { "epoch": 0.32, "grad_norm": 11.3125, "learning_rate": 5e-05, "loss": 0.7141, "step": 2 }, { "epoch": 0.32, "eval_accuracy": 0.512, "eval_loss": 0.713210940361023, "eval_runtime": 1.908, "eval_samples_per_second": 131.027, "eval_steps_per_second": 3.669, "step": 2 }, { "epoch": 0.48, "grad_norm": 8.5, "learning_rate": 4.913793103448276e-05, "loss": 0.72, "step": 3 }, { "epoch": 0.48, "eval_accuracy": 0.516, "eval_loss": 0.7183730602264404, "eval_runtime": 1.9102, "eval_samples_per_second": 130.879, "eval_steps_per_second": 3.665, "step": 3 }, { "epoch": 0.64, "grad_norm": 9.1875, "learning_rate": 4.827586206896552e-05, "loss": 0.7007, "step": 4 }, { "epoch": 0.64, "eval_accuracy": 0.516, "eval_loss": 0.7356113195419312, "eval_runtime": 1.908, "eval_samples_per_second": 131.029, "eval_steps_per_second": 3.669, "step": 4 }, { "epoch": 0.8, "grad_norm": 9.4375, "learning_rate": 4.741379310344828e-05, "loss": 0.8173, "step": 5 }, { "epoch": 0.8, "eval_accuracy": 0.516, "eval_loss": 0.7247294783592224, "eval_runtime": 1.9079, "eval_samples_per_second": 131.036, "eval_steps_per_second": 3.669, "step": 5 }, { "epoch": 0.96, "grad_norm": 10.6875, "learning_rate": 4.655172413793104e-05, "loss": 0.716, "step": 6 }, { "epoch": 0.96, "eval_accuracy": 0.52, "eval_loss": 0.7145995497703552, "eval_runtime": 1.906, "eval_samples_per_second": 131.162, "eval_steps_per_second": 3.673, "step": 6 }, { "epoch": 1.12, "grad_norm": 3.15625, "learning_rate": 4.5689655172413794e-05, "loss": 0.7381, "step": 7 }, { "epoch": 1.12, "eval_accuracy": 0.52, "eval_loss": 0.7026171684265137, "eval_runtime": 1.9082, "eval_samples_per_second": 131.015, "eval_steps_per_second": 3.668, "step": 7 }, { "epoch": 1.28, "grad_norm": 3.0625, "learning_rate": 4.482758620689655e-05, "loss": 0.6973, "step": 8 }, { "epoch": 1.28, "eval_accuracy": 0.52, "eval_loss": 0.6966249942779541, "eval_runtime": 1.9071, "eval_samples_per_second": 131.086, "eval_steps_per_second": 3.67, "step": 8 }, { "epoch": 1.44, "grad_norm": 1.2734375, "learning_rate": 4.396551724137931e-05, "loss": 0.6871, "step": 9 }, { "epoch": 1.44, "eval_accuracy": 0.516, "eval_loss": 0.6961289048194885, "eval_runtime": 1.9103, "eval_samples_per_second": 130.872, "eval_steps_per_second": 3.664, "step": 9 }, { "epoch": 1.6, "grad_norm": 4.21875, "learning_rate": 4.3103448275862066e-05, "loss": 0.6943, "step": 10 }, { "epoch": 1.6, "eval_accuracy": 0.508, "eval_loss": 0.6979374885559082, "eval_runtime": 1.9058, "eval_samples_per_second": 131.178, "eval_steps_per_second": 3.673, "step": 10 }, { "epoch": 1.76, "grad_norm": 1.8671875, "learning_rate": 4.224137931034483e-05, "loss": 0.7017, "step": 11 }, { "epoch": 1.76, "eval_accuracy": 0.508, "eval_loss": 0.6969297528266907, "eval_runtime": 1.9056, "eval_samples_per_second": 131.193, "eval_steps_per_second": 3.673, "step": 11 }, { "epoch": 1.92, "grad_norm": 6.28125, "learning_rate": 4.1379310344827587e-05, "loss": 0.6898, "step": 12 }, { "epoch": 1.92, "eval_accuracy": 0.512, "eval_loss": 0.6961719393730164, "eval_runtime": 1.9059, "eval_samples_per_second": 131.172, "eval_steps_per_second": 3.673, "step": 12 }, { "epoch": 2.08, "grad_norm": 2.75, "learning_rate": 4.0517241379310344e-05, "loss": 0.6945, "step": 13 }, { "epoch": 2.08, "eval_accuracy": 0.508, "eval_loss": 0.6973124742507935, "eval_runtime": 1.9053, "eval_samples_per_second": 131.212, "eval_steps_per_second": 3.674, "step": 13 }, { "epoch": 2.24, "grad_norm": 1.4765625, "learning_rate": 3.965517241379311e-05, "loss": 0.713, "step": 14 }, { "epoch": 2.24, "eval_accuracy": 0.512, "eval_loss": 0.6960234642028809, "eval_runtime": 1.9114, "eval_samples_per_second": 130.792, "eval_steps_per_second": 3.662, "step": 14 }, { "epoch": 2.4, "grad_norm": 1.1171875, "learning_rate": 3.8793103448275865e-05, "loss": 0.682, "step": 15 }, { "epoch": 2.4, "eval_accuracy": 0.52, "eval_loss": 0.6956601738929749, "eval_runtime": 1.9077, "eval_samples_per_second": 131.049, "eval_steps_per_second": 3.669, "step": 15 }, { "epoch": 2.56, "grad_norm": 1.2734375, "learning_rate": 3.793103448275862e-05, "loss": 0.68, "step": 16 }, { "epoch": 2.56, "eval_accuracy": 0.52, "eval_loss": 0.6956210732460022, "eval_runtime": 1.9087, "eval_samples_per_second": 130.982, "eval_steps_per_second": 3.668, "step": 16 }, { "epoch": 2.7199999999999998, "grad_norm": 4.5625, "learning_rate": 3.7068965517241385e-05, "loss": 0.6793, "step": 17 }, { "epoch": 2.7199999999999998, "eval_accuracy": 0.52, "eval_loss": 0.6956055164337158, "eval_runtime": 1.9077, "eval_samples_per_second": 131.05, "eval_steps_per_second": 3.669, "step": 17 }, { "epoch": 2.88, "grad_norm": 2.15625, "learning_rate": 3.620689655172414e-05, "loss": 0.6879, "step": 18 }, { "epoch": 2.88, "eval_accuracy": 0.52, "eval_loss": 0.6959649324417114, "eval_runtime": 1.9058, "eval_samples_per_second": 131.178, "eval_steps_per_second": 3.673, "step": 18 }, { "epoch": 3.04, "grad_norm": 2.796875, "learning_rate": 3.53448275862069e-05, "loss": 0.6963, "step": 19 }, { "epoch": 3.04, "eval_accuracy": 0.52, "eval_loss": 0.697027325630188, "eval_runtime": 1.8566, "eval_samples_per_second": 134.654, "eval_steps_per_second": 3.77, "step": 19 }, { "epoch": 3.2, "grad_norm": 5.3125, "learning_rate": 3.4482758620689657e-05, "loss": 0.6925, "step": 20 }, { "epoch": 3.2, "eval_accuracy": 0.52, "eval_loss": 0.6985077857971191, "eval_runtime": 1.9069, "eval_samples_per_second": 131.1, "eval_steps_per_second": 3.671, "step": 20 }, { "epoch": 3.36, "grad_norm": 2.984375, "learning_rate": 3.3620689655172414e-05, "loss": 0.6911, "step": 21 }, { "epoch": 3.36, "eval_accuracy": 0.52, "eval_loss": 0.699636697769165, "eval_runtime": 1.859, "eval_samples_per_second": 134.479, "eval_steps_per_second": 3.765, "step": 21 }, { "epoch": 3.52, "grad_norm": 3.59375, "learning_rate": 3.275862068965517e-05, "loss": 0.6882, "step": 22 }, { "epoch": 3.52, "eval_accuracy": 0.52, "eval_loss": 0.7012538909912109, "eval_runtime": 1.9094, "eval_samples_per_second": 130.928, "eval_steps_per_second": 3.666, "step": 22 }, { "epoch": 3.68, "grad_norm": 3.875, "learning_rate": 3.1896551724137935e-05, "loss": 0.7016, "step": 23 }, { "epoch": 3.68, "eval_accuracy": 0.52, "eval_loss": 0.7007499933242798, "eval_runtime": 1.9071, "eval_samples_per_second": 131.091, "eval_steps_per_second": 3.671, "step": 23 }, { "epoch": 3.84, "grad_norm": 5.46875, "learning_rate": 3.103448275862069e-05, "loss": 0.6972, "step": 24 }, { "epoch": 3.84, "eval_accuracy": 0.52, "eval_loss": 0.7000390887260437, "eval_runtime": 1.9058, "eval_samples_per_second": 131.18, "eval_steps_per_second": 3.673, "step": 24 }, { "epoch": 4.0, "grad_norm": 3.125, "learning_rate": 3.017241379310345e-05, "loss": 0.7288, "step": 25 }, { "epoch": 4.0, "eval_accuracy": 0.52, "eval_loss": 0.697628915309906, "eval_runtime": 1.9065, "eval_samples_per_second": 131.13, "eval_steps_per_second": 3.672, "step": 25 }, { "epoch": 4.16, "grad_norm": 1.9765625, "learning_rate": 2.9310344827586206e-05, "loss": 0.7239, "step": 26 }, { "epoch": 4.16, "eval_accuracy": 0.52, "eval_loss": 0.6958945393562317, "eval_runtime": 1.9088, "eval_samples_per_second": 130.974, "eval_steps_per_second": 3.667, "step": 26 }, { "epoch": 4.32, "grad_norm": 7.125, "learning_rate": 2.844827586206897e-05, "loss": 0.6701, "step": 27 }, { "epoch": 4.32, "eval_accuracy": 0.52, "eval_loss": 0.6942812204360962, "eval_runtime": 1.9094, "eval_samples_per_second": 130.934, "eval_steps_per_second": 3.666, "step": 27 }, { "epoch": 4.48, "grad_norm": 1.484375, "learning_rate": 2.7586206896551727e-05, "loss": 0.7093, "step": 28 }, { "epoch": 4.48, "eval_accuracy": 0.52, "eval_loss": 0.6918594241142273, "eval_runtime": 1.9103, "eval_samples_per_second": 130.869, "eval_steps_per_second": 3.664, "step": 28 }, { "epoch": 4.64, "grad_norm": 5.0, "learning_rate": 2.672413793103448e-05, "loss": 0.6803, "step": 29 }, { "epoch": 4.64, "eval_accuracy": 0.52, "eval_loss": 0.6925703287124634, "eval_runtime": 1.9097, "eval_samples_per_second": 130.908, "eval_steps_per_second": 3.665, "step": 29 }, { "epoch": 4.8, "grad_norm": 0.91015625, "learning_rate": 2.5862068965517244e-05, "loss": 0.6953, "step": 30 }, { "epoch": 4.8, "eval_accuracy": 0.516, "eval_loss": 0.6929374933242798, "eval_runtime": 1.9088, "eval_samples_per_second": 130.971, "eval_steps_per_second": 3.667, "step": 30 }, { "epoch": 4.96, "grad_norm": 4.1875, "learning_rate": 2.5e-05, "loss": 0.6946, "step": 31 }, { "epoch": 4.96, "eval_accuracy": 0.508, "eval_loss": 0.6934375166893005, "eval_runtime": 1.908, "eval_samples_per_second": 131.026, "eval_steps_per_second": 3.669, "step": 31 }, { "epoch": 5.12, "grad_norm": 1.40625, "learning_rate": 2.413793103448276e-05, "loss": 0.7016, "step": 32 }, { "epoch": 5.12, "eval_accuracy": 0.504, "eval_loss": 0.6947265863418579, "eval_runtime": 1.9084, "eval_samples_per_second": 131.001, "eval_steps_per_second": 3.668, "step": 32 }, { "epoch": 5.28, "grad_norm": 3.03125, "learning_rate": 2.327586206896552e-05, "loss": 0.6953, "step": 33 }, { "epoch": 5.28, "eval_accuracy": 0.504, "eval_loss": 0.6949218511581421, "eval_runtime": 1.9078, "eval_samples_per_second": 131.039, "eval_steps_per_second": 3.669, "step": 33 }, { "epoch": 5.44, "grad_norm": 1.4609375, "learning_rate": 2.2413793103448276e-05, "loss": 0.6936, "step": 34 }, { "epoch": 5.44, "eval_accuracy": 0.504, "eval_loss": 0.6933984160423279, "eval_runtime": 1.9062, "eval_samples_per_second": 131.154, "eval_steps_per_second": 3.672, "step": 34 }, { "epoch": 5.6, "grad_norm": 4.21875, "learning_rate": 2.1551724137931033e-05, "loss": 0.6759, "step": 35 }, { "epoch": 5.6, "eval_accuracy": 0.512, "eval_loss": 0.6927656531333923, "eval_runtime": 1.9079, "eval_samples_per_second": 131.032, "eval_steps_per_second": 3.669, "step": 35 }, { "epoch": 5.76, "grad_norm": 3.890625, "learning_rate": 2.0689655172413793e-05, "loss": 0.6911, "step": 36 }, { "epoch": 5.76, "eval_accuracy": 0.516, "eval_loss": 0.691476583480835, "eval_runtime": 1.9084, "eval_samples_per_second": 131.001, "eval_steps_per_second": 3.668, "step": 36 }, { "epoch": 5.92, "grad_norm": 3.921875, "learning_rate": 1.9827586206896554e-05, "loss": 0.7045, "step": 37 }, { "epoch": 5.92, "eval_accuracy": 0.516, "eval_loss": 0.6917109489440918, "eval_runtime": 1.9104, "eval_samples_per_second": 130.863, "eval_steps_per_second": 3.664, "step": 37 }, { "epoch": 6.08, "grad_norm": 6.53125, "learning_rate": 1.896551724137931e-05, "loss": 0.6951, "step": 38 }, { "epoch": 6.08, "eval_accuracy": 0.516, "eval_loss": 0.6929374933242798, "eval_runtime": 1.9058, "eval_samples_per_second": 131.178, "eval_steps_per_second": 3.673, "step": 38 }, { "epoch": 6.24, "grad_norm": 3.671875, "learning_rate": 1.810344827586207e-05, "loss": 0.6766, "step": 39 }, { "epoch": 6.24, "eval_accuracy": 0.52, "eval_loss": 0.6927499771118164, "eval_runtime": 1.908, "eval_samples_per_second": 131.03, "eval_steps_per_second": 3.669, "step": 39 }, { "epoch": 6.4, "grad_norm": 2.421875, "learning_rate": 1.7241379310344828e-05, "loss": 0.6964, "step": 40 }, { "epoch": 6.4, "eval_accuracy": 0.52, "eval_loss": 0.693472683429718, "eval_runtime": 1.9083, "eval_samples_per_second": 131.006, "eval_steps_per_second": 3.668, "step": 40 }, { "epoch": 6.5600000000000005, "grad_norm": 0.9765625, "learning_rate": 1.6379310344827585e-05, "loss": 0.6899, "step": 41 }, { "epoch": 6.5600000000000005, "eval_accuracy": 0.52, "eval_loss": 0.69287109375, "eval_runtime": 1.9071, "eval_samples_per_second": 131.089, "eval_steps_per_second": 3.67, "step": 41 }, { "epoch": 6.72, "grad_norm": 3.21875, "learning_rate": 1.5517241379310346e-05, "loss": 0.6783, "step": 42 }, { "epoch": 6.72, "eval_accuracy": 0.52, "eval_loss": 0.6933398246765137, "eval_runtime": 1.9073, "eval_samples_per_second": 131.076, "eval_steps_per_second": 3.67, "step": 42 }, { "epoch": 6.88, "grad_norm": 0.87890625, "learning_rate": 1.4655172413793103e-05, "loss": 0.6921, "step": 43 }, { "epoch": 6.88, "eval_accuracy": 0.52, "eval_loss": 0.6933085918426514, "eval_runtime": 1.9103, "eval_samples_per_second": 130.87, "eval_steps_per_second": 3.664, "step": 43 }, { "epoch": 7.04, "grad_norm": 2.609375, "learning_rate": 1.3793103448275863e-05, "loss": 0.684, "step": 44 }, { "epoch": 7.04, "eval_accuracy": 0.52, "eval_loss": 0.6930469274520874, "eval_runtime": 1.9088, "eval_samples_per_second": 130.974, "eval_steps_per_second": 3.667, "step": 44 }, { "epoch": 7.2, "grad_norm": 6.3125, "learning_rate": 1.2931034482758622e-05, "loss": 0.7027, "step": 45 }, { "epoch": 7.2, "eval_accuracy": 0.52, "eval_loss": 0.6939062476158142, "eval_runtime": 1.909, "eval_samples_per_second": 130.958, "eval_steps_per_second": 3.667, "step": 45 }, { "epoch": 7.36, "grad_norm": 3.78125, "learning_rate": 1.206896551724138e-05, "loss": 0.6893, "step": 46 }, { "epoch": 7.36, "eval_accuracy": 0.52, "eval_loss": 0.6937148571014404, "eval_runtime": 1.9058, "eval_samples_per_second": 131.178, "eval_steps_per_second": 3.673, "step": 46 }, { "epoch": 7.52, "grad_norm": 3.4375, "learning_rate": 1.1206896551724138e-05, "loss": 0.6754, "step": 47 }, { "epoch": 7.52, "eval_accuracy": 0.52, "eval_loss": 0.6943007707595825, "eval_runtime": 1.8591, "eval_samples_per_second": 134.472, "eval_steps_per_second": 3.765, "step": 47 }, { "epoch": 7.68, "grad_norm": 1.7109375, "learning_rate": 1.0344827586206897e-05, "loss": 0.7103, "step": 48 }, { "epoch": 7.68, "eval_accuracy": 0.52, "eval_loss": 0.6930898427963257, "eval_runtime": 1.9082, "eval_samples_per_second": 131.014, "eval_steps_per_second": 3.668, "step": 48 }, { "epoch": 7.84, "grad_norm": 6.0625, "learning_rate": 9.482758620689655e-06, "loss": 0.6965, "step": 49 }, { "epoch": 7.84, "eval_accuracy": 0.52, "eval_loss": 0.6940312385559082, "eval_runtime": 1.9059, "eval_samples_per_second": 131.173, "eval_steps_per_second": 3.673, "step": 49 }, { "epoch": 8.0, "grad_norm": 5.46875, "learning_rate": 8.620689655172414e-06, "loss": 0.682, "step": 50 }, { "epoch": 8.0, "eval_accuracy": 0.52, "eval_loss": 0.6938086152076721, "eval_runtime": 1.9046, "eval_samples_per_second": 131.261, "eval_steps_per_second": 3.675, "step": 50 }, { "epoch": 8.16, "grad_norm": 2.15625, "learning_rate": 7.758620689655173e-06, "loss": 0.6914, "step": 51 }, { "epoch": 8.16, "eval_accuracy": 0.52, "eval_loss": 0.6924687623977661, "eval_runtime": 1.9055, "eval_samples_per_second": 131.197, "eval_steps_per_second": 3.674, "step": 51 }, { "epoch": 8.32, "grad_norm": 4.09375, "learning_rate": 6.896551724137932e-06, "loss": 0.6963, "step": 52 }, { "epoch": 8.32, "eval_accuracy": 0.52, "eval_loss": 0.6934570074081421, "eval_runtime": 1.9059, "eval_samples_per_second": 131.17, "eval_steps_per_second": 3.673, "step": 52 }, { "epoch": 8.48, "grad_norm": 1.2265625, "learning_rate": 6.03448275862069e-06, "loss": 0.6962, "step": 53 }, { "epoch": 8.48, "eval_accuracy": 0.52, "eval_loss": 0.6923359632492065, "eval_runtime": 1.9059, "eval_samples_per_second": 131.173, "eval_steps_per_second": 3.673, "step": 53 }, { "epoch": 8.64, "grad_norm": 3.578125, "learning_rate": 5.172413793103448e-06, "loss": 0.6692, "step": 54 }, { "epoch": 8.64, "eval_accuracy": 0.52, "eval_loss": 0.6927656531333923, "eval_runtime": 1.906, "eval_samples_per_second": 131.168, "eval_steps_per_second": 3.673, "step": 54 }, { "epoch": 8.8, "grad_norm": 1.7421875, "learning_rate": 4.310344827586207e-06, "loss": 0.6991, "step": 55 }, { "epoch": 8.8, "eval_accuracy": 0.52, "eval_loss": 0.6923280954360962, "eval_runtime": 1.9063, "eval_samples_per_second": 131.146, "eval_steps_per_second": 3.672, "step": 55 }, { "epoch": 8.96, "grad_norm": 1.1796875, "learning_rate": 3.448275862068966e-06, "loss": 0.6808, "step": 56 }, { "epoch": 8.96, "eval_accuracy": 0.52, "eval_loss": 0.692800760269165, "eval_runtime": 1.9066, "eval_samples_per_second": 131.124, "eval_steps_per_second": 3.671, "step": 56 }, { "epoch": 9.12, "grad_norm": 9.25, "learning_rate": 2.586206896551724e-06, "loss": 0.7043, "step": 57 }, { "epoch": 9.12, "eval_accuracy": 0.52, "eval_loss": 0.6924726366996765, "eval_runtime": 1.9066, "eval_samples_per_second": 131.123, "eval_steps_per_second": 3.671, "step": 57 }, { "epoch": 9.28, "grad_norm": 1.15625, "learning_rate": 1.724137931034483e-06, "loss": 0.6786, "step": 58 }, { "epoch": 9.28, "eval_accuracy": 0.52, "eval_loss": 0.6920429468154907, "eval_runtime": 1.9069, "eval_samples_per_second": 131.104, "eval_steps_per_second": 3.671, "step": 58 }, { "epoch": 9.44, "grad_norm": 6.125, "learning_rate": 8.620689655172415e-07, "loss": 0.6852, "step": 59 }, { "epoch": 9.44, "eval_accuracy": 0.52, "eval_loss": 0.6916835904121399, "eval_runtime": 1.9088, "eval_samples_per_second": 130.97, "eval_steps_per_second": 3.667, "step": 59 }, { "epoch": 9.6, "grad_norm": 1.3828125, "learning_rate": 0.0, "loss": 0.6888, "step": 60 }, { "epoch": 9.6, "eval_accuracy": 0.52, "eval_loss": 0.6920117139816284, "eval_runtime": 1.9079, "eval_samples_per_second": 131.035, "eval_steps_per_second": 3.669, "step": 60 }, { "epoch": 9.6, "step": 60, "total_flos": 2.337091978736435e+16, "train_loss": 0.6971277634302775, "train_runtime": 291.0168, "train_samples_per_second": 34.362, "train_steps_per_second": 0.206 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.337091978736435e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }