{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0033222591362125, "eval_steps": 57, "global_step": 226, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004429678848283499, "eval_loss": 2.5983426570892334, "eval_runtime": 5.0405, "eval_samples_per_second": 18.847, "eval_steps_per_second": 9.523, "step": 1 }, { "epoch": 0.0221483942414175, "grad_norm": 1.4756156206130981, "learning_rate": 1.6666666666666667e-05, "loss": 1.9339, "step": 5 }, { "epoch": 0.044296788482835, "grad_norm": 1.9938647747039795, "learning_rate": 3.3333333333333335e-05, "loss": 2.2285, "step": 10 }, { "epoch": 0.0664451827242525, "grad_norm": 2.0396549701690674, "learning_rate": 5e-05, "loss": 2.5228, "step": 15 }, { "epoch": 0.08859357696567, "grad_norm": 1.985188603401184, "learning_rate": 6.666666666666667e-05, "loss": 1.8434, "step": 20 }, { "epoch": 0.11074197120708748, "grad_norm": 2.442502975463867, "learning_rate": 8.333333333333334e-05, "loss": 1.5617, "step": 25 }, { "epoch": 0.132890365448505, "grad_norm": 2.1058413982391357, "learning_rate": 0.0001, "loss": 1.4216, "step": 30 }, { "epoch": 0.15503875968992248, "grad_norm": 2.086914300918579, "learning_rate": 9.983951473748578e-05, "loss": 1.2727, "step": 35 }, { "epoch": 0.17718715393134, "grad_norm": 1.712249994277954, "learning_rate": 9.935908917072252e-05, "loss": 1.4231, "step": 40 }, { "epoch": 0.19933554817275748, "grad_norm": 2.7731285095214844, "learning_rate": 9.85618073486382e-05, "loss": 1.482, "step": 45 }, { "epoch": 0.22148394241417496, "grad_norm": 5.423552989959717, "learning_rate": 9.745278735053343e-05, "loss": 2.1623, "step": 50 }, { "epoch": 0.24363233665559247, "grad_norm": 1.5658822059631348, "learning_rate": 9.603914843102941e-05, "loss": 0.7675, "step": 55 }, { "epoch": 0.25249169435215946, "eval_loss": 1.1176186800003052, "eval_runtime": 5.0421, "eval_samples_per_second": 18.841, "eval_steps_per_second": 9.52, "step": 57 }, { "epoch": 0.26578073089701, "grad_norm": 1.5977628231048584, "learning_rate": 9.432996531865002e-05, "loss": 0.9452, "step": 60 }, { "epoch": 0.28792912513842744, "grad_norm": 1.3801822662353516, "learning_rate": 9.233620996141421e-05, "loss": 1.0662, "step": 65 }, { "epoch": 0.31007751937984496, "grad_norm": 1.4718172550201416, "learning_rate": 9.007068109339784e-05, "loss": 1.021, "step": 70 }, { "epoch": 0.33222591362126247, "grad_norm": 1.4416835308074951, "learning_rate": 8.754792207440557e-05, "loss": 1.1909, "step": 75 }, { "epoch": 0.35437430786268, "grad_norm": 1.3599156141281128, "learning_rate": 8.478412753017433e-05, "loss": 1.0854, "step": 80 }, { "epoch": 0.37652270210409744, "grad_norm": 1.7577108144760132, "learning_rate": 8.179703939242276e-05, "loss": 0.9214, "step": 85 }, { "epoch": 0.39867109634551495, "grad_norm": 1.8137191534042358, "learning_rate": 7.860583300610849e-05, "loss": 1.1752, "step": 90 }, { "epoch": 0.42081949058693247, "grad_norm": 1.6590023040771484, "learning_rate": 7.52309940350173e-05, "loss": 1.2202, "step": 95 }, { "epoch": 0.4429678848283499, "grad_norm": 2.184537410736084, "learning_rate": 7.169418695587791e-05, "loss": 1.8343, "step": 100 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2622162103652954, "learning_rate": 6.801811598519268e-05, "loss": 0.8566, "step": 105 }, { "epoch": 0.48726467331118495, "grad_norm": 1.2197306156158447, "learning_rate": 6.422637933155162e-05, "loss": 1.1106, "step": 110 }, { "epoch": 0.5049833887043189, "eval_loss": 1.0898905992507935, 
"eval_runtime": 5.0519, "eval_samples_per_second": 18.805, "eval_steps_per_second": 9.501, "step": 114 }, { "epoch": 0.5094130675526024, "grad_norm": 1.1158766746520996, "learning_rate": 6.0343317709044546e-05, "loss": 1.0906, "step": 115 }, { "epoch": 0.53156146179402, "grad_norm": 1.4978259801864624, "learning_rate": 5.6393858084225305e-05, "loss": 0.8869, "step": 120 }, { "epoch": 0.5537098560354374, "grad_norm": 1.5120636224746704, "learning_rate": 5.240335365968104e-05, "loss": 1.1671, "step": 125 }, { "epoch": 0.5758582502768549, "grad_norm": 1.4071290493011475, "learning_rate": 4.839742112141724e-05, "loss": 1.2143, "step": 130 }, { "epoch": 0.5980066445182725, "grad_norm": 1.576568365097046, "learning_rate": 4.4401776194834613e-05, "loss": 1.0833, "step": 135 }, { "epoch": 0.6201550387596899, "grad_norm": 1.2496248483657837, "learning_rate": 4.04420685649314e-05, "loss": 1.1847, "step": 140 }, { "epoch": 0.6423034330011074, "grad_norm": 1.3625198602676392, "learning_rate": 3.654371722044616e-05, "loss": 1.067, "step": 145 }, { "epoch": 0.6644518272425249, "grad_norm": 2.972466230392456, "learning_rate": 3.273174727893463e-05, "loss": 1.7863, "step": 150 }, { "epoch": 0.6866002214839424, "grad_norm": 0.9294478297233582, "learning_rate": 2.9030629340267164e-05, "loss": 0.9216, "step": 155 }, { "epoch": 0.70874861572536, "grad_norm": 1.618067979812622, "learning_rate": 2.5464122399803125e-05, "loss": 1.0288, "step": 160 }, { "epoch": 0.7308970099667774, "grad_norm": 1.3609340190887451, "learning_rate": 2.2055121329646418e-05, "loss": 0.9452, "step": 165 }, { "epoch": 0.7530454042081949, "grad_norm": 1.2449376583099365, "learning_rate": 1.8825509907063327e-05, "loss": 0.8706, "step": 170 }, { "epoch": 0.7574750830564784, "eval_loss": 1.0665383338928223, "eval_runtime": 5.0548, "eval_samples_per_second": 18.794, "eval_steps_per_second": 9.496, "step": 171 }, { "epoch": 0.7751937984496124, "grad_norm": 1.1113590002059937, "learning_rate": 1.5796020333532695e-05, "loss": 1.0823, "step": 175 }, { "epoch": 0.7973421926910299, "grad_norm": 1.1141512393951416, "learning_rate": 1.2986100146234232e-05, "loss": 0.8785, "step": 180 }, { "epoch": 0.8194905869324474, "grad_norm": 1.8312709331512451, "learning_rate": 1.0413787376324019e-05, "loss": 1.4858, "step": 185 }, { "epoch": 0.8416389811738649, "grad_norm": 1.527754545211792, "learning_rate": 8.09559475540797e-06, "loss": 1.3716, "step": 190 }, { "epoch": 0.8637873754152824, "grad_norm": 2.084320545196533, "learning_rate": 6.0464037135391395e-06, "loss": 1.3418, "step": 195 }, { "epoch": 0.8859357696566998, "grad_norm": 2.9202771186828613, "learning_rate": 4.279368849209381e-06, "loss": 1.5893, "step": 200 }, { "epoch": 0.9080841638981174, "grad_norm": 1.439396619796753, "learning_rate": 2.8058334845816213e-06, "loss": 0.9657, "step": 205 }, { "epoch": 0.9302325581395349, "grad_norm": 1.4320131540298462, "learning_rate": 1.6352568480485276e-06, "loss": 1.0156, "step": 210 }, { "epoch": 0.9523809523809523, "grad_norm": 1.5984946489334106, "learning_rate": 7.751533515623799e-07, "loss": 1.0568, "step": 215 }, { "epoch": 0.9745293466223699, "grad_norm": 1.2688162326812744, "learning_rate": 2.310443525400885e-07, "loss": 0.9799, "step": 220 }, { "epoch": 0.9966777408637874, "grad_norm": 2.5447208881378174, "learning_rate": 6.422710003439747e-09, "loss": 1.5954, "step": 225 } ], "logging_steps": 5, "max_steps": 226, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { 
"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.098941995142349e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }