{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993935718617344, "eval_steps": 500, "global_step": 412, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02425712553062462, "grad_norm": 86.94617462158203, "learning_rate": 1.1904761904761906e-06, "loss": 2.9554, "step": 10 }, { "epoch": 0.04851425106124924, "grad_norm": 29.724809646606445, "learning_rate": 2.380952380952381e-06, "loss": 1.5779, "step": 20 }, { "epoch": 0.07277137659187387, "grad_norm": 16.428190231323242, "learning_rate": 3.5714285714285718e-06, "loss": 1.2023, "step": 30 }, { "epoch": 0.09702850212249849, "grad_norm": 17.30933380126953, "learning_rate": 4.761904761904762e-06, "loss": 1.1538, "step": 40 }, { "epoch": 0.1212856276531231, "grad_norm": 15.137809753417969, "learning_rate": 4.994234734765043e-06, "loss": 1.0789, "step": 50 }, { "epoch": 0.14554275318374774, "grad_norm": 11.950181007385254, "learning_rate": 4.9708589101037306e-06, "loss": 1.0103, "step": 60 }, { "epoch": 0.16979987871437235, "grad_norm": 14.55068302154541, "learning_rate": 4.92968049037552e-06, "loss": 1.0078, "step": 70 }, { "epoch": 0.19405700424499697, "grad_norm": 27.079729080200195, "learning_rate": 4.870996167038154e-06, "loss": 1.0301, "step": 80 }, { "epoch": 0.2183141297756216, "grad_norm": 16.41851043701172, "learning_rate": 4.7952287619860276e-06, "loss": 1.0042, "step": 90 }, { "epoch": 0.2425712553062462, "grad_norm": 14.759162902832031, "learning_rate": 4.702924181108745e-06, "loss": 0.9602, "step": 100 }, { "epoch": 0.2668283808368708, "grad_norm": 14.426393508911133, "learning_rate": 4.594747481026685e-06, "loss": 0.978, "step": 110 }, { "epoch": 0.2910855063674955, "grad_norm": 11.475569725036621, "learning_rate": 4.471478077342798e-06, "loss": 0.9809, "step": 120 }, { "epoch": 0.31534263189812006, "grad_norm": 14.514723777770996, "learning_rate": 4.334004128935342e-06, "loss": 0.981, "step": 130 }, { "epoch": 0.3395997574287447, "grad_norm": 11.994640350341797, "learning_rate": 4.183316138752799e-06, "loss": 0.9829, "step": 140 }, { "epoch": 0.3638568829593693, "grad_norm": 81.57245635986328, "learning_rate": 4.020499817217441e-06, "loss": 0.9799, "step": 150 }, { "epoch": 0.38811400848999394, "grad_norm": 13.99566650390625, "learning_rate": 3.84672825965686e-06, "loss": 0.9717, "step": 160 }, { "epoch": 0.41237113402061853, "grad_norm": 12.517782211303711, "learning_rate": 3.663253494125244e-06, "loss": 0.9327, "step": 170 }, { "epoch": 0.4366282595512432, "grad_norm": 13.505069732666016, "learning_rate": 3.4713974605125634e-06, "loss": 0.9839, "step": 180 }, { "epoch": 0.46088538508186777, "grad_norm": 12.123452186584473, "learning_rate": 3.272542485937369e-06, "loss": 0.9508, "step": 190 }, { "epoch": 0.4851425106124924, "grad_norm": 29.056503295898438, "learning_rate": 3.0681213250482255e-06, "loss": 0.9075, "step": 200 }, { "epoch": 0.509399636143117, "grad_norm": 11.140923500061035, "learning_rate": 2.8596068369936386e-06, "loss": 0.9405, "step": 210 }, { "epoch": 0.5336567616737417, "grad_norm": 14.713190078735352, "learning_rate": 2.648501373438142e-06, "loss": 0.8641, "step": 220 }, { "epoch": 0.5579138872043663, "grad_norm": 19.95207405090332, "learning_rate": 2.436325954084122e-06, "loss": 0.9529, "step": 230 }, { "epoch": 0.582171012734991, "grad_norm": 11.159050941467285, "learning_rate": 2.2246093076900145e-06, "loss": 0.9201, "step": 240 }, { "epoch": 0.6064281382656155, "grad_norm": 12.661371231079102, "learning_rate": 2.014876857544562e-06, "loss": 0.9249, "step": 250 }, { "epoch": 0.6306852637962401, "grad_norm": 12.855792045593262, "learning_rate": 1.8086397307570724e-06, "loss": 0.9223, "step": 260 }, { "epoch": 0.6549423893268648, "grad_norm": 12.543617248535156, "learning_rate": 1.6073838705519618e-06, "loss": 0.9151, "step": 270 }, { "epoch": 0.6791995148574894, "grad_norm": 11.495418548583984, "learning_rate": 1.4125593300137767e-06, "loss": 0.8661, "step": 280 }, { "epoch": 0.7034566403881141, "grad_norm": 15.451556205749512, "learning_rate": 1.2255698244214863e-06, "loss": 0.946, "step": 290 }, { "epoch": 0.7277137659187386, "grad_norm": 14.111011505126953, "learning_rate": 1.0477626174477403e-06, "loss": 0.9041, "step": 300 }, { "epoch": 0.7519708914493632, "grad_norm": 9.91163444519043, "learning_rate": 8.804188140932251e-07, "loss": 0.8724, "step": 310 }, { "epoch": 0.7762280169799879, "grad_norm": 17.55638313293457, "learning_rate": 7.247441302957858e-07, "loss": 0.888, "step": 320 }, { "epoch": 0.8004851425106125, "grad_norm": 16.491750717163086, "learning_rate": 5.818602057194589e-07, "loss": 0.8579, "step": 330 }, { "epoch": 0.8247422680412371, "grad_norm": 10.457083702087402, "learning_rate": 4.527965223149958e-07, "loss": 0.8965, "step": 340 }, { "epoch": 0.8489993935718617, "grad_norm": 10.995795249938965, "learning_rate": 3.3848298687881143e-07, "loss": 0.8624, "step": 350 }, { "epoch": 0.8732565191024864, "grad_norm": 15.046560287475586, "learning_rate": 2.397432310532133e-07, "loss": 0.8749, "step": 360 }, { "epoch": 0.897513644633111, "grad_norm": 12.04796314239502, "learning_rate": 1.5728867704154076e-07, "loss": 0.857, "step": 370 }, { "epoch": 0.9217707701637355, "grad_norm": 14.930426597595215, "learning_rate": 9.171341179489034e-08, "loss": 0.8997, "step": 380 }, { "epoch": 0.9460278956943602, "grad_norm": 16.0552978515625, "learning_rate": 4.348990660201669e-08, "loss": 0.8803, "step": 390 }, { "epoch": 0.9702850212249848, "grad_norm": 11.000052452087402, "learning_rate": 1.296561292287446e-08, "loss": 0.95, "step": 400 }, { "epoch": 0.9945421467556095, "grad_norm": 14.236842155456543, "learning_rate": 3.604589928837832e-10, "loss": 0.8863, "step": 410 }, { "epoch": 0.9993935718617344, "step": 412, "total_flos": 4.874908633965527e+17, "train_loss": 1.011520906270129, "train_runtime": 2109.7946, "train_samples_per_second": 25.008, "train_steps_per_second": 0.195 } ], "logging_steps": 10, "max_steps": 412, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.874908633965527e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }