|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9993935718617344, |
|
"eval_steps": 500, |
|
"global_step": 412, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02425712553062462, |
|
"grad_norm": 86.94617462158203, |
|
"learning_rate": 1.1904761904761906e-06, |
|
"loss": 2.9554, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04851425106124924, |
|
"grad_norm": 29.724809646606445, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 1.5779, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07277137659187387, |
|
"grad_norm": 16.428190231323242, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 1.2023, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09702850212249849, |
|
"grad_norm": 17.30933380126953, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 1.1538, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1212856276531231, |
|
"grad_norm": 15.137809753417969, |
|
"learning_rate": 4.994234734765043e-06, |
|
"loss": 1.0789, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14554275318374774, |
|
"grad_norm": 11.950181007385254, |
|
"learning_rate": 4.9708589101037306e-06, |
|
"loss": 1.0103, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16979987871437235, |
|
"grad_norm": 14.55068302154541, |
|
"learning_rate": 4.92968049037552e-06, |
|
"loss": 1.0078, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19405700424499697, |
|
"grad_norm": 27.079729080200195, |
|
"learning_rate": 4.870996167038154e-06, |
|
"loss": 1.0301, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2183141297756216, |
|
"grad_norm": 16.41851043701172, |
|
"learning_rate": 4.7952287619860276e-06, |
|
"loss": 1.0042, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2425712553062462, |
|
"grad_norm": 14.759162902832031, |
|
"learning_rate": 4.702924181108745e-06, |
|
"loss": 0.9602, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2668283808368708, |
|
"grad_norm": 14.426393508911133, |
|
"learning_rate": 4.594747481026685e-06, |
|
"loss": 0.978, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2910855063674955, |
|
"grad_norm": 11.475569725036621, |
|
"learning_rate": 4.471478077342798e-06, |
|
"loss": 0.9809, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.31534263189812006, |
|
"grad_norm": 14.514723777770996, |
|
"learning_rate": 4.334004128935342e-06, |
|
"loss": 0.981, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3395997574287447, |
|
"grad_norm": 11.994640350341797, |
|
"learning_rate": 4.183316138752799e-06, |
|
"loss": 0.9829, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3638568829593693, |
|
"grad_norm": 81.57245635986328, |
|
"learning_rate": 4.020499817217441e-06, |
|
"loss": 0.9799, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.38811400848999394, |
|
"grad_norm": 13.99566650390625, |
|
"learning_rate": 3.84672825965686e-06, |
|
"loss": 0.9717, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.41237113402061853, |
|
"grad_norm": 12.517782211303711, |
|
"learning_rate": 3.663253494125244e-06, |
|
"loss": 0.9327, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4366282595512432, |
|
"grad_norm": 13.505069732666016, |
|
"learning_rate": 3.4713974605125634e-06, |
|
"loss": 0.9839, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.46088538508186777, |
|
"grad_norm": 12.123452186584473, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.9508, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4851425106124924, |
|
"grad_norm": 29.056503295898438, |
|
"learning_rate": 3.0681213250482255e-06, |
|
"loss": 0.9075, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.509399636143117, |
|
"grad_norm": 11.140923500061035, |
|
"learning_rate": 2.8596068369936386e-06, |
|
"loss": 0.9405, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5336567616737417, |
|
"grad_norm": 14.713190078735352, |
|
"learning_rate": 2.648501373438142e-06, |
|
"loss": 0.8641, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5579138872043663, |
|
"grad_norm": 19.95207405090332, |
|
"learning_rate": 2.436325954084122e-06, |
|
"loss": 0.9529, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.582171012734991, |
|
"grad_norm": 11.159050941467285, |
|
"learning_rate": 2.2246093076900145e-06, |
|
"loss": 0.9201, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6064281382656155, |
|
"grad_norm": 12.661371231079102, |
|
"learning_rate": 2.014876857544562e-06, |
|
"loss": 0.9249, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6306852637962401, |
|
"grad_norm": 12.855792045593262, |
|
"learning_rate": 1.8086397307570724e-06, |
|
"loss": 0.9223, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6549423893268648, |
|
"grad_norm": 12.543617248535156, |
|
"learning_rate": 1.6073838705519618e-06, |
|
"loss": 0.9151, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6791995148574894, |
|
"grad_norm": 11.495418548583984, |
|
"learning_rate": 1.4125593300137767e-06, |
|
"loss": 0.8661, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7034566403881141, |
|
"grad_norm": 15.451556205749512, |
|
"learning_rate": 1.2255698244214863e-06, |
|
"loss": 0.946, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7277137659187386, |
|
"grad_norm": 14.111011505126953, |
|
"learning_rate": 1.0477626174477403e-06, |
|
"loss": 0.9041, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7519708914493632, |
|
"grad_norm": 9.91163444519043, |
|
"learning_rate": 8.804188140932251e-07, |
|
"loss": 0.8724, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7762280169799879, |
|
"grad_norm": 17.55638313293457, |
|
"learning_rate": 7.247441302957858e-07, |
|
"loss": 0.888, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8004851425106125, |
|
"grad_norm": 16.491750717163086, |
|
"learning_rate": 5.818602057194589e-07, |
|
"loss": 0.8579, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8247422680412371, |
|
"grad_norm": 10.457083702087402, |
|
"learning_rate": 4.527965223149958e-07, |
|
"loss": 0.8965, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8489993935718617, |
|
"grad_norm": 10.995795249938965, |
|
"learning_rate": 3.3848298687881143e-07, |
|
"loss": 0.8624, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8732565191024864, |
|
"grad_norm": 15.046560287475586, |
|
"learning_rate": 2.397432310532133e-07, |
|
"loss": 0.8749, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.897513644633111, |
|
"grad_norm": 12.04796314239502, |
|
"learning_rate": 1.5728867704154076e-07, |
|
"loss": 0.857, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9217707701637355, |
|
"grad_norm": 14.930426597595215, |
|
"learning_rate": 9.171341179489034e-08, |
|
"loss": 0.8997, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9460278956943602, |
|
"grad_norm": 16.0552978515625, |
|
"learning_rate": 4.348990660201669e-08, |
|
"loss": 0.8803, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9702850212249848, |
|
"grad_norm": 11.000052452087402, |
|
"learning_rate": 1.296561292287446e-08, |
|
"loss": 0.95, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9945421467556095, |
|
"grad_norm": 14.236842155456543, |
|
"learning_rate": 3.604589928837832e-10, |
|
"loss": 0.8863, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9993935718617344, |
|
"step": 412, |
|
"total_flos": 4.874908633965527e+17, |
|
"train_loss": 1.011520906270129, |
|
"train_runtime": 2109.7946, |
|
"train_samples_per_second": 25.008, |
|
"train_steps_per_second": 0.195 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 412, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.874908633965527e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|