{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0033222591362125,
"eval_steps": 57,
"global_step": 226,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004429678848283499,
"eval_loss": 2.5983426570892334,
"eval_runtime": 5.0405,
"eval_samples_per_second": 18.847,
"eval_steps_per_second": 9.523,
"step": 1
},
{
"epoch": 0.0221483942414175,
"grad_norm": 1.4756156206130981,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.9339,
"step": 5
},
{
"epoch": 0.044296788482835,
"grad_norm": 1.9938647747039795,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.2285,
"step": 10
},
{
"epoch": 0.0664451827242525,
"grad_norm": 2.0396549701690674,
"learning_rate": 5e-05,
"loss": 2.5228,
"step": 15
},
{
"epoch": 0.08859357696567,
"grad_norm": 1.985188603401184,
"learning_rate": 6.666666666666667e-05,
"loss": 1.8434,
"step": 20
},
{
"epoch": 0.11074197120708748,
"grad_norm": 2.442502975463867,
"learning_rate": 8.333333333333334e-05,
"loss": 1.5617,
"step": 25
},
{
"epoch": 0.132890365448505,
"grad_norm": 2.1058413982391357,
"learning_rate": 0.0001,
"loss": 1.4216,
"step": 30
},
{
"epoch": 0.15503875968992248,
"grad_norm": 2.086914300918579,
"learning_rate": 9.983951473748578e-05,
"loss": 1.2727,
"step": 35
},
{
"epoch": 0.17718715393134,
"grad_norm": 1.712249994277954,
"learning_rate": 9.935908917072252e-05,
"loss": 1.4231,
"step": 40
},
{
"epoch": 0.19933554817275748,
"grad_norm": 2.7731285095214844,
"learning_rate": 9.85618073486382e-05,
"loss": 1.482,
"step": 45
},
{
"epoch": 0.22148394241417496,
"grad_norm": 5.423552989959717,
"learning_rate": 9.745278735053343e-05,
"loss": 2.1623,
"step": 50
},
{
"epoch": 0.24363233665559247,
"grad_norm": 1.5658822059631348,
"learning_rate": 9.603914843102941e-05,
"loss": 0.7675,
"step": 55
},
{
"epoch": 0.25249169435215946,
"eval_loss": 1.1176186800003052,
"eval_runtime": 5.0421,
"eval_samples_per_second": 18.841,
"eval_steps_per_second": 9.52,
"step": 57
},
{
"epoch": 0.26578073089701,
"grad_norm": 1.5977628231048584,
"learning_rate": 9.432996531865002e-05,
"loss": 0.9452,
"step": 60
},
{
"epoch": 0.28792912513842744,
"grad_norm": 1.3801822662353516,
"learning_rate": 9.233620996141421e-05,
"loss": 1.0662,
"step": 65
},
{
"epoch": 0.31007751937984496,
"grad_norm": 1.4718172550201416,
"learning_rate": 9.007068109339784e-05,
"loss": 1.021,
"step": 70
},
{
"epoch": 0.33222591362126247,
"grad_norm": 1.4416835308074951,
"learning_rate": 8.754792207440557e-05,
"loss": 1.1909,
"step": 75
},
{
"epoch": 0.35437430786268,
"grad_norm": 1.3599156141281128,
"learning_rate": 8.478412753017433e-05,
"loss": 1.0854,
"step": 80
},
{
"epoch": 0.37652270210409744,
"grad_norm": 1.7577108144760132,
"learning_rate": 8.179703939242276e-05,
"loss": 0.9214,
"step": 85
},
{
"epoch": 0.39867109634551495,
"grad_norm": 1.8137191534042358,
"learning_rate": 7.860583300610849e-05,
"loss": 1.1752,
"step": 90
},
{
"epoch": 0.42081949058693247,
"grad_norm": 1.6590023040771484,
"learning_rate": 7.52309940350173e-05,
"loss": 1.2202,
"step": 95
},
{
"epoch": 0.4429678848283499,
"grad_norm": 2.184537410736084,
"learning_rate": 7.169418695587791e-05,
"loss": 1.8343,
"step": 100
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.2622162103652954,
"learning_rate": 6.801811598519268e-05,
"loss": 0.8566,
"step": 105
},
{
"epoch": 0.48726467331118495,
"grad_norm": 1.2197306156158447,
"learning_rate": 6.422637933155162e-05,
"loss": 1.1106,
"step": 110
},
{
"epoch": 0.5049833887043189,
"eval_loss": 1.0898905992507935,
"eval_runtime": 5.0519,
"eval_samples_per_second": 18.805,
"eval_steps_per_second": 9.501,
"step": 114
},
{
"epoch": 0.5094130675526024,
"grad_norm": 1.1158766746520996,
"learning_rate": 6.0343317709044546e-05,
"loss": 1.0906,
"step": 115
},
{
"epoch": 0.53156146179402,
"grad_norm": 1.4978259801864624,
"learning_rate": 5.6393858084225305e-05,
"loss": 0.8869,
"step": 120
},
{
"epoch": 0.5537098560354374,
"grad_norm": 1.5120636224746704,
"learning_rate": 5.240335365968104e-05,
"loss": 1.1671,
"step": 125
},
{
"epoch": 0.5758582502768549,
"grad_norm": 1.4071290493011475,
"learning_rate": 4.839742112141724e-05,
"loss": 1.2143,
"step": 130
},
{
"epoch": 0.5980066445182725,
"grad_norm": 1.576568365097046,
"learning_rate": 4.4401776194834613e-05,
"loss": 1.0833,
"step": 135
},
{
"epoch": 0.6201550387596899,
"grad_norm": 1.2496248483657837,
"learning_rate": 4.04420685649314e-05,
"loss": 1.1847,
"step": 140
},
{
"epoch": 0.6423034330011074,
"grad_norm": 1.3625198602676392,
"learning_rate": 3.654371722044616e-05,
"loss": 1.067,
"step": 145
},
{
"epoch": 0.6644518272425249,
"grad_norm": 2.972466230392456,
"learning_rate": 3.273174727893463e-05,
"loss": 1.7863,
"step": 150
},
{
"epoch": 0.6866002214839424,
"grad_norm": 0.9294478297233582,
"learning_rate": 2.9030629340267164e-05,
"loss": 0.9216,
"step": 155
},
{
"epoch": 0.70874861572536,
"grad_norm": 1.618067979812622,
"learning_rate": 2.5464122399803125e-05,
"loss": 1.0288,
"step": 160
},
{
"epoch": 0.7308970099667774,
"grad_norm": 1.3609340190887451,
"learning_rate": 2.2055121329646418e-05,
"loss": 0.9452,
"step": 165
},
{
"epoch": 0.7530454042081949,
"grad_norm": 1.2449376583099365,
"learning_rate": 1.8825509907063327e-05,
"loss": 0.8706,
"step": 170
},
{
"epoch": 0.7574750830564784,
"eval_loss": 1.0665383338928223,
"eval_runtime": 5.0548,
"eval_samples_per_second": 18.794,
"eval_steps_per_second": 9.496,
"step": 171
},
{
"epoch": 0.7751937984496124,
"grad_norm": 1.1113590002059937,
"learning_rate": 1.5796020333532695e-05,
"loss": 1.0823,
"step": 175
},
{
"epoch": 0.7973421926910299,
"grad_norm": 1.1141512393951416,
"learning_rate": 1.2986100146234232e-05,
"loss": 0.8785,
"step": 180
},
{
"epoch": 0.8194905869324474,
"grad_norm": 1.8312709331512451,
"learning_rate": 1.0413787376324019e-05,
"loss": 1.4858,
"step": 185
},
{
"epoch": 0.8416389811738649,
"grad_norm": 1.527754545211792,
"learning_rate": 8.09559475540797e-06,
"loss": 1.3716,
"step": 190
},
{
"epoch": 0.8637873754152824,
"grad_norm": 2.084320545196533,
"learning_rate": 6.0464037135391395e-06,
"loss": 1.3418,
"step": 195
},
{
"epoch": 0.8859357696566998,
"grad_norm": 2.9202771186828613,
"learning_rate": 4.279368849209381e-06,
"loss": 1.5893,
"step": 200
},
{
"epoch": 0.9080841638981174,
"grad_norm": 1.439396619796753,
"learning_rate": 2.8058334845816213e-06,
"loss": 0.9657,
"step": 205
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.4320131540298462,
"learning_rate": 1.6352568480485276e-06,
"loss": 1.0156,
"step": 210
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.5984946489334106,
"learning_rate": 7.751533515623799e-07,
"loss": 1.0568,
"step": 215
},
{
"epoch": 0.9745293466223699,
"grad_norm": 1.2688162326812744,
"learning_rate": 2.310443525400885e-07,
"loss": 0.9799,
"step": 220
},
{
"epoch": 0.9966777408637874,
"grad_norm": 2.5447208881378174,
"learning_rate": 6.422710003439747e-09,
"loss": 1.5954,
"step": 225
}
],
"logging_steps": 5,
"max_steps": 226,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.098941995142349e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}