|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0425260472039124, |
|
"eval_steps": 13, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008505209440782479, |
|
"grad_norm": 7.646979331970215, |
|
"learning_rate": 5e-06, |
|
"loss": 11.2339, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008505209440782479, |
|
"eval_loss": 2.182969570159912, |
|
"eval_runtime": 18.1957, |
|
"eval_samples_per_second": 27.204, |
|
"eval_steps_per_second": 13.63, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0017010418881564958, |
|
"grad_norm": 2.7476022243499756, |
|
"learning_rate": 1e-05, |
|
"loss": 8.1504, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0025515628322347436, |
|
"grad_norm": 4.530780792236328, |
|
"learning_rate": 1.5e-05, |
|
"loss": 9.479, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0034020837763129915, |
|
"grad_norm": 4.3689985275268555, |
|
"learning_rate": 2e-05, |
|
"loss": 8.6752, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00425260472039124, |
|
"grad_norm": 4.250224590301514, |
|
"learning_rate": 2.5e-05, |
|
"loss": 8.7246, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005103125664469487, |
|
"grad_norm": 3.909888505935669, |
|
"learning_rate": 3e-05, |
|
"loss": 8.8007, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.005953646608547736, |
|
"grad_norm": 4.2319536209106445, |
|
"learning_rate": 3.5e-05, |
|
"loss": 8.6001, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.006804167552625983, |
|
"grad_norm": 3.7226130962371826, |
|
"learning_rate": 4e-05, |
|
"loss": 8.6729, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.007654688496704231, |
|
"grad_norm": 4.978116512298584, |
|
"learning_rate": 4.5e-05, |
|
"loss": 9.7467, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00850520944078248, |
|
"grad_norm": 7.846011161804199, |
|
"learning_rate": 5e-05, |
|
"loss": 10.2529, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009355730384860728, |
|
"grad_norm": 7.632503032684326, |
|
"learning_rate": 4.99229333433282e-05, |
|
"loss": 10.6994, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.010206251328938975, |
|
"grad_norm": 6.259686470031738, |
|
"learning_rate": 4.9692208514878444e-05, |
|
"loss": 9.2648, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.011056772273017223, |
|
"grad_norm": 4.645693302154541, |
|
"learning_rate": 4.9309248009941914e-05, |
|
"loss": 8.8708, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.011056772273017223, |
|
"eval_loss": 2.120561122894287, |
|
"eval_runtime": 18.1798, |
|
"eval_samples_per_second": 27.228, |
|
"eval_steps_per_second": 13.642, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.011907293217095471, |
|
"grad_norm": 3.9054300785064697, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 7.6945, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01275781416117372, |
|
"grad_norm": 6.3420305252075195, |
|
"learning_rate": 4.8096988312782174e-05, |
|
"loss": 8.6295, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013608335105251966, |
|
"grad_norm": 6.098008155822754, |
|
"learning_rate": 4.72751631047092e-05, |
|
"loss": 7.3762, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.014458856049330214, |
|
"grad_norm": 6.397308826446533, |
|
"learning_rate": 4.6316004108852305e-05, |
|
"loss": 8.6222, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.015309376993408463, |
|
"grad_norm": 4.59157133102417, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 7.8273, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.01615989793748671, |
|
"grad_norm": 5.998552322387695, |
|
"learning_rate": 4.401014914000078e-05, |
|
"loss": 9.8392, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.01701041888156496, |
|
"grad_norm": 4.757502555847168, |
|
"learning_rate": 4.267766952966369e-05, |
|
"loss": 8.913, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.017860939825643206, |
|
"grad_norm": 5.794950485229492, |
|
"learning_rate": 4.123620120825459e-05, |
|
"loss": 8.4942, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.018711460769721456, |
|
"grad_norm": 7.268848419189453, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 9.6959, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.019561981713799702, |
|
"grad_norm": 4.623784065246582, |
|
"learning_rate": 3.8062464117898724e-05, |
|
"loss": 7.3824, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.02041250265787795, |
|
"grad_norm": 4.572867393493652, |
|
"learning_rate": 3.634976249348867e-05, |
|
"loss": 8.0812, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0212630236019562, |
|
"grad_norm": 6.01804780960083, |
|
"learning_rate": 3.456708580912725e-05, |
|
"loss": 8.0338, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.022113544546034446, |
|
"grad_norm": 4.490617275238037, |
|
"learning_rate": 3.272542485937369e-05, |
|
"loss": 7.893, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.022113544546034446, |
|
"eval_loss": 1.961138367652893, |
|
"eval_runtime": 18.1047, |
|
"eval_samples_per_second": 27.341, |
|
"eval_steps_per_second": 13.698, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.022964065490112696, |
|
"grad_norm": 3.840120553970337, |
|
"learning_rate": 3.083613409639764e-05, |
|
"loss": 6.86, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.023814586434190942, |
|
"grad_norm": 5.417391777038574, |
|
"learning_rate": 2.8910861626005776e-05, |
|
"loss": 7.7778, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.02466510737826919, |
|
"grad_norm": 3.8475501537323, |
|
"learning_rate": 2.6961477393196126e-05, |
|
"loss": 5.9285, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.02551562832234744, |
|
"grad_norm": 4.8878374099731445, |
|
"learning_rate": 2.5e-05, |
|
"loss": 7.3934, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.026366149266425686, |
|
"grad_norm": 4.846117973327637, |
|
"learning_rate": 2.303852260680388e-05, |
|
"loss": 7.3275, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.027216670210503932, |
|
"grad_norm": 3.879229784011841, |
|
"learning_rate": 2.1089138373994223e-05, |
|
"loss": 6.676, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.028067191154582182, |
|
"grad_norm": 5.315415859222412, |
|
"learning_rate": 1.9163865903602374e-05, |
|
"loss": 7.6324, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.02891771209866043, |
|
"grad_norm": 4.311729907989502, |
|
"learning_rate": 1.7274575140626318e-05, |
|
"loss": 6.9114, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02976823304273868, |
|
"grad_norm": 6.797874450683594, |
|
"learning_rate": 1.5432914190872757e-05, |
|
"loss": 8.9163, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.030618753986816925, |
|
"grad_norm": 6.546699047088623, |
|
"learning_rate": 1.3650237506511331e-05, |
|
"loss": 9.0227, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03146927493089517, |
|
"grad_norm": 4.277498722076416, |
|
"learning_rate": 1.1937535882101281e-05, |
|
"loss": 7.4829, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.03231979587497342, |
|
"grad_norm": 5.2016143798828125, |
|
"learning_rate": 1.0305368692688174e-05, |
|
"loss": 7.472, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.03317031681905167, |
|
"grad_norm": 4.864084243774414, |
|
"learning_rate": 8.763798791745411e-06, |
|
"loss": 8.2183, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.03317031681905167, |
|
"eval_loss": 1.9271198511123657, |
|
"eval_runtime": 18.0882, |
|
"eval_samples_per_second": 27.366, |
|
"eval_steps_per_second": 13.711, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.03402083776312992, |
|
"grad_norm": 4.204705715179443, |
|
"learning_rate": 7.3223304703363135e-06, |
|
"loss": 7.1296, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.034871358707208165, |
|
"grad_norm": 6.853778839111328, |
|
"learning_rate": 5.989850859999227e-06, |
|
"loss": 7.9356, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.03572187965128641, |
|
"grad_norm": 6.098489284515381, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 9.2043, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.03657240059536466, |
|
"grad_norm": 6.261697292327881, |
|
"learning_rate": 3.6839958911476957e-06, |
|
"loss": 9.5531, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.03742292153944291, |
|
"grad_norm": 5.243067741394043, |
|
"learning_rate": 2.7248368952908053e-06, |
|
"loss": 8.3141, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.03827344248352116, |
|
"grad_norm": 5.846104145050049, |
|
"learning_rate": 1.9030116872178316e-06, |
|
"loss": 8.37, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.039123963427599405, |
|
"grad_norm": 8.3400239944458, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 8.7011, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.03997448437167765, |
|
"grad_norm": 4.996219158172607, |
|
"learning_rate": 6.907519900580861e-07, |
|
"loss": 7.4367, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.0408250053157559, |
|
"grad_norm": 4.701981067657471, |
|
"learning_rate": 3.077914851215585e-07, |
|
"loss": 7.4361, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.04167552625983415, |
|
"grad_norm": 5.824567794799805, |
|
"learning_rate": 7.706665667180091e-08, |
|
"loss": 9.0153, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0425260472039124, |
|
"grad_norm": 5.769680023193359, |
|
"learning_rate": 0.0, |
|
"loss": 8.7723, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 13, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8387975649951744.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|