|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.6530612244897959, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0326530612244898, |
|
"grad_norm": 0.3159657120704651, |
|
"learning_rate": 4.9985361990992455e-05, |
|
"loss": 0.1654, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0653061224489796, |
|
"grad_norm": 0.32706642150878906, |
|
"learning_rate": 4.9941465105674435e-05, |
|
"loss": 0.1369, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09795918367346938, |
|
"grad_norm": 0.33407843112945557, |
|
"learning_rate": 4.986836074908616e-05, |
|
"loss": 0.1259, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1306122448979592, |
|
"grad_norm": 0.3189881443977356, |
|
"learning_rate": 4.976613452940604e-05, |
|
"loss": 0.1041, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16326530612244897, |
|
"grad_norm": 0.3424989581108093, |
|
"learning_rate": 4.9634906157700036e-05, |
|
"loss": 0.1004, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19591836734693877, |
|
"grad_norm": 0.3253389298915863, |
|
"learning_rate": 4.9474829307735115e-05, |
|
"loss": 0.0941, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 0.26078635454177856, |
|
"learning_rate": 4.9286091436021015e-05, |
|
"loss": 0.0867, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2612244897959184, |
|
"grad_norm": 0.252139687538147, |
|
"learning_rate": 4.906891356229103e-05, |
|
"loss": 0.0853, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2938775510204082, |
|
"grad_norm": 0.3403972387313843, |
|
"learning_rate": 4.882355001067892e-05, |
|
"loss": 0.0863, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"grad_norm": 0.4710679352283478, |
|
"learning_rate": 4.855028811189496e-05, |
|
"loss": 0.0874, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35918367346938773, |
|
"grad_norm": 0.3147217929363251, |
|
"learning_rate": 4.8249447866750025e-05, |
|
"loss": 0.0733, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.39183673469387753, |
|
"grad_norm": 0.3265310823917389, |
|
"learning_rate": 4.792138157142158e-05, |
|
"loss": 0.0719, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.42448979591836733, |
|
"grad_norm": 0.35432252287864685, |
|
"learning_rate": 4.75664734049005e-05, |
|
"loss": 0.0824, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.3701626658439636, |
|
"learning_rate": 4.7185138979101864e-05, |
|
"loss": 0.0731, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"grad_norm": 0.35868266224861145, |
|
"learning_rate": 4.677782485216644e-05, |
|
"loss": 0.0725, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5224489795918368, |
|
"grad_norm": 0.32440632581710815, |
|
"learning_rate": 4.6345008005522966e-05, |
|
"loss": 0.0694, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5551020408163265, |
|
"grad_norm": 0.3003002405166626, |
|
"learning_rate": 4.588719528532342e-05, |
|
"loss": 0.072, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5877551020408164, |
|
"grad_norm": 0.34989920258522034, |
|
"learning_rate": 4.540492280890555e-05, |
|
"loss": 0.0646, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6204081632653061, |
|
"grad_norm": 0.473254919052124, |
|
"learning_rate": 4.4898755336977673e-05, |
|
"loss": 0.0732, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"grad_norm": 0.30768489837646484, |
|
"learning_rate": 4.436928561226087e-05, |
|
"loss": 0.068, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 459, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 6.206244600230707e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|