{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0026274073619954284,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 3.5032098159939045e-05,
      "grad_norm": 8.220430374145508,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 16.7863,
      "step": 1
    },
    {
      "epoch": 3.5032098159939045e-05,
      "eval_loss": 2.912421226501465,
      "eval_runtime": 3850.8422,
      "eval_samples_per_second": 6.243,
      "eval_steps_per_second": 3.121,
      "step": 1
    },
    {
      "epoch": 7.006419631987809e-05,
      "grad_norm": 10.870444297790527,
      "learning_rate": 6.666666666666667e-05,
      "loss": 16.9888,
      "step": 2
    },
    {
      "epoch": 0.00010509629447981713,
      "grad_norm": 11.990473747253418,
      "learning_rate": 0.0001,
      "loss": 19.7956,
      "step": 3
    },
    {
      "epoch": 0.00014012839263975618,
      "grad_norm": 16.78184700012207,
      "learning_rate": 9.99524110790929e-05,
      "loss": 19.0291,
      "step": 4
    },
    {
      "epoch": 0.00017516049079969523,
      "grad_norm": 14.768333435058594,
      "learning_rate": 9.980973490458728e-05,
      "loss": 18.0519,
      "step": 5
    },
    {
      "epoch": 0.00021019258895963426,
      "grad_norm": 14.792781829833984,
      "learning_rate": 9.957224306869053e-05,
      "loss": 15.9296,
      "step": 6
    },
    {
      "epoch": 0.0002452246871195733,
      "grad_norm": 13.751137733459473,
      "learning_rate": 9.924038765061042e-05,
      "loss": 21.8163,
      "step": 7
    },
    {
      "epoch": 0.00028025678527951236,
      "grad_norm": 11.666454315185547,
      "learning_rate": 9.881480035599667e-05,
      "loss": 17.2661,
      "step": 8
    },
    {
      "epoch": 0.0003152888834394514,
      "grad_norm": 13.513507843017578,
      "learning_rate": 9.829629131445342e-05,
      "loss": 17.5924,
      "step": 9
    },
    {
      "epoch": 0.00035032098159939047,
      "grad_norm": 13.285558700561523,
      "learning_rate": 9.768584753741134e-05,
      "loss": 16.776,
      "step": 10
    },
    {
      "epoch": 0.00038535307975932947,
      "grad_norm": 14.108022689819336,
      "learning_rate": 9.698463103929542e-05,
      "loss": 16.9609,
      "step": 11
    },
    {
      "epoch": 0.0004203851779192685,
      "grad_norm": 12.603458404541016,
      "learning_rate": 9.619397662556435e-05,
      "loss": 14.9006,
      "step": 12
    },
    {
      "epoch": 0.00045541727607920757,
      "grad_norm": 14.595187187194824,
      "learning_rate": 9.53153893518325e-05,
      "loss": 17.0331,
      "step": 13
    },
    {
      "epoch": 0.0004904493742391466,
      "grad_norm": 13.04394817352295,
      "learning_rate": 9.435054165891109e-05,
      "loss": 15.4006,
      "step": 14
    },
    {
      "epoch": 0.0005254814723990856,
      "grad_norm": 15.335989952087402,
      "learning_rate": 9.330127018922194e-05,
      "loss": 14.6069,
      "step": 15
    },
    {
      "epoch": 0.0005605135705590247,
      "grad_norm": 17.090463638305664,
      "learning_rate": 9.21695722906443e-05,
      "loss": 17.1589,
      "step": 16
    },
    {
      "epoch": 0.0005955456687189637,
      "grad_norm": 14.205001831054688,
      "learning_rate": 9.09576022144496e-05,
      "loss": 15.1409,
      "step": 17
    },
    {
      "epoch": 0.0006305777668789028,
      "grad_norm": 14.655081748962402,
      "learning_rate": 8.966766701456177e-05,
      "loss": 15.1379,
      "step": 18
    },
    {
      "epoch": 0.0006656098650388418,
      "grad_norm": 12.469857215881348,
      "learning_rate": 8.83022221559489e-05,
      "loss": 13.5586,
      "step": 19
    },
    {
      "epoch": 0.0007006419631987809,
      "grad_norm": 25.291624069213867,
      "learning_rate": 8.68638668405062e-05,
      "loss": 14.093,
      "step": 20
    },
    {
      "epoch": 0.0007356740613587199,
      "grad_norm": 16.53830337524414,
      "learning_rate": 8.535533905932738e-05,
      "loss": 16.5948,
      "step": 21
    },
    {
      "epoch": 0.0007707061595186589,
      "grad_norm": 14.183712005615234,
      "learning_rate": 8.377951038078302e-05,
      "loss": 14.1989,
      "step": 22
    },
    {
      "epoch": 0.000805738257678598,
      "grad_norm": 20.805938720703125,
      "learning_rate": 8.213938048432697e-05,
      "loss": 15.1565,
      "step": 23
    },
    {
      "epoch": 0.000840770355838537,
      "grad_norm": 13.752044677734375,
      "learning_rate": 8.043807145043604e-05,
      "loss": 14.4959,
      "step": 24
    },
    {
      "epoch": 0.0008758024539984761,
      "grad_norm": 15.55850601196289,
      "learning_rate": 7.86788218175523e-05,
      "loss": 15.3995,
      "step": 25
    },
    {
      "epoch": 0.0008758024539984761,
      "eval_loss": 1.7509880065917969,
      "eval_runtime": 3845.137,
      "eval_samples_per_second": 6.252,
      "eval_steps_per_second": 3.126,
      "step": 25
    },
    {
      "epoch": 0.0009108345521584151,
      "grad_norm": 12.968778610229492,
      "learning_rate": 7.68649804173412e-05,
      "loss": 13.9149,
      "step": 26
    },
    {
      "epoch": 0.0009458666503183541,
      "grad_norm": 18.30549430847168,
      "learning_rate": 7.500000000000001e-05,
      "loss": 14.3664,
      "step": 27
    },
    {
      "epoch": 0.0009808987484782932,
      "grad_norm": 10.476753234863281,
      "learning_rate": 7.308743066175172e-05,
      "loss": 13.5992,
      "step": 28
    },
    {
      "epoch": 0.0010159308466382324,
      "grad_norm": 13.235085487365723,
      "learning_rate": 7.113091308703498e-05,
      "loss": 14.0461,
      "step": 29
    },
    {
      "epoch": 0.0010509629447981712,
      "grad_norm": 12.34658432006836,
      "learning_rate": 6.91341716182545e-05,
      "loss": 12.8937,
      "step": 30
    },
    {
      "epoch": 0.0010859950429581103,
      "grad_norm": 10.956889152526855,
      "learning_rate": 6.710100716628344e-05,
      "loss": 12.5274,
      "step": 31
    },
    {
      "epoch": 0.0011210271411180495,
      "grad_norm": 11.746081352233887,
      "learning_rate": 6.503528997521366e-05,
      "loss": 12.4693,
      "step": 32
    },
    {
      "epoch": 0.0011560592392779886,
      "grad_norm": 15.51623821258545,
      "learning_rate": 6.294095225512603e-05,
      "loss": 12.3992,
      "step": 33
    },
    {
      "epoch": 0.0011910913374379275,
      "grad_norm": 12.37278938293457,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 12.1944,
      "step": 34
    },
    {
      "epoch": 0.0012261234355978666,
      "grad_norm": 10.765904426574707,
      "learning_rate": 5.868240888334653e-05,
      "loss": 12.5313,
      "step": 35
    },
    {
      "epoch": 0.0012611555337578057,
      "grad_norm": 14.782493591308594,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 15.8179,
      "step": 36
    },
    {
      "epoch": 0.0012961876319177446,
      "grad_norm": 13.46235466003418,
      "learning_rate": 5.435778713738292e-05,
      "loss": 14.1264,
      "step": 37
    },
    {
      "epoch": 0.0013312197300776837,
      "grad_norm": 11.297664642333984,
      "learning_rate": 5.218096936826681e-05,
      "loss": 13.6005,
      "step": 38
    },
    {
      "epoch": 0.0013662518282376228,
      "grad_norm": 11.96649169921875,
      "learning_rate": 5e-05,
      "loss": 10.5872,
      "step": 39
    },
    {
      "epoch": 0.0014012839263975619,
      "grad_norm": 13.083653450012207,
      "learning_rate": 4.781903063173321e-05,
      "loss": 12.7705,
      "step": 40
    },
    {
      "epoch": 0.0014363160245575008,
      "grad_norm": 12.904142379760742,
      "learning_rate": 4.564221286261709e-05,
      "loss": 15.0502,
      "step": 41
    },
    {
      "epoch": 0.0014713481227174399,
      "grad_norm": 13.312074661254883,
      "learning_rate": 4.347369038899744e-05,
      "loss": 11.7917,
      "step": 42
    },
    {
      "epoch": 0.001506380220877379,
      "grad_norm": 15.562873840332031,
      "learning_rate": 4.131759111665349e-05,
      "loss": 13.647,
      "step": 43
    },
    {
      "epoch": 0.0015414123190373179,
      "grad_norm": 15.31185531616211,
      "learning_rate": 3.917801930309486e-05,
      "loss": 15.8205,
      "step": 44
    },
    {
      "epoch": 0.001576444417197257,
      "grad_norm": 17.315444946289062,
      "learning_rate": 3.705904774487396e-05,
      "loss": 17.6123,
      "step": 45
    },
    {
      "epoch": 0.001611476515357196,
      "grad_norm": 13.117803573608398,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 12.6254,
      "step": 46
    },
    {
      "epoch": 0.001646508613517135,
      "grad_norm": 15.708817481994629,
      "learning_rate": 3.289899283371657e-05,
      "loss": 14.3401,
      "step": 47
    },
    {
      "epoch": 0.001681540711677074,
      "grad_norm": 22.316585540771484,
      "learning_rate": 3.086582838174551e-05,
      "loss": 12.2662,
      "step": 48
    },
    {
      "epoch": 0.0017165728098370132,
      "grad_norm": 24.079978942871094,
      "learning_rate": 2.886908691296504e-05,
      "loss": 15.551,
      "step": 49
    },
    {
      "epoch": 0.0017516049079969523,
      "grad_norm": 44.766876220703125,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 16.2846,
      "step": 50
    },
    {
      "epoch": 0.0017516049079969523,
      "eval_loss": 1.6892892122268677,
      "eval_runtime": 3848.953,
      "eval_samples_per_second": 6.246,
      "eval_steps_per_second": 3.123,
      "step": 50
    },
    {
      "epoch": 0.0017866370061568912,
      "grad_norm": 11.697985649108887,
      "learning_rate": 2.500000000000001e-05,
      "loss": 10.0276,
      "step": 51
    },
    {
      "epoch": 0.0018216691043168303,
      "grad_norm": 13.977499961853027,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 12.4535,
      "step": 52
    },
    {
      "epoch": 0.0018567012024767694,
      "grad_norm": 14.096190452575684,
      "learning_rate": 2.132117818244771e-05,
      "loss": 12.3367,
      "step": 53
    },
    {
      "epoch": 0.0018917333006367083,
      "grad_norm": 13.248291969299316,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 10.8346,
      "step": 54
    },
    {
      "epoch": 0.0019267653987966474,
      "grad_norm": 16.24111557006836,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 12.1668,
      "step": 55
    },
    {
      "epoch": 0.0019617974969565865,
      "grad_norm": 16.580856323242188,
      "learning_rate": 1.622048961921699e-05,
      "loss": 11.8119,
      "step": 56
    },
    {
      "epoch": 0.0019968295951165254,
      "grad_norm": 11.197458267211914,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 11.7317,
      "step": 57
    },
    {
      "epoch": 0.0020318616932764647,
      "grad_norm": 12.0034761428833,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 15.88,
      "step": 58
    },
    {
      "epoch": 0.0020668937914364036,
      "grad_norm": 11.464460372924805,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 14.5566,
      "step": 59
    },
    {
      "epoch": 0.0021019258895963425,
      "grad_norm": 9.426189422607422,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 11.9118,
      "step": 60
    },
    {
      "epoch": 0.002136957987756282,
      "grad_norm": 11.631290435791016,
      "learning_rate": 9.042397785550405e-06,
      "loss": 13.0088,
      "step": 61
    },
    {
      "epoch": 0.0021719900859162207,
      "grad_norm": 9.637537956237793,
      "learning_rate": 7.830427709355725e-06,
      "loss": 12.759,
      "step": 62
    },
    {
      "epoch": 0.0022070221840761596,
      "grad_norm": 11.821331024169922,
      "learning_rate": 6.698729810778065e-06,
      "loss": 13.3715,
      "step": 63
    },
    {
      "epoch": 0.002242054282236099,
      "grad_norm": 12.932907104492188,
      "learning_rate": 5.649458341088915e-06,
      "loss": 16.1093,
      "step": 64
    },
    {
      "epoch": 0.002277086380396038,
      "grad_norm": 9.686279296875,
      "learning_rate": 4.684610648167503e-06,
      "loss": 12.3969,
      "step": 65
    },
    {
      "epoch": 0.002312118478555977,
      "grad_norm": 11.873120307922363,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 11.2338,
      "step": 66
    },
    {
      "epoch": 0.002347150576715916,
      "grad_norm": 10.407732009887695,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 14.9716,
      "step": 67
    },
    {
      "epoch": 0.002382182674875855,
      "grad_norm": 12.169331550598145,
      "learning_rate": 2.314152462588659e-06,
      "loss": 14.9951,
      "step": 68
    },
    {
      "epoch": 0.0024172147730357942,
      "grad_norm": 14.04606819152832,
      "learning_rate": 1.70370868554659e-06,
      "loss": 15.8657,
      "step": 69
    },
    {
      "epoch": 0.002452246871195733,
      "grad_norm": 9.512072563171387,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 13.8743,
      "step": 70
    },
    {
      "epoch": 0.002487278969355672,
      "grad_norm": 10.393471717834473,
      "learning_rate": 7.596123493895991e-07,
      "loss": 12.4094,
      "step": 71
    },
    {
      "epoch": 0.0025223110675156113,
      "grad_norm": 10.671348571777344,
      "learning_rate": 4.277569313094809e-07,
      "loss": 12.0238,
      "step": 72
    },
    {
      "epoch": 0.0025573431656755502,
      "grad_norm": 11.587846755981445,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 13.5365,
      "step": 73
    },
    {
      "epoch": 0.002592375263835489,
      "grad_norm": 9.723920822143555,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 12.6632,
      "step": 74
    },
    {
      "epoch": 0.0026274073619954284,
      "grad_norm": 10.614508628845215,
      "learning_rate": 0.0,
      "loss": 11.4377,
      "step": 75
    },
    {
      "epoch": 0.0026274073619954284,
      "eval_loss": 1.660986304283142,
      "eval_runtime": 3847.2283,
      "eval_samples_per_second": 6.248,
      "eval_steps_per_second": 3.124,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.91356572008448e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}