{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.6,
  "eval_steps": 1,
  "global_step": 120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 15.745562553405762,
      "learning_rate": 2.5e-05,
      "loss": 0.8605,
      "step": 1
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.484,
      "eval_loss": 0.8637617230415344,
      "eval_runtime": 2.5683,
      "eval_samples_per_second": 97.341,
      "eval_steps_per_second": 2.726,
      "step": 1
    },
{ | |
"epoch": 0.16, | |
"grad_norm": 19.96647834777832, | |
"learning_rate": 5e-05, | |
"loss": 0.9163, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.16, | |
"eval_accuracy": 0.488, | |
"eval_loss": 0.810189425945282, | |
"eval_runtime": 2.552, | |
"eval_samples_per_second": 97.963, | |
"eval_steps_per_second": 2.743, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.24, | |
"grad_norm": 19.000354766845703, | |
"learning_rate": 4.957627118644068e-05, | |
"loss": 0.8681, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.24, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.7167539000511169, | |
"eval_runtime": 2.5526, | |
"eval_samples_per_second": 97.938, | |
"eval_steps_per_second": 2.742, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 4.575774669647217, | |
"learning_rate": 4.915254237288136e-05, | |
"loss": 0.6484, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.32, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.7121953368186951, | |
"eval_runtime": 2.5001, | |
"eval_samples_per_second": 99.994, | |
"eval_steps_per_second": 2.8, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.4, | |
"grad_norm": 2.526747703552246, | |
"learning_rate": 4.8728813559322034e-05, | |
"loss": 0.7006, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.4, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.7209541201591492, | |
"eval_runtime": 2.5542, | |
"eval_samples_per_second": 97.879, | |
"eval_steps_per_second": 2.741, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 2.2623257637023926, | |
"learning_rate": 4.8305084745762714e-05, | |
"loss": 0.6587, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.48, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.737973153591156, | |
"eval_runtime": 2.553, | |
"eval_samples_per_second": 97.925, | |
"eval_steps_per_second": 2.742, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.56, | |
"grad_norm": 6.5833001136779785, | |
"learning_rate": 4.788135593220339e-05, | |
"loss": 0.6634, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.56, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.7435795664787292, | |
"eval_runtime": 2.5637, | |
"eval_samples_per_second": 97.517, | |
"eval_steps_per_second": 2.73, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.64, | |
"grad_norm": 11.965208053588867, | |
"learning_rate": 4.745762711864407e-05, | |
"loss": 0.7901, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.64, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.7358012795448303, | |
"eval_runtime": 2.5517, | |
"eval_samples_per_second": 97.974, | |
"eval_steps_per_second": 2.743, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.72, | |
"grad_norm": 11.749293327331543, | |
"learning_rate": 4.703389830508475e-05, | |
"loss": 0.7845, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.72, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.7179194092750549, | |
"eval_runtime": 2.5585, | |
"eval_samples_per_second": 97.713, | |
"eval_steps_per_second": 2.736, | |
"step": 9 | |
}, | |
{ | |
"epoch": 0.8, | |
"grad_norm": 7.4664130210876465, | |
"learning_rate": 4.6610169491525425e-05, | |
"loss": 0.9122, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.8, | |
"eval_accuracy": 0.484, | |
"eval_loss": 0.6973310708999634, | |
"eval_runtime": 2.5527, | |
"eval_samples_per_second": 97.934, | |
"eval_steps_per_second": 2.742, | |
"step": 10 | |
}, | |
{ | |
"epoch": 0.88, | |
"grad_norm": 5.599862098693848, | |
"learning_rate": 4.6186440677966104e-05, | |
"loss": 0.7581, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.88, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6890488266944885, | |
"eval_runtime": 2.5516, | |
"eval_samples_per_second": 97.976, | |
"eval_steps_per_second": 2.743, | |
"step": 11 | |
}, | |
{ | |
"epoch": 0.96, | |
"grad_norm": 5.2170634269714355, | |
"learning_rate": 4.5762711864406784e-05, | |
"loss": 0.6821, | |
"step": 12 | |
}, | |
{ | |
"epoch": 0.96, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6933399438858032, | |
"eval_runtime": 2.5513, | |
"eval_samples_per_second": 97.991, | |
"eval_steps_per_second": 2.744, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.04, | |
"grad_norm": 5.609626293182373, | |
"learning_rate": 4.533898305084746e-05, | |
"loss": 0.6992, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.04, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.7035390734672546, | |
"eval_runtime": 2.5496, | |
"eval_samples_per_second": 98.056, | |
"eval_steps_per_second": 2.746, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.12, | |
"grad_norm": 7.854551792144775, | |
"learning_rate": 4.491525423728814e-05, | |
"loss": 0.7384, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.12, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.710085928440094, | |
"eval_runtime": 2.5489, | |
"eval_samples_per_second": 98.08, | |
"eval_steps_per_second": 2.746, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.2, | |
"grad_norm": 5.026305198669434, | |
"learning_rate": 4.4491525423728816e-05, | |
"loss": 0.7223, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.2, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.7157421708106995, | |
"eval_runtime": 2.5501, | |
"eval_samples_per_second": 98.036, | |
"eval_steps_per_second": 2.745, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.28, | |
"grad_norm": 13.441658973693848, | |
"learning_rate": 4.4067796610169495e-05, | |
"loss": 0.7687, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.28, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.7153124809265137, | |
"eval_runtime": 2.5486, | |
"eval_samples_per_second": 98.095, | |
"eval_steps_per_second": 2.747, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.3599999999999999, | |
"grad_norm": 7.546152591705322, | |
"learning_rate": 4.3644067796610175e-05, | |
"loss": 0.7453, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.3599999999999999, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.7087851762771606, | |
"eval_runtime": 2.5485, | |
"eval_samples_per_second": 98.095, | |
"eval_steps_per_second": 2.747, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.44, | |
"grad_norm": 3.2455766201019287, | |
"learning_rate": 4.3220338983050854e-05, | |
"loss": 0.7231, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.44, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6991875171661377, | |
"eval_runtime": 2.5539, | |
"eval_samples_per_second": 97.889, | |
"eval_steps_per_second": 2.741, | |
"step": 18 | |
}, | |
{ | |
"epoch": 1.52, | |
"grad_norm": 8.115966796875, | |
"learning_rate": 4.279661016949153e-05, | |
"loss": 0.7162, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.52, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6926875114440918, | |
"eval_runtime": 2.548, | |
"eval_samples_per_second": 98.118, | |
"eval_steps_per_second": 2.747, | |
"step": 19 | |
}, | |
{ | |
"epoch": 1.6, | |
"grad_norm": 2.57649564743042, | |
"learning_rate": 4.2372881355932206e-05, | |
"loss": 0.6927, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.6, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.688406229019165, | |
"eval_runtime": 2.5496, | |
"eval_samples_per_second": 98.054, | |
"eval_steps_per_second": 2.746, | |
"step": 20 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"grad_norm": 2.991580009460449, | |
"learning_rate": 4.1949152542372886e-05, | |
"loss": 0.7097, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6855859160423279, | |
"eval_runtime": 2.5522, | |
"eval_samples_per_second": 97.953, | |
"eval_steps_per_second": 2.743, | |
"step": 21 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 2.709691286087036, | |
"learning_rate": 4.152542372881356e-05, | |
"loss": 0.6887, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.76, | |
"eval_accuracy": 0.504, | |
"eval_loss": 0.6868593692779541, | |
"eval_runtime": 2.5523, | |
"eval_samples_per_second": 97.95, | |
"eval_steps_per_second": 2.743, | |
"step": 22 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"grad_norm": 5.620818138122559, | |
"learning_rate": 4.110169491525424e-05, | |
"loss": 0.6656, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"eval_accuracy": 0.48, | |
"eval_loss": 0.6900019645690918, | |
"eval_runtime": 2.5504, | |
"eval_samples_per_second": 98.023, | |
"eval_steps_per_second": 2.745, | |
"step": 23 | |
}, | |
{ | |
"epoch": 1.92, | |
"grad_norm": 12.563273429870605, | |
"learning_rate": 4.067796610169492e-05, | |
"loss": 0.7211, | |
"step": 24 | |
}, | |
{ | |
"epoch": 1.92, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.6927011609077454, | |
"eval_runtime": 2.5512, | |
"eval_samples_per_second": 97.994, | |
"eval_steps_per_second": 2.744, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 3.7670013904571533, | |
"learning_rate": 4.025423728813559e-05, | |
"loss": 0.7575, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.496, | |
"eval_loss": 0.6926933526992798, | |
"eval_runtime": 2.5561, | |
"eval_samples_per_second": 97.805, | |
"eval_steps_per_second": 2.739, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.08, | |
"grad_norm": 4.824467658996582, | |
"learning_rate": 3.983050847457627e-05, | |
"loss": 0.7596, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.08, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6877753734588623, | |
"eval_runtime": 2.5044, | |
"eval_samples_per_second": 99.823, | |
"eval_steps_per_second": 2.795, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.16, | |
"grad_norm": 1.8508061170578003, | |
"learning_rate": 3.940677966101695e-05, | |
"loss": 0.7391, | |
"step": 27 | |
}, | |
{ | |
"epoch": 2.16, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.6870742440223694, | |
"eval_runtime": 2.5639, | |
"eval_samples_per_second": 97.509, | |
"eval_steps_per_second": 2.73, | |
"step": 27 | |
}, | |
{ | |
"epoch": 2.24, | |
"grad_norm": 3.564190149307251, | |
"learning_rate": 3.898305084745763e-05, | |
"loss": 0.6948, | |
"step": 28 | |
}, | |
{ | |
"epoch": 2.24, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6858359575271606, | |
"eval_runtime": 2.5544, | |
"eval_samples_per_second": 97.87, | |
"eval_steps_per_second": 2.74, | |
"step": 28 | |
}, | |
{ | |
"epoch": 2.32, | |
"grad_norm": 5.474400997161865, | |
"learning_rate": 3.855932203389831e-05, | |
"loss": 0.7572, | |
"step": 29 | |
}, | |
{ | |
"epoch": 2.32, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6866718530654907, | |
"eval_runtime": 2.5557, | |
"eval_samples_per_second": 97.821, | |
"eval_steps_per_second": 2.739, | |
"step": 29 | |
}, | |
{ | |
"epoch": 2.4, | |
"grad_norm": 2.450674533843994, | |
"learning_rate": 3.813559322033898e-05, | |
"loss": 0.6643, | |
"step": 30 | |
}, | |
{ | |
"epoch": 2.4, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6864023208618164, | |
"eval_runtime": 2.5543, | |
"eval_samples_per_second": 97.875, | |
"eval_steps_per_second": 2.741, | |
"step": 30 | |
}, | |
{ | |
"epoch": 2.48, | |
"grad_norm": 3.568068265914917, | |
"learning_rate": 3.771186440677966e-05, | |
"loss": 0.7332, | |
"step": 31 | |
}, | |
{ | |
"epoch": 2.48, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6899687647819519, | |
"eval_runtime": 2.5505, | |
"eval_samples_per_second": 98.019, | |
"eval_steps_per_second": 2.745, | |
"step": 31 | |
}, | |
{ | |
"epoch": 2.56, | |
"grad_norm": 2.2910633087158203, | |
"learning_rate": 3.728813559322034e-05, | |
"loss": 0.7119, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.56, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6921640634536743, | |
"eval_runtime": 2.5545, | |
"eval_samples_per_second": 97.865, | |
"eval_steps_per_second": 2.74, | |
"step": 32 | |
}, | |
{ | |
"epoch": 2.64, | |
"grad_norm": 2.090604066848755, | |
"learning_rate": 3.686440677966102e-05, | |
"loss": 0.7247, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.64, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6914921998977661, | |
"eval_runtime": 2.5526, | |
"eval_samples_per_second": 97.939, | |
"eval_steps_per_second": 2.742, | |
"step": 33 | |
}, | |
{ | |
"epoch": 2.7199999999999998, | |
"grad_norm": 6.273575782775879, | |
"learning_rate": 3.644067796610169e-05, | |
"loss": 0.7364, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.7199999999999998, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6918437480926514, | |
"eval_runtime": 2.5551, | |
"eval_samples_per_second": 97.843, | |
"eval_steps_per_second": 2.74, | |
"step": 34 | |
}, | |
{ | |
"epoch": 2.8, | |
"grad_norm": 6.723451614379883, | |
"learning_rate": 3.601694915254237e-05, | |
"loss": 0.6936, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.8, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6923359632492065, | |
"eval_runtime": 2.5513, | |
"eval_samples_per_second": 97.991, | |
"eval_steps_per_second": 2.744, | |
"step": 35 | |
}, | |
{ | |
"epoch": 2.88, | |
"grad_norm": 10.801304817199707, | |
"learning_rate": 3.559322033898305e-05, | |
"loss": 0.6749, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.88, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6919687390327454, | |
"eval_runtime": 2.5553, | |
"eval_samples_per_second": 97.836, | |
"eval_steps_per_second": 2.739, | |
"step": 36 | |
}, | |
{ | |
"epoch": 2.96, | |
"grad_norm": 7.005471229553223, | |
"learning_rate": 3.516949152542373e-05, | |
"loss": 0.7283, | |
"step": 37 | |
}, | |
{ | |
"epoch": 2.96, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.690359354019165, | |
"eval_runtime": 2.5563, | |
"eval_samples_per_second": 97.799, | |
"eval_steps_per_second": 2.738, | |
"step": 37 | |
}, | |
{ | |
"epoch": 3.04, | |
"grad_norm": 3.912996530532837, | |
"learning_rate": 3.474576271186441e-05, | |
"loss": 0.6633, | |
"step": 38 | |
}, | |
{ | |
"epoch": 3.04, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6878437399864197, | |
"eval_runtime": 2.5515, | |
"eval_samples_per_second": 97.982, | |
"eval_steps_per_second": 2.743, | |
"step": 38 | |
}, | |
{ | |
"epoch": 3.12, | |
"grad_norm": 3.429208278656006, | |
"learning_rate": 3.432203389830508e-05, | |
"loss": 0.705, | |
"step": 39 | |
}, | |
{ | |
"epoch": 3.12, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6873710751533508, | |
"eval_runtime": 2.5571, | |
"eval_samples_per_second": 97.766, | |
"eval_steps_per_second": 2.737, | |
"step": 39 | |
}, | |
{ | |
"epoch": 3.2, | |
"grad_norm": 3.2059762477874756, | |
"learning_rate": 3.389830508474576e-05, | |
"loss": 0.6913, | |
"step": 40 | |
}, | |
{ | |
"epoch": 3.2, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.689160168170929, | |
"eval_runtime": 2.5594, | |
"eval_samples_per_second": 97.681, | |
"eval_steps_per_second": 2.735, | |
"step": 40 | |
}, | |
{ | |
"epoch": 3.2800000000000002, | |
"grad_norm": 1.8953251838684082, | |
"learning_rate": 3.347457627118644e-05, | |
"loss": 0.6501, | |
"step": 41 | |
}, | |
{ | |
"epoch": 3.2800000000000002, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6878241896629333, | |
"eval_runtime": 2.5561, | |
"eval_samples_per_second": 97.806, | |
"eval_steps_per_second": 2.739, | |
"step": 41 | |
}, | |
{ | |
"epoch": 3.36, | |
"grad_norm": 3.3232831954956055, | |
"learning_rate": 3.305084745762712e-05, | |
"loss": 0.7278, | |
"step": 42 | |
}, | |
{ | |
"epoch": 3.36, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6906406283378601, | |
"eval_runtime": 2.5543, | |
"eval_samples_per_second": 97.873, | |
"eval_steps_per_second": 2.74, | |
"step": 42 | |
}, | |
{ | |
"epoch": 3.44, | |
"grad_norm": 7.255284309387207, | |
"learning_rate": 3.26271186440678e-05, | |
"loss": 0.6755, | |
"step": 43 | |
}, | |
{ | |
"epoch": 3.44, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6893867254257202, | |
"eval_runtime": 2.5555, | |
"eval_samples_per_second": 97.827, | |
"eval_steps_per_second": 2.739, | |
"step": 43 | |
}, | |
{ | |
"epoch": 3.52, | |
"grad_norm": 5.801779270172119, | |
"learning_rate": 3.2203389830508473e-05, | |
"loss": 0.7261, | |
"step": 44 | |
}, | |
{ | |
"epoch": 3.52, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.689304769039154, | |
"eval_runtime": 2.5009, | |
"eval_samples_per_second": 99.965, | |
"eval_steps_per_second": 2.799, | |
"step": 44 | |
}, | |
{ | |
"epoch": 3.6, | |
"grad_norm": 6.175045967102051, | |
"learning_rate": 3.177966101694915e-05, | |
"loss": 0.6614, | |
"step": 45 | |
}, | |
{ | |
"epoch": 3.6, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6881933808326721, | |
"eval_runtime": 2.5565, | |
"eval_samples_per_second": 97.788, | |
"eval_steps_per_second": 2.738, | |
"step": 45 | |
}, | |
{ | |
"epoch": 3.68, | |
"grad_norm": 7.691817760467529, | |
"learning_rate": 3.135593220338983e-05, | |
"loss": 0.7862, | |
"step": 46 | |
}, | |
{ | |
"epoch": 3.68, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6858007907867432, | |
"eval_runtime": 2.556, | |
"eval_samples_per_second": 97.808, | |
"eval_steps_per_second": 2.739, | |
"step": 46 | |
}, | |
{ | |
"epoch": 3.76, | |
"grad_norm": 7.117946624755859, | |
"learning_rate": 3.093220338983051e-05, | |
"loss": 0.6939, | |
"step": 47 | |
}, | |
{ | |
"epoch": 3.76, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6848788857460022, | |
"eval_runtime": 2.5617, | |
"eval_samples_per_second": 97.593, | |
"eval_steps_per_second": 2.733, | |
"step": 47 | |
}, | |
{ | |
"epoch": 3.84, | |
"grad_norm": 3.879883289337158, | |
"learning_rate": 3.050847457627119e-05, | |
"loss": 0.6851, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.84, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6833086013793945, | |
"eval_runtime": 2.5485, | |
"eval_samples_per_second": 98.095, | |
"eval_steps_per_second": 2.747, | |
"step": 48 | |
}, | |
{ | |
"epoch": 3.92, | |
"grad_norm": 7.152946949005127, | |
"learning_rate": 3.0084745762711864e-05, | |
"loss": 0.7251, | |
"step": 49 | |
}, | |
{ | |
"epoch": 3.92, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6816601753234863, | |
"eval_runtime": 2.5514, | |
"eval_samples_per_second": 97.984, | |
"eval_steps_per_second": 2.744, | |
"step": 49 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 1.905958890914917, | |
"learning_rate": 2.9661016949152544e-05, | |
"loss": 0.7008, | |
"step": 50 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.508, | |
"eval_loss": 0.6813437342643738, | |
"eval_runtime": 2.5476, | |
"eval_samples_per_second": 98.131, | |
"eval_steps_per_second": 2.748, | |
"step": 50 | |
}, | |
{ | |
"epoch": 4.08, | |
"grad_norm": 3.4690747261047363, | |
"learning_rate": 2.9237288135593223e-05, | |
"loss": 0.7161, | |
"step": 51 | |
}, | |
{ | |
"epoch": 4.08, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6803905963897705, | |
"eval_runtime": 2.552, | |
"eval_samples_per_second": 97.961, | |
"eval_steps_per_second": 2.743, | |
"step": 51 | |
}, | |
{ | |
"epoch": 4.16, | |
"grad_norm": 4.801358699798584, | |
"learning_rate": 2.88135593220339e-05, | |
"loss": 0.6932, | |
"step": 52 | |
}, | |
{ | |
"epoch": 4.16, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6818437576293945, | |
"eval_runtime": 2.5499, | |
"eval_samples_per_second": 98.043, | |
"eval_steps_per_second": 2.745, | |
"step": 52 | |
}, | |
{ | |
"epoch": 4.24, | |
"grad_norm": 11.071053504943848, | |
"learning_rate": 2.838983050847458e-05, | |
"loss": 0.6743, | |
"step": 53 | |
}, | |
{ | |
"epoch": 4.24, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.6845312714576721, | |
"eval_runtime": 2.552, | |
"eval_samples_per_second": 97.962, | |
"eval_steps_per_second": 2.743, | |
"step": 53 | |
}, | |
{ | |
"epoch": 4.32, | |
"grad_norm": 2.7743890285491943, | |
"learning_rate": 2.7966101694915255e-05, | |
"loss": 0.7102, | |
"step": 54 | |
}, | |
{ | |
"epoch": 4.32, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.6837421655654907, | |
"eval_runtime": 2.5501, | |
"eval_samples_per_second": 98.035, | |
"eval_steps_per_second": 2.745, | |
"step": 54 | |
}, | |
{ | |
"epoch": 4.4, | |
"grad_norm": 4.514011859893799, | |
"learning_rate": 2.754237288135593e-05, | |
"loss": 0.7169, | |
"step": 55 | |
}, | |
{ | |
"epoch": 4.4, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6851328015327454, | |
"eval_runtime": 2.4999, | |
"eval_samples_per_second": 100.005, | |
"eval_steps_per_second": 2.8, | |
"step": 55 | |
}, | |
{ | |
"epoch": 4.48, | |
"grad_norm": 4.223740100860596, | |
"learning_rate": 2.711864406779661e-05, | |
"loss": 0.735, | |
"step": 56 | |
}, | |
{ | |
"epoch": 4.48, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6856874823570251, | |
"eval_runtime": 2.501, | |
"eval_samples_per_second": 99.959, | |
"eval_steps_per_second": 2.799, | |
"step": 56 | |
}, | |
{ | |
"epoch": 4.5600000000000005, | |
"grad_norm": 8.217570304870605, | |
"learning_rate": 2.669491525423729e-05, | |
"loss": 0.7155, | |
"step": 57 | |
}, | |
{ | |
"epoch": 4.5600000000000005, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.686718761920929, | |
"eval_runtime": 2.455, | |
"eval_samples_per_second": 101.832, | |
"eval_steps_per_second": 2.851, | |
"step": 57 | |
}, | |
{ | |
"epoch": 4.64, | |
"grad_norm": 7.043219089508057, | |
"learning_rate": 2.627118644067797e-05, | |
"loss": 0.696, | |
"step": 58 | |
}, | |
{ | |
"epoch": 4.64, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6860937476158142, | |
"eval_runtime": 2.5524, | |
"eval_samples_per_second": 97.946, | |
"eval_steps_per_second": 2.742, | |
"step": 58 | |
}, | |
{ | |
"epoch": 4.72, | |
"grad_norm": 5.340857982635498, | |
"learning_rate": 2.5847457627118642e-05, | |
"loss": 0.715, | |
"step": 59 | |
}, | |
{ | |
"epoch": 4.72, | |
"eval_accuracy": 0.572, | |
"eval_loss": 0.6860234141349792, | |
"eval_runtime": 2.5519, | |
"eval_samples_per_second": 97.968, | |
"eval_steps_per_second": 2.743, | |
"step": 59 | |
}, | |
{ | |
"epoch": 4.8, | |
"grad_norm": 3.935725212097168, | |
"learning_rate": 2.5423728813559322e-05, | |
"loss": 0.6934, | |
"step": 60 | |
}, | |
{ | |
"epoch": 4.8, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6857656240463257, | |
"eval_runtime": 2.5599, | |
"eval_samples_per_second": 97.659, | |
"eval_steps_per_second": 2.734, | |
"step": 60 | |
}, | |
{ | |
"epoch": 4.88, | |
"grad_norm": 5.759483814239502, | |
"learning_rate": 2.5e-05, | |
"loss": 0.6659, | |
"step": 61 | |
}, | |
{ | |
"epoch": 4.88, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6850937604904175, | |
"eval_runtime": 2.556, | |
"eval_samples_per_second": 97.809, | |
"eval_steps_per_second": 2.739, | |
"step": 61 | |
}, | |
{ | |
"epoch": 4.96, | |
"grad_norm": 9.154269218444824, | |
"learning_rate": 2.457627118644068e-05, | |
"loss": 0.7122, | |
"step": 62 | |
}, | |
{ | |
"epoch": 4.96, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.683843731880188, | |
"eval_runtime": 2.5171, | |
"eval_samples_per_second": 99.322, | |
"eval_steps_per_second": 2.781, | |
"step": 62 | |
}, | |
{ | |
"epoch": 5.04, | |
"grad_norm": 3.8628180027008057, | |
"learning_rate": 2.4152542372881357e-05, | |
"loss": 0.685, | |
"step": 63 | |
}, | |
{ | |
"epoch": 5.04, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6826719045639038, | |
"eval_runtime": 2.5523, | |
"eval_samples_per_second": 97.95, | |
"eval_steps_per_second": 2.743, | |
"step": 63 | |
}, | |
{ | |
"epoch": 5.12, | |
"grad_norm": 3.074619770050049, | |
"learning_rate": 2.3728813559322036e-05, | |
"loss": 0.6702, | |
"step": 64 | |
}, | |
{ | |
"epoch": 5.12, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6835703253746033, | |
"eval_runtime": 2.5526, | |
"eval_samples_per_second": 97.94, | |
"eval_steps_per_second": 2.742, | |
"step": 64 | |
}, | |
{ | |
"epoch": 5.2, | |
"grad_norm": 3.274127721786499, | |
"learning_rate": 2.3305084745762712e-05, | |
"loss": 0.7247, | |
"step": 65 | |
}, | |
{ | |
"epoch": 5.2, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6832265853881836, | |
"eval_runtime": 2.5575, | |
"eval_samples_per_second": 97.752, | |
"eval_steps_per_second": 2.737, | |
"step": 65 | |
}, | |
{ | |
"epoch": 5.28, | |
"grad_norm": 3.9896106719970703, | |
"learning_rate": 2.2881355932203392e-05, | |
"loss": 0.7237, | |
"step": 66 | |
}, | |
{ | |
"epoch": 5.28, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.683218777179718, | |
"eval_runtime": 2.5521, | |
"eval_samples_per_second": 97.959, | |
"eval_steps_per_second": 2.743, | |
"step": 66 | |
}, | |
{ | |
"epoch": 5.36, | |
"grad_norm": 4.6198410987854, | |
"learning_rate": 2.245762711864407e-05, | |
"loss": 0.6874, | |
"step": 67 | |
}, | |
{ | |
"epoch": 5.36, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6817265748977661, | |
"eval_runtime": 2.5508, | |
"eval_samples_per_second": 98.007, | |
"eval_steps_per_second": 2.744, | |
"step": 67 | |
}, | |
{ | |
"epoch": 5.44, | |
"grad_norm": 2.7638332843780518, | |
"learning_rate": 2.2033898305084748e-05, | |
"loss": 0.6759, | |
"step": 68 | |
}, | |
{ | |
"epoch": 5.44, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6810156106948853, | |
"eval_runtime": 2.5592, | |
"eval_samples_per_second": 97.689, | |
"eval_steps_per_second": 2.735, | |
"step": 68 | |
}, | |
{ | |
"epoch": 5.52, | |
"grad_norm": 5.45823860168457, | |
"learning_rate": 2.1610169491525427e-05, | |
"loss": 0.6508, | |
"step": 69 | |
}, | |
{ | |
"epoch": 5.52, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6822657585144043, | |
"eval_runtime": 2.5549, | |
"eval_samples_per_second": 97.85, | |
"eval_steps_per_second": 2.74, | |
"step": 69 | |
}, | |
{ | |
"epoch": 5.6, | |
"grad_norm": 1.344992756843567, | |
"learning_rate": 2.1186440677966103e-05, | |
"loss": 0.65, | |
"step": 70 | |
}, | |
{ | |
"epoch": 5.6, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6806054711341858, | |
"eval_runtime": 2.5531, | |
"eval_samples_per_second": 97.92, | |
"eval_steps_per_second": 2.742, | |
"step": 70 | |
}, | |
{ | |
"epoch": 5.68, | |
"grad_norm": 3.803818941116333, | |
"learning_rate": 2.076271186440678e-05, | |
"loss": 0.6818, | |
"step": 71 | |
}, | |
{ | |
"epoch": 5.68, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6822031140327454, | |
"eval_runtime": 2.5599, | |
"eval_samples_per_second": 97.661, | |
"eval_steps_per_second": 2.734, | |
"step": 71 | |
}, | |
{ | |
"epoch": 5.76, | |
"grad_norm": 3.8192968368530273, | |
"learning_rate": 2.033898305084746e-05, | |
"loss": 0.6384, | |
"step": 72 | |
}, | |
{ | |
"epoch": 5.76, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6826953887939453, | |
"eval_runtime": 2.5524, | |
"eval_samples_per_second": 97.945, | |
"eval_steps_per_second": 2.742, | |
"step": 72 | |
}, | |
{ | |
"epoch": 5.84, | |
"grad_norm": 2.2677340507507324, | |
"learning_rate": 1.9915254237288135e-05, | |
"loss": 0.6847, | |
"step": 73 | |
}, | |
{ | |
"epoch": 5.84, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6820663809776306, | |
"eval_runtime": 2.5706, | |
"eval_samples_per_second": 97.253, | |
"eval_steps_per_second": 2.723, | |
"step": 73 | |
}, | |
{ | |
"epoch": 5.92, | |
"grad_norm": 10.372380256652832, | |
"learning_rate": 1.9491525423728814e-05, | |
"loss": 0.7528, | |
"step": 74 | |
}, | |
{ | |
"epoch": 5.92, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.683914065361023, | |
"eval_runtime": 2.5522, | |
"eval_samples_per_second": 97.953, | |
"eval_steps_per_second": 2.743, | |
"step": 74 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 2.46616268157959, | |
"learning_rate": 1.906779661016949e-05, | |
"loss": 0.6733, | |
"step": 75 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6828632950782776, | |
"eval_runtime": 2.5571, | |
"eval_samples_per_second": 97.766, | |
"eval_steps_per_second": 2.737, | |
"step": 75 | |
}, | |
{ | |
"epoch": 6.08, | |
"grad_norm": 4.386812686920166, | |
"learning_rate": 1.864406779661017e-05, | |
"loss": 0.6642, | |
"step": 76 | |
}, | |
{ | |
"epoch": 6.08, | |
"eval_accuracy": 0.512, | |
"eval_loss": 0.6845312714576721, | |
"eval_runtime": 2.5496, | |
"eval_samples_per_second": 98.055, | |
"eval_steps_per_second": 2.746, | |
"step": 76 | |
}, | |
{ | |
"epoch": 6.16, | |
"grad_norm": 2.428257703781128, | |
"learning_rate": 1.8220338983050846e-05, | |
"loss": 0.669, | |
"step": 77 | |
}, | |
{ | |
"epoch": 6.16, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6866054534912109, | |
"eval_runtime": 2.5556, | |
"eval_samples_per_second": 97.825, | |
"eval_steps_per_second": 2.739, | |
"step": 77 | |
}, | |
{ | |
"epoch": 6.24, | |
"grad_norm": 3.2233493328094482, | |
"learning_rate": 1.7796610169491526e-05, | |
"loss": 0.6525, | |
"step": 78 | |
}, | |
{ | |
"epoch": 6.24, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6888242363929749, | |
"eval_runtime": 2.5518, | |
"eval_samples_per_second": 97.969, | |
"eval_steps_per_second": 2.743, | |
"step": 78 | |
}, | |
{ | |
"epoch": 6.32, | |
"grad_norm": 4.684359550476074, | |
"learning_rate": 1.7372881355932205e-05, | |
"loss": 0.6827, | |
"step": 79 | |
}, | |
{ | |
"epoch": 6.32, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6881914138793945, | |
"eval_runtime": 2.5526, | |
"eval_samples_per_second": 97.938, | |
"eval_steps_per_second": 2.742, | |
"step": 79 | |
}, | |
{ | |
"epoch": 6.4, | |
"grad_norm": 6.635712623596191, | |
"learning_rate": 1.694915254237288e-05, | |
"loss": 0.7138, | |
"step": 80 | |
}, | |
{ | |
"epoch": 6.4, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.688589870929718, | |
"eval_runtime": 2.5551, | |
"eval_samples_per_second": 97.843, | |
"eval_steps_per_second": 2.74, | |
"step": 80 | |
}, | |
{ | |
"epoch": 6.48, | |
"grad_norm": 2.153329610824585, | |
"learning_rate": 1.652542372881356e-05, | |
"loss": 0.6533, | |
"step": 81 | |
}, | |
{ | |
"epoch": 6.48, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6895253658294678, | |
"eval_runtime": 2.5548, | |
"eval_samples_per_second": 97.856, | |
"eval_steps_per_second": 2.74, | |
"step": 81 | |
}, | |
{ | |
"epoch": 6.5600000000000005, | |
"grad_norm": 8.309370994567871, | |
"learning_rate": 1.6101694915254237e-05, | |
"loss": 0.7126, | |
"step": 82 | |
}, | |
{ | |
"epoch": 6.5600000000000005, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6872695088386536, | |
"eval_runtime": 2.5551, | |
"eval_samples_per_second": 97.844, | |
"eval_steps_per_second": 2.74, | |
"step": 82 | |
}, | |
{ | |
"epoch": 6.64, | |
"grad_norm": 7.506027698516846, | |
"learning_rate": 1.5677966101694916e-05, | |
"loss": 0.7333, | |
"step": 83 | |
}, | |
{ | |
"epoch": 6.64, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6848945021629333, | |
"eval_runtime": 2.5576, | |
"eval_samples_per_second": 97.75, | |
"eval_steps_per_second": 2.737, | |
"step": 83 | |
}, | |
{ | |
"epoch": 6.72, | |
"grad_norm": 2.9009928703308105, | |
"learning_rate": 1.5254237288135596e-05, | |
"loss": 0.6469, | |
"step": 84 | |
}, | |
{ | |
"epoch": 6.72, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6854609251022339, | |
"eval_runtime": 2.5528, | |
"eval_samples_per_second": 97.93, | |
"eval_steps_per_second": 2.742, | |
"step": 84 | |
}, | |
{ | |
"epoch": 6.8, | |
"grad_norm": 5.017767429351807, | |
"learning_rate": 1.4830508474576272e-05, | |
"loss": 0.7521, | |
"step": 85 | |
}, | |
{ | |
"epoch": 6.8, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6840038895606995, | |
"eval_runtime": 2.5591, | |
"eval_samples_per_second": 97.691, | |
"eval_steps_per_second": 2.735, | |
"step": 85 | |
}, | |
{ | |
"epoch": 6.88, | |
"grad_norm": 2.006927013397217, | |
"learning_rate": 1.440677966101695e-05, | |
"loss": 0.6516, | |
"step": 86 | |
}, | |
{ | |
"epoch": 6.88, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6825664043426514, | |
"eval_runtime": 2.5642, | |
"eval_samples_per_second": 97.496, | |
"eval_steps_per_second": 2.73, | |
"step": 86 | |
}, | |
{ | |
"epoch": 6.96, | |
"grad_norm": 5.504487037658691, | |
"learning_rate": 1.3983050847457627e-05, | |
"loss": 0.7072, | |
"step": 87 | |
}, | |
{ | |
"epoch": 6.96, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6815390586853027, | |
"eval_runtime": 2.5618, | |
"eval_samples_per_second": 97.588, | |
"eval_steps_per_second": 2.732, | |
"step": 87 | |
}, | |
{ | |
"epoch": 7.04, | |
"grad_norm": 3.175321578979492, | |
"learning_rate": 1.3559322033898305e-05, | |
"loss": 0.6543, | |
"step": 88 | |
}, | |
{ | |
"epoch": 7.04, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6796953082084656, | |
"eval_runtime": 2.5612, | |
"eval_samples_per_second": 97.61, | |
"eval_steps_per_second": 2.733, | |
"step": 88 | |
}, | |
{ | |
"epoch": 7.12, | |
"grad_norm": 9.99293327331543, | |
"learning_rate": 1.3135593220338985e-05, | |
"loss": 0.7078, | |
"step": 89 | |
}, | |
{ | |
"epoch": 7.12, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6775000691413879, | |
"eval_runtime": 2.5598, | |
"eval_samples_per_second": 97.663, | |
"eval_steps_per_second": 2.735, | |
"step": 89 | |
}, | |
{ | |
"epoch": 7.2, | |
"grad_norm": 6.317586421966553, | |
"learning_rate": 1.2711864406779661e-05, | |
"loss": 0.7115, | |
"step": 90 | |
}, | |
{ | |
"epoch": 7.2, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6765468716621399, | |
"eval_runtime": 2.5596, | |
"eval_samples_per_second": 97.67, | |
"eval_steps_per_second": 2.735, | |
"step": 90 | |
}, | |
{ | |
"epoch": 7.28, | |
"grad_norm": 6.010474681854248, | |
"learning_rate": 1.228813559322034e-05, | |
"loss": 0.6348, | |
"step": 91 | |
}, | |
{ | |
"epoch": 7.28, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6753984093666077, | |
"eval_runtime": 2.5578, | |
"eval_samples_per_second": 97.741, | |
"eval_steps_per_second": 2.737, | |
"step": 91 | |
}, | |
{ | |
"epoch": 7.36, | |
"grad_norm": 1.554281234741211, | |
"learning_rate": 1.1864406779661018e-05, | |
"loss": 0.6907, | |
"step": 92 | |
}, | |
{ | |
"epoch": 7.36, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6742578148841858, | |
"eval_runtime": 2.5536, | |
"eval_samples_per_second": 97.901, | |
"eval_steps_per_second": 2.741, | |
"step": 92 | |
}, | |
{ | |
"epoch": 7.44, | |
"grad_norm": 2.691506862640381, | |
"learning_rate": 1.1440677966101696e-05, | |
"loss": 0.6474, | |
"step": 93 | |
}, | |
{ | |
"epoch": 7.44, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6753789186477661, | |
"eval_runtime": 2.5576, | |
"eval_samples_per_second": 97.747, | |
"eval_steps_per_second": 2.737, | |
"step": 93 | |
}, | |
{ | |
"epoch": 7.52, | |
"grad_norm": 3.9982895851135254, | |
"learning_rate": 1.1016949152542374e-05, | |
"loss": 0.6816, | |
"step": 94 | |
}, | |
{ | |
"epoch": 7.52, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6741992235183716, | |
"eval_runtime": 2.5538, | |
"eval_samples_per_second": 97.894, | |
"eval_steps_per_second": 2.741, | |
"step": 94 | |
}, | |
{ | |
"epoch": 7.6, | |
"grad_norm": 4.3524274826049805, | |
"learning_rate": 1.0593220338983052e-05, | |
"loss": 0.6974, | |
"step": 95 | |
}, | |
{ | |
"epoch": 7.6, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6741132736206055, | |
"eval_runtime": 2.5546, | |
"eval_samples_per_second": 97.862, | |
"eval_steps_per_second": 2.74, | |
"step": 95 | |
}, | |
{ | |
"epoch": 7.68, | |
"grad_norm": 2.858217477798462, | |
"learning_rate": 1.016949152542373e-05, | |
"loss": 0.7131, | |
"step": 96 | |
}, | |
{ | |
"epoch": 7.68, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6736015677452087, | |
"eval_runtime": 2.5522, | |
"eval_samples_per_second": 97.955, | |
"eval_steps_per_second": 2.743, | |
"step": 96 | |
}, | |
{ | |
"epoch": 7.76, | |
"grad_norm": 5.655324459075928, | |
"learning_rate": 9.745762711864407e-06, | |
"loss": 0.639, | |
"step": 97 | |
}, | |
{ | |
"epoch": 7.76, | |
"eval_accuracy": 0.56, | |
"eval_loss": 0.6730742454528809, | |
"eval_runtime": 2.559, | |
"eval_samples_per_second": 97.694, | |
"eval_steps_per_second": 2.735, | |
"step": 97 | |
}, | |
{ | |
"epoch": 7.84, | |
"grad_norm": 11.034967422485352, | |
"learning_rate": 9.322033898305085e-06, | |
"loss": 0.6931, | |
"step": 98 | |
}, | |
{ | |
"epoch": 7.84, | |
"eval_accuracy": 0.552, | |
"eval_loss": 0.6729413866996765, | |
"eval_runtime": 2.5511, | |
"eval_samples_per_second": 97.998, | |
"eval_steps_per_second": 2.744, | |
"step": 98 | |
}, | |
{ | |
"epoch": 7.92, | |
"grad_norm": 4.288939476013184, | |
"learning_rate": 8.898305084745763e-06, | |
"loss": 0.7052, | |
"step": 99 | |
}, | |
{ | |
"epoch": 7.92, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6727031469345093, | |
"eval_runtime": 2.5507, | |
"eval_samples_per_second": 98.012, | |
"eval_steps_per_second": 2.744, | |
"step": 99 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 7.271831035614014, | |
"learning_rate": 8.47457627118644e-06, | |
"loss": 0.6804, | |
"step": 100 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6717929840087891, | |
"eval_runtime": 2.5062, | |
"eval_samples_per_second": 99.751, | |
"eval_steps_per_second": 2.793, | |
"step": 100 | |
}, | |
{ | |
"epoch": 8.08, | |
"grad_norm": 3.726989269256592, | |
"learning_rate": 8.050847457627118e-06, | |
"loss": 0.6751, | |
"step": 101 | |
}, | |
{ | |
"epoch": 8.08, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6729882955551147, | |
"eval_runtime": 2.552, | |
"eval_samples_per_second": 97.962, | |
"eval_steps_per_second": 2.743, | |
"step": 101 | |
}, | |
{ | |
"epoch": 8.16, | |
"grad_norm": 3.427084445953369, | |
"learning_rate": 7.627118644067798e-06, | |
"loss": 0.6867, | |
"step": 102 | |
}, | |
{ | |
"epoch": 8.16, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6727109551429749, | |
"eval_runtime": 2.5498, | |
"eval_samples_per_second": 98.046, | |
"eval_steps_per_second": 2.745, | |
"step": 102 | |
}, | |
{ | |
"epoch": 8.24, | |
"grad_norm": 7.673038005828857, | |
"learning_rate": 7.203389830508475e-06, | |
"loss": 0.6712, | |
"step": 103 | |
}, | |
{ | |
"epoch": 8.24, | |
"eval_accuracy": 0.548, | |
"eval_loss": 0.6731953024864197, | |
"eval_runtime": 2.564, | |
"eval_samples_per_second": 97.503, | |
"eval_steps_per_second": 2.73, | |
"step": 103 | |
}, | |
{ | |
"epoch": 8.32, | |
"grad_norm": 4.261835098266602, | |
"learning_rate": 6.779661016949153e-06, | |
"loss": 0.7308, | |
"step": 104 | |
}, | |
{ | |
"epoch": 8.32, | |
"eval_accuracy": 0.556, | |
"eval_loss": 0.6727851629257202, | |
"eval_runtime": 2.5486, | |
"eval_samples_per_second": 98.091, | |
"eval_steps_per_second": 2.747, | |
"step": 104 | |
}, | |
{ | |
"epoch": 8.4, | |
"grad_norm": 4.702732086181641, | |
"learning_rate": 6.3559322033898304e-06, | |
"loss": 0.7019, | |
"step": 105 | |
}, | |
{ | |
"epoch": 8.4, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6733944416046143, | |
"eval_runtime": 2.5543, | |
"eval_samples_per_second": 97.876, | |
"eval_steps_per_second": 2.741, | |
"step": 105 | |
}, | |
{ | |
"epoch": 8.48, | |
"grad_norm": 1.8300518989562988, | |
"learning_rate": 5.932203389830509e-06, | |
"loss": 0.6733, | |
"step": 106 | |
}, | |
{ | |
"epoch": 8.48, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6754140853881836, | |
"eval_runtime": 2.5508, | |
"eval_samples_per_second": 98.009, | |
"eval_steps_per_second": 2.744, | |
"step": 106 | |
}, | |
{ | |
"epoch": 8.56, | |
"grad_norm": 4.873719692230225, | |
"learning_rate": 5.508474576271187e-06, | |
"loss": 0.6503, | |
"step": 107 | |
}, | |
{ | |
"epoch": 8.56, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6744531393051147, | |
"eval_runtime": 2.5498, | |
"eval_samples_per_second": 98.045, | |
"eval_steps_per_second": 2.745, | |
"step": 107 | |
}, | |
{ | |
"epoch": 8.64, | |
"grad_norm": 2.1070873737335205, | |
"learning_rate": 5.084745762711865e-06, | |
"loss": 0.6976, | |
"step": 108 | |
}, | |
{ | |
"epoch": 8.64, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6756640672683716, | |
"eval_runtime": 2.5509, | |
"eval_samples_per_second": 98.005, | |
"eval_steps_per_second": 2.744, | |
"step": 108 | |
}, | |
{ | |
"epoch": 8.72, | |
"grad_norm": 1.433050513267517, | |
"learning_rate": 4.6610169491525425e-06, | |
"loss": 0.6447, | |
"step": 109 | |
}, | |
{ | |
"epoch": 8.72, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6764531135559082, | |
"eval_runtime": 2.5599, | |
"eval_samples_per_second": 97.66, | |
"eval_steps_per_second": 2.734, | |
"step": 109 | |
}, | |
{ | |
"epoch": 8.8, | |
"grad_norm": 2.937915086746216, | |
"learning_rate": 4.23728813559322e-06, | |
"loss": 0.6964, | |
"step": 110 | |
}, | |
{ | |
"epoch": 8.8, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6768437623977661, | |
"eval_runtime": 2.5495, | |
"eval_samples_per_second": 98.058, | |
"eval_steps_per_second": 2.746, | |
"step": 110 | |
}, | |
{ | |
"epoch": 8.88, | |
"grad_norm": 1.9771907329559326, | |
"learning_rate": 3.813559322033899e-06, | |
"loss": 0.6625, | |
"step": 111 | |
}, | |
{ | |
"epoch": 8.88, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.677441418170929, | |
"eval_runtime": 2.5474, | |
"eval_samples_per_second": 98.139, | |
"eval_steps_per_second": 2.748, | |
"step": 111 | |
}, | |
{ | |
"epoch": 8.96, | |
"grad_norm": 8.43002986907959, | |
"learning_rate": 3.3898305084745763e-06, | |
"loss": 0.7092, | |
"step": 112 | |
}, | |
{ | |
"epoch": 8.96, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6763046979904175, | |
"eval_runtime": 2.5006, | |
"eval_samples_per_second": 99.975, | |
"eval_steps_per_second": 2.799, | |
"step": 112 | |
}, | |
{ | |
"epoch": 9.04, | |
"grad_norm": 16.455059051513672, | |
"learning_rate": 2.9661016949152545e-06, | |
"loss": 0.6859, | |
"step": 113 | |
}, | |
{ | |
"epoch": 9.04, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6773906350135803, | |
"eval_runtime": 2.5491, | |
"eval_samples_per_second": 98.075, | |
"eval_steps_per_second": 2.746, | |
"step": 113 | |
}, | |
{ | |
"epoch": 9.12, | |
"grad_norm": 3.359854221343994, | |
"learning_rate": 2.5423728813559323e-06, | |
"loss": 0.6657, | |
"step": 114 | |
}, | |
{ | |
"epoch": 9.12, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6762812733650208, | |
"eval_runtime": 2.5524, | |
"eval_samples_per_second": 97.946, | |
"eval_steps_per_second": 2.742, | |
"step": 114 | |
}, | |
{ | |
"epoch": 9.2, | |
"grad_norm": 3.9168431758880615, | |
"learning_rate": 2.11864406779661e-06, | |
"loss": 0.6546, | |
"step": 115 | |
}, | |
{ | |
"epoch": 9.2, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6755898594856262, | |
"eval_runtime": 2.5568, | |
"eval_samples_per_second": 97.78, | |
"eval_steps_per_second": 2.738, | |
"step": 115 | |
}, | |
{ | |
"epoch": 9.28, | |
"grad_norm": 3.361232280731201, | |
"learning_rate": 1.6949152542372882e-06, | |
"loss": 0.7128, | |
"step": 116 | |
}, | |
{ | |
"epoch": 9.28, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6770547032356262, | |
"eval_runtime": 2.5602, | |
"eval_samples_per_second": 97.649, | |
"eval_steps_per_second": 2.734, | |
"step": 116 | |
}, | |
{ | |
"epoch": 9.36, | |
"grad_norm": 2.9631147384643555, | |
"learning_rate": 1.2711864406779662e-06, | |
"loss": 0.6779, | |
"step": 117 | |
}, | |
{ | |
"epoch": 9.36, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6765038967132568, | |
"eval_runtime": 2.5606, | |
"eval_samples_per_second": 97.633, | |
"eval_steps_per_second": 2.734, | |
"step": 117 | |
}, | |
{ | |
"epoch": 9.44, | |
"grad_norm": 4.792584419250488, | |
"learning_rate": 8.474576271186441e-07, | |
"loss": 0.6241, | |
"step": 118 | |
}, | |
{ | |
"epoch": 9.44, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6751171946525574, | |
"eval_runtime": 2.5493, | |
"eval_samples_per_second": 98.066, | |
"eval_steps_per_second": 2.746, | |
"step": 118 | |
}, | |
{ | |
"epoch": 9.52, | |
"grad_norm": 5.438122749328613, | |
"learning_rate": 4.2372881355932204e-07, | |
"loss": 0.6748, | |
"step": 119 | |
}, | |
{ | |
"epoch": 9.52, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.6753047108650208, | |
"eval_runtime": 2.5608, | |
"eval_samples_per_second": 97.624, | |
"eval_steps_per_second": 2.733, | |
"step": 119 | |
}, | |
{ | |
"epoch": 9.6, | |
"grad_norm": 2.0531022548675537, | |
"learning_rate": 0.0, | |
"loss": 0.6738, | |
"step": 120 | |
}, | |
{ | |
"epoch": 9.6, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6763242483139038, | |
"eval_runtime": 2.5002, | |
"eval_samples_per_second": 99.994, | |
"eval_steps_per_second": 2.8, | |
"step": 120 | |
}, | |
{ | |
"epoch": 9.6, | |
"step": 120, | |
"total_flos": 2.3370919602814976e+16, | |
"train_loss": 0.7025685042142868, | |
"train_runtime": 671.5636, | |
"train_samples_per_second": 14.891, | |
"train_steps_per_second": 0.179 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 120, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 2.3370919602814976e+16, | |
"train_batch_size": 10, | |
"trial_name": null, | |
"trial_params": null | |
} | |