{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.984,
  "eval_steps": 1,
  "global_step": 124,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 19.792200088500977,
      "learning_rate": 2.5e-05,
      "loss": 0.9587,
      "step": 1
    },
    {
      "epoch": 0.016,
      "eval_accuracy": 0.4,
      "eval_loss": 1.2008212804794312,
      "eval_runtime": 9.5896,
      "eval_samples_per_second": 26.07,
      "eval_steps_per_second": 3.337,
      "step": 1
    },
    {
      "epoch": 0.032,
      "grad_norm": 26.413536071777344,
      "learning_rate": 5e-05,
      "loss": 1.0902,
      "step": 2
    },
    {
      "epoch": 0.032,
      "eval_accuracy": 0.4,
      "eval_loss": 1.1015820503234863,
      "eval_runtime": 9.4487,
      "eval_samples_per_second": 26.459,
      "eval_steps_per_second": 3.387,
      "step": 2
    },
    {
      "epoch": 0.048,
      "grad_norm": 16.49271011352539,
      "learning_rate": 4.959016393442623e-05,
      "loss": 0.8807,
      "step": 3
    },
    {
      "epoch": 0.048,
      "eval_accuracy": 0.428,
      "eval_loss": 0.9215332269668579,
      "eval_runtime": 9.4596,
      "eval_samples_per_second": 26.428,
      "eval_steps_per_second": 3.383,
      "step": 3
    },
    {
      "epoch": 0.064,
      "grad_norm": 30.666654586791992,
      "learning_rate": 4.918032786885246e-05,
      "loss": 0.9722,
      "step": 4
    },
    {
      "epoch": 0.064,
      "eval_accuracy": 0.476,
      "eval_loss": 0.8115702867507935,
      "eval_runtime": 9.4643,
      "eval_samples_per_second": 26.415,
      "eval_steps_per_second": 3.381,
      "step": 4
    },
    {
      "epoch": 0.08,
      "grad_norm": 10.605666160583496,
      "learning_rate": 4.8770491803278687e-05,
      "loss": 0.7957,
      "step": 5
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.476,
      "eval_loss": 0.7770839929580688,
      "eval_runtime": 9.4208,
      "eval_samples_per_second": 26.537,
      "eval_steps_per_second": 3.397,
      "step": 5
    },
    {
      "epoch": 0.096,
      "grad_norm": 15.425003051757812,
      "learning_rate": 4.836065573770492e-05,
      "loss": 0.5228,
      "step": 6
    },
    {
      "epoch": 0.096,
      "eval_accuracy": 0.504,
      "eval_loss": 0.805004894733429,
      "eval_runtime": 9.4526,
      "eval_samples_per_second": 26.448,
      "eval_steps_per_second": 3.385,
      "step": 6
    },
    {
      "epoch": 0.112,
      "grad_norm": 10.599884033203125,
      "learning_rate": 4.795081967213115e-05,
      "loss": 0.739,
      "step": 7
    },
    {
      "epoch": 0.112,
      "eval_accuracy": 0.532,
      "eval_loss": 0.8277338743209839,
      "eval_runtime": 9.4867,
      "eval_samples_per_second": 26.353,
      "eval_steps_per_second": 3.373,
      "step": 7
    },
    {
      "epoch": 0.128,
      "grad_norm": 32.59059143066406,
      "learning_rate": 4.754098360655738e-05,
      "loss": 1.2898,
      "step": 8
    },
    {
      "epoch": 0.128,
      "eval_accuracy": 0.564,
      "eval_loss": 0.8153437376022339,
      "eval_runtime": 9.4204,
      "eval_samples_per_second": 26.538,
      "eval_steps_per_second": 3.397,
      "step": 8
    },
    {
      "epoch": 0.144,
      "grad_norm": 22.269615173339844,
      "learning_rate": 4.713114754098361e-05,
      "loss": 0.9083,
      "step": 9
    },
    {
      "epoch": 0.144,
      "eval_accuracy": 0.612,
      "eval_loss": 0.7878813743591309,
      "eval_runtime": 9.4054,
      "eval_samples_per_second": 26.581,
      "eval_steps_per_second": 3.402,
      "step": 9
    },
    {
      "epoch": 0.16,
      "grad_norm": 8.692429542541504,
      "learning_rate": 4.672131147540984e-05,
      "loss": 0.5176,
      "step": 10
    },
    {
      "epoch": 0.16,
      "eval_accuracy": 0.6,
      "eval_loss": 0.7594003677368164,
      "eval_runtime": 9.4569,
      "eval_samples_per_second": 26.436,
      "eval_steps_per_second": 3.384,
      "step": 10
    },
    {
      "epoch": 0.176,
      "grad_norm": 9.585773468017578,
      "learning_rate": 4.631147540983607e-05,
      "loss": 0.7224,
      "step": 11
    },
    {
      "epoch": 0.176,
      "eval_accuracy": 0.604,
      "eval_loss": 0.7379999756813049,
      "eval_runtime": 9.4177,
      "eval_samples_per_second": 26.546,
      "eval_steps_per_second": 3.398,
      "step": 11
    },
    {
      "epoch": 0.192,
      "grad_norm": 10.833252906799316,
      "learning_rate": 4.59016393442623e-05,
      "loss": 0.8363,
      "step": 12
    },
    {
      "epoch": 0.192,
      "eval_accuracy": 0.588,
      "eval_loss": 0.7211699485778809,
      "eval_runtime": 9.4924,
      "eval_samples_per_second": 26.337,
      "eval_steps_per_second": 3.371,
      "step": 12
    },
    {
      "epoch": 0.208,
      "grad_norm": 17.36851692199707,
      "learning_rate": 4.549180327868853e-05,
      "loss": 0.868,
      "step": 13
    },
    {
      "epoch": 0.208,
      "eval_accuracy": 0.58,
      "eval_loss": 0.7057519555091858,
      "eval_runtime": 9.4694,
      "eval_samples_per_second": 26.401,
      "eval_steps_per_second": 3.379,
      "step": 13
    },
    {
      "epoch": 0.224,
      "grad_norm": 9.86408805847168,
      "learning_rate": 4.508196721311476e-05,
      "loss": 0.5603,
      "step": 14
    },
    {
      "epoch": 0.224,
      "eval_accuracy": 0.584,
      "eval_loss": 0.7084277272224426,
      "eval_runtime": 9.4106,
      "eval_samples_per_second": 26.566,
      "eval_steps_per_second": 3.4,
      "step": 14
    },
    {
      "epoch": 0.24,
      "grad_norm": 6.7643585205078125,
      "learning_rate": 4.467213114754098e-05,
      "loss": 0.6958,
      "step": 15
    },
    {
      "epoch": 0.24,
      "eval_accuracy": 0.572,
      "eval_loss": 0.7116777300834656,
      "eval_runtime": 9.4526,
      "eval_samples_per_second": 26.448,
      "eval_steps_per_second": 3.385,
      "step": 15
    },
    {
      "epoch": 0.256,
      "grad_norm": 13.049300193786621,
      "learning_rate": 4.426229508196721e-05,
      "loss": 0.5868,
      "step": 16
    },
    {
      "epoch": 0.256,
      "eval_accuracy": 0.576,
      "eval_loss": 0.7166406512260437,
      "eval_runtime": 9.4248,
      "eval_samples_per_second": 26.526,
      "eval_steps_per_second": 3.395,
      "step": 16
    },
    {
      "epoch": 0.272,
      "grad_norm": 12.840044021606445,
      "learning_rate": 4.3852459016393444e-05,
      "loss": 0.5497,
      "step": 17
    },
    {
      "epoch": 0.272,
      "eval_accuracy": 0.588,
      "eval_loss": 0.7239003777503967,
      "eval_runtime": 9.4276,
      "eval_samples_per_second": 26.518,
      "eval_steps_per_second": 3.394,
      "step": 17
    },
    {
      "epoch": 0.288,
      "grad_norm": 9.021048545837402,
      "learning_rate": 4.3442622950819674e-05,
      "loss": 0.7557,
      "step": 18
    },
    {
      "epoch": 0.288,
      "eval_accuracy": 0.592,
      "eval_loss": 0.728591799736023,
      "eval_runtime": 9.4602,
      "eval_samples_per_second": 26.427,
      "eval_steps_per_second": 3.383,
      "step": 18
    },
    {
      "epoch": 0.304,
      "grad_norm": 15.930183410644531,
      "learning_rate": 4.3032786885245904e-05,
      "loss": 0.8174,
      "step": 19
    },
    {
      "epoch": 0.304,
      "eval_accuracy": 0.588,
      "eval_loss": 0.7309414148330688,
      "eval_runtime": 9.4241,
      "eval_samples_per_second": 26.528,
      "eval_steps_per_second": 3.396,
      "step": 19
    },
    {
      "epoch": 0.32,
      "grad_norm": 25.526287078857422,
      "learning_rate": 4.262295081967213e-05,
      "loss": 0.9582,
      "step": 20
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.58,
      "eval_loss": 0.7253652215003967,
      "eval_runtime": 9.487,
      "eval_samples_per_second": 26.352,
      "eval_steps_per_second": 3.373,
      "step": 20
    },
    {
      "epoch": 0.336,
      "grad_norm": 16.851058959960938,
      "learning_rate": 4.2213114754098365e-05,
      "loss": 0.7394,
      "step": 21
    },
    {
      "epoch": 0.336,
      "eval_accuracy": 0.572,
      "eval_loss": 0.721359372138977,
      "eval_runtime": 9.4839,
      "eval_samples_per_second": 26.361,
      "eval_steps_per_second": 3.374,
      "step": 21
    },
    {
      "epoch": 0.352,
      "grad_norm": 16.92612648010254,
      "learning_rate": 4.1803278688524595e-05,
      "loss": 0.7682,
      "step": 22
    },
    {
      "epoch": 0.352,
      "eval_accuracy": 0.58,
      "eval_loss": 0.7189823985099792,
      "eval_runtime": 9.4414,
      "eval_samples_per_second": 26.479,
      "eval_steps_per_second": 3.389,
      "step": 22
    },
    {
      "epoch": 0.368,
      "grad_norm": 9.329913139343262,
      "learning_rate": 4.1393442622950826e-05,
      "loss": 0.5394,
      "step": 23
    },
    {
      "epoch": 0.368,
      "eval_accuracy": 0.564,
      "eval_loss": 0.7176367044448853,
      "eval_runtime": 9.4362,
      "eval_samples_per_second": 26.494,
      "eval_steps_per_second": 3.391,
      "step": 23
    },
    {
      "epoch": 0.384,
      "grad_norm": 16.587936401367188,
      "learning_rate": 4.098360655737705e-05,
      "loss": 0.7886,
      "step": 24
    },
    {
      "epoch": 0.384,
      "eval_accuracy": 0.572,
      "eval_loss": 0.7161562442779541,
      "eval_runtime": 9.4353,
      "eval_samples_per_second": 26.496,
      "eval_steps_per_second": 3.392,
      "step": 24
    },
    {
      "epoch": 0.4,
      "grad_norm": 15.896271705627441,
      "learning_rate": 4.057377049180328e-05,
      "loss": 0.5579,
      "step": 25
    },
    {
      "epoch": 0.4,
      "eval_accuracy": 0.572,
      "eval_loss": 0.7171699404716492,
      "eval_runtime": 9.4618,
      "eval_samples_per_second": 26.422,
      "eval_steps_per_second": 3.382,
      "step": 25
    },
    {
      "epoch": 0.416,
      "grad_norm": 6.284942626953125,
      "learning_rate": 4.016393442622951e-05,
      "loss": 0.619,
      "step": 26
    },
    {
      "epoch": 0.416,
      "eval_accuracy": 0.576,
      "eval_loss": 0.7149707078933716,
      "eval_runtime": 9.4462,
      "eval_samples_per_second": 26.466,
      "eval_steps_per_second": 3.388,
      "step": 26
    },
    {
      "epoch": 0.432,
      "grad_norm": 7.851229667663574,
      "learning_rate": 3.975409836065574e-05,
      "loss": 0.6796,
      "step": 27
    },
    {
      "epoch": 0.432,
      "eval_accuracy": 0.572,
      "eval_loss": 0.7145332098007202,
      "eval_runtime": 9.4497,
      "eval_samples_per_second": 26.456,
      "eval_steps_per_second": 3.386,
      "step": 27
    },
    {
      "epoch": 0.448,
      "grad_norm": 6.50039529800415,
      "learning_rate": 3.934426229508197e-05,
      "loss": 0.8046,
      "step": 28
    },
    {
      "epoch": 0.448,
      "eval_accuracy": 0.568,
      "eval_loss": 0.7118340134620667,
      "eval_runtime": 9.4429,
      "eval_samples_per_second": 26.475,
      "eval_steps_per_second": 3.389,
      "step": 28
    },
    {
      "epoch": 0.464,
      "grad_norm": 10.894524574279785,
      "learning_rate": 3.89344262295082e-05,
      "loss": 0.6829,
      "step": 29
    },
    {
      "epoch": 0.464,
      "eval_accuracy": 0.568,
      "eval_loss": 0.7091230750083923,
      "eval_runtime": 9.4254,
      "eval_samples_per_second": 26.524,
      "eval_steps_per_second": 3.395,
      "step": 29
    },
    {
      "epoch": 0.48,
      "grad_norm": 17.76140594482422,
      "learning_rate": 3.8524590163934424e-05,
      "loss": 0.8194,
      "step": 30
    },
    {
      "epoch": 0.48,
      "eval_accuracy": 0.548,
      "eval_loss": 0.7109335660934448,
      "eval_runtime": 9.4302,
      "eval_samples_per_second": 26.511,
      "eval_steps_per_second": 3.393,
      "step": 30
    },
    {
      "epoch": 0.496,
      "grad_norm": 4.884728908538818,
      "learning_rate": 3.8114754098360655e-05,
      "loss": 0.6432,
      "step": 31
    },
    {
      "epoch": 0.496,
      "eval_accuracy": 0.536,
      "eval_loss": 0.7137030959129333,
      "eval_runtime": 9.4055,
      "eval_samples_per_second": 26.58,
      "eval_steps_per_second": 3.402,
      "step": 31
    },
    {
      "epoch": 0.512,
      "grad_norm": 8.217907905578613,
      "learning_rate": 3.7704918032786885e-05,
      "loss": 0.6199,
      "step": 32
    },
    {
      "epoch": 0.512,
      "eval_accuracy": 0.536,
      "eval_loss": 0.7147109508514404,
      "eval_runtime": 9.4314,
      "eval_samples_per_second": 26.507,
      "eval_steps_per_second": 3.393,
      "step": 32
    },
    {
      "epoch": 0.528,
      "grad_norm": 5.067286014556885,
      "learning_rate": 3.729508196721312e-05,
      "loss": 0.5238,
      "step": 33
    },
    {
      "epoch": 0.528,
      "eval_accuracy": 0.528,
      "eval_loss": 0.7139023542404175,
      "eval_runtime": 9.4057,
      "eval_samples_per_second": 26.579,
      "eval_steps_per_second": 3.402,
      "step": 33
    },
    {
      "epoch": 0.544,
      "grad_norm": 9.185476303100586,
      "learning_rate": 3.6885245901639346e-05,
      "loss": 0.5065,
      "step": 34
    },
    {
      "epoch": 0.544,
      "eval_accuracy": 0.54,
      "eval_loss": 0.7080722451210022,
      "eval_runtime": 9.4148,
      "eval_samples_per_second": 26.554,
      "eval_steps_per_second": 3.399,
      "step": 34
    },
    {
      "epoch": 0.56,
      "grad_norm": 10.447481155395508,
      "learning_rate": 3.6475409836065576e-05,
      "loss": 0.7825,
      "step": 35
    },
    {
      "epoch": 0.56,
      "eval_accuracy": 0.556,
      "eval_loss": 0.7053359150886536,
      "eval_runtime": 9.4329,
      "eval_samples_per_second": 26.503,
      "eval_steps_per_second": 3.392,
      "step": 35
    },
    {
      "epoch": 0.576,
      "grad_norm": 9.977537155151367,
      "learning_rate": 3.6065573770491806e-05,
      "loss": 0.7256,
      "step": 36
    },
    {
      "epoch": 0.576,
      "eval_accuracy": 0.556,
      "eval_loss": 0.7060820460319519,
      "eval_runtime": 9.426,
      "eval_samples_per_second": 26.522,
      "eval_steps_per_second": 3.395,
      "step": 36
    },
    {
      "epoch": 0.592,
      "grad_norm": 8.119141578674316,
      "learning_rate": 3.5655737704918037e-05,
      "loss": 0.7407,
      "step": 37
    },
    {
      "epoch": 0.592,
      "eval_accuracy": 0.544,
      "eval_loss": 0.7100077867507935,
      "eval_runtime": 9.4303,
      "eval_samples_per_second": 26.51,
      "eval_steps_per_second": 3.393,
      "step": 37
    },
    {
      "epoch": 0.608,
      "grad_norm": 13.609740257263184,
      "learning_rate": 3.524590163934427e-05,
      "loss": 0.6665,
      "step": 38
    },
    {
      "epoch": 0.608,
      "eval_accuracy": 0.544,
      "eval_loss": 0.7075429558753967,
      "eval_runtime": 9.4113,
      "eval_samples_per_second": 26.564,
      "eval_steps_per_second": 3.4,
      "step": 38
    },
    {
      "epoch": 0.624,
      "grad_norm": 22.365285873413086,
      "learning_rate": 3.483606557377049e-05,
      "loss": 0.8188,
      "step": 39
    },
    {
      "epoch": 0.624,
      "eval_accuracy": 0.564,
      "eval_loss": 0.7029336094856262,
      "eval_runtime": 9.4257,
      "eval_samples_per_second": 26.523,
      "eval_steps_per_second": 3.395,
      "step": 39
    },
    {
      "epoch": 0.64,
      "grad_norm": 10.358452796936035,
      "learning_rate": 3.442622950819672e-05,
      "loss": 0.6671,
      "step": 40
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6954512000083923,
      "eval_runtime": 9.4493,
      "eval_samples_per_second": 26.457,
      "eval_steps_per_second": 3.386,
      "step": 40
    },
    {
      "epoch": 0.656,
      "grad_norm": 15.979942321777344,
      "learning_rate": 3.401639344262295e-05,
      "loss": 0.7222,
      "step": 41
    },
    {
      "epoch": 0.656,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6924257874488831,
      "eval_runtime": 9.4502,
      "eval_samples_per_second": 26.454,
      "eval_steps_per_second": 3.386,
      "step": 41
    },
    {
      "epoch": 0.672,
      "grad_norm": 16.25983428955078,
      "learning_rate": 3.360655737704918e-05,
      "loss": 0.7285,
      "step": 42
    },
    {
      "epoch": 0.672,
      "eval_accuracy": 0.576,
      "eval_loss": 0.6920918226242065,
      "eval_runtime": 9.4123,
      "eval_samples_per_second": 26.561,
      "eval_steps_per_second": 3.4,
      "step": 42
    },
    {
      "epoch": 0.688,
      "grad_norm": 7.8817853927612305,
      "learning_rate": 3.319672131147541e-05,
      "loss": 0.7068,
      "step": 43
    },
    {
      "epoch": 0.688,
      "eval_accuracy": 0.588,
      "eval_loss": 0.693978488445282,
      "eval_runtime": 9.4142,
      "eval_samples_per_second": 26.556,
      "eval_steps_per_second": 3.399,
      "step": 43
    },
    {
      "epoch": 0.704,
      "grad_norm": 11.203206062316895,
      "learning_rate": 3.2786885245901635e-05,
      "loss": 0.613,
      "step": 44
    },
    {
      "epoch": 0.704,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6923867464065552,
      "eval_runtime": 9.4098,
      "eval_samples_per_second": 26.568,
      "eval_steps_per_second": 3.401,
      "step": 44
    },
    {
      "epoch": 0.72,
      "grad_norm": 8.55033016204834,
      "learning_rate": 3.237704918032787e-05,
      "loss": 0.5672,
      "step": 45
    },
    {
      "epoch": 0.72,
      "eval_accuracy": 0.604,
      "eval_loss": 0.695925772190094,
      "eval_runtime": 9.4467,
      "eval_samples_per_second": 26.464,
      "eval_steps_per_second": 3.387,
      "step": 45
    },
    {
      "epoch": 0.736,
      "grad_norm": 9.487948417663574,
      "learning_rate": 3.19672131147541e-05,
      "loss": 0.6208,
      "step": 46
    },
    {
      "epoch": 0.736,
      "eval_accuracy": 0.604,
      "eval_loss": 0.7002148628234863,
      "eval_runtime": 9.4163,
      "eval_samples_per_second": 26.55,
      "eval_steps_per_second": 3.398,
      "step": 46
    },
    {
      "epoch": 0.752,
      "grad_norm": 7.840662479400635,
      "learning_rate": 3.155737704918033e-05,
      "loss": 0.6282,
      "step": 47
    },
    {
      "epoch": 0.752,
      "eval_accuracy": 0.608,
      "eval_loss": 0.7034921646118164,
      "eval_runtime": 9.4244,
      "eval_samples_per_second": 26.527,
      "eval_steps_per_second": 3.395,
      "step": 47
    },
    {
      "epoch": 0.768,
      "grad_norm": 6.098258972167969,
      "learning_rate": 3.114754098360656e-05,
      "loss": 0.6129,
      "step": 48
    },
    {
      "epoch": 0.768,
      "eval_accuracy": 0.604,
      "eval_loss": 0.7040849328041077,
      "eval_runtime": 9.3957,
      "eval_samples_per_second": 26.608,
      "eval_steps_per_second": 3.406,
      "step": 48
    },
    {
      "epoch": 0.784,
      "grad_norm": 7.861691951751709,
      "learning_rate": 3.073770491803279e-05,
      "loss": 0.6396,
      "step": 49
    },
    {
      "epoch": 0.784,
      "eval_accuracy": 0.608,
      "eval_loss": 0.7040830254554749,
      "eval_runtime": 9.396,
      "eval_samples_per_second": 26.607,
      "eval_steps_per_second": 3.406,
      "step": 49
    },
    {
      "epoch": 0.8,
      "grad_norm": 9.376338958740234,
      "learning_rate": 3.0327868852459017e-05,
      "loss": 0.5983,
      "step": 50
    },
    {
      "epoch": 0.8,
      "eval_accuracy": 0.608,
      "eval_loss": 0.7050849795341492,
      "eval_runtime": 9.4089,
      "eval_samples_per_second": 26.571,
      "eval_steps_per_second": 3.401,
      "step": 50
    },
    {
      "epoch": 0.816,
      "grad_norm": 8.683838844299316,
      "learning_rate": 2.9918032786885248e-05,
      "loss": 0.6681,
      "step": 51
    },
    {
      "epoch": 0.816,
      "eval_accuracy": 0.604,
      "eval_loss": 0.705935537815094,
      "eval_runtime": 9.3804,
      "eval_samples_per_second": 26.651,
      "eval_steps_per_second": 3.411,
      "step": 51
    },
    {
      "epoch": 0.832,
      "grad_norm": 17.765621185302734,
      "learning_rate": 2.9508196721311478e-05,
      "loss": 0.8503,
      "step": 52
    },
    {
      "epoch": 0.832,
      "eval_accuracy": 0.604,
      "eval_loss": 0.6994922161102295,
      "eval_runtime": 9.4185,
      "eval_samples_per_second": 26.544,
      "eval_steps_per_second": 3.398,
      "step": 52
    },
    {
      "epoch": 0.848,
      "grad_norm": 15.548516273498535,
      "learning_rate": 2.9098360655737705e-05,
      "loss": 0.7585,
      "step": 53
    },
    {
      "epoch": 0.848,
      "eval_accuracy": 0.6,
      "eval_loss": 0.692019522190094,
      "eval_runtime": 9.5871,
      "eval_samples_per_second": 26.077,
      "eval_steps_per_second": 3.338,
      "step": 53
    },
    {
      "epoch": 0.864,
      "grad_norm": 8.666825294494629,
      "learning_rate": 2.8688524590163935e-05,
      "loss": 0.5713,
      "step": 54
    },
    {
      "epoch": 0.864,
      "eval_accuracy": 0.6,
      "eval_loss": 0.68896484375,
      "eval_runtime": 9.4277,
      "eval_samples_per_second": 26.518,
      "eval_steps_per_second": 3.394,
      "step": 54
    },
    {
      "epoch": 0.88,
      "grad_norm": 16.585477828979492,
      "learning_rate": 2.8278688524590162e-05,
      "loss": 0.7261,
      "step": 55
    },
    {
      "epoch": 0.88,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6847422122955322,
      "eval_runtime": 9.4025,
      "eval_samples_per_second": 26.589,
      "eval_steps_per_second": 3.403,
      "step": 55
    },
    {
      "epoch": 0.896,
      "grad_norm": 17.52354621887207,
      "learning_rate": 2.7868852459016392e-05,
      "loss": 0.7457,
      "step": 56
    },
    {
      "epoch": 0.896,
      "eval_accuracy": 0.604,
      "eval_loss": 0.6801777482032776,
      "eval_runtime": 9.4431,
      "eval_samples_per_second": 26.474,
      "eval_steps_per_second": 3.389,
      "step": 56
    },
    {
      "epoch": 0.912,
      "grad_norm": 14.731335639953613,
      "learning_rate": 2.7459016393442626e-05,
      "loss": 0.8242,
      "step": 57
    },
    {
      "epoch": 0.912,
      "eval_accuracy": 0.576,
      "eval_loss": 0.6787323951721191,
      "eval_runtime": 9.4268,
      "eval_samples_per_second": 26.52,
      "eval_steps_per_second": 3.395,
      "step": 57
    },
    {
      "epoch": 0.928,
      "grad_norm": 6.853959083557129,
      "learning_rate": 2.7049180327868856e-05,
      "loss": 0.7688,
      "step": 58
    },
    {
      "epoch": 0.928,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6817187666893005,
      "eval_runtime": 9.4078,
      "eval_samples_per_second": 26.574,
      "eval_steps_per_second": 3.401,
      "step": 58
    },
    {
      "epoch": 0.944,
      "grad_norm": 13.072829246520996,
      "learning_rate": 2.6639344262295087e-05,
      "loss": 0.5804,
      "step": 59
    },
    {
      "epoch": 0.944,
      "eval_accuracy": 0.572,
      "eval_loss": 0.685714840888977,
      "eval_runtime": 9.422,
      "eval_samples_per_second": 26.534,
      "eval_steps_per_second": 3.396,
      "step": 59
    },
    {
      "epoch": 0.96,
      "grad_norm": 8.29138469696045,
      "learning_rate": 2.6229508196721314e-05,
      "loss": 0.8167,
      "step": 60
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6867265701293945,
      "eval_runtime": 9.4234,
      "eval_samples_per_second": 26.53,
      "eval_steps_per_second": 3.396,
      "step": 60
    },
    {
      "epoch": 0.976,
      "grad_norm": 5.209651470184326,
      "learning_rate": 2.5819672131147544e-05,
      "loss": 0.5874,
      "step": 61
    },
    {
      "epoch": 0.976,
      "eval_accuracy": 0.576,
      "eval_loss": 0.6885351538658142,
      "eval_runtime": 9.4091,
      "eval_samples_per_second": 26.57,
      "eval_steps_per_second": 3.401,
      "step": 61
    },
    {
      "epoch": 0.992,
      "grad_norm": 8.127976417541504,
      "learning_rate": 2.540983606557377e-05,
      "loss": 0.6197,
      "step": 62
    },
    {
      "epoch": 0.992,
      "eval_accuracy": 0.572,
      "eval_loss": 0.6853671669960022,
      "eval_runtime": 9.4343,
      "eval_samples_per_second": 26.499,
      "eval_steps_per_second": 3.392,
      "step": 62
    },
    {
      "epoch": 1.008,
      "grad_norm": 4.938397407531738,
      "learning_rate": 2.5e-05,
      "loss": 0.6458,
      "step": 63
    },
    {
      "epoch": 1.008,
      "eval_accuracy": 0.584,
      "eval_loss": 0.6829023361206055,
      "eval_runtime": 9.4315,
      "eval_samples_per_second": 26.507,
      "eval_steps_per_second": 3.393,
      "step": 63
    },
    {
      "epoch": 1.024,
      "grad_norm": 15.248034477233887,
      "learning_rate": 2.459016393442623e-05,
      "loss": 0.7218,
      "step": 64
    },
    {
      "epoch": 1.024,
      "eval_accuracy": 0.592,
      "eval_loss": 0.6791366934776306,
      "eval_runtime": 9.4284,
      "eval_samples_per_second": 26.516,
      "eval_steps_per_second": 3.394,
      "step": 64
    },
    {
      "epoch": 1.04,
      "grad_norm": 5.217968463897705,
      "learning_rate": 2.418032786885246e-05,
      "loss": 0.6869,
      "step": 65
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.592,
      "eval_loss": 0.6775898337364197,
      "eval_runtime": 9.4135,
      "eval_samples_per_second": 26.558,
      "eval_steps_per_second": 3.399,
      "step": 65
    },
    {
      "epoch": 1.056,
      "grad_norm": 8.960049629211426,
      "learning_rate": 2.377049180327869e-05,
      "loss": 0.7135,
      "step": 66
    },
    {
      "epoch": 1.056,
      "eval_accuracy": 0.592,
      "eval_loss": 0.6763710975646973,
      "eval_runtime": 9.43,
      "eval_samples_per_second": 26.511,
      "eval_steps_per_second": 3.393,
      "step": 66
    },
    {
      "epoch": 1.072,
      "grad_norm": 14.524127960205078,
      "learning_rate": 2.336065573770492e-05,
      "loss": 0.7343,
      "step": 67
    },
    {
      "epoch": 1.072,
      "eval_accuracy": 0.596,
      "eval_loss": 0.673941433429718,
      "eval_runtime": 9.4381,
      "eval_samples_per_second": 26.488,
      "eval_steps_per_second": 3.39,
      "step": 67
    },
    {
      "epoch": 1.088,
      "grad_norm": 14.215781211853027,
      "learning_rate": 2.295081967213115e-05,
      "loss": 0.7439,
      "step": 68
    },
    {
      "epoch": 1.088,
      "eval_accuracy": 0.596,
      "eval_loss": 0.6748945116996765,
      "eval_runtime": 9.4059,
      "eval_samples_per_second": 26.579,
      "eval_steps_per_second": 3.402,
      "step": 68
    },
    {
      "epoch": 1.104,
      "grad_norm": 5.426934719085693,
      "learning_rate": 2.254098360655738e-05,
      "loss": 0.5504,
      "step": 69
    },
    {
      "epoch": 1.104,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6768242120742798,
      "eval_runtime": 9.4117,
      "eval_samples_per_second": 26.563,
      "eval_steps_per_second": 3.4,
      "step": 69
    },
    {
      "epoch": 1.12,
      "grad_norm": 14.354090690612793,
      "learning_rate": 2.2131147540983607e-05,
      "loss": 0.696,
      "step": 70
    },
    {
      "epoch": 1.12,
      "eval_accuracy": 0.596,
      "eval_loss": 0.6765508055686951,
      "eval_runtime": 9.4291,
      "eval_samples_per_second": 26.514,
      "eval_steps_per_second": 3.394,
      "step": 70
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 11.328275680541992,
      "learning_rate": 2.1721311475409837e-05,
      "loss": 0.6042,
      "step": 71
    },
    {
      "epoch": 1.1360000000000001,
      "eval_accuracy": 0.596,
      "eval_loss": 0.6768398284912109,
      "eval_runtime": 9.4156,
      "eval_samples_per_second": 26.552,
      "eval_steps_per_second": 3.399,
      "step": 71
    },
    {
      "epoch": 1.152,
      "grad_norm": 9.158403396606445,
      "learning_rate": 2.1311475409836064e-05,
      "loss": 0.4853,
      "step": 72
    },
    {
      "epoch": 1.152,
      "eval_accuracy": 0.604,
      "eval_loss": 0.6750390529632568,
      "eval_runtime": 9.4378,
      "eval_samples_per_second": 26.489,
      "eval_steps_per_second": 3.391,
      "step": 72
    },
    {
      "epoch": 1.168,
      "grad_norm": 7.848287105560303,
      "learning_rate": 2.0901639344262298e-05,
      "loss": 0.6744,
      "step": 73
    },
    {
      "epoch": 1.168,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6753163933753967,
      "eval_runtime": 9.4125,
      "eval_samples_per_second": 26.56,
      "eval_steps_per_second": 3.4,
      "step": 73
    },
    {
      "epoch": 1.184,
      "grad_norm": 11.083074569702148,
      "learning_rate": 2.0491803278688525e-05,
      "loss": 0.7398,
      "step": 74
    },
    {
      "epoch": 1.184,
      "eval_accuracy": 0.596,
      "eval_loss": 0.676925778388977,
      "eval_runtime": 9.421,
      "eval_samples_per_second": 26.536,
      "eval_steps_per_second": 3.397,
      "step": 74
    },
    {
      "epoch": 1.2,
      "grad_norm": 8.224617958068848,
      "learning_rate": 2.0081967213114755e-05,
      "loss": 0.6029,
      "step": 75
    },
    {
      "epoch": 1.2,
      "eval_accuracy": 0.596,
      "eval_loss": 0.677783191204071,
      "eval_runtime": 9.4291,
      "eval_samples_per_second": 26.514,
      "eval_steps_per_second": 3.394,
      "step": 75
    },
    {
      "epoch": 1.216,
      "grad_norm": 17.132051467895508,
      "learning_rate": 1.9672131147540985e-05,
      "loss": 0.6935,
      "step": 76
    },
    {
      "epoch": 1.216,
      "eval_accuracy": 0.596,
      "eval_loss": 0.6787539124488831,
      "eval_runtime": 9.4075,
      "eval_samples_per_second": 26.575,
      "eval_steps_per_second": 3.402,
      "step": 76
    },
    {
      "epoch": 1.232,
      "grad_norm": 8.447811126708984,
      "learning_rate": 1.9262295081967212e-05,
      "loss": 0.7292,
      "step": 77
    },
    {
      "epoch": 1.232,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6795663833618164,
      "eval_runtime": 9.4049,
      "eval_samples_per_second": 26.582,
      "eval_steps_per_second": 3.402,
      "step": 77
    },
    {
      "epoch": 1.248,
      "grad_norm": 4.971631050109863,
      "learning_rate": 1.8852459016393442e-05,
      "loss": 0.6192,
      "step": 78
    },
    {
      "epoch": 1.248,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6786601543426514,
      "eval_runtime": 9.4102,
      "eval_samples_per_second": 26.567,
      "eval_steps_per_second": 3.401,
      "step": 78
    },
    {
      "epoch": 1.264,
      "grad_norm": 8.30854320526123,
      "learning_rate": 1.8442622950819673e-05,
      "loss": 0.6979,
      "step": 79
    },
    {
      "epoch": 1.264,
      "eval_accuracy": 0.6,
      "eval_loss": 0.6776171922683716,
      "eval_runtime": 9.4206,
      "eval_samples_per_second": 26.537,
      "eval_steps_per_second": 3.397,
      "step": 79
    },
    {
      "epoch": 1.28,
      "grad_norm": 9.044068336486816,
      "learning_rate": 1.8032786885245903e-05,
      "loss": 0.7554,
      "step": 80
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.596,
      "eval_loss": 0.6768652200698853,
      "eval_runtime": 9.4398,
      "eval_samples_per_second": 26.484,
      "eval_steps_per_second": 3.39,
      "step": 80
    },
    {
      "epoch": 1.296,
      "grad_norm": 22.36913299560547,
      "learning_rate": 1.7622950819672133e-05,
      "loss": 0.7857,
      "step": 81
    },
    {
      "epoch": 1.296,
      "eval_accuracy": 0.584,
      "eval_loss": 0.6760781407356262,
      "eval_runtime": 9.4344,
      "eval_samples_per_second": 26.499,
      "eval_steps_per_second": 3.392,
      "step": 81
    },
    {
      "epoch": 1.312,
      "grad_norm": 9.494186401367188,
      "learning_rate": 1.721311475409836e-05,
      "loss": 0.7903,
      "step": 82
    },
    {
      "epoch": 1.312,
      "eval_accuracy": 0.576,
      "eval_loss": 0.6796757578849792,
      "eval_runtime": 9.3991,
      "eval_samples_per_second": 26.598,
      "eval_steps_per_second": 3.405,
      "step": 82
    },
    {
      "epoch": 1.328,
      "grad_norm": 6.161738395690918,
      "learning_rate": 1.680327868852459e-05,
      "loss": 0.714,
      "step": 83
    },
    {
      "epoch": 1.328,
      "eval_accuracy": 0.576,
      "eval_loss": 0.6806288957595825,
      "eval_runtime": 9.424,
      "eval_samples_per_second": 26.528,
      "eval_steps_per_second": 3.396,
      "step": 83
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 10.077332496643066,
      "learning_rate": 1.6393442622950818e-05,
      "loss": 0.7107,
      "step": 84
    },
    {
      "epoch": 1.3439999999999999,
      "eval_accuracy": 0.584,
      "eval_loss": 0.6848242282867432,
      "eval_runtime": 9.4189,
      "eval_samples_per_second": 26.542,
      "eval_steps_per_second": 3.397,
      "step": 84
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 14.34889030456543,
      "learning_rate": 1.598360655737705e-05,
      "loss": 0.6276,
      "step": 85
    },
    {
      "epoch": 1.3599999999999999,
      "eval_accuracy": 0.588,
      "eval_loss": 0.6862617135047913,
      "eval_runtime": 9.4148,
      "eval_samples_per_second": 26.554,
      "eval_steps_per_second": 3.399,
      "step": 85
    },
    {
      "epoch": 1.376,
      "grad_norm": 9.223981857299805,
      "learning_rate": 1.557377049180328e-05,
      "loss": 0.7295,
      "step": 86
    },
    {
      "epoch": 1.376,
      "eval_accuracy": 0.588,
      "eval_loss": 0.6857773661613464,
      "eval_runtime": 9.4043,
      "eval_samples_per_second": 26.584,
      "eval_steps_per_second": 3.403,
      "step": 86
    },
    {
      "epoch": 1.392,
      "grad_norm": 13.143969535827637,
      "learning_rate": 1.5163934426229509e-05,
      "loss": 0.6597,
      "step": 87
    },
    {
      "epoch": 1.392,
      "eval_accuracy": 0.588,
      "eval_loss": 0.6872578263282776,
      "eval_runtime": 9.4212,
      "eval_samples_per_second": 26.536,
      "eval_steps_per_second": 3.397,
      "step": 87
    },
    {
      "epoch": 1.408,
      "grad_norm": 22.58281898498535,
      "learning_rate": 1.4754098360655739e-05,
      "loss": 0.6335,
      "step": 88
    },
    {
      "epoch": 1.408,
      "eval_accuracy": 0.58,
      "eval_loss": 0.6847929954528809,
      "eval_runtime": 9.4232,
      "eval_samples_per_second": 26.53,
      "eval_steps_per_second": 3.396,
      "step": 88
    },
    {
      "epoch": 1.424,
      "grad_norm": 12.670473098754883,
      "learning_rate": 1.4344262295081968e-05,
      "loss": 0.7245,
      "step": 89
    },
    {
      "epoch": 1.424,
      "eval_accuracy": 0.572,
      "eval_loss": 0.6834453344345093,
      "eval_runtime": 9.4138,
      "eval_samples_per_second": 26.557,
      "eval_steps_per_second": 3.399,
      "step": 89
    },
    {
      "epoch": 1.44,
      "grad_norm": 20.81968879699707,
      "learning_rate": 1.3934426229508196e-05,
      "loss": 0.5546,
      "step": 90
    },
    {
      "epoch": 1.44,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6808554530143738,
      "eval_runtime": 9.4208,
      "eval_samples_per_second": 26.537,
      "eval_steps_per_second": 3.397,
      "step": 90
    },
    {
      "epoch": 1.456,
      "grad_norm": 8.033720016479492,
      "learning_rate": 1.3524590163934428e-05,
      "loss": 0.6482,
      "step": 91
    },
    {
      "epoch": 1.456,
      "eval_accuracy": 0.568,
      "eval_loss": 0.6760781407356262,
      "eval_runtime": 9.408,
      "eval_samples_per_second": 26.573,
      "eval_steps_per_second": 3.401,
      "step": 91
    },
    {
      "epoch": 1.472,
      "grad_norm": 9.656173706054688,
      "learning_rate": 1.3114754098360657e-05,
      "loss": 0.6814,
      "step": 92
    },
    {
      "epoch": 1.472,
      "eval_accuracy": 0.572,
      "eval_loss": 0.6791015863418579,
      "eval_runtime": 9.4039,
      "eval_samples_per_second": 26.585,
      "eval_steps_per_second": 3.403,
      "step": 92
    },
    {
      "epoch": 1.488,
      "grad_norm": 4.5396599769592285,
      "learning_rate": 1.2704918032786885e-05,
      "loss": 0.5693,
      "step": 93
    },
    {
      "epoch": 1.488,
"eval_accuracy": 0.584, | |
"eval_loss": 0.6775078177452087, | |
"eval_runtime": 9.4321, | |
"eval_samples_per_second": 26.505, | |
"eval_steps_per_second": 3.393, | |
"step": 93 | |
}, | |
{ | |
"epoch": 1.504, | |
"grad_norm": 11.05844783782959, | |
"learning_rate": 1.2295081967213116e-05, | |
"loss": 0.5369, | |
"step": 94 | |
}, | |
{ | |
"epoch": 1.504, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6771523356437683, | |
"eval_runtime": 9.4156, | |
"eval_samples_per_second": 26.552, | |
"eval_steps_per_second": 3.399, | |
"step": 94 | |
}, | |
{ | |
"epoch": 1.52, | |
"grad_norm": 19.972246170043945, | |
"learning_rate": 1.1885245901639344e-05, | |
"loss": 0.7144, | |
"step": 95 | |
}, | |
{ | |
"epoch": 1.52, | |
"eval_accuracy": 0.576, | |
"eval_loss": 0.6779101490974426, | |
"eval_runtime": 9.4028, | |
"eval_samples_per_second": 26.588, | |
"eval_steps_per_second": 3.403, | |
"step": 95 | |
}, | |
{ | |
"epoch": 1.536, | |
"grad_norm": 11.014993667602539, | |
"learning_rate": 1.1475409836065575e-05, | |
"loss": 0.6405, | |
"step": 96 | |
}, | |
{ | |
"epoch": 1.536, | |
"eval_accuracy": 0.564, | |
"eval_loss": 0.6772187352180481, | |
"eval_runtime": 9.4126, | |
"eval_samples_per_second": 26.56, | |
"eval_steps_per_second": 3.4, | |
"step": 96 | |
}, | |
{ | |
"epoch": 1.552, | |
"grad_norm": 8.04190444946289, | |
"learning_rate": 1.1065573770491803e-05, | |
"loss": 0.7893, | |
"step": 97 | |
}, | |
{ | |
"epoch": 1.552, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6751992106437683, | |
"eval_runtime": 9.4142, | |
"eval_samples_per_second": 26.556, | |
"eval_steps_per_second": 3.399, | |
"step": 97 | |
}, | |
{ | |
"epoch": 1.568, | |
"grad_norm": 8.616044044494629, | |
"learning_rate": 1.0655737704918032e-05, | |
"loss": 0.6448, | |
"step": 98 | |
}, | |
{ | |
"epoch": 1.568, | |
"eval_accuracy": 0.568, | |
"eval_loss": 0.6759804487228394, | |
"eval_runtime": 9.4235, | |
"eval_samples_per_second": 26.529, | |
"eval_steps_per_second": 3.396, | |
"step": 98 | |
}, | |
{ | |
"epoch": 1.584, | |
"grad_norm": 12.122180938720703, | |
"learning_rate": 1.0245901639344262e-05, | |
"loss": 0.5828, | |
"step": 99 | |
}, | |
{ | |
"epoch": 1.584, | |
"eval_accuracy": 0.576, | |
"eval_loss": 0.6741952896118164, | |
"eval_runtime": 9.4162, | |
"eval_samples_per_second": 26.55, | |
"eval_steps_per_second": 3.398, | |
"step": 99 | |
}, | |
{ | |
"epoch": 1.6, | |
"grad_norm": 15.246779441833496, | |
"learning_rate": 9.836065573770493e-06, | |
"loss": 0.6762, | |
"step": 100 | |
}, | |
{ | |
"epoch": 1.6, | |
"eval_accuracy": 0.572, | |
"eval_loss": 0.6730703115463257, | |
"eval_runtime": 9.406, | |
"eval_samples_per_second": 26.579, | |
"eval_steps_per_second": 3.402, | |
"step": 100 | |
}, | |
{ | |
"epoch": 1.616, | |
"grad_norm": 16.69089126586914, | |
"learning_rate": 9.426229508196721e-06, | |
"loss": 0.6432, | |
"step": 101 | |
}, | |
{ | |
"epoch": 1.616, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6738671660423279, | |
"eval_runtime": 9.4165, | |
"eval_samples_per_second": 26.549, | |
"eval_steps_per_second": 3.398, | |
"step": 101 | |
}, | |
{ | |
"epoch": 1.6320000000000001, | |
"grad_norm": 8.9694242477417, | |
"learning_rate": 9.016393442622952e-06, | |
"loss": 0.5826, | |
"step": 102 | |
}, | |
{ | |
"epoch": 1.6320000000000001, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6729843616485596, | |
"eval_runtime": 9.4844, | |
"eval_samples_per_second": 26.359, | |
"eval_steps_per_second": 3.374, | |
"step": 102 | |
}, | |
{ | |
"epoch": 1.6480000000000001, | |
"grad_norm": 9.330092430114746, | |
"learning_rate": 8.60655737704918e-06, | |
"loss": 0.6224, | |
"step": 103 | |
}, | |
{ | |
"epoch": 1.6480000000000001, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.673214852809906, | |
"eval_runtime": 9.4192, | |
"eval_samples_per_second": 26.541, | |
"eval_steps_per_second": 3.397, | |
"step": 103 | |
}, | |
{ | |
"epoch": 1.6640000000000001, | |
"grad_norm": 7.138861179351807, | |
"learning_rate": 8.196721311475409e-06, | |
"loss": 0.6262, | |
"step": 104 | |
}, | |
{ | |
"epoch": 1.6640000000000001, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6745429635047913, | |
"eval_runtime": 9.4226, | |
"eval_samples_per_second": 26.532, | |
"eval_steps_per_second": 3.396, | |
"step": 104 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"grad_norm": 7.4160356521606445, | |
"learning_rate": 7.78688524590164e-06, | |
"loss": 0.6451, | |
"step": 105 | |
}, | |
{ | |
"epoch": 1.6800000000000002, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6730429530143738, | |
"eval_runtime": 9.4489, | |
"eval_samples_per_second": 26.458, | |
"eval_steps_per_second": 3.387, | |
"step": 105 | |
}, | |
{ | |
"epoch": 1.696, | |
"grad_norm": 5.479573726654053, | |
"learning_rate": 7.3770491803278695e-06, | |
"loss": 0.5948, | |
"step": 106 | |
}, | |
{ | |
"epoch": 1.696, | |
"eval_accuracy": 0.6, | |
"eval_loss": 0.6731171607971191, | |
"eval_runtime": 9.4414, | |
"eval_samples_per_second": 26.479, | |
"eval_steps_per_second": 3.389, | |
"step": 106 | |
}, | |
{ | |
"epoch": 1.712, | |
"grad_norm": 9.357452392578125, | |
"learning_rate": 6.967213114754098e-06, | |
"loss": 0.7451, | |
"step": 107 | |
}, | |
{ | |
"epoch": 1.712, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6747695207595825, | |
"eval_runtime": 9.4087, | |
"eval_samples_per_second": 26.571, | |
"eval_steps_per_second": 3.401, | |
"step": 107 | |
}, | |
{ | |
"epoch": 1.728, | |
"grad_norm": 10.986834526062012, | |
"learning_rate": 6.557377049180328e-06, | |
"loss": 0.5922, | |
"step": 108 | |
}, | |
{ | |
"epoch": 1.728, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6725429892539978, | |
"eval_runtime": 9.4208, | |
"eval_samples_per_second": 26.537, | |
"eval_steps_per_second": 3.397, | |
"step": 108 | |
}, | |
{ | |
"epoch": 1.744, | |
"grad_norm": 6.625186920166016, | |
"learning_rate": 6.147540983606558e-06, | |
"loss": 0.6454, | |
"step": 109 | |
}, | |
{ | |
"epoch": 1.744, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6714960932731628, | |
"eval_runtime": 9.4316, | |
"eval_samples_per_second": 26.507, | |
"eval_steps_per_second": 3.393, | |
"step": 109 | |
}, | |
{ | |
"epoch": 1.76, | |
"grad_norm": 9.619455337524414, | |
"learning_rate": 5.737704918032787e-06, | |
"loss": 0.601, | |
"step": 110 | |
}, | |
{ | |
"epoch": 1.76, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.671625018119812, | |
"eval_runtime": 9.4295, | |
"eval_samples_per_second": 26.512, | |
"eval_steps_per_second": 3.394, | |
"step": 110 | |
}, | |
{ | |
"epoch": 1.776, | |
"grad_norm": 10.5454683303833, | |
"learning_rate": 5.327868852459016e-06, | |
"loss": 0.7236, | |
"step": 111 | |
}, | |
{ | |
"epoch": 1.776, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6704453229904175, | |
"eval_runtime": 9.4138, | |
"eval_samples_per_second": 26.557, | |
"eval_steps_per_second": 3.399, | |
"step": 111 | |
}, | |
{ | |
"epoch": 1.792, | |
"grad_norm": 9.553342819213867, | |
"learning_rate": 4.918032786885246e-06, | |
"loss": 0.7825, | |
"step": 112 | |
}, | |
{ | |
"epoch": 1.792, | |
"eval_accuracy": 0.596, | |
"eval_loss": 0.673535168170929, | |
"eval_runtime": 9.4206, | |
"eval_samples_per_second": 26.538, | |
"eval_steps_per_second": 3.397, | |
"step": 112 | |
}, | |
{ | |
"epoch": 1.808, | |
"grad_norm": 7.810243129730225, | |
"learning_rate": 4.508196721311476e-06, | |
"loss": 0.6302, | |
"step": 113 | |
}, | |
{ | |
"epoch": 1.808, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.670703113079071, | |
"eval_runtime": 9.5051, | |
"eval_samples_per_second": 26.302, | |
"eval_steps_per_second": 3.367, | |
"step": 113 | |
}, | |
{ | |
"epoch": 1.8239999999999998, | |
"grad_norm": 15.086982727050781, | |
"learning_rate": 4.098360655737704e-06, | |
"loss": 0.6824, | |
"step": 114 | |
}, | |
{ | |
"epoch": 1.8239999999999998, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6711757779121399, | |
"eval_runtime": 9.432, | |
"eval_samples_per_second": 26.505, | |
"eval_steps_per_second": 3.393, | |
"step": 114 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"grad_norm": 13.564058303833008, | |
"learning_rate": 3.6885245901639347e-06, | |
"loss": 0.6208, | |
"step": 115 | |
}, | |
{ | |
"epoch": 1.8399999999999999, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6693046689033508, | |
"eval_runtime": 9.4215, | |
"eval_samples_per_second": 26.535, | |
"eval_steps_per_second": 3.396, | |
"step": 115 | |
}, | |
{ | |
"epoch": 1.8559999999999999, | |
"grad_norm": 7.943946361541748, | |
"learning_rate": 3.278688524590164e-06, | |
"loss": 0.6987, | |
"step": 116 | |
}, | |
{ | |
"epoch": 1.8559999999999999, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.671625018119812, | |
"eval_runtime": 9.4001, | |
"eval_samples_per_second": 26.595, | |
"eval_steps_per_second": 3.404, | |
"step": 116 | |
}, | |
{ | |
"epoch": 1.8719999999999999, | |
"grad_norm": 6.293920993804932, | |
"learning_rate": 2.8688524590163937e-06, | |
"loss": 0.5587, | |
"step": 117 | |
}, | |
{ | |
"epoch": 1.8719999999999999, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.670785129070282, | |
"eval_runtime": 9.3933, | |
"eval_samples_per_second": 26.615, | |
"eval_steps_per_second": 3.407, | |
"step": 117 | |
}, | |
{ | |
"epoch": 1.888, | |
"grad_norm": 5.374147415161133, | |
"learning_rate": 2.459016393442623e-06, | |
"loss": 0.6304, | |
"step": 118 | |
}, | |
{ | |
"epoch": 1.888, | |
"eval_accuracy": 0.592, | |
"eval_loss": 0.6705155968666077, | |
"eval_runtime": 9.4015, | |
"eval_samples_per_second": 26.592, | |
"eval_steps_per_second": 3.404, | |
"step": 118 | |
}, | |
{ | |
"epoch": 1.904, | |
"grad_norm": 11.269082069396973, | |
"learning_rate": 2.049180327868852e-06, | |
"loss": 0.4528, | |
"step": 119 | |
}, | |
{ | |
"epoch": 1.904, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6711132526397705, | |
"eval_runtime": 9.4407, | |
"eval_samples_per_second": 26.481, | |
"eval_steps_per_second": 3.39, | |
"step": 119 | |
}, | |
{ | |
"epoch": 1.92, | |
"grad_norm": 20.449726104736328, | |
"learning_rate": 1.639344262295082e-06, | |
"loss": 0.7061, | |
"step": 120 | |
}, | |
{ | |
"epoch": 1.92, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6705625057220459, | |
"eval_runtime": 9.4641, | |
"eval_samples_per_second": 26.416, | |
"eval_steps_per_second": 3.381, | |
"step": 120 | |
}, | |
{ | |
"epoch": 1.936, | |
"grad_norm": 13.892779350280762, | |
"learning_rate": 1.2295081967213116e-06, | |
"loss": 0.5595, | |
"step": 121 | |
}, | |
{ | |
"epoch": 1.936, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.670035183429718, | |
"eval_runtime": 9.4443, | |
"eval_samples_per_second": 26.471, | |
"eval_steps_per_second": 3.388, | |
"step": 121 | |
}, | |
{ | |
"epoch": 1.952, | |
"grad_norm": 4.646062850952148, | |
"learning_rate": 8.19672131147541e-07, | |
"loss": 0.5968, | |
"step": 122 | |
}, | |
{ | |
"epoch": 1.952, | |
"eval_accuracy": 0.588, | |
"eval_loss": 0.6705195307731628, | |
"eval_runtime": 9.4452, | |
"eval_samples_per_second": 26.468, | |
"eval_steps_per_second": 3.388, | |
"step": 122 | |
}, | |
{ | |
"epoch": 1.968, | |
"grad_norm": 5.045331001281738, | |
"learning_rate": 4.098360655737705e-07, | |
"loss": 0.577, | |
"step": 123 | |
}, | |
{ | |
"epoch": 1.968, | |
"eval_accuracy": 0.584, | |
"eval_loss": 0.6710820198059082, | |
"eval_runtime": 9.4702, | |
"eval_samples_per_second": 26.399, | |
"eval_steps_per_second": 3.379, | |
"step": 123 | |
}, | |
{ | |
"epoch": 1.984, | |
"grad_norm": 12.286917686462402, | |
"learning_rate": 0.0, | |
"loss": 0.5765, | |
"step": 124 | |
}, | |
{ | |
"epoch": 1.984, | |
"eval_accuracy": 0.58, | |
"eval_loss": 0.6720273494720459, | |
"eval_runtime": 9.4365, | |
"eval_samples_per_second": 26.493, | |
"eval_steps_per_second": 3.391, | |
"step": 124 | |
}, | |
{ | |
"epoch": 1.984, | |
"step": 124, | |
"total_flos": 1.3708912645636096e+16, | |
"train_loss": 0.6877071011450983, | |
"train_runtime": 1489.9136, | |
"train_samples_per_second": 1.342, | |
"train_steps_per_second": 0.083 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 124, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 2, | |
"save_steps": 500, | |
"total_flos": 1.3708912645636096e+16, | |
"train_batch_size": 2, | |
"trial_name": null, | |
"trial_params": null | |
} | |