adapters-opt-bnb8-QLORA-super_glue-boolq / trainer_state-opt-bnb8-QLORA-super_glue-boolq-sequence_classification.json
RMHalak's picture
Task: SequenceClassification
56adb50 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.984,
"eval_steps": 1,
"global_step": 124,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016,
"grad_norm": 19.792200088500977,
"learning_rate": 2.5e-05,
"loss": 0.9587,
"step": 1
},
{
"epoch": 0.016,
"eval_accuracy": 0.4,
"eval_loss": 1.2008212804794312,
"eval_runtime": 9.5896,
"eval_samples_per_second": 26.07,
"eval_steps_per_second": 3.337,
"step": 1
},
{
"epoch": 0.032,
"grad_norm": 26.413536071777344,
"learning_rate": 5e-05,
"loss": 1.0902,
"step": 2
},
{
"epoch": 0.032,
"eval_accuracy": 0.4,
"eval_loss": 1.1015820503234863,
"eval_runtime": 9.4487,
"eval_samples_per_second": 26.459,
"eval_steps_per_second": 3.387,
"step": 2
},
{
"epoch": 0.048,
"grad_norm": 16.49271011352539,
"learning_rate": 4.959016393442623e-05,
"loss": 0.8807,
"step": 3
},
{
"epoch": 0.048,
"eval_accuracy": 0.428,
"eval_loss": 0.9215332269668579,
"eval_runtime": 9.4596,
"eval_samples_per_second": 26.428,
"eval_steps_per_second": 3.383,
"step": 3
},
{
"epoch": 0.064,
"grad_norm": 30.666654586791992,
"learning_rate": 4.918032786885246e-05,
"loss": 0.9722,
"step": 4
},
{
"epoch": 0.064,
"eval_accuracy": 0.476,
"eval_loss": 0.8115702867507935,
"eval_runtime": 9.4643,
"eval_samples_per_second": 26.415,
"eval_steps_per_second": 3.381,
"step": 4
},
{
"epoch": 0.08,
"grad_norm": 10.605666160583496,
"learning_rate": 4.8770491803278687e-05,
"loss": 0.7957,
"step": 5
},
{
"epoch": 0.08,
"eval_accuracy": 0.476,
"eval_loss": 0.7770839929580688,
"eval_runtime": 9.4208,
"eval_samples_per_second": 26.537,
"eval_steps_per_second": 3.397,
"step": 5
},
{
"epoch": 0.096,
"grad_norm": 15.425003051757812,
"learning_rate": 4.836065573770492e-05,
"loss": 0.5228,
"step": 6
},
{
"epoch": 0.096,
"eval_accuracy": 0.504,
"eval_loss": 0.805004894733429,
"eval_runtime": 9.4526,
"eval_samples_per_second": 26.448,
"eval_steps_per_second": 3.385,
"step": 6
},
{
"epoch": 0.112,
"grad_norm": 10.599884033203125,
"learning_rate": 4.795081967213115e-05,
"loss": 0.739,
"step": 7
},
{
"epoch": 0.112,
"eval_accuracy": 0.532,
"eval_loss": 0.8277338743209839,
"eval_runtime": 9.4867,
"eval_samples_per_second": 26.353,
"eval_steps_per_second": 3.373,
"step": 7
},
{
"epoch": 0.128,
"grad_norm": 32.59059143066406,
"learning_rate": 4.754098360655738e-05,
"loss": 1.2898,
"step": 8
},
{
"epoch": 0.128,
"eval_accuracy": 0.564,
"eval_loss": 0.8153437376022339,
"eval_runtime": 9.4204,
"eval_samples_per_second": 26.538,
"eval_steps_per_second": 3.397,
"step": 8
},
{
"epoch": 0.144,
"grad_norm": 22.269615173339844,
"learning_rate": 4.713114754098361e-05,
"loss": 0.9083,
"step": 9
},
{
"epoch": 0.144,
"eval_accuracy": 0.612,
"eval_loss": 0.7878813743591309,
"eval_runtime": 9.4054,
"eval_samples_per_second": 26.581,
"eval_steps_per_second": 3.402,
"step": 9
},
{
"epoch": 0.16,
"grad_norm": 8.692429542541504,
"learning_rate": 4.672131147540984e-05,
"loss": 0.5176,
"step": 10
},
{
"epoch": 0.16,
"eval_accuracy": 0.6,
"eval_loss": 0.7594003677368164,
"eval_runtime": 9.4569,
"eval_samples_per_second": 26.436,
"eval_steps_per_second": 3.384,
"step": 10
},
{
"epoch": 0.176,
"grad_norm": 9.585773468017578,
"learning_rate": 4.631147540983607e-05,
"loss": 0.7224,
"step": 11
},
{
"epoch": 0.176,
"eval_accuracy": 0.604,
"eval_loss": 0.7379999756813049,
"eval_runtime": 9.4177,
"eval_samples_per_second": 26.546,
"eval_steps_per_second": 3.398,
"step": 11
},
{
"epoch": 0.192,
"grad_norm": 10.833252906799316,
"learning_rate": 4.59016393442623e-05,
"loss": 0.8363,
"step": 12
},
{
"epoch": 0.192,
"eval_accuracy": 0.588,
"eval_loss": 0.7211699485778809,
"eval_runtime": 9.4924,
"eval_samples_per_second": 26.337,
"eval_steps_per_second": 3.371,
"step": 12
},
{
"epoch": 0.208,
"grad_norm": 17.36851692199707,
"learning_rate": 4.549180327868853e-05,
"loss": 0.868,
"step": 13
},
{
"epoch": 0.208,
"eval_accuracy": 0.58,
"eval_loss": 0.7057519555091858,
"eval_runtime": 9.4694,
"eval_samples_per_second": 26.401,
"eval_steps_per_second": 3.379,
"step": 13
},
{
"epoch": 0.224,
"grad_norm": 9.86408805847168,
"learning_rate": 4.508196721311476e-05,
"loss": 0.5603,
"step": 14
},
{
"epoch": 0.224,
"eval_accuracy": 0.584,
"eval_loss": 0.7084277272224426,
"eval_runtime": 9.4106,
"eval_samples_per_second": 26.566,
"eval_steps_per_second": 3.4,
"step": 14
},
{
"epoch": 0.24,
"grad_norm": 6.7643585205078125,
"learning_rate": 4.467213114754098e-05,
"loss": 0.6958,
"step": 15
},
{
"epoch": 0.24,
"eval_accuracy": 0.572,
"eval_loss": 0.7116777300834656,
"eval_runtime": 9.4526,
"eval_samples_per_second": 26.448,
"eval_steps_per_second": 3.385,
"step": 15
},
{
"epoch": 0.256,
"grad_norm": 13.049300193786621,
"learning_rate": 4.426229508196721e-05,
"loss": 0.5868,
"step": 16
},
{
"epoch": 0.256,
"eval_accuracy": 0.576,
"eval_loss": 0.7166406512260437,
"eval_runtime": 9.4248,
"eval_samples_per_second": 26.526,
"eval_steps_per_second": 3.395,
"step": 16
},
{
"epoch": 0.272,
"grad_norm": 12.840044021606445,
"learning_rate": 4.3852459016393444e-05,
"loss": 0.5497,
"step": 17
},
{
"epoch": 0.272,
"eval_accuracy": 0.588,
"eval_loss": 0.7239003777503967,
"eval_runtime": 9.4276,
"eval_samples_per_second": 26.518,
"eval_steps_per_second": 3.394,
"step": 17
},
{
"epoch": 0.288,
"grad_norm": 9.021048545837402,
"learning_rate": 4.3442622950819674e-05,
"loss": 0.7557,
"step": 18
},
{
"epoch": 0.288,
"eval_accuracy": 0.592,
"eval_loss": 0.728591799736023,
"eval_runtime": 9.4602,
"eval_samples_per_second": 26.427,
"eval_steps_per_second": 3.383,
"step": 18
},
{
"epoch": 0.304,
"grad_norm": 15.930183410644531,
"learning_rate": 4.3032786885245904e-05,
"loss": 0.8174,
"step": 19
},
{
"epoch": 0.304,
"eval_accuracy": 0.588,
"eval_loss": 0.7309414148330688,
"eval_runtime": 9.4241,
"eval_samples_per_second": 26.528,
"eval_steps_per_second": 3.396,
"step": 19
},
{
"epoch": 0.32,
"grad_norm": 25.526287078857422,
"learning_rate": 4.262295081967213e-05,
"loss": 0.9582,
"step": 20
},
{
"epoch": 0.32,
"eval_accuracy": 0.58,
"eval_loss": 0.7253652215003967,
"eval_runtime": 9.487,
"eval_samples_per_second": 26.352,
"eval_steps_per_second": 3.373,
"step": 20
},
{
"epoch": 0.336,
"grad_norm": 16.851058959960938,
"learning_rate": 4.2213114754098365e-05,
"loss": 0.7394,
"step": 21
},
{
"epoch": 0.336,
"eval_accuracy": 0.572,
"eval_loss": 0.721359372138977,
"eval_runtime": 9.4839,
"eval_samples_per_second": 26.361,
"eval_steps_per_second": 3.374,
"step": 21
},
{
"epoch": 0.352,
"grad_norm": 16.92612648010254,
"learning_rate": 4.1803278688524595e-05,
"loss": 0.7682,
"step": 22
},
{
"epoch": 0.352,
"eval_accuracy": 0.58,
"eval_loss": 0.7189823985099792,
"eval_runtime": 9.4414,
"eval_samples_per_second": 26.479,
"eval_steps_per_second": 3.389,
"step": 22
},
{
"epoch": 0.368,
"grad_norm": 9.329913139343262,
"learning_rate": 4.1393442622950826e-05,
"loss": 0.5394,
"step": 23
},
{
"epoch": 0.368,
"eval_accuracy": 0.564,
"eval_loss": 0.7176367044448853,
"eval_runtime": 9.4362,
"eval_samples_per_second": 26.494,
"eval_steps_per_second": 3.391,
"step": 23
},
{
"epoch": 0.384,
"grad_norm": 16.587936401367188,
"learning_rate": 4.098360655737705e-05,
"loss": 0.7886,
"step": 24
},
{
"epoch": 0.384,
"eval_accuracy": 0.572,
"eval_loss": 0.7161562442779541,
"eval_runtime": 9.4353,
"eval_samples_per_second": 26.496,
"eval_steps_per_second": 3.392,
"step": 24
},
{
"epoch": 0.4,
"grad_norm": 15.896271705627441,
"learning_rate": 4.057377049180328e-05,
"loss": 0.5579,
"step": 25
},
{
"epoch": 0.4,
"eval_accuracy": 0.572,
"eval_loss": 0.7171699404716492,
"eval_runtime": 9.4618,
"eval_samples_per_second": 26.422,
"eval_steps_per_second": 3.382,
"step": 25
},
{
"epoch": 0.416,
"grad_norm": 6.284942626953125,
"learning_rate": 4.016393442622951e-05,
"loss": 0.619,
"step": 26
},
{
"epoch": 0.416,
"eval_accuracy": 0.576,
"eval_loss": 0.7149707078933716,
"eval_runtime": 9.4462,
"eval_samples_per_second": 26.466,
"eval_steps_per_second": 3.388,
"step": 26
},
{
"epoch": 0.432,
"grad_norm": 7.851229667663574,
"learning_rate": 3.975409836065574e-05,
"loss": 0.6796,
"step": 27
},
{
"epoch": 0.432,
"eval_accuracy": 0.572,
"eval_loss": 0.7145332098007202,
"eval_runtime": 9.4497,
"eval_samples_per_second": 26.456,
"eval_steps_per_second": 3.386,
"step": 27
},
{
"epoch": 0.448,
"grad_norm": 6.50039529800415,
"learning_rate": 3.934426229508197e-05,
"loss": 0.8046,
"step": 28
},
{
"epoch": 0.448,
"eval_accuracy": 0.568,
"eval_loss": 0.7118340134620667,
"eval_runtime": 9.4429,
"eval_samples_per_second": 26.475,
"eval_steps_per_second": 3.389,
"step": 28
},
{
"epoch": 0.464,
"grad_norm": 10.894524574279785,
"learning_rate": 3.89344262295082e-05,
"loss": 0.6829,
"step": 29
},
{
"epoch": 0.464,
"eval_accuracy": 0.568,
"eval_loss": 0.7091230750083923,
"eval_runtime": 9.4254,
"eval_samples_per_second": 26.524,
"eval_steps_per_second": 3.395,
"step": 29
},
{
"epoch": 0.48,
"grad_norm": 17.76140594482422,
"learning_rate": 3.8524590163934424e-05,
"loss": 0.8194,
"step": 30
},
{
"epoch": 0.48,
"eval_accuracy": 0.548,
"eval_loss": 0.7109335660934448,
"eval_runtime": 9.4302,
"eval_samples_per_second": 26.511,
"eval_steps_per_second": 3.393,
"step": 30
},
{
"epoch": 0.496,
"grad_norm": 4.884728908538818,
"learning_rate": 3.8114754098360655e-05,
"loss": 0.6432,
"step": 31
},
{
"epoch": 0.496,
"eval_accuracy": 0.536,
"eval_loss": 0.7137030959129333,
"eval_runtime": 9.4055,
"eval_samples_per_second": 26.58,
"eval_steps_per_second": 3.402,
"step": 31
},
{
"epoch": 0.512,
"grad_norm": 8.217907905578613,
"learning_rate": 3.7704918032786885e-05,
"loss": 0.6199,
"step": 32
},
{
"epoch": 0.512,
"eval_accuracy": 0.536,
"eval_loss": 0.7147109508514404,
"eval_runtime": 9.4314,
"eval_samples_per_second": 26.507,
"eval_steps_per_second": 3.393,
"step": 32
},
{
"epoch": 0.528,
"grad_norm": 5.067286014556885,
"learning_rate": 3.729508196721312e-05,
"loss": 0.5238,
"step": 33
},
{
"epoch": 0.528,
"eval_accuracy": 0.528,
"eval_loss": 0.7139023542404175,
"eval_runtime": 9.4057,
"eval_samples_per_second": 26.579,
"eval_steps_per_second": 3.402,
"step": 33
},
{
"epoch": 0.544,
"grad_norm": 9.185476303100586,
"learning_rate": 3.6885245901639346e-05,
"loss": 0.5065,
"step": 34
},
{
"epoch": 0.544,
"eval_accuracy": 0.54,
"eval_loss": 0.7080722451210022,
"eval_runtime": 9.4148,
"eval_samples_per_second": 26.554,
"eval_steps_per_second": 3.399,
"step": 34
},
{
"epoch": 0.56,
"grad_norm": 10.447481155395508,
"learning_rate": 3.6475409836065576e-05,
"loss": 0.7825,
"step": 35
},
{
"epoch": 0.56,
"eval_accuracy": 0.556,
"eval_loss": 0.7053359150886536,
"eval_runtime": 9.4329,
"eval_samples_per_second": 26.503,
"eval_steps_per_second": 3.392,
"step": 35
},
{
"epoch": 0.576,
"grad_norm": 9.977537155151367,
"learning_rate": 3.6065573770491806e-05,
"loss": 0.7256,
"step": 36
},
{
"epoch": 0.576,
"eval_accuracy": 0.556,
"eval_loss": 0.7060820460319519,
"eval_runtime": 9.426,
"eval_samples_per_second": 26.522,
"eval_steps_per_second": 3.395,
"step": 36
},
{
"epoch": 0.592,
"grad_norm": 8.119141578674316,
"learning_rate": 3.5655737704918037e-05,
"loss": 0.7407,
"step": 37
},
{
"epoch": 0.592,
"eval_accuracy": 0.544,
"eval_loss": 0.7100077867507935,
"eval_runtime": 9.4303,
"eval_samples_per_second": 26.51,
"eval_steps_per_second": 3.393,
"step": 37
},
{
"epoch": 0.608,
"grad_norm": 13.609740257263184,
"learning_rate": 3.524590163934427e-05,
"loss": 0.6665,
"step": 38
},
{
"epoch": 0.608,
"eval_accuracy": 0.544,
"eval_loss": 0.7075429558753967,
"eval_runtime": 9.4113,
"eval_samples_per_second": 26.564,
"eval_steps_per_second": 3.4,
"step": 38
},
{
"epoch": 0.624,
"grad_norm": 22.365285873413086,
"learning_rate": 3.483606557377049e-05,
"loss": 0.8188,
"step": 39
},
{
"epoch": 0.624,
"eval_accuracy": 0.564,
"eval_loss": 0.7029336094856262,
"eval_runtime": 9.4257,
"eval_samples_per_second": 26.523,
"eval_steps_per_second": 3.395,
"step": 39
},
{
"epoch": 0.64,
"grad_norm": 10.358452796936035,
"learning_rate": 3.442622950819672e-05,
"loss": 0.6671,
"step": 40
},
{
"epoch": 0.64,
"eval_accuracy": 0.568,
"eval_loss": 0.6954512000083923,
"eval_runtime": 9.4493,
"eval_samples_per_second": 26.457,
"eval_steps_per_second": 3.386,
"step": 40
},
{
"epoch": 0.656,
"grad_norm": 15.979942321777344,
"learning_rate": 3.401639344262295e-05,
"loss": 0.7222,
"step": 41
},
{
"epoch": 0.656,
"eval_accuracy": 0.568,
"eval_loss": 0.6924257874488831,
"eval_runtime": 9.4502,
"eval_samples_per_second": 26.454,
"eval_steps_per_second": 3.386,
"step": 41
},
{
"epoch": 0.672,
"grad_norm": 16.25983428955078,
"learning_rate": 3.360655737704918e-05,
"loss": 0.7285,
"step": 42
},
{
"epoch": 0.672,
"eval_accuracy": 0.576,
"eval_loss": 0.6920918226242065,
"eval_runtime": 9.4123,
"eval_samples_per_second": 26.561,
"eval_steps_per_second": 3.4,
"step": 42
},
{
"epoch": 0.688,
"grad_norm": 7.8817853927612305,
"learning_rate": 3.319672131147541e-05,
"loss": 0.7068,
"step": 43
},
{
"epoch": 0.688,
"eval_accuracy": 0.588,
"eval_loss": 0.693978488445282,
"eval_runtime": 9.4142,
"eval_samples_per_second": 26.556,
"eval_steps_per_second": 3.399,
"step": 43
},
{
"epoch": 0.704,
"grad_norm": 11.203206062316895,
"learning_rate": 3.2786885245901635e-05,
"loss": 0.613,
"step": 44
},
{
"epoch": 0.704,
"eval_accuracy": 0.6,
"eval_loss": 0.6923867464065552,
"eval_runtime": 9.4098,
"eval_samples_per_second": 26.568,
"eval_steps_per_second": 3.401,
"step": 44
},
{
"epoch": 0.72,
"grad_norm": 8.55033016204834,
"learning_rate": 3.237704918032787e-05,
"loss": 0.5672,
"step": 45
},
{
"epoch": 0.72,
"eval_accuracy": 0.604,
"eval_loss": 0.695925772190094,
"eval_runtime": 9.4467,
"eval_samples_per_second": 26.464,
"eval_steps_per_second": 3.387,
"step": 45
},
{
"epoch": 0.736,
"grad_norm": 9.487948417663574,
"learning_rate": 3.19672131147541e-05,
"loss": 0.6208,
"step": 46
},
{
"epoch": 0.736,
"eval_accuracy": 0.604,
"eval_loss": 0.7002148628234863,
"eval_runtime": 9.4163,
"eval_samples_per_second": 26.55,
"eval_steps_per_second": 3.398,
"step": 46
},
{
"epoch": 0.752,
"grad_norm": 7.840662479400635,
"learning_rate": 3.155737704918033e-05,
"loss": 0.6282,
"step": 47
},
{
"epoch": 0.752,
"eval_accuracy": 0.608,
"eval_loss": 0.7034921646118164,
"eval_runtime": 9.4244,
"eval_samples_per_second": 26.527,
"eval_steps_per_second": 3.395,
"step": 47
},
{
"epoch": 0.768,
"grad_norm": 6.098258972167969,
"learning_rate": 3.114754098360656e-05,
"loss": 0.6129,
"step": 48
},
{
"epoch": 0.768,
"eval_accuracy": 0.604,
"eval_loss": 0.7040849328041077,
"eval_runtime": 9.3957,
"eval_samples_per_second": 26.608,
"eval_steps_per_second": 3.406,
"step": 48
},
{
"epoch": 0.784,
"grad_norm": 7.861691951751709,
"learning_rate": 3.073770491803279e-05,
"loss": 0.6396,
"step": 49
},
{
"epoch": 0.784,
"eval_accuracy": 0.608,
"eval_loss": 0.7040830254554749,
"eval_runtime": 9.396,
"eval_samples_per_second": 26.607,
"eval_steps_per_second": 3.406,
"step": 49
},
{
"epoch": 0.8,
"grad_norm": 9.376338958740234,
"learning_rate": 3.0327868852459017e-05,
"loss": 0.5983,
"step": 50
},
{
"epoch": 0.8,
"eval_accuracy": 0.608,
"eval_loss": 0.7050849795341492,
"eval_runtime": 9.4089,
"eval_samples_per_second": 26.571,
"eval_steps_per_second": 3.401,
"step": 50
},
{
"epoch": 0.816,
"grad_norm": 8.683838844299316,
"learning_rate": 2.9918032786885248e-05,
"loss": 0.6681,
"step": 51
},
{
"epoch": 0.816,
"eval_accuracy": 0.604,
"eval_loss": 0.705935537815094,
"eval_runtime": 9.3804,
"eval_samples_per_second": 26.651,
"eval_steps_per_second": 3.411,
"step": 51
},
{
"epoch": 0.832,
"grad_norm": 17.765621185302734,
"learning_rate": 2.9508196721311478e-05,
"loss": 0.8503,
"step": 52
},
{
"epoch": 0.832,
"eval_accuracy": 0.604,
"eval_loss": 0.6994922161102295,
"eval_runtime": 9.4185,
"eval_samples_per_second": 26.544,
"eval_steps_per_second": 3.398,
"step": 52
},
{
"epoch": 0.848,
"grad_norm": 15.548516273498535,
"learning_rate": 2.9098360655737705e-05,
"loss": 0.7585,
"step": 53
},
{
"epoch": 0.848,
"eval_accuracy": 0.6,
"eval_loss": 0.692019522190094,
"eval_runtime": 9.5871,
"eval_samples_per_second": 26.077,
"eval_steps_per_second": 3.338,
"step": 53
},
{
"epoch": 0.864,
"grad_norm": 8.666825294494629,
"learning_rate": 2.8688524590163935e-05,
"loss": 0.5713,
"step": 54
},
{
"epoch": 0.864,
"eval_accuracy": 0.6,
"eval_loss": 0.68896484375,
"eval_runtime": 9.4277,
"eval_samples_per_second": 26.518,
"eval_steps_per_second": 3.394,
"step": 54
},
{
"epoch": 0.88,
"grad_norm": 16.585477828979492,
"learning_rate": 2.8278688524590162e-05,
"loss": 0.7261,
"step": 55
},
{
"epoch": 0.88,
"eval_accuracy": 0.6,
"eval_loss": 0.6847422122955322,
"eval_runtime": 9.4025,
"eval_samples_per_second": 26.589,
"eval_steps_per_second": 3.403,
"step": 55
},
{
"epoch": 0.896,
"grad_norm": 17.52354621887207,
"learning_rate": 2.7868852459016392e-05,
"loss": 0.7457,
"step": 56
},
{
"epoch": 0.896,
"eval_accuracy": 0.604,
"eval_loss": 0.6801777482032776,
"eval_runtime": 9.4431,
"eval_samples_per_second": 26.474,
"eval_steps_per_second": 3.389,
"step": 56
},
{
"epoch": 0.912,
"grad_norm": 14.731335639953613,
"learning_rate": 2.7459016393442626e-05,
"loss": 0.8242,
"step": 57
},
{
"epoch": 0.912,
"eval_accuracy": 0.576,
"eval_loss": 0.6787323951721191,
"eval_runtime": 9.4268,
"eval_samples_per_second": 26.52,
"eval_steps_per_second": 3.395,
"step": 57
},
{
"epoch": 0.928,
"grad_norm": 6.853959083557129,
"learning_rate": 2.7049180327868856e-05,
"loss": 0.7688,
"step": 58
},
{
"epoch": 0.928,
"eval_accuracy": 0.568,
"eval_loss": 0.6817187666893005,
"eval_runtime": 9.4078,
"eval_samples_per_second": 26.574,
"eval_steps_per_second": 3.401,
"step": 58
},
{
"epoch": 0.944,
"grad_norm": 13.072829246520996,
"learning_rate": 2.6639344262295087e-05,
"loss": 0.5804,
"step": 59
},
{
"epoch": 0.944,
"eval_accuracy": 0.572,
"eval_loss": 0.685714840888977,
"eval_runtime": 9.422,
"eval_samples_per_second": 26.534,
"eval_steps_per_second": 3.396,
"step": 59
},
{
"epoch": 0.96,
"grad_norm": 8.29138469696045,
"learning_rate": 2.6229508196721314e-05,
"loss": 0.8167,
"step": 60
},
{
"epoch": 0.96,
"eval_accuracy": 0.568,
"eval_loss": 0.6867265701293945,
"eval_runtime": 9.4234,
"eval_samples_per_second": 26.53,
"eval_steps_per_second": 3.396,
"step": 60
},
{
"epoch": 0.976,
"grad_norm": 5.209651470184326,
"learning_rate": 2.5819672131147544e-05,
"loss": 0.5874,
"step": 61
},
{
"epoch": 0.976,
"eval_accuracy": 0.576,
"eval_loss": 0.6885351538658142,
"eval_runtime": 9.4091,
"eval_samples_per_second": 26.57,
"eval_steps_per_second": 3.401,
"step": 61
},
{
"epoch": 0.992,
"grad_norm": 8.127976417541504,
"learning_rate": 2.540983606557377e-05,
"loss": 0.6197,
"step": 62
},
{
"epoch": 0.992,
"eval_accuracy": 0.572,
"eval_loss": 0.6853671669960022,
"eval_runtime": 9.4343,
"eval_samples_per_second": 26.499,
"eval_steps_per_second": 3.392,
"step": 62
},
{
"epoch": 1.008,
"grad_norm": 4.938397407531738,
"learning_rate": 2.5e-05,
"loss": 0.6458,
"step": 63
},
{
"epoch": 1.008,
"eval_accuracy": 0.584,
"eval_loss": 0.6829023361206055,
"eval_runtime": 9.4315,
"eval_samples_per_second": 26.507,
"eval_steps_per_second": 3.393,
"step": 63
},
{
"epoch": 1.024,
"grad_norm": 15.248034477233887,
"learning_rate": 2.459016393442623e-05,
"loss": 0.7218,
"step": 64
},
{
"epoch": 1.024,
"eval_accuracy": 0.592,
"eval_loss": 0.6791366934776306,
"eval_runtime": 9.4284,
"eval_samples_per_second": 26.516,
"eval_steps_per_second": 3.394,
"step": 64
},
{
"epoch": 1.04,
"grad_norm": 5.217968463897705,
"learning_rate": 2.418032786885246e-05,
"loss": 0.6869,
"step": 65
},
{
"epoch": 1.04,
"eval_accuracy": 0.592,
"eval_loss": 0.6775898337364197,
"eval_runtime": 9.4135,
"eval_samples_per_second": 26.558,
"eval_steps_per_second": 3.399,
"step": 65
},
{
"epoch": 1.056,
"grad_norm": 8.960049629211426,
"learning_rate": 2.377049180327869e-05,
"loss": 0.7135,
"step": 66
},
{
"epoch": 1.056,
"eval_accuracy": 0.592,
"eval_loss": 0.6763710975646973,
"eval_runtime": 9.43,
"eval_samples_per_second": 26.511,
"eval_steps_per_second": 3.393,
"step": 66
},
{
"epoch": 1.072,
"grad_norm": 14.524127960205078,
"learning_rate": 2.336065573770492e-05,
"loss": 0.7343,
"step": 67
},
{
"epoch": 1.072,
"eval_accuracy": 0.596,
"eval_loss": 0.673941433429718,
"eval_runtime": 9.4381,
"eval_samples_per_second": 26.488,
"eval_steps_per_second": 3.39,
"step": 67
},
{
"epoch": 1.088,
"grad_norm": 14.215781211853027,
"learning_rate": 2.295081967213115e-05,
"loss": 0.7439,
"step": 68
},
{
"epoch": 1.088,
"eval_accuracy": 0.596,
"eval_loss": 0.6748945116996765,
"eval_runtime": 9.4059,
"eval_samples_per_second": 26.579,
"eval_steps_per_second": 3.402,
"step": 68
},
{
"epoch": 1.104,
"grad_norm": 5.426934719085693,
"learning_rate": 2.254098360655738e-05,
"loss": 0.5504,
"step": 69
},
{
"epoch": 1.104,
"eval_accuracy": 0.6,
"eval_loss": 0.6768242120742798,
"eval_runtime": 9.4117,
"eval_samples_per_second": 26.563,
"eval_steps_per_second": 3.4,
"step": 69
},
{
"epoch": 1.12,
"grad_norm": 14.354090690612793,
"learning_rate": 2.2131147540983607e-05,
"loss": 0.696,
"step": 70
},
{
"epoch": 1.12,
"eval_accuracy": 0.596,
"eval_loss": 0.6765508055686951,
"eval_runtime": 9.4291,
"eval_samples_per_second": 26.514,
"eval_steps_per_second": 3.394,
"step": 70
},
{
"epoch": 1.1360000000000001,
"grad_norm": 11.328275680541992,
"learning_rate": 2.1721311475409837e-05,
"loss": 0.6042,
"step": 71
},
{
"epoch": 1.1360000000000001,
"eval_accuracy": 0.596,
"eval_loss": 0.6768398284912109,
"eval_runtime": 9.4156,
"eval_samples_per_second": 26.552,
"eval_steps_per_second": 3.399,
"step": 71
},
{
"epoch": 1.152,
"grad_norm": 9.158403396606445,
"learning_rate": 2.1311475409836064e-05,
"loss": 0.4853,
"step": 72
},
{
"epoch": 1.152,
"eval_accuracy": 0.604,
"eval_loss": 0.6750390529632568,
"eval_runtime": 9.4378,
"eval_samples_per_second": 26.489,
"eval_steps_per_second": 3.391,
"step": 72
},
{
"epoch": 1.168,
"grad_norm": 7.848287105560303,
"learning_rate": 2.0901639344262298e-05,
"loss": 0.6744,
"step": 73
},
{
"epoch": 1.168,
"eval_accuracy": 0.6,
"eval_loss": 0.6753163933753967,
"eval_runtime": 9.4125,
"eval_samples_per_second": 26.56,
"eval_steps_per_second": 3.4,
"step": 73
},
{
"epoch": 1.184,
"grad_norm": 11.083074569702148,
"learning_rate": 2.0491803278688525e-05,
"loss": 0.7398,
"step": 74
},
{
"epoch": 1.184,
"eval_accuracy": 0.596,
"eval_loss": 0.676925778388977,
"eval_runtime": 9.421,
"eval_samples_per_second": 26.536,
"eval_steps_per_second": 3.397,
"step": 74
},
{
"epoch": 1.2,
"grad_norm": 8.224617958068848,
"learning_rate": 2.0081967213114755e-05,
"loss": 0.6029,
"step": 75
},
{
"epoch": 1.2,
"eval_accuracy": 0.596,
"eval_loss": 0.677783191204071,
"eval_runtime": 9.4291,
"eval_samples_per_second": 26.514,
"eval_steps_per_second": 3.394,
"step": 75
},
{
"epoch": 1.216,
"grad_norm": 17.132051467895508,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.6935,
"step": 76
},
{
"epoch": 1.216,
"eval_accuracy": 0.596,
"eval_loss": 0.6787539124488831,
"eval_runtime": 9.4075,
"eval_samples_per_second": 26.575,
"eval_steps_per_second": 3.402,
"step": 76
},
{
"epoch": 1.232,
"grad_norm": 8.447811126708984,
"learning_rate": 1.9262295081967212e-05,
"loss": 0.7292,
"step": 77
},
{
"epoch": 1.232,
"eval_accuracy": 0.6,
"eval_loss": 0.6795663833618164,
"eval_runtime": 9.4049,
"eval_samples_per_second": 26.582,
"eval_steps_per_second": 3.402,
"step": 77
},
{
"epoch": 1.248,
"grad_norm": 4.971631050109863,
"learning_rate": 1.8852459016393442e-05,
"loss": 0.6192,
"step": 78
},
{
"epoch": 1.248,
"eval_accuracy": 0.6,
"eval_loss": 0.6786601543426514,
"eval_runtime": 9.4102,
"eval_samples_per_second": 26.567,
"eval_steps_per_second": 3.401,
"step": 78
},
{
"epoch": 1.264,
"grad_norm": 8.30854320526123,
"learning_rate": 1.8442622950819673e-05,
"loss": 0.6979,
"step": 79
},
{
"epoch": 1.264,
"eval_accuracy": 0.6,
"eval_loss": 0.6776171922683716,
"eval_runtime": 9.4206,
"eval_samples_per_second": 26.537,
"eval_steps_per_second": 3.397,
"step": 79
},
{
"epoch": 1.28,
"grad_norm": 9.044068336486816,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.7554,
"step": 80
},
{
"epoch": 1.28,
"eval_accuracy": 0.596,
"eval_loss": 0.6768652200698853,
"eval_runtime": 9.4398,
"eval_samples_per_second": 26.484,
"eval_steps_per_second": 3.39,
"step": 80
},
{
"epoch": 1.296,
"grad_norm": 22.36913299560547,
"learning_rate": 1.7622950819672133e-05,
"loss": 0.7857,
"step": 81
},
{
"epoch": 1.296,
"eval_accuracy": 0.584,
"eval_loss": 0.6760781407356262,
"eval_runtime": 9.4344,
"eval_samples_per_second": 26.499,
"eval_steps_per_second": 3.392,
"step": 81
},
{
"epoch": 1.312,
"grad_norm": 9.494186401367188,
"learning_rate": 1.721311475409836e-05,
"loss": 0.7903,
"step": 82
},
{
"epoch": 1.312,
"eval_accuracy": 0.576,
"eval_loss": 0.6796757578849792,
"eval_runtime": 9.3991,
"eval_samples_per_second": 26.598,
"eval_steps_per_second": 3.405,
"step": 82
},
{
"epoch": 1.328,
"grad_norm": 6.161738395690918,
"learning_rate": 1.680327868852459e-05,
"loss": 0.714,
"step": 83
},
{
"epoch": 1.328,
"eval_accuracy": 0.576,
"eval_loss": 0.6806288957595825,
"eval_runtime": 9.424,
"eval_samples_per_second": 26.528,
"eval_steps_per_second": 3.396,
"step": 83
},
{
"epoch": 1.3439999999999999,
"grad_norm": 10.077332496643066,
"learning_rate": 1.6393442622950818e-05,
"loss": 0.7107,
"step": 84
},
{
"epoch": 1.3439999999999999,
"eval_accuracy": 0.584,
"eval_loss": 0.6848242282867432,
"eval_runtime": 9.4189,
"eval_samples_per_second": 26.542,
"eval_steps_per_second": 3.397,
"step": 84
},
{
"epoch": 1.3599999999999999,
"grad_norm": 14.34889030456543,
"learning_rate": 1.598360655737705e-05,
"loss": 0.6276,
"step": 85
},
{
"epoch": 1.3599999999999999,
"eval_accuracy": 0.588,
"eval_loss": 0.6862617135047913,
"eval_runtime": 9.4148,
"eval_samples_per_second": 26.554,
"eval_steps_per_second": 3.399,
"step": 85
},
{
"epoch": 1.376,
"grad_norm": 9.223981857299805,
"learning_rate": 1.557377049180328e-05,
"loss": 0.7295,
"step": 86
},
{
"epoch": 1.376,
"eval_accuracy": 0.588,
"eval_loss": 0.6857773661613464,
"eval_runtime": 9.4043,
"eval_samples_per_second": 26.584,
"eval_steps_per_second": 3.403,
"step": 86
},
{
"epoch": 1.392,
"grad_norm": 13.143969535827637,
"learning_rate": 1.5163934426229509e-05,
"loss": 0.6597,
"step": 87
},
{
"epoch": 1.392,
"eval_accuracy": 0.588,
"eval_loss": 0.6872578263282776,
"eval_runtime": 9.4212,
"eval_samples_per_second": 26.536,
"eval_steps_per_second": 3.397,
"step": 87
},
{
"epoch": 1.408,
"grad_norm": 22.58281898498535,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.6335,
"step": 88
},
{
"epoch": 1.408,
"eval_accuracy": 0.58,
"eval_loss": 0.6847929954528809,
"eval_runtime": 9.4232,
"eval_samples_per_second": 26.53,
"eval_steps_per_second": 3.396,
"step": 88
},
{
"epoch": 1.424,
"grad_norm": 12.670473098754883,
"learning_rate": 1.4344262295081968e-05,
"loss": 0.7245,
"step": 89
},
{
"epoch": 1.424,
"eval_accuracy": 0.572,
"eval_loss": 0.6834453344345093,
"eval_runtime": 9.4138,
"eval_samples_per_second": 26.557,
"eval_steps_per_second": 3.399,
"step": 89
},
{
"epoch": 1.44,
"grad_norm": 20.81968879699707,
"learning_rate": 1.3934426229508196e-05,
"loss": 0.5546,
"step": 90
},
{
"epoch": 1.44,
"eval_accuracy": 0.568,
"eval_loss": 0.6808554530143738,
"eval_runtime": 9.4208,
"eval_samples_per_second": 26.537,
"eval_steps_per_second": 3.397,
"step": 90
},
{
"epoch": 1.456,
"grad_norm": 8.033720016479492,
"learning_rate": 1.3524590163934428e-05,
"loss": 0.6482,
"step": 91
},
{
"epoch": 1.456,
"eval_accuracy": 0.568,
"eval_loss": 0.6760781407356262,
"eval_runtime": 9.408,
"eval_samples_per_second": 26.573,
"eval_steps_per_second": 3.401,
"step": 91
},
{
"epoch": 1.472,
"grad_norm": 9.656173706054688,
"learning_rate": 1.3114754098360657e-05,
"loss": 0.6814,
"step": 92
},
{
"epoch": 1.472,
"eval_accuracy": 0.572,
"eval_loss": 0.6791015863418579,
"eval_runtime": 9.4039,
"eval_samples_per_second": 26.585,
"eval_steps_per_second": 3.403,
"step": 92
},
{
"epoch": 1.488,
"grad_norm": 4.5396599769592285,
"learning_rate": 1.2704918032786885e-05,
"loss": 0.5693,
"step": 93
},
{
"epoch": 1.488,
"eval_accuracy": 0.584,
"eval_loss": 0.6775078177452087,
"eval_runtime": 9.4321,
"eval_samples_per_second": 26.505,
"eval_steps_per_second": 3.393,
"step": 93
},
{
"epoch": 1.504,
"grad_norm": 11.05844783782959,
"learning_rate": 1.2295081967213116e-05,
"loss": 0.5369,
"step": 94
},
{
"epoch": 1.504,
"eval_accuracy": 0.58,
"eval_loss": 0.6771523356437683,
"eval_runtime": 9.4156,
"eval_samples_per_second": 26.552,
"eval_steps_per_second": 3.399,
"step": 94
},
{
"epoch": 1.52,
"grad_norm": 19.972246170043945,
"learning_rate": 1.1885245901639344e-05,
"loss": 0.7144,
"step": 95
},
{
"epoch": 1.52,
"eval_accuracy": 0.576,
"eval_loss": 0.6779101490974426,
"eval_runtime": 9.4028,
"eval_samples_per_second": 26.588,
"eval_steps_per_second": 3.403,
"step": 95
},
{
"epoch": 1.536,
"grad_norm": 11.014993667602539,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.6405,
"step": 96
},
{
"epoch": 1.536,
"eval_accuracy": 0.564,
"eval_loss": 0.6772187352180481,
"eval_runtime": 9.4126,
"eval_samples_per_second": 26.56,
"eval_steps_per_second": 3.4,
"step": 96
},
{
"epoch": 1.552,
"grad_norm": 8.04190444946289,
"learning_rate": 1.1065573770491803e-05,
"loss": 0.7893,
"step": 97
},
{
"epoch": 1.552,
"eval_accuracy": 0.584,
"eval_loss": 0.6751992106437683,
"eval_runtime": 9.4142,
"eval_samples_per_second": 26.556,
"eval_steps_per_second": 3.399,
"step": 97
},
{
"epoch": 1.568,
"grad_norm": 8.616044044494629,
"learning_rate": 1.0655737704918032e-05,
"loss": 0.6448,
"step": 98
},
{
"epoch": 1.568,
"eval_accuracy": 0.568,
"eval_loss": 0.6759804487228394,
"eval_runtime": 9.4235,
"eval_samples_per_second": 26.529,
"eval_steps_per_second": 3.396,
"step": 98
},
{
"epoch": 1.584,
"grad_norm": 12.122180938720703,
"learning_rate": 1.0245901639344262e-05,
"loss": 0.5828,
"step": 99
},
{
"epoch": 1.584,
"eval_accuracy": 0.576,
"eval_loss": 0.6741952896118164,
"eval_runtime": 9.4162,
"eval_samples_per_second": 26.55,
"eval_steps_per_second": 3.398,
"step": 99
},
{
"epoch": 1.6,
"grad_norm": 15.246779441833496,
"learning_rate": 9.836065573770493e-06,
"loss": 0.6762,
"step": 100
},
{
"epoch": 1.6,
"eval_accuracy": 0.572,
"eval_loss": 0.6730703115463257,
"eval_runtime": 9.406,
"eval_samples_per_second": 26.579,
"eval_steps_per_second": 3.402,
"step": 100
},
{
"epoch": 1.616,
"grad_norm": 16.69089126586914,
"learning_rate": 9.426229508196721e-06,
"loss": 0.6432,
"step": 101
},
{
"epoch": 1.616,
"eval_accuracy": 0.584,
"eval_loss": 0.6738671660423279,
"eval_runtime": 9.4165,
"eval_samples_per_second": 26.549,
"eval_steps_per_second": 3.398,
"step": 101
},
{
"epoch": 1.6320000000000001,
"grad_norm": 8.9694242477417,
"learning_rate": 9.016393442622952e-06,
"loss": 0.5826,
"step": 102
},
{
"epoch": 1.6320000000000001,
"eval_accuracy": 0.58,
"eval_loss": 0.6729843616485596,
"eval_runtime": 9.4844,
"eval_samples_per_second": 26.359,
"eval_steps_per_second": 3.374,
"step": 102
},
{
"epoch": 1.6480000000000001,
"grad_norm": 9.330092430114746,
"learning_rate": 8.60655737704918e-06,
"loss": 0.6224,
"step": 103
},
{
"epoch": 1.6480000000000001,
"eval_accuracy": 0.584,
"eval_loss": 0.673214852809906,
"eval_runtime": 9.4192,
"eval_samples_per_second": 26.541,
"eval_steps_per_second": 3.397,
"step": 103
},
{
"epoch": 1.6640000000000001,
"grad_norm": 7.138861179351807,
"learning_rate": 8.196721311475409e-06,
"loss": 0.6262,
"step": 104
},
{
"epoch": 1.6640000000000001,
"eval_accuracy": 0.592,
"eval_loss": 0.6745429635047913,
"eval_runtime": 9.4226,
"eval_samples_per_second": 26.532,
"eval_steps_per_second": 3.396,
"step": 104
},
{
"epoch": 1.6800000000000002,
"grad_norm": 7.4160356521606445,
"learning_rate": 7.78688524590164e-06,
"loss": 0.6451,
"step": 105
},
{
"epoch": 1.6800000000000002,
"eval_accuracy": 0.592,
"eval_loss": 0.6730429530143738,
"eval_runtime": 9.4489,
"eval_samples_per_second": 26.458,
"eval_steps_per_second": 3.387,
"step": 105
},
{
"epoch": 1.696,
"grad_norm": 5.479573726654053,
"learning_rate": 7.3770491803278695e-06,
"loss": 0.5948,
"step": 106
},
{
"epoch": 1.696,
"eval_accuracy": 0.6,
"eval_loss": 0.6731171607971191,
"eval_runtime": 9.4414,
"eval_samples_per_second": 26.479,
"eval_steps_per_second": 3.389,
"step": 106
},
{
"epoch": 1.712,
"grad_norm": 9.357452392578125,
"learning_rate": 6.967213114754098e-06,
"loss": 0.7451,
"step": 107
},
{
"epoch": 1.712,
"eval_accuracy": 0.58,
"eval_loss": 0.6747695207595825,
"eval_runtime": 9.4087,
"eval_samples_per_second": 26.571,
"eval_steps_per_second": 3.401,
"step": 107
},
{
"epoch": 1.728,
"grad_norm": 10.986834526062012,
"learning_rate": 6.557377049180328e-06,
"loss": 0.5922,
"step": 108
},
{
"epoch": 1.728,
"eval_accuracy": 0.588,
"eval_loss": 0.6725429892539978,
"eval_runtime": 9.4208,
"eval_samples_per_second": 26.537,
"eval_steps_per_second": 3.397,
"step": 108
},
{
"epoch": 1.744,
"grad_norm": 6.625186920166016,
"learning_rate": 6.147540983606558e-06,
"loss": 0.6454,
"step": 109
},
{
"epoch": 1.744,
"eval_accuracy": 0.592,
"eval_loss": 0.6714960932731628,
"eval_runtime": 9.4316,
"eval_samples_per_second": 26.507,
"eval_steps_per_second": 3.393,
"step": 109
},
{
"epoch": 1.76,
"grad_norm": 9.619455337524414,
"learning_rate": 5.737704918032787e-06,
"loss": 0.601,
"step": 110
},
{
"epoch": 1.76,
"eval_accuracy": 0.596,
"eval_loss": 0.671625018119812,
"eval_runtime": 9.4295,
"eval_samples_per_second": 26.512,
"eval_steps_per_second": 3.394,
"step": 110
},
{
"epoch": 1.776,
"grad_norm": 10.5454683303833,
"learning_rate": 5.327868852459016e-06,
"loss": 0.7236,
"step": 111
},
{
"epoch": 1.776,
"eval_accuracy": 0.592,
"eval_loss": 0.6704453229904175,
"eval_runtime": 9.4138,
"eval_samples_per_second": 26.557,
"eval_steps_per_second": 3.399,
"step": 111
},
{
"epoch": 1.792,
"grad_norm": 9.553342819213867,
"learning_rate": 4.918032786885246e-06,
"loss": 0.7825,
"step": 112
},
{
"epoch": 1.792,
"eval_accuracy": 0.596,
"eval_loss": 0.673535168170929,
"eval_runtime": 9.4206,
"eval_samples_per_second": 26.538,
"eval_steps_per_second": 3.397,
"step": 112
},
{
"epoch": 1.808,
"grad_norm": 7.810243129730225,
"learning_rate": 4.508196721311476e-06,
"loss": 0.6302,
"step": 113
},
{
"epoch": 1.808,
"eval_accuracy": 0.584,
"eval_loss": 0.670703113079071,
"eval_runtime": 9.5051,
"eval_samples_per_second": 26.302,
"eval_steps_per_second": 3.367,
"step": 113
},
{
"epoch": 1.8239999999999998,
"grad_norm": 15.086982727050781,
"learning_rate": 4.098360655737704e-06,
"loss": 0.6824,
"step": 114
},
{
"epoch": 1.8239999999999998,
"eval_accuracy": 0.584,
"eval_loss": 0.6711757779121399,
"eval_runtime": 9.432,
"eval_samples_per_second": 26.505,
"eval_steps_per_second": 3.393,
"step": 114
},
{
"epoch": 1.8399999999999999,
"grad_norm": 13.564058303833008,
"learning_rate": 3.6885245901639347e-06,
"loss": 0.6208,
"step": 115
},
{
"epoch": 1.8399999999999999,
"eval_accuracy": 0.588,
"eval_loss": 0.6693046689033508,
"eval_runtime": 9.4215,
"eval_samples_per_second": 26.535,
"eval_steps_per_second": 3.396,
"step": 115
},
{
"epoch": 1.8559999999999999,
"grad_norm": 7.943946361541748,
"learning_rate": 3.278688524590164e-06,
"loss": 0.6987,
"step": 116
},
{
"epoch": 1.8559999999999999,
"eval_accuracy": 0.588,
"eval_loss": 0.671625018119812,
"eval_runtime": 9.4001,
"eval_samples_per_second": 26.595,
"eval_steps_per_second": 3.404,
"step": 116
},
{
"epoch": 1.8719999999999999,
"grad_norm": 6.293920993804932,
"learning_rate": 2.8688524590163937e-06,
"loss": 0.5587,
"step": 117
},
{
"epoch": 1.8719999999999999,
"eval_accuracy": 0.588,
"eval_loss": 0.670785129070282,
"eval_runtime": 9.3933,
"eval_samples_per_second": 26.615,
"eval_steps_per_second": 3.407,
"step": 117
},
{
"epoch": 1.888,
"grad_norm": 5.374147415161133,
"learning_rate": 2.459016393442623e-06,
"loss": 0.6304,
"step": 118
},
{
"epoch": 1.888,
"eval_accuracy": 0.592,
"eval_loss": 0.6705155968666077,
"eval_runtime": 9.4015,
"eval_samples_per_second": 26.592,
"eval_steps_per_second": 3.404,
"step": 118
},
{
"epoch": 1.904,
"grad_norm": 11.269082069396973,
"learning_rate": 2.049180327868852e-06,
"loss": 0.4528,
"step": 119
},
{
"epoch": 1.904,
"eval_accuracy": 0.584,
"eval_loss": 0.6711132526397705,
"eval_runtime": 9.4407,
"eval_samples_per_second": 26.481,
"eval_steps_per_second": 3.39,
"step": 119
},
{
"epoch": 1.92,
"grad_norm": 20.449726104736328,
"learning_rate": 1.639344262295082e-06,
"loss": 0.7061,
"step": 120
},
{
"epoch": 1.92,
"eval_accuracy": 0.58,
"eval_loss": 0.6705625057220459,
"eval_runtime": 9.4641,
"eval_samples_per_second": 26.416,
"eval_steps_per_second": 3.381,
"step": 120
},
{
"epoch": 1.936,
"grad_norm": 13.892779350280762,
"learning_rate": 1.2295081967213116e-06,
"loss": 0.5595,
"step": 121
},
{
"epoch": 1.936,
"eval_accuracy": 0.588,
"eval_loss": 0.670035183429718,
"eval_runtime": 9.4443,
"eval_samples_per_second": 26.471,
"eval_steps_per_second": 3.388,
"step": 121
},
{
"epoch": 1.952,
"grad_norm": 4.646062850952148,
"learning_rate": 8.19672131147541e-07,
"loss": 0.5968,
"step": 122
},
{
"epoch": 1.952,
"eval_accuracy": 0.588,
"eval_loss": 0.6705195307731628,
"eval_runtime": 9.4452,
"eval_samples_per_second": 26.468,
"eval_steps_per_second": 3.388,
"step": 122
},
{
"epoch": 1.968,
"grad_norm": 5.045331001281738,
"learning_rate": 4.098360655737705e-07,
"loss": 0.577,
"step": 123
},
{
"epoch": 1.968,
"eval_accuracy": 0.584,
"eval_loss": 0.6710820198059082,
"eval_runtime": 9.4702,
"eval_samples_per_second": 26.399,
"eval_steps_per_second": 3.379,
"step": 123
},
{
"epoch": 1.984,
"grad_norm": 12.286917686462402,
"learning_rate": 0.0,
"loss": 0.5765,
"step": 124
},
{
"epoch": 1.984,
"eval_accuracy": 0.58,
"eval_loss": 0.6720273494720459,
"eval_runtime": 9.4365,
"eval_samples_per_second": 26.493,
"eval_steps_per_second": 3.391,
"step": 124
},
{
"epoch": 1.984,
"step": 124,
"total_flos": 1.3708912645636096e+16,
"train_loss": 0.6877071011450983,
"train_runtime": 1489.9136,
"train_samples_per_second": 1.342,
"train_steps_per_second": 0.083
}
],
"logging_steps": 1,
"max_steps": 124,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 1.3708912645636096e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}