{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.444444444444445, "eval_steps": 10, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08888888888888889, "grad_norm": 3.339972496032715, "learning_rate": 4e-05, "loss": 2.2556, "step": 10 }, { "epoch": 0.08888888888888889, "eval_loss": 1.899280071258545, "eval_runtime": 8.2861, "eval_samples_per_second": 12.068, "eval_steps_per_second": 12.068, "step": 10 }, { "epoch": 0.17777777777777778, "grad_norm": 4.27776575088501, "learning_rate": 8e-05, "loss": 1.5026, "step": 20 }, { "epoch": 0.17777777777777778, "eval_loss": 0.8436458706855774, "eval_runtime": 8.2809, "eval_samples_per_second": 12.076, "eval_steps_per_second": 12.076, "step": 20 }, { "epoch": 0.26666666666666666, "grad_norm": 2.9827308654785156, "learning_rate": 0.00012, "loss": 0.8291, "step": 30 }, { "epoch": 0.26666666666666666, "eval_loss": 0.6633767485618591, "eval_runtime": 8.2735, "eval_samples_per_second": 12.087, "eval_steps_per_second": 12.087, "step": 30 }, { "epoch": 0.35555555555555557, "grad_norm": 2.1213743686676025, "learning_rate": 0.00016, "loss": 0.6688, "step": 40 }, { "epoch": 0.35555555555555557, "eval_loss": 0.5723012685775757, "eval_runtime": 8.273, "eval_samples_per_second": 12.087, "eval_steps_per_second": 12.087, "step": 40 }, { "epoch": 0.4444444444444444, "grad_norm": 2.0819756984710693, "learning_rate": 0.0002, "loss": 0.6339, "step": 50 }, { "epoch": 0.4444444444444444, "eval_loss": 0.5510777831077576, "eval_runtime": 8.2675, "eval_samples_per_second": 12.096, "eval_steps_per_second": 12.096, "step": 50 }, { "epoch": 0.5333333333333333, "grad_norm": 1.9045137166976929, "learning_rate": 0.00019975640502598244, "loss": 0.5258, "step": 60 }, { "epoch": 0.5333333333333333, "eval_loss": 0.4759778082370758, "eval_runtime": 8.2161, "eval_samples_per_second": 12.171, "eval_steps_per_second": 12.171, "step": 60 }, { "epoch": 0.6222222222222222, "grad_norm": 1.4711352586746216, "learning_rate": 0.00019902680687415705, "loss": 0.4825, "step": 70 }, { "epoch": 0.6222222222222222, "eval_loss": 0.4696222245693207, "eval_runtime": 8.0384, "eval_samples_per_second": 12.44, "eval_steps_per_second": 12.44, "step": 70 }, { "epoch": 0.7111111111111111, "grad_norm": 1.3467007875442505, "learning_rate": 0.00019781476007338058, "loss": 0.5488, "step": 80 }, { "epoch": 0.7111111111111111, "eval_loss": 0.4698783755302429, "eval_runtime": 8.0224, "eval_samples_per_second": 12.465, "eval_steps_per_second": 12.465, "step": 80 }, { "epoch": 0.8, "grad_norm": 1.3427356481552124, "learning_rate": 0.0001961261695938319, "loss": 0.4231, "step": 90 }, { "epoch": 0.8, "eval_loss": 0.45969489216804504, "eval_runtime": 8.0315, "eval_samples_per_second": 12.451, "eval_steps_per_second": 12.451, "step": 90 }, { "epoch": 0.8888888888888888, "grad_norm": 1.3022139072418213, "learning_rate": 0.00019396926207859084, "loss": 0.4558, "step": 100 }, { "epoch": 0.8888888888888888, "eval_loss": 0.4219018220901489, "eval_runtime": 8.0414, "eval_samples_per_second": 12.436, "eval_steps_per_second": 12.436, "step": 100 }, { "epoch": 0.9777777777777777, "grad_norm": 1.149550199508667, "learning_rate": 0.0001913545457642601, "loss": 0.4588, "step": 110 }, { "epoch": 0.9777777777777777, "eval_loss": 0.41754281520843506, "eval_runtime": 8.0271, "eval_samples_per_second": 12.458, "eval_steps_per_second": 12.458, "step": 110 }, { "epoch": 1.0666666666666667, "grad_norm": 1.335131049156189, "learning_rate": 0.00018829475928589271, "loss": 0.4592, "step": 120 }, { "epoch": 1.0666666666666667, "eval_loss": 0.4288429617881775, "eval_runtime": 8.0429, "eval_samples_per_second": 12.433, "eval_steps_per_second": 12.433, "step": 120 }, { "epoch": 1.1555555555555554, "grad_norm": 1.0977760553359985, "learning_rate": 0.0001848048096156426, "loss": 0.2996, "step": 130 }, { "epoch": 1.1555555555555554, "eval_loss": 0.3918496072292328, "eval_runtime": 8.0388, "eval_samples_per_second": 12.44, "eval_steps_per_second": 12.44, "step": 130 }, { "epoch": 1.2444444444444445, "grad_norm": 1.511498212814331, "learning_rate": 0.00018090169943749476, "loss": 0.3269, "step": 140 }, { "epoch": 1.2444444444444445, "eval_loss": 0.41831883788108826, "eval_runtime": 8.0461, "eval_samples_per_second": 12.428, "eval_steps_per_second": 12.428, "step": 140 }, { "epoch": 1.3333333333333333, "grad_norm": 1.6732897758483887, "learning_rate": 0.0001766044443118978, "loss": 0.347, "step": 150 }, { "epoch": 1.3333333333333333, "eval_loss": 0.43138808012008667, "eval_runtime": 8.0313, "eval_samples_per_second": 12.451, "eval_steps_per_second": 12.451, "step": 150 }, { "epoch": 1.4222222222222223, "grad_norm": 1.5496058464050293, "learning_rate": 0.0001719339800338651, "loss": 0.3251, "step": 160 }, { "epoch": 1.4222222222222223, "eval_loss": 0.3889056444168091, "eval_runtime": 8.0322, "eval_samples_per_second": 12.45, "eval_steps_per_second": 12.45, "step": 160 }, { "epoch": 1.511111111111111, "grad_norm": 1.3534138202667236, "learning_rate": 0.00016691306063588583, "loss": 0.3035, "step": 170 }, { "epoch": 1.511111111111111, "eval_loss": 0.37887275218963623, "eval_runtime": 8.0354, "eval_samples_per_second": 12.445, "eval_steps_per_second": 12.445, "step": 170 }, { "epoch": 1.6, "grad_norm": 1.4648184776306152, "learning_rate": 0.0001615661475325658, "loss": 0.3141, "step": 180 }, { "epoch": 1.6, "eval_loss": 0.38691258430480957, "eval_runtime": 8.0374, "eval_samples_per_second": 12.442, "eval_steps_per_second": 12.442, "step": 180 }, { "epoch": 1.6888888888888889, "grad_norm": 1.0812690258026123, "learning_rate": 0.0001559192903470747, "loss": 0.2878, "step": 190 }, { "epoch": 1.6888888888888889, "eval_loss": 0.3909819722175598, "eval_runtime": 8.0411, "eval_samples_per_second": 12.436, "eval_steps_per_second": 12.436, "step": 190 }, { "epoch": 1.7777777777777777, "grad_norm": 1.936132788658142, "learning_rate": 0.00015000000000000001, "loss": 0.3063, "step": 200 }, { "epoch": 1.7777777777777777, "eval_loss": 0.39576366543769836, "eval_runtime": 8.0433, "eval_samples_per_second": 12.433, "eval_steps_per_second": 12.433, "step": 200 }, { "epoch": 1.8666666666666667, "grad_norm": 1.551062822341919, "learning_rate": 0.00014383711467890774, "loss": 0.2748, "step": 210 }, { "epoch": 1.8666666666666667, "eval_loss": 0.3819361627101898, "eval_runtime": 8.0342, "eval_samples_per_second": 12.447, "eval_steps_per_second": 12.447, "step": 210 }, { "epoch": 1.9555555555555557, "grad_norm": 1.2996041774749756, "learning_rate": 0.00013746065934159123, "loss": 0.2725, "step": 220 }, { "epoch": 1.9555555555555557, "eval_loss": 0.4039897620677948, "eval_runtime": 8.024, "eval_samples_per_second": 12.463, "eval_steps_per_second": 12.463, "step": 220 }, { "epoch": 2.0444444444444443, "grad_norm": 1.0756527185440063, "learning_rate": 0.00013090169943749476, "loss": 0.2897, "step": 230 }, { "epoch": 2.0444444444444443, "eval_loss": 0.3928260803222656, "eval_runtime": 8.0625, "eval_samples_per_second": 12.403, "eval_steps_per_second": 12.403, "step": 230 }, { "epoch": 2.1333333333333333, "grad_norm": 0.8828668594360352, "learning_rate": 0.00012419218955996676, "loss": 0.1813, "step": 240 }, { "epoch": 2.1333333333333333, "eval_loss": 0.4048071801662445, "eval_runtime": 8.0422, "eval_samples_per_second": 12.434, "eval_steps_per_second": 12.434, "step": 240 }, { "epoch": 2.2222222222222223, "grad_norm": 1.6169154644012451, "learning_rate": 0.00011736481776669306, "loss": 0.1965, "step": 250 }, { "epoch": 2.2222222222222223, "eval_loss": 0.4035675525665283, "eval_runtime": 8.0344, "eval_samples_per_second": 12.447, "eval_steps_per_second": 12.447, "step": 250 }, { "epoch": 2.311111111111111, "grad_norm": 1.6981542110443115, "learning_rate": 0.00011045284632676536, "loss": 0.1751, "step": 260 }, { "epoch": 2.311111111111111, "eval_loss": 0.4221409857273102, "eval_runtime": 8.0296, "eval_samples_per_second": 12.454, "eval_steps_per_second": 12.454, "step": 260 }, { "epoch": 2.4, "grad_norm": 1.5531835556030273, "learning_rate": 0.00010348994967025012, "loss": 0.1739, "step": 270 }, { "epoch": 2.4, "eval_loss": 0.40371406078338623, "eval_runtime": 8.017, "eval_samples_per_second": 12.474, "eval_steps_per_second": 12.474, "step": 270 }, { "epoch": 2.488888888888889, "grad_norm": 1.6275184154510498, "learning_rate": 9.651005032974994e-05, "loss": 0.1629, "step": 280 }, { "epoch": 2.488888888888889, "eval_loss": 0.41767382621765137, "eval_runtime": 8.026, "eval_samples_per_second": 12.46, "eval_steps_per_second": 12.46, "step": 280 }, { "epoch": 2.5777777777777775, "grad_norm": 1.6192083358764648, "learning_rate": 8.954715367323468e-05, "loss": 0.1919, "step": 290 }, { "epoch": 2.5777777777777775, "eval_loss": 0.4002458155155182, "eval_runtime": 8.0399, "eval_samples_per_second": 12.438, "eval_steps_per_second": 12.438, "step": 290 }, { "epoch": 2.6666666666666665, "grad_norm": 1.4782391786575317, "learning_rate": 8.263518223330697e-05, "loss": 0.1804, "step": 300 }, { "epoch": 2.6666666666666665, "eval_loss": 0.4098145365715027, "eval_runtime": 8.0329, "eval_samples_per_second": 12.449, "eval_steps_per_second": 12.449, "step": 300 }, { "epoch": 2.7555555555555555, "grad_norm": 1.2380214929580688, "learning_rate": 7.580781044003324e-05, "loss": 0.1569, "step": 310 }, { "epoch": 2.7555555555555555, "eval_loss": 0.41249945759773254, "eval_runtime": 8.0306, "eval_samples_per_second": 12.452, "eval_steps_per_second": 12.452, "step": 310 }, { "epoch": 2.8444444444444446, "grad_norm": 1.0430532693862915, "learning_rate": 6.909830056250527e-05, "loss": 0.1914, "step": 320 }, { "epoch": 2.8444444444444446, "eval_loss": 0.4051912724971771, "eval_runtime": 8.0353, "eval_samples_per_second": 12.445, "eval_steps_per_second": 12.445, "step": 320 }, { "epoch": 2.9333333333333336, "grad_norm": 0.8478929400444031, "learning_rate": 6.25393406584088e-05, "loss": 0.144, "step": 330 }, { "epoch": 2.9333333333333336, "eval_loss": 0.4041104018688202, "eval_runtime": 8.0292, "eval_samples_per_second": 12.455, "eval_steps_per_second": 12.455, "step": 330 }, { "epoch": 3.022222222222222, "grad_norm": 0.7308769822120667, "learning_rate": 5.616288532109225e-05, "loss": 0.1738, "step": 340 }, { "epoch": 3.022222222222222, "eval_loss": 0.42209112644195557, "eval_runtime": 8.0367, "eval_samples_per_second": 12.443, "eval_steps_per_second": 12.443, "step": 340 }, { "epoch": 3.111111111111111, "grad_norm": 0.47878319025039673, "learning_rate": 5.000000000000002e-05, "loss": 0.1087, "step": 350 }, { "epoch": 3.111111111111111, "eval_loss": 0.42136698961257935, "eval_runtime": 8.0346, "eval_samples_per_second": 12.446, "eval_steps_per_second": 12.446, "step": 350 }, { "epoch": 3.2, "grad_norm": 0.8724251389503479, "learning_rate": 4.4080709652925336e-05, "loss": 0.0876, "step": 360 }, { "epoch": 3.2, "eval_loss": 0.4379313588142395, "eval_runtime": 8.025, "eval_samples_per_second": 12.461, "eval_steps_per_second": 12.461, "step": 360 }, { "epoch": 3.2888888888888888, "grad_norm": 1.1260199546813965, "learning_rate": 3.843385246743417e-05, "loss": 0.0857, "step": 370 }, { "epoch": 3.2888888888888888, "eval_loss": 0.46546536684036255, "eval_runtime": 8.0261, "eval_samples_per_second": 12.459, "eval_steps_per_second": 12.459, "step": 370 }, { "epoch": 3.3777777777777778, "grad_norm": 0.8650846481323242, "learning_rate": 3.308693936411421e-05, "loss": 0.0978, "step": 380 }, { "epoch": 3.3777777777777778, "eval_loss": 0.4744359850883484, "eval_runtime": 8.0343, "eval_samples_per_second": 12.447, "eval_steps_per_second": 12.447, "step": 380 }, { "epoch": 3.466666666666667, "grad_norm": 1.4763766527175903, "learning_rate": 2.8066019966134904e-05, "loss": 0.0746, "step": 390 }, { "epoch": 3.466666666666667, "eval_loss": 0.4815245568752289, "eval_runtime": 8.0329, "eval_samples_per_second": 12.449, "eval_steps_per_second": 12.449, "step": 390 }, { "epoch": 3.5555555555555554, "grad_norm": 0.7600903511047363, "learning_rate": 2.339555568810221e-05, "loss": 0.0897, "step": 400 }, { "epoch": 3.5555555555555554, "eval_loss": 0.4889250695705414, "eval_runtime": 8.0394, "eval_samples_per_second": 12.439, "eval_steps_per_second": 12.439, "step": 400 }, { "epoch": 3.6444444444444444, "grad_norm": 0.7227004766464233, "learning_rate": 1.9098300562505266e-05, "loss": 0.0645, "step": 410 }, { "epoch": 3.6444444444444444, "eval_loss": 0.4995201826095581, "eval_runtime": 8.0321, "eval_samples_per_second": 12.45, "eval_steps_per_second": 12.45, "step": 410 }, { "epoch": 3.7333333333333334, "grad_norm": 1.2814204692840576, "learning_rate": 1.5195190384357404e-05, "loss": 0.0649, "step": 420 }, { "epoch": 3.7333333333333334, "eval_loss": 0.5078675746917725, "eval_runtime": 8.0419, "eval_samples_per_second": 12.435, "eval_steps_per_second": 12.435, "step": 420 }, { "epoch": 3.822222222222222, "grad_norm": 1.3571559190750122, "learning_rate": 1.1705240714107302e-05, "loss": 0.0896, "step": 430 }, { "epoch": 3.822222222222222, "eval_loss": 0.5097964406013489, "eval_runtime": 8.034, "eval_samples_per_second": 12.447, "eval_steps_per_second": 12.447, "step": 430 }, { "epoch": 3.911111111111111, "grad_norm": 1.125535488128662, "learning_rate": 8.645454235739903e-06, "loss": 0.0788, "step": 440 }, { "epoch": 3.911111111111111, "eval_loss": 0.5094956755638123, "eval_runtime": 8.0631, "eval_samples_per_second": 12.402, "eval_steps_per_second": 12.402, "step": 440 }, { "epoch": 4.0, "grad_norm": 3.562880754470825, "learning_rate": 6.030737921409169e-06, "loss": 0.0886, "step": 450 }, { "epoch": 4.0, "eval_loss": 0.51046222448349, "eval_runtime": 8.1706, "eval_samples_per_second": 12.239, "eval_steps_per_second": 12.239, "step": 450 }, { "epoch": 4.088888888888889, "grad_norm": 0.6980682611465454, "learning_rate": 3.873830406168111e-06, "loss": 0.0471, "step": 460 }, { "epoch": 4.088888888888889, "eval_loss": 0.511073887348175, "eval_runtime": 8.2725, "eval_samples_per_second": 12.088, "eval_steps_per_second": 12.088, "step": 460 }, { "epoch": 4.177777777777778, "grad_norm": 0.7668414115905762, "learning_rate": 2.1852399266194314e-06, "loss": 0.0461, "step": 470 }, { "epoch": 4.177777777777778, "eval_loss": 0.515235185623169, "eval_runtime": 8.2096, "eval_samples_per_second": 12.181, "eval_steps_per_second": 12.181, "step": 470 }, { "epoch": 4.266666666666667, "grad_norm": 1.7168527841567993, "learning_rate": 9.731931258429638e-07, "loss": 0.0607, "step": 480 }, { "epoch": 4.266666666666667, "eval_loss": 0.5151567459106445, "eval_runtime": 8.2315, "eval_samples_per_second": 12.149, "eval_steps_per_second": 12.149, "step": 480 }, { "epoch": 4.355555555555555, "grad_norm": 0.8711584806442261, "learning_rate": 2.4359497401758024e-07, "loss": 0.0473, "step": 490 }, { "epoch": 4.355555555555555, "eval_loss": 0.519190788269043, "eval_runtime": 8.2622, "eval_samples_per_second": 12.103, "eval_steps_per_second": 12.103, "step": 490 }, { "epoch": 4.444444444444445, "grad_norm": 1.0105704069137573, "learning_rate": 0.0, "loss": 0.052, "step": 500 }, { "epoch": 4.444444444444445, "eval_loss": 0.5163235664367676, "eval_runtime": 8.4465, "eval_samples_per_second": 11.839, "eval_steps_per_second": 11.839, "step": 500 }, { "epoch": 4.444444444444445, "step": 500, "total_flos": 1.496402435211264e+16, "train_loss": 0.3140584453344345, "train_runtime": 1600.6227, "train_samples_per_second": 2.499, "train_steps_per_second": 0.312 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.496402435211264e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }