{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.971509971509972, "eval_steps": 100, "global_step": 10500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "eval_loss": 0.3346759080886841, "eval_runtime": 218.6538, "eval_samples_per_second": 103.456, "eval_steps_per_second": 6.467, "step": 100 }, { "epoch": 0.19, "eval_loss": 0.31195777654647827, "eval_runtime": 218.3816, "eval_samples_per_second": 103.585, "eval_steps_per_second": 6.475, "step": 200 }, { "epoch": 0.28, "eval_loss": 0.31596091389656067, "eval_runtime": 220.5173, "eval_samples_per_second": 102.582, "eval_steps_per_second": 6.412, "step": 300 }, { "epoch": 0.38, "eval_loss": 0.3161667287349701, "eval_runtime": 218.5361, "eval_samples_per_second": 103.512, "eval_steps_per_second": 6.47, "step": 400 }, { "epoch": 0.47, "learning_rate": 4.7625830959164296e-05, "loss": 0.1792, "step": 500 }, { "epoch": 0.47, "eval_loss": 0.3097754120826721, "eval_runtime": 219.0553, "eval_samples_per_second": 103.266, "eval_steps_per_second": 6.455, "step": 500 }, { "epoch": 0.57, "eval_loss": 0.30482736229896545, "eval_runtime": 218.842, "eval_samples_per_second": 103.367, "eval_steps_per_second": 6.461, "step": 600 }, { "epoch": 0.66, "eval_loss": 0.3050496578216553, "eval_runtime": 219.6074, "eval_samples_per_second": 103.007, "eval_steps_per_second": 6.439, "step": 700 }, { "epoch": 0.76, "eval_loss": 0.30001184344291687, "eval_runtime": 219.2072, "eval_samples_per_second": 103.195, "eval_steps_per_second": 6.451, "step": 800 }, { "epoch": 0.85, "eval_loss": 0.3052184283733368, "eval_runtime": 219.1722, "eval_samples_per_second": 103.211, "eval_steps_per_second": 6.452, "step": 900 }, { "epoch": 0.95, "learning_rate": 4.5251661918328584e-05, "loss": 0.1198, "step": 1000 }, { "epoch": 0.95, "eval_loss": 0.3005639314651489, "eval_runtime": 218.7588, "eval_samples_per_second": 103.406, "eval_steps_per_second": 6.464, "step": 1000 }, { "epoch": 1.04, "eval_loss": 0.29650744795799255, "eval_runtime": 218.5819, "eval_samples_per_second": 103.49, "eval_steps_per_second": 6.469, "step": 1100 }, { "epoch": 1.14, "eval_loss": 0.2948579788208008, "eval_runtime": 219.004, "eval_samples_per_second": 103.29, "eval_steps_per_second": 6.457, "step": 1200 }, { "epoch": 1.23, "eval_loss": 0.28149962425231934, "eval_runtime": 218.3331, "eval_samples_per_second": 103.608, "eval_steps_per_second": 6.476, "step": 1300 }, { "epoch": 1.33, "eval_loss": 0.28821900486946106, "eval_runtime": 218.9147, "eval_samples_per_second": 103.333, "eval_steps_per_second": 6.459, "step": 1400 }, { "epoch": 1.42, "learning_rate": 4.287749287749288e-05, "loss": 0.1092, "step": 1500 }, { "epoch": 1.42, "eval_loss": 0.28900569677352905, "eval_runtime": 218.7775, "eval_samples_per_second": 103.397, "eval_steps_per_second": 6.463, "step": 1500 }, { "epoch": 1.52, "eval_loss": 0.2834137976169586, "eval_runtime": 218.2122, "eval_samples_per_second": 103.665, "eval_steps_per_second": 6.48, "step": 1600 }, { "epoch": 1.61, "eval_loss": 0.2864611744880676, "eval_runtime": 220.2143, "eval_samples_per_second": 102.723, "eval_steps_per_second": 6.421, "step": 1700 }, { "epoch": 1.71, "eval_loss": 0.28005194664001465, "eval_runtime": 218.8438, "eval_samples_per_second": 103.366, "eval_steps_per_second": 6.461, "step": 1800 }, { "epoch": 1.8, "eval_loss": 0.28052985668182373, "eval_runtime": 220.1784, "eval_samples_per_second": 102.739, "eval_steps_per_second": 6.422, "step": 1900 }, { "epoch": 1.9, "learning_rate": 4.050332383665717e-05, "loss": 0.099, "step": 2000 }, { "epoch": 1.9, "eval_loss": 0.2817462384700775, "eval_runtime": 219.1612, "eval_samples_per_second": 103.216, "eval_steps_per_second": 6.452, "step": 2000 }, { "epoch": 1.99, "eval_loss": 0.2855830788612366, "eval_runtime": 218.4225, "eval_samples_per_second": 103.565, "eval_steps_per_second": 6.474, "step": 2100 }, { "epoch": 2.09, "eval_loss": 0.2786203622817993, "eval_runtime": 220.2549, "eval_samples_per_second": 102.704, "eval_steps_per_second": 6.42, "step": 2200 }, { "epoch": 2.18, "eval_loss": 0.282156765460968, "eval_runtime": 218.8776, "eval_samples_per_second": 103.35, "eval_steps_per_second": 6.46, "step": 2300 }, { "epoch": 2.28, "eval_loss": 0.2802504599094391, "eval_runtime": 218.6184, "eval_samples_per_second": 103.473, "eval_steps_per_second": 6.468, "step": 2400 }, { "epoch": 2.37, "learning_rate": 3.8129154795821466e-05, "loss": 0.094, "step": 2500 }, { "epoch": 2.37, "eval_loss": 0.28312984108924866, "eval_runtime": 219.677, "eval_samples_per_second": 102.974, "eval_steps_per_second": 6.437, "step": 2500 }, { "epoch": 2.47, "eval_loss": 0.2841149866580963, "eval_runtime": 219.3064, "eval_samples_per_second": 103.148, "eval_steps_per_second": 6.448, "step": 2600 }, { "epoch": 2.56, "eval_loss": 0.2737499177455902, "eval_runtime": 218.5172, "eval_samples_per_second": 103.52, "eval_steps_per_second": 6.471, "step": 2700 }, { "epoch": 2.66, "eval_loss": 0.27663424611091614, "eval_runtime": 219.4686, "eval_samples_per_second": 103.072, "eval_steps_per_second": 6.443, "step": 2800 }, { "epoch": 2.75, "eval_loss": 0.27256855368614197, "eval_runtime": 218.7767, "eval_samples_per_second": 103.398, "eval_steps_per_second": 6.463, "step": 2900 }, { "epoch": 2.85, "learning_rate": 3.575498575498576e-05, "loss": 0.0891, "step": 3000 }, { "epoch": 2.85, "eval_loss": 0.27302590012550354, "eval_runtime": 218.602, "eval_samples_per_second": 103.48, "eval_steps_per_second": 6.468, "step": 3000 }, { "epoch": 2.94, "eval_loss": 0.27365198731422424, "eval_runtime": 219.6229, "eval_samples_per_second": 102.999, "eval_steps_per_second": 6.438, "step": 3100 }, { "epoch": 3.04, "eval_loss": 0.26861417293548584, "eval_runtime": 219.0414, "eval_samples_per_second": 103.273, "eval_steps_per_second": 6.455, "step": 3200 }, { "epoch": 3.13, "eval_loss": 0.2664526700973511, "eval_runtime": 218.6581, "eval_samples_per_second": 103.454, "eval_steps_per_second": 6.467, "step": 3300 }, { "epoch": 3.23, "eval_loss": 0.27480828762054443, "eval_runtime": 220.5641, "eval_samples_per_second": 102.56, "eval_steps_per_second": 6.411, "step": 3400 }, { "epoch": 3.32, "learning_rate": 3.338081671415005e-05, "loss": 0.0862, "step": 3500 }, { "epoch": 3.32, "eval_loss": 0.26794129610061646, "eval_runtime": 218.5638, "eval_samples_per_second": 103.498, "eval_steps_per_second": 6.47, "step": 3500 }, { "epoch": 3.42, "eval_loss": 0.2703064978122711, "eval_runtime": 218.4977, "eval_samples_per_second": 103.53, "eval_steps_per_second": 6.471, "step": 3600 }, { "epoch": 3.51, "eval_loss": 0.2635132670402527, "eval_runtime": 219.2033, "eval_samples_per_second": 103.196, "eval_steps_per_second": 6.451, "step": 3700 }, { "epoch": 3.61, "eval_loss": 0.27066901326179504, "eval_runtime": 219.2382, "eval_samples_per_second": 103.18, "eval_steps_per_second": 6.45, "step": 3800 }, { "epoch": 3.7, "eval_loss": 0.26447921991348267, "eval_runtime": 219.2131, "eval_samples_per_second": 103.192, "eval_steps_per_second": 6.45, "step": 3900 }, { "epoch": 3.8, "learning_rate": 3.100664767331434e-05, "loss": 0.0838, "step": 4000 }, { "epoch": 3.8, "eval_loss": 0.2692434787750244, "eval_runtime": 219.4395, "eval_samples_per_second": 103.085, "eval_steps_per_second": 6.444, "step": 4000 }, { "epoch": 3.89, "eval_loss": 0.2642222046852112, "eval_runtime": 219.413, "eval_samples_per_second": 103.098, "eval_steps_per_second": 6.444, "step": 4100 }, { "epoch": 3.99, "eval_loss": 0.2643529176712036, "eval_runtime": 219.2041, "eval_samples_per_second": 103.196, "eval_steps_per_second": 6.451, "step": 4200 }, { "epoch": 4.08, "eval_loss": 0.25718453526496887, "eval_runtime": 219.1679, "eval_samples_per_second": 103.213, "eval_steps_per_second": 6.452, "step": 4300 }, { "epoch": 4.18, "eval_loss": 0.26762890815734863, "eval_runtime": 218.6308, "eval_samples_per_second": 103.467, "eval_steps_per_second": 6.468, "step": 4400 }, { "epoch": 4.27, "learning_rate": 2.863247863247863e-05, "loss": 0.0761, "step": 4500 }, { "epoch": 4.27, "eval_loss": 0.26568803191185, "eval_runtime": 220.7096, "eval_samples_per_second": 102.492, "eval_steps_per_second": 6.407, "step": 4500 }, { "epoch": 4.37, "eval_loss": 0.26290062069892883, "eval_runtime": 219.3895, "eval_samples_per_second": 103.109, "eval_steps_per_second": 6.445, "step": 4600 }, { "epoch": 4.46, "eval_loss": 0.26172617077827454, "eval_runtime": 219.4776, "eval_samples_per_second": 103.067, "eval_steps_per_second": 6.443, "step": 4700 }, { "epoch": 4.56, "eval_loss": 0.26161935925483704, "eval_runtime": 219.3677, "eval_samples_per_second": 103.119, "eval_steps_per_second": 6.446, "step": 4800 }, { "epoch": 4.65, "eval_loss": 0.2620932459831238, "eval_runtime": 218.6157, "eval_samples_per_second": 103.474, "eval_steps_per_second": 6.468, "step": 4900 }, { "epoch": 4.75, "learning_rate": 2.6258309591642926e-05, "loss": 0.0741, "step": 5000 }, { "epoch": 4.75, "eval_loss": 0.26253631711006165, "eval_runtime": 218.59, "eval_samples_per_second": 103.486, "eval_steps_per_second": 6.469, "step": 5000 }, { "epoch": 4.84, "eval_loss": 0.25699007511138916, "eval_runtime": 219.4178, "eval_samples_per_second": 103.096, "eval_steps_per_second": 6.444, "step": 5100 }, { "epoch": 4.94, "eval_loss": 0.2583966851234436, "eval_runtime": 218.5589, "eval_samples_per_second": 103.501, "eval_steps_per_second": 6.47, "step": 5200 }, { "epoch": 5.03, "eval_loss": 0.25885534286499023, "eval_runtime": 219.0916, "eval_samples_per_second": 103.249, "eval_steps_per_second": 6.454, "step": 5300 }, { "epoch": 5.13, "eval_loss": 0.25685915350914, "eval_runtime": 219.2049, "eval_samples_per_second": 103.196, "eval_steps_per_second": 6.451, "step": 5400 }, { "epoch": 5.22, "learning_rate": 2.388414055080722e-05, "loss": 0.0769, "step": 5500 }, { "epoch": 5.22, "eval_loss": 0.2619025707244873, "eval_runtime": 219.3928, "eval_samples_per_second": 103.107, "eval_steps_per_second": 6.445, "step": 5500 }, { "epoch": 5.32, "eval_loss": 0.25433388352394104, "eval_runtime": 219.2766, "eval_samples_per_second": 103.162, "eval_steps_per_second": 6.448, "step": 5600 }, { "epoch": 5.41, "eval_loss": 0.25473591685295105, "eval_runtime": 219.1683, "eval_samples_per_second": 103.213, "eval_steps_per_second": 6.452, "step": 5700 }, { "epoch": 5.51, "eval_loss": 0.2583990693092346, "eval_runtime": 218.8181, "eval_samples_per_second": 103.378, "eval_steps_per_second": 6.462, "step": 5800 }, { "epoch": 5.6, "eval_loss": 0.2513364255428314, "eval_runtime": 219.6254, "eval_samples_per_second": 102.998, "eval_steps_per_second": 6.438, "step": 5900 }, { "epoch": 5.7, "learning_rate": 2.150997150997151e-05, "loss": 0.0701, "step": 6000 }, { "epoch": 5.7, "eval_loss": 0.25798743963241577, "eval_runtime": 218.5917, "eval_samples_per_second": 103.485, "eval_steps_per_second": 6.469, "step": 6000 }, { "epoch": 5.79, "eval_loss": 0.252897173166275, "eval_runtime": 218.8327, "eval_samples_per_second": 103.371, "eval_steps_per_second": 6.462, "step": 6100 }, { "epoch": 5.89, "eval_loss": 0.25619062781333923, "eval_runtime": 218.7633, "eval_samples_per_second": 103.404, "eval_steps_per_second": 6.464, "step": 6200 }, { "epoch": 5.98, "eval_loss": 0.25197675824165344, "eval_runtime": 218.5961, "eval_samples_per_second": 103.483, "eval_steps_per_second": 6.469, "step": 6300 }, { "epoch": 6.08, "eval_loss": 0.2581734359264374, "eval_runtime": 219.6175, "eval_samples_per_second": 103.002, "eval_steps_per_second": 6.438, "step": 6400 }, { "epoch": 6.17, "learning_rate": 1.91358024691358e-05, "loss": 0.0684, "step": 6500 }, { "epoch": 6.17, "eval_loss": 0.2550990581512451, "eval_runtime": 218.6851, "eval_samples_per_second": 103.441, "eval_steps_per_second": 6.466, "step": 6500 }, { "epoch": 6.27, "eval_loss": 0.2555626332759857, "eval_runtime": 219.3776, "eval_samples_per_second": 103.114, "eval_steps_per_second": 6.446, "step": 6600 }, { "epoch": 6.36, "eval_loss": 0.25543132424354553, "eval_runtime": 218.6367, "eval_samples_per_second": 103.464, "eval_steps_per_second": 6.467, "step": 6700 }, { "epoch": 6.46, "eval_loss": 0.25568485260009766, "eval_runtime": 219.9462, "eval_samples_per_second": 102.848, "eval_steps_per_second": 6.429, "step": 6800 }, { "epoch": 6.55, "eval_loss": 0.25582244992256165, "eval_runtime": 218.6889, "eval_samples_per_second": 103.439, "eval_steps_per_second": 6.466, "step": 6900 }, { "epoch": 6.65, "learning_rate": 1.6761633428300098e-05, "loss": 0.0662, "step": 7000 }, { "epoch": 6.65, "eval_loss": 0.25293371081352234, "eval_runtime": 219.8419, "eval_samples_per_second": 102.897, "eval_steps_per_second": 6.432, "step": 7000 }, { "epoch": 6.74, "eval_loss": 0.249311164021492, "eval_runtime": 219.1307, "eval_samples_per_second": 103.231, "eval_steps_per_second": 6.453, "step": 7100 }, { "epoch": 6.84, "eval_loss": 0.25428083539009094, "eval_runtime": 218.6179, "eval_samples_per_second": 103.473, "eval_steps_per_second": 6.468, "step": 7200 }, { "epoch": 6.93, "eval_loss": 0.25905051827430725, "eval_runtime": 219.7362, "eval_samples_per_second": 102.946, "eval_steps_per_second": 6.435, "step": 7300 }, { "epoch": 7.03, "eval_loss": 0.25161299109458923, "eval_runtime": 218.5372, "eval_samples_per_second": 103.511, "eval_steps_per_second": 6.47, "step": 7400 }, { "epoch": 7.12, "learning_rate": 1.4387464387464389e-05, "loss": 0.0659, "step": 7500 }, { "epoch": 7.12, "eval_loss": 0.2567010223865509, "eval_runtime": 219.8696, "eval_samples_per_second": 102.884, "eval_steps_per_second": 6.431, "step": 7500 }, { "epoch": 7.22, "eval_loss": 0.2568127512931824, "eval_runtime": 218.4916, "eval_samples_per_second": 103.533, "eval_steps_per_second": 6.472, "step": 7600 }, { "epoch": 7.31, "eval_loss": 0.24921847879886627, "eval_runtime": 219.9605, "eval_samples_per_second": 102.841, "eval_steps_per_second": 6.428, "step": 7700 }, { "epoch": 7.41, "eval_loss": 0.24751408398151398, "eval_runtime": 218.5269, "eval_samples_per_second": 103.516, "eval_steps_per_second": 6.471, "step": 7800 }, { "epoch": 7.5, "eval_loss": 0.24565500020980835, "eval_runtime": 219.112, "eval_samples_per_second": 103.239, "eval_steps_per_second": 6.453, "step": 7900 }, { "epoch": 7.6, "learning_rate": 1.2013295346628681e-05, "loss": 0.0641, "step": 8000 }, { "epoch": 7.6, "eval_loss": 0.2555699646472931, "eval_runtime": 218.4879, "eval_samples_per_second": 103.534, "eval_steps_per_second": 6.472, "step": 8000 }, { "epoch": 7.69, "eval_loss": 0.25907498598098755, "eval_runtime": 218.8796, "eval_samples_per_second": 103.349, "eval_steps_per_second": 6.46, "step": 8100 }, { "epoch": 7.79, "eval_loss": 0.2498636245727539, "eval_runtime": 218.2333, "eval_samples_per_second": 103.655, "eval_steps_per_second": 6.479, "step": 8200 }, { "epoch": 7.88, "eval_loss": 0.24620996415615082, "eval_runtime": 218.4049, "eval_samples_per_second": 103.574, "eval_steps_per_second": 6.474, "step": 8300 }, { "epoch": 7.98, "eval_loss": 0.2514709234237671, "eval_runtime": 218.5279, "eval_samples_per_second": 103.515, "eval_steps_per_second": 6.471, "step": 8400 }, { "epoch": 8.07, "learning_rate": 9.639126305792973e-06, "loss": 0.0614, "step": 8500 }, { "epoch": 8.07, "eval_loss": 0.24893517792224884, "eval_runtime": 218.4312, "eval_samples_per_second": 103.561, "eval_steps_per_second": 6.473, "step": 8500 }, { "epoch": 8.17, "eval_loss": 0.24635082483291626, "eval_runtime": 218.6536, "eval_samples_per_second": 103.456, "eval_steps_per_second": 6.467, "step": 8600 }, { "epoch": 8.26, "eval_loss": 0.24810662865638733, "eval_runtime": 218.4312, "eval_samples_per_second": 103.561, "eval_steps_per_second": 6.473, "step": 8700 }, { "epoch": 8.36, "eval_loss": 0.24930644035339355, "eval_runtime": 218.7482, "eval_samples_per_second": 103.411, "eval_steps_per_second": 6.464, "step": 8800 }, { "epoch": 8.45, "eval_loss": 0.24013535678386688, "eval_runtime": 218.398, "eval_samples_per_second": 103.577, "eval_steps_per_second": 6.474, "step": 8900 }, { "epoch": 8.55, "learning_rate": 7.264957264957266e-06, "loss": 0.0609, "step": 9000 }, { "epoch": 8.55, "eval_loss": 0.2461770623922348, "eval_runtime": 218.9012, "eval_samples_per_second": 103.339, "eval_steps_per_second": 6.46, "step": 9000 }, { "epoch": 8.64, "eval_loss": 0.24963241815567017, "eval_runtime": 218.2697, "eval_samples_per_second": 103.638, "eval_steps_per_second": 6.478, "step": 9100 }, { "epoch": 8.74, "eval_loss": 0.24689340591430664, "eval_runtime": 218.5291, "eval_samples_per_second": 103.515, "eval_steps_per_second": 6.471, "step": 9200 }, { "epoch": 8.83, "eval_loss": 0.2498210072517395, "eval_runtime": 218.2515, "eval_samples_per_second": 103.646, "eval_steps_per_second": 6.479, "step": 9300 }, { "epoch": 8.93, "eval_loss": 0.24612723290920258, "eval_runtime": 218.3684, "eval_samples_per_second": 103.591, "eval_steps_per_second": 6.475, "step": 9400 }, { "epoch": 9.02, "learning_rate": 4.890788224121558e-06, "loss": 0.0588, "step": 9500 }, { "epoch": 9.02, "eval_loss": 0.247446671128273, "eval_runtime": 218.2559, "eval_samples_per_second": 103.644, "eval_steps_per_second": 6.479, "step": 9500 }, { "epoch": 9.12, "eval_loss": 0.24682562053203583, "eval_runtime": 218.332, "eval_samples_per_second": 103.608, "eval_steps_per_second": 6.476, "step": 9600 }, { "epoch": 9.21, "eval_loss": 0.2363210916519165, "eval_runtime": 218.2595, "eval_samples_per_second": 103.643, "eval_steps_per_second": 6.479, "step": 9700 }, { "epoch": 9.31, "eval_loss": 0.24492982029914856, "eval_runtime": 218.2962, "eval_samples_per_second": 103.625, "eval_steps_per_second": 6.477, "step": 9800 }, { "epoch": 9.4, "eval_loss": 0.247760608792305, "eval_runtime": 218.2063, "eval_samples_per_second": 103.668, "eval_steps_per_second": 6.48, "step": 9900 }, { "epoch": 9.5, "learning_rate": 2.51661918328585e-06, "loss": 0.0604, "step": 10000 }, { "epoch": 9.5, "eval_loss": 0.24903397262096405, "eval_runtime": 218.2843, "eval_samples_per_second": 103.631, "eval_steps_per_second": 6.478, "step": 10000 }, { "epoch": 9.59, "eval_loss": 0.25066474080085754, "eval_runtime": 218.2601, "eval_samples_per_second": 103.642, "eval_steps_per_second": 6.479, "step": 10100 }, { "epoch": 9.69, "eval_loss": 0.24707233905792236, "eval_runtime": 218.2446, "eval_samples_per_second": 103.65, "eval_steps_per_second": 6.479, "step": 10200 }, { "epoch": 9.78, "eval_loss": 0.24911357462406158, "eval_runtime": 218.2948, "eval_samples_per_second": 103.626, "eval_steps_per_second": 6.477, "step": 10300 }, { "epoch": 9.88, "eval_loss": 0.24460090696811676, "eval_runtime": 218.238, "eval_samples_per_second": 103.653, "eval_steps_per_second": 6.479, "step": 10400 }, { "epoch": 9.97, "learning_rate": 1.4245014245014247e-07, "loss": 0.0573, "step": 10500 }, { "epoch": 9.97, "eval_loss": 0.24572330713272095, "eval_runtime": 218.3788, "eval_samples_per_second": 103.586, "eval_steps_per_second": 6.475, "step": 10500 } ], "logging_steps": 500, "max_steps": 10530, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.7676748920639283e+17, "trial_name": null, "trial_params": null }