diff --git "a/checkpoint-320/trainer_state.json" "b/checkpoint-320/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-320/trainer_state.json" @@ -0,0 +1,4833 @@ +{ + "best_metric": 0.6766157746315002, + "best_model_checkpoint": "./checkpoints/llava-v1.6-vicuna-7b_anyres/checkpoint-250", + "epoch": 10.0, + "eval_steps": 1.0, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03125, + "grad_norm": 1.0039058937636163, + "learning_rate": 0.0, + "loss": 1.3969, + "step": 1 + }, + { + "epoch": 0.03125, + "eval_loss": 1.4111441373825073, + "eval_runtime": 50.4639, + "eval_samples_per_second": 3.963, + "eval_steps_per_second": 0.495, + "step": 1 + }, + { + "epoch": 0.0625, + "grad_norm": 0.8420754522690636, + "learning_rate": 2e-05, + "loss": 1.3382, + "step": 2 + }, + { + "epoch": 0.0625, + "eval_loss": 1.4111441373825073, + "eval_runtime": 43.3333, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 0.577, + "step": 2 + }, + { + "epoch": 0.09375, + "grad_norm": 0.8367925175081548, + "learning_rate": 2e-05, + "loss": 1.3867, + "step": 3 + }, + { + "epoch": 0.09375, + "eval_loss": 1.3688743114471436, + "eval_runtime": 43.5247, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 0.574, + "step": 3 + }, + { + "epoch": 0.125, + "grad_norm": 0.7061648883003396, + "learning_rate": 2e-05, + "loss": 1.3331, + "step": 4 + }, + { + "epoch": 0.125, + "eval_loss": 1.3259124755859375, + "eval_runtime": 43.4317, + "eval_samples_per_second": 4.605, + "eval_steps_per_second": 0.576, + "step": 4 + }, + { + "epoch": 0.15625, + "grad_norm": 0.8059747640123492, + "learning_rate": 2e-05, + "loss": 1.3031, + "step": 5 + }, + { + "epoch": 0.15625, + "eval_loss": 1.2872124910354614, + "eval_runtime": 43.4379, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 0.576, + "step": 5 + }, + { + "epoch": 0.1875, + "grad_norm": 0.7045153329302901, + "learning_rate": 2e-05, + "loss": 1.2771, + "step": 6 + }, + { + "epoch": 0.1875, + "eval_loss": 1.2505193948745728, + "eval_runtime": 43.5902, + "eval_samples_per_second": 4.588, + "eval_steps_per_second": 0.574, + "step": 6 + }, + { + "epoch": 0.21875, + "grad_norm": 0.6329971562106237, + "learning_rate": 2e-05, + "loss": 1.249, + "step": 7 + }, + { + "epoch": 0.21875, + "eval_loss": 1.2199320793151855, + "eval_runtime": 43.4066, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 0.576, + "step": 7 + }, + { + "epoch": 0.25, + "grad_norm": 0.5550979385222247, + "learning_rate": 2e-05, + "loss": 1.2257, + "step": 8 + }, + { + "epoch": 0.25, + "eval_loss": 1.1977466344833374, + "eval_runtime": 43.5387, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 0.574, + "step": 8 + }, + { + "epoch": 0.28125, + "grad_norm": 0.4406797963422461, + "learning_rate": 2e-05, + "loss": 1.2462, + "step": 9 + }, + { + "epoch": 0.28125, + "eval_loss": 1.179214358329773, + "eval_runtime": 43.4861, + "eval_samples_per_second": 4.599, + "eval_steps_per_second": 0.575, + "step": 9 + }, + { + "epoch": 0.3125, + "grad_norm": 0.42022162096647486, + "learning_rate": 2e-05, + "loss": 1.1858, + "step": 10 + }, + { + "epoch": 0.3125, + "eval_loss": 1.1616674661636353, + "eval_runtime": 43.8611, + "eval_samples_per_second": 4.56, + "eval_steps_per_second": 0.57, + "step": 10 + }, + { + "epoch": 0.34375, + "grad_norm": 0.39691998835013426, + "learning_rate": 2e-05, + "loss": 1.235, + "step": 11 + }, + { + "epoch": 0.34375, + "eval_loss": 1.1443771123886108, + "eval_runtime": 43.5109, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 0.575, + "step": 11 + }, + { + "epoch": 0.375, + "grad_norm": 0.4500748148291364, + "learning_rate": 2e-05, + "loss": 1.1953, + "step": 12 + }, + { + "epoch": 0.375, + "eval_loss": 1.1261780261993408, + "eval_runtime": 44.8553, + "eval_samples_per_second": 4.459, + "eval_steps_per_second": 0.557, + "step": 12 + }, + { + "epoch": 0.40625, + "grad_norm": 0.4777471950803986, + "learning_rate": 2e-05, + "loss": 1.2094, + "step": 13 + }, + { + "epoch": 0.40625, + "eval_loss": 1.1074599027633667, + "eval_runtime": 43.6762, + "eval_samples_per_second": 4.579, + "eval_steps_per_second": 0.572, + "step": 13 + }, + { + "epoch": 0.4375, + "grad_norm": 0.45433160021015917, + "learning_rate": 2e-05, + "loss": 1.0426, + "step": 14 + }, + { + "epoch": 0.4375, + "eval_loss": 1.089483380317688, + "eval_runtime": 43.9528, + "eval_samples_per_second": 4.55, + "eval_steps_per_second": 0.569, + "step": 14 + }, + { + "epoch": 0.46875, + "grad_norm": 0.39854476457233645, + "learning_rate": 2e-05, + "loss": 1.1595, + "step": 15 + }, + { + "epoch": 0.46875, + "eval_loss": 1.0731947422027588, + "eval_runtime": 43.3809, + "eval_samples_per_second": 4.61, + "eval_steps_per_second": 0.576, + "step": 15 + }, + { + "epoch": 0.5, + "grad_norm": 0.41898459581564557, + "learning_rate": 2e-05, + "loss": 1.0923, + "step": 16 + }, + { + "epoch": 0.5, + "eval_loss": 1.0587964057922363, + "eval_runtime": 46.3861, + "eval_samples_per_second": 4.312, + "eval_steps_per_second": 0.539, + "step": 16 + }, + { + "epoch": 0.53125, + "grad_norm": 0.3748700393546972, + "learning_rate": 2e-05, + "loss": 1.0973, + "step": 17 + }, + { + "epoch": 0.53125, + "eval_loss": 1.0456310510635376, + "eval_runtime": 44.8571, + "eval_samples_per_second": 4.459, + "eval_steps_per_second": 0.557, + "step": 17 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5226526211782249, + "learning_rate": 2e-05, + "loss": 1.0901, + "step": 18 + }, + { + "epoch": 0.5625, + "eval_loss": 1.0317203998565674, + "eval_runtime": 44.6579, + "eval_samples_per_second": 4.478, + "eval_steps_per_second": 0.56, + "step": 18 + }, + { + "epoch": 0.59375, + "grad_norm": 0.3769885031745698, + "learning_rate": 2e-05, + "loss": 1.0033, + "step": 19 + }, + { + "epoch": 0.59375, + "eval_loss": 1.0182812213897705, + "eval_runtime": 44.6735, + "eval_samples_per_second": 4.477, + "eval_steps_per_second": 0.56, + "step": 19 + }, + { + "epoch": 0.625, + "grad_norm": 0.34752776954348064, + "learning_rate": 2e-05, + "loss": 1.1256, + "step": 20 + }, + { + "epoch": 0.625, + "eval_loss": 1.0062216520309448, + "eval_runtime": 44.4317, + "eval_samples_per_second": 4.501, + "eval_steps_per_second": 0.563, + "step": 20 + }, + { + "epoch": 0.65625, + "grad_norm": 0.275958956017114, + "learning_rate": 2e-05, + "loss": 1.0333, + "step": 21 + }, + { + "epoch": 0.65625, + "eval_loss": 0.9957399964332581, + "eval_runtime": 46.4719, + "eval_samples_per_second": 4.304, + "eval_steps_per_second": 0.538, + "step": 21 + }, + { + "epoch": 0.6875, + "grad_norm": 0.31928085878737833, + "learning_rate": 2e-05, + "loss": 1.0847, + "step": 22 + }, + { + "epoch": 0.6875, + "eval_loss": 0.9862645864486694, + "eval_runtime": 46.7925, + "eval_samples_per_second": 4.274, + "eval_steps_per_second": 0.534, + "step": 22 + }, + { + "epoch": 0.71875, + "grad_norm": 0.26966401299568643, + "learning_rate": 2e-05, + "loss": 1.0678, + "step": 23 + }, + { + "epoch": 0.71875, + "eval_loss": 0.9774981141090393, + "eval_runtime": 46.2095, + "eval_samples_per_second": 4.328, + "eval_steps_per_second": 0.541, + "step": 23 + }, + { + "epoch": 0.75, + "grad_norm": 0.24088872786986867, + "learning_rate": 2e-05, + "loss": 1.064, + "step": 24 + }, + { + "epoch": 0.75, + "eval_loss": 0.9695597887039185, + "eval_runtime": 47.1059, + "eval_samples_per_second": 4.246, + "eval_steps_per_second": 0.531, + "step": 24 + }, + { + "epoch": 0.78125, + "grad_norm": 0.27631902106476014, + "learning_rate": 2e-05, + "loss": 1.0141, + "step": 25 + }, + { + "epoch": 0.78125, + "eval_loss": 0.9618983268737793, + "eval_runtime": 46.1528, + "eval_samples_per_second": 4.333, + "eval_steps_per_second": 0.542, + "step": 25 + }, + { + "epoch": 0.8125, + "grad_norm": 0.24434161495988888, + "learning_rate": 2e-05, + "loss": 1.0376, + "step": 26 + }, + { + "epoch": 0.8125, + "eval_loss": 0.9548751711845398, + "eval_runtime": 45.7844, + "eval_samples_per_second": 4.368, + "eval_steps_per_second": 0.546, + "step": 26 + }, + { + "epoch": 0.84375, + "grad_norm": 0.25256672152337845, + "learning_rate": 2e-05, + "loss": 0.9632, + "step": 27 + }, + { + "epoch": 0.84375, + "eval_loss": 0.9482427835464478, + "eval_runtime": 47.8001, + "eval_samples_per_second": 4.184, + "eval_steps_per_second": 0.523, + "step": 27 + }, + { + "epoch": 0.875, + "grad_norm": 0.26872334126279845, + "learning_rate": 2e-05, + "loss": 0.9819, + "step": 28 + }, + { + "epoch": 0.875, + "eval_loss": 0.9416670203208923, + "eval_runtime": 47.157, + "eval_samples_per_second": 4.241, + "eval_steps_per_second": 0.53, + "step": 28 + }, + { + "epoch": 0.90625, + "grad_norm": 0.21711663558311656, + "learning_rate": 2e-05, + "loss": 0.9953, + "step": 29 + }, + { + "epoch": 0.90625, + "eval_loss": 0.9355730414390564, + "eval_runtime": 45.9328, + "eval_samples_per_second": 4.354, + "eval_steps_per_second": 0.544, + "step": 29 + }, + { + "epoch": 0.9375, + "grad_norm": 0.21636473054277702, + "learning_rate": 2e-05, + "loss": 1.0328, + "step": 30 + }, + { + "epoch": 0.9375, + "eval_loss": 0.9298823475837708, + "eval_runtime": 46.0325, + "eval_samples_per_second": 4.345, + "eval_steps_per_second": 0.543, + "step": 30 + }, + { + "epoch": 0.96875, + "grad_norm": 0.2530858798467821, + "learning_rate": 2e-05, + "loss": 0.8713, + "step": 31 + }, + { + "epoch": 0.96875, + "eval_loss": 0.9241495728492737, + "eval_runtime": 46.0309, + "eval_samples_per_second": 4.345, + "eval_steps_per_second": 0.543, + "step": 31 + }, + { + "epoch": 1.0, + "grad_norm": 0.2500917296208238, + "learning_rate": 2e-05, + "loss": 0.9831, + "step": 32 + }, + { + "epoch": 1.0, + "eval_loss": 0.9184038043022156, + "eval_runtime": 46.1304, + "eval_samples_per_second": 4.336, + "eval_steps_per_second": 0.542, + "step": 32 + }, + { + "epoch": 1.03125, + "grad_norm": 0.25563291180685294, + "learning_rate": 2e-05, + "loss": 1.0227, + "step": 33 + }, + { + "epoch": 1.03125, + "eval_loss": 0.9126191735267639, + "eval_runtime": 52.6388, + "eval_samples_per_second": 3.799, + "eval_steps_per_second": 0.475, + "step": 33 + }, + { + "epoch": 1.0625, + "grad_norm": 0.2225226787999786, + "learning_rate": 2e-05, + "loss": 1.0241, + "step": 34 + }, + { + "epoch": 1.0625, + "eval_loss": 0.9070788621902466, + "eval_runtime": 43.6322, + "eval_samples_per_second": 4.584, + "eval_steps_per_second": 0.573, + "step": 34 + }, + { + "epoch": 1.09375, + "grad_norm": 0.2052840697405099, + "learning_rate": 2e-05, + "loss": 1.0476, + "step": 35 + }, + { + "epoch": 1.09375, + "eval_loss": 0.9018412828445435, + "eval_runtime": 43.1975, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 0.579, + "step": 35 + }, + { + "epoch": 1.125, + "grad_norm": 0.23676392278447683, + "learning_rate": 2e-05, + "loss": 1.01, + "step": 36 + }, + { + "epoch": 1.125, + "eval_loss": 0.8966168761253357, + "eval_runtime": 45.9216, + "eval_samples_per_second": 4.355, + "eval_steps_per_second": 0.544, + "step": 36 + }, + { + "epoch": 1.15625, + "grad_norm": 0.22099733575664926, + "learning_rate": 2e-05, + "loss": 0.9525, + "step": 37 + }, + { + "epoch": 1.15625, + "eval_loss": 0.891795814037323, + "eval_runtime": 44.7872, + "eval_samples_per_second": 4.466, + "eval_steps_per_second": 0.558, + "step": 37 + }, + { + "epoch": 1.1875, + "grad_norm": 0.2527359179725302, + "learning_rate": 2e-05, + "loss": 0.9627, + "step": 38 + }, + { + "epoch": 1.1875, + "eval_loss": 0.8872839212417603, + "eval_runtime": 44.6369, + "eval_samples_per_second": 4.481, + "eval_steps_per_second": 0.56, + "step": 38 + }, + { + "epoch": 1.21875, + "grad_norm": 0.25432158026395235, + "learning_rate": 2e-05, + "loss": 0.9972, + "step": 39 + }, + { + "epoch": 1.21875, + "eval_loss": 0.8827975988388062, + "eval_runtime": 44.7753, + "eval_samples_per_second": 4.467, + "eval_steps_per_second": 0.558, + "step": 39 + }, + { + "epoch": 1.25, + "grad_norm": 0.24171584871667898, + "learning_rate": 2e-05, + "loss": 0.9897, + "step": 40 + }, + { + "epoch": 1.25, + "eval_loss": 0.8785097599029541, + "eval_runtime": 45.0743, + "eval_samples_per_second": 4.437, + "eval_steps_per_second": 0.555, + "step": 40 + }, + { + "epoch": 1.28125, + "grad_norm": 0.23629659647320733, + "learning_rate": 2e-05, + "loss": 0.9641, + "step": 41 + }, + { + "epoch": 1.28125, + "eval_loss": 0.8742367625236511, + "eval_runtime": 45.6624, + "eval_samples_per_second": 4.38, + "eval_steps_per_second": 0.547, + "step": 41 + }, + { + "epoch": 1.3125, + "grad_norm": 0.23515869880744614, + "learning_rate": 2e-05, + "loss": 0.9445, + "step": 42 + }, + { + "epoch": 1.3125, + "eval_loss": 0.8701191544532776, + "eval_runtime": 46.6778, + "eval_samples_per_second": 4.285, + "eval_steps_per_second": 0.536, + "step": 42 + }, + { + "epoch": 1.34375, + "grad_norm": 0.2328447853974619, + "learning_rate": 2e-05, + "loss": 0.9098, + "step": 43 + }, + { + "epoch": 1.34375, + "eval_loss": 0.8661414980888367, + "eval_runtime": 45.7682, + "eval_samples_per_second": 4.37, + "eval_steps_per_second": 0.546, + "step": 43 + }, + { + "epoch": 1.375, + "grad_norm": 0.2208565035546648, + "learning_rate": 2e-05, + "loss": 0.9269, + "step": 44 + }, + { + "epoch": 1.375, + "eval_loss": 0.8625122904777527, + "eval_runtime": 47.7405, + "eval_samples_per_second": 4.189, + "eval_steps_per_second": 0.524, + "step": 44 + }, + { + "epoch": 1.40625, + "grad_norm": 0.24194310531833832, + "learning_rate": 2e-05, + "loss": 0.9126, + "step": 45 + }, + { + "epoch": 1.40625, + "eval_loss": 0.859275221824646, + "eval_runtime": 46.14, + "eval_samples_per_second": 4.335, + "eval_steps_per_second": 0.542, + "step": 45 + }, + { + "epoch": 1.4375, + "grad_norm": 0.23294071980639222, + "learning_rate": 2e-05, + "loss": 0.9525, + "step": 46 + }, + { + "epoch": 1.4375, + "eval_loss": 0.8560716509819031, + "eval_runtime": 47.2955, + "eval_samples_per_second": 4.229, + "eval_steps_per_second": 0.529, + "step": 46 + }, + { + "epoch": 1.46875, + "grad_norm": 0.22565596183142483, + "learning_rate": 2e-05, + "loss": 0.9635, + "step": 47 + }, + { + "epoch": 1.46875, + "eval_loss": 0.8531911373138428, + "eval_runtime": 46.3183, + "eval_samples_per_second": 4.318, + "eval_steps_per_second": 0.54, + "step": 47 + }, + { + "epoch": 1.5, + "grad_norm": 0.23251096636792043, + "learning_rate": 2e-05, + "loss": 0.8684, + "step": 48 + }, + { + "epoch": 1.5, + "eval_loss": 0.8504599928855896, + "eval_runtime": 45.7129, + "eval_samples_per_second": 4.375, + "eval_steps_per_second": 0.547, + "step": 48 + }, + { + "epoch": 1.53125, + "grad_norm": 0.253882583102031, + "learning_rate": 2e-05, + "loss": 0.881, + "step": 49 + }, + { + "epoch": 1.53125, + "eval_loss": 0.8476203680038452, + "eval_runtime": 45.8764, + "eval_samples_per_second": 4.36, + "eval_steps_per_second": 0.545, + "step": 49 + }, + { + "epoch": 1.5625, + "grad_norm": 0.2572282615843019, + "learning_rate": 2e-05, + "loss": 0.8634, + "step": 50 + }, + { + "epoch": 1.5625, + "eval_loss": 0.8446447849273682, + "eval_runtime": 46.1254, + "eval_samples_per_second": 4.336, + "eval_steps_per_second": 0.542, + "step": 50 + }, + { + "epoch": 1.59375, + "grad_norm": 0.24021257130991572, + "learning_rate": 2e-05, + "loss": 0.8915, + "step": 51 + }, + { + "epoch": 1.59375, + "eval_loss": 0.8415327668190002, + "eval_runtime": 45.7173, + "eval_samples_per_second": 4.375, + "eval_steps_per_second": 0.547, + "step": 51 + }, + { + "epoch": 1.625, + "grad_norm": 0.22076828593901424, + "learning_rate": 2e-05, + "loss": 0.7849, + "step": 52 + }, + { + "epoch": 1.625, + "eval_loss": 0.8386600017547607, + "eval_runtime": 45.7889, + "eval_samples_per_second": 4.368, + "eval_steps_per_second": 0.546, + "step": 52 + }, + { + "epoch": 1.65625, + "grad_norm": 0.2255866641078328, + "learning_rate": 2e-05, + "loss": 0.9282, + "step": 53 + }, + { + "epoch": 1.65625, + "eval_loss": 0.8356924653053284, + "eval_runtime": 45.6221, + "eval_samples_per_second": 4.384, + "eval_steps_per_second": 0.548, + "step": 53 + }, + { + "epoch": 1.6875, + "grad_norm": 0.22783298909181773, + "learning_rate": 2e-05, + "loss": 0.9012, + "step": 54 + }, + { + "epoch": 1.6875, + "eval_loss": 0.8328012228012085, + "eval_runtime": 47.1607, + "eval_samples_per_second": 4.241, + "eval_steps_per_second": 0.53, + "step": 54 + }, + { + "epoch": 1.71875, + "grad_norm": 0.22832233862063558, + "learning_rate": 2e-05, + "loss": 0.9055, + "step": 55 + }, + { + "epoch": 1.71875, + "eval_loss": 0.830295741558075, + "eval_runtime": 46.0231, + "eval_samples_per_second": 4.346, + "eval_steps_per_second": 0.543, + "step": 55 + }, + { + "epoch": 1.75, + "grad_norm": 0.2160389858258543, + "learning_rate": 2e-05, + "loss": 0.9646, + "step": 56 + }, + { + "epoch": 1.75, + "eval_loss": 0.8281158208847046, + "eval_runtime": 50.2412, + "eval_samples_per_second": 3.981, + "eval_steps_per_second": 0.498, + "step": 56 + }, + { + "epoch": 1.78125, + "grad_norm": 0.2577519779258931, + "learning_rate": 2e-05, + "loss": 0.8908, + "step": 57 + }, + { + "epoch": 1.78125, + "eval_loss": 0.8254660964012146, + "eval_runtime": 43.4999, + "eval_samples_per_second": 4.598, + "eval_steps_per_second": 0.575, + "step": 57 + }, + { + "epoch": 1.8125, + "grad_norm": 0.2425252190238059, + "learning_rate": 2e-05, + "loss": 0.9392, + "step": 58 + }, + { + "epoch": 1.8125, + "eval_loss": 0.8230564594268799, + "eval_runtime": 43.1396, + "eval_samples_per_second": 4.636, + "eval_steps_per_second": 0.58, + "step": 58 + }, + { + "epoch": 1.84375, + "grad_norm": 0.2403612422125405, + "learning_rate": 2e-05, + "loss": 0.8458, + "step": 59 + }, + { + "epoch": 1.84375, + "eval_loss": 0.8206232190132141, + "eval_runtime": 43.4097, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 0.576, + "step": 59 + }, + { + "epoch": 1.875, + "grad_norm": 0.24599794763439686, + "learning_rate": 2e-05, + "loss": 0.8533, + "step": 60 + }, + { + "epoch": 1.875, + "eval_loss": 0.8178582787513733, + "eval_runtime": 43.3225, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 0.577, + "step": 60 + }, + { + "epoch": 1.90625, + "grad_norm": 0.24455796239061778, + "learning_rate": 2e-05, + "loss": 0.9019, + "step": 61 + }, + { + "epoch": 1.90625, + "eval_loss": 0.81532883644104, + "eval_runtime": 43.3919, + "eval_samples_per_second": 4.609, + "eval_steps_per_second": 0.576, + "step": 61 + }, + { + "epoch": 1.9375, + "grad_norm": 0.25994876629591135, + "learning_rate": 2e-05, + "loss": 0.9294, + "step": 62 + }, + { + "epoch": 1.9375, + "eval_loss": 0.813098669052124, + "eval_runtime": 43.5546, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 0.574, + "step": 62 + }, + { + "epoch": 1.96875, + "grad_norm": 0.2671215171096013, + "learning_rate": 2e-05, + "loss": 0.7728, + "step": 63 + }, + { + "epoch": 1.96875, + "eval_loss": 0.8106216192245483, + "eval_runtime": 43.3363, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 0.577, + "step": 63 + }, + { + "epoch": 2.0, + "grad_norm": 0.26274475710090606, + "learning_rate": 2e-05, + "loss": 0.8746, + "step": 64 + }, + { + "epoch": 2.0, + "eval_loss": 0.8080699443817139, + "eval_runtime": 44.6331, + "eval_samples_per_second": 4.481, + "eval_steps_per_second": 0.56, + "step": 64 + }, + { + "epoch": 2.03125, + "grad_norm": 0.2775753424365695, + "learning_rate": 2e-05, + "loss": 0.8665, + "step": 65 + }, + { + "epoch": 2.03125, + "eval_loss": 0.8051960468292236, + "eval_runtime": 43.2561, + "eval_samples_per_second": 4.624, + "eval_steps_per_second": 0.578, + "step": 65 + }, + { + "epoch": 2.0625, + "grad_norm": 0.27249086550617724, + "learning_rate": 2e-05, + "loss": 0.8868, + "step": 66 + }, + { + "epoch": 2.0625, + "eval_loss": 0.8029299378395081, + "eval_runtime": 43.1171, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 0.58, + "step": 66 + }, + { + "epoch": 2.09375, + "grad_norm": 0.2719871749974866, + "learning_rate": 2e-05, + "loss": 0.8651, + "step": 67 + }, + { + "epoch": 2.09375, + "eval_loss": 0.8006068468093872, + "eval_runtime": 43.0661, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 0.581, + "step": 67 + }, + { + "epoch": 2.125, + "grad_norm": 0.24961006779343242, + "learning_rate": 2e-05, + "loss": 0.9303, + "step": 68 + }, + { + "epoch": 2.125, + "eval_loss": 0.7983291745185852, + "eval_runtime": 44.5821, + "eval_samples_per_second": 4.486, + "eval_steps_per_second": 0.561, + "step": 68 + }, + { + "epoch": 2.15625, + "grad_norm": 0.26632839922388696, + "learning_rate": 2e-05, + "loss": 0.8625, + "step": 69 + }, + { + "epoch": 2.15625, + "eval_loss": 0.7961746454238892, + "eval_runtime": 44.7163, + "eval_samples_per_second": 4.473, + "eval_steps_per_second": 0.559, + "step": 69 + }, + { + "epoch": 2.1875, + "grad_norm": 0.28665202557154024, + "learning_rate": 2e-05, + "loss": 0.8084, + "step": 70 + }, + { + "epoch": 2.1875, + "eval_loss": 0.7937586307525635, + "eval_runtime": 43.1349, + "eval_samples_per_second": 4.637, + "eval_steps_per_second": 0.58, + "step": 70 + }, + { + "epoch": 2.21875, + "grad_norm": 0.25474181970896226, + "learning_rate": 2e-05, + "loss": 0.8943, + "step": 71 + }, + { + "epoch": 2.21875, + "eval_loss": 0.7917373776435852, + "eval_runtime": 43.1701, + "eval_samples_per_second": 4.633, + "eval_steps_per_second": 0.579, + "step": 71 + }, + { + "epoch": 2.25, + "grad_norm": 0.28289708669257335, + "learning_rate": 2e-05, + "loss": 0.8183, + "step": 72 + }, + { + "epoch": 2.25, + "eval_loss": 0.7898543477058411, + "eval_runtime": 43.3669, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 0.576, + "step": 72 + }, + { + "epoch": 2.28125, + "grad_norm": 0.3081846543495751, + "learning_rate": 2e-05, + "loss": 0.866, + "step": 73 + }, + { + "epoch": 2.28125, + "eval_loss": 0.7878245711326599, + "eval_runtime": 43.2404, + "eval_samples_per_second": 4.625, + "eval_steps_per_second": 0.578, + "step": 73 + }, + { + "epoch": 2.3125, + "grad_norm": 0.25291911217221025, + "learning_rate": 2e-05, + "loss": 0.8643, + "step": 74 + }, + { + "epoch": 2.3125, + "eval_loss": 0.7859254479408264, + "eval_runtime": 43.158, + "eval_samples_per_second": 4.634, + "eval_steps_per_second": 0.579, + "step": 74 + }, + { + "epoch": 2.34375, + "grad_norm": 0.2671411105926486, + "learning_rate": 2e-05, + "loss": 0.9148, + "step": 75 + }, + { + "epoch": 2.34375, + "eval_loss": 0.7841793894767761, + "eval_runtime": 43.5393, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 0.574, + "step": 75 + }, + { + "epoch": 2.375, + "grad_norm": 0.2649328385798148, + "learning_rate": 2e-05, + "loss": 0.8322, + "step": 76 + }, + { + "epoch": 2.375, + "eval_loss": 0.7824788093566895, + "eval_runtime": 44.6161, + "eval_samples_per_second": 4.483, + "eval_steps_per_second": 0.56, + "step": 76 + }, + { + "epoch": 2.40625, + "grad_norm": 0.2770584815336495, + "learning_rate": 2e-05, + "loss": 0.8845, + "step": 77 + }, + { + "epoch": 2.40625, + "eval_loss": 0.7810197472572327, + "eval_runtime": 44.3474, + "eval_samples_per_second": 4.51, + "eval_steps_per_second": 0.564, + "step": 77 + }, + { + "epoch": 2.4375, + "grad_norm": 0.3134056914363824, + "learning_rate": 2e-05, + "loss": 0.8764, + "step": 78 + }, + { + "epoch": 2.4375, + "eval_loss": 0.7796530723571777, + "eval_runtime": 44.6727, + "eval_samples_per_second": 4.477, + "eval_steps_per_second": 0.56, + "step": 78 + }, + { + "epoch": 2.46875, + "grad_norm": 0.31159260857820364, + "learning_rate": 2e-05, + "loss": 0.8842, + "step": 79 + }, + { + "epoch": 2.46875, + "eval_loss": 0.7792640924453735, + "eval_runtime": 44.9476, + "eval_samples_per_second": 4.45, + "eval_steps_per_second": 0.556, + "step": 79 + }, + { + "epoch": 2.5, + "grad_norm": 0.30072325605647415, + "learning_rate": 2e-05, + "loss": 0.9214, + "step": 80 + }, + { + "epoch": 2.5, + "eval_loss": 0.7791906595230103, + "eval_runtime": 44.5732, + "eval_samples_per_second": 4.487, + "eval_steps_per_second": 0.561, + "step": 80 + }, + { + "epoch": 2.53125, + "grad_norm": 0.3021628861526586, + "learning_rate": 2e-05, + "loss": 0.854, + "step": 81 + }, + { + "epoch": 2.53125, + "eval_loss": 0.7786081433296204, + "eval_runtime": 46.7962, + "eval_samples_per_second": 4.274, + "eval_steps_per_second": 0.534, + "step": 81 + }, + { + "epoch": 2.5625, + "grad_norm": 0.28647643667873335, + "learning_rate": 2e-05, + "loss": 0.915, + "step": 82 + }, + { + "epoch": 2.5625, + "eval_loss": 0.777721643447876, + "eval_runtime": 46.0168, + "eval_samples_per_second": 4.346, + "eval_steps_per_second": 0.543, + "step": 82 + }, + { + "epoch": 2.59375, + "grad_norm": 0.3053967339779788, + "learning_rate": 2e-05, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 2.59375, + "eval_loss": 0.7763125896453857, + "eval_runtime": 46.9482, + "eval_samples_per_second": 4.26, + "eval_steps_per_second": 0.533, + "step": 83 + }, + { + "epoch": 2.625, + "grad_norm": 0.3285655628944688, + "learning_rate": 2e-05, + "loss": 0.8242, + "step": 84 + }, + { + "epoch": 2.625, + "eval_loss": 0.7744290232658386, + "eval_runtime": 45.8201, + "eval_samples_per_second": 4.365, + "eval_steps_per_second": 0.546, + "step": 84 + }, + { + "epoch": 2.65625, + "grad_norm": 0.29338609850548214, + "learning_rate": 2e-05, + "loss": 0.7927, + "step": 85 + }, + { + "epoch": 2.65625, + "eval_loss": 0.7727124094963074, + "eval_runtime": 47.0822, + "eval_samples_per_second": 4.248, + "eval_steps_per_second": 0.531, + "step": 85 + }, + { + "epoch": 2.6875, + "grad_norm": 0.3360259804530201, + "learning_rate": 2e-05, + "loss": 0.8225, + "step": 86 + }, + { + "epoch": 2.6875, + "eval_loss": 0.7707045078277588, + "eval_runtime": 45.904, + "eval_samples_per_second": 4.357, + "eval_steps_per_second": 0.545, + "step": 86 + }, + { + "epoch": 2.71875, + "grad_norm": 0.3086865804573199, + "learning_rate": 2e-05, + "loss": 0.8428, + "step": 87 + }, + { + "epoch": 2.71875, + "eval_loss": 0.7689979672431946, + "eval_runtime": 46.5498, + "eval_samples_per_second": 4.296, + "eval_steps_per_second": 0.537, + "step": 87 + }, + { + "epoch": 2.75, + "grad_norm": 0.3441174342366127, + "learning_rate": 2e-05, + "loss": 0.9349, + "step": 88 + }, + { + "epoch": 2.75, + "eval_loss": 0.7670918107032776, + "eval_runtime": 45.9533, + "eval_samples_per_second": 4.352, + "eval_steps_per_second": 0.544, + "step": 88 + }, + { + "epoch": 2.78125, + "grad_norm": 0.3192564489143439, + "learning_rate": 2e-05, + "loss": 0.8281, + "step": 89 + }, + { + "epoch": 2.78125, + "eval_loss": 0.7653720378875732, + "eval_runtime": 46.4157, + "eval_samples_per_second": 4.309, + "eval_steps_per_second": 0.539, + "step": 89 + }, + { + "epoch": 2.8125, + "grad_norm": 0.318307521318246, + "learning_rate": 2e-05, + "loss": 0.8826, + "step": 90 + }, + { + "epoch": 2.8125, + "eval_loss": 0.7641046047210693, + "eval_runtime": 43.6527, + "eval_samples_per_second": 4.582, + "eval_steps_per_second": 0.573, + "step": 90 + }, + { + "epoch": 2.84375, + "grad_norm": 0.3088619418824691, + "learning_rate": 2e-05, + "loss": 0.7792, + "step": 91 + }, + { + "epoch": 2.84375, + "eval_loss": 0.7630372643470764, + "eval_runtime": 43.3688, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 0.576, + "step": 91 + }, + { + "epoch": 2.875, + "grad_norm": 0.31484830204628667, + "learning_rate": 2e-05, + "loss": 0.8771, + "step": 92 + }, + { + "epoch": 2.875, + "eval_loss": 0.7621588110923767, + "eval_runtime": 43.4895, + "eval_samples_per_second": 4.599, + "eval_steps_per_second": 0.575, + "step": 92 + }, + { + "epoch": 2.90625, + "grad_norm": 0.3210986538440627, + "learning_rate": 2e-05, + "loss": 0.8125, + "step": 93 + }, + { + "epoch": 2.90625, + "eval_loss": 0.7610002160072327, + "eval_runtime": 44.5951, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.561, + "step": 93 + }, + { + "epoch": 2.9375, + "grad_norm": 0.3584955691897743, + "learning_rate": 2e-05, + "loss": 0.8869, + "step": 94 + }, + { + "epoch": 2.9375, + "eval_loss": 0.7591326832771301, + "eval_runtime": 44.778, + "eval_samples_per_second": 4.466, + "eval_steps_per_second": 0.558, + "step": 94 + }, + { + "epoch": 2.96875, + "grad_norm": 0.3231987362149406, + "learning_rate": 2e-05, + "loss": 0.828, + "step": 95 + }, + { + "epoch": 2.96875, + "eval_loss": 0.7578966021537781, + "eval_runtime": 44.832, + "eval_samples_per_second": 4.461, + "eval_steps_per_second": 0.558, + "step": 95 + }, + { + "epoch": 3.0, + "grad_norm": 0.3195106075306484, + "learning_rate": 2e-05, + "loss": 0.8, + "step": 96 + }, + { + "epoch": 3.0, + "eval_loss": 0.7563678026199341, + "eval_runtime": 43.2334, + "eval_samples_per_second": 4.626, + "eval_steps_per_second": 0.578, + "step": 96 + }, + { + "epoch": 3.03125, + "grad_norm": 0.3319055768203625, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 97 + }, + { + "epoch": 3.03125, + "eval_loss": 0.7547956705093384, + "eval_runtime": 50.7388, + "eval_samples_per_second": 3.942, + "eval_steps_per_second": 0.493, + "step": 97 + }, + { + "epoch": 3.0625, + "grad_norm": 0.2995834652715153, + "learning_rate": 2e-05, + "loss": 0.8407, + "step": 98 + }, + { + "epoch": 3.0625, + "eval_loss": 0.7533387541770935, + "eval_runtime": 45.0847, + "eval_samples_per_second": 4.436, + "eval_steps_per_second": 0.555, + "step": 98 + }, + { + "epoch": 3.09375, + "grad_norm": 0.30711749226961915, + "learning_rate": 2e-05, + "loss": 0.8117, + "step": 99 + }, + { + "epoch": 3.09375, + "eval_loss": 0.7517553567886353, + "eval_runtime": 43.2975, + "eval_samples_per_second": 4.619, + "eval_steps_per_second": 0.577, + "step": 99 + }, + { + "epoch": 3.125, + "grad_norm": 0.3443284045264722, + "learning_rate": 2e-05, + "loss": 0.8347, + "step": 100 + }, + { + "epoch": 3.125, + "eval_loss": 0.749790370464325, + "eval_runtime": 43.3922, + "eval_samples_per_second": 4.609, + "eval_steps_per_second": 0.576, + "step": 100 + }, + { + "epoch": 3.15625, + "grad_norm": 0.3080766546496095, + "learning_rate": 2e-05, + "loss": 0.7748, + "step": 101 + }, + { + "epoch": 3.15625, + "eval_loss": 0.7480612397193909, + "eval_runtime": 45.0132, + "eval_samples_per_second": 4.443, + "eval_steps_per_second": 0.555, + "step": 101 + }, + { + "epoch": 3.1875, + "grad_norm": 0.34717566244235637, + "learning_rate": 2e-05, + "loss": 0.8407, + "step": 102 + }, + { + "epoch": 3.1875, + "eval_loss": 0.7468411326408386, + "eval_runtime": 43.1171, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 0.58, + "step": 102 + }, + { + "epoch": 3.21875, + "grad_norm": 0.3374839165175488, + "learning_rate": 2e-05, + "loss": 0.8498, + "step": 103 + }, + { + "epoch": 3.21875, + "eval_loss": 0.7462002038955688, + "eval_runtime": 44.7301, + "eval_samples_per_second": 4.471, + "eval_steps_per_second": 0.559, + "step": 103 + }, + { + "epoch": 3.25, + "grad_norm": 0.35610377004267274, + "learning_rate": 2e-05, + "loss": 0.7608, + "step": 104 + }, + { + "epoch": 3.25, + "eval_loss": 0.7451856732368469, + "eval_runtime": 43.1396, + "eval_samples_per_second": 4.636, + "eval_steps_per_second": 0.58, + "step": 104 + }, + { + "epoch": 3.28125, + "grad_norm": 0.3147450389365033, + "learning_rate": 2e-05, + "loss": 0.8077, + "step": 105 + }, + { + "epoch": 3.28125, + "eval_loss": 0.7444003224372864, + "eval_runtime": 45.0088, + "eval_samples_per_second": 4.444, + "eval_steps_per_second": 0.555, + "step": 105 + }, + { + "epoch": 3.3125, + "grad_norm": 0.3706462973318254, + "learning_rate": 2e-05, + "loss": 0.8401, + "step": 106 + }, + { + "epoch": 3.3125, + "eval_loss": 0.7432863116264343, + "eval_runtime": 43.5403, + "eval_samples_per_second": 4.593, + "eval_steps_per_second": 0.574, + "step": 106 + }, + { + "epoch": 3.34375, + "grad_norm": 0.40870394852693054, + "learning_rate": 2e-05, + "loss": 0.7369, + "step": 107 + }, + { + "epoch": 3.34375, + "eval_loss": 0.7409774661064148, + "eval_runtime": 43.3731, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 0.576, + "step": 107 + }, + { + "epoch": 3.375, + "grad_norm": 0.36546514227995835, + "learning_rate": 2e-05, + "loss": 0.7822, + "step": 108 + }, + { + "epoch": 3.375, + "eval_loss": 0.7388054132461548, + "eval_runtime": 43.2852, + "eval_samples_per_second": 4.621, + "eval_steps_per_second": 0.578, + "step": 108 + }, + { + "epoch": 3.40625, + "grad_norm": 0.3623356150462002, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 109 + }, + { + "epoch": 3.40625, + "eval_loss": 0.7370558977127075, + "eval_runtime": 43.2105, + "eval_samples_per_second": 4.629, + "eval_steps_per_second": 0.579, + "step": 109 + }, + { + "epoch": 3.4375, + "grad_norm": 0.36956774509216733, + "learning_rate": 2e-05, + "loss": 0.7631, + "step": 110 + }, + { + "epoch": 3.4375, + "eval_loss": 0.7354567050933838, + "eval_runtime": 45.0512, + "eval_samples_per_second": 4.439, + "eval_steps_per_second": 0.555, + "step": 110 + }, + { + "epoch": 3.46875, + "grad_norm": 0.37499211223571893, + "learning_rate": 2e-05, + "loss": 0.8397, + "step": 111 + }, + { + "epoch": 3.46875, + "eval_loss": 0.7342872619628906, + "eval_runtime": 44.1989, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 0.566, + "step": 111 + }, + { + "epoch": 3.5, + "grad_norm": 0.3656781606255811, + "learning_rate": 2e-05, + "loss": 0.8156, + "step": 112 + }, + { + "epoch": 3.5, + "eval_loss": 0.7334136962890625, + "eval_runtime": 43.3314, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 0.577, + "step": 112 + }, + { + "epoch": 3.53125, + "grad_norm": 0.360531666311953, + "learning_rate": 2e-05, + "loss": 0.9039, + "step": 113 + }, + { + "epoch": 3.53125, + "eval_loss": 0.732928454875946, + "eval_runtime": 43.6452, + "eval_samples_per_second": 4.582, + "eval_steps_per_second": 0.573, + "step": 113 + }, + { + "epoch": 3.5625, + "grad_norm": 0.4106498291544766, + "learning_rate": 2e-05, + "loss": 0.7632, + "step": 114 + }, + { + "epoch": 3.5625, + "eval_loss": 0.7328732013702393, + "eval_runtime": 43.2922, + "eval_samples_per_second": 4.62, + "eval_steps_per_second": 0.577, + "step": 114 + }, + { + "epoch": 3.59375, + "grad_norm": 0.35030054786635473, + "learning_rate": 2e-05, + "loss": 0.8328, + "step": 115 + }, + { + "epoch": 3.59375, + "eval_loss": 0.7332839369773865, + "eval_runtime": 43.1392, + "eval_samples_per_second": 4.636, + "eval_steps_per_second": 0.58, + "step": 115 + }, + { + "epoch": 3.625, + "grad_norm": 0.37866907463824806, + "learning_rate": 2e-05, + "loss": 0.7992, + "step": 116 + }, + { + "epoch": 3.625, + "eval_loss": 0.7333321571350098, + "eval_runtime": 44.5672, + "eval_samples_per_second": 4.488, + "eval_steps_per_second": 0.561, + "step": 116 + }, + { + "epoch": 3.65625, + "grad_norm": 0.3868782215569731, + "learning_rate": 2e-05, + "loss": 0.7929, + "step": 117 + }, + { + "epoch": 3.65625, + "eval_loss": 0.7327985167503357, + "eval_runtime": 45.9132, + "eval_samples_per_second": 4.356, + "eval_steps_per_second": 0.545, + "step": 117 + }, + { + "epoch": 3.6875, + "grad_norm": 0.3823386198135366, + "learning_rate": 2e-05, + "loss": 0.8064, + "step": 118 + }, + { + "epoch": 3.6875, + "eval_loss": 0.7325207591056824, + "eval_runtime": 45.1557, + "eval_samples_per_second": 4.429, + "eval_steps_per_second": 0.554, + "step": 118 + }, + { + "epoch": 3.71875, + "grad_norm": 0.3586002374199349, + "learning_rate": 2e-05, + "loss": 0.8677, + "step": 119 + }, + { + "epoch": 3.71875, + "eval_loss": 0.732402503490448, + "eval_runtime": 44.5906, + "eval_samples_per_second": 4.485, + "eval_steps_per_second": 0.561, + "step": 119 + }, + { + "epoch": 3.75, + "grad_norm": 0.34075042751380596, + "learning_rate": 2e-05, + "loss": 0.8119, + "step": 120 + }, + { + "epoch": 3.75, + "eval_loss": 0.7322152853012085, + "eval_runtime": 44.3386, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 0.564, + "step": 120 + }, + { + "epoch": 3.78125, + "grad_norm": 0.38915259379047296, + "learning_rate": 2e-05, + "loss": 0.7866, + "step": 121 + }, + { + "epoch": 3.78125, + "eval_loss": 0.7307778000831604, + "eval_runtime": 45.0342, + "eval_samples_per_second": 4.441, + "eval_steps_per_second": 0.555, + "step": 121 + }, + { + "epoch": 3.8125, + "grad_norm": 0.39774471715347587, + "learning_rate": 2e-05, + "loss": 0.8635, + "step": 122 + }, + { + "epoch": 3.8125, + "eval_loss": 0.7294437885284424, + "eval_runtime": 47.2205, + "eval_samples_per_second": 4.235, + "eval_steps_per_second": 0.529, + "step": 122 + }, + { + "epoch": 3.84375, + "grad_norm": 0.3880340672056078, + "learning_rate": 2e-05, + "loss": 0.7834, + "step": 123 + }, + { + "epoch": 3.84375, + "eval_loss": 0.7277958393096924, + "eval_runtime": 45.5116, + "eval_samples_per_second": 4.394, + "eval_steps_per_second": 0.549, + "step": 123 + }, + { + "epoch": 3.875, + "grad_norm": 0.34955832039339413, + "learning_rate": 2e-05, + "loss": 0.8048, + "step": 124 + }, + { + "epoch": 3.875, + "eval_loss": 0.7262464761734009, + "eval_runtime": 45.3196, + "eval_samples_per_second": 4.413, + "eval_steps_per_second": 0.552, + "step": 124 + }, + { + "epoch": 3.90625, + "grad_norm": 0.4502351954206266, + "learning_rate": 2e-05, + "loss": 0.8494, + "step": 125 + }, + { + "epoch": 3.90625, + "eval_loss": 0.724558413028717, + "eval_runtime": 45.2241, + "eval_samples_per_second": 4.422, + "eval_steps_per_second": 0.553, + "step": 125 + }, + { + "epoch": 3.9375, + "grad_norm": 0.40148506382728893, + "learning_rate": 2e-05, + "loss": 0.8163, + "step": 126 + }, + { + "epoch": 3.9375, + "eval_loss": 0.7235116362571716, + "eval_runtime": 46.1839, + "eval_samples_per_second": 4.331, + "eval_steps_per_second": 0.541, + "step": 126 + }, + { + "epoch": 3.96875, + "grad_norm": 0.41595103877364653, + "learning_rate": 2e-05, + "loss": 0.7756, + "step": 127 + }, + { + "epoch": 3.96875, + "eval_loss": 0.7227371335029602, + "eval_runtime": 43.5883, + "eval_samples_per_second": 4.588, + "eval_steps_per_second": 0.574, + "step": 127 + }, + { + "epoch": 4.0, + "grad_norm": 0.3959213167419436, + "learning_rate": 2e-05, + "loss": 0.7107, + "step": 128 + }, + { + "epoch": 4.0, + "eval_loss": 0.721717357635498, + "eval_runtime": 44.8751, + "eval_samples_per_second": 4.457, + "eval_steps_per_second": 0.557, + "step": 128 + }, + { + "epoch": 4.03125, + "grad_norm": 0.34668934768327436, + "learning_rate": 2e-05, + "loss": 0.8028, + "step": 129 + }, + { + "epoch": 4.03125, + "eval_loss": 0.7208954095840454, + "eval_runtime": 43.2092, + "eval_samples_per_second": 4.629, + "eval_steps_per_second": 0.579, + "step": 129 + }, + { + "epoch": 4.0625, + "grad_norm": 0.3776564287872586, + "learning_rate": 2e-05, + "loss": 0.8162, + "step": 130 + }, + { + "epoch": 4.0625, + "eval_loss": 0.7200332880020142, + "eval_runtime": 43.1981, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 0.579, + "step": 130 + }, + { + "epoch": 4.09375, + "grad_norm": 0.35166731437552645, + "learning_rate": 2e-05, + "loss": 0.814, + "step": 131 + }, + { + "epoch": 4.09375, + "eval_loss": 0.7193570137023926, + "eval_runtime": 43.3306, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 0.577, + "step": 131 + }, + { + "epoch": 4.125, + "grad_norm": 0.39783214883157875, + "learning_rate": 2e-05, + "loss": 0.7743, + "step": 132 + }, + { + "epoch": 4.125, + "eval_loss": 0.7187802791595459, + "eval_runtime": 44.0701, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.567, + "step": 132 + }, + { + "epoch": 4.15625, + "grad_norm": 0.3828880469066703, + "learning_rate": 2e-05, + "loss": 0.8766, + "step": 133 + }, + { + "epoch": 4.15625, + "eval_loss": 0.7184324860572815, + "eval_runtime": 43.3218, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 0.577, + "step": 133 + }, + { + "epoch": 4.1875, + "grad_norm": 0.46175115507112535, + "learning_rate": 2e-05, + "loss": 0.7827, + "step": 134 + }, + { + "epoch": 4.1875, + "eval_loss": 0.717852771282196, + "eval_runtime": 43.3706, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 0.576, + "step": 134 + }, + { + "epoch": 4.21875, + "grad_norm": 0.39552167703322383, + "learning_rate": 2e-05, + "loss": 0.7846, + "step": 135 + }, + { + "epoch": 4.21875, + "eval_loss": 0.7171714901924133, + "eval_runtime": 43.3199, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 0.577, + "step": 135 + }, + { + "epoch": 4.25, + "grad_norm": 0.40883049825529505, + "learning_rate": 2e-05, + "loss": 0.7711, + "step": 136 + }, + { + "epoch": 4.25, + "eval_loss": 0.7167998552322388, + "eval_runtime": 43.4601, + "eval_samples_per_second": 4.602, + "eval_steps_per_second": 0.575, + "step": 136 + }, + { + "epoch": 4.28125, + "grad_norm": 0.4411120151436577, + "learning_rate": 2e-05, + "loss": 0.755, + "step": 137 + }, + { + "epoch": 4.28125, + "eval_loss": 0.7161502838134766, + "eval_runtime": 45.0586, + "eval_samples_per_second": 4.439, + "eval_steps_per_second": 0.555, + "step": 137 + }, + { + "epoch": 4.3125, + "grad_norm": 0.4307733167956254, + "learning_rate": 2e-05, + "loss": 0.7708, + "step": 138 + }, + { + "epoch": 4.3125, + "eval_loss": 0.7155695557594299, + "eval_runtime": 44.7913, + "eval_samples_per_second": 4.465, + "eval_steps_per_second": 0.558, + "step": 138 + }, + { + "epoch": 4.34375, + "grad_norm": 0.4303129845521591, + "learning_rate": 2e-05, + "loss": 0.7384, + "step": 139 + }, + { + "epoch": 4.34375, + "eval_loss": 0.7146069407463074, + "eval_runtime": 43.3745, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 0.576, + "step": 139 + }, + { + "epoch": 4.375, + "grad_norm": 0.4160861103360693, + "learning_rate": 2e-05, + "loss": 0.7693, + "step": 140 + }, + { + "epoch": 4.375, + "eval_loss": 0.7138718962669373, + "eval_runtime": 43.2941, + "eval_samples_per_second": 4.62, + "eval_steps_per_second": 0.577, + "step": 140 + }, + { + "epoch": 4.40625, + "grad_norm": 0.3974304749908327, + "learning_rate": 2e-05, + "loss": 0.7855, + "step": 141 + }, + { + "epoch": 4.40625, + "eval_loss": 0.7131789922714233, + "eval_runtime": 43.6908, + "eval_samples_per_second": 4.578, + "eval_steps_per_second": 0.572, + "step": 141 + }, + { + "epoch": 4.4375, + "grad_norm": 0.42212623603465876, + "learning_rate": 2e-05, + "loss": 0.733, + "step": 142 + }, + { + "epoch": 4.4375, + "eval_loss": 0.7126344442367554, + "eval_runtime": 43.5706, + "eval_samples_per_second": 4.59, + "eval_steps_per_second": 0.574, + "step": 142 + }, + { + "epoch": 4.46875, + "grad_norm": 0.4290602874698813, + "learning_rate": 2e-05, + "loss": 0.7372, + "step": 143 + }, + { + "epoch": 4.46875, + "eval_loss": 0.7121153473854065, + "eval_runtime": 44.0917, + "eval_samples_per_second": 4.536, + "eval_steps_per_second": 0.567, + "step": 143 + }, + { + "epoch": 4.5, + "grad_norm": 0.38778639331277664, + "learning_rate": 2e-05, + "loss": 0.715, + "step": 144 + }, + { + "epoch": 4.5, + "eval_loss": 0.7114359140396118, + "eval_runtime": 90.4172, + "eval_samples_per_second": 2.212, + "eval_steps_per_second": 0.276, + "step": 144 + }, + { + "epoch": 4.53125, + "grad_norm": 0.44014343297224434, + "learning_rate": 2e-05, + "loss": 0.802, + "step": 145 + }, + { + "epoch": 4.53125, + "eval_loss": 0.7106121778488159, + "eval_runtime": 43.5235, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 0.574, + "step": 145 + }, + { + "epoch": 4.5625, + "grad_norm": 0.45549843169611287, + "learning_rate": 2e-05, + "loss": 0.6899, + "step": 146 + }, + { + "epoch": 4.5625, + "eval_loss": 0.7094995975494385, + "eval_runtime": 43.5264, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 0.574, + "step": 146 + }, + { + "epoch": 4.59375, + "grad_norm": 0.46209967918252776, + "learning_rate": 2e-05, + "loss": 0.7503, + "step": 147 + }, + { + "epoch": 4.59375, + "eval_loss": 0.7082768082618713, + "eval_runtime": 44.8411, + "eval_samples_per_second": 4.46, + "eval_steps_per_second": 0.558, + "step": 147 + }, + { + "epoch": 4.625, + "grad_norm": 0.43001381014670376, + "learning_rate": 2e-05, + "loss": 0.7041, + "step": 148 + }, + { + "epoch": 4.625, + "eval_loss": 0.7072634696960449, + "eval_runtime": 43.1988, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 0.579, + "step": 148 + }, + { + "epoch": 4.65625, + "grad_norm": 0.4151229594087744, + "learning_rate": 2e-05, + "loss": 0.8181, + "step": 149 + }, + { + "epoch": 4.65625, + "eval_loss": 0.7068669199943542, + "eval_runtime": 43.3996, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 0.576, + "step": 149 + }, + { + "epoch": 4.6875, + "grad_norm": 0.4534048991771139, + "learning_rate": 2e-05, + "loss": 0.7411, + "step": 150 + }, + { + "epoch": 4.6875, + "eval_loss": 0.7062075734138489, + "eval_runtime": 43.3013, + "eval_samples_per_second": 4.619, + "eval_steps_per_second": 0.577, + "step": 150 + }, + { + "epoch": 4.71875, + "grad_norm": 0.4739932075357852, + "learning_rate": 2e-05, + "loss": 0.7621, + "step": 151 + }, + { + "epoch": 4.71875, + "eval_loss": 0.7047030925750732, + "eval_runtime": 43.4211, + "eval_samples_per_second": 4.606, + "eval_steps_per_second": 0.576, + "step": 151 + }, + { + "epoch": 4.75, + "grad_norm": 0.46573796534078227, + "learning_rate": 2e-05, + "loss": 0.7852, + "step": 152 + }, + { + "epoch": 4.75, + "eval_loss": 0.7033020257949829, + "eval_runtime": 43.4066, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 0.576, + "step": 152 + }, + { + "epoch": 4.78125, + "grad_norm": 0.463007545995704, + "learning_rate": 2e-05, + "loss": 0.7331, + "step": 153 + }, + { + "epoch": 4.78125, + "eval_loss": 0.7021228671073914, + "eval_runtime": 43.4184, + "eval_samples_per_second": 4.606, + "eval_steps_per_second": 0.576, + "step": 153 + }, + { + "epoch": 4.8125, + "grad_norm": 0.46580692487948094, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 154 + }, + { + "epoch": 4.8125, + "eval_loss": 0.701519250869751, + "eval_runtime": 44.9732, + "eval_samples_per_second": 4.447, + "eval_steps_per_second": 0.556, + "step": 154 + }, + { + "epoch": 4.84375, + "grad_norm": 0.47378674394843967, + "learning_rate": 2e-05, + "loss": 0.6912, + "step": 155 + }, + { + "epoch": 4.84375, + "eval_loss": 0.7011644244194031, + "eval_runtime": 44.898, + "eval_samples_per_second": 4.455, + "eval_steps_per_second": 0.557, + "step": 155 + }, + { + "epoch": 4.875, + "grad_norm": 0.44883703516788587, + "learning_rate": 2e-05, + "loss": 0.812, + "step": 156 + }, + { + "epoch": 4.875, + "eval_loss": 0.7009950876235962, + "eval_runtime": 44.4765, + "eval_samples_per_second": 4.497, + "eval_steps_per_second": 0.562, + "step": 156 + }, + { + "epoch": 4.90625, + "grad_norm": 0.43366130955490684, + "learning_rate": 2e-05, + "loss": 0.7902, + "step": 157 + }, + { + "epoch": 4.90625, + "eval_loss": 0.7011439800262451, + "eval_runtime": 44.3528, + "eval_samples_per_second": 4.509, + "eval_steps_per_second": 0.564, + "step": 157 + }, + { + "epoch": 4.9375, + "grad_norm": 0.4501399670257468, + "learning_rate": 2e-05, + "loss": 0.7927, + "step": 158 + }, + { + "epoch": 4.9375, + "eval_loss": 0.7011370062828064, + "eval_runtime": 46.6518, + "eval_samples_per_second": 4.287, + "eval_steps_per_second": 0.536, + "step": 158 + }, + { + "epoch": 4.96875, + "grad_norm": 0.44946550972510596, + "learning_rate": 2e-05, + "loss": 0.7437, + "step": 159 + }, + { + "epoch": 4.96875, + "eval_loss": 0.7008097767829895, + "eval_runtime": 45.6401, + "eval_samples_per_second": 4.382, + "eval_steps_per_second": 0.548, + "step": 159 + }, + { + "epoch": 5.0, + "grad_norm": 0.455086081766797, + "learning_rate": 2e-05, + "loss": 0.7274, + "step": 160 + }, + { + "epoch": 5.0, + "eval_loss": 0.7002915143966675, + "eval_runtime": 44.5003, + "eval_samples_per_second": 4.494, + "eval_steps_per_second": 0.562, + "step": 160 + }, + { + "epoch": 5.03125, + "grad_norm": 0.42610507864697433, + "learning_rate": 2e-05, + "loss": 0.7084, + "step": 161 + }, + { + "epoch": 5.03125, + "eval_loss": 0.6996615529060364, + "eval_runtime": 50.423, + "eval_samples_per_second": 3.966, + "eval_steps_per_second": 0.496, + "step": 161 + }, + { + "epoch": 5.0625, + "grad_norm": 0.41530618486274595, + "learning_rate": 2e-05, + "loss": 0.8549, + "step": 162 + }, + { + "epoch": 5.0625, + "eval_loss": 0.6996638774871826, + "eval_runtime": 43.3726, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 0.576, + "step": 162 + }, + { + "epoch": 5.09375, + "grad_norm": 0.46020582285044187, + "learning_rate": 2e-05, + "loss": 0.6554, + "step": 163 + }, + { + "epoch": 5.09375, + "eval_loss": 0.6997809410095215, + "eval_runtime": 43.1108, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 0.58, + "step": 163 + }, + { + "epoch": 5.125, + "grad_norm": 0.45217206658399783, + "learning_rate": 2e-05, + "loss": 0.7908, + "step": 164 + }, + { + "epoch": 5.125, + "eval_loss": 0.7001843452453613, + "eval_runtime": 43.3575, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 0.577, + "step": 164 + }, + { + "epoch": 5.15625, + "grad_norm": 0.5297838342887452, + "learning_rate": 2e-05, + "loss": 0.6311, + "step": 165 + }, + { + "epoch": 5.15625, + "eval_loss": 0.6998342871665955, + "eval_runtime": 44.2692, + "eval_samples_per_second": 4.518, + "eval_steps_per_second": 0.565, + "step": 165 + }, + { + "epoch": 5.1875, + "grad_norm": 0.5041508044224997, + "learning_rate": 2e-05, + "loss": 0.7407, + "step": 166 + }, + { + "epoch": 5.1875, + "eval_loss": 0.6997390985488892, + "eval_runtime": 44.9429, + "eval_samples_per_second": 4.45, + "eval_steps_per_second": 0.556, + "step": 166 + }, + { + "epoch": 5.21875, + "grad_norm": 0.4379864270565459, + "learning_rate": 2e-05, + "loss": 0.7601, + "step": 167 + }, + { + "epoch": 5.21875, + "eval_loss": 0.6998906135559082, + "eval_runtime": 44.7922, + "eval_samples_per_second": 4.465, + "eval_steps_per_second": 0.558, + "step": 167 + }, + { + "epoch": 5.25, + "grad_norm": 0.4908573554102339, + "learning_rate": 2e-05, + "loss": 0.802, + "step": 168 + }, + { + "epoch": 5.25, + "eval_loss": 0.6996601819992065, + "eval_runtime": 44.8792, + "eval_samples_per_second": 4.456, + "eval_steps_per_second": 0.557, + "step": 168 + }, + { + "epoch": 5.28125, + "grad_norm": 0.4708754671143599, + "learning_rate": 2e-05, + "loss": 0.7212, + "step": 169 + }, + { + "epoch": 5.28125, + "eval_loss": 0.699320375919342, + "eval_runtime": 42.8958, + "eval_samples_per_second": 4.662, + "eval_steps_per_second": 0.583, + "step": 169 + }, + { + "epoch": 5.3125, + "grad_norm": 0.5157421152452428, + "learning_rate": 2e-05, + "loss": 0.6919, + "step": 170 + }, + { + "epoch": 5.3125, + "eval_loss": 0.6992219686508179, + "eval_runtime": 43.1543, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 0.579, + "step": 170 + }, + { + "epoch": 5.34375, + "grad_norm": 0.5604495452491726, + "learning_rate": 2e-05, + "loss": 0.708, + "step": 171 + }, + { + "epoch": 5.34375, + "eval_loss": 0.6983294486999512, + "eval_runtime": 43.0431, + "eval_samples_per_second": 4.647, + "eval_steps_per_second": 0.581, + "step": 171 + }, + { + "epoch": 5.375, + "grad_norm": 0.5538353889452822, + "learning_rate": 2e-05, + "loss": 0.7922, + "step": 172 + }, + { + "epoch": 5.375, + "eval_loss": 0.6967844367027283, + "eval_runtime": 43.3554, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 0.577, + "step": 172 + }, + { + "epoch": 5.40625, + "grad_norm": 0.4750896425737706, + "learning_rate": 2e-05, + "loss": 0.7552, + "step": 173 + }, + { + "epoch": 5.40625, + "eval_loss": 0.6954870820045471, + "eval_runtime": 43.2105, + "eval_samples_per_second": 4.629, + "eval_steps_per_second": 0.579, + "step": 173 + }, + { + "epoch": 5.4375, + "grad_norm": 0.4939578777629157, + "learning_rate": 2e-05, + "loss": 0.793, + "step": 174 + }, + { + "epoch": 5.4375, + "eval_loss": 0.6942651271820068, + "eval_runtime": 43.2018, + "eval_samples_per_second": 4.629, + "eval_steps_per_second": 0.579, + "step": 174 + }, + { + "epoch": 5.46875, + "grad_norm": 0.5275775814858564, + "learning_rate": 2e-05, + "loss": 0.7812, + "step": 175 + }, + { + "epoch": 5.46875, + "eval_loss": 0.6938748359680176, + "eval_runtime": 43.0238, + "eval_samples_per_second": 4.649, + "eval_steps_per_second": 0.581, + "step": 175 + }, + { + "epoch": 5.5, + "grad_norm": 0.516931179872771, + "learning_rate": 2e-05, + "loss": 0.7157, + "step": 176 + }, + { + "epoch": 5.5, + "eval_loss": 0.6937347650527954, + "eval_runtime": 44.7687, + "eval_samples_per_second": 4.467, + "eval_steps_per_second": 0.558, + "step": 176 + }, + { + "epoch": 5.53125, + "grad_norm": 0.527427864430588, + "learning_rate": 2e-05, + "loss": 0.7505, + "step": 177 + }, + { + "epoch": 5.53125, + "eval_loss": 0.6932395696640015, + "eval_runtime": 44.5644, + "eval_samples_per_second": 4.488, + "eval_steps_per_second": 0.561, + "step": 177 + }, + { + "epoch": 5.5625, + "grad_norm": 0.5073638107520839, + "learning_rate": 2e-05, + "loss": 0.7893, + "step": 178 + }, + { + "epoch": 5.5625, + "eval_loss": 0.692828357219696, + "eval_runtime": 46.0526, + "eval_samples_per_second": 4.343, + "eval_steps_per_second": 0.543, + "step": 178 + }, + { + "epoch": 5.59375, + "grad_norm": 0.5234480045460208, + "learning_rate": 2e-05, + "loss": 0.6786, + "step": 179 + }, + { + "epoch": 5.59375, + "eval_loss": 0.6927328705787659, + "eval_runtime": 44.4221, + "eval_samples_per_second": 4.502, + "eval_steps_per_second": 0.563, + "step": 179 + }, + { + "epoch": 5.625, + "grad_norm": 0.509921375319416, + "learning_rate": 2e-05, + "loss": 0.6839, + "step": 180 + }, + { + "epoch": 5.625, + "eval_loss": 0.6922880411148071, + "eval_runtime": 44.5254, + "eval_samples_per_second": 4.492, + "eval_steps_per_second": 0.561, + "step": 180 + }, + { + "epoch": 5.65625, + "grad_norm": 0.5307701692724383, + "learning_rate": 2e-05, + "loss": 0.6949, + "step": 181 + }, + { + "epoch": 5.65625, + "eval_loss": 0.6916860938072205, + "eval_runtime": 46.1897, + "eval_samples_per_second": 4.33, + "eval_steps_per_second": 0.541, + "step": 181 + }, + { + "epoch": 5.6875, + "grad_norm": 0.5405944672270007, + "learning_rate": 2e-05, + "loss": 0.6644, + "step": 182 + }, + { + "epoch": 5.6875, + "eval_loss": 0.6913076639175415, + "eval_runtime": 45.6494, + "eval_samples_per_second": 4.381, + "eval_steps_per_second": 0.548, + "step": 182 + }, + { + "epoch": 5.71875, + "grad_norm": 0.5911050914106935, + "learning_rate": 2e-05, + "loss": 0.6993, + "step": 183 + }, + { + "epoch": 5.71875, + "eval_loss": 0.6910421848297119, + "eval_runtime": 45.6849, + "eval_samples_per_second": 4.378, + "eval_steps_per_second": 0.547, + "step": 183 + }, + { + "epoch": 5.75, + "grad_norm": 0.5738317262291136, + "learning_rate": 2e-05, + "loss": 0.6909, + "step": 184 + }, + { + "epoch": 5.75, + "eval_loss": 0.6906780004501343, + "eval_runtime": 45.8103, + "eval_samples_per_second": 4.366, + "eval_steps_per_second": 0.546, + "step": 184 + }, + { + "epoch": 5.78125, + "grad_norm": 0.6176885912626084, + "learning_rate": 2e-05, + "loss": 0.7418, + "step": 185 + }, + { + "epoch": 5.78125, + "eval_loss": 0.6897534132003784, + "eval_runtime": 46.2895, + "eval_samples_per_second": 4.321, + "eval_steps_per_second": 0.54, + "step": 185 + }, + { + "epoch": 5.8125, + "grad_norm": 0.5804047612157957, + "learning_rate": 2e-05, + "loss": 0.7046, + "step": 186 + }, + { + "epoch": 5.8125, + "eval_loss": 0.6883871555328369, + "eval_runtime": 46.9282, + "eval_samples_per_second": 4.262, + "eval_steps_per_second": 0.533, + "step": 186 + }, + { + "epoch": 5.84375, + "grad_norm": 0.5408722725454089, + "learning_rate": 2e-05, + "loss": 0.7561, + "step": 187 + }, + { + "epoch": 5.84375, + "eval_loss": 0.6878187656402588, + "eval_runtime": 47.6969, + "eval_samples_per_second": 4.193, + "eval_steps_per_second": 0.524, + "step": 187 + }, + { + "epoch": 5.875, + "grad_norm": 0.5492560188161619, + "learning_rate": 2e-05, + "loss": 0.6903, + "step": 188 + }, + { + "epoch": 5.875, + "eval_loss": 0.6882662773132324, + "eval_runtime": 47.2072, + "eval_samples_per_second": 4.237, + "eval_steps_per_second": 0.53, + "step": 188 + }, + { + "epoch": 5.90625, + "grad_norm": 0.5286439760924038, + "learning_rate": 2e-05, + "loss": 0.7036, + "step": 189 + }, + { + "epoch": 5.90625, + "eval_loss": 0.6890198588371277, + "eval_runtime": 47.4378, + "eval_samples_per_second": 4.216, + "eval_steps_per_second": 0.527, + "step": 189 + }, + { + "epoch": 5.9375, + "grad_norm": 0.5540465829524065, + "learning_rate": 2e-05, + "loss": 0.715, + "step": 190 + }, + { + "epoch": 5.9375, + "eval_loss": 0.6893854737281799, + "eval_runtime": 47.5957, + "eval_samples_per_second": 4.202, + "eval_steps_per_second": 0.525, + "step": 190 + }, + { + "epoch": 5.96875, + "grad_norm": 0.543055712644853, + "learning_rate": 2e-05, + "loss": 0.7122, + "step": 191 + }, + { + "epoch": 5.96875, + "eval_loss": 0.688640296459198, + "eval_runtime": 47.2791, + "eval_samples_per_second": 4.23, + "eval_steps_per_second": 0.529, + "step": 191 + }, + { + "epoch": 6.0, + "grad_norm": 0.5243011011968818, + "learning_rate": 2e-05, + "loss": 0.6989, + "step": 192 + }, + { + "epoch": 6.0, + "eval_loss": 0.6877474784851074, + "eval_runtime": 49.6808, + "eval_samples_per_second": 4.026, + "eval_steps_per_second": 0.503, + "step": 192 + }, + { + "epoch": 6.03125, + "grad_norm": 0.5427998890836598, + "learning_rate": 2e-05, + "loss": 0.7643, + "step": 193 + }, + { + "epoch": 6.03125, + "eval_loss": 0.6871516704559326, + "eval_runtime": 43.2416, + "eval_samples_per_second": 4.625, + "eval_steps_per_second": 0.578, + "step": 193 + }, + { + "epoch": 6.0625, + "grad_norm": 0.4848261239833822, + "learning_rate": 2e-05, + "loss": 0.7333, + "step": 194 + }, + { + "epoch": 6.0625, + "eval_loss": 0.6872122287750244, + "eval_runtime": 43.027, + "eval_samples_per_second": 4.648, + "eval_steps_per_second": 0.581, + "step": 194 + }, + { + "epoch": 6.09375, + "grad_norm": 0.5476878256408845, + "learning_rate": 2e-05, + "loss": 0.6621, + "step": 195 + }, + { + "epoch": 6.09375, + "eval_loss": 0.6873424053192139, + "eval_runtime": 43.0047, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 0.581, + "step": 195 + }, + { + "epoch": 6.125, + "grad_norm": 0.5198863257357437, + "learning_rate": 2e-05, + "loss": 0.6936, + "step": 196 + }, + { + "epoch": 6.125, + "eval_loss": 0.6874563097953796, + "eval_runtime": 43.2855, + "eval_samples_per_second": 4.62, + "eval_steps_per_second": 0.578, + "step": 196 + }, + { + "epoch": 6.15625, + "grad_norm": 0.5705568756769012, + "learning_rate": 2e-05, + "loss": 0.7237, + "step": 197 + }, + { + "epoch": 6.15625, + "eval_loss": 0.6877203583717346, + "eval_runtime": 44.8778, + "eval_samples_per_second": 4.457, + "eval_steps_per_second": 0.557, + "step": 197 + }, + { + "epoch": 6.1875, + "grad_norm": 0.5546703873264635, + "learning_rate": 2e-05, + "loss": 0.8033, + "step": 198 + }, + { + "epoch": 6.1875, + "eval_loss": 0.6876934170722961, + "eval_runtime": 43.3351, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 0.577, + "step": 198 + }, + { + "epoch": 6.21875, + "grad_norm": 0.5846944975931198, + "learning_rate": 2e-05, + "loss": 0.6687, + "step": 199 + }, + { + "epoch": 6.21875, + "eval_loss": 0.6877866983413696, + "eval_runtime": 43.1456, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 0.579, + "step": 199 + }, + { + "epoch": 6.25, + "grad_norm": 0.5882658410555619, + "learning_rate": 2e-05, + "loss": 0.7169, + "step": 200 + }, + { + "epoch": 6.25, + "eval_loss": 0.6881275773048401, + "eval_runtime": 44.9645, + "eval_samples_per_second": 4.448, + "eval_steps_per_second": 0.556, + "step": 200 + }, + { + "epoch": 6.28125, + "grad_norm": 0.5831610447904351, + "learning_rate": 2e-05, + "loss": 0.7394, + "step": 201 + }, + { + "epoch": 6.28125, + "eval_loss": 0.6888833045959473, + "eval_runtime": 45.09, + "eval_samples_per_second": 4.436, + "eval_steps_per_second": 0.554, + "step": 201 + }, + { + "epoch": 6.3125, + "grad_norm": 0.6592966385691889, + "learning_rate": 2e-05, + "loss": 0.6537, + "step": 202 + }, + { + "epoch": 6.3125, + "eval_loss": 0.6880140900611877, + "eval_runtime": 43.2447, + "eval_samples_per_second": 4.625, + "eval_steps_per_second": 0.578, + "step": 202 + }, + { + "epoch": 6.34375, + "grad_norm": 0.558654488415818, + "learning_rate": 2e-05, + "loss": 0.7991, + "step": 203 + }, + { + "epoch": 6.34375, + "eval_loss": 0.6874076724052429, + "eval_runtime": 42.9406, + "eval_samples_per_second": 4.658, + "eval_steps_per_second": 0.582, + "step": 203 + }, + { + "epoch": 6.375, + "grad_norm": 0.6342316949523702, + "learning_rate": 2e-05, + "loss": 0.6403, + "step": 204 + }, + { + "epoch": 6.375, + "eval_loss": 0.6866291761398315, + "eval_runtime": 43.1217, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 0.58, + "step": 204 + }, + { + "epoch": 6.40625, + "grad_norm": 0.544206621558966, + "learning_rate": 2e-05, + "loss": 0.6314, + "step": 205 + }, + { + "epoch": 6.40625, + "eval_loss": 0.6863086223602295, + "eval_runtime": 43.2951, + "eval_samples_per_second": 4.619, + "eval_steps_per_second": 0.577, + "step": 205 + }, + { + "epoch": 6.4375, + "grad_norm": 0.6380097809956626, + "learning_rate": 2e-05, + "loss": 0.6851, + "step": 206 + }, + { + "epoch": 6.4375, + "eval_loss": 0.6859965324401855, + "eval_runtime": 44.9257, + "eval_samples_per_second": 4.452, + "eval_steps_per_second": 0.556, + "step": 206 + }, + { + "epoch": 6.46875, + "grad_norm": 0.5870799307885896, + "learning_rate": 2e-05, + "loss": 0.7367, + "step": 207 + }, + { + "epoch": 6.46875, + "eval_loss": 0.6856269836425781, + "eval_runtime": 44.8384, + "eval_samples_per_second": 4.46, + "eval_steps_per_second": 0.558, + "step": 207 + }, + { + "epoch": 6.5, + "grad_norm": 0.6115022356518031, + "learning_rate": 2e-05, + "loss": 0.6814, + "step": 208 + }, + { + "epoch": 6.5, + "eval_loss": 0.6856591701507568, + "eval_runtime": 42.9528, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 0.582, + "step": 208 + }, + { + "epoch": 6.53125, + "grad_norm": 0.6655918462314045, + "learning_rate": 2e-05, + "loss": 0.657, + "step": 209 + }, + { + "epoch": 6.53125, + "eval_loss": 0.6854197978973389, + "eval_runtime": 43.2366, + "eval_samples_per_second": 4.626, + "eval_steps_per_second": 0.578, + "step": 209 + }, + { + "epoch": 6.5625, + "grad_norm": 0.6102352184035382, + "learning_rate": 2e-05, + "loss": 0.6343, + "step": 210 + }, + { + "epoch": 6.5625, + "eval_loss": 0.6852834820747375, + "eval_runtime": 43.1789, + "eval_samples_per_second": 4.632, + "eval_steps_per_second": 0.579, + "step": 210 + }, + { + "epoch": 6.59375, + "grad_norm": 0.6354143085331753, + "learning_rate": 2e-05, + "loss": 0.6736, + "step": 211 + }, + { + "epoch": 6.59375, + "eval_loss": 0.6851873993873596, + "eval_runtime": 44.5173, + "eval_samples_per_second": 4.493, + "eval_steps_per_second": 0.562, + "step": 211 + }, + { + "epoch": 6.625, + "grad_norm": 0.6069083787831553, + "learning_rate": 2e-05, + "loss": 0.6466, + "step": 212 + }, + { + "epoch": 6.625, + "eval_loss": 0.6846270561218262, + "eval_runtime": 44.7412, + "eval_samples_per_second": 4.47, + "eval_steps_per_second": 0.559, + "step": 212 + }, + { + "epoch": 6.65625, + "grad_norm": 0.5918704953369675, + "learning_rate": 2e-05, + "loss": 0.7174, + "step": 213 + }, + { + "epoch": 6.65625, + "eval_loss": 0.6842523217201233, + "eval_runtime": 46.0503, + "eval_samples_per_second": 4.343, + "eval_steps_per_second": 0.543, + "step": 213 + }, + { + "epoch": 6.6875, + "grad_norm": 0.5824866849171524, + "learning_rate": 2e-05, + "loss": 0.6955, + "step": 214 + }, + { + "epoch": 6.6875, + "eval_loss": 0.6838890314102173, + "eval_runtime": 44.5781, + "eval_samples_per_second": 4.487, + "eval_steps_per_second": 0.561, + "step": 214 + }, + { + "epoch": 6.71875, + "grad_norm": 0.6278777152900226, + "learning_rate": 2e-05, + "loss": 0.6926, + "step": 215 + }, + { + "epoch": 6.71875, + "eval_loss": 0.6827735900878906, + "eval_runtime": 44.483, + "eval_samples_per_second": 4.496, + "eval_steps_per_second": 0.562, + "step": 215 + }, + { + "epoch": 6.75, + "grad_norm": 0.6627082254561003, + "learning_rate": 2e-05, + "loss": 0.6931, + "step": 216 + }, + { + "epoch": 6.75, + "eval_loss": 0.6818405389785767, + "eval_runtime": 46.0477, + "eval_samples_per_second": 4.343, + "eval_steps_per_second": 0.543, + "step": 216 + }, + { + "epoch": 6.78125, + "grad_norm": 0.6551951149808454, + "learning_rate": 2e-05, + "loss": 0.6386, + "step": 217 + }, + { + "epoch": 6.78125, + "eval_loss": 0.6824897527694702, + "eval_runtime": 47.3712, + "eval_samples_per_second": 4.222, + "eval_steps_per_second": 0.528, + "step": 217 + }, + { + "epoch": 6.8125, + "grad_norm": 0.6821330786477059, + "learning_rate": 2e-05, + "loss": 0.635, + "step": 218 + }, + { + "epoch": 6.8125, + "eval_loss": 0.6829469203948975, + "eval_runtime": 46.2003, + "eval_samples_per_second": 4.329, + "eval_steps_per_second": 0.541, + "step": 218 + }, + { + "epoch": 6.84375, + "grad_norm": 0.7440273168609611, + "learning_rate": 2e-05, + "loss": 0.7286, + "step": 219 + }, + { + "epoch": 6.84375, + "eval_loss": 0.6824621558189392, + "eval_runtime": 45.8201, + "eval_samples_per_second": 4.365, + "eval_steps_per_second": 0.546, + "step": 219 + }, + { + "epoch": 6.875, + "grad_norm": 0.7007032012854347, + "learning_rate": 2e-05, + "loss": 0.7376, + "step": 220 + }, + { + "epoch": 6.875, + "eval_loss": 0.6805981397628784, + "eval_runtime": 45.7474, + "eval_samples_per_second": 4.372, + "eval_steps_per_second": 0.546, + "step": 220 + }, + { + "epoch": 6.90625, + "grad_norm": 0.6422764032088494, + "learning_rate": 2e-05, + "loss": 0.6959, + "step": 221 + }, + { + "epoch": 6.90625, + "eval_loss": 0.679237961769104, + "eval_runtime": 48.4646, + "eval_samples_per_second": 4.127, + "eval_steps_per_second": 0.516, + "step": 221 + }, + { + "epoch": 6.9375, + "grad_norm": 0.7159695125034813, + "learning_rate": 2e-05, + "loss": 0.6894, + "step": 222 + }, + { + "epoch": 6.9375, + "eval_loss": 0.6775233745574951, + "eval_runtime": 47.4563, + "eval_samples_per_second": 4.214, + "eval_steps_per_second": 0.527, + "step": 222 + }, + { + "epoch": 6.96875, + "grad_norm": 0.6358380926544867, + "learning_rate": 2e-05, + "loss": 0.7073, + "step": 223 + }, + { + "epoch": 6.96875, + "eval_loss": 0.6766613721847534, + "eval_runtime": 47.4483, + "eval_samples_per_second": 4.215, + "eval_steps_per_second": 0.527, + "step": 223 + }, + { + "epoch": 7.0, + "grad_norm": 0.6716901613635139, + "learning_rate": 2e-05, + "loss": 0.76, + "step": 224 + }, + { + "epoch": 7.0, + "eval_loss": 0.6770586371421814, + "eval_runtime": 47.0209, + "eval_samples_per_second": 4.253, + "eval_steps_per_second": 0.532, + "step": 224 + }, + { + "epoch": 7.03125, + "grad_norm": 0.5953096184448028, + "learning_rate": 2e-05, + "loss": 0.6798, + "step": 225 + }, + { + "epoch": 7.03125, + "eval_loss": 0.6774635314941406, + "eval_runtime": 51.4624, + "eval_samples_per_second": 3.886, + "eval_steps_per_second": 0.486, + "step": 225 + }, + { + "epoch": 7.0625, + "grad_norm": 0.6549589081607252, + "learning_rate": 2e-05, + "loss": 0.6122, + "step": 226 + }, + { + "epoch": 7.0625, + "eval_loss": 0.6784033179283142, + "eval_runtime": 45.8732, + "eval_samples_per_second": 4.36, + "eval_steps_per_second": 0.545, + "step": 226 + }, + { + "epoch": 7.09375, + "grad_norm": 0.6573259751745981, + "learning_rate": 2e-05, + "loss": 0.6829, + "step": 227 + }, + { + "epoch": 7.09375, + "eval_loss": 0.6796069145202637, + "eval_runtime": 44.2994, + "eval_samples_per_second": 4.515, + "eval_steps_per_second": 0.564, + "step": 227 + }, + { + "epoch": 7.125, + "grad_norm": 0.725599779122791, + "learning_rate": 2e-05, + "loss": 0.6336, + "step": 228 + }, + { + "epoch": 7.125, + "eval_loss": 0.681220531463623, + "eval_runtime": 45.7641, + "eval_samples_per_second": 4.37, + "eval_steps_per_second": 0.546, + "step": 228 + }, + { + "epoch": 7.15625, + "grad_norm": 0.7811517272176121, + "learning_rate": 2e-05, + "loss": 0.6387, + "step": 229 + }, + { + "epoch": 7.15625, + "eval_loss": 0.6828885674476624, + "eval_runtime": 44.7953, + "eval_samples_per_second": 4.465, + "eval_steps_per_second": 0.558, + "step": 229 + }, + { + "epoch": 7.1875, + "grad_norm": 0.6760384395465522, + "learning_rate": 2e-05, + "loss": 0.6245, + "step": 230 + }, + { + "epoch": 7.1875, + "eval_loss": 0.6845852732658386, + "eval_runtime": 44.3812, + "eval_samples_per_second": 4.506, + "eval_steps_per_second": 0.563, + "step": 230 + }, + { + "epoch": 7.21875, + "grad_norm": 0.7361186814868562, + "learning_rate": 2e-05, + "loss": 0.7128, + "step": 231 + }, + { + "epoch": 7.21875, + "eval_loss": 0.685402512550354, + "eval_runtime": 44.3763, + "eval_samples_per_second": 4.507, + "eval_steps_per_second": 0.563, + "step": 231 + }, + { + "epoch": 7.25, + "grad_norm": 0.7299978196751681, + "learning_rate": 2e-05, + "loss": 0.7176, + "step": 232 + }, + { + "epoch": 7.25, + "eval_loss": 0.685026228427887, + "eval_runtime": 44.3181, + "eval_samples_per_second": 4.513, + "eval_steps_per_second": 0.564, + "step": 232 + }, + { + "epoch": 7.28125, + "grad_norm": 0.8584091654553072, + "learning_rate": 2e-05, + "loss": 0.6653, + "step": 233 + }, + { + "epoch": 7.28125, + "eval_loss": 0.6831257343292236, + "eval_runtime": 44.3805, + "eval_samples_per_second": 4.506, + "eval_steps_per_second": 0.563, + "step": 233 + }, + { + "epoch": 7.3125, + "grad_norm": 0.6919046534495772, + "learning_rate": 2e-05, + "loss": 0.6968, + "step": 234 + }, + { + "epoch": 7.3125, + "eval_loss": 0.6820144653320312, + "eval_runtime": 44.3397, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 0.564, + "step": 234 + }, + { + "epoch": 7.34375, + "grad_norm": 0.6716381808914595, + "learning_rate": 2e-05, + "loss": 0.6626, + "step": 235 + }, + { + "epoch": 7.34375, + "eval_loss": 0.6815916299819946, + "eval_runtime": 44.2997, + "eval_samples_per_second": 4.515, + "eval_steps_per_second": 0.564, + "step": 235 + }, + { + "epoch": 7.375, + "grad_norm": 0.7098466238055623, + "learning_rate": 2e-05, + "loss": 0.629, + "step": 236 + }, + { + "epoch": 7.375, + "eval_loss": 0.681601881980896, + "eval_runtime": 44.2722, + "eval_samples_per_second": 4.518, + "eval_steps_per_second": 0.565, + "step": 236 + }, + { + "epoch": 7.40625, + "grad_norm": 0.7700763843474521, + "learning_rate": 2e-05, + "loss": 0.6796, + "step": 237 + }, + { + "epoch": 7.40625, + "eval_loss": 0.6809589862823486, + "eval_runtime": 44.4518, + "eval_samples_per_second": 4.499, + "eval_steps_per_second": 0.562, + "step": 237 + }, + { + "epoch": 7.4375, + "grad_norm": 0.7925088234539602, + "learning_rate": 2e-05, + "loss": 0.6722, + "step": 238 + }, + { + "epoch": 7.4375, + "eval_loss": 0.6801493763923645, + "eval_runtime": 44.4078, + "eval_samples_per_second": 4.504, + "eval_steps_per_second": 0.563, + "step": 238 + }, + { + "epoch": 7.46875, + "grad_norm": 0.6778717561377235, + "learning_rate": 2e-05, + "loss": 0.6889, + "step": 239 + }, + { + "epoch": 7.46875, + "eval_loss": 0.6798510551452637, + "eval_runtime": 44.3303, + "eval_samples_per_second": 4.512, + "eval_steps_per_second": 0.564, + "step": 239 + }, + { + "epoch": 7.5, + "grad_norm": 0.6683599876699755, + "learning_rate": 2e-05, + "loss": 0.6383, + "step": 240 + }, + { + "epoch": 7.5, + "eval_loss": 0.6800721883773804, + "eval_runtime": 44.5868, + "eval_samples_per_second": 4.486, + "eval_steps_per_second": 0.561, + "step": 240 + }, + { + "epoch": 7.53125, + "grad_norm": 0.6242371910913779, + "learning_rate": 2e-05, + "loss": 0.6809, + "step": 241 + }, + { + "epoch": 7.53125, + "eval_loss": 0.6809727549552917, + "eval_runtime": 44.7112, + "eval_samples_per_second": 4.473, + "eval_steps_per_second": 0.559, + "step": 241 + }, + { + "epoch": 7.5625, + "grad_norm": 0.6966989602775038, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 242 + }, + { + "epoch": 7.5625, + "eval_loss": 0.6819994449615479, + "eval_runtime": 44.3272, + "eval_samples_per_second": 4.512, + "eval_steps_per_second": 0.564, + "step": 242 + }, + { + "epoch": 7.59375, + "grad_norm": 0.7373050917062219, + "learning_rate": 2e-05, + "loss": 0.6622, + "step": 243 + }, + { + "epoch": 7.59375, + "eval_loss": 0.6821829080581665, + "eval_runtime": 46.0527, + "eval_samples_per_second": 4.343, + "eval_steps_per_second": 0.543, + "step": 243 + }, + { + "epoch": 7.625, + "grad_norm": 0.8266617785650243, + "learning_rate": 2e-05, + "loss": 0.7248, + "step": 244 + }, + { + "epoch": 7.625, + "eval_loss": 0.6813778877258301, + "eval_runtime": 45.7663, + "eval_samples_per_second": 4.37, + "eval_steps_per_second": 0.546, + "step": 244 + }, + { + "epoch": 7.65625, + "grad_norm": 0.7459146574284048, + "learning_rate": 2e-05, + "loss": 0.6301, + "step": 245 + }, + { + "epoch": 7.65625, + "eval_loss": 0.6811490654945374, + "eval_runtime": 45.9361, + "eval_samples_per_second": 4.354, + "eval_steps_per_second": 0.544, + "step": 245 + }, + { + "epoch": 7.6875, + "grad_norm": 0.7612602223178182, + "learning_rate": 2e-05, + "loss": 0.6713, + "step": 246 + }, + { + "epoch": 7.6875, + "eval_loss": 0.6800392866134644, + "eval_runtime": 46.3359, + "eval_samples_per_second": 4.316, + "eval_steps_per_second": 0.54, + "step": 246 + }, + { + "epoch": 7.71875, + "grad_norm": 0.7391445622441601, + "learning_rate": 2e-05, + "loss": 0.6721, + "step": 247 + }, + { + "epoch": 7.71875, + "eval_loss": 0.6794085502624512, + "eval_runtime": 46.9877, + "eval_samples_per_second": 4.256, + "eval_steps_per_second": 0.532, + "step": 247 + }, + { + "epoch": 7.75, + "grad_norm": 0.7019243161207622, + "learning_rate": 2e-05, + "loss": 0.6578, + "step": 248 + }, + { + "epoch": 7.75, + "eval_loss": 0.6786046624183655, + "eval_runtime": 46.0364, + "eval_samples_per_second": 4.344, + "eval_steps_per_second": 0.543, + "step": 248 + }, + { + "epoch": 7.78125, + "grad_norm": 0.7933438921741315, + "learning_rate": 2e-05, + "loss": 0.7023, + "step": 249 + }, + { + "epoch": 7.78125, + "eval_loss": 0.6770951747894287, + "eval_runtime": 45.6655, + "eval_samples_per_second": 4.38, + "eval_steps_per_second": 0.547, + "step": 249 + }, + { + "epoch": 7.8125, + "grad_norm": 0.7313927502966258, + "learning_rate": 2e-05, + "loss": 0.7114, + "step": 250 + }, + { + "epoch": 7.8125, + "eval_loss": 0.6766157746315002, + "eval_runtime": 45.7602, + "eval_samples_per_second": 4.371, + "eval_steps_per_second": 0.546, + "step": 250 + }, + { + "epoch": 7.84375, + "grad_norm": 0.7235467321597684, + "learning_rate": 2e-05, + "loss": 0.6259, + "step": 251 + }, + { + "epoch": 7.84375, + "eval_loss": 0.6770395040512085, + "eval_runtime": 46.9839, + "eval_samples_per_second": 4.257, + "eval_steps_per_second": 0.532, + "step": 251 + }, + { + "epoch": 7.875, + "grad_norm": 0.773244621810685, + "learning_rate": 2e-05, + "loss": 0.6262, + "step": 252 + }, + { + "epoch": 7.875, + "eval_loss": 0.6780049800872803, + "eval_runtime": 46.9808, + "eval_samples_per_second": 4.257, + "eval_steps_per_second": 0.532, + "step": 252 + }, + { + "epoch": 7.90625, + "grad_norm": 0.7620627775664955, + "learning_rate": 2e-05, + "loss": 0.7219, + "step": 253 + }, + { + "epoch": 7.90625, + "eval_loss": 0.6781153678894043, + "eval_runtime": 49.7208, + "eval_samples_per_second": 4.022, + "eval_steps_per_second": 0.503, + "step": 253 + }, + { + "epoch": 7.9375, + "grad_norm": 0.7332381519045823, + "learning_rate": 2e-05, + "loss": 0.6777, + "step": 254 + }, + { + "epoch": 7.9375, + "eval_loss": 0.6787923574447632, + "eval_runtime": 43.1001, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 0.58, + "step": 254 + }, + { + "epoch": 7.96875, + "grad_norm": 0.7847956878083815, + "learning_rate": 2e-05, + "loss": 0.5983, + "step": 255 + }, + { + "epoch": 7.96875, + "eval_loss": 0.6779956817626953, + "eval_runtime": 43.1273, + "eval_samples_per_second": 4.637, + "eval_steps_per_second": 0.58, + "step": 255 + }, + { + "epoch": 8.0, + "grad_norm": 0.7095399891563587, + "learning_rate": 2e-05, + "loss": 0.6609, + "step": 256 + }, + { + "epoch": 8.0, + "eval_loss": 0.677204430103302, + "eval_runtime": 43.0632, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 0.581, + "step": 256 + }, + { + "epoch": 8.03125, + "grad_norm": 0.7654004838243704, + "learning_rate": 2e-05, + "loss": 0.6297, + "step": 257 + }, + { + "epoch": 8.03125, + "eval_loss": 0.6774580478668213, + "eval_runtime": 50.3948, + "eval_samples_per_second": 3.969, + "eval_steps_per_second": 0.496, + "step": 257 + }, + { + "epoch": 8.0625, + "grad_norm": 0.7337064337590912, + "learning_rate": 2e-05, + "loss": 0.6446, + "step": 258 + }, + { + "epoch": 8.0625, + "eval_loss": 0.6788855195045471, + "eval_runtime": 43.163, + "eval_samples_per_second": 4.634, + "eval_steps_per_second": 0.579, + "step": 258 + }, + { + "epoch": 8.09375, + "grad_norm": 0.7426969285671609, + "learning_rate": 2e-05, + "loss": 0.6732, + "step": 259 + }, + { + "epoch": 8.09375, + "eval_loss": 0.6811656355857849, + "eval_runtime": 43.113, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 0.58, + "step": 259 + }, + { + "epoch": 8.125, + "grad_norm": 0.8495552484217858, + "learning_rate": 2e-05, + "loss": 0.6857, + "step": 260 + }, + { + "epoch": 8.125, + "eval_loss": 0.6831929683685303, + "eval_runtime": 43.0506, + "eval_samples_per_second": 4.646, + "eval_steps_per_second": 0.581, + "step": 260 + }, + { + "epoch": 8.15625, + "grad_norm": 0.8137654207236353, + "learning_rate": 2e-05, + "loss": 0.6076, + "step": 261 + }, + { + "epoch": 8.15625, + "eval_loss": 0.685956597328186, + "eval_runtime": 43.6958, + "eval_samples_per_second": 4.577, + "eval_steps_per_second": 0.572, + "step": 261 + }, + { + "epoch": 8.1875, + "grad_norm": 0.7920289131050305, + "learning_rate": 2e-05, + "loss": 0.723, + "step": 262 + }, + { + "epoch": 8.1875, + "eval_loss": 0.6895143389701843, + "eval_runtime": 44.5485, + "eval_samples_per_second": 4.489, + "eval_steps_per_second": 0.561, + "step": 262 + }, + { + "epoch": 8.21875, + "grad_norm": 0.9058951636873679, + "learning_rate": 2e-05, + "loss": 0.5836, + "step": 263 + }, + { + "epoch": 8.21875, + "eval_loss": 0.6920652985572815, + "eval_runtime": 43.2986, + "eval_samples_per_second": 4.619, + "eval_steps_per_second": 0.577, + "step": 263 + }, + { + "epoch": 8.25, + "grad_norm": 0.8945234539908303, + "learning_rate": 2e-05, + "loss": 0.6484, + "step": 264 + }, + { + "epoch": 8.25, + "eval_loss": 0.69307541847229, + "eval_runtime": 43.0302, + "eval_samples_per_second": 4.648, + "eval_steps_per_second": 0.581, + "step": 264 + }, + { + "epoch": 8.28125, + "grad_norm": 0.9973855113532047, + "learning_rate": 2e-05, + "loss": 0.6735, + "step": 265 + }, + { + "epoch": 8.28125, + "eval_loss": 0.6918882727622986, + "eval_runtime": 43.035, + "eval_samples_per_second": 4.647, + "eval_steps_per_second": 0.581, + "step": 265 + }, + { + "epoch": 8.3125, + "grad_norm": 0.8604633375599925, + "learning_rate": 2e-05, + "loss": 0.6618, + "step": 266 + }, + { + "epoch": 8.3125, + "eval_loss": 0.6895372867584229, + "eval_runtime": 43.31, + "eval_samples_per_second": 4.618, + "eval_steps_per_second": 0.577, + "step": 266 + }, + { + "epoch": 8.34375, + "grad_norm": 0.8414418828391491, + "learning_rate": 2e-05, + "loss": 0.5879, + "step": 267 + }, + { + "epoch": 8.34375, + "eval_loss": 0.687466561794281, + "eval_runtime": 43.1943, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 0.579, + "step": 267 + }, + { + "epoch": 8.375, + "grad_norm": 0.9186307751895403, + "learning_rate": 2e-05, + "loss": 0.6488, + "step": 268 + }, + { + "epoch": 8.375, + "eval_loss": 0.6843683123588562, + "eval_runtime": 43.0073, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 0.581, + "step": 268 + }, + { + "epoch": 8.40625, + "grad_norm": 0.8308076771594943, + "learning_rate": 2e-05, + "loss": 0.6357, + "step": 269 + }, + { + "epoch": 8.40625, + "eval_loss": 0.6821109056472778, + "eval_runtime": 43.2217, + "eval_samples_per_second": 4.627, + "eval_steps_per_second": 0.578, + "step": 269 + }, + { + "epoch": 8.4375, + "grad_norm": 0.7743250830620387, + "learning_rate": 2e-05, + "loss": 0.6408, + "step": 270 + }, + { + "epoch": 8.4375, + "eval_loss": 0.6811809539794922, + "eval_runtime": 44.8789, + "eval_samples_per_second": 4.456, + "eval_steps_per_second": 0.557, + "step": 270 + }, + { + "epoch": 8.46875, + "grad_norm": 0.8351441656367814, + "learning_rate": 2e-05, + "loss": 0.5387, + "step": 271 + }, + { + "epoch": 8.46875, + "eval_loss": 0.6824797987937927, + "eval_runtime": 43.2723, + "eval_samples_per_second": 4.622, + "eval_steps_per_second": 0.578, + "step": 271 + }, + { + "epoch": 8.5, + "grad_norm": 0.7808346216305826, + "learning_rate": 2e-05, + "loss": 0.637, + "step": 272 + }, + { + "epoch": 8.5, + "eval_loss": 0.6853922009468079, + "eval_runtime": 43.1091, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 0.58, + "step": 272 + }, + { + "epoch": 8.53125, + "grad_norm": 0.8566382439854656, + "learning_rate": 2e-05, + "loss": 0.6524, + "step": 273 + }, + { + "epoch": 8.53125, + "eval_loss": 0.6853267550468445, + "eval_runtime": 43.1515, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 0.579, + "step": 273 + }, + { + "epoch": 8.5625, + "grad_norm": 0.872427052560813, + "learning_rate": 2e-05, + "loss": 0.6393, + "step": 274 + }, + { + "epoch": 8.5625, + "eval_loss": 0.6836146712303162, + "eval_runtime": 44.9084, + "eval_samples_per_second": 4.454, + "eval_steps_per_second": 0.557, + "step": 274 + }, + { + "epoch": 8.59375, + "grad_norm": 0.8437899827314175, + "learning_rate": 2e-05, + "loss": 0.6506, + "step": 275 + }, + { + "epoch": 8.59375, + "eval_loss": 0.6817864179611206, + "eval_runtime": 44.8879, + "eval_samples_per_second": 4.456, + "eval_steps_per_second": 0.557, + "step": 275 + }, + { + "epoch": 8.625, + "grad_norm": 0.8790612317241222, + "learning_rate": 2e-05, + "loss": 0.6442, + "step": 276 + }, + { + "epoch": 8.625, + "eval_loss": 0.6796035766601562, + "eval_runtime": 43.3127, + "eval_samples_per_second": 4.618, + "eval_steps_per_second": 0.577, + "step": 276 + }, + { + "epoch": 8.65625, + "grad_norm": 0.8158092597576191, + "learning_rate": 2e-05, + "loss": 0.5893, + "step": 277 + }, + { + "epoch": 8.65625, + "eval_loss": 0.6795459985733032, + "eval_runtime": 44.6925, + "eval_samples_per_second": 4.475, + "eval_steps_per_second": 0.559, + "step": 277 + }, + { + "epoch": 8.6875, + "grad_norm": 0.878065597316925, + "learning_rate": 2e-05, + "loss": 0.6418, + "step": 278 + }, + { + "epoch": 8.6875, + "eval_loss": 0.6804844737052917, + "eval_runtime": 43.1496, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 0.579, + "step": 278 + }, + { + "epoch": 8.71875, + "grad_norm": 0.8184085366861941, + "learning_rate": 2e-05, + "loss": 0.6007, + "step": 279 + }, + { + "epoch": 8.71875, + "eval_loss": 0.6821385025978088, + "eval_runtime": 44.9156, + "eval_samples_per_second": 4.453, + "eval_steps_per_second": 0.557, + "step": 279 + }, + { + "epoch": 8.75, + "grad_norm": 0.9005368790411379, + "learning_rate": 2e-05, + "loss": 0.6683, + "step": 280 + }, + { + "epoch": 8.75, + "eval_loss": 0.6848174333572388, + "eval_runtime": 43.544, + "eval_samples_per_second": 4.593, + "eval_steps_per_second": 0.574, + "step": 280 + }, + { + "epoch": 8.78125, + "grad_norm": 0.8154265443661354, + "learning_rate": 2e-05, + "loss": 0.612, + "step": 281 + }, + { + "epoch": 8.78125, + "eval_loss": 0.6864734888076782, + "eval_runtime": 44.5814, + "eval_samples_per_second": 4.486, + "eval_steps_per_second": 0.561, + "step": 281 + }, + { + "epoch": 8.8125, + "grad_norm": 0.8905054313305548, + "learning_rate": 2e-05, + "loss": 0.5992, + "step": 282 + }, + { + "epoch": 8.8125, + "eval_loss": 0.6864038109779358, + "eval_runtime": 44.0748, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.567, + "step": 282 + }, + { + "epoch": 8.84375, + "grad_norm": 0.8492838619646935, + "learning_rate": 2e-05, + "loss": 0.5775, + "step": 283 + }, + { + "epoch": 8.84375, + "eval_loss": 0.686205267906189, + "eval_runtime": 44.1813, + "eval_samples_per_second": 4.527, + "eval_steps_per_second": 0.566, + "step": 283 + }, + { + "epoch": 8.875, + "grad_norm": 0.8739982729224768, + "learning_rate": 2e-05, + "loss": 0.5447, + "step": 284 + }, + { + "epoch": 8.875, + "eval_loss": 0.6865501403808594, + "eval_runtime": 46.5428, + "eval_samples_per_second": 4.297, + "eval_steps_per_second": 0.537, + "step": 284 + }, + { + "epoch": 8.90625, + "grad_norm": 0.9936570525936491, + "learning_rate": 2e-05, + "loss": 0.59, + "step": 285 + }, + { + "epoch": 8.90625, + "eval_loss": 0.6856868267059326, + "eval_runtime": 44.6352, + "eval_samples_per_second": 4.481, + "eval_steps_per_second": 0.56, + "step": 285 + }, + { + "epoch": 8.9375, + "grad_norm": 0.9517307402112732, + "learning_rate": 2e-05, + "loss": 0.7253, + "step": 286 + }, + { + "epoch": 8.9375, + "eval_loss": 0.6847086548805237, + "eval_runtime": 47.1289, + "eval_samples_per_second": 4.244, + "eval_steps_per_second": 0.53, + "step": 286 + }, + { + "epoch": 8.96875, + "grad_norm": 0.8541430299481336, + "learning_rate": 2e-05, + "loss": 0.6436, + "step": 287 + }, + { + "epoch": 8.96875, + "eval_loss": 0.6847487092018127, + "eval_runtime": 46.395, + "eval_samples_per_second": 4.311, + "eval_steps_per_second": 0.539, + "step": 287 + }, + { + "epoch": 9.0, + "grad_norm": 0.9356185152979635, + "learning_rate": 2e-05, + "loss": 0.5919, + "step": 288 + }, + { + "epoch": 9.0, + "eval_loss": 0.6830996870994568, + "eval_runtime": 45.0389, + "eval_samples_per_second": 4.441, + "eval_steps_per_second": 0.555, + "step": 288 + }, + { + "epoch": 9.03125, + "grad_norm": 0.895841912664687, + "learning_rate": 2e-05, + "loss": 0.6074, + "step": 289 + }, + { + "epoch": 9.03125, + "eval_loss": 0.6805940866470337, + "eval_runtime": 43.275, + "eval_samples_per_second": 4.622, + "eval_steps_per_second": 0.578, + "step": 289 + }, + { + "epoch": 9.0625, + "grad_norm": 0.8181374187415763, + "learning_rate": 2e-05, + "loss": 0.6233, + "step": 290 + }, + { + "epoch": 9.0625, + "eval_loss": 0.679899275302887, + "eval_runtime": 43.4137, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 0.576, + "step": 290 + }, + { + "epoch": 9.09375, + "grad_norm": 0.8491986564498026, + "learning_rate": 2e-05, + "loss": 0.6262, + "step": 291 + }, + { + "epoch": 9.09375, + "eval_loss": 0.682360053062439, + "eval_runtime": 45.2147, + "eval_samples_per_second": 4.423, + "eval_steps_per_second": 0.553, + "step": 291 + }, + { + "epoch": 9.125, + "grad_norm": 0.9355368723165358, + "learning_rate": 2e-05, + "loss": 0.595, + "step": 292 + }, + { + "epoch": 9.125, + "eval_loss": 0.6852359175682068, + "eval_runtime": 44.335, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 0.564, + "step": 292 + }, + { + "epoch": 9.15625, + "grad_norm": 0.9196086439363605, + "learning_rate": 2e-05, + "loss": 0.6534, + "step": 293 + }, + { + "epoch": 9.15625, + "eval_loss": 0.6872662305831909, + "eval_runtime": 46.5007, + "eval_samples_per_second": 4.301, + "eval_steps_per_second": 0.538, + "step": 293 + }, + { + "epoch": 9.1875, + "grad_norm": 0.8393737542433595, + "learning_rate": 2e-05, + "loss": 0.5908, + "step": 294 + }, + { + "epoch": 9.1875, + "eval_loss": 0.6902926564216614, + "eval_runtime": 46.1133, + "eval_samples_per_second": 4.337, + "eval_steps_per_second": 0.542, + "step": 294 + }, + { + "epoch": 9.21875, + "grad_norm": 1.00507877022181, + "learning_rate": 2e-05, + "loss": 0.536, + "step": 295 + }, + { + "epoch": 9.21875, + "eval_loss": 0.6969813704490662, + "eval_runtime": 45.209, + "eval_samples_per_second": 4.424, + "eval_steps_per_second": 0.553, + "step": 295 + }, + { + "epoch": 9.25, + "grad_norm": 0.9241164807887086, + "learning_rate": 2e-05, + "loss": 0.5562, + "step": 296 + }, + { + "epoch": 9.25, + "eval_loss": 0.7055781483650208, + "eval_runtime": 44.1347, + "eval_samples_per_second": 4.532, + "eval_steps_per_second": 0.566, + "step": 296 + }, + { + "epoch": 9.28125, + "grad_norm": 1.085449108925152, + "learning_rate": 2e-05, + "loss": 0.6582, + "step": 297 + }, + { + "epoch": 9.28125, + "eval_loss": 0.7090529799461365, + "eval_runtime": 46.0924, + "eval_samples_per_second": 4.339, + "eval_steps_per_second": 0.542, + "step": 297 + }, + { + "epoch": 9.3125, + "grad_norm": 1.2857794830276748, + "learning_rate": 2e-05, + "loss": 0.5942, + "step": 298 + }, + { + "epoch": 9.3125, + "eval_loss": 0.7092991471290588, + "eval_runtime": 45.9455, + "eval_samples_per_second": 4.353, + "eval_steps_per_second": 0.544, + "step": 298 + }, + { + "epoch": 9.34375, + "grad_norm": 1.1012657793973455, + "learning_rate": 2e-05, + "loss": 0.5681, + "step": 299 + }, + { + "epoch": 9.34375, + "eval_loss": 0.7078263759613037, + "eval_runtime": 44.3361, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 0.564, + "step": 299 + }, + { + "epoch": 9.375, + "grad_norm": 1.0150133491916107, + "learning_rate": 2e-05, + "loss": 0.5829, + "step": 300 + }, + { + "epoch": 9.375, + "eval_loss": 0.7039945721626282, + "eval_runtime": 46.0368, + "eval_samples_per_second": 4.344, + "eval_steps_per_second": 0.543, + "step": 300 + }, + { + "epoch": 9.40625, + "grad_norm": 1.0183449928898174, + "learning_rate": 2e-05, + "loss": 0.5622, + "step": 301 + }, + { + "epoch": 9.40625, + "eval_loss": 0.6970013380050659, + "eval_runtime": 44.3071, + "eval_samples_per_second": 4.514, + "eval_steps_per_second": 0.564, + "step": 301 + }, + { + "epoch": 9.4375, + "grad_norm": 1.160561076731859, + "learning_rate": 2e-05, + "loss": 0.6207, + "step": 302 + }, + { + "epoch": 9.4375, + "eval_loss": 0.6882898211479187, + "eval_runtime": 44.3423, + "eval_samples_per_second": 4.51, + "eval_steps_per_second": 0.564, + "step": 302 + }, + { + "epoch": 9.46875, + "grad_norm": 0.9775130871533282, + "learning_rate": 2e-05, + "loss": 0.6121, + "step": 303 + }, + { + "epoch": 9.46875, + "eval_loss": 0.6842953562736511, + "eval_runtime": 45.0998, + "eval_samples_per_second": 4.435, + "eval_steps_per_second": 0.554, + "step": 303 + }, + { + "epoch": 9.5, + "grad_norm": 0.8440645832373606, + "learning_rate": 2e-05, + "loss": 0.6495, + "step": 304 + }, + { + "epoch": 9.5, + "eval_loss": 0.6841378808021545, + "eval_runtime": 44.4679, + "eval_samples_per_second": 4.498, + "eval_steps_per_second": 0.562, + "step": 304 + }, + { + "epoch": 9.53125, + "grad_norm": 0.9112261594523882, + "learning_rate": 2e-05, + "loss": 0.6188, + "step": 305 + }, + { + "epoch": 9.53125, + "eval_loss": 0.6845135688781738, + "eval_runtime": 44.4427, + "eval_samples_per_second": 4.5, + "eval_steps_per_second": 0.563, + "step": 305 + }, + { + "epoch": 9.5625, + "grad_norm": 1.0253409237396724, + "learning_rate": 2e-05, + "loss": 0.602, + "step": 306 + }, + { + "epoch": 9.5625, + "eval_loss": 0.6839584112167358, + "eval_runtime": 44.1975, + "eval_samples_per_second": 4.525, + "eval_steps_per_second": 0.566, + "step": 306 + }, + { + "epoch": 9.59375, + "grad_norm": 1.0395385110757185, + "learning_rate": 2e-05, + "loss": 0.6007, + "step": 307 + }, + { + "epoch": 9.59375, + "eval_loss": 0.6852008104324341, + "eval_runtime": 44.4015, + "eval_samples_per_second": 4.504, + "eval_steps_per_second": 0.563, + "step": 307 + }, + { + "epoch": 9.625, + "grad_norm": 0.9468230481893222, + "learning_rate": 2e-05, + "loss": 0.6376, + "step": 308 + }, + { + "epoch": 9.625, + "eval_loss": 0.6902636885643005, + "eval_runtime": 45.6849, + "eval_samples_per_second": 4.378, + "eval_steps_per_second": 0.547, + "step": 308 + }, + { + "epoch": 9.65625, + "grad_norm": 0.9298141136824676, + "learning_rate": 2e-05, + "loss": 0.6094, + "step": 309 + }, + { + "epoch": 9.65625, + "eval_loss": 0.6970698833465576, + "eval_runtime": 44.2879, + "eval_samples_per_second": 4.516, + "eval_steps_per_second": 0.564, + "step": 309 + }, + { + "epoch": 9.6875, + "grad_norm": 1.2537810836544294, + "learning_rate": 2e-05, + "loss": 0.6049, + "step": 310 + }, + { + "epoch": 9.6875, + "eval_loss": 0.6991828083992004, + "eval_runtime": 46.2429, + "eval_samples_per_second": 4.325, + "eval_steps_per_second": 0.541, + "step": 310 + }, + { + "epoch": 9.71875, + "grad_norm": 1.082420692181638, + "learning_rate": 2e-05, + "loss": 0.5241, + "step": 311 + }, + { + "epoch": 9.71875, + "eval_loss": 0.7002778649330139, + "eval_runtime": 44.2468, + "eval_samples_per_second": 4.52, + "eval_steps_per_second": 0.565, + "step": 311 + }, + { + "epoch": 9.75, + "grad_norm": 1.0383910110357883, + "learning_rate": 2e-05, + "loss": 0.6162, + "step": 312 + }, + { + "epoch": 9.75, + "eval_loss": 0.7004844546318054, + "eval_runtime": 44.357, + "eval_samples_per_second": 4.509, + "eval_steps_per_second": 0.564, + "step": 312 + }, + { + "epoch": 9.78125, + "grad_norm": 0.9375392905585037, + "learning_rate": 2e-05, + "loss": 0.6082, + "step": 313 + }, + { + "epoch": 9.78125, + "eval_loss": 0.6998957991600037, + "eval_runtime": 44.3911, + "eval_samples_per_second": 4.505, + "eval_steps_per_second": 0.563, + "step": 313 + }, + { + "epoch": 9.8125, + "grad_norm": 1.080227501802435, + "learning_rate": 2e-05, + "loss": 0.5826, + "step": 314 + }, + { + "epoch": 9.8125, + "eval_loss": 0.698168158531189, + "eval_runtime": 44.2481, + "eval_samples_per_second": 4.52, + "eval_steps_per_second": 0.565, + "step": 314 + }, + { + "epoch": 9.84375, + "grad_norm": 0.9707388919250783, + "learning_rate": 2e-05, + "loss": 0.61, + "step": 315 + }, + { + "epoch": 9.84375, + "eval_loss": 0.6951956152915955, + "eval_runtime": 44.4353, + "eval_samples_per_second": 4.501, + "eval_steps_per_second": 0.563, + "step": 315 + }, + { + "epoch": 9.875, + "grad_norm": 0.9491238644745222, + "learning_rate": 2e-05, + "loss": 0.5957, + "step": 316 + }, + { + "epoch": 9.875, + "eval_loss": 0.6926063299179077, + "eval_runtime": 45.2893, + "eval_samples_per_second": 4.416, + "eval_steps_per_second": 0.552, + "step": 316 + }, + { + "epoch": 9.90625, + "grad_norm": 1.0530872213679219, + "learning_rate": 2e-05, + "loss": 0.5611, + "step": 317 + }, + { + "epoch": 9.90625, + "eval_loss": 0.6899718642234802, + "eval_runtime": 45.2963, + "eval_samples_per_second": 4.415, + "eval_steps_per_second": 0.552, + "step": 317 + }, + { + "epoch": 9.9375, + "grad_norm": 1.0052684640770637, + "learning_rate": 2e-05, + "loss": 0.5838, + "step": 318 + }, + { + "epoch": 9.9375, + "eval_loss": 0.6875657439231873, + "eval_runtime": 45.4935, + "eval_samples_per_second": 4.396, + "eval_steps_per_second": 0.55, + "step": 318 + }, + { + "epoch": 9.96875, + "grad_norm": 1.1010229534386275, + "learning_rate": 2e-05, + "loss": 0.6106, + "step": 319 + }, + { + "epoch": 9.96875, + "eval_loss": 0.6842039227485657, + "eval_runtime": 46.6347, + "eval_samples_per_second": 4.289, + "eval_steps_per_second": 0.536, + "step": 319 + }, + { + "epoch": 10.0, + "grad_norm": 1.0125768255592298, + "learning_rate": 2e-05, + "loss": 0.5698, + "step": 320 + }, + { + "epoch": 10.0, + "eval_loss": 0.6834940910339355, + "eval_runtime": 45.1875, + "eval_samples_per_second": 4.426, + "eval_steps_per_second": 0.553, + "step": 320 + } + ], + "logging_steps": 1.0, + "max_steps": 320, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 414794833330176.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}