diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18016 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.0, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 1e-06, + "loss": 1.3946, + "step": 20 + }, + { + "epoch": 0.01, + "eval_accuracy": 0.24016666666666667, + "eval_loss": 1.3855726718902588, + "eval_runtime": 15.834, + "eval_samples_per_second": 1515.727, + "eval_steps_per_second": 4.737, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 1e-06, + "loss": 1.3801, + "step": 40 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.2665416666666667, + "eval_loss": 1.3742554187774658, + "eval_runtime": 15.8222, + "eval_samples_per_second": 1516.855, + "eval_steps_per_second": 4.74, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 1e-06, + "loss": 1.37, + "step": 60 + }, + { + "epoch": 0.03, + "eval_accuracy": 0.299875, + "eval_loss": 1.362697958946228, + "eval_runtime": 15.4839, + "eval_samples_per_second": 1549.994, + "eval_steps_per_second": 4.844, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 1e-06, + "loss": 1.3559, + "step": 80 + }, + { + "epoch": 0.03, + "eval_accuracy": 0.3585, + "eval_loss": 1.3499468564987183, + "eval_runtime": 15.9453, + "eval_samples_per_second": 1505.143, + "eval_steps_per_second": 4.704, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 1e-06, + "loss": 1.3425, + "step": 100 + }, + { + "epoch": 0.04, + "eval_accuracy": 0.43916666666666665, + "eval_loss": 1.3347731828689575, + "eval_runtime": 15.6185, + "eval_samples_per_second": 1536.635, + "eval_steps_per_second": 4.802, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 1e-06, + "loss": 1.326, + "step": 120 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.5287083333333333, + "eval_loss": 1.3159666061401367, + "eval_runtime": 16.2205, + "eval_samples_per_second": 1479.607, + "eval_steps_per_second": 4.624, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 1e-06, + "loss": 1.3043, + "step": 140 + }, + { + "epoch": 0.06, + "eval_accuracy": 0.58625, + "eval_loss": 1.2926729917526245, + "eval_runtime": 15.8692, + "eval_samples_per_second": 1512.368, + "eval_steps_per_second": 4.726, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 1e-06, + "loss": 1.283, + "step": 160 + }, + { + "epoch": 0.07, + "eval_accuracy": 0.6705833333333333, + "eval_loss": 1.2648732662200928, + "eval_runtime": 15.5702, + "eval_samples_per_second": 1541.406, + "eval_steps_per_second": 4.817, + "step": 160 + }, + { + "epoch": 0.07, + "learning_rate": 1e-06, + "loss": 1.2501, + "step": 180 + }, + { + "epoch": 0.07, + "eval_accuracy": 0.7289583333333334, + "eval_loss": 1.2298475503921509, + "eval_runtime": 15.3558, + "eval_samples_per_second": 1562.932, + "eval_steps_per_second": 4.884, + "step": 180 + }, + { + "epoch": 0.08, + "learning_rate": 1e-06, + "loss": 1.2218, + "step": 200 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.7775, + "eval_loss": 1.1907247304916382, + "eval_runtime": 15.6618, + "eval_samples_per_second": 1532.387, + "eval_steps_per_second": 4.789, + "step": 200 + }, + { + "epoch": 0.09, + "learning_rate": 1e-06, + "loss": 1.1819, + "step": 220 + }, + { + "epoch": 0.09, + "eval_accuracy": 0.8095833333333333, + "eval_loss": 1.1489461660385132, + "eval_runtime": 16.7595, + "eval_samples_per_second": 1432.027, + "eval_steps_per_second": 4.475, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1e-06, + "loss": 1.1375, + "step": 240 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.8254583333333333, + "eval_loss": 1.1064685583114624, + "eval_runtime": 15.9246, + "eval_samples_per_second": 1507.098, + "eval_steps_per_second": 4.71, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1e-06, + "loss": 1.0918, + "step": 260 + }, + { + "epoch": 0.11, + "eval_accuracy": 0.83575, + "eval_loss": 1.0633411407470703, + "eval_runtime": 15.4403, + "eval_samples_per_second": 1554.376, + "eval_steps_per_second": 4.857, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 1e-06, + "loss": 1.0493, + "step": 280 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.8455833333333334, + "eval_loss": 1.0193486213684082, + "eval_runtime": 15.5383, + "eval_samples_per_second": 1544.57, + "eval_steps_per_second": 4.827, + "step": 280 + }, + { + "epoch": 0.12, + "learning_rate": 1e-06, + "loss": 1.0137, + "step": 300 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.8547916666666666, + "eval_loss": 0.9759964942932129, + "eval_runtime": 15.7415, + "eval_samples_per_second": 1524.631, + "eval_steps_per_second": 4.764, + "step": 300 + }, + { + "epoch": 0.13, + "learning_rate": 1e-06, + "loss": 0.9799, + "step": 320 + }, + { + "epoch": 0.13, + "eval_accuracy": 0.856875, + "eval_loss": 0.9349467754364014, + "eval_runtime": 15.9509, + "eval_samples_per_second": 1504.613, + "eval_steps_per_second": 4.702, + "step": 320 + }, + { + "epoch": 0.14, + "learning_rate": 1e-06, + "loss": 0.9269, + "step": 340 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.862125, + "eval_loss": 0.8932501077651978, + "eval_runtime": 15.5524, + "eval_samples_per_second": 1543.167, + "eval_steps_per_second": 4.822, + "step": 340 + }, + { + "epoch": 0.15, + "learning_rate": 1e-06, + "loss": 0.8982, + "step": 360 + }, + { + "epoch": 0.15, + "eval_accuracy": 0.866125, + "eval_loss": 0.8538751006126404, + "eval_runtime": 16.1674, + "eval_samples_per_second": 1484.473, + "eval_steps_per_second": 4.639, + "step": 360 + }, + { + "epoch": 0.16, + "learning_rate": 1e-06, + "loss": 0.854, + "step": 380 + }, + { + "epoch": 0.16, + "eval_accuracy": 0.8698333333333333, + "eval_loss": 0.8160046935081482, + "eval_runtime": 15.675, + "eval_samples_per_second": 1531.105, + "eval_steps_per_second": 4.785, + "step": 380 + }, + { + "epoch": 0.17, + "learning_rate": 1e-06, + "loss": 0.8087, + "step": 400 + }, + { + "epoch": 0.17, + "eval_accuracy": 0.8705416666666667, + "eval_loss": 0.7807948589324951, + "eval_runtime": 15.6603, + "eval_samples_per_second": 1532.542, + "eval_steps_per_second": 4.789, + "step": 400 + }, + { + "epoch": 0.17, + "learning_rate": 1e-06, + "loss": 0.7941, + "step": 420 + }, + { + "epoch": 0.17, + "eval_accuracy": 0.8717916666666666, + "eval_loss": 0.7479580044746399, + "eval_runtime": 15.5761, + "eval_samples_per_second": 1540.822, + "eval_steps_per_second": 4.815, + "step": 420 + }, + { + "epoch": 0.18, + "learning_rate": 1e-06, + "loss": 0.7443, + "step": 440 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.872, + "eval_loss": 0.7175572514533997, + "eval_runtime": 15.9105, + "eval_samples_per_second": 1508.436, + "eval_steps_per_second": 4.714, + "step": 440 + }, + { + "epoch": 0.19, + "learning_rate": 1e-06, + "loss": 0.7233, + "step": 460 + }, + { + "epoch": 0.19, + "eval_accuracy": 0.87075, + "eval_loss": 0.6889370679855347, + "eval_runtime": 16.1633, + "eval_samples_per_second": 1484.847, + "eval_steps_per_second": 4.64, + "step": 460 + }, + { + "epoch": 0.2, + "learning_rate": 1e-06, + "loss": 0.6985, + "step": 480 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.8758333333333334, + "eval_loss": 0.6611708402633667, + "eval_runtime": 15.5607, + "eval_samples_per_second": 1542.351, + "eval_steps_per_second": 4.82, + "step": 480 + }, + { + "epoch": 0.21, + "learning_rate": 1e-06, + "loss": 0.6754, + "step": 500 + }, + { + "epoch": 0.21, + "eval_accuracy": 0.8768333333333334, + "eval_loss": 0.635998547077179, + "eval_runtime": 15.8738, + "eval_samples_per_second": 1511.926, + "eval_steps_per_second": 4.725, + "step": 500 + }, + { + "epoch": 0.22, + "learning_rate": 1e-06, + "loss": 0.6536, + "step": 520 + }, + { + "epoch": 0.22, + "eval_accuracy": 0.8788333333333334, + "eval_loss": 0.6131625175476074, + "eval_runtime": 15.5202, + "eval_samples_per_second": 1546.371, + "eval_steps_per_second": 4.832, + "step": 520 + }, + { + "epoch": 0.23, + "learning_rate": 1e-06, + "loss": 0.614, + "step": 540 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.8811666666666667, + "eval_loss": 0.5912534594535828, + "eval_runtime": 16.1599, + "eval_samples_per_second": 1485.154, + "eval_steps_per_second": 4.641, + "step": 540 + }, + { + "epoch": 0.23, + "learning_rate": 1e-06, + "loss": 0.5963, + "step": 560 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.8809583333333333, + "eval_loss": 0.5706831216812134, + "eval_runtime": 15.5035, + "eval_samples_per_second": 1548.034, + "eval_steps_per_second": 4.838, + "step": 560 + }, + { + "epoch": 0.24, + "learning_rate": 1e-06, + "loss": 0.5894, + "step": 580 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.8815416666666667, + "eval_loss": 0.5526705980300903, + "eval_runtime": 15.515, + "eval_samples_per_second": 1546.894, + "eval_steps_per_second": 4.834, + "step": 580 + }, + { + "epoch": 0.25, + "learning_rate": 1e-06, + "loss": 0.5504, + "step": 600 + }, + { + "epoch": 0.25, + "eval_accuracy": 0.8835416666666667, + "eval_loss": 0.5349943041801453, + "eval_runtime": 15.5986, + "eval_samples_per_second": 1538.598, + "eval_steps_per_second": 4.808, + "step": 600 + }, + { + "epoch": 0.26, + "learning_rate": 1e-06, + "loss": 0.5434, + "step": 620 + }, + { + "epoch": 0.26, + "eval_accuracy": 0.8850833333333333, + "eval_loss": 0.5199493169784546, + "eval_runtime": 15.5783, + "eval_samples_per_second": 1540.605, + "eval_steps_per_second": 4.814, + "step": 620 + }, + { + "epoch": 0.27, + "learning_rate": 1e-06, + "loss": 0.5325, + "step": 640 + }, + { + "epoch": 0.27, + "eval_accuracy": 0.8842083333333334, + "eval_loss": 0.5056445598602295, + "eval_runtime": 16.1601, + "eval_samples_per_second": 1485.138, + "eval_steps_per_second": 4.641, + "step": 640 + }, + { + "epoch": 0.28, + "learning_rate": 1e-06, + "loss": 0.4894, + "step": 660 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.8850416666666666, + "eval_loss": 0.4912012815475464, + "eval_runtime": 15.5535, + "eval_samples_per_second": 1543.06, + "eval_steps_per_second": 4.822, + "step": 660 + }, + { + "epoch": 0.28, + "learning_rate": 1e-06, + "loss": 0.4955, + "step": 680 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.8854166666666666, + "eval_loss": 0.4787778854370117, + "eval_runtime": 15.9742, + "eval_samples_per_second": 1502.42, + "eval_steps_per_second": 4.695, + "step": 680 + }, + { + "epoch": 0.29, + "learning_rate": 1e-06, + "loss": 0.5002, + "step": 700 + }, + { + "epoch": 0.29, + "eval_accuracy": 0.8854166666666666, + "eval_loss": 0.4684358835220337, + "eval_runtime": 15.4896, + "eval_samples_per_second": 1549.429, + "eval_steps_per_second": 4.842, + "step": 700 + }, + { + "epoch": 0.3, + "learning_rate": 1e-06, + "loss": 0.4621, + "step": 720 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.8866666666666667, + "eval_loss": 0.4581121802330017, + "eval_runtime": 15.7197, + "eval_samples_per_second": 1526.742, + "eval_steps_per_second": 4.771, + "step": 720 + }, + { + "epoch": 0.31, + "learning_rate": 1e-06, + "loss": 0.4873, + "step": 740 + }, + { + "epoch": 0.31, + "eval_accuracy": 0.8860833333333333, + "eval_loss": 0.4509838819503784, + "eval_runtime": 15.7293, + "eval_samples_per_second": 1525.818, + "eval_steps_per_second": 4.768, + "step": 740 + }, + { + "epoch": 0.32, + "learning_rate": 1e-06, + "loss": 0.4575, + "step": 760 + }, + { + "epoch": 0.32, + "eval_accuracy": 0.88875, + "eval_loss": 0.4410632252693176, + "eval_runtime": 15.4276, + "eval_samples_per_second": 1555.649, + "eval_steps_per_second": 4.861, + "step": 760 + }, + { + "epoch": 0.33, + "learning_rate": 1e-06, + "loss": 0.4365, + "step": 780 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.8889583333333333, + "eval_loss": 0.4337460398674011, + "eval_runtime": 15.9845, + "eval_samples_per_second": 1501.455, + "eval_steps_per_second": 4.692, + "step": 780 + }, + { + "epoch": 0.33, + "learning_rate": 1e-06, + "loss": 0.4306, + "step": 800 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.8873333333333333, + "eval_loss": 0.42743387818336487, + "eval_runtime": 15.705, + "eval_samples_per_second": 1528.179, + "eval_steps_per_second": 4.776, + "step": 800 + }, + { + "epoch": 0.34, + "learning_rate": 1e-06, + "loss": 0.4367, + "step": 820 + }, + { + "epoch": 0.34, + "eval_accuracy": 0.889625, + "eval_loss": 0.41997990012168884, + "eval_runtime": 15.2578, + "eval_samples_per_second": 1572.97, + "eval_steps_per_second": 4.916, + "step": 820 + }, + { + "epoch": 0.35, + "learning_rate": 1e-06, + "loss": 0.4643, + "step": 840 + }, + { + "epoch": 0.35, + "eval_accuracy": 0.8909583333333333, + "eval_loss": 0.41470062732696533, + "eval_runtime": 15.4309, + "eval_samples_per_second": 1555.323, + "eval_steps_per_second": 4.86, + "step": 840 + }, + { + "epoch": 0.36, + "learning_rate": 1e-06, + "loss": 0.4483, + "step": 860 + }, + { + "epoch": 0.36, + "eval_accuracy": 0.8922916666666667, + "eval_loss": 0.40827029943466187, + "eval_runtime": 15.9503, + "eval_samples_per_second": 1504.673, + "eval_steps_per_second": 4.702, + "step": 860 + }, + { + "epoch": 0.37, + "learning_rate": 1e-06, + "loss": 0.3937, + "step": 880 + }, + { + "epoch": 0.37, + "eval_accuracy": 0.8928333333333334, + "eval_loss": 0.40155351161956787, + "eval_runtime": 15.6448, + "eval_samples_per_second": 1534.053, + "eval_steps_per_second": 4.794, + "step": 880 + }, + { + "epoch": 0.38, + "learning_rate": 1e-06, + "loss": 0.4074, + "step": 900 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.89125, + "eval_loss": 0.39755621552467346, + "eval_runtime": 16.1245, + "eval_samples_per_second": 1488.416, + "eval_steps_per_second": 4.651, + "step": 900 + }, + { + "epoch": 0.38, + "learning_rate": 1e-06, + "loss": 0.4153, + "step": 920 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.891125, + "eval_loss": 0.39380690455436707, + "eval_runtime": 15.6816, + "eval_samples_per_second": 1530.453, + "eval_steps_per_second": 4.783, + "step": 920 + }, + { + "epoch": 0.39, + "learning_rate": 1e-06, + "loss": 0.3934, + "step": 940 + }, + { + "epoch": 0.39, + "eval_accuracy": 0.8914166666666666, + "eval_loss": 0.3885686695575714, + "eval_runtime": 15.0444, + "eval_samples_per_second": 1595.283, + "eval_steps_per_second": 4.985, + "step": 940 + }, + { + "epoch": 0.4, + "learning_rate": 1e-06, + "loss": 0.4486, + "step": 960 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.8915, + "eval_loss": 0.3852112293243408, + "eval_runtime": 15.3141, + "eval_samples_per_second": 1567.186, + "eval_steps_per_second": 4.897, + "step": 960 + }, + { + "epoch": 0.41, + "learning_rate": 1e-06, + "loss": 0.3908, + "step": 980 + }, + { + "epoch": 0.41, + "eval_accuracy": 0.8928333333333334, + "eval_loss": 0.3800646662712097, + "eval_runtime": 15.4314, + "eval_samples_per_second": 1555.271, + "eval_steps_per_second": 4.86, + "step": 980 + }, + { + "epoch": 0.42, + "learning_rate": 1e-06, + "loss": 0.3888, + "step": 1000 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.894375, + "eval_loss": 0.3761395514011383, + "eval_runtime": 15.8671, + "eval_samples_per_second": 1512.567, + "eval_steps_per_second": 4.727, + "step": 1000 + }, + { + "epoch": 0.42, + "learning_rate": 1e-06, + "loss": 0.3801, + "step": 1020 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.8949583333333333, + "eval_loss": 0.3725646734237671, + "eval_runtime": 15.3688, + "eval_samples_per_second": 1561.603, + "eval_steps_per_second": 4.88, + "step": 1020 + }, + { + "epoch": 0.43, + "learning_rate": 1e-06, + "loss": 0.387, + "step": 1040 + }, + { + "epoch": 0.43, + "eval_accuracy": 0.894125, + "eval_loss": 0.3705978989601135, + "eval_runtime": 15.7377, + "eval_samples_per_second": 1525.005, + "eval_steps_per_second": 4.766, + "step": 1040 + }, + { + "epoch": 0.44, + "learning_rate": 1e-06, + "loss": 0.4346, + "step": 1060 + }, + { + "epoch": 0.44, + "eval_accuracy": 0.8947916666666667, + "eval_loss": 0.367171049118042, + "eval_runtime": 14.9848, + "eval_samples_per_second": 1601.62, + "eval_steps_per_second": 5.005, + "step": 1060 + }, + { + "epoch": 0.45, + "learning_rate": 1e-06, + "loss": 0.3806, + "step": 1080 + }, + { + "epoch": 0.45, + "eval_accuracy": 0.8936666666666667, + "eval_loss": 0.3656676709651947, + "eval_runtime": 15.8929, + "eval_samples_per_second": 1510.112, + "eval_steps_per_second": 4.719, + "step": 1080 + }, + { + "epoch": 0.46, + "learning_rate": 1e-06, + "loss": 0.3633, + "step": 1100 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.8948333333333334, + "eval_loss": 0.3619757294654846, + "eval_runtime": 15.254, + "eval_samples_per_second": 1573.363, + "eval_steps_per_second": 4.917, + "step": 1100 + }, + { + "epoch": 0.47, + "learning_rate": 1e-06, + "loss": 0.3429, + "step": 1120 + }, + { + "epoch": 0.47, + "eval_accuracy": 0.8954583333333334, + "eval_loss": 0.35918018221855164, + "eval_runtime": 15.6719, + "eval_samples_per_second": 1531.4, + "eval_steps_per_second": 4.786, + "step": 1120 + }, + { + "epoch": 0.47, + "learning_rate": 1e-06, + "loss": 0.3681, + "step": 1140 + }, + { + "epoch": 0.47, + "eval_accuracy": 0.896625, + "eval_loss": 0.3563116788864136, + "eval_runtime": 16.068, + "eval_samples_per_second": 1493.649, + "eval_steps_per_second": 4.668, + "step": 1140 + }, + { + "epoch": 0.48, + "learning_rate": 1e-06, + "loss": 0.3624, + "step": 1160 + }, + { + "epoch": 0.48, + "eval_accuracy": 0.8964583333333334, + "eval_loss": 0.35381361842155457, + "eval_runtime": 15.992, + "eval_samples_per_second": 1500.748, + "eval_steps_per_second": 4.69, + "step": 1160 + }, + { + "epoch": 0.49, + "learning_rate": 1e-06, + "loss": 0.3808, + "step": 1180 + }, + { + "epoch": 0.49, + "eval_accuracy": 0.895625, + "eval_loss": 0.35254761576652527, + "eval_runtime": 16.151, + "eval_samples_per_second": 1485.976, + "eval_steps_per_second": 4.644, + "step": 1180 + }, + { + "epoch": 0.5, + "learning_rate": 1e-06, + "loss": 0.3626, + "step": 1200 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.897, + "eval_loss": 0.34913721680641174, + "eval_runtime": 15.769, + "eval_samples_per_second": 1521.97, + "eval_steps_per_second": 4.756, + "step": 1200 + }, + { + "epoch": 0.51, + "learning_rate": 1e-06, + "loss": 0.3278, + "step": 1220 + }, + { + "epoch": 0.51, + "eval_accuracy": 0.897375, + "eval_loss": 0.3466154932975769, + "eval_runtime": 16.1468, + "eval_samples_per_second": 1486.363, + "eval_steps_per_second": 4.645, + "step": 1220 + }, + { + "epoch": 0.52, + "learning_rate": 1e-06, + "loss": 0.3463, + "step": 1240 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.8966666666666666, + "eval_loss": 0.34570518136024475, + "eval_runtime": 15.3513, + "eval_samples_per_second": 1563.389, + "eval_steps_per_second": 4.886, + "step": 1240 + }, + { + "epoch": 0.53, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 1260 + }, + { + "epoch": 0.53, + "eval_accuracy": 0.8975416666666667, + "eval_loss": 0.3436819016933441, + "eval_runtime": 15.6461, + "eval_samples_per_second": 1533.926, + "eval_steps_per_second": 4.794, + "step": 1260 + }, + { + "epoch": 0.53, + "learning_rate": 1e-06, + "loss": 0.326, + "step": 1280 + }, + { + "epoch": 0.53, + "eval_accuracy": 0.8983333333333333, + "eval_loss": 0.3413917124271393, + "eval_runtime": 15.3084, + "eval_samples_per_second": 1567.764, + "eval_steps_per_second": 4.899, + "step": 1280 + }, + { + "epoch": 0.54, + "learning_rate": 1e-06, + "loss": 0.3639, + "step": 1300 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.898, + "eval_loss": 0.33931395411491394, + "eval_runtime": 15.4706, + "eval_samples_per_second": 1551.329, + "eval_steps_per_second": 4.848, + "step": 1300 + }, + { + "epoch": 0.55, + "learning_rate": 1e-06, + "loss": 0.3537, + "step": 1320 + }, + { + "epoch": 0.55, + "eval_accuracy": 0.8990416666666666, + "eval_loss": 0.33674728870391846, + "eval_runtime": 15.8671, + "eval_samples_per_second": 1512.563, + "eval_steps_per_second": 4.727, + "step": 1320 + }, + { + "epoch": 0.56, + "learning_rate": 1e-06, + "loss": 0.3831, + "step": 1340 + }, + { + "epoch": 0.56, + "eval_accuracy": 0.898875, + "eval_loss": 0.3362838625907898, + "eval_runtime": 15.4851, + "eval_samples_per_second": 1549.872, + "eval_steps_per_second": 4.843, + "step": 1340 + }, + { + "epoch": 0.57, + "learning_rate": 1e-06, + "loss": 0.3559, + "step": 1360 + }, + { + "epoch": 0.57, + "eval_accuracy": 0.8994166666666666, + "eval_loss": 0.3338731527328491, + "eval_runtime": 15.8107, + "eval_samples_per_second": 1517.958, + "eval_steps_per_second": 4.744, + "step": 1360 + }, + { + "epoch": 0.57, + "learning_rate": 1e-06, + "loss": 0.3706, + "step": 1380 + }, + { + "epoch": 0.57, + "eval_accuracy": 0.899375, + "eval_loss": 0.3323739767074585, + "eval_runtime": 15.3746, + "eval_samples_per_second": 1561.014, + "eval_steps_per_second": 4.878, + "step": 1380 + }, + { + "epoch": 0.58, + "learning_rate": 1e-06, + "loss": 0.374, + "step": 1400 + }, + { + "epoch": 0.58, + "eval_accuracy": 0.8998333333333334, + "eval_loss": 0.33071157336235046, + "eval_runtime": 16.2242, + "eval_samples_per_second": 1479.273, + "eval_steps_per_second": 4.623, + "step": 1400 + }, + { + "epoch": 0.59, + "learning_rate": 1e-06, + "loss": 0.3515, + "step": 1420 + }, + { + "epoch": 0.59, + "eval_accuracy": 0.8999583333333333, + "eval_loss": 0.3292597532272339, + "eval_runtime": 15.6689, + "eval_samples_per_second": 1531.696, + "eval_steps_per_second": 4.787, + "step": 1420 + }, + { + "epoch": 0.6, + "learning_rate": 1e-06, + "loss": 0.3854, + "step": 1440 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.8996666666666666, + "eval_loss": 0.32925406098365784, + "eval_runtime": 15.8799, + "eval_samples_per_second": 1511.341, + "eval_steps_per_second": 4.723, + "step": 1440 + }, + { + "epoch": 0.61, + "learning_rate": 1e-06, + "loss": 0.3363, + "step": 1460 + }, + { + "epoch": 0.61, + "eval_accuracy": 0.8998333333333334, + "eval_loss": 0.327946275472641, + "eval_runtime": 15.54, + "eval_samples_per_second": 1544.404, + "eval_steps_per_second": 4.826, + "step": 1460 + }, + { + "epoch": 0.62, + "learning_rate": 1e-06, + "loss": 0.3199, + "step": 1480 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.900625, + "eval_loss": 0.32561612129211426, + "eval_runtime": 15.692, + "eval_samples_per_second": 1529.44, + "eval_steps_per_second": 4.779, + "step": 1480 + }, + { + "epoch": 0.62, + "learning_rate": 1e-06, + "loss": 0.356, + "step": 1500 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.9007916666666667, + "eval_loss": 0.324897825717926, + "eval_runtime": 15.6415, + "eval_samples_per_second": 1534.381, + "eval_steps_per_second": 4.795, + "step": 1500 + }, + { + "epoch": 0.63, + "learning_rate": 1e-06, + "loss": 0.3566, + "step": 1520 + }, + { + "epoch": 0.63, + "eval_accuracy": 0.9013333333333333, + "eval_loss": 0.32291579246520996, + "eval_runtime": 16.0404, + "eval_samples_per_second": 1496.219, + "eval_steps_per_second": 4.676, + "step": 1520 + }, + { + "epoch": 0.64, + "learning_rate": 1e-06, + "loss": 0.3419, + "step": 1540 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.9010833333333333, + "eval_loss": 0.3221406042575836, + "eval_runtime": 15.5217, + "eval_samples_per_second": 1546.222, + "eval_steps_per_second": 4.832, + "step": 1540 + }, + { + "epoch": 0.65, + "learning_rate": 1e-06, + "loss": 0.3453, + "step": 1560 + }, + { + "epoch": 0.65, + "eval_accuracy": 0.9005833333333333, + "eval_loss": 0.32188284397125244, + "eval_runtime": 15.8978, + "eval_samples_per_second": 1509.639, + "eval_steps_per_second": 4.718, + "step": 1560 + }, + { + "epoch": 0.66, + "learning_rate": 1e-06, + "loss": 0.3132, + "step": 1580 + }, + { + "epoch": 0.66, + "eval_accuracy": 0.901875, + "eval_loss": 0.31994542479515076, + "eval_runtime": 15.5539, + "eval_samples_per_second": 1543.025, + "eval_steps_per_second": 4.822, + "step": 1580 + }, + { + "epoch": 0.67, + "learning_rate": 1e-06, + "loss": 0.3457, + "step": 1600 + }, + { + "epoch": 0.67, + "eval_accuracy": 0.9012083333333333, + "eval_loss": 0.32011812925338745, + "eval_runtime": 15.7109, + "eval_samples_per_second": 1527.598, + "eval_steps_per_second": 4.774, + "step": 1600 + }, + { + "epoch": 0.68, + "learning_rate": 1e-06, + "loss": 0.3036, + "step": 1620 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.9016666666666666, + "eval_loss": 0.31820276379585266, + "eval_runtime": 15.5575, + "eval_samples_per_second": 1542.661, + "eval_steps_per_second": 4.821, + "step": 1620 + }, + { + "epoch": 0.68, + "learning_rate": 1e-06, + "loss": 0.3359, + "step": 1640 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.902125, + "eval_loss": 0.316621333360672, + "eval_runtime": 16.0532, + "eval_samples_per_second": 1495.031, + "eval_steps_per_second": 4.672, + "step": 1640 + }, + { + "epoch": 0.69, + "learning_rate": 1e-06, + "loss": 0.3291, + "step": 1660 + }, + { + "epoch": 0.69, + "eval_accuracy": 0.9020833333333333, + "eval_loss": 0.31562095880508423, + "eval_runtime": 15.7321, + "eval_samples_per_second": 1525.545, + "eval_steps_per_second": 4.767, + "step": 1660 + }, + { + "epoch": 0.7, + "learning_rate": 1e-06, + "loss": 0.3586, + "step": 1680 + }, + { + "epoch": 0.7, + "eval_accuracy": 0.9005833333333333, + "eval_loss": 0.3162485659122467, + "eval_runtime": 15.5682, + "eval_samples_per_second": 1541.608, + "eval_steps_per_second": 4.818, + "step": 1680 + }, + { + "epoch": 0.71, + "learning_rate": 1e-06, + "loss": 0.3002, + "step": 1700 + }, + { + "epoch": 0.71, + "eval_accuracy": 0.9013333333333333, + "eval_loss": 0.31555384397506714, + "eval_runtime": 15.6458, + "eval_samples_per_second": 1533.959, + "eval_steps_per_second": 4.794, + "step": 1700 + }, + { + "epoch": 0.72, + "learning_rate": 1e-06, + "loss": 0.3743, + "step": 1720 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.9024583333333334, + "eval_loss": 0.3135637044906616, + "eval_runtime": 15.9237, + "eval_samples_per_second": 1507.184, + "eval_steps_per_second": 4.71, + "step": 1720 + }, + { + "epoch": 0.72, + "learning_rate": 1e-06, + "loss": 0.3506, + "step": 1740 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.9029166666666667, + "eval_loss": 0.3118599057197571, + "eval_runtime": 15.6284, + "eval_samples_per_second": 1535.667, + "eval_steps_per_second": 4.799, + "step": 1740 + }, + { + "epoch": 0.73, + "learning_rate": 1e-06, + "loss": 0.3328, + "step": 1760 + }, + { + "epoch": 0.73, + "eval_accuracy": 0.9022083333333333, + "eval_loss": 0.31298351287841797, + "eval_runtime": 15.7141, + "eval_samples_per_second": 1527.295, + "eval_steps_per_second": 4.773, + "step": 1760 + }, + { + "epoch": 0.74, + "learning_rate": 1e-06, + "loss": 0.328, + "step": 1780 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.9025416666666667, + "eval_loss": 0.31090086698532104, + "eval_runtime": 16.0505, + "eval_samples_per_second": 1495.277, + "eval_steps_per_second": 4.673, + "step": 1780 + }, + { + "epoch": 0.75, + "learning_rate": 1e-06, + "loss": 0.3312, + "step": 1800 + }, + { + "epoch": 0.75, + "eval_accuracy": 0.9022083333333333, + "eval_loss": 0.31056439876556396, + "eval_runtime": 15.8194, + "eval_samples_per_second": 1517.124, + "eval_steps_per_second": 4.741, + "step": 1800 + }, + { + "epoch": 0.76, + "learning_rate": 1e-06, + "loss": 0.3176, + "step": 1820 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.9030833333333333, + "eval_loss": 0.3086493909358978, + "eval_runtime": 15.8105, + "eval_samples_per_second": 1517.979, + "eval_steps_per_second": 4.744, + "step": 1820 + }, + { + "epoch": 0.77, + "learning_rate": 1e-06, + "loss": 0.326, + "step": 1840 + }, + { + "epoch": 0.77, + "eval_accuracy": 0.9035416666666667, + "eval_loss": 0.30771201848983765, + "eval_runtime": 15.6087, + "eval_samples_per_second": 1537.601, + "eval_steps_per_second": 4.805, + "step": 1840 + }, + { + "epoch": 0.78, + "learning_rate": 1e-06, + "loss": 0.3308, + "step": 1860 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.9033333333333333, + "eval_loss": 0.30720219016075134, + "eval_runtime": 15.7737, + "eval_samples_per_second": 1521.525, + "eval_steps_per_second": 4.755, + "step": 1860 + }, + { + "epoch": 0.78, + "learning_rate": 1e-06, + "loss": 0.2876, + "step": 1880 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.90275, + "eval_loss": 0.30769696831703186, + "eval_runtime": 16.0101, + "eval_samples_per_second": 1499.051, + "eval_steps_per_second": 4.685, + "step": 1880 + }, + { + "epoch": 0.79, + "learning_rate": 1e-06, + "loss": 0.3326, + "step": 1900 + }, + { + "epoch": 0.79, + "eval_accuracy": 0.9028333333333334, + "eval_loss": 0.3074987530708313, + "eval_runtime": 15.7279, + "eval_samples_per_second": 1525.953, + "eval_steps_per_second": 4.769, + "step": 1900 + }, + { + "epoch": 0.8, + "learning_rate": 1e-06, + "loss": 0.2999, + "step": 1920 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.9040416666666666, + "eval_loss": 0.3049142062664032, + "eval_runtime": 15.7379, + "eval_samples_per_second": 1524.978, + "eval_steps_per_second": 4.766, + "step": 1920 + }, + { + "epoch": 0.81, + "learning_rate": 1e-06, + "loss": 0.3207, + "step": 1940 + }, + { + "epoch": 0.81, + "eval_accuracy": 0.9042083333333333, + "eval_loss": 0.30403196811676025, + "eval_runtime": 16.0526, + "eval_samples_per_second": 1495.086, + "eval_steps_per_second": 4.672, + "step": 1940 + }, + { + "epoch": 0.82, + "learning_rate": 1e-06, + "loss": 0.3126, + "step": 1960 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.903625, + "eval_loss": 0.3041483461856842, + "eval_runtime": 15.4127, + "eval_samples_per_second": 1557.161, + "eval_steps_per_second": 4.866, + "step": 1960 + }, + { + "epoch": 0.82, + "learning_rate": 1e-06, + "loss": 0.3785, + "step": 1980 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.9036666666666666, + "eval_loss": 0.30376511812210083, + "eval_runtime": 15.6455, + "eval_samples_per_second": 1533.984, + "eval_steps_per_second": 4.794, + "step": 1980 + }, + { + "epoch": 0.83, + "learning_rate": 1e-06, + "loss": 0.3015, + "step": 2000 + }, + { + "epoch": 0.83, + "eval_accuracy": 0.9044166666666666, + "eval_loss": 0.30233460664749146, + "eval_runtime": 15.6366, + "eval_samples_per_second": 1534.859, + "eval_steps_per_second": 4.796, + "step": 2000 + }, + { + "epoch": 0.84, + "learning_rate": 1e-06, + "loss": 0.348, + "step": 2020 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.9045833333333333, + "eval_loss": 0.30240559577941895, + "eval_runtime": 15.8212, + "eval_samples_per_second": 1516.949, + "eval_steps_per_second": 4.74, + "step": 2020 + }, + { + "epoch": 0.85, + "learning_rate": 1e-06, + "loss": 0.2947, + "step": 2040 + }, + { + "epoch": 0.85, + "eval_accuracy": 0.904875, + "eval_loss": 0.30133891105651855, + "eval_runtime": 15.8234, + "eval_samples_per_second": 1516.737, + "eval_steps_per_second": 4.74, + "step": 2040 + }, + { + "epoch": 0.86, + "learning_rate": 1e-06, + "loss": 0.3344, + "step": 2060 + }, + { + "epoch": 0.86, + "eval_accuracy": 0.9049583333333333, + "eval_loss": 0.30154794454574585, + "eval_runtime": 15.2879, + "eval_samples_per_second": 1569.873, + "eval_steps_per_second": 4.906, + "step": 2060 + }, + { + "epoch": 0.87, + "learning_rate": 1e-06, + "loss": 0.3436, + "step": 2080 + }, + { + "epoch": 0.87, + "eval_accuracy": 0.9045833333333333, + "eval_loss": 0.29952293634414673, + "eval_runtime": 16.2949, + "eval_samples_per_second": 1472.856, + "eval_steps_per_second": 4.603, + "step": 2080 + }, + { + "epoch": 0.88, + "learning_rate": 1e-06, + "loss": 0.3179, + "step": 2100 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.9036666666666666, + "eval_loss": 0.30100810527801514, + "eval_runtime": 15.803, + "eval_samples_per_second": 1518.698, + "eval_steps_per_second": 4.746, + "step": 2100 + }, + { + "epoch": 0.88, + "learning_rate": 1e-06, + "loss": 0.3045, + "step": 2120 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.9050833333333334, + "eval_loss": 0.29866886138916016, + "eval_runtime": 15.5575, + "eval_samples_per_second": 1542.663, + "eval_steps_per_second": 4.821, + "step": 2120 + }, + { + "epoch": 0.89, + "learning_rate": 1e-06, + "loss": 0.2797, + "step": 2140 + }, + { + "epoch": 0.89, + "eval_accuracy": 0.905125, + "eval_loss": 0.2977401614189148, + "eval_runtime": 15.9997, + "eval_samples_per_second": 1500.024, + "eval_steps_per_second": 4.688, + "step": 2140 + }, + { + "epoch": 0.9, + "learning_rate": 1e-06, + "loss": 0.3044, + "step": 2160 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.9046666666666666, + "eval_loss": 0.29765772819519043, + "eval_runtime": 15.813, + "eval_samples_per_second": 1517.737, + "eval_steps_per_second": 4.743, + "step": 2160 + }, + { + "epoch": 0.91, + "learning_rate": 1e-06, + "loss": 0.2948, + "step": 2180 + }, + { + "epoch": 0.91, + "eval_accuracy": 0.9055, + "eval_loss": 0.2965656518936157, + "eval_runtime": 15.4407, + "eval_samples_per_second": 1554.336, + "eval_steps_per_second": 4.857, + "step": 2180 + }, + { + "epoch": 0.92, + "learning_rate": 1e-06, + "loss": 0.2803, + "step": 2200 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.9052083333333333, + "eval_loss": 0.2959022521972656, + "eval_runtime": 15.9407, + "eval_samples_per_second": 1505.578, + "eval_steps_per_second": 4.705, + "step": 2200 + }, + { + "epoch": 0.93, + "learning_rate": 1e-06, + "loss": 0.3008, + "step": 2220 + }, + { + "epoch": 0.93, + "eval_accuracy": 0.9055833333333333, + "eval_loss": 0.2946693003177643, + "eval_runtime": 15.495, + "eval_samples_per_second": 1548.887, + "eval_steps_per_second": 4.84, + "step": 2220 + }, + { + "epoch": 0.93, + "learning_rate": 1e-06, + "loss": 0.3163, + "step": 2240 + }, + { + "epoch": 0.93, + "eval_accuracy": 0.905625, + "eval_loss": 0.2954801321029663, + "eval_runtime": 15.9747, + "eval_samples_per_second": 1502.377, + "eval_steps_per_second": 4.695, + "step": 2240 + }, + { + "epoch": 0.94, + "learning_rate": 1e-06, + "loss": 0.3094, + "step": 2260 + }, + { + "epoch": 0.94, + "eval_accuracy": 0.90525, + "eval_loss": 0.2948620617389679, + "eval_runtime": 15.5314, + "eval_samples_per_second": 1545.257, + "eval_steps_per_second": 4.829, + "step": 2260 + }, + { + "epoch": 0.95, + "learning_rate": 1e-06, + "loss": 0.2932, + "step": 2280 + }, + { + "epoch": 0.95, + "eval_accuracy": 0.9058333333333334, + "eval_loss": 0.29374802112579346, + "eval_runtime": 15.9691, + "eval_samples_per_second": 1502.905, + "eval_steps_per_second": 4.697, + "step": 2280 + }, + { + "epoch": 0.96, + "learning_rate": 1e-06, + "loss": 0.2963, + "step": 2300 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.9058333333333334, + "eval_loss": 0.29317864775657654, + "eval_runtime": 15.765, + "eval_samples_per_second": 1522.362, + "eval_steps_per_second": 4.757, + "step": 2300 + }, + { + "epoch": 0.97, + "learning_rate": 1e-06, + "loss": 0.3027, + "step": 2320 + }, + { + "epoch": 0.97, + "eval_accuracy": 0.905375, + "eval_loss": 0.2939450442790985, + "eval_runtime": 15.8745, + "eval_samples_per_second": 1511.86, + "eval_steps_per_second": 4.725, + "step": 2320 + }, + { + "epoch": 0.97, + "learning_rate": 1e-06, + "loss": 0.3017, + "step": 2340 + }, + { + "epoch": 0.97, + "eval_accuracy": 0.9062083333333333, + "eval_loss": 0.29151102900505066, + "eval_runtime": 15.7805, + "eval_samples_per_second": 1520.865, + "eval_steps_per_second": 4.753, + "step": 2340 + }, + { + "epoch": 0.98, + "learning_rate": 1e-06, + "loss": 0.347, + "step": 2360 + }, + { + "epoch": 0.98, + "eval_accuracy": 0.9069583333333333, + "eval_loss": 0.2903897166252136, + "eval_runtime": 15.6602, + "eval_samples_per_second": 1532.552, + "eval_steps_per_second": 4.789, + "step": 2360 + }, + { + "epoch": 0.99, + "learning_rate": 1e-06, + "loss": 0.311, + "step": 2380 + }, + { + "epoch": 0.99, + "eval_accuracy": 0.906375, + "eval_loss": 0.29053354263305664, + "eval_runtime": 15.4212, + "eval_samples_per_second": 1556.296, + "eval_steps_per_second": 4.863, + "step": 2380 + }, + { + "epoch": 1.0, + "learning_rate": 1e-06, + "loss": 0.3007, + "step": 2400 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.9072083333333333, + "eval_loss": 0.290608286857605, + "eval_runtime": 15.7304, + "eval_samples_per_second": 1525.709, + "eval_steps_per_second": 4.768, + "step": 2400 + }, + { + "epoch": 1.01, + "learning_rate": 1e-06, + "loss": 0.3577, + "step": 2420 + }, + { + "epoch": 1.01, + "eval_accuracy": 0.9072083333333333, + "eval_loss": 0.289432168006897, + "eval_runtime": 16.5863, + "eval_samples_per_second": 1446.973, + "eval_steps_per_second": 4.522, + "step": 2420 + }, + { + "epoch": 1.02, + "learning_rate": 1e-06, + "loss": 0.2775, + "step": 2440 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.906625, + "eval_loss": 0.2914583384990692, + "eval_runtime": 15.8866, + "eval_samples_per_second": 1510.711, + "eval_steps_per_second": 4.721, + "step": 2440 + }, + { + "epoch": 1.02, + "learning_rate": 1e-06, + "loss": 0.326, + "step": 2460 + }, + { + "epoch": 1.02, + "eval_accuracy": 0.907, + "eval_loss": 0.28882813453674316, + "eval_runtime": 15.4052, + "eval_samples_per_second": 1557.919, + "eval_steps_per_second": 4.868, + "step": 2460 + }, + { + "epoch": 1.03, + "learning_rate": 1e-06, + "loss": 0.2958, + "step": 2480 + }, + { + "epoch": 1.03, + "eval_accuracy": 0.9076666666666666, + "eval_loss": 0.2885444760322571, + "eval_runtime": 15.4629, + "eval_samples_per_second": 1552.103, + "eval_steps_per_second": 4.85, + "step": 2480 + }, + { + "epoch": 1.04, + "learning_rate": 1e-06, + "loss": 0.2543, + "step": 2500 + }, + { + "epoch": 1.04, + "eval_accuracy": 0.90825, + "eval_loss": 0.28599992394447327, + "eval_runtime": 15.7097, + "eval_samples_per_second": 1527.717, + "eval_steps_per_second": 4.774, + "step": 2500 + }, + { + "epoch": 1.05, + "learning_rate": 1e-06, + "loss": 0.287, + "step": 2520 + }, + { + "epoch": 1.05, + "eval_accuracy": 0.908875, + "eval_loss": 0.285250186920166, + "eval_runtime": 15.7948, + "eval_samples_per_second": 1519.484, + "eval_steps_per_second": 4.748, + "step": 2520 + }, + { + "epoch": 1.06, + "learning_rate": 1e-06, + "loss": 0.3226, + "step": 2540 + }, + { + "epoch": 1.06, + "eval_accuracy": 0.908375, + "eval_loss": 0.28508585691452026, + "eval_runtime": 15.6253, + "eval_samples_per_second": 1535.966, + "eval_steps_per_second": 4.8, + "step": 2540 + }, + { + "epoch": 1.07, + "learning_rate": 1e-06, + "loss": 0.2912, + "step": 2560 + }, + { + "epoch": 1.07, + "eval_accuracy": 0.9077083333333333, + "eval_loss": 0.28835633397102356, + "eval_runtime": 15.302, + "eval_samples_per_second": 1568.422, + "eval_steps_per_second": 4.901, + "step": 2560 + }, + { + "epoch": 1.07, + "learning_rate": 1e-06, + "loss": 0.3344, + "step": 2580 + }, + { + "epoch": 1.07, + "eval_accuracy": 0.9082916666666667, + "eval_loss": 0.2841767966747284, + "eval_runtime": 15.7333, + "eval_samples_per_second": 1525.424, + "eval_steps_per_second": 4.767, + "step": 2580 + }, + { + "epoch": 1.08, + "learning_rate": 1e-06, + "loss": 0.2883, + "step": 2600 + }, + { + "epoch": 1.08, + "eval_accuracy": 0.9075, + "eval_loss": 0.28568732738494873, + "eval_runtime": 15.9862, + "eval_samples_per_second": 1501.294, + "eval_steps_per_second": 4.692, + "step": 2600 + }, + { + "epoch": 1.09, + "learning_rate": 1e-06, + "loss": 0.2958, + "step": 2620 + }, + { + "epoch": 1.09, + "eval_accuracy": 0.90875, + "eval_loss": 0.2833400368690491, + "eval_runtime": 15.9204, + "eval_samples_per_second": 1507.497, + "eval_steps_per_second": 4.711, + "step": 2620 + }, + { + "epoch": 1.1, + "learning_rate": 1e-06, + "loss": 0.2913, + "step": 2640 + }, + { + "epoch": 1.1, + "eval_accuracy": 0.9085, + "eval_loss": 0.2836286723613739, + "eval_runtime": 16.0314, + "eval_samples_per_second": 1497.058, + "eval_steps_per_second": 4.678, + "step": 2640 + }, + { + "epoch": 1.11, + "learning_rate": 1e-06, + "loss": 0.3007, + "step": 2660 + }, + { + "epoch": 1.11, + "eval_accuracy": 0.908625, + "eval_loss": 0.2823469638824463, + "eval_runtime": 15.5726, + "eval_samples_per_second": 1541.172, + "eval_steps_per_second": 4.816, + "step": 2660 + }, + { + "epoch": 1.12, + "learning_rate": 1e-06, + "loss": 0.3152, + "step": 2680 + }, + { + "epoch": 1.12, + "eval_accuracy": 0.9082083333333333, + "eval_loss": 0.2841881811618805, + "eval_runtime": 15.7571, + "eval_samples_per_second": 1523.126, + "eval_steps_per_second": 4.76, + "step": 2680 + }, + { + "epoch": 1.12, + "learning_rate": 1e-06, + "loss": 0.2857, + "step": 2700 + }, + { + "epoch": 1.12, + "eval_accuracy": 0.9079583333333333, + "eval_loss": 0.28267306089401245, + "eval_runtime": 15.9307, + "eval_samples_per_second": 1506.523, + "eval_steps_per_second": 4.708, + "step": 2700 + }, + { + "epoch": 1.13, + "learning_rate": 1e-06, + "loss": 0.2821, + "step": 2720 + }, + { + "epoch": 1.13, + "eval_accuracy": 0.9085416666666667, + "eval_loss": 0.28154897689819336, + "eval_runtime": 15.5765, + "eval_samples_per_second": 1540.779, + "eval_steps_per_second": 4.815, + "step": 2720 + }, + { + "epoch": 1.14, + "learning_rate": 1e-06, + "loss": 0.2775, + "step": 2740 + }, + { + "epoch": 1.14, + "eval_accuracy": 0.9074166666666666, + "eval_loss": 0.2842314839363098, + "eval_runtime": 16.5328, + "eval_samples_per_second": 1451.656, + "eval_steps_per_second": 4.536, + "step": 2740 + }, + { + "epoch": 1.15, + "learning_rate": 1e-06, + "loss": 0.2964, + "step": 2760 + }, + { + "epoch": 1.15, + "eval_accuracy": 0.9084166666666667, + "eval_loss": 0.2818900942802429, + "eval_runtime": 15.7457, + "eval_samples_per_second": 1524.229, + "eval_steps_per_second": 4.763, + "step": 2760 + }, + { + "epoch": 1.16, + "learning_rate": 1e-06, + "loss": 0.3245, + "step": 2780 + }, + { + "epoch": 1.16, + "eval_accuracy": 0.9088333333333334, + "eval_loss": 0.2803751826286316, + "eval_runtime": 15.6563, + "eval_samples_per_second": 1532.925, + "eval_steps_per_second": 4.79, + "step": 2780 + }, + { + "epoch": 1.17, + "learning_rate": 1e-06, + "loss": 0.2706, + "step": 2800 + }, + { + "epoch": 1.17, + "eval_accuracy": 0.909125, + "eval_loss": 0.2802983820438385, + "eval_runtime": 15.9707, + "eval_samples_per_second": 1502.752, + "eval_steps_per_second": 4.696, + "step": 2800 + }, + { + "epoch": 1.18, + "learning_rate": 1e-06, + "loss": 0.2674, + "step": 2820 + }, + { + "epoch": 1.18, + "eval_accuracy": 0.908875, + "eval_loss": 0.2793467342853546, + "eval_runtime": 15.9613, + "eval_samples_per_second": 1503.634, + "eval_steps_per_second": 4.699, + "step": 2820 + }, + { + "epoch": 1.18, + "learning_rate": 1e-06, + "loss": 0.3296, + "step": 2840 + }, + { + "epoch": 1.18, + "eval_accuracy": 0.9093333333333333, + "eval_loss": 0.2792114019393921, + "eval_runtime": 15.5813, + "eval_samples_per_second": 1540.308, + "eval_steps_per_second": 4.813, + "step": 2840 + }, + { + "epoch": 1.19, + "learning_rate": 1e-06, + "loss": 0.2993, + "step": 2860 + }, + { + "epoch": 1.19, + "eval_accuracy": 0.909125, + "eval_loss": 0.27838194370269775, + "eval_runtime": 15.9048, + "eval_samples_per_second": 1508.979, + "eval_steps_per_second": 4.716, + "step": 2860 + }, + { + "epoch": 1.2, + "learning_rate": 1e-06, + "loss": 0.3305, + "step": 2880 + }, + { + "epoch": 1.2, + "eval_accuracy": 0.9084583333333334, + "eval_loss": 0.2802034914493561, + "eval_runtime": 15.8179, + "eval_samples_per_second": 1517.271, + "eval_steps_per_second": 4.741, + "step": 2880 + }, + { + "epoch": 1.21, + "learning_rate": 1e-06, + "loss": 0.2984, + "step": 2900 + }, + { + "epoch": 1.21, + "eval_accuracy": 0.9088333333333334, + "eval_loss": 0.2789742052555084, + "eval_runtime": 15.8715, + "eval_samples_per_second": 1512.14, + "eval_steps_per_second": 4.725, + "step": 2900 + }, + { + "epoch": 1.22, + "learning_rate": 1e-06, + "loss": 0.2748, + "step": 2920 + }, + { + "epoch": 1.22, + "eval_accuracy": 0.9088333333333334, + "eval_loss": 0.2786828875541687, + "eval_runtime": 15.744, + "eval_samples_per_second": 1524.387, + "eval_steps_per_second": 4.764, + "step": 2920 + }, + { + "epoch": 1.23, + "learning_rate": 1e-06, + "loss": 0.2708, + "step": 2940 + }, + { + "epoch": 1.23, + "eval_accuracy": 0.909125, + "eval_loss": 0.2787318527698517, + "eval_runtime": 15.8587, + "eval_samples_per_second": 1513.363, + "eval_steps_per_second": 4.729, + "step": 2940 + }, + { + "epoch": 1.23, + "learning_rate": 1e-06, + "loss": 0.3062, + "step": 2960 + }, + { + "epoch": 1.23, + "eval_accuracy": 0.910125, + "eval_loss": 0.2767701745033264, + "eval_runtime": 15.5088, + "eval_samples_per_second": 1547.508, + "eval_steps_per_second": 4.836, + "step": 2960 + }, + { + "epoch": 1.24, + "learning_rate": 1e-06, + "loss": 0.3039, + "step": 2980 + }, + { + "epoch": 1.24, + "eval_accuracy": 0.910375, + "eval_loss": 0.2769327163696289, + "eval_runtime": 15.7715, + "eval_samples_per_second": 1521.733, + "eval_steps_per_second": 4.755, + "step": 2980 + }, + { + "epoch": 1.25, + "learning_rate": 1e-06, + "loss": 0.2889, + "step": 3000 + }, + { + "epoch": 1.25, + "eval_accuracy": 0.9106666666666666, + "eval_loss": 0.2753864526748657, + "eval_runtime": 16.0825, + "eval_samples_per_second": 1492.309, + "eval_steps_per_second": 4.663, + "step": 3000 + }, + { + "epoch": 1.26, + "learning_rate": 1e-06, + "loss": 0.2964, + "step": 3020 + }, + { + "epoch": 1.26, + "eval_accuracy": 0.9102916666666667, + "eval_loss": 0.2752026319503784, + "eval_runtime": 15.8059, + "eval_samples_per_second": 1518.419, + "eval_steps_per_second": 4.745, + "step": 3020 + }, + { + "epoch": 1.27, + "learning_rate": 1e-06, + "loss": 0.2951, + "step": 3040 + }, + { + "epoch": 1.27, + "eval_accuracy": 0.9099583333333333, + "eval_loss": 0.27485111355781555, + "eval_runtime": 15.7664, + "eval_samples_per_second": 1522.226, + "eval_steps_per_second": 4.757, + "step": 3040 + }, + { + "epoch": 1.27, + "learning_rate": 1e-06, + "loss": 0.2879, + "step": 3060 + }, + { + "epoch": 1.27, + "eval_accuracy": 0.91075, + "eval_loss": 0.27414020895957947, + "eval_runtime": 15.6142, + "eval_samples_per_second": 1537.064, + "eval_steps_per_second": 4.803, + "step": 3060 + }, + { + "epoch": 1.28, + "learning_rate": 1e-06, + "loss": 0.2795, + "step": 3080 + }, + { + "epoch": 1.28, + "eval_accuracy": 0.9106666666666666, + "eval_loss": 0.2753457725048065, + "eval_runtime": 15.8605, + "eval_samples_per_second": 1513.198, + "eval_steps_per_second": 4.729, + "step": 3080 + }, + { + "epoch": 1.29, + "learning_rate": 1e-06, + "loss": 0.2794, + "step": 3100 + }, + { + "epoch": 1.29, + "eval_accuracy": 0.9100416666666666, + "eval_loss": 0.27380895614624023, + "eval_runtime": 16.3184, + "eval_samples_per_second": 1470.733, + "eval_steps_per_second": 4.596, + "step": 3100 + }, + { + "epoch": 1.3, + "learning_rate": 1e-06, + "loss": 0.258, + "step": 3120 + }, + { + "epoch": 1.3, + "eval_accuracy": 0.910625, + "eval_loss": 0.27472469210624695, + "eval_runtime": 15.8282, + "eval_samples_per_second": 1516.281, + "eval_steps_per_second": 4.738, + "step": 3120 + }, + { + "epoch": 1.31, + "learning_rate": 1e-06, + "loss": 0.2551, + "step": 3140 + }, + { + "epoch": 1.31, + "eval_accuracy": 0.91, + "eval_loss": 0.274568110704422, + "eval_runtime": 15.7686, + "eval_samples_per_second": 1522.008, + "eval_steps_per_second": 4.756, + "step": 3140 + }, + { + "epoch": 1.32, + "learning_rate": 1e-06, + "loss": 0.3086, + "step": 3160 + }, + { + "epoch": 1.32, + "eval_accuracy": 0.9097083333333333, + "eval_loss": 0.27524352073669434, + "eval_runtime": 16.163, + "eval_samples_per_second": 1484.869, + "eval_steps_per_second": 4.64, + "step": 3160 + }, + { + "epoch": 1.32, + "learning_rate": 1e-06, + "loss": 0.2812, + "step": 3180 + }, + { + "epoch": 1.32, + "eval_accuracy": 0.9117083333333333, + "eval_loss": 0.27365365624427795, + "eval_runtime": 15.5225, + "eval_samples_per_second": 1546.138, + "eval_steps_per_second": 4.832, + "step": 3180 + }, + { + "epoch": 1.33, + "learning_rate": 1e-06, + "loss": 0.3016, + "step": 3200 + }, + { + "epoch": 1.33, + "eval_accuracy": 0.911, + "eval_loss": 0.271597683429718, + "eval_runtime": 15.8761, + "eval_samples_per_second": 1511.709, + "eval_steps_per_second": 4.724, + "step": 3200 + }, + { + "epoch": 1.34, + "learning_rate": 1e-06, + "loss": 0.2913, + "step": 3220 + }, + { + "epoch": 1.34, + "eval_accuracy": 0.9109583333333333, + "eval_loss": 0.27143821120262146, + "eval_runtime": 16.2761, + "eval_samples_per_second": 1474.556, + "eval_steps_per_second": 4.608, + "step": 3220 + }, + { + "epoch": 1.35, + "learning_rate": 1e-06, + "loss": 0.2634, + "step": 3240 + }, + { + "epoch": 1.35, + "eval_accuracy": 0.909, + "eval_loss": 0.2747085988521576, + "eval_runtime": 16.3191, + "eval_samples_per_second": 1470.669, + "eval_steps_per_second": 4.596, + "step": 3240 + }, + { + "epoch": 1.36, + "learning_rate": 1e-06, + "loss": 0.2845, + "step": 3260 + }, + { + "epoch": 1.36, + "eval_accuracy": 0.9109166666666667, + "eval_loss": 0.2722584903240204, + "eval_runtime": 15.7837, + "eval_samples_per_second": 1520.554, + "eval_steps_per_second": 4.752, + "step": 3260 + }, + { + "epoch": 1.37, + "learning_rate": 1e-06, + "loss": 0.2864, + "step": 3280 + }, + { + "epoch": 1.37, + "eval_accuracy": 0.9112083333333333, + "eval_loss": 0.2707350552082062, + "eval_runtime": 15.9926, + "eval_samples_per_second": 1500.697, + "eval_steps_per_second": 4.69, + "step": 3280 + }, + { + "epoch": 1.38, + "learning_rate": 1e-06, + "loss": 0.2794, + "step": 3300 + }, + { + "epoch": 1.38, + "eval_accuracy": 0.9111666666666667, + "eval_loss": 0.2700752317905426, + "eval_runtime": 15.6416, + "eval_samples_per_second": 1534.367, + "eval_steps_per_second": 4.795, + "step": 3300 + }, + { + "epoch": 1.38, + "learning_rate": 1e-06, + "loss": 0.3216, + "step": 3320 + }, + { + "epoch": 1.38, + "eval_accuracy": 0.9105, + "eval_loss": 0.27095627784729004, + "eval_runtime": 16.2168, + "eval_samples_per_second": 1479.95, + "eval_steps_per_second": 4.625, + "step": 3320 + }, + { + "epoch": 1.39, + "learning_rate": 1e-06, + "loss": 0.2738, + "step": 3340 + }, + { + "epoch": 1.39, + "eval_accuracy": 0.9102083333333333, + "eval_loss": 0.2725074291229248, + "eval_runtime": 15.7189, + "eval_samples_per_second": 1526.821, + "eval_steps_per_second": 4.771, + "step": 3340 + }, + { + "epoch": 1.4, + "learning_rate": 1e-06, + "loss": 0.301, + "step": 3360 + }, + { + "epoch": 1.4, + "eval_accuracy": 0.9102083333333333, + "eval_loss": 0.27188044786453247, + "eval_runtime": 15.3899, + "eval_samples_per_second": 1559.463, + "eval_steps_per_second": 4.873, + "step": 3360 + }, + { + "epoch": 1.41, + "learning_rate": 1e-06, + "loss": 0.3144, + "step": 3380 + }, + { + "epoch": 1.41, + "eval_accuracy": 0.911125, + "eval_loss": 0.269314169883728, + "eval_runtime": 16.1663, + "eval_samples_per_second": 1484.568, + "eval_steps_per_second": 4.639, + "step": 3380 + }, + { + "epoch": 1.42, + "learning_rate": 1e-06, + "loss": 0.2914, + "step": 3400 + }, + { + "epoch": 1.42, + "eval_accuracy": 0.9120416666666666, + "eval_loss": 0.26871222257614136, + "eval_runtime": 15.7364, + "eval_samples_per_second": 1525.13, + "eval_steps_per_second": 4.766, + "step": 3400 + }, + { + "epoch": 1.43, + "learning_rate": 1e-06, + "loss": 0.252, + "step": 3420 + }, + { + "epoch": 1.43, + "eval_accuracy": 0.9114166666666667, + "eval_loss": 0.26832154393196106, + "eval_runtime": 16.1928, + "eval_samples_per_second": 1482.143, + "eval_steps_per_second": 4.632, + "step": 3420 + }, + { + "epoch": 1.43, + "learning_rate": 1e-06, + "loss": 0.2616, + "step": 3440 + }, + { + "epoch": 1.43, + "eval_accuracy": 0.9119583333333333, + "eval_loss": 0.2678380608558655, + "eval_runtime": 15.9974, + "eval_samples_per_second": 1500.246, + "eval_steps_per_second": 4.688, + "step": 3440 + }, + { + "epoch": 1.44, + "learning_rate": 1e-06, + "loss": 0.247, + "step": 3460 + }, + { + "epoch": 1.44, + "eval_accuracy": 0.9126666666666666, + "eval_loss": 0.2679081857204437, + "eval_runtime": 15.5041, + "eval_samples_per_second": 1547.976, + "eval_steps_per_second": 4.837, + "step": 3460 + }, + { + "epoch": 1.45, + "learning_rate": 1e-06, + "loss": 0.279, + "step": 3480 + }, + { + "epoch": 1.45, + "eval_accuracy": 0.912, + "eval_loss": 0.2675539553165436, + "eval_runtime": 16.0139, + "eval_samples_per_second": 1498.702, + "eval_steps_per_second": 4.683, + "step": 3480 + }, + { + "epoch": 1.46, + "learning_rate": 1e-06, + "loss": 0.2823, + "step": 3500 + }, + { + "epoch": 1.46, + "eval_accuracy": 0.9124166666666667, + "eval_loss": 0.26708924770355225, + "eval_runtime": 15.8242, + "eval_samples_per_second": 1516.667, + "eval_steps_per_second": 4.74, + "step": 3500 + }, + { + "epoch": 1.47, + "learning_rate": 1e-06, + "loss": 0.2769, + "step": 3520 + }, + { + "epoch": 1.47, + "eval_accuracy": 0.9125, + "eval_loss": 0.26735562086105347, + "eval_runtime": 16.0609, + "eval_samples_per_second": 1494.311, + "eval_steps_per_second": 4.67, + "step": 3520 + }, + { + "epoch": 1.48, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 3540 + }, + { + "epoch": 1.48, + "eval_accuracy": 0.9122916666666666, + "eval_loss": 0.2679881453514099, + "eval_runtime": 15.8153, + "eval_samples_per_second": 1517.514, + "eval_steps_per_second": 4.742, + "step": 3540 + }, + { + "epoch": 1.48, + "learning_rate": 1e-06, + "loss": 0.2398, + "step": 3560 + }, + { + "epoch": 1.48, + "eval_accuracy": 0.9125, + "eval_loss": 0.26599201560020447, + "eval_runtime": 15.854, + "eval_samples_per_second": 1513.812, + "eval_steps_per_second": 4.731, + "step": 3560 + }, + { + "epoch": 1.49, + "learning_rate": 1e-06, + "loss": 0.2524, + "step": 3580 + }, + { + "epoch": 1.49, + "eval_accuracy": 0.912875, + "eval_loss": 0.2660870850086212, + "eval_runtime": 15.72, + "eval_samples_per_second": 1526.713, + "eval_steps_per_second": 4.771, + "step": 3580 + }, + { + "epoch": 1.5, + "learning_rate": 1e-06, + "loss": 0.2794, + "step": 3600 + }, + { + "epoch": 1.5, + "eval_accuracy": 0.9122916666666666, + "eval_loss": 0.2667754590511322, + "eval_runtime": 15.9732, + "eval_samples_per_second": 1502.513, + "eval_steps_per_second": 4.695, + "step": 3600 + }, + { + "epoch": 1.51, + "learning_rate": 1e-06, + "loss": 0.2378, + "step": 3620 + }, + { + "epoch": 1.51, + "eval_accuracy": 0.912375, + "eval_loss": 0.26728901267051697, + "eval_runtime": 15.3559, + "eval_samples_per_second": 1562.918, + "eval_steps_per_second": 4.884, + "step": 3620 + }, + { + "epoch": 1.52, + "learning_rate": 1e-06, + "loss": 0.2309, + "step": 3640 + }, + { + "epoch": 1.52, + "eval_accuracy": 0.91175, + "eval_loss": 0.2677074074745178, + "eval_runtime": 15.7934, + "eval_samples_per_second": 1519.623, + "eval_steps_per_second": 4.749, + "step": 3640 + }, + { + "epoch": 1.52, + "learning_rate": 1e-06, + "loss": 0.2414, + "step": 3660 + }, + { + "epoch": 1.52, + "eval_accuracy": 0.9127083333333333, + "eval_loss": 0.2664356529712677, + "eval_runtime": 16.3142, + "eval_samples_per_second": 1471.115, + "eval_steps_per_second": 4.597, + "step": 3660 + }, + { + "epoch": 1.53, + "learning_rate": 1e-06, + "loss": 0.2698, + "step": 3680 + }, + { + "epoch": 1.53, + "eval_accuracy": 0.9130416666666666, + "eval_loss": 0.26516926288604736, + "eval_runtime": 15.5498, + "eval_samples_per_second": 1543.428, + "eval_steps_per_second": 4.823, + "step": 3680 + }, + { + "epoch": 1.54, + "learning_rate": 1e-06, + "loss": 0.2674, + "step": 3700 + }, + { + "epoch": 1.54, + "eval_accuracy": 0.912875, + "eval_loss": 0.26480814814567566, + "eval_runtime": 15.9433, + "eval_samples_per_second": 1505.331, + "eval_steps_per_second": 4.704, + "step": 3700 + }, + { + "epoch": 1.55, + "learning_rate": 1e-06, + "loss": 0.3004, + "step": 3720 + }, + { + "epoch": 1.55, + "eval_accuracy": 0.9120833333333334, + "eval_loss": 0.26566416025161743, + "eval_runtime": 16.4814, + "eval_samples_per_second": 1456.188, + "eval_steps_per_second": 4.551, + "step": 3720 + }, + { + "epoch": 1.56, + "learning_rate": 1e-06, + "loss": 0.2621, + "step": 3740 + }, + { + "epoch": 1.56, + "eval_accuracy": 0.9130833333333334, + "eval_loss": 0.2644825577735901, + "eval_runtime": 15.8572, + "eval_samples_per_second": 1513.507, + "eval_steps_per_second": 4.73, + "step": 3740 + }, + { + "epoch": 1.57, + "learning_rate": 1e-06, + "loss": 0.2691, + "step": 3760 + }, + { + "epoch": 1.57, + "eval_accuracy": 0.9137916666666667, + "eval_loss": 0.26423123478889465, + "eval_runtime": 15.9709, + "eval_samples_per_second": 1502.733, + "eval_steps_per_second": 4.696, + "step": 3760 + }, + { + "epoch": 1.57, + "learning_rate": 1e-06, + "loss": 0.2768, + "step": 3780 + }, + { + "epoch": 1.57, + "eval_accuracy": 0.9114583333333334, + "eval_loss": 0.2679901421070099, + "eval_runtime": 15.6044, + "eval_samples_per_second": 1538.026, + "eval_steps_per_second": 4.806, + "step": 3780 + }, + { + "epoch": 1.58, + "learning_rate": 1e-06, + "loss": 0.2768, + "step": 3800 + }, + { + "epoch": 1.58, + "eval_accuracy": 0.9123333333333333, + "eval_loss": 0.2639557421207428, + "eval_runtime": 16.2935, + "eval_samples_per_second": 1472.978, + "eval_steps_per_second": 4.603, + "step": 3800 + }, + { + "epoch": 1.59, + "learning_rate": 1e-06, + "loss": 0.2308, + "step": 3820 + }, + { + "epoch": 1.59, + "eval_accuracy": 0.9118333333333334, + "eval_loss": 0.2661728262901306, + "eval_runtime": 15.4841, + "eval_samples_per_second": 1549.978, + "eval_steps_per_second": 4.844, + "step": 3820 + }, + { + "epoch": 1.6, + "learning_rate": 1e-06, + "loss": 0.2591, + "step": 3840 + }, + { + "epoch": 1.6, + "eval_accuracy": 0.9131666666666667, + "eval_loss": 0.2664397954940796, + "eval_runtime": 15.6675, + "eval_samples_per_second": 1531.834, + "eval_steps_per_second": 4.787, + "step": 3840 + }, + { + "epoch": 1.61, + "learning_rate": 1e-06, + "loss": 0.2496, + "step": 3860 + }, + { + "epoch": 1.61, + "eval_accuracy": 0.9125, + "eval_loss": 0.2657550573348999, + "eval_runtime": 15.6288, + "eval_samples_per_second": 1535.631, + "eval_steps_per_second": 4.799, + "step": 3860 + }, + { + "epoch": 1.62, + "learning_rate": 1e-06, + "loss": 0.2609, + "step": 3880 + }, + { + "epoch": 1.62, + "eval_accuracy": 0.91225, + "eval_loss": 0.26461654901504517, + "eval_runtime": 15.6184, + "eval_samples_per_second": 1536.644, + "eval_steps_per_second": 4.802, + "step": 3880 + }, + { + "epoch": 1.62, + "learning_rate": 1e-06, + "loss": 0.2491, + "step": 3900 + }, + { + "epoch": 1.62, + "eval_accuracy": 0.9132916666666666, + "eval_loss": 0.2625100314617157, + "eval_runtime": 15.868, + "eval_samples_per_second": 1512.482, + "eval_steps_per_second": 4.727, + "step": 3900 + }, + { + "epoch": 1.63, + "learning_rate": 1e-06, + "loss": 0.2609, + "step": 3920 + }, + { + "epoch": 1.63, + "eval_accuracy": 0.91375, + "eval_loss": 0.2650201916694641, + "eval_runtime": 15.7536, + "eval_samples_per_second": 1523.463, + "eval_steps_per_second": 4.761, + "step": 3920 + }, + { + "epoch": 1.64, + "learning_rate": 1e-06, + "loss": 0.225, + "step": 3940 + }, + { + "epoch": 1.64, + "eval_accuracy": 0.9124166666666667, + "eval_loss": 0.262962281703949, + "eval_runtime": 15.7571, + "eval_samples_per_second": 1523.123, + "eval_steps_per_second": 4.76, + "step": 3940 + }, + { + "epoch": 1.65, + "learning_rate": 1e-06, + "loss": 0.3099, + "step": 3960 + }, + { + "epoch": 1.65, + "eval_accuracy": 0.9139583333333333, + "eval_loss": 0.26255694031715393, + "eval_runtime": 15.7275, + "eval_samples_per_second": 1525.985, + "eval_steps_per_second": 4.769, + "step": 3960 + }, + { + "epoch": 1.66, + "learning_rate": 1e-06, + "loss": 0.2627, + "step": 3980 + }, + { + "epoch": 1.66, + "eval_accuracy": 0.91425, + "eval_loss": 0.26082727313041687, + "eval_runtime": 15.785, + "eval_samples_per_second": 1520.427, + "eval_steps_per_second": 4.751, + "step": 3980 + }, + { + "epoch": 1.67, + "learning_rate": 1e-06, + "loss": 0.2831, + "step": 4000 + }, + { + "epoch": 1.67, + "eval_accuracy": 0.9128333333333334, + "eval_loss": 0.2629248797893524, + "eval_runtime": 15.9672, + "eval_samples_per_second": 1503.078, + "eval_steps_per_second": 4.697, + "step": 4000 + }, + { + "epoch": 1.68, + "learning_rate": 1e-06, + "loss": 0.2806, + "step": 4020 + }, + { + "epoch": 1.68, + "eval_accuracy": 0.9131666666666667, + "eval_loss": 0.2608546316623688, + "eval_runtime": 15.7614, + "eval_samples_per_second": 1522.706, + "eval_steps_per_second": 4.758, + "step": 4020 + }, + { + "epoch": 1.68, + "learning_rate": 1e-06, + "loss": 0.3095, + "step": 4040 + }, + { + "epoch": 1.68, + "eval_accuracy": 0.914125, + "eval_loss": 0.263700008392334, + "eval_runtime": 15.7288, + "eval_samples_per_second": 1525.864, + "eval_steps_per_second": 4.768, + "step": 4040 + }, + { + "epoch": 1.69, + "learning_rate": 1e-06, + "loss": 0.2637, + "step": 4060 + }, + { + "epoch": 1.69, + "eval_accuracy": 0.9152083333333333, + "eval_loss": 0.26146170496940613, + "eval_runtime": 15.6613, + "eval_samples_per_second": 1532.442, + "eval_steps_per_second": 4.789, + "step": 4060 + }, + { + "epoch": 1.7, + "learning_rate": 1e-06, + "loss": 0.3105, + "step": 4080 + }, + { + "epoch": 1.7, + "eval_accuracy": 0.9137083333333333, + "eval_loss": 0.25999248027801514, + "eval_runtime": 15.6145, + "eval_samples_per_second": 1537.029, + "eval_steps_per_second": 4.803, + "step": 4080 + }, + { + "epoch": 1.71, + "learning_rate": 1e-06, + "loss": 0.2665, + "step": 4100 + }, + { + "epoch": 1.71, + "eval_accuracy": 0.912875, + "eval_loss": 0.2612546980381012, + "eval_runtime": 16.3137, + "eval_samples_per_second": 1471.158, + "eval_steps_per_second": 4.597, + "step": 4100 + }, + { + "epoch": 1.72, + "learning_rate": 1e-06, + "loss": 0.2582, + "step": 4120 + }, + { + "epoch": 1.72, + "eval_accuracy": 0.9130416666666666, + "eval_loss": 0.2623673975467682, + "eval_runtime": 15.824, + "eval_samples_per_second": 1516.686, + "eval_steps_per_second": 4.74, + "step": 4120 + }, + { + "epoch": 1.73, + "learning_rate": 1e-06, + "loss": 0.2425, + "step": 4140 + }, + { + "epoch": 1.73, + "eval_accuracy": 0.91425, + "eval_loss": 0.25996851921081543, + "eval_runtime": 16.2345, + "eval_samples_per_second": 1478.333, + "eval_steps_per_second": 4.62, + "step": 4140 + }, + { + "epoch": 1.73, + "learning_rate": 1e-06, + "loss": 0.2431, + "step": 4160 + }, + { + "epoch": 1.73, + "eval_accuracy": 0.914, + "eval_loss": 0.25984445214271545, + "eval_runtime": 15.7995, + "eval_samples_per_second": 1519.04, + "eval_steps_per_second": 4.747, + "step": 4160 + }, + { + "epoch": 1.74, + "learning_rate": 1e-06, + "loss": 0.2423, + "step": 4180 + }, + { + "epoch": 1.74, + "eval_accuracy": 0.9142916666666666, + "eval_loss": 0.2597646713256836, + "eval_runtime": 15.5687, + "eval_samples_per_second": 1541.552, + "eval_steps_per_second": 4.817, + "step": 4180 + }, + { + "epoch": 1.75, + "learning_rate": 1e-06, + "loss": 0.2594, + "step": 4200 + }, + { + "epoch": 1.75, + "eval_accuracy": 0.9134583333333334, + "eval_loss": 0.26066410541534424, + "eval_runtime": 15.9904, + "eval_samples_per_second": 1500.899, + "eval_steps_per_second": 4.69, + "step": 4200 + }, + { + "epoch": 1.76, + "learning_rate": 1e-06, + "loss": 0.2647, + "step": 4220 + }, + { + "epoch": 1.76, + "eval_accuracy": 0.914, + "eval_loss": 0.25917357206344604, + "eval_runtime": 15.5439, + "eval_samples_per_second": 1544.015, + "eval_steps_per_second": 4.825, + "step": 4220 + }, + { + "epoch": 1.77, + "learning_rate": 1e-06, + "loss": 0.282, + "step": 4240 + }, + { + "epoch": 1.77, + "eval_accuracy": 0.9147083333333333, + "eval_loss": 0.258999764919281, + "eval_runtime": 15.9706, + "eval_samples_per_second": 1502.761, + "eval_steps_per_second": 4.696, + "step": 4240 + }, + { + "epoch": 1.77, + "learning_rate": 1e-06, + "loss": 0.246, + "step": 4260 + }, + { + "epoch": 1.77, + "eval_accuracy": 0.9139166666666667, + "eval_loss": 0.25847962498664856, + "eval_runtime": 15.9552, + "eval_samples_per_second": 1504.21, + "eval_steps_per_second": 4.701, + "step": 4260 + }, + { + "epoch": 1.78, + "learning_rate": 1e-06, + "loss": 0.2558, + "step": 4280 + }, + { + "epoch": 1.78, + "eval_accuracy": 0.9138333333333334, + "eval_loss": 0.25929775834083557, + "eval_runtime": 15.5972, + "eval_samples_per_second": 1538.742, + "eval_steps_per_second": 4.809, + "step": 4280 + }, + { + "epoch": 1.79, + "learning_rate": 1e-06, + "loss": 0.2249, + "step": 4300 + }, + { + "epoch": 1.79, + "eval_accuracy": 0.9144166666666667, + "eval_loss": 0.258432537317276, + "eval_runtime": 15.9462, + "eval_samples_per_second": 1505.062, + "eval_steps_per_second": 4.703, + "step": 4300 + }, + { + "epoch": 1.8, + "learning_rate": 1e-06, + "loss": 0.2619, + "step": 4320 + }, + { + "epoch": 1.8, + "eval_accuracy": 0.9135416666666667, + "eval_loss": 0.25964200496673584, + "eval_runtime": 15.7728, + "eval_samples_per_second": 1521.607, + "eval_steps_per_second": 4.755, + "step": 4320 + }, + { + "epoch": 1.81, + "learning_rate": 1e-06, + "loss": 0.2789, + "step": 4340 + }, + { + "epoch": 1.81, + "eval_accuracy": 0.9133333333333333, + "eval_loss": 0.26067379117012024, + "eval_runtime": 16.1036, + "eval_samples_per_second": 1490.346, + "eval_steps_per_second": 4.657, + "step": 4340 + }, + { + "epoch": 1.82, + "learning_rate": 1e-06, + "loss": 0.2714, + "step": 4360 + }, + { + "epoch": 1.82, + "eval_accuracy": 0.9144583333333334, + "eval_loss": 0.25738218426704407, + "eval_runtime": 15.9298, + "eval_samples_per_second": 1506.611, + "eval_steps_per_second": 4.708, + "step": 4360 + }, + { + "epoch": 1.82, + "learning_rate": 1e-06, + "loss": 0.2662, + "step": 4380 + }, + { + "epoch": 1.82, + "eval_accuracy": 0.9156666666666666, + "eval_loss": 0.2561679184436798, + "eval_runtime": 16.0834, + "eval_samples_per_second": 1492.218, + "eval_steps_per_second": 4.663, + "step": 4380 + }, + { + "epoch": 1.83, + "learning_rate": 1e-06, + "loss": 0.2773, + "step": 4400 + }, + { + "epoch": 1.83, + "eval_accuracy": 0.914, + "eval_loss": 0.2571386694908142, + "eval_runtime": 15.7636, + "eval_samples_per_second": 1522.493, + "eval_steps_per_second": 4.758, + "step": 4400 + }, + { + "epoch": 1.84, + "learning_rate": 1e-06, + "loss": 0.2478, + "step": 4420 + }, + { + "epoch": 1.84, + "eval_accuracy": 0.9152916666666666, + "eval_loss": 0.25946521759033203, + "eval_runtime": 15.8816, + "eval_samples_per_second": 1511.187, + "eval_steps_per_second": 4.722, + "step": 4420 + }, + { + "epoch": 1.85, + "learning_rate": 1e-06, + "loss": 0.2517, + "step": 4440 + }, + { + "epoch": 1.85, + "eval_accuracy": 0.9144583333333334, + "eval_loss": 0.2570332884788513, + "eval_runtime": 15.9553, + "eval_samples_per_second": 1504.206, + "eval_steps_per_second": 4.701, + "step": 4440 + }, + { + "epoch": 1.86, + "learning_rate": 1e-06, + "loss": 0.2539, + "step": 4460 + }, + { + "epoch": 1.86, + "eval_accuracy": 0.9137916666666667, + "eval_loss": 0.25786763429641724, + "eval_runtime": 15.6911, + "eval_samples_per_second": 1529.534, + "eval_steps_per_second": 4.78, + "step": 4460 + }, + { + "epoch": 1.87, + "learning_rate": 1e-06, + "loss": 0.2635, + "step": 4480 + }, + { + "epoch": 1.87, + "eval_accuracy": 0.9154166666666667, + "eval_loss": 0.25599122047424316, + "eval_runtime": 15.95, + "eval_samples_per_second": 1504.705, + "eval_steps_per_second": 4.702, + "step": 4480 + }, + { + "epoch": 1.88, + "learning_rate": 1e-06, + "loss": 0.3007, + "step": 4500 + }, + { + "epoch": 1.88, + "eval_accuracy": 0.914625, + "eval_loss": 0.25718793272972107, + "eval_runtime": 16.0115, + "eval_samples_per_second": 1498.923, + "eval_steps_per_second": 4.684, + "step": 4500 + }, + { + "epoch": 1.88, + "learning_rate": 1e-06, + "loss": 0.2865, + "step": 4520 + }, + { + "epoch": 1.88, + "eval_accuracy": 0.915125, + "eval_loss": 0.25561970472335815, + "eval_runtime": 15.2788, + "eval_samples_per_second": 1570.809, + "eval_steps_per_second": 4.909, + "step": 4520 + }, + { + "epoch": 1.89, + "learning_rate": 1e-06, + "loss": 0.2234, + "step": 4540 + }, + { + "epoch": 1.89, + "eval_accuracy": 0.9139583333333333, + "eval_loss": 0.2578865885734558, + "eval_runtime": 15.8725, + "eval_samples_per_second": 1512.049, + "eval_steps_per_second": 4.725, + "step": 4540 + }, + { + "epoch": 1.9, + "learning_rate": 1e-06, + "loss": 0.2864, + "step": 4560 + }, + { + "epoch": 1.9, + "eval_accuracy": 0.9142083333333333, + "eval_loss": 0.25844380259513855, + "eval_runtime": 15.9686, + "eval_samples_per_second": 1502.947, + "eval_steps_per_second": 4.697, + "step": 4560 + }, + { + "epoch": 1.91, + "learning_rate": 1e-06, + "loss": 0.229, + "step": 4580 + }, + { + "epoch": 1.91, + "eval_accuracy": 0.9151666666666667, + "eval_loss": 0.2548525631427765, + "eval_runtime": 15.8099, + "eval_samples_per_second": 1518.04, + "eval_steps_per_second": 4.744, + "step": 4580 + }, + { + "epoch": 1.92, + "learning_rate": 1e-06, + "loss": 0.2584, + "step": 4600 + }, + { + "epoch": 1.92, + "eval_accuracy": 0.9155, + "eval_loss": 0.25401976704597473, + "eval_runtime": 15.9608, + "eval_samples_per_second": 1503.688, + "eval_steps_per_second": 4.699, + "step": 4600 + }, + { + "epoch": 1.93, + "learning_rate": 1e-06, + "loss": 0.3175, + "step": 4620 + }, + { + "epoch": 1.93, + "eval_accuracy": 0.9151666666666667, + "eval_loss": 0.2564151883125305, + "eval_runtime": 16.163, + "eval_samples_per_second": 1484.877, + "eval_steps_per_second": 4.64, + "step": 4620 + }, + { + "epoch": 1.93, + "learning_rate": 1e-06, + "loss": 0.3066, + "step": 4640 + }, + { + "epoch": 1.93, + "eval_accuracy": 0.9160416666666666, + "eval_loss": 0.2530251443386078, + "eval_runtime": 15.8188, + "eval_samples_per_second": 1517.184, + "eval_steps_per_second": 4.741, + "step": 4640 + }, + { + "epoch": 1.94, + "learning_rate": 1e-06, + "loss": 0.2746, + "step": 4660 + }, + { + "epoch": 1.94, + "eval_accuracy": 0.9144583333333334, + "eval_loss": 0.25700142979621887, + "eval_runtime": 15.9454, + "eval_samples_per_second": 1505.134, + "eval_steps_per_second": 4.704, + "step": 4660 + }, + { + "epoch": 1.95, + "learning_rate": 1e-06, + "loss": 0.2608, + "step": 4680 + }, + { + "epoch": 1.95, + "eval_accuracy": 0.915625, + "eval_loss": 0.25347229838371277, + "eval_runtime": 15.8024, + "eval_samples_per_second": 1518.76, + "eval_steps_per_second": 4.746, + "step": 4680 + }, + { + "epoch": 1.96, + "learning_rate": 1e-06, + "loss": 0.233, + "step": 4700 + }, + { + "epoch": 1.96, + "eval_accuracy": 0.9157083333333333, + "eval_loss": 0.2529941201210022, + "eval_runtime": 16.3246, + "eval_samples_per_second": 1470.174, + "eval_steps_per_second": 4.594, + "step": 4700 + }, + { + "epoch": 1.97, + "learning_rate": 1e-06, + "loss": 0.2556, + "step": 4720 + }, + { + "epoch": 1.97, + "eval_accuracy": 0.9156666666666666, + "eval_loss": 0.25271856784820557, + "eval_runtime": 15.8607, + "eval_samples_per_second": 1513.176, + "eval_steps_per_second": 4.729, + "step": 4720 + }, + { + "epoch": 1.98, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 4740 + }, + { + "epoch": 1.98, + "eval_accuracy": 0.9163333333333333, + "eval_loss": 0.251432865858078, + "eval_runtime": 16.1859, + "eval_samples_per_second": 1482.768, + "eval_steps_per_second": 4.634, + "step": 4740 + }, + { + "epoch": 1.98, + "learning_rate": 1e-06, + "loss": 0.212, + "step": 4760 + }, + { + "epoch": 1.98, + "eval_accuracy": 0.9162916666666666, + "eval_loss": 0.25113439559936523, + "eval_runtime": 15.5374, + "eval_samples_per_second": 1544.657, + "eval_steps_per_second": 4.827, + "step": 4760 + }, + { + "epoch": 1.99, + "learning_rate": 1e-06, + "loss": 0.2573, + "step": 4780 + }, + { + "epoch": 1.99, + "eval_accuracy": 0.9160833333333334, + "eval_loss": 0.25215280055999756, + "eval_runtime": 16.3238, + "eval_samples_per_second": 1470.242, + "eval_steps_per_second": 4.595, + "step": 4780 + }, + { + "epoch": 2.0, + "learning_rate": 1e-06, + "loss": 0.2533, + "step": 4800 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9161666666666667, + "eval_loss": 0.25234469771385193, + "eval_runtime": 15.7714, + "eval_samples_per_second": 1521.744, + "eval_steps_per_second": 4.755, + "step": 4800 + }, + { + "epoch": 2.01, + "learning_rate": 1e-06, + "loss": 0.2638, + "step": 4820 + }, + { + "epoch": 2.01, + "eval_accuracy": 0.9162916666666666, + "eval_loss": 0.25334304571151733, + "eval_runtime": 16.0308, + "eval_samples_per_second": 1497.122, + "eval_steps_per_second": 4.679, + "step": 4820 + }, + { + "epoch": 2.02, + "learning_rate": 1e-06, + "loss": 0.2763, + "step": 4840 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.9144166666666667, + "eval_loss": 0.25832322239875793, + "eval_runtime": 15.8766, + "eval_samples_per_second": 1511.661, + "eval_steps_per_second": 4.724, + "step": 4840 + }, + { + "epoch": 2.02, + "learning_rate": 1e-06, + "loss": 0.2503, + "step": 4860 + }, + { + "epoch": 2.02, + "eval_accuracy": 0.9145416666666667, + "eval_loss": 0.2566870450973511, + "eval_runtime": 15.4218, + "eval_samples_per_second": 1556.237, + "eval_steps_per_second": 4.863, + "step": 4860 + }, + { + "epoch": 2.03, + "learning_rate": 1e-06, + "loss": 0.2439, + "step": 4880 + }, + { + "epoch": 2.03, + "eval_accuracy": 0.916, + "eval_loss": 0.2518457770347595, + "eval_runtime": 16.4312, + "eval_samples_per_second": 1460.639, + "eval_steps_per_second": 4.564, + "step": 4880 + }, + { + "epoch": 2.04, + "learning_rate": 1e-06, + "loss": 0.2539, + "step": 4900 + }, + { + "epoch": 2.04, + "eval_accuracy": 0.9162083333333333, + "eval_loss": 0.25036895275115967, + "eval_runtime": 15.7834, + "eval_samples_per_second": 1520.583, + "eval_steps_per_second": 4.752, + "step": 4900 + }, + { + "epoch": 2.05, + "learning_rate": 1e-06, + "loss": 0.2508, + "step": 4920 + }, + { + "epoch": 2.05, + "eval_accuracy": 0.9161666666666667, + "eval_loss": 0.25022852420806885, + "eval_runtime": 15.2801, + "eval_samples_per_second": 1570.674, + "eval_steps_per_second": 4.908, + "step": 4920 + }, + { + "epoch": 2.06, + "learning_rate": 1e-06, + "loss": 0.2793, + "step": 4940 + }, + { + "epoch": 2.06, + "eval_accuracy": 0.917125, + "eval_loss": 0.25100791454315186, + "eval_runtime": 15.9757, + "eval_samples_per_second": 1502.281, + "eval_steps_per_second": 4.695, + "step": 4940 + }, + { + "epoch": 2.07, + "learning_rate": 1e-06, + "loss": 0.2369, + "step": 4960 + }, + { + "epoch": 2.07, + "eval_accuracy": 0.9169166666666667, + "eval_loss": 0.25172725319862366, + "eval_runtime": 15.9075, + "eval_samples_per_second": 1508.718, + "eval_steps_per_second": 4.715, + "step": 4960 + }, + { + "epoch": 2.08, + "learning_rate": 1e-06, + "loss": 0.2337, + "step": 4980 + }, + { + "epoch": 2.08, + "eval_accuracy": 0.9175833333333333, + "eval_loss": 0.24899640679359436, + "eval_runtime": 15.735, + "eval_samples_per_second": 1525.264, + "eval_steps_per_second": 4.766, + "step": 4980 + }, + { + "epoch": 2.08, + "learning_rate": 1e-06, + "loss": 0.2683, + "step": 5000 + }, + { + "epoch": 2.08, + "eval_accuracy": 0.9167916666666667, + "eval_loss": 0.24926243722438812, + "eval_runtime": 16.2679, + "eval_samples_per_second": 1475.302, + "eval_steps_per_second": 4.61, + "step": 5000 + }, + { + "epoch": 2.09, + "learning_rate": 1e-06, + "loss": 0.2288, + "step": 5020 + }, + { + "epoch": 2.09, + "eval_accuracy": 0.9171666666666667, + "eval_loss": 0.24971893429756165, + "eval_runtime": 15.7076, + "eval_samples_per_second": 1527.924, + "eval_steps_per_second": 4.775, + "step": 5020 + }, + { + "epoch": 2.1, + "learning_rate": 1e-06, + "loss": 0.209, + "step": 5040 + }, + { + "epoch": 2.1, + "eval_accuracy": 0.9169583333333333, + "eval_loss": 0.24874989688396454, + "eval_runtime": 15.9175, + "eval_samples_per_second": 1507.777, + "eval_steps_per_second": 4.712, + "step": 5040 + }, + { + "epoch": 2.11, + "learning_rate": 1e-06, + "loss": 0.2272, + "step": 5060 + }, + { + "epoch": 2.11, + "eval_accuracy": 0.9155416666666667, + "eval_loss": 0.2508015036582947, + "eval_runtime": 15.4995, + "eval_samples_per_second": 1548.44, + "eval_steps_per_second": 4.839, + "step": 5060 + }, + { + "epoch": 2.12, + "learning_rate": 1e-06, + "loss": 0.2537, + "step": 5080 + }, + { + "epoch": 2.12, + "eval_accuracy": 0.9165833333333333, + "eval_loss": 0.250166654586792, + "eval_runtime": 16.1014, + "eval_samples_per_second": 1490.551, + "eval_steps_per_second": 4.658, + "step": 5080 + }, + { + "epoch": 2.12, + "learning_rate": 1e-06, + "loss": 0.3171, + "step": 5100 + }, + { + "epoch": 2.12, + "eval_accuracy": 0.9159166666666667, + "eval_loss": 0.25174057483673096, + "eval_runtime": 15.868, + "eval_samples_per_second": 1512.48, + "eval_steps_per_second": 4.727, + "step": 5100 + }, + { + "epoch": 2.13, + "learning_rate": 1e-06, + "loss": 0.1955, + "step": 5120 + }, + { + "epoch": 2.13, + "eval_accuracy": 0.9165833333333333, + "eval_loss": 0.2481849193572998, + "eval_runtime": 15.8739, + "eval_samples_per_second": 1511.915, + "eval_steps_per_second": 4.725, + "step": 5120 + }, + { + "epoch": 2.14, + "learning_rate": 1e-06, + "loss": 0.2618, + "step": 5140 + }, + { + "epoch": 2.14, + "eval_accuracy": 0.9178333333333333, + "eval_loss": 0.24656133353710175, + "eval_runtime": 16.2181, + "eval_samples_per_second": 1479.83, + "eval_steps_per_second": 4.624, + "step": 5140 + }, + { + "epoch": 2.15, + "learning_rate": 1e-06, + "loss": 0.2334, + "step": 5160 + }, + { + "epoch": 2.15, + "eval_accuracy": 0.9174583333333334, + "eval_loss": 0.24714942276477814, + "eval_runtime": 15.6119, + "eval_samples_per_second": 1537.293, + "eval_steps_per_second": 4.804, + "step": 5160 + }, + { + "epoch": 2.16, + "learning_rate": 1e-06, + "loss": 0.2253, + "step": 5180 + }, + { + "epoch": 2.16, + "eval_accuracy": 0.917125, + "eval_loss": 0.2473263293504715, + "eval_runtime": 16.4346, + "eval_samples_per_second": 1460.331, + "eval_steps_per_second": 4.564, + "step": 5180 + }, + { + "epoch": 2.17, + "learning_rate": 1e-06, + "loss": 0.2735, + "step": 5200 + }, + { + "epoch": 2.17, + "eval_accuracy": 0.9169583333333333, + "eval_loss": 0.24832595884799957, + "eval_runtime": 15.9352, + "eval_samples_per_second": 1506.103, + "eval_steps_per_second": 4.707, + "step": 5200 + }, + { + "epoch": 2.17, + "learning_rate": 1e-06, + "loss": 0.2119, + "step": 5220 + }, + { + "epoch": 2.17, + "eval_accuracy": 0.9175, + "eval_loss": 0.24637369811534882, + "eval_runtime": 15.8502, + "eval_samples_per_second": 1514.172, + "eval_steps_per_second": 4.732, + "step": 5220 + }, + { + "epoch": 2.18, + "learning_rate": 1e-06, + "loss": 0.2338, + "step": 5240 + }, + { + "epoch": 2.18, + "eval_accuracy": 0.9175416666666667, + "eval_loss": 0.24698366224765778, + "eval_runtime": 15.7545, + "eval_samples_per_second": 1523.378, + "eval_steps_per_second": 4.761, + "step": 5240 + }, + { + "epoch": 2.19, + "learning_rate": 1e-06, + "loss": 0.2233, + "step": 5260 + }, + { + "epoch": 2.19, + "eval_accuracy": 0.9155, + "eval_loss": 0.24797461926937103, + "eval_runtime": 15.7338, + "eval_samples_per_second": 1525.377, + "eval_steps_per_second": 4.767, + "step": 5260 + }, + { + "epoch": 2.2, + "learning_rate": 1e-06, + "loss": 0.243, + "step": 5280 + }, + { + "epoch": 2.2, + "eval_accuracy": 0.916375, + "eval_loss": 0.24534855782985687, + "eval_runtime": 15.9549, + "eval_samples_per_second": 1504.244, + "eval_steps_per_second": 4.701, + "step": 5280 + }, + { + "epoch": 2.21, + "learning_rate": 1e-06, + "loss": 0.2476, + "step": 5300 + }, + { + "epoch": 2.21, + "eval_accuracy": 0.917125, + "eval_loss": 0.24429555237293243, + "eval_runtime": 16.2563, + "eval_samples_per_second": 1476.355, + "eval_steps_per_second": 4.614, + "step": 5300 + }, + { + "epoch": 2.22, + "learning_rate": 1e-06, + "loss": 0.2481, + "step": 5320 + }, + { + "epoch": 2.22, + "eval_accuracy": 0.9174166666666667, + "eval_loss": 0.24369333684444427, + "eval_runtime": 15.6541, + "eval_samples_per_second": 1533.143, + "eval_steps_per_second": 4.791, + "step": 5320 + }, + { + "epoch": 2.23, + "learning_rate": 1e-06, + "loss": 0.2505, + "step": 5340 + }, + { + "epoch": 2.23, + "eval_accuracy": 0.9179583333333333, + "eval_loss": 0.24328631162643433, + "eval_runtime": 15.9128, + "eval_samples_per_second": 1508.223, + "eval_steps_per_second": 4.713, + "step": 5340 + }, + { + "epoch": 2.23, + "learning_rate": 1e-06, + "loss": 0.2591, + "step": 5360 + }, + { + "epoch": 2.23, + "eval_accuracy": 0.9164583333333334, + "eval_loss": 0.24514025449752808, + "eval_runtime": 16.021, + "eval_samples_per_second": 1498.036, + "eval_steps_per_second": 4.681, + "step": 5360 + }, + { + "epoch": 2.24, + "learning_rate": 1e-06, + "loss": 0.2342, + "step": 5380 + }, + { + "epoch": 2.24, + "eval_accuracy": 0.9154166666666667, + "eval_loss": 0.2472737431526184, + "eval_runtime": 15.688, + "eval_samples_per_second": 1529.832, + "eval_steps_per_second": 4.781, + "step": 5380 + }, + { + "epoch": 2.25, + "learning_rate": 1e-06, + "loss": 0.2405, + "step": 5400 + }, + { + "epoch": 2.25, + "eval_accuracy": 0.9179166666666667, + "eval_loss": 0.24237360060214996, + "eval_runtime": 16.4849, + "eval_samples_per_second": 1455.882, + "eval_steps_per_second": 4.55, + "step": 5400 + }, + { + "epoch": 2.26, + "learning_rate": 1e-06, + "loss": 0.2803, + "step": 5420 + }, + { + "epoch": 2.26, + "eval_accuracy": 0.9185416666666667, + "eval_loss": 0.2412734031677246, + "eval_runtime": 15.7027, + "eval_samples_per_second": 1528.403, + "eval_steps_per_second": 4.776, + "step": 5420 + }, + { + "epoch": 2.27, + "learning_rate": 1e-06, + "loss": 0.2816, + "step": 5440 + }, + { + "epoch": 2.27, + "eval_accuracy": 0.9178333333333333, + "eval_loss": 0.2425076812505722, + "eval_runtime": 15.8848, + "eval_samples_per_second": 1510.874, + "eval_steps_per_second": 4.721, + "step": 5440 + }, + { + "epoch": 2.27, + "learning_rate": 1e-06, + "loss": 0.2489, + "step": 5460 + }, + { + "epoch": 2.27, + "eval_accuracy": 0.9172916666666666, + "eval_loss": 0.24504542350769043, + "eval_runtime": 16.3712, + "eval_samples_per_second": 1465.986, + "eval_steps_per_second": 4.581, + "step": 5460 + }, + { + "epoch": 2.28, + "learning_rate": 1e-06, + "loss": 0.2346, + "step": 5480 + }, + { + "epoch": 2.28, + "eval_accuracy": 0.9180833333333334, + "eval_loss": 0.2423253357410431, + "eval_runtime": 15.5334, + "eval_samples_per_second": 1545.062, + "eval_steps_per_second": 4.828, + "step": 5480 + }, + { + "epoch": 2.29, + "learning_rate": 1e-06, + "loss": 0.251, + "step": 5500 + }, + { + "epoch": 2.29, + "eval_accuracy": 0.9185, + "eval_loss": 0.24269212782382965, + "eval_runtime": 16.0617, + "eval_samples_per_second": 1494.234, + "eval_steps_per_second": 4.669, + "step": 5500 + }, + { + "epoch": 2.3, + "learning_rate": 1e-06, + "loss": 0.2574, + "step": 5520 + }, + { + "epoch": 2.3, + "eval_accuracy": 0.9187083333333333, + "eval_loss": 0.2401323914527893, + "eval_runtime": 15.7783, + "eval_samples_per_second": 1521.078, + "eval_steps_per_second": 4.753, + "step": 5520 + }, + { + "epoch": 2.31, + "learning_rate": 1e-06, + "loss": 0.2227, + "step": 5540 + }, + { + "epoch": 2.31, + "eval_accuracy": 0.9179583333333333, + "eval_loss": 0.24057930707931519, + "eval_runtime": 16.3047, + "eval_samples_per_second": 1471.971, + "eval_steps_per_second": 4.6, + "step": 5540 + }, + { + "epoch": 2.32, + "learning_rate": 1e-06, + "loss": 0.2362, + "step": 5560 + }, + { + "epoch": 2.32, + "eval_accuracy": 0.9185833333333333, + "eval_loss": 0.23938481509685516, + "eval_runtime": 15.6702, + "eval_samples_per_second": 1531.572, + "eval_steps_per_second": 4.786, + "step": 5560 + }, + { + "epoch": 2.33, + "learning_rate": 1e-06, + "loss": 0.2346, + "step": 5580 + }, + { + "epoch": 2.33, + "eval_accuracy": 0.918625, + "eval_loss": 0.23980508744716644, + "eval_runtime": 15.999, + "eval_samples_per_second": 1500.095, + "eval_steps_per_second": 4.688, + "step": 5580 + }, + { + "epoch": 2.33, + "learning_rate": 1e-06, + "loss": 0.2308, + "step": 5600 + }, + { + "epoch": 2.33, + "eval_accuracy": 0.9172083333333333, + "eval_loss": 0.24078369140625, + "eval_runtime": 16.2118, + "eval_samples_per_second": 1480.402, + "eval_steps_per_second": 4.626, + "step": 5600 + }, + { + "epoch": 2.34, + "learning_rate": 1e-06, + "loss": 0.2668, + "step": 5620 + }, + { + "epoch": 2.34, + "eval_accuracy": 0.9185833333333333, + "eval_loss": 0.23922114074230194, + "eval_runtime": 15.9955, + "eval_samples_per_second": 1500.419, + "eval_steps_per_second": 4.689, + "step": 5620 + }, + { + "epoch": 2.35, + "learning_rate": 1e-06, + "loss": 0.2363, + "step": 5640 + }, + { + "epoch": 2.35, + "eval_accuracy": 0.9182083333333333, + "eval_loss": 0.23886892199516296, + "eval_runtime": 15.6875, + "eval_samples_per_second": 1529.876, + "eval_steps_per_second": 4.781, + "step": 5640 + }, + { + "epoch": 2.36, + "learning_rate": 1e-06, + "loss": 0.2152, + "step": 5660 + }, + { + "epoch": 2.36, + "eval_accuracy": 0.918125, + "eval_loss": 0.23896987736225128, + "eval_runtime": 15.4793, + "eval_samples_per_second": 1550.456, + "eval_steps_per_second": 4.845, + "step": 5660 + }, + { + "epoch": 2.37, + "learning_rate": 1e-06, + "loss": 0.2759, + "step": 5680 + }, + { + "epoch": 2.37, + "eval_accuracy": 0.9181666666666667, + "eval_loss": 0.23786494135856628, + "eval_runtime": 15.5978, + "eval_samples_per_second": 1538.675, + "eval_steps_per_second": 4.808, + "step": 5680 + }, + { + "epoch": 2.38, + "learning_rate": 1e-06, + "loss": 0.2274, + "step": 5700 + }, + { + "epoch": 2.38, + "eval_accuracy": 0.9188333333333333, + "eval_loss": 0.23672978579998016, + "eval_runtime": 15.6992, + "eval_samples_per_second": 1528.744, + "eval_steps_per_second": 4.777, + "step": 5700 + }, + { + "epoch": 2.38, + "learning_rate": 1e-06, + "loss": 0.239, + "step": 5720 + }, + { + "epoch": 2.38, + "eval_accuracy": 0.9190833333333334, + "eval_loss": 0.23753681778907776, + "eval_runtime": 15.6792, + "eval_samples_per_second": 1530.692, + "eval_steps_per_second": 4.783, + "step": 5720 + }, + { + "epoch": 2.39, + "learning_rate": 1e-06, + "loss": 0.2195, + "step": 5740 + }, + { + "epoch": 2.39, + "eval_accuracy": 0.9187083333333333, + "eval_loss": 0.2378937304019928, + "eval_runtime": 16.0105, + "eval_samples_per_second": 1499.013, + "eval_steps_per_second": 4.684, + "step": 5740 + }, + { + "epoch": 2.4, + "learning_rate": 1e-06, + "loss": 0.2454, + "step": 5760 + }, + { + "epoch": 2.4, + "eval_accuracy": 0.9187916666666667, + "eval_loss": 0.23726117610931396, + "eval_runtime": 15.4532, + "eval_samples_per_second": 1553.077, + "eval_steps_per_second": 4.853, + "step": 5760 + }, + { + "epoch": 2.41, + "learning_rate": 1e-06, + "loss": 0.2376, + "step": 5780 + }, + { + "epoch": 2.41, + "eval_accuracy": 0.9192916666666666, + "eval_loss": 0.23683802783489227, + "eval_runtime": 16.4115, + "eval_samples_per_second": 1462.385, + "eval_steps_per_second": 4.57, + "step": 5780 + }, + { + "epoch": 2.42, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 5800 + }, + { + "epoch": 2.42, + "eval_accuracy": 0.9193333333333333, + "eval_loss": 0.23695851862430573, + "eval_runtime": 15.7863, + "eval_samples_per_second": 1520.307, + "eval_steps_per_second": 4.751, + "step": 5800 + }, + { + "epoch": 2.42, + "learning_rate": 1e-06, + "loss": 0.2365, + "step": 5820 + }, + { + "epoch": 2.42, + "eval_accuracy": 0.919375, + "eval_loss": 0.23560389876365662, + "eval_runtime": 16.216, + "eval_samples_per_second": 1480.022, + "eval_steps_per_second": 4.625, + "step": 5820 + }, + { + "epoch": 2.43, + "learning_rate": 1e-06, + "loss": 0.2025, + "step": 5840 + }, + { + "epoch": 2.43, + "eval_accuracy": 0.9199583333333333, + "eval_loss": 0.2355504035949707, + "eval_runtime": 15.4916, + "eval_samples_per_second": 1549.222, + "eval_steps_per_second": 4.841, + "step": 5840 + }, + { + "epoch": 2.44, + "learning_rate": 1e-06, + "loss": 0.2115, + "step": 5860 + }, + { + "epoch": 2.44, + "eval_accuracy": 0.9187083333333333, + "eval_loss": 0.23711217939853668, + "eval_runtime": 15.6181, + "eval_samples_per_second": 1536.68, + "eval_steps_per_second": 4.802, + "step": 5860 + }, + { + "epoch": 2.45, + "learning_rate": 1e-06, + "loss": 0.191, + "step": 5880 + }, + { + "epoch": 2.45, + "eval_accuracy": 0.9195, + "eval_loss": 0.2373773753643036, + "eval_runtime": 15.8036, + "eval_samples_per_second": 1518.638, + "eval_steps_per_second": 4.746, + "step": 5880 + }, + { + "epoch": 2.46, + "learning_rate": 1e-06, + "loss": 0.2228, + "step": 5900 + }, + { + "epoch": 2.46, + "eval_accuracy": 0.9200416666666666, + "eval_loss": 0.23531056940555573, + "eval_runtime": 15.9661, + "eval_samples_per_second": 1503.188, + "eval_steps_per_second": 4.697, + "step": 5900 + }, + { + "epoch": 2.47, + "learning_rate": 1e-06, + "loss": 0.2151, + "step": 5920 + }, + { + "epoch": 2.47, + "eval_accuracy": 0.919375, + "eval_loss": 0.23577865958213806, + "eval_runtime": 15.3031, + "eval_samples_per_second": 1568.307, + "eval_steps_per_second": 4.901, + "step": 5920 + }, + { + "epoch": 2.48, + "learning_rate": 1e-06, + "loss": 0.2351, + "step": 5940 + }, + { + "epoch": 2.48, + "eval_accuracy": 0.9205, + "eval_loss": 0.23470845818519592, + "eval_runtime": 15.6649, + "eval_samples_per_second": 1532.088, + "eval_steps_per_second": 4.788, + "step": 5940 + }, + { + "epoch": 2.48, + "learning_rate": 1e-06, + "loss": 0.2535, + "step": 5960 + }, + { + "epoch": 2.48, + "eval_accuracy": 0.9204166666666667, + "eval_loss": 0.23469573259353638, + "eval_runtime": 15.8743, + "eval_samples_per_second": 1511.875, + "eval_steps_per_second": 4.725, + "step": 5960 + }, + { + "epoch": 2.49, + "learning_rate": 1e-06, + "loss": 0.2646, + "step": 5980 + }, + { + "epoch": 2.49, + "eval_accuracy": 0.9199166666666667, + "eval_loss": 0.23572835326194763, + "eval_runtime": 15.9499, + "eval_samples_per_second": 1504.715, + "eval_steps_per_second": 4.702, + "step": 5980 + }, + { + "epoch": 2.5, + "learning_rate": 1e-06, + "loss": 0.2495, + "step": 6000 + }, + { + "epoch": 2.5, + "eval_accuracy": 0.9185, + "eval_loss": 0.237389475107193, + "eval_runtime": 15.8761, + "eval_samples_per_second": 1511.71, + "eval_steps_per_second": 4.724, + "step": 6000 + }, + { + "epoch": 2.51, + "learning_rate": 1e-06, + "loss": 0.2383, + "step": 6020 + }, + { + "epoch": 2.51, + "eval_accuracy": 0.91875, + "eval_loss": 0.23722399771213531, + "eval_runtime": 16.1535, + "eval_samples_per_second": 1485.747, + "eval_steps_per_second": 4.643, + "step": 6020 + }, + { + "epoch": 2.52, + "learning_rate": 1e-06, + "loss": 0.2103, + "step": 6040 + }, + { + "epoch": 2.52, + "eval_accuracy": 0.9197916666666667, + "eval_loss": 0.23570208251476288, + "eval_runtime": 15.6602, + "eval_samples_per_second": 1532.549, + "eval_steps_per_second": 4.789, + "step": 6040 + }, + { + "epoch": 2.52, + "learning_rate": 1e-06, + "loss": 0.2667, + "step": 6060 + }, + { + "epoch": 2.52, + "eval_accuracy": 0.9205, + "eval_loss": 0.2345227301120758, + "eval_runtime": 15.9993, + "eval_samples_per_second": 1500.065, + "eval_steps_per_second": 4.688, + "step": 6060 + }, + { + "epoch": 2.53, + "learning_rate": 1e-06, + "loss": 0.2229, + "step": 6080 + }, + { + "epoch": 2.53, + "eval_accuracy": 0.9203333333333333, + "eval_loss": 0.23675419390201569, + "eval_runtime": 16.0513, + "eval_samples_per_second": 1495.205, + "eval_steps_per_second": 4.673, + "step": 6080 + }, + { + "epoch": 2.54, + "learning_rate": 1e-06, + "loss": 0.2794, + "step": 6100 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.9181666666666667, + "eval_loss": 0.2398298680782318, + "eval_runtime": 15.8057, + "eval_samples_per_second": 1518.443, + "eval_steps_per_second": 4.745, + "step": 6100 + }, + { + "epoch": 2.55, + "learning_rate": 1e-06, + "loss": 0.2191, + "step": 6120 + }, + { + "epoch": 2.55, + "eval_accuracy": 0.919125, + "eval_loss": 0.23680013418197632, + "eval_runtime": 16.0686, + "eval_samples_per_second": 1493.6, + "eval_steps_per_second": 4.668, + "step": 6120 + }, + { + "epoch": 2.56, + "learning_rate": 1e-06, + "loss": 0.263, + "step": 6140 + }, + { + "epoch": 2.56, + "eval_accuracy": 0.9210833333333334, + "eval_loss": 0.23318885266780853, + "eval_runtime": 15.8077, + "eval_samples_per_second": 1518.25, + "eval_steps_per_second": 4.745, + "step": 6140 + }, + { + "epoch": 2.57, + "learning_rate": 1e-06, + "loss": 0.2008, + "step": 6160 + }, + { + "epoch": 2.57, + "eval_accuracy": 0.9214166666666667, + "eval_loss": 0.2328587919473648, + "eval_runtime": 15.7426, + "eval_samples_per_second": 1524.526, + "eval_steps_per_second": 4.764, + "step": 6160 + }, + { + "epoch": 2.58, + "learning_rate": 1e-06, + "loss": 0.2226, + "step": 6180 + }, + { + "epoch": 2.58, + "eval_accuracy": 0.9210416666666666, + "eval_loss": 0.23366689682006836, + "eval_runtime": 15.6914, + "eval_samples_per_second": 1529.497, + "eval_steps_per_second": 4.78, + "step": 6180 + }, + { + "epoch": 2.58, + "learning_rate": 1e-06, + "loss": 0.2261, + "step": 6200 + }, + { + "epoch": 2.58, + "eval_accuracy": 0.9207083333333334, + "eval_loss": 0.23435989022254944, + "eval_runtime": 15.7413, + "eval_samples_per_second": 1524.655, + "eval_steps_per_second": 4.765, + "step": 6200 + }, + { + "epoch": 2.59, + "learning_rate": 1e-06, + "loss": 0.2481, + "step": 6220 + }, + { + "epoch": 2.59, + "eval_accuracy": 0.9221666666666667, + "eval_loss": 0.23215261101722717, + "eval_runtime": 15.7855, + "eval_samples_per_second": 1520.387, + "eval_steps_per_second": 4.751, + "step": 6220 + }, + { + "epoch": 2.6, + "learning_rate": 1e-06, + "loss": 0.2235, + "step": 6240 + }, + { + "epoch": 2.6, + "eval_accuracy": 0.9214583333333334, + "eval_loss": 0.23404672741889954, + "eval_runtime": 16.0667, + "eval_samples_per_second": 1493.777, + "eval_steps_per_second": 4.668, + "step": 6240 + }, + { + "epoch": 2.61, + "learning_rate": 1e-06, + "loss": 0.2821, + "step": 6260 + }, + { + "epoch": 2.61, + "eval_accuracy": 0.9206666666666666, + "eval_loss": 0.2340671271085739, + "eval_runtime": 16.2416, + "eval_samples_per_second": 1477.689, + "eval_steps_per_second": 4.618, + "step": 6260 + }, + { + "epoch": 2.62, + "learning_rate": 1e-06, + "loss": 0.1988, + "step": 6280 + }, + { + "epoch": 2.62, + "eval_accuracy": 0.9215, + "eval_loss": 0.2360154539346695, + "eval_runtime": 15.5902, + "eval_samples_per_second": 1539.425, + "eval_steps_per_second": 4.811, + "step": 6280 + }, + { + "epoch": 2.62, + "learning_rate": 1e-06, + "loss": 0.2407, + "step": 6300 + }, + { + "epoch": 2.62, + "eval_accuracy": 0.9207083333333334, + "eval_loss": 0.23405057191848755, + "eval_runtime": 16.1083, + "eval_samples_per_second": 1489.915, + "eval_steps_per_second": 4.656, + "step": 6300 + }, + { + "epoch": 2.63, + "learning_rate": 1e-06, + "loss": 0.2138, + "step": 6320 + }, + { + "epoch": 2.63, + "eval_accuracy": 0.9222916666666666, + "eval_loss": 0.23155492544174194, + "eval_runtime": 15.9219, + "eval_samples_per_second": 1507.362, + "eval_steps_per_second": 4.711, + "step": 6320 + }, + { + "epoch": 2.64, + "learning_rate": 1e-06, + "loss": 0.2171, + "step": 6340 + }, + { + "epoch": 2.64, + "eval_accuracy": 0.9215833333333333, + "eval_loss": 0.23096613585948944, + "eval_runtime": 15.2563, + "eval_samples_per_second": 1573.117, + "eval_steps_per_second": 4.916, + "step": 6340 + }, + { + "epoch": 2.65, + "learning_rate": 1e-06, + "loss": 0.2302, + "step": 6360 + }, + { + "epoch": 2.65, + "eval_accuracy": 0.9212083333333333, + "eval_loss": 0.23060353100299835, + "eval_runtime": 16.0532, + "eval_samples_per_second": 1495.028, + "eval_steps_per_second": 4.672, + "step": 6360 + }, + { + "epoch": 2.66, + "learning_rate": 1e-06, + "loss": 0.2394, + "step": 6380 + }, + { + "epoch": 2.66, + "eval_accuracy": 0.9209166666666667, + "eval_loss": 0.23088908195495605, + "eval_runtime": 15.9256, + "eval_samples_per_second": 1507.003, + "eval_steps_per_second": 4.709, + "step": 6380 + }, + { + "epoch": 2.67, + "learning_rate": 1e-06, + "loss": 0.2321, + "step": 6400 + }, + { + "epoch": 2.67, + "eval_accuracy": 0.9219166666666667, + "eval_loss": 0.22929558157920837, + "eval_runtime": 15.7096, + "eval_samples_per_second": 1527.727, + "eval_steps_per_second": 4.774, + "step": 6400 + }, + { + "epoch": 2.67, + "learning_rate": 1e-06, + "loss": 0.2011, + "step": 6420 + }, + { + "epoch": 2.67, + "eval_accuracy": 0.92225, + "eval_loss": 0.2290237993001938, + "eval_runtime": 15.5552, + "eval_samples_per_second": 1542.895, + "eval_steps_per_second": 4.822, + "step": 6420 + }, + { + "epoch": 2.68, + "learning_rate": 1e-06, + "loss": 0.2235, + "step": 6440 + }, + { + "epoch": 2.68, + "eval_accuracy": 0.9219583333333333, + "eval_loss": 0.23050343990325928, + "eval_runtime": 16.0981, + "eval_samples_per_second": 1490.855, + "eval_steps_per_second": 4.659, + "step": 6440 + }, + { + "epoch": 2.69, + "learning_rate": 1e-06, + "loss": 0.2638, + "step": 6460 + }, + { + "epoch": 2.69, + "eval_accuracy": 0.9218333333333333, + "eval_loss": 0.23081812262535095, + "eval_runtime": 16.0097, + "eval_samples_per_second": 1499.096, + "eval_steps_per_second": 4.685, + "step": 6460 + }, + { + "epoch": 2.7, + "learning_rate": 1e-06, + "loss": 0.2767, + "step": 6480 + }, + { + "epoch": 2.7, + "eval_accuracy": 0.92225, + "eval_loss": 0.23322133719921112, + "eval_runtime": 16.0242, + "eval_samples_per_second": 1497.736, + "eval_steps_per_second": 4.68, + "step": 6480 + }, + { + "epoch": 2.71, + "learning_rate": 1e-06, + "loss": 0.2332, + "step": 6500 + }, + { + "epoch": 2.71, + "eval_accuracy": 0.9226666666666666, + "eval_loss": 0.22899757325649261, + "eval_runtime": 16.6235, + "eval_samples_per_second": 1443.741, + "eval_steps_per_second": 4.512, + "step": 6500 + }, + { + "epoch": 2.72, + "learning_rate": 1e-06, + "loss": 0.2104, + "step": 6520 + }, + { + "epoch": 2.72, + "eval_accuracy": 0.9220833333333334, + "eval_loss": 0.22903680801391602, + "eval_runtime": 15.8598, + "eval_samples_per_second": 1513.257, + "eval_steps_per_second": 4.729, + "step": 6520 + }, + { + "epoch": 2.73, + "learning_rate": 1e-06, + "loss": 0.2148, + "step": 6540 + }, + { + "epoch": 2.73, + "eval_accuracy": 0.9219166666666667, + "eval_loss": 0.2291133552789688, + "eval_runtime": 16.0042, + "eval_samples_per_second": 1499.604, + "eval_steps_per_second": 4.686, + "step": 6540 + }, + { + "epoch": 2.73, + "learning_rate": 1e-06, + "loss": 0.2708, + "step": 6560 + }, + { + "epoch": 2.73, + "eval_accuracy": 0.922, + "eval_loss": 0.22891123592853546, + "eval_runtime": 15.8338, + "eval_samples_per_second": 1515.741, + "eval_steps_per_second": 4.737, + "step": 6560 + }, + { + "epoch": 2.74, + "learning_rate": 1e-06, + "loss": 0.2155, + "step": 6580 + }, + { + "epoch": 2.74, + "eval_accuracy": 0.9213333333333333, + "eval_loss": 0.2305193543434143, + "eval_runtime": 15.7359, + "eval_samples_per_second": 1525.179, + "eval_steps_per_second": 4.766, + "step": 6580 + }, + { + "epoch": 2.75, + "learning_rate": 1e-06, + "loss": 0.2149, + "step": 6600 + }, + { + "epoch": 2.75, + "eval_accuracy": 0.921375, + "eval_loss": 0.22942766547203064, + "eval_runtime": 15.983, + "eval_samples_per_second": 1501.595, + "eval_steps_per_second": 4.692, + "step": 6600 + }, + { + "epoch": 2.76, + "learning_rate": 1e-06, + "loss": 0.2396, + "step": 6620 + }, + { + "epoch": 2.76, + "eval_accuracy": 0.9215, + "eval_loss": 0.23061256110668182, + "eval_runtime": 15.913, + "eval_samples_per_second": 1508.203, + "eval_steps_per_second": 4.713, + "step": 6620 + }, + { + "epoch": 2.77, + "learning_rate": 1e-06, + "loss": 0.2572, + "step": 6640 + }, + { + "epoch": 2.77, + "eval_accuracy": 0.9215, + "eval_loss": 0.23231017589569092, + "eval_runtime": 15.4396, + "eval_samples_per_second": 1554.44, + "eval_steps_per_second": 4.858, + "step": 6640 + }, + { + "epoch": 2.77, + "learning_rate": 1e-06, + "loss": 0.2536, + "step": 6660 + }, + { + "epoch": 2.77, + "eval_accuracy": 0.9224166666666667, + "eval_loss": 0.2280372679233551, + "eval_runtime": 15.4345, + "eval_samples_per_second": 1554.958, + "eval_steps_per_second": 4.859, + "step": 6660 + }, + { + "epoch": 2.78, + "learning_rate": 1e-06, + "loss": 0.1892, + "step": 6680 + }, + { + "epoch": 2.78, + "eval_accuracy": 0.9220416666666666, + "eval_loss": 0.22818611562252045, + "eval_runtime": 16.0452, + "eval_samples_per_second": 1495.777, + "eval_steps_per_second": 4.674, + "step": 6680 + }, + { + "epoch": 2.79, + "learning_rate": 1e-06, + "loss": 0.2689, + "step": 6700 + }, + { + "epoch": 2.79, + "eval_accuracy": 0.92125, + "eval_loss": 0.229068323969841, + "eval_runtime": 15.6442, + "eval_samples_per_second": 1534.111, + "eval_steps_per_second": 4.794, + "step": 6700 + }, + { + "epoch": 2.8, + "learning_rate": 1e-06, + "loss": 0.3018, + "step": 6720 + }, + { + "epoch": 2.8, + "eval_accuracy": 0.9225, + "eval_loss": 0.22712692618370056, + "eval_runtime": 15.8562, + "eval_samples_per_second": 1513.605, + "eval_steps_per_second": 4.73, + "step": 6720 + }, + { + "epoch": 2.81, + "learning_rate": 1e-06, + "loss": 0.1971, + "step": 6740 + }, + { + "epoch": 2.81, + "eval_accuracy": 0.9229583333333333, + "eval_loss": 0.22696258127689362, + "eval_runtime": 15.7829, + "eval_samples_per_second": 1520.631, + "eval_steps_per_second": 4.752, + "step": 6740 + }, + { + "epoch": 2.82, + "learning_rate": 1e-06, + "loss": 0.2276, + "step": 6760 + }, + { + "epoch": 2.82, + "eval_accuracy": 0.923375, + "eval_loss": 0.22678209841251373, + "eval_runtime": 15.81, + "eval_samples_per_second": 1518.027, + "eval_steps_per_second": 4.744, + "step": 6760 + }, + { + "epoch": 2.83, + "learning_rate": 1e-06, + "loss": 0.2141, + "step": 6780 + }, + { + "epoch": 2.83, + "eval_accuracy": 0.923, + "eval_loss": 0.2268705815076828, + "eval_runtime": 16.2609, + "eval_samples_per_second": 1475.93, + "eval_steps_per_second": 4.612, + "step": 6780 + }, + { + "epoch": 2.83, + "learning_rate": 1e-06, + "loss": 0.2376, + "step": 6800 + }, + { + "epoch": 2.83, + "eval_accuracy": 0.9234583333333334, + "eval_loss": 0.22707809507846832, + "eval_runtime": 15.6779, + "eval_samples_per_second": 1530.813, + "eval_steps_per_second": 4.784, + "step": 6800 + }, + { + "epoch": 2.84, + "learning_rate": 1e-06, + "loss": 0.2237, + "step": 6820 + }, + { + "epoch": 2.84, + "eval_accuracy": 0.9207916666666667, + "eval_loss": 0.231236070394516, + "eval_runtime": 16.1718, + "eval_samples_per_second": 1484.064, + "eval_steps_per_second": 4.638, + "step": 6820 + }, + { + "epoch": 2.85, + "learning_rate": 1e-06, + "loss": 0.2114, + "step": 6840 + }, + { + "epoch": 2.85, + "eval_accuracy": 0.9222083333333333, + "eval_loss": 0.2280959039926529, + "eval_runtime": 15.646, + "eval_samples_per_second": 1533.937, + "eval_steps_per_second": 4.794, + "step": 6840 + }, + { + "epoch": 2.86, + "learning_rate": 1e-06, + "loss": 0.2037, + "step": 6860 + }, + { + "epoch": 2.86, + "eval_accuracy": 0.9230833333333334, + "eval_loss": 0.22740183770656586, + "eval_runtime": 16.2073, + "eval_samples_per_second": 1480.812, + "eval_steps_per_second": 4.628, + "step": 6860 + }, + { + "epoch": 2.87, + "learning_rate": 1e-06, + "loss": 0.2412, + "step": 6880 + }, + { + "epoch": 2.87, + "eval_accuracy": 0.9233333333333333, + "eval_loss": 0.22586235404014587, + "eval_runtime": 15.8595, + "eval_samples_per_second": 1513.292, + "eval_steps_per_second": 4.729, + "step": 6880 + }, + { + "epoch": 2.88, + "learning_rate": 1e-06, + "loss": 0.2105, + "step": 6900 + }, + { + "epoch": 2.88, + "eval_accuracy": 0.9239583333333333, + "eval_loss": 0.22604452073574066, + "eval_runtime": 15.5265, + "eval_samples_per_second": 1545.746, + "eval_steps_per_second": 4.83, + "step": 6900 + }, + { + "epoch": 2.88, + "learning_rate": 1e-06, + "loss": 0.2209, + "step": 6920 + }, + { + "epoch": 2.88, + "eval_accuracy": 0.922625, + "eval_loss": 0.22909581661224365, + "eval_runtime": 16.0771, + "eval_samples_per_second": 1492.802, + "eval_steps_per_second": 4.665, + "step": 6920 + }, + { + "epoch": 2.89, + "learning_rate": 1e-06, + "loss": 0.2323, + "step": 6940 + }, + { + "epoch": 2.89, + "eval_accuracy": 0.9240833333333334, + "eval_loss": 0.2258378565311432, + "eval_runtime": 15.5648, + "eval_samples_per_second": 1541.936, + "eval_steps_per_second": 4.819, + "step": 6940 + }, + { + "epoch": 2.9, + "learning_rate": 1e-06, + "loss": 0.2416, + "step": 6960 + }, + { + "epoch": 2.9, + "eval_accuracy": 0.9242916666666666, + "eval_loss": 0.22519205510616302, + "eval_runtime": 16.1579, + "eval_samples_per_second": 1485.345, + "eval_steps_per_second": 4.642, + "step": 6960 + }, + { + "epoch": 2.91, + "learning_rate": 1e-06, + "loss": 0.2369, + "step": 6980 + }, + { + "epoch": 2.91, + "eval_accuracy": 0.9219166666666667, + "eval_loss": 0.22782516479492188, + "eval_runtime": 15.7414, + "eval_samples_per_second": 1524.641, + "eval_steps_per_second": 4.765, + "step": 6980 + }, + { + "epoch": 2.92, + "learning_rate": 1e-06, + "loss": 0.2218, + "step": 7000 + }, + { + "epoch": 2.92, + "eval_accuracy": 0.91975, + "eval_loss": 0.23249071836471558, + "eval_runtime": 16.1495, + "eval_samples_per_second": 1486.113, + "eval_steps_per_second": 4.644, + "step": 7000 + }, + { + "epoch": 2.92, + "learning_rate": 1e-06, + "loss": 0.2479, + "step": 7020 + }, + { + "epoch": 2.92, + "eval_accuracy": 0.922375, + "eval_loss": 0.2269277721643448, + "eval_runtime": 15.707, + "eval_samples_per_second": 1527.979, + "eval_steps_per_second": 4.775, + "step": 7020 + }, + { + "epoch": 2.93, + "learning_rate": 1e-06, + "loss": 0.2174, + "step": 7040 + }, + { + "epoch": 2.93, + "eval_accuracy": 0.9234583333333334, + "eval_loss": 0.22529129683971405, + "eval_runtime": 15.7426, + "eval_samples_per_second": 1524.524, + "eval_steps_per_second": 4.764, + "step": 7040 + }, + { + "epoch": 2.94, + "learning_rate": 1e-06, + "loss": 0.2243, + "step": 7060 + }, + { + "epoch": 2.94, + "eval_accuracy": 0.9230416666666666, + "eval_loss": 0.22688570618629456, + "eval_runtime": 16.3366, + "eval_samples_per_second": 1469.094, + "eval_steps_per_second": 4.591, + "step": 7060 + }, + { + "epoch": 2.95, + "learning_rate": 1e-06, + "loss": 0.2822, + "step": 7080 + }, + { + "epoch": 2.95, + "eval_accuracy": 0.9227916666666667, + "eval_loss": 0.23044590651988983, + "eval_runtime": 15.8096, + "eval_samples_per_second": 1518.063, + "eval_steps_per_second": 4.744, + "step": 7080 + }, + { + "epoch": 2.96, + "learning_rate": 1e-06, + "loss": 0.2161, + "step": 7100 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.9220833333333334, + "eval_loss": 0.22724518179893494, + "eval_runtime": 16.2528, + "eval_samples_per_second": 1476.67, + "eval_steps_per_second": 4.615, + "step": 7100 + }, + { + "epoch": 2.97, + "learning_rate": 1e-06, + "loss": 0.238, + "step": 7120 + }, + { + "epoch": 2.97, + "eval_accuracy": 0.9245, + "eval_loss": 0.22448720037937164, + "eval_runtime": 15.5514, + "eval_samples_per_second": 1543.268, + "eval_steps_per_second": 4.823, + "step": 7120 + }, + { + "epoch": 2.98, + "learning_rate": 1e-06, + "loss": 0.238, + "step": 7140 + }, + { + "epoch": 2.98, + "eval_accuracy": 0.924125, + "eval_loss": 0.22591613233089447, + "eval_runtime": 15.8997, + "eval_samples_per_second": 1509.461, + "eval_steps_per_second": 4.717, + "step": 7140 + }, + { + "epoch": 2.98, + "learning_rate": 1e-06, + "loss": 0.1969, + "step": 7160 + }, + { + "epoch": 2.98, + "eval_accuracy": 0.924125, + "eval_loss": 0.22452926635742188, + "eval_runtime": 15.5234, + "eval_samples_per_second": 1546.054, + "eval_steps_per_second": 4.831, + "step": 7160 + }, + { + "epoch": 2.99, + "learning_rate": 1e-06, + "loss": 0.1897, + "step": 7180 + }, + { + "epoch": 2.99, + "eval_accuracy": 0.923625, + "eval_loss": 0.22510305047035217, + "eval_runtime": 15.9816, + "eval_samples_per_second": 1501.726, + "eval_steps_per_second": 4.693, + "step": 7180 + }, + { + "epoch": 3.0, + "learning_rate": 1e-06, + "loss": 0.2168, + "step": 7200 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9232916666666666, + "eval_loss": 0.22502471506595612, + "eval_runtime": 15.6986, + "eval_samples_per_second": 1528.802, + "eval_steps_per_second": 4.778, + "step": 7200 + }, + { + "epoch": 3.01, + "learning_rate": 1e-06, + "loss": 0.251, + "step": 7220 + }, + { + "epoch": 3.01, + "eval_accuracy": 0.9230833333333334, + "eval_loss": 0.22653472423553467, + "eval_runtime": 16.1799, + "eval_samples_per_second": 1483.32, + "eval_steps_per_second": 4.635, + "step": 7220 + }, + { + "epoch": 3.02, + "learning_rate": 1e-06, + "loss": 0.2315, + "step": 7240 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.923125, + "eval_loss": 0.2254115790128708, + "eval_runtime": 15.9152, + "eval_samples_per_second": 1507.997, + "eval_steps_per_second": 4.712, + "step": 7240 + }, + { + "epoch": 3.02, + "learning_rate": 1e-06, + "loss": 0.2055, + "step": 7260 + }, + { + "epoch": 3.02, + "eval_accuracy": 0.9245416666666667, + "eval_loss": 0.22353042662143707, + "eval_runtime": 15.5398, + "eval_samples_per_second": 1544.419, + "eval_steps_per_second": 4.826, + "step": 7260 + }, + { + "epoch": 3.03, + "learning_rate": 1e-06, + "loss": 0.2277, + "step": 7280 + }, + { + "epoch": 3.03, + "eval_accuracy": 0.9244583333333334, + "eval_loss": 0.2269069403409958, + "eval_runtime": 16.1706, + "eval_samples_per_second": 1484.176, + "eval_steps_per_second": 4.638, + "step": 7280 + }, + { + "epoch": 3.04, + "learning_rate": 1e-06, + "loss": 0.2379, + "step": 7300 + }, + { + "epoch": 3.04, + "eval_accuracy": 0.9248333333333333, + "eval_loss": 0.22591404616832733, + "eval_runtime": 16.0606, + "eval_samples_per_second": 1494.339, + "eval_steps_per_second": 4.67, + "step": 7300 + }, + { + "epoch": 3.05, + "learning_rate": 1e-06, + "loss": 0.1889, + "step": 7320 + }, + { + "epoch": 3.05, + "eval_accuracy": 0.9246666666666666, + "eval_loss": 0.22398823499679565, + "eval_runtime": 15.816, + "eval_samples_per_second": 1517.45, + "eval_steps_per_second": 4.742, + "step": 7320 + }, + { + "epoch": 3.06, + "learning_rate": 1e-06, + "loss": 0.2386, + "step": 7340 + }, + { + "epoch": 3.06, + "eval_accuracy": 0.9240416666666667, + "eval_loss": 0.22369486093521118, + "eval_runtime": 16.3833, + "eval_samples_per_second": 1464.908, + "eval_steps_per_second": 4.578, + "step": 7340 + }, + { + "epoch": 3.07, + "learning_rate": 1e-06, + "loss": 0.2135, + "step": 7360 + }, + { + "epoch": 3.07, + "eval_accuracy": 0.9237083333333334, + "eval_loss": 0.22461163997650146, + "eval_runtime": 15.8713, + "eval_samples_per_second": 1512.165, + "eval_steps_per_second": 4.726, + "step": 7360 + }, + { + "epoch": 3.08, + "learning_rate": 1e-06, + "loss": 0.2411, + "step": 7380 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.923875, + "eval_loss": 0.2259892225265503, + "eval_runtime": 16.3476, + "eval_samples_per_second": 1468.103, + "eval_steps_per_second": 4.588, + "step": 7380 + }, + { + "epoch": 3.08, + "learning_rate": 1e-06, + "loss": 0.2206, + "step": 7400 + }, + { + "epoch": 3.08, + "eval_accuracy": 0.9233333333333333, + "eval_loss": 0.22540873289108276, + "eval_runtime": 15.7971, + "eval_samples_per_second": 1519.265, + "eval_steps_per_second": 4.748, + "step": 7400 + }, + { + "epoch": 3.09, + "learning_rate": 1e-06, + "loss": 0.221, + "step": 7420 + }, + { + "epoch": 3.09, + "eval_accuracy": 0.9237083333333334, + "eval_loss": 0.2240631878376007, + "eval_runtime": 15.554, + "eval_samples_per_second": 1543.013, + "eval_steps_per_second": 4.822, + "step": 7420 + }, + { + "epoch": 3.1, + "learning_rate": 1e-06, + "loss": 0.2189, + "step": 7440 + }, + { + "epoch": 3.1, + "eval_accuracy": 0.9245, + "eval_loss": 0.2240666151046753, + "eval_runtime": 15.7074, + "eval_samples_per_second": 1527.938, + "eval_steps_per_second": 4.775, + "step": 7440 + }, + { + "epoch": 3.11, + "learning_rate": 1e-06, + "loss": 0.2222, + "step": 7460 + }, + { + "epoch": 3.11, + "eval_accuracy": 0.9242916666666666, + "eval_loss": 0.22323697805404663, + "eval_runtime": 15.5541, + "eval_samples_per_second": 1543.003, + "eval_steps_per_second": 4.822, + "step": 7460 + }, + { + "epoch": 3.12, + "learning_rate": 1e-06, + "loss": 0.2227, + "step": 7480 + }, + { + "epoch": 3.12, + "eval_accuracy": 0.92575, + "eval_loss": 0.221858412027359, + "eval_runtime": 16.1116, + "eval_samples_per_second": 1489.607, + "eval_steps_per_second": 4.655, + "step": 7480 + }, + { + "epoch": 3.12, + "learning_rate": 1e-06, + "loss": 0.2375, + "step": 7500 + }, + { + "epoch": 3.12, + "eval_accuracy": 0.9249166666666667, + "eval_loss": 0.2222757488489151, + "eval_runtime": 15.423, + "eval_samples_per_second": 1556.117, + "eval_steps_per_second": 4.863, + "step": 7500 + }, + { + "epoch": 3.13, + "learning_rate": 1e-06, + "loss": 0.2124, + "step": 7520 + }, + { + "epoch": 3.13, + "eval_accuracy": 0.9242083333333333, + "eval_loss": 0.2229667603969574, + "eval_runtime": 16.1122, + "eval_samples_per_second": 1489.556, + "eval_steps_per_second": 4.655, + "step": 7520 + }, + { + "epoch": 3.14, + "learning_rate": 1e-06, + "loss": 0.2383, + "step": 7540 + }, + { + "epoch": 3.14, + "eval_accuracy": 0.9243333333333333, + "eval_loss": 0.2248881459236145, + "eval_runtime": 15.7471, + "eval_samples_per_second": 1524.086, + "eval_steps_per_second": 4.763, + "step": 7540 + }, + { + "epoch": 3.15, + "learning_rate": 1e-06, + "loss": 0.2735, + "step": 7560 + }, + { + "epoch": 3.15, + "eval_accuracy": 0.9245833333333333, + "eval_loss": 0.22355443239212036, + "eval_runtime": 16.0099, + "eval_samples_per_second": 1499.071, + "eval_steps_per_second": 4.685, + "step": 7560 + }, + { + "epoch": 3.16, + "learning_rate": 1e-06, + "loss": 0.195, + "step": 7580 + }, + { + "epoch": 3.16, + "eval_accuracy": 0.9247916666666667, + "eval_loss": 0.2215178906917572, + "eval_runtime": 16.1768, + "eval_samples_per_second": 1483.608, + "eval_steps_per_second": 4.636, + "step": 7580 + }, + { + "epoch": 3.17, + "learning_rate": 1e-06, + "loss": 0.208, + "step": 7600 + }, + { + "epoch": 3.17, + "eval_accuracy": 0.9236666666666666, + "eval_loss": 0.22357864677906036, + "eval_runtime": 15.8058, + "eval_samples_per_second": 1518.434, + "eval_steps_per_second": 4.745, + "step": 7600 + }, + { + "epoch": 3.17, + "learning_rate": 1e-06, + "loss": 0.2019, + "step": 7620 + }, + { + "epoch": 3.17, + "eval_accuracy": 0.9232916666666666, + "eval_loss": 0.2229277789592743, + "eval_runtime": 16.1663, + "eval_samples_per_second": 1484.567, + "eval_steps_per_second": 4.639, + "step": 7620 + }, + { + "epoch": 3.18, + "learning_rate": 1e-06, + "loss": 0.242, + "step": 7640 + }, + { + "epoch": 3.18, + "eval_accuracy": 0.925125, + "eval_loss": 0.22199295461177826, + "eval_runtime": 15.6259, + "eval_samples_per_second": 1535.909, + "eval_steps_per_second": 4.8, + "step": 7640 + }, + { + "epoch": 3.19, + "learning_rate": 1e-06, + "loss": 0.2209, + "step": 7660 + }, + { + "epoch": 3.19, + "eval_accuracy": 0.924375, + "eval_loss": 0.22275349497795105, + "eval_runtime": 15.853, + "eval_samples_per_second": 1513.909, + "eval_steps_per_second": 4.731, + "step": 7660 + }, + { + "epoch": 3.2, + "learning_rate": 1e-06, + "loss": 0.263, + "step": 7680 + }, + { + "epoch": 3.2, + "eval_accuracy": 0.9236666666666666, + "eval_loss": 0.22205360233783722, + "eval_runtime": 15.6683, + "eval_samples_per_second": 1531.757, + "eval_steps_per_second": 4.787, + "step": 7680 + }, + { + "epoch": 3.21, + "learning_rate": 1e-06, + "loss": 0.1923, + "step": 7700 + }, + { + "epoch": 3.21, + "eval_accuracy": 0.9255, + "eval_loss": 0.22050580382347107, + "eval_runtime": 16.26, + "eval_samples_per_second": 1476.012, + "eval_steps_per_second": 4.613, + "step": 7700 + }, + { + "epoch": 3.22, + "learning_rate": 1e-06, + "loss": 0.2203, + "step": 7720 + }, + { + "epoch": 3.22, + "eval_accuracy": 0.9250833333333334, + "eval_loss": 0.22197723388671875, + "eval_runtime": 16.2074, + "eval_samples_per_second": 1480.801, + "eval_steps_per_second": 4.628, + "step": 7720 + }, + { + "epoch": 3.23, + "learning_rate": 1e-06, + "loss": 0.2166, + "step": 7740 + }, + { + "epoch": 3.23, + "eval_accuracy": 0.9254583333333334, + "eval_loss": 0.2208959013223648, + "eval_runtime": 16.0355, + "eval_samples_per_second": 1496.675, + "eval_steps_per_second": 4.677, + "step": 7740 + }, + { + "epoch": 3.23, + "learning_rate": 1e-06, + "loss": 0.2545, + "step": 7760 + }, + { + "epoch": 3.23, + "eval_accuracy": 0.9252083333333333, + "eval_loss": 0.22131887078285217, + "eval_runtime": 15.9945, + "eval_samples_per_second": 1500.516, + "eval_steps_per_second": 4.689, + "step": 7760 + }, + { + "epoch": 3.24, + "learning_rate": 1e-06, + "loss": 0.1604, + "step": 7780 + }, + { + "epoch": 3.24, + "eval_accuracy": 0.92425, + "eval_loss": 0.22284860908985138, + "eval_runtime": 16.005, + "eval_samples_per_second": 1499.53, + "eval_steps_per_second": 4.686, + "step": 7780 + }, + { + "epoch": 3.25, + "learning_rate": 1e-06, + "loss": 0.2, + "step": 7800 + }, + { + "epoch": 3.25, + "eval_accuracy": 0.923875, + "eval_loss": 0.22088997066020966, + "eval_runtime": 15.4149, + "eval_samples_per_second": 1556.938, + "eval_steps_per_second": 4.865, + "step": 7800 + }, + { + "epoch": 3.26, + "learning_rate": 1e-06, + "loss": 0.2373, + "step": 7820 + }, + { + "epoch": 3.26, + "eval_accuracy": 0.9215833333333333, + "eval_loss": 0.2259444147348404, + "eval_runtime": 16.167, + "eval_samples_per_second": 1484.503, + "eval_steps_per_second": 4.639, + "step": 7820 + }, + { + "epoch": 3.27, + "learning_rate": 1e-06, + "loss": 0.217, + "step": 7840 + }, + { + "epoch": 3.27, + "eval_accuracy": 0.9252083333333333, + "eval_loss": 0.22143015265464783, + "eval_runtime": 15.727, + "eval_samples_per_second": 1526.036, + "eval_steps_per_second": 4.769, + "step": 7840 + }, + { + "epoch": 3.27, + "learning_rate": 1e-06, + "loss": 0.2172, + "step": 7860 + }, + { + "epoch": 3.27, + "eval_accuracy": 0.9234583333333334, + "eval_loss": 0.22263799607753754, + "eval_runtime": 16.2672, + "eval_samples_per_second": 1475.362, + "eval_steps_per_second": 4.611, + "step": 7860 + }, + { + "epoch": 3.28, + "learning_rate": 1e-06, + "loss": 0.1959, + "step": 7880 + }, + { + "epoch": 3.28, + "eval_accuracy": 0.9235833333333333, + "eval_loss": 0.22203776240348816, + "eval_runtime": 15.4597, + "eval_samples_per_second": 1552.421, + "eval_steps_per_second": 4.851, + "step": 7880 + }, + { + "epoch": 3.29, + "learning_rate": 1e-06, + "loss": 0.1781, + "step": 7900 + }, + { + "epoch": 3.29, + "eval_accuracy": 0.9245, + "eval_loss": 0.22064486145973206, + "eval_runtime": 15.8085, + "eval_samples_per_second": 1518.167, + "eval_steps_per_second": 4.744, + "step": 7900 + }, + { + "epoch": 3.3, + "learning_rate": 1e-06, + "loss": 0.2024, + "step": 7920 + }, + { + "epoch": 3.3, + "eval_accuracy": 0.9228333333333333, + "eval_loss": 0.22356781363487244, + "eval_runtime": 16.1166, + "eval_samples_per_second": 1489.15, + "eval_steps_per_second": 4.654, + "step": 7920 + }, + { + "epoch": 3.31, + "learning_rate": 1e-06, + "loss": 0.196, + "step": 7940 + }, + { + "epoch": 3.31, + "eval_accuracy": 0.923375, + "eval_loss": 0.22183743119239807, + "eval_runtime": 15.9128, + "eval_samples_per_second": 1508.22, + "eval_steps_per_second": 4.713, + "step": 7940 + }, + { + "epoch": 3.32, + "learning_rate": 1e-06, + "loss": 0.2462, + "step": 7960 + }, + { + "epoch": 3.32, + "eval_accuracy": 0.925875, + "eval_loss": 0.21925699710845947, + "eval_runtime": 15.9525, + "eval_samples_per_second": 1504.469, + "eval_steps_per_second": 4.701, + "step": 7960 + }, + { + "epoch": 3.33, + "learning_rate": 1e-06, + "loss": 0.205, + "step": 7980 + }, + { + "epoch": 3.33, + "eval_accuracy": 0.9255833333333333, + "eval_loss": 0.21881194412708282, + "eval_runtime": 15.9116, + "eval_samples_per_second": 1508.338, + "eval_steps_per_second": 4.714, + "step": 7980 + }, + { + "epoch": 3.33, + "learning_rate": 1e-06, + "loss": 0.1793, + "step": 8000 + }, + { + "epoch": 3.33, + "eval_accuracy": 0.9254583333333334, + "eval_loss": 0.21929273009300232, + "eval_runtime": 16.6645, + "eval_samples_per_second": 1440.183, + "eval_steps_per_second": 4.501, + "step": 8000 + }, + { + "epoch": 3.34, + "learning_rate": 1e-06, + "loss": 0.2551, + "step": 8020 + }, + { + "epoch": 3.34, + "eval_accuracy": 0.9253333333333333, + "eval_loss": 0.21935345232486725, + "eval_runtime": 15.8842, + "eval_samples_per_second": 1510.936, + "eval_steps_per_second": 4.722, + "step": 8020 + }, + { + "epoch": 3.35, + "learning_rate": 1e-06, + "loss": 0.2471, + "step": 8040 + }, + { + "epoch": 3.35, + "eval_accuracy": 0.9250416666666667, + "eval_loss": 0.220913827419281, + "eval_runtime": 15.7346, + "eval_samples_per_second": 1525.304, + "eval_steps_per_second": 4.767, + "step": 8040 + }, + { + "epoch": 3.36, + "learning_rate": 1e-06, + "loss": 0.1765, + "step": 8060 + }, + { + "epoch": 3.36, + "eval_accuracy": 0.9254166666666667, + "eval_loss": 0.22260655462741852, + "eval_runtime": 15.7321, + "eval_samples_per_second": 1525.546, + "eval_steps_per_second": 4.767, + "step": 8060 + }, + { + "epoch": 3.37, + "learning_rate": 1e-06, + "loss": 0.161, + "step": 8080 + }, + { + "epoch": 3.37, + "eval_accuracy": 0.9257083333333334, + "eval_loss": 0.2232980579137802, + "eval_runtime": 15.5673, + "eval_samples_per_second": 1541.698, + "eval_steps_per_second": 4.818, + "step": 8080 + }, + { + "epoch": 3.38, + "learning_rate": 1e-06, + "loss": 0.2243, + "step": 8100 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.9247083333333334, + "eval_loss": 0.22126658260822296, + "eval_runtime": 16.3031, + "eval_samples_per_second": 1472.111, + "eval_steps_per_second": 4.6, + "step": 8100 + }, + { + "epoch": 3.38, + "learning_rate": 1e-06, + "loss": 0.2044, + "step": 8120 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.92525, + "eval_loss": 0.21990346908569336, + "eval_runtime": 15.9234, + "eval_samples_per_second": 1507.215, + "eval_steps_per_second": 4.71, + "step": 8120 + }, + { + "epoch": 3.39, + "learning_rate": 1e-06, + "loss": 0.2213, + "step": 8140 + }, + { + "epoch": 3.39, + "eval_accuracy": 0.9261666666666667, + "eval_loss": 0.21853962540626526, + "eval_runtime": 15.7101, + "eval_samples_per_second": 1527.683, + "eval_steps_per_second": 4.774, + "step": 8140 + }, + { + "epoch": 3.4, + "learning_rate": 1e-06, + "loss": 0.23, + "step": 8160 + }, + { + "epoch": 3.4, + "eval_accuracy": 0.9267916666666667, + "eval_loss": 0.21748250722885132, + "eval_runtime": 15.4099, + "eval_samples_per_second": 1557.437, + "eval_steps_per_second": 4.867, + "step": 8160 + }, + { + "epoch": 3.41, + "learning_rate": 1e-06, + "loss": 0.2377, + "step": 8180 + }, + { + "epoch": 3.41, + "eval_accuracy": 0.9244166666666667, + "eval_loss": 0.21937525272369385, + "eval_runtime": 15.8123, + "eval_samples_per_second": 1517.802, + "eval_steps_per_second": 4.743, + "step": 8180 + }, + { + "epoch": 3.42, + "learning_rate": 1e-06, + "loss": 0.2349, + "step": 8200 + }, + { + "epoch": 3.42, + "eval_accuracy": 0.9235833333333333, + "eval_loss": 0.21988487243652344, + "eval_runtime": 15.604, + "eval_samples_per_second": 1538.067, + "eval_steps_per_second": 4.806, + "step": 8200 + }, + { + "epoch": 3.42, + "learning_rate": 1e-06, + "loss": 0.2217, + "step": 8220 + }, + { + "epoch": 3.42, + "eval_accuracy": 0.9260416666666667, + "eval_loss": 0.21746017038822174, + "eval_runtime": 15.8834, + "eval_samples_per_second": 1511.008, + "eval_steps_per_second": 4.722, + "step": 8220 + }, + { + "epoch": 3.43, + "learning_rate": 1e-06, + "loss": 0.2005, + "step": 8240 + }, + { + "epoch": 3.43, + "eval_accuracy": 0.9264166666666667, + "eval_loss": 0.21770620346069336, + "eval_runtime": 16.0248, + "eval_samples_per_second": 1497.677, + "eval_steps_per_second": 4.68, + "step": 8240 + }, + { + "epoch": 3.44, + "learning_rate": 1e-06, + "loss": 0.2193, + "step": 8260 + }, + { + "epoch": 3.44, + "eval_accuracy": 0.9264583333333334, + "eval_loss": 0.21887263655662537, + "eval_runtime": 15.827, + "eval_samples_per_second": 1516.398, + "eval_steps_per_second": 4.739, + "step": 8260 + }, + { + "epoch": 3.45, + "learning_rate": 1e-06, + "loss": 0.1551, + "step": 8280 + }, + { + "epoch": 3.45, + "eval_accuracy": 0.92625, + "eval_loss": 0.22045257687568665, + "eval_runtime": 15.6985, + "eval_samples_per_second": 1528.808, + "eval_steps_per_second": 4.778, + "step": 8280 + }, + { + "epoch": 3.46, + "learning_rate": 1e-06, + "loss": 0.2399, + "step": 8300 + }, + { + "epoch": 3.46, + "eval_accuracy": 0.9254583333333334, + "eval_loss": 0.21784846484661102, + "eval_runtime": 15.9829, + "eval_samples_per_second": 1501.6, + "eval_steps_per_second": 4.693, + "step": 8300 + }, + { + "epoch": 3.47, + "learning_rate": 1e-06, + "loss": 0.2308, + "step": 8320 + }, + { + "epoch": 3.47, + "eval_accuracy": 0.9246666666666666, + "eval_loss": 0.21774353086948395, + "eval_runtime": 15.8498, + "eval_samples_per_second": 1514.211, + "eval_steps_per_second": 4.732, + "step": 8320 + }, + { + "epoch": 3.48, + "learning_rate": 1e-06, + "loss": 0.1661, + "step": 8340 + }, + { + "epoch": 3.48, + "eval_accuracy": 0.924625, + "eval_loss": 0.21844609081745148, + "eval_runtime": 16.0502, + "eval_samples_per_second": 1495.308, + "eval_steps_per_second": 4.673, + "step": 8340 + }, + { + "epoch": 3.48, + "learning_rate": 1e-06, + "loss": 0.2014, + "step": 8360 + }, + { + "epoch": 3.48, + "eval_accuracy": 0.9242916666666666, + "eval_loss": 0.21898412704467773, + "eval_runtime": 15.9262, + "eval_samples_per_second": 1506.951, + "eval_steps_per_second": 4.709, + "step": 8360 + }, + { + "epoch": 3.49, + "learning_rate": 1e-06, + "loss": 0.2667, + "step": 8380 + }, + { + "epoch": 3.49, + "eval_accuracy": 0.9242083333333333, + "eval_loss": 0.21949926018714905, + "eval_runtime": 15.9646, + "eval_samples_per_second": 1503.329, + "eval_steps_per_second": 4.698, + "step": 8380 + }, + { + "epoch": 3.5, + "learning_rate": 1e-06, + "loss": 0.205, + "step": 8400 + }, + { + "epoch": 3.5, + "eval_accuracy": 0.9252916666666666, + "eval_loss": 0.21843333542346954, + "eval_runtime": 16.4182, + "eval_samples_per_second": 1461.79, + "eval_steps_per_second": 4.568, + "step": 8400 + }, + { + "epoch": 3.51, + "learning_rate": 1e-06, + "loss": 0.1982, + "step": 8420 + }, + { + "epoch": 3.51, + "eval_accuracy": 0.9259166666666667, + "eval_loss": 0.21639133989810944, + "eval_runtime": 15.6132, + "eval_samples_per_second": 1537.157, + "eval_steps_per_second": 4.804, + "step": 8420 + }, + { + "epoch": 3.52, + "learning_rate": 1e-06, + "loss": 0.2511, + "step": 8440 + }, + { + "epoch": 3.52, + "eval_accuracy": 0.926125, + "eval_loss": 0.21578435599803925, + "eval_runtime": 15.8122, + "eval_samples_per_second": 1517.813, + "eval_steps_per_second": 4.743, + "step": 8440 + }, + { + "epoch": 3.52, + "learning_rate": 1e-06, + "loss": 0.2627, + "step": 8460 + }, + { + "epoch": 3.52, + "eval_accuracy": 0.9263333333333333, + "eval_loss": 0.2152308076620102, + "eval_runtime": 16.1172, + "eval_samples_per_second": 1489.089, + "eval_steps_per_second": 4.653, + "step": 8460 + }, + { + "epoch": 3.53, + "learning_rate": 1e-06, + "loss": 0.1905, + "step": 8480 + }, + { + "epoch": 3.53, + "eval_accuracy": 0.9267083333333334, + "eval_loss": 0.215366929769516, + "eval_runtime": 16.1037, + "eval_samples_per_second": 1490.344, + "eval_steps_per_second": 4.657, + "step": 8480 + }, + { + "epoch": 3.54, + "learning_rate": 1e-06, + "loss": 0.2349, + "step": 8500 + }, + { + "epoch": 3.54, + "eval_accuracy": 0.9254166666666667, + "eval_loss": 0.216691255569458, + "eval_runtime": 15.7789, + "eval_samples_per_second": 1521.022, + "eval_steps_per_second": 4.753, + "step": 8500 + }, + { + "epoch": 3.55, + "learning_rate": 1e-06, + "loss": 0.1732, + "step": 8520 + }, + { + "epoch": 3.55, + "eval_accuracy": 0.9255833333333333, + "eval_loss": 0.2171410173177719, + "eval_runtime": 15.9111, + "eval_samples_per_second": 1508.379, + "eval_steps_per_second": 4.714, + "step": 8520 + }, + { + "epoch": 3.56, + "learning_rate": 1e-06, + "loss": 0.2152, + "step": 8540 + }, + { + "epoch": 3.56, + "eval_accuracy": 0.9257916666666667, + "eval_loss": 0.21639755368232727, + "eval_runtime": 16.6264, + "eval_samples_per_second": 1443.488, + "eval_steps_per_second": 4.511, + "step": 8540 + }, + { + "epoch": 3.57, + "learning_rate": 1e-06, + "loss": 0.2487, + "step": 8560 + }, + { + "epoch": 3.57, + "eval_accuracy": 0.92575, + "eval_loss": 0.21681177616119385, + "eval_runtime": 15.9061, + "eval_samples_per_second": 1508.852, + "eval_steps_per_second": 4.715, + "step": 8560 + }, + { + "epoch": 3.58, + "learning_rate": 1e-06, + "loss": 0.2411, + "step": 8580 + }, + { + "epoch": 3.58, + "eval_accuracy": 0.9255833333333333, + "eval_loss": 0.2175518274307251, + "eval_runtime": 16.2026, + "eval_samples_per_second": 1481.242, + "eval_steps_per_second": 4.629, + "step": 8580 + }, + { + "epoch": 3.58, + "learning_rate": 1e-06, + "loss": 0.2113, + "step": 8600 + }, + { + "epoch": 3.58, + "eval_accuracy": 0.9257916666666667, + "eval_loss": 0.21555760502815247, + "eval_runtime": 15.883, + "eval_samples_per_second": 1511.052, + "eval_steps_per_second": 4.722, + "step": 8600 + }, + { + "epoch": 3.59, + "learning_rate": 1e-06, + "loss": 0.2065, + "step": 8620 + }, + { + "epoch": 3.59, + "eval_accuracy": 0.9265833333333333, + "eval_loss": 0.21459566056728363, + "eval_runtime": 15.8987, + "eval_samples_per_second": 1509.554, + "eval_steps_per_second": 4.717, + "step": 8620 + }, + { + "epoch": 3.6, + "learning_rate": 1e-06, + "loss": 0.2317, + "step": 8640 + }, + { + "epoch": 3.6, + "eval_accuracy": 0.926, + "eval_loss": 0.21562263369560242, + "eval_runtime": 15.7909, + "eval_samples_per_second": 1519.863, + "eval_steps_per_second": 4.75, + "step": 8640 + }, + { + "epoch": 3.61, + "learning_rate": 1e-06, + "loss": 0.2217, + "step": 8660 + }, + { + "epoch": 3.61, + "eval_accuracy": 0.9255416666666667, + "eval_loss": 0.21699927747249603, + "eval_runtime": 15.7983, + "eval_samples_per_second": 1519.146, + "eval_steps_per_second": 4.747, + "step": 8660 + }, + { + "epoch": 3.62, + "learning_rate": 1e-06, + "loss": 0.2296, + "step": 8680 + }, + { + "epoch": 3.62, + "eval_accuracy": 0.9252916666666666, + "eval_loss": 0.21904636919498444, + "eval_runtime": 15.8426, + "eval_samples_per_second": 1514.899, + "eval_steps_per_second": 4.734, + "step": 8680 + }, + { + "epoch": 3.62, + "learning_rate": 1e-06, + "loss": 0.2143, + "step": 8700 + }, + { + "epoch": 3.62, + "eval_accuracy": 0.9249166666666667, + "eval_loss": 0.21984702348709106, + "eval_runtime": 15.7532, + "eval_samples_per_second": 1523.499, + "eval_steps_per_second": 4.761, + "step": 8700 + }, + { + "epoch": 3.63, + "learning_rate": 1e-06, + "loss": 0.215, + "step": 8720 + }, + { + "epoch": 3.63, + "eval_accuracy": 0.9260416666666667, + "eval_loss": 0.21465341746807098, + "eval_runtime": 15.8302, + "eval_samples_per_second": 1516.094, + "eval_steps_per_second": 4.738, + "step": 8720 + }, + { + "epoch": 3.64, + "learning_rate": 1e-06, + "loss": 0.2109, + "step": 8740 + }, + { + "epoch": 3.64, + "eval_accuracy": 0.9271666666666667, + "eval_loss": 0.21410001814365387, + "eval_runtime": 15.7941, + "eval_samples_per_second": 1519.554, + "eval_steps_per_second": 4.749, + "step": 8740 + }, + { + "epoch": 3.65, + "learning_rate": 1e-06, + "loss": 0.1908, + "step": 8760 + }, + { + "epoch": 3.65, + "eval_accuracy": 0.9242083333333333, + "eval_loss": 0.21745717525482178, + "eval_runtime": 16.1017, + "eval_samples_per_second": 1490.524, + "eval_steps_per_second": 4.658, + "step": 8760 + }, + { + "epoch": 3.66, + "learning_rate": 1e-06, + "loss": 0.1899, + "step": 8780 + }, + { + "epoch": 3.66, + "eval_accuracy": 0.9242916666666666, + "eval_loss": 0.21800926327705383, + "eval_runtime": 15.5229, + "eval_samples_per_second": 1546.1, + "eval_steps_per_second": 4.832, + "step": 8780 + }, + { + "epoch": 3.67, + "learning_rate": 1e-06, + "loss": 0.1681, + "step": 8800 + }, + { + "epoch": 3.67, + "eval_accuracy": 0.925625, + "eval_loss": 0.2183951884508133, + "eval_runtime": 15.9633, + "eval_samples_per_second": 1503.444, + "eval_steps_per_second": 4.698, + "step": 8800 + }, + { + "epoch": 3.67, + "learning_rate": 1e-06, + "loss": 0.2134, + "step": 8820 + }, + { + "epoch": 3.67, + "eval_accuracy": 0.9258333333333333, + "eval_loss": 0.21758043766021729, + "eval_runtime": 15.6658, + "eval_samples_per_second": 1532.001, + "eval_steps_per_second": 4.788, + "step": 8820 + }, + { + "epoch": 3.68, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 8840 + }, + { + "epoch": 3.68, + "eval_accuracy": 0.9257916666666667, + "eval_loss": 0.2156967669725418, + "eval_runtime": 16.1604, + "eval_samples_per_second": 1485.116, + "eval_steps_per_second": 4.641, + "step": 8840 + }, + { + "epoch": 3.69, + "learning_rate": 1e-06, + "loss": 0.1963, + "step": 8860 + }, + { + "epoch": 3.69, + "eval_accuracy": 0.9265833333333333, + "eval_loss": 0.21555079519748688, + "eval_runtime": 15.9521, + "eval_samples_per_second": 1504.506, + "eval_steps_per_second": 4.702, + "step": 8860 + }, + { + "epoch": 3.7, + "learning_rate": 1e-06, + "loss": 0.1962, + "step": 8880 + }, + { + "epoch": 3.7, + "eval_accuracy": 0.9252083333333333, + "eval_loss": 0.2161911576986313, + "eval_runtime": 16.0835, + "eval_samples_per_second": 1492.21, + "eval_steps_per_second": 4.663, + "step": 8880 + }, + { + "epoch": 3.71, + "learning_rate": 1e-06, + "loss": 0.1587, + "step": 8900 + }, + { + "epoch": 3.71, + "eval_accuracy": 0.9257916666666667, + "eval_loss": 0.21644917130470276, + "eval_runtime": 16.0872, + "eval_samples_per_second": 1491.867, + "eval_steps_per_second": 4.662, + "step": 8900 + }, + { + "epoch": 3.72, + "learning_rate": 1e-06, + "loss": 0.1926, + "step": 8920 + }, + { + "epoch": 3.72, + "eval_accuracy": 0.924875, + "eval_loss": 0.21802347898483276, + "eval_runtime": 15.7349, + "eval_samples_per_second": 1525.267, + "eval_steps_per_second": 4.766, + "step": 8920 + }, + { + "epoch": 3.73, + "learning_rate": 1e-06, + "loss": 0.2268, + "step": 8940 + }, + { + "epoch": 3.73, + "eval_accuracy": 0.9250416666666667, + "eval_loss": 0.21704040467739105, + "eval_runtime": 16.0405, + "eval_samples_per_second": 1496.209, + "eval_steps_per_second": 4.676, + "step": 8940 + }, + { + "epoch": 3.73, + "learning_rate": 1e-06, + "loss": 0.191, + "step": 8960 + }, + { + "epoch": 3.73, + "eval_accuracy": 0.9265416666666667, + "eval_loss": 0.21474313735961914, + "eval_runtime": 16.1261, + "eval_samples_per_second": 1488.27, + "eval_steps_per_second": 4.651, + "step": 8960 + }, + { + "epoch": 3.74, + "learning_rate": 1e-06, + "loss": 0.242, + "step": 8980 + }, + { + "epoch": 3.74, + "eval_accuracy": 0.92675, + "eval_loss": 0.21450480818748474, + "eval_runtime": 15.8452, + "eval_samples_per_second": 1514.651, + "eval_steps_per_second": 4.733, + "step": 8980 + }, + { + "epoch": 3.75, + "learning_rate": 1e-06, + "loss": 0.2096, + "step": 9000 + }, + { + "epoch": 3.75, + "eval_accuracy": 0.925625, + "eval_loss": 0.2159881889820099, + "eval_runtime": 15.8929, + "eval_samples_per_second": 1510.111, + "eval_steps_per_second": 4.719, + "step": 9000 + }, + { + "epoch": 3.76, + "learning_rate": 1e-06, + "loss": 0.1713, + "step": 9020 + }, + { + "epoch": 3.76, + "eval_accuracy": 0.9254583333333334, + "eval_loss": 0.2180025428533554, + "eval_runtime": 16.0033, + "eval_samples_per_second": 1499.686, + "eval_steps_per_second": 4.687, + "step": 9020 + }, + { + "epoch": 3.77, + "learning_rate": 1e-06, + "loss": 0.2437, + "step": 9040 + }, + { + "epoch": 3.77, + "eval_accuracy": 0.9272083333333333, + "eval_loss": 0.21439102292060852, + "eval_runtime": 15.8661, + "eval_samples_per_second": 1512.663, + "eval_steps_per_second": 4.727, + "step": 9040 + }, + { + "epoch": 3.77, + "learning_rate": 1e-06, + "loss": 0.2058, + "step": 9060 + }, + { + "epoch": 3.77, + "eval_accuracy": 0.9265416666666667, + "eval_loss": 0.21493494510650635, + "eval_runtime": 16.417, + "eval_samples_per_second": 1461.901, + "eval_steps_per_second": 4.568, + "step": 9060 + }, + { + "epoch": 3.78, + "learning_rate": 1e-06, + "loss": 0.2107, + "step": 9080 + }, + { + "epoch": 3.78, + "eval_accuracy": 0.9262083333333333, + "eval_loss": 0.21356363594532013, + "eval_runtime": 15.9759, + "eval_samples_per_second": 1502.265, + "eval_steps_per_second": 4.695, + "step": 9080 + }, + { + "epoch": 3.79, + "learning_rate": 1e-06, + "loss": 0.2274, + "step": 9100 + }, + { + "epoch": 3.79, + "eval_accuracy": 0.9263333333333333, + "eval_loss": 0.21457020938396454, + "eval_runtime": 16.1086, + "eval_samples_per_second": 1489.887, + "eval_steps_per_second": 4.656, + "step": 9100 + }, + { + "epoch": 3.8, + "learning_rate": 1e-06, + "loss": 0.1802, + "step": 9120 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.9258333333333333, + "eval_loss": 0.21796223521232605, + "eval_runtime": 16.0162, + "eval_samples_per_second": 1498.485, + "eval_steps_per_second": 4.683, + "step": 9120 + }, + { + "epoch": 3.81, + "learning_rate": 1e-06, + "loss": 0.1901, + "step": 9140 + }, + { + "epoch": 3.81, + "eval_accuracy": 0.9270416666666667, + "eval_loss": 0.21377238631248474, + "eval_runtime": 16.0799, + "eval_samples_per_second": 1492.542, + "eval_steps_per_second": 4.664, + "step": 9140 + }, + { + "epoch": 3.82, + "learning_rate": 1e-06, + "loss": 0.1868, + "step": 9160 + }, + { + "epoch": 3.82, + "eval_accuracy": 0.9264583333333334, + "eval_loss": 0.214362233877182, + "eval_runtime": 15.7967, + "eval_samples_per_second": 1519.305, + "eval_steps_per_second": 4.748, + "step": 9160 + }, + { + "epoch": 3.83, + "learning_rate": 1e-06, + "loss": 0.159, + "step": 9180 + }, + { + "epoch": 3.83, + "eval_accuracy": 0.927875, + "eval_loss": 0.21380971372127533, + "eval_runtime": 15.6723, + "eval_samples_per_second": 1531.362, + "eval_steps_per_second": 4.786, + "step": 9180 + }, + { + "epoch": 3.83, + "learning_rate": 1e-06, + "loss": 0.2448, + "step": 9200 + }, + { + "epoch": 3.83, + "eval_accuracy": 0.9277083333333334, + "eval_loss": 0.2141939252614975, + "eval_runtime": 16.2577, + "eval_samples_per_second": 1476.225, + "eval_steps_per_second": 4.613, + "step": 9200 + }, + { + "epoch": 3.84, + "learning_rate": 1e-06, + "loss": 0.2412, + "step": 9220 + }, + { + "epoch": 3.84, + "eval_accuracy": 0.9275416666666667, + "eval_loss": 0.2140486091375351, + "eval_runtime": 15.679, + "eval_samples_per_second": 1530.709, + "eval_steps_per_second": 4.783, + "step": 9220 + }, + { + "epoch": 3.85, + "learning_rate": 1e-06, + "loss": 0.2457, + "step": 9240 + }, + { + "epoch": 3.85, + "eval_accuracy": 0.925375, + "eval_loss": 0.21611127257347107, + "eval_runtime": 16.288, + "eval_samples_per_second": 1473.473, + "eval_steps_per_second": 4.605, + "step": 9240 + }, + { + "epoch": 3.86, + "learning_rate": 1e-06, + "loss": 0.1918, + "step": 9260 + }, + { + "epoch": 3.86, + "eval_accuracy": 0.92625, + "eval_loss": 0.21398600935935974, + "eval_runtime": 15.9302, + "eval_samples_per_second": 1506.573, + "eval_steps_per_second": 4.708, + "step": 9260 + }, + { + "epoch": 3.87, + "learning_rate": 1e-06, + "loss": 0.2424, + "step": 9280 + }, + { + "epoch": 3.87, + "eval_accuracy": 0.92875, + "eval_loss": 0.21145953238010406, + "eval_runtime": 16.2556, + "eval_samples_per_second": 1476.419, + "eval_steps_per_second": 4.614, + "step": 9280 + }, + { + "epoch": 3.88, + "learning_rate": 1e-06, + "loss": 0.2059, + "step": 9300 + }, + { + "epoch": 3.88, + "eval_accuracy": 0.9278333333333333, + "eval_loss": 0.21115827560424805, + "eval_runtime": 16.0018, + "eval_samples_per_second": 1499.836, + "eval_steps_per_second": 4.687, + "step": 9300 + }, + { + "epoch": 3.88, + "learning_rate": 1e-06, + "loss": 0.2415, + "step": 9320 + }, + { + "epoch": 3.88, + "eval_accuracy": 0.9265, + "eval_loss": 0.212614968419075, + "eval_runtime": 15.9163, + "eval_samples_per_second": 1507.888, + "eval_steps_per_second": 4.712, + "step": 9320 + }, + { + "epoch": 3.89, + "learning_rate": 1e-06, + "loss": 0.2312, + "step": 9340 + }, + { + "epoch": 3.89, + "eval_accuracy": 0.9279166666666666, + "eval_loss": 0.21092422306537628, + "eval_runtime": 15.7459, + "eval_samples_per_second": 1524.21, + "eval_steps_per_second": 4.763, + "step": 9340 + }, + { + "epoch": 3.9, + "learning_rate": 1e-06, + "loss": 0.2002, + "step": 9360 + }, + { + "epoch": 3.9, + "eval_accuracy": 0.927, + "eval_loss": 0.21157173812389374, + "eval_runtime": 15.674, + "eval_samples_per_second": 1531.197, + "eval_steps_per_second": 4.785, + "step": 9360 + }, + { + "epoch": 3.91, + "learning_rate": 1e-06, + "loss": 0.2061, + "step": 9380 + }, + { + "epoch": 3.91, + "eval_accuracy": 0.9255416666666667, + "eval_loss": 0.21427378058433533, + "eval_runtime": 16.16, + "eval_samples_per_second": 1485.144, + "eval_steps_per_second": 4.641, + "step": 9380 + }, + { + "epoch": 3.92, + "learning_rate": 1e-06, + "loss": 0.1892, + "step": 9400 + }, + { + "epoch": 3.92, + "eval_accuracy": 0.9269583333333333, + "eval_loss": 0.21478785574436188, + "eval_runtime": 15.666, + "eval_samples_per_second": 1531.978, + "eval_steps_per_second": 4.787, + "step": 9400 + }, + { + "epoch": 3.92, + "learning_rate": 1e-06, + "loss": 0.242, + "step": 9420 + }, + { + "epoch": 3.92, + "eval_accuracy": 0.9268333333333333, + "eval_loss": 0.21459507942199707, + "eval_runtime": 16.0867, + "eval_samples_per_second": 1491.912, + "eval_steps_per_second": 4.662, + "step": 9420 + }, + { + "epoch": 3.93, + "learning_rate": 1e-06, + "loss": 0.2151, + "step": 9440 + }, + { + "epoch": 3.93, + "eval_accuracy": 0.9257916666666667, + "eval_loss": 0.21316887438297272, + "eval_runtime": 16.0112, + "eval_samples_per_second": 1498.948, + "eval_steps_per_second": 4.684, + "step": 9440 + }, + { + "epoch": 3.94, + "learning_rate": 1e-06, + "loss": 0.2057, + "step": 9460 + }, + { + "epoch": 3.94, + "eval_accuracy": 0.927125, + "eval_loss": 0.21309146285057068, + "eval_runtime": 15.8401, + "eval_samples_per_second": 1515.139, + "eval_steps_per_second": 4.735, + "step": 9460 + }, + { + "epoch": 3.95, + "learning_rate": 1e-06, + "loss": 0.2165, + "step": 9480 + }, + { + "epoch": 3.95, + "eval_accuracy": 0.9276666666666666, + "eval_loss": 0.21158146858215332, + "eval_runtime": 16.0289, + "eval_samples_per_second": 1497.291, + "eval_steps_per_second": 4.679, + "step": 9480 + }, + { + "epoch": 3.96, + "learning_rate": 1e-06, + "loss": 0.1845, + "step": 9500 + }, + { + "epoch": 3.96, + "eval_accuracy": 0.9277083333333334, + "eval_loss": 0.21126095950603485, + "eval_runtime": 15.8162, + "eval_samples_per_second": 1517.433, + "eval_steps_per_second": 4.742, + "step": 9500 + }, + { + "epoch": 3.97, + "learning_rate": 1e-06, + "loss": 0.1787, + "step": 9520 + }, + { + "epoch": 3.97, + "eval_accuracy": 0.927, + "eval_loss": 0.21392786502838135, + "eval_runtime": 16.2866, + "eval_samples_per_second": 1473.607, + "eval_steps_per_second": 4.605, + "step": 9520 + }, + { + "epoch": 3.98, + "learning_rate": 1e-06, + "loss": 0.1947, + "step": 9540 + }, + { + "epoch": 3.98, + "eval_accuracy": 0.927875, + "eval_loss": 0.2135414332151413, + "eval_runtime": 15.7812, + "eval_samples_per_second": 1520.796, + "eval_steps_per_second": 4.752, + "step": 9540 + }, + { + "epoch": 3.98, + "learning_rate": 1e-06, + "loss": 0.1802, + "step": 9560 + }, + { + "epoch": 3.98, + "eval_accuracy": 0.9288333333333333, + "eval_loss": 0.2114471048116684, + "eval_runtime": 15.9961, + "eval_samples_per_second": 1500.37, + "eval_steps_per_second": 4.689, + "step": 9560 + }, + { + "epoch": 3.99, + "learning_rate": 1e-06, + "loss": 0.1865, + "step": 9580 + }, + { + "epoch": 3.99, + "eval_accuracy": 0.9280833333333334, + "eval_loss": 0.21309266984462738, + "eval_runtime": 16.0813, + "eval_samples_per_second": 1492.415, + "eval_steps_per_second": 4.664, + "step": 9580 + }, + { + "epoch": 4.0, + "learning_rate": 1e-06, + "loss": 0.2346, + "step": 9600 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.9282083333333333, + "eval_loss": 0.21224650740623474, + "eval_runtime": 15.806, + "eval_samples_per_second": 1518.415, + "eval_steps_per_second": 4.745, + "step": 9600 + }, + { + "epoch": 4.01, + "learning_rate": 1e-06, + "loss": 0.1973, + "step": 9620 + }, + { + "epoch": 4.01, + "eval_accuracy": 0.9283333333333333, + "eval_loss": 0.2113197296857834, + "eval_runtime": 16.0449, + "eval_samples_per_second": 1495.802, + "eval_steps_per_second": 4.674, + "step": 9620 + }, + { + "epoch": 4.02, + "learning_rate": 1e-06, + "loss": 0.1873, + "step": 9640 + }, + { + "epoch": 4.02, + "eval_accuracy": 0.928, + "eval_loss": 0.21195828914642334, + "eval_runtime": 16.0572, + "eval_samples_per_second": 1494.657, + "eval_steps_per_second": 4.671, + "step": 9640 + }, + { + "epoch": 4.03, + "learning_rate": 1e-06, + "loss": 0.1883, + "step": 9660 + }, + { + "epoch": 4.03, + "eval_accuracy": 0.9275416666666667, + "eval_loss": 0.21404027938842773, + "eval_runtime": 16.1167, + "eval_samples_per_second": 1489.14, + "eval_steps_per_second": 4.654, + "step": 9660 + }, + { + "epoch": 4.03, + "learning_rate": 1e-06, + "loss": 0.2386, + "step": 9680 + }, + { + "epoch": 4.03, + "eval_accuracy": 0.9255, + "eval_loss": 0.21560032665729523, + "eval_runtime": 15.8482, + "eval_samples_per_second": 1514.369, + "eval_steps_per_second": 4.732, + "step": 9680 + }, + { + "epoch": 4.04, + "learning_rate": 1e-06, + "loss": 0.1594, + "step": 9700 + }, + { + "epoch": 4.04, + "eval_accuracy": 0.9266666666666666, + "eval_loss": 0.2141159623861313, + "eval_runtime": 15.8534, + "eval_samples_per_second": 1513.868, + "eval_steps_per_second": 4.731, + "step": 9700 + }, + { + "epoch": 4.05, + "learning_rate": 1e-06, + "loss": 0.1805, + "step": 9720 + }, + { + "epoch": 4.05, + "eval_accuracy": 0.927875, + "eval_loss": 0.21176785230636597, + "eval_runtime": 15.5441, + "eval_samples_per_second": 1543.994, + "eval_steps_per_second": 4.825, + "step": 9720 + }, + { + "epoch": 4.06, + "learning_rate": 1e-06, + "loss": 0.2121, + "step": 9740 + }, + { + "epoch": 4.06, + "eval_accuracy": 0.927, + "eval_loss": 0.21185992658138275, + "eval_runtime": 16.1338, + "eval_samples_per_second": 1487.557, + "eval_steps_per_second": 4.649, + "step": 9740 + }, + { + "epoch": 4.07, + "learning_rate": 1e-06, + "loss": 0.1764, + "step": 9760 + }, + { + "epoch": 4.07, + "eval_accuracy": 0.926, + "eval_loss": 0.21480970084667206, + "eval_runtime": 15.7121, + "eval_samples_per_second": 1527.483, + "eval_steps_per_second": 4.773, + "step": 9760 + }, + { + "epoch": 4.08, + "learning_rate": 1e-06, + "loss": 0.2067, + "step": 9780 + }, + { + "epoch": 4.08, + "eval_accuracy": 0.9278333333333333, + "eval_loss": 0.21278834342956543, + "eval_runtime": 15.7391, + "eval_samples_per_second": 1524.869, + "eval_steps_per_second": 4.765, + "step": 9780 + }, + { + "epoch": 4.08, + "learning_rate": 1e-06, + "loss": 0.2219, + "step": 9800 + }, + { + "epoch": 4.08, + "eval_accuracy": 0.9279166666666666, + "eval_loss": 0.21168003976345062, + "eval_runtime": 15.591, + "eval_samples_per_second": 1539.347, + "eval_steps_per_second": 4.81, + "step": 9800 + }, + { + "epoch": 4.09, + "learning_rate": 1e-06, + "loss": 0.1931, + "step": 9820 + }, + { + "epoch": 4.09, + "eval_accuracy": 0.9288333333333333, + "eval_loss": 0.2115764170885086, + "eval_runtime": 16.0212, + "eval_samples_per_second": 1498.017, + "eval_steps_per_second": 4.681, + "step": 9820 + }, + { + "epoch": 4.1, + "learning_rate": 1e-06, + "loss": 0.198, + "step": 9840 + }, + { + "epoch": 4.1, + "eval_accuracy": 0.9270833333333334, + "eval_loss": 0.21085327863693237, + "eval_runtime": 16.2758, + "eval_samples_per_second": 1474.58, + "eval_steps_per_second": 4.608, + "step": 9840 + }, + { + "epoch": 4.11, + "learning_rate": 1e-06, + "loss": 0.1664, + "step": 9860 + }, + { + "epoch": 4.11, + "eval_accuracy": 0.9275833333333333, + "eval_loss": 0.21043312549591064, + "eval_runtime": 15.7593, + "eval_samples_per_second": 1522.91, + "eval_steps_per_second": 4.759, + "step": 9860 + }, + { + "epoch": 4.12, + "learning_rate": 1e-06, + "loss": 0.1998, + "step": 9880 + }, + { + "epoch": 4.12, + "eval_accuracy": 0.9276666666666666, + "eval_loss": 0.21023598313331604, + "eval_runtime": 15.8663, + "eval_samples_per_second": 1512.639, + "eval_steps_per_second": 4.727, + "step": 9880 + }, + { + "epoch": 4.12, + "learning_rate": 1e-06, + "loss": 0.2338, + "step": 9900 + }, + { + "epoch": 4.12, + "eval_accuracy": 0.9267916666666667, + "eval_loss": 0.21208404004573822, + "eval_runtime": 15.987, + "eval_samples_per_second": 1501.219, + "eval_steps_per_second": 4.691, + "step": 9900 + }, + { + "epoch": 4.13, + "learning_rate": 1e-06, + "loss": 0.1807, + "step": 9920 + }, + { + "epoch": 4.13, + "eval_accuracy": 0.9267083333333334, + "eval_loss": 0.21226836740970612, + "eval_runtime": 15.4893, + "eval_samples_per_second": 1549.453, + "eval_steps_per_second": 4.842, + "step": 9920 + }, + { + "epoch": 4.14, + "learning_rate": 1e-06, + "loss": 0.2055, + "step": 9940 + }, + { + "epoch": 4.14, + "eval_accuracy": 0.92775, + "eval_loss": 0.21014344692230225, + "eval_runtime": 16.5702, + "eval_samples_per_second": 1448.387, + "eval_steps_per_second": 4.526, + "step": 9940 + }, + { + "epoch": 4.15, + "learning_rate": 1e-06, + "loss": 0.2094, + "step": 9960 + }, + { + "epoch": 4.15, + "eval_accuracy": 0.9275, + "eval_loss": 0.21054843068122864, + "eval_runtime": 15.7855, + "eval_samples_per_second": 1520.387, + "eval_steps_per_second": 4.751, + "step": 9960 + }, + { + "epoch": 4.16, + "learning_rate": 1e-06, + "loss": 0.1675, + "step": 9980 + }, + { + "epoch": 4.16, + "eval_accuracy": 0.9266666666666666, + "eval_loss": 0.21459108591079712, + "eval_runtime": 15.728, + "eval_samples_per_second": 1525.936, + "eval_steps_per_second": 4.769, + "step": 9980 + }, + { + "epoch": 4.17, + "learning_rate": 1e-06, + "loss": 0.2043, + "step": 10000 + }, + { + "epoch": 4.17, + "eval_accuracy": 0.9267916666666667, + "eval_loss": 0.2115868628025055, + "eval_runtime": 16.2887, + "eval_samples_per_second": 1473.412, + "eval_steps_per_second": 4.604, + "step": 10000 + }, + { + "epoch": 4.17, + "learning_rate": 1e-06, + "loss": 0.1625, + "step": 10020 + }, + { + "epoch": 4.17, + "eval_accuracy": 0.927625, + "eval_loss": 0.21186378598213196, + "eval_runtime": 15.6418, + "eval_samples_per_second": 1534.346, + "eval_steps_per_second": 4.795, + "step": 10020 + }, + { + "epoch": 4.18, + "learning_rate": 1e-06, + "loss": 0.1761, + "step": 10040 + }, + { + "epoch": 4.18, + "eval_accuracy": 0.92725, + "eval_loss": 0.2122122347354889, + "eval_runtime": 16.1634, + "eval_samples_per_second": 1484.835, + "eval_steps_per_second": 4.64, + "step": 10040 + }, + { + "epoch": 4.19, + "learning_rate": 1e-06, + "loss": 0.1787, + "step": 10060 + }, + { + "epoch": 4.19, + "eval_accuracy": 0.9278333333333333, + "eval_loss": 0.21212342381477356, + "eval_runtime": 15.87, + "eval_samples_per_second": 1512.288, + "eval_steps_per_second": 4.726, + "step": 10060 + }, + { + "epoch": 4.2, + "learning_rate": 1e-06, + "loss": 0.1773, + "step": 10080 + }, + { + "epoch": 4.2, + "eval_accuracy": 0.927375, + "eval_loss": 0.21214234828948975, + "eval_runtime": 15.6627, + "eval_samples_per_second": 1532.306, + "eval_steps_per_second": 4.788, + "step": 10080 + }, + { + "epoch": 4.21, + "learning_rate": 1e-06, + "loss": 0.2317, + "step": 10100 + }, + { + "epoch": 4.21, + "eval_accuracy": 0.9284166666666667, + "eval_loss": 0.21087035536766052, + "eval_runtime": 15.9309, + "eval_samples_per_second": 1506.51, + "eval_steps_per_second": 4.708, + "step": 10100 + }, + { + "epoch": 4.22, + "learning_rate": 1e-06, + "loss": 0.1909, + "step": 10120 + }, + { + "epoch": 4.22, + "eval_accuracy": 0.9279583333333333, + "eval_loss": 0.2117665410041809, + "eval_runtime": 15.8849, + "eval_samples_per_second": 1510.864, + "eval_steps_per_second": 4.721, + "step": 10120 + }, + { + "epoch": 4.22, + "learning_rate": 1e-06, + "loss": 0.2192, + "step": 10140 + }, + { + "epoch": 4.22, + "eval_accuracy": 0.9291666666666667, + "eval_loss": 0.20962686836719513, + "eval_runtime": 16.1628, + "eval_samples_per_second": 1484.887, + "eval_steps_per_second": 4.64, + "step": 10140 + }, + { + "epoch": 4.23, + "learning_rate": 1e-06, + "loss": 0.1977, + "step": 10160 + }, + { + "epoch": 4.23, + "eval_accuracy": 0.9277083333333334, + "eval_loss": 0.21077552437782288, + "eval_runtime": 15.8089, + "eval_samples_per_second": 1518.136, + "eval_steps_per_second": 4.744, + "step": 10160 + }, + { + "epoch": 4.24, + "learning_rate": 1e-06, + "loss": 0.2138, + "step": 10180 + }, + { + "epoch": 4.24, + "eval_accuracy": 0.9267083333333334, + "eval_loss": 0.2138536125421524, + "eval_runtime": 15.891, + "eval_samples_per_second": 1510.292, + "eval_steps_per_second": 4.72, + "step": 10180 + }, + { + "epoch": 4.25, + "learning_rate": 1e-06, + "loss": 0.1753, + "step": 10200 + }, + { + "epoch": 4.25, + "eval_accuracy": 0.92675, + "eval_loss": 0.21307241916656494, + "eval_runtime": 16.0289, + "eval_samples_per_second": 1497.293, + "eval_steps_per_second": 4.679, + "step": 10200 + }, + { + "epoch": 4.26, + "learning_rate": 1e-06, + "loss": 0.2094, + "step": 10220 + }, + { + "epoch": 4.26, + "eval_accuracy": 0.9283333333333333, + "eval_loss": 0.21122166514396667, + "eval_runtime": 16.1206, + "eval_samples_per_second": 1488.781, + "eval_steps_per_second": 4.652, + "step": 10220 + }, + { + "epoch": 4.27, + "learning_rate": 1e-06, + "loss": 0.204, + "step": 10240 + }, + { + "epoch": 4.27, + "eval_accuracy": 0.92825, + "eval_loss": 0.20935103297233582, + "eval_runtime": 15.822, + "eval_samples_per_second": 1516.879, + "eval_steps_per_second": 4.74, + "step": 10240 + }, + { + "epoch": 4.28, + "learning_rate": 1e-06, + "loss": 0.2196, + "step": 10260 + }, + { + "epoch": 4.28, + "eval_accuracy": 0.927, + "eval_loss": 0.2119808942079544, + "eval_runtime": 16.0008, + "eval_samples_per_second": 1499.92, + "eval_steps_per_second": 4.687, + "step": 10260 + }, + { + "epoch": 4.28, + "learning_rate": 1e-06, + "loss": 0.2122, + "step": 10280 + }, + { + "epoch": 4.28, + "eval_accuracy": 0.9271666666666667, + "eval_loss": 0.21251654624938965, + "eval_runtime": 16.2621, + "eval_samples_per_second": 1475.821, + "eval_steps_per_second": 4.612, + "step": 10280 + }, + { + "epoch": 4.29, + "learning_rate": 1e-06, + "loss": 0.1534, + "step": 10300 + }, + { + "epoch": 4.29, + "eval_accuracy": 0.9284166666666667, + "eval_loss": 0.2099744975566864, + "eval_runtime": 15.4991, + "eval_samples_per_second": 1548.474, + "eval_steps_per_second": 4.839, + "step": 10300 + }, + { + "epoch": 4.3, + "learning_rate": 1e-06, + "loss": 0.1917, + "step": 10320 + }, + { + "epoch": 4.3, + "eval_accuracy": 0.9281666666666667, + "eval_loss": 0.2101805955171585, + "eval_runtime": 15.541, + "eval_samples_per_second": 1544.303, + "eval_steps_per_second": 4.826, + "step": 10320 + }, + { + "epoch": 4.31, + "learning_rate": 1e-06, + "loss": 0.201, + "step": 10340 + }, + { + "epoch": 4.31, + "eval_accuracy": 0.928125, + "eval_loss": 0.20908385515213013, + "eval_runtime": 15.6138, + "eval_samples_per_second": 1537.104, + "eval_steps_per_second": 4.803, + "step": 10340 + }, + { + "epoch": 4.32, + "learning_rate": 1e-06, + "loss": 0.1736, + "step": 10360 + }, + { + "epoch": 4.32, + "eval_accuracy": 0.9292083333333333, + "eval_loss": 0.20927385985851288, + "eval_runtime": 15.7173, + "eval_samples_per_second": 1526.975, + "eval_steps_per_second": 4.772, + "step": 10360 + }, + { + "epoch": 4.33, + "learning_rate": 1e-06, + "loss": 0.1948, + "step": 10380 + }, + { + "epoch": 4.33, + "eval_accuracy": 0.928625, + "eval_loss": 0.2104508876800537, + "eval_runtime": 15.7589, + "eval_samples_per_second": 1522.952, + "eval_steps_per_second": 4.759, + "step": 10380 + }, + { + "epoch": 4.33, + "learning_rate": 1e-06, + "loss": 0.1967, + "step": 10400 + }, + { + "epoch": 4.33, + "eval_accuracy": 0.9270833333333334, + "eval_loss": 0.21193169057369232, + "eval_runtime": 15.5786, + "eval_samples_per_second": 1540.575, + "eval_steps_per_second": 4.814, + "step": 10400 + }, + { + "epoch": 4.34, + "learning_rate": 1e-06, + "loss": 0.1722, + "step": 10420 + }, + { + "epoch": 4.34, + "eval_accuracy": 0.9289583333333333, + "eval_loss": 0.20838379859924316, + "eval_runtime": 16.4064, + "eval_samples_per_second": 1462.84, + "eval_steps_per_second": 4.571, + "step": 10420 + }, + { + "epoch": 4.35, + "learning_rate": 1e-06, + "loss": 0.1855, + "step": 10440 + }, + { + "epoch": 4.35, + "eval_accuracy": 0.928375, + "eval_loss": 0.20829389989376068, + "eval_runtime": 15.8035, + "eval_samples_per_second": 1518.649, + "eval_steps_per_second": 4.746, + "step": 10440 + }, + { + "epoch": 4.36, + "learning_rate": 1e-06, + "loss": 0.2067, + "step": 10460 + }, + { + "epoch": 4.36, + "eval_accuracy": 0.92925, + "eval_loss": 0.20744042098522186, + "eval_runtime": 15.8104, + "eval_samples_per_second": 1517.993, + "eval_steps_per_second": 4.744, + "step": 10460 + }, + { + "epoch": 4.37, + "learning_rate": 1e-06, + "loss": 0.1925, + "step": 10480 + }, + { + "epoch": 4.37, + "eval_accuracy": 0.9292916666666666, + "eval_loss": 0.20747624337673187, + "eval_runtime": 16.0704, + "eval_samples_per_second": 1493.428, + "eval_steps_per_second": 4.667, + "step": 10480 + }, + { + "epoch": 4.38, + "learning_rate": 1e-06, + "loss": 0.2027, + "step": 10500 + }, + { + "epoch": 4.38, + "eval_accuracy": 0.92775, + "eval_loss": 0.2088412493467331, + "eval_runtime": 15.8843, + "eval_samples_per_second": 1510.928, + "eval_steps_per_second": 4.722, + "step": 10500 + }, + { + "epoch": 4.38, + "learning_rate": 1e-06, + "loss": 0.1763, + "step": 10520 + }, + { + "epoch": 4.38, + "eval_accuracy": 0.928125, + "eval_loss": 0.20948942005634308, + "eval_runtime": 15.8584, + "eval_samples_per_second": 1513.39, + "eval_steps_per_second": 4.729, + "step": 10520 + }, + { + "epoch": 4.39, + "learning_rate": 1e-06, + "loss": 0.2019, + "step": 10540 + }, + { + "epoch": 4.39, + "eval_accuracy": 0.9283333333333333, + "eval_loss": 0.20972661674022675, + "eval_runtime": 16.0436, + "eval_samples_per_second": 1495.92, + "eval_steps_per_second": 4.675, + "step": 10540 + }, + { + "epoch": 4.4, + "learning_rate": 1e-06, + "loss": 0.2411, + "step": 10560 + }, + { + "epoch": 4.4, + "eval_accuracy": 0.9285, + "eval_loss": 0.21000780165195465, + "eval_runtime": 15.8037, + "eval_samples_per_second": 1518.635, + "eval_steps_per_second": 4.746, + "step": 10560 + }, + { + "epoch": 4.41, + "learning_rate": 1e-06, + "loss": 0.1896, + "step": 10580 + }, + { + "epoch": 4.41, + "eval_accuracy": 0.9274583333333334, + "eval_loss": 0.20897161960601807, + "eval_runtime": 15.8212, + "eval_samples_per_second": 1516.948, + "eval_steps_per_second": 4.74, + "step": 10580 + }, + { + "epoch": 4.42, + "learning_rate": 1e-06, + "loss": 0.2153, + "step": 10600 + }, + { + "epoch": 4.42, + "eval_accuracy": 0.9294166666666667, + "eval_loss": 0.2074136584997177, + "eval_runtime": 16.2219, + "eval_samples_per_second": 1479.478, + "eval_steps_per_second": 4.623, + "step": 10600 + }, + { + "epoch": 4.42, + "learning_rate": 1e-06, + "loss": 0.2157, + "step": 10620 + }, + { + "epoch": 4.42, + "eval_accuracy": 0.9291666666666667, + "eval_loss": 0.20682406425476074, + "eval_runtime": 15.4922, + "eval_samples_per_second": 1549.169, + "eval_steps_per_second": 4.841, + "step": 10620 + }, + { + "epoch": 4.43, + "learning_rate": 1e-06, + "loss": 0.1676, + "step": 10640 + }, + { + "epoch": 4.43, + "eval_accuracy": 0.9277916666666667, + "eval_loss": 0.20833227038383484, + "eval_runtime": 16.1598, + "eval_samples_per_second": 1485.169, + "eval_steps_per_second": 4.641, + "step": 10640 + }, + { + "epoch": 4.44, + "learning_rate": 1e-06, + "loss": 0.2071, + "step": 10660 + }, + { + "epoch": 4.44, + "eval_accuracy": 0.9270833333333334, + "eval_loss": 0.20793978869915009, + "eval_runtime": 15.9207, + "eval_samples_per_second": 1507.475, + "eval_steps_per_second": 4.711, + "step": 10660 + }, + { + "epoch": 4.45, + "learning_rate": 1e-06, + "loss": 0.2311, + "step": 10680 + }, + { + "epoch": 4.45, + "eval_accuracy": 0.9275, + "eval_loss": 0.2090083658695221, + "eval_runtime": 15.8466, + "eval_samples_per_second": 1514.52, + "eval_steps_per_second": 4.733, + "step": 10680 + }, + { + "epoch": 4.46, + "learning_rate": 1e-06, + "loss": 0.1938, + "step": 10700 + }, + { + "epoch": 4.46, + "eval_accuracy": 0.9301666666666667, + "eval_loss": 0.20576812326908112, + "eval_runtime": 16.5431, + "eval_samples_per_second": 1450.759, + "eval_steps_per_second": 4.534, + "step": 10700 + }, + { + "epoch": 4.47, + "learning_rate": 1e-06, + "loss": 0.2202, + "step": 10720 + }, + { + "epoch": 4.47, + "eval_accuracy": 0.9292083333333333, + "eval_loss": 0.20580460131168365, + "eval_runtime": 15.9051, + "eval_samples_per_second": 1508.949, + "eval_steps_per_second": 4.715, + "step": 10720 + }, + { + "epoch": 4.47, + "learning_rate": 1e-06, + "loss": 0.1872, + "step": 10740 + }, + { + "epoch": 4.47, + "eval_accuracy": 0.9278333333333333, + "eval_loss": 0.20827758312225342, + "eval_runtime": 15.8978, + "eval_samples_per_second": 1509.642, + "eval_steps_per_second": 4.718, + "step": 10740 + }, + { + "epoch": 4.48, + "learning_rate": 1e-06, + "loss": 0.2034, + "step": 10760 + }, + { + "epoch": 4.48, + "eval_accuracy": 0.927875, + "eval_loss": 0.2091016322374344, + "eval_runtime": 15.7146, + "eval_samples_per_second": 1527.242, + "eval_steps_per_second": 4.773, + "step": 10760 + }, + { + "epoch": 4.49, + "learning_rate": 1e-06, + "loss": 0.2204, + "step": 10780 + }, + { + "epoch": 4.49, + "eval_accuracy": 0.9276666666666666, + "eval_loss": 0.20921491086483002, + "eval_runtime": 16.115, + "eval_samples_per_second": 1489.295, + "eval_steps_per_second": 4.654, + "step": 10780 + }, + { + "epoch": 4.5, + "learning_rate": 1e-06, + "loss": 0.216, + "step": 10800 + }, + { + "epoch": 4.5, + "eval_accuracy": 0.927625, + "eval_loss": 0.20790547132492065, + "eval_runtime": 15.9443, + "eval_samples_per_second": 1505.242, + "eval_steps_per_second": 4.704, + "step": 10800 + }, + { + "epoch": 4.51, + "learning_rate": 1e-06, + "loss": 0.1824, + "step": 10820 + }, + { + "epoch": 4.51, + "eval_accuracy": 0.9280833333333334, + "eval_loss": 0.20656706392765045, + "eval_runtime": 16.1164, + "eval_samples_per_second": 1489.165, + "eval_steps_per_second": 4.654, + "step": 10820 + }, + { + "epoch": 4.52, + "learning_rate": 1e-06, + "loss": 0.2098, + "step": 10840 + }, + { + "epoch": 4.52, + "eval_accuracy": 0.9296666666666666, + "eval_loss": 0.20553721487522125, + "eval_runtime": 15.8252, + "eval_samples_per_second": 1516.573, + "eval_steps_per_second": 4.739, + "step": 10840 + }, + { + "epoch": 4.53, + "learning_rate": 1e-06, + "loss": 0.2258, + "step": 10860 + }, + { + "epoch": 4.53, + "eval_accuracy": 0.9294583333333334, + "eval_loss": 0.2055697739124298, + "eval_runtime": 15.8329, + "eval_samples_per_second": 1515.829, + "eval_steps_per_second": 4.737, + "step": 10860 + }, + { + "epoch": 4.53, + "learning_rate": 1e-06, + "loss": 0.2086, + "step": 10880 + }, + { + "epoch": 4.53, + "eval_accuracy": 0.9282083333333333, + "eval_loss": 0.20601527392864227, + "eval_runtime": 16.046, + "eval_samples_per_second": 1495.703, + "eval_steps_per_second": 4.674, + "step": 10880 + }, + { + "epoch": 4.54, + "learning_rate": 1e-06, + "loss": 0.1725, + "step": 10900 + }, + { + "epoch": 4.54, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.20634058117866516, + "eval_runtime": 15.8771, + "eval_samples_per_second": 1511.608, + "eval_steps_per_second": 4.724, + "step": 10900 + }, + { + "epoch": 4.55, + "learning_rate": 1e-06, + "loss": 0.1899, + "step": 10920 + }, + { + "epoch": 4.55, + "eval_accuracy": 0.9292916666666666, + "eval_loss": 0.20672395825386047, + "eval_runtime": 16.1064, + "eval_samples_per_second": 1490.089, + "eval_steps_per_second": 4.657, + "step": 10920 + }, + { + "epoch": 4.56, + "learning_rate": 1e-06, + "loss": 0.1959, + "step": 10940 + }, + { + "epoch": 4.56, + "eval_accuracy": 0.9266666666666666, + "eval_loss": 0.21057891845703125, + "eval_runtime": 15.9754, + "eval_samples_per_second": 1502.309, + "eval_steps_per_second": 4.695, + "step": 10940 + }, + { + "epoch": 4.57, + "learning_rate": 1e-06, + "loss": 0.2029, + "step": 10960 + }, + { + "epoch": 4.57, + "eval_accuracy": 0.928875, + "eval_loss": 0.20633479952812195, + "eval_runtime": 15.7725, + "eval_samples_per_second": 1521.637, + "eval_steps_per_second": 4.755, + "step": 10960 + }, + { + "epoch": 4.58, + "learning_rate": 1e-06, + "loss": 0.1843, + "step": 10980 + }, + { + "epoch": 4.58, + "eval_accuracy": 0.9300833333333334, + "eval_loss": 0.2045080065727234, + "eval_runtime": 16.6018, + "eval_samples_per_second": 1445.625, + "eval_steps_per_second": 4.518, + "step": 10980 + }, + { + "epoch": 4.58, + "learning_rate": 1e-06, + "loss": 0.2249, + "step": 11000 + }, + { + "epoch": 4.58, + "eval_accuracy": 0.93025, + "eval_loss": 0.20456919074058533, + "eval_runtime": 15.8868, + "eval_samples_per_second": 1510.688, + "eval_steps_per_second": 4.721, + "step": 11000 + }, + { + "epoch": 4.59, + "learning_rate": 1e-06, + "loss": 0.1764, + "step": 11020 + }, + { + "epoch": 4.59, + "eval_accuracy": 0.92925, + "eval_loss": 0.20611906051635742, + "eval_runtime": 16.0015, + "eval_samples_per_second": 1499.856, + "eval_steps_per_second": 4.687, + "step": 11020 + }, + { + "epoch": 4.6, + "learning_rate": 1e-06, + "loss": 0.2269, + "step": 11040 + }, + { + "epoch": 4.6, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.20687995851039886, + "eval_runtime": 15.9876, + "eval_samples_per_second": 1501.163, + "eval_steps_per_second": 4.691, + "step": 11040 + }, + { + "epoch": 4.61, + "learning_rate": 1e-06, + "loss": 0.234, + "step": 11060 + }, + { + "epoch": 4.61, + "eval_accuracy": 0.92825, + "eval_loss": 0.20855723321437836, + "eval_runtime": 15.799, + "eval_samples_per_second": 1519.081, + "eval_steps_per_second": 4.747, + "step": 11060 + }, + { + "epoch": 4.62, + "learning_rate": 1e-06, + "loss": 0.1925, + "step": 11080 + }, + { + "epoch": 4.62, + "eval_accuracy": 0.9294166666666667, + "eval_loss": 0.2057270109653473, + "eval_runtime": 15.765, + "eval_samples_per_second": 1522.358, + "eval_steps_per_second": 4.757, + "step": 11080 + }, + { + "epoch": 4.62, + "learning_rate": 1e-06, + "loss": 0.1949, + "step": 11100 + }, + { + "epoch": 4.62, + "eval_accuracy": 0.9285, + "eval_loss": 0.206070676445961, + "eval_runtime": 16.1103, + "eval_samples_per_second": 1489.726, + "eval_steps_per_second": 4.655, + "step": 11100 + }, + { + "epoch": 4.63, + "learning_rate": 1e-06, + "loss": 0.1928, + "step": 11120 + }, + { + "epoch": 4.63, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.20591707527637482, + "eval_runtime": 15.8556, + "eval_samples_per_second": 1513.659, + "eval_steps_per_second": 4.73, + "step": 11120 + }, + { + "epoch": 4.64, + "learning_rate": 1e-06, + "loss": 0.1744, + "step": 11140 + }, + { + "epoch": 4.64, + "eval_accuracy": 0.9286666666666666, + "eval_loss": 0.20713801681995392, + "eval_runtime": 16.1571, + "eval_samples_per_second": 1485.413, + "eval_steps_per_second": 4.642, + "step": 11140 + }, + { + "epoch": 4.65, + "learning_rate": 1e-06, + "loss": 0.2161, + "step": 11160 + }, + { + "epoch": 4.65, + "eval_accuracy": 0.92925, + "eval_loss": 0.20511361956596375, + "eval_runtime": 16.4581, + "eval_samples_per_second": 1458.25, + "eval_steps_per_second": 4.557, + "step": 11160 + }, + { + "epoch": 4.66, + "learning_rate": 1e-06, + "loss": 0.168, + "step": 11180 + }, + { + "epoch": 4.66, + "eval_accuracy": 0.92925, + "eval_loss": 0.2065957635641098, + "eval_runtime": 15.7867, + "eval_samples_per_second": 1520.267, + "eval_steps_per_second": 4.751, + "step": 11180 + }, + { + "epoch": 4.67, + "learning_rate": 1e-06, + "loss": 0.1739, + "step": 11200 + }, + { + "epoch": 4.67, + "eval_accuracy": 0.9285833333333333, + "eval_loss": 0.20561246573925018, + "eval_runtime": 15.6163, + "eval_samples_per_second": 1536.854, + "eval_steps_per_second": 4.803, + "step": 11200 + }, + { + "epoch": 4.67, + "learning_rate": 1e-06, + "loss": 0.1816, + "step": 11220 + }, + { + "epoch": 4.67, + "eval_accuracy": 0.9287916666666667, + "eval_loss": 0.20476531982421875, + "eval_runtime": 16.2877, + "eval_samples_per_second": 1473.506, + "eval_steps_per_second": 4.605, + "step": 11220 + }, + { + "epoch": 4.68, + "learning_rate": 1e-06, + "loss": 0.1931, + "step": 11240 + }, + { + "epoch": 4.68, + "eval_accuracy": 0.9285416666666667, + "eval_loss": 0.20711849629878998, + "eval_runtime": 15.8294, + "eval_samples_per_second": 1516.17, + "eval_steps_per_second": 4.738, + "step": 11240 + }, + { + "epoch": 4.69, + "learning_rate": 1e-06, + "loss": 0.1945, + "step": 11260 + }, + { + "epoch": 4.69, + "eval_accuracy": 0.9281666666666667, + "eval_loss": 0.20717017352581024, + "eval_runtime": 16.0348, + "eval_samples_per_second": 1496.744, + "eval_steps_per_second": 4.677, + "step": 11260 + }, + { + "epoch": 4.7, + "learning_rate": 1e-06, + "loss": 0.1929, + "step": 11280 + }, + { + "epoch": 4.7, + "eval_accuracy": 0.928875, + "eval_loss": 0.20755107700824738, + "eval_runtime": 16.3373, + "eval_samples_per_second": 1469.027, + "eval_steps_per_second": 4.591, + "step": 11280 + }, + { + "epoch": 4.71, + "learning_rate": 1e-06, + "loss": 0.1903, + "step": 11300 + }, + { + "epoch": 4.71, + "eval_accuracy": 0.929875, + "eval_loss": 0.20404241979122162, + "eval_runtime": 16.0151, + "eval_samples_per_second": 1498.586, + "eval_steps_per_second": 4.683, + "step": 11300 + }, + { + "epoch": 4.72, + "learning_rate": 1e-06, + "loss": 0.2051, + "step": 11320 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.20393158495426178, + "eval_runtime": 16.464, + "eval_samples_per_second": 1457.729, + "eval_steps_per_second": 4.555, + "step": 11320 + }, + { + "epoch": 4.72, + "learning_rate": 1e-06, + "loss": 0.1614, + "step": 11340 + }, + { + "epoch": 4.72, + "eval_accuracy": 0.929125, + "eval_loss": 0.20461545884609222, + "eval_runtime": 15.8151, + "eval_samples_per_second": 1517.534, + "eval_steps_per_second": 4.742, + "step": 11340 + }, + { + "epoch": 4.73, + "learning_rate": 1e-06, + "loss": 0.1968, + "step": 11360 + }, + { + "epoch": 4.73, + "eval_accuracy": 0.9283333333333333, + "eval_loss": 0.20622652769088745, + "eval_runtime": 15.9434, + "eval_samples_per_second": 1505.328, + "eval_steps_per_second": 4.704, + "step": 11360 + }, + { + "epoch": 4.74, + "learning_rate": 1e-06, + "loss": 0.2091, + "step": 11380 + }, + { + "epoch": 4.74, + "eval_accuracy": 0.926375, + "eval_loss": 0.2111745923757553, + "eval_runtime": 15.9338, + "eval_samples_per_second": 1506.236, + "eval_steps_per_second": 4.707, + "step": 11380 + }, + { + "epoch": 4.75, + "learning_rate": 1e-06, + "loss": 0.2072, + "step": 11400 + }, + { + "epoch": 4.75, + "eval_accuracy": 0.9277916666666667, + "eval_loss": 0.2086385041475296, + "eval_runtime": 15.7955, + "eval_samples_per_second": 1519.425, + "eval_steps_per_second": 4.748, + "step": 11400 + }, + { + "epoch": 4.76, + "learning_rate": 1e-06, + "loss": 0.2298, + "step": 11420 + }, + { + "epoch": 4.76, + "eval_accuracy": 0.9287083333333334, + "eval_loss": 0.2052876353263855, + "eval_runtime": 16.0941, + "eval_samples_per_second": 1491.226, + "eval_steps_per_second": 4.66, + "step": 11420 + }, + { + "epoch": 4.77, + "learning_rate": 1e-06, + "loss": 0.1874, + "step": 11440 + }, + { + "epoch": 4.77, + "eval_accuracy": 0.9292916666666666, + "eval_loss": 0.20625039935112, + "eval_runtime": 15.9003, + "eval_samples_per_second": 1509.408, + "eval_steps_per_second": 4.717, + "step": 11440 + }, + { + "epoch": 4.78, + "learning_rate": 1e-06, + "loss": 0.1838, + "step": 11460 + }, + { + "epoch": 4.78, + "eval_accuracy": 0.928, + "eval_loss": 0.20656456053256989, + "eval_runtime": 16.1911, + "eval_samples_per_second": 1482.299, + "eval_steps_per_second": 4.632, + "step": 11460 + }, + { + "epoch": 4.78, + "learning_rate": 1e-06, + "loss": 0.2015, + "step": 11480 + }, + { + "epoch": 4.78, + "eval_accuracy": 0.929625, + "eval_loss": 0.20533688366413116, + "eval_runtime": 15.5948, + "eval_samples_per_second": 1538.974, + "eval_steps_per_second": 4.809, + "step": 11480 + }, + { + "epoch": 4.79, + "learning_rate": 1e-06, + "loss": 0.1822, + "step": 11500 + }, + { + "epoch": 4.79, + "eval_accuracy": 0.9284583333333334, + "eval_loss": 0.20839504897594452, + "eval_runtime": 15.9851, + "eval_samples_per_second": 1501.396, + "eval_steps_per_second": 4.692, + "step": 11500 + }, + { + "epoch": 4.8, + "learning_rate": 1e-06, + "loss": 0.2209, + "step": 11520 + }, + { + "epoch": 4.8, + "eval_accuracy": 0.9295, + "eval_loss": 0.20546448230743408, + "eval_runtime": 15.8868, + "eval_samples_per_second": 1510.688, + "eval_steps_per_second": 4.721, + "step": 11520 + }, + { + "epoch": 4.81, + "learning_rate": 1e-06, + "loss": 0.1918, + "step": 11540 + }, + { + "epoch": 4.81, + "eval_accuracy": 0.929625, + "eval_loss": 0.20511560142040253, + "eval_runtime": 16.5324, + "eval_samples_per_second": 1451.696, + "eval_steps_per_second": 4.537, + "step": 11540 + }, + { + "epoch": 4.82, + "learning_rate": 1e-06, + "loss": 0.2252, + "step": 11560 + }, + { + "epoch": 4.82, + "eval_accuracy": 0.9294166666666667, + "eval_loss": 0.2052285224199295, + "eval_runtime": 15.5246, + "eval_samples_per_second": 1545.931, + "eval_steps_per_second": 4.831, + "step": 11560 + }, + { + "epoch": 4.83, + "learning_rate": 1e-06, + "loss": 0.1929, + "step": 11580 + }, + { + "epoch": 4.83, + "eval_accuracy": 0.9300833333333334, + "eval_loss": 0.20385289192199707, + "eval_runtime": 15.4266, + "eval_samples_per_second": 1555.758, + "eval_steps_per_second": 4.862, + "step": 11580 + }, + { + "epoch": 4.83, + "learning_rate": 1e-06, + "loss": 0.1889, + "step": 11600 + }, + { + "epoch": 4.83, + "eval_accuracy": 0.9298333333333333, + "eval_loss": 0.2047443836927414, + "eval_runtime": 15.9168, + "eval_samples_per_second": 1507.841, + "eval_steps_per_second": 4.712, + "step": 11600 + }, + { + "epoch": 4.84, + "learning_rate": 1e-06, + "loss": 0.1812, + "step": 11620 + }, + { + "epoch": 4.84, + "eval_accuracy": 0.929, + "eval_loss": 0.20585575699806213, + "eval_runtime": 15.6349, + "eval_samples_per_second": 1535.024, + "eval_steps_per_second": 4.797, + "step": 11620 + }, + { + "epoch": 4.85, + "learning_rate": 1e-06, + "loss": 0.2473, + "step": 11640 + }, + { + "epoch": 4.85, + "eval_accuracy": 0.9304166666666667, + "eval_loss": 0.2033381462097168, + "eval_runtime": 16.0063, + "eval_samples_per_second": 1499.413, + "eval_steps_per_second": 4.686, + "step": 11640 + }, + { + "epoch": 4.86, + "learning_rate": 1e-06, + "loss": 0.1757, + "step": 11660 + }, + { + "epoch": 4.86, + "eval_accuracy": 0.9302083333333333, + "eval_loss": 0.20283745229244232, + "eval_runtime": 15.8227, + "eval_samples_per_second": 1516.806, + "eval_steps_per_second": 4.74, + "step": 11660 + }, + { + "epoch": 4.87, + "learning_rate": 1e-06, + "loss": 0.2138, + "step": 11680 + }, + { + "epoch": 4.87, + "eval_accuracy": 0.9295833333333333, + "eval_loss": 0.20379288494586945, + "eval_runtime": 15.9957, + "eval_samples_per_second": 1500.406, + "eval_steps_per_second": 4.689, + "step": 11680 + }, + { + "epoch": 4.88, + "learning_rate": 1e-06, + "loss": 0.2594, + "step": 11700 + }, + { + "epoch": 4.88, + "eval_accuracy": 0.9294166666666667, + "eval_loss": 0.20390905439853668, + "eval_runtime": 15.8805, + "eval_samples_per_second": 1511.292, + "eval_steps_per_second": 4.723, + "step": 11700 + }, + { + "epoch": 4.88, + "learning_rate": 1e-06, + "loss": 0.183, + "step": 11720 + }, + { + "epoch": 4.88, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.2046515792608261, + "eval_runtime": 16.3804, + "eval_samples_per_second": 1465.166, + "eval_steps_per_second": 4.579, + "step": 11720 + }, + { + "epoch": 4.89, + "learning_rate": 1e-06, + "loss": 0.1479, + "step": 11740 + }, + { + "epoch": 4.89, + "eval_accuracy": 0.9282083333333333, + "eval_loss": 0.20830100774765015, + "eval_runtime": 15.9731, + "eval_samples_per_second": 1502.525, + "eval_steps_per_second": 4.695, + "step": 11740 + }, + { + "epoch": 4.9, + "learning_rate": 1e-06, + "loss": 0.1684, + "step": 11760 + }, + { + "epoch": 4.9, + "eval_accuracy": 0.9296666666666666, + "eval_loss": 0.20390328764915466, + "eval_runtime": 15.9815, + "eval_samples_per_second": 1501.737, + "eval_steps_per_second": 4.693, + "step": 11760 + }, + { + "epoch": 4.91, + "learning_rate": 1e-06, + "loss": 0.1802, + "step": 11780 + }, + { + "epoch": 4.91, + "eval_accuracy": 0.928125, + "eval_loss": 0.2074248492717743, + "eval_runtime": 16.0187, + "eval_samples_per_second": 1498.245, + "eval_steps_per_second": 4.682, + "step": 11780 + }, + { + "epoch": 4.92, + "learning_rate": 1e-06, + "loss": 0.1707, + "step": 11800 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.92875, + "eval_loss": 0.20657067000865936, + "eval_runtime": 15.9205, + "eval_samples_per_second": 1507.488, + "eval_steps_per_second": 4.711, + "step": 11800 + }, + { + "epoch": 4.92, + "learning_rate": 1e-06, + "loss": 0.2031, + "step": 11820 + }, + { + "epoch": 4.92, + "eval_accuracy": 0.928375, + "eval_loss": 0.21004259586334229, + "eval_runtime": 15.5995, + "eval_samples_per_second": 1538.507, + "eval_steps_per_second": 4.808, + "step": 11820 + }, + { + "epoch": 4.93, + "learning_rate": 1e-06, + "loss": 0.2169, + "step": 11840 + }, + { + "epoch": 4.93, + "eval_accuracy": 0.9290416666666667, + "eval_loss": 0.20725244283676147, + "eval_runtime": 15.6197, + "eval_samples_per_second": 1536.517, + "eval_steps_per_second": 4.802, + "step": 11840 + }, + { + "epoch": 4.94, + "learning_rate": 1e-06, + "loss": 0.1941, + "step": 11860 + }, + { + "epoch": 4.94, + "eval_accuracy": 0.930125, + "eval_loss": 0.2036478966474533, + "eval_runtime": 16.2424, + "eval_samples_per_second": 1477.617, + "eval_steps_per_second": 4.618, + "step": 11860 + }, + { + "epoch": 4.95, + "learning_rate": 1e-06, + "loss": 0.2131, + "step": 11880 + }, + { + "epoch": 4.95, + "eval_accuracy": 0.9295, + "eval_loss": 0.2052011936903, + "eval_runtime": 15.8259, + "eval_samples_per_second": 1516.501, + "eval_steps_per_second": 4.739, + "step": 11880 + }, + { + "epoch": 4.96, + "learning_rate": 1e-06, + "loss": 0.1867, + "step": 11900 + }, + { + "epoch": 4.96, + "eval_accuracy": 0.9289583333333333, + "eval_loss": 0.20610161125659943, + "eval_runtime": 15.9156, + "eval_samples_per_second": 1507.958, + "eval_steps_per_second": 4.712, + "step": 11900 + }, + { + "epoch": 4.97, + "learning_rate": 1e-06, + "loss": 0.176, + "step": 11920 + }, + { + "epoch": 4.97, + "eval_accuracy": 0.9302916666666666, + "eval_loss": 0.20278537273406982, + "eval_runtime": 16.3933, + "eval_samples_per_second": 1464.01, + "eval_steps_per_second": 4.575, + "step": 11920 + }, + { + "epoch": 4.97, + "learning_rate": 1e-06, + "loss": 0.1932, + "step": 11940 + }, + { + "epoch": 4.97, + "eval_accuracy": 0.9305833333333333, + "eval_loss": 0.2031938135623932, + "eval_runtime": 15.7386, + "eval_samples_per_second": 1524.912, + "eval_steps_per_second": 4.765, + "step": 11940 + }, + { + "epoch": 4.98, + "learning_rate": 1e-06, + "loss": 0.2253, + "step": 11960 + }, + { + "epoch": 4.98, + "eval_accuracy": 0.930125, + "eval_loss": 0.20466017723083496, + "eval_runtime": 15.9542, + "eval_samples_per_second": 1504.308, + "eval_steps_per_second": 4.701, + "step": 11960 + }, + { + "epoch": 4.99, + "learning_rate": 1e-06, + "loss": 0.2001, + "step": 11980 + }, + { + "epoch": 4.99, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.20326323807239532, + "eval_runtime": 16.2585, + "eval_samples_per_second": 1476.147, + "eval_steps_per_second": 4.613, + "step": 11980 + }, + { + "epoch": 5.0, + "learning_rate": 1e-06, + "loss": 0.2234, + "step": 12000 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.9305, + "eval_loss": 0.20482754707336426, + "eval_runtime": 15.5729, + "eval_samples_per_second": 1541.134, + "eval_steps_per_second": 4.816, + "step": 12000 + }, + { + "epoch": 5.01, + "learning_rate": 1e-06, + "loss": 0.2004, + "step": 12020 + }, + { + "epoch": 5.01, + "eval_accuracy": 0.9305416666666667, + "eval_loss": 0.20364505052566528, + "eval_runtime": 16.9313, + "eval_samples_per_second": 1417.493, + "eval_steps_per_second": 4.43, + "step": 12020 + }, + { + "epoch": 5.02, + "learning_rate": 1e-06, + "loss": 0.1859, + "step": 12040 + }, + { + "epoch": 5.02, + "eval_accuracy": 0.9309583333333333, + "eval_loss": 0.2025202065706253, + "eval_runtime": 16.472, + "eval_samples_per_second": 1457.018, + "eval_steps_per_second": 4.553, + "step": 12040 + }, + { + "epoch": 5.03, + "learning_rate": 1e-06, + "loss": 0.19, + "step": 12060 + }, + { + "epoch": 5.03, + "eval_accuracy": 0.931375, + "eval_loss": 0.20227618515491486, + "eval_runtime": 15.727, + "eval_samples_per_second": 1526.038, + "eval_steps_per_second": 4.769, + "step": 12060 + }, + { + "epoch": 5.03, + "learning_rate": 1e-06, + "loss": 0.1645, + "step": 12080 + }, + { + "epoch": 5.03, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.20448292791843414, + "eval_runtime": 16.1326, + "eval_samples_per_second": 1487.669, + "eval_steps_per_second": 4.649, + "step": 12080 + }, + { + "epoch": 5.04, + "learning_rate": 1e-06, + "loss": 0.2003, + "step": 12100 + }, + { + "epoch": 5.04, + "eval_accuracy": 0.931875, + "eval_loss": 0.20208178460597992, + "eval_runtime": 16.0837, + "eval_samples_per_second": 1492.198, + "eval_steps_per_second": 4.663, + "step": 12100 + }, + { + "epoch": 5.05, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 12120 + }, + { + "epoch": 5.05, + "eval_accuracy": 0.9320833333333334, + "eval_loss": 0.2024490088224411, + "eval_runtime": 15.8984, + "eval_samples_per_second": 1509.589, + "eval_steps_per_second": 4.717, + "step": 12120 + }, + { + "epoch": 5.06, + "learning_rate": 1e-06, + "loss": 0.1697, + "step": 12140 + }, + { + "epoch": 5.06, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.202731654047966, + "eval_runtime": 15.9929, + "eval_samples_per_second": 1500.664, + "eval_steps_per_second": 4.69, + "step": 12140 + }, + { + "epoch": 5.07, + "learning_rate": 1e-06, + "loss": 0.1824, + "step": 12160 + }, + { + "epoch": 5.07, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.2048906683921814, + "eval_runtime": 15.963, + "eval_samples_per_second": 1503.477, + "eval_steps_per_second": 4.698, + "step": 12160 + }, + { + "epoch": 5.08, + "learning_rate": 1e-06, + "loss": 0.1773, + "step": 12180 + }, + { + "epoch": 5.08, + "eval_accuracy": 0.9302916666666666, + "eval_loss": 0.20466017723083496, + "eval_runtime": 16.2544, + "eval_samples_per_second": 1476.528, + "eval_steps_per_second": 4.614, + "step": 12180 + }, + { + "epoch": 5.08, + "learning_rate": 1e-06, + "loss": 0.2324, + "step": 12200 + }, + { + "epoch": 5.08, + "eval_accuracy": 0.9304166666666667, + "eval_loss": 0.20540772378444672, + "eval_runtime": 16.2048, + "eval_samples_per_second": 1481.039, + "eval_steps_per_second": 4.628, + "step": 12200 + }, + { + "epoch": 5.09, + "learning_rate": 1e-06, + "loss": 0.1513, + "step": 12220 + }, + { + "epoch": 5.09, + "eval_accuracy": 0.9298333333333333, + "eval_loss": 0.20598597824573517, + "eval_runtime": 15.5756, + "eval_samples_per_second": 1540.872, + "eval_steps_per_second": 4.815, + "step": 12220 + }, + { + "epoch": 5.1, + "learning_rate": 1e-06, + "loss": 0.1975, + "step": 12240 + }, + { + "epoch": 5.1, + "eval_accuracy": 0.9305833333333333, + "eval_loss": 0.20424893498420715, + "eval_runtime": 15.9543, + "eval_samples_per_second": 1504.299, + "eval_steps_per_second": 4.701, + "step": 12240 + }, + { + "epoch": 5.11, + "learning_rate": 1e-06, + "loss": 0.2149, + "step": 12260 + }, + { + "epoch": 5.11, + "eval_accuracy": 0.9297916666666667, + "eval_loss": 0.205413356423378, + "eval_runtime": 15.6474, + "eval_samples_per_second": 1533.804, + "eval_steps_per_second": 4.793, + "step": 12260 + }, + { + "epoch": 5.12, + "learning_rate": 1e-06, + "loss": 0.1685, + "step": 12280 + }, + { + "epoch": 5.12, + "eval_accuracy": 0.9296666666666666, + "eval_loss": 0.20603446662425995, + "eval_runtime": 16.0146, + "eval_samples_per_second": 1498.636, + "eval_steps_per_second": 4.683, + "step": 12280 + }, + { + "epoch": 5.12, + "learning_rate": 1e-06, + "loss": 0.2097, + "step": 12300 + }, + { + "epoch": 5.12, + "eval_accuracy": 0.93125, + "eval_loss": 0.20300358533859253, + "eval_runtime": 15.8596, + "eval_samples_per_second": 1513.277, + "eval_steps_per_second": 4.729, + "step": 12300 + }, + { + "epoch": 5.13, + "learning_rate": 1e-06, + "loss": 0.2318, + "step": 12320 + }, + { + "epoch": 5.13, + "eval_accuracy": 0.931375, + "eval_loss": 0.20188292860984802, + "eval_runtime": 15.9958, + "eval_samples_per_second": 1500.398, + "eval_steps_per_second": 4.689, + "step": 12320 + }, + { + "epoch": 5.14, + "learning_rate": 1e-06, + "loss": 0.1662, + "step": 12340 + }, + { + "epoch": 5.14, + "eval_accuracy": 0.9291666666666667, + "eval_loss": 0.20399628579616547, + "eval_runtime": 15.942, + "eval_samples_per_second": 1505.459, + "eval_steps_per_second": 4.705, + "step": 12340 + }, + { + "epoch": 5.15, + "learning_rate": 1e-06, + "loss": 0.1792, + "step": 12360 + }, + { + "epoch": 5.15, + "eval_accuracy": 0.92925, + "eval_loss": 0.205734983086586, + "eval_runtime": 16.3128, + "eval_samples_per_second": 1471.239, + "eval_steps_per_second": 4.598, + "step": 12360 + }, + { + "epoch": 5.16, + "learning_rate": 1e-06, + "loss": 0.1762, + "step": 12380 + }, + { + "epoch": 5.16, + "eval_accuracy": 0.9287916666666667, + "eval_loss": 0.2092132568359375, + "eval_runtime": 15.9137, + "eval_samples_per_second": 1508.133, + "eval_steps_per_second": 4.713, + "step": 12380 + }, + { + "epoch": 5.17, + "learning_rate": 1e-06, + "loss": 0.1693, + "step": 12400 + }, + { + "epoch": 5.17, + "eval_accuracy": 0.929375, + "eval_loss": 0.20484225451946259, + "eval_runtime": 16.0154, + "eval_samples_per_second": 1498.562, + "eval_steps_per_second": 4.683, + "step": 12400 + }, + { + "epoch": 5.17, + "learning_rate": 1e-06, + "loss": 0.1659, + "step": 12420 + }, + { + "epoch": 5.17, + "eval_accuracy": 0.9309166666666666, + "eval_loss": 0.2023383527994156, + "eval_runtime": 16.3394, + "eval_samples_per_second": 1468.841, + "eval_steps_per_second": 4.59, + "step": 12420 + }, + { + "epoch": 5.18, + "learning_rate": 1e-06, + "loss": 0.1561, + "step": 12440 + }, + { + "epoch": 5.18, + "eval_accuracy": 0.930875, + "eval_loss": 0.20242756605148315, + "eval_runtime": 15.5429, + "eval_samples_per_second": 1544.111, + "eval_steps_per_second": 4.825, + "step": 12440 + }, + { + "epoch": 5.19, + "learning_rate": 1e-06, + "loss": 0.1826, + "step": 12460 + }, + { + "epoch": 5.19, + "eval_accuracy": 0.9317916666666667, + "eval_loss": 0.2020563930273056, + "eval_runtime": 16.0095, + "eval_samples_per_second": 1499.108, + "eval_steps_per_second": 4.685, + "step": 12460 + }, + { + "epoch": 5.2, + "learning_rate": 1e-06, + "loss": 0.1544, + "step": 12480 + }, + { + "epoch": 5.2, + "eval_accuracy": 0.93075, + "eval_loss": 0.2041223645210266, + "eval_runtime": 15.9404, + "eval_samples_per_second": 1505.609, + "eval_steps_per_second": 4.705, + "step": 12480 + }, + { + "epoch": 5.21, + "learning_rate": 1e-06, + "loss": 0.1836, + "step": 12500 + }, + { + "epoch": 5.21, + "eval_accuracy": 0.9310833333333334, + "eval_loss": 0.20302866399288177, + "eval_runtime": 15.672, + "eval_samples_per_second": 1531.396, + "eval_steps_per_second": 4.786, + "step": 12500 + }, + { + "epoch": 5.22, + "learning_rate": 1e-06, + "loss": 0.1792, + "step": 12520 + }, + { + "epoch": 5.22, + "eval_accuracy": 0.93125, + "eval_loss": 0.20235809683799744, + "eval_runtime": 16.0568, + "eval_samples_per_second": 1494.695, + "eval_steps_per_second": 4.671, + "step": 12520 + }, + { + "epoch": 5.22, + "learning_rate": 1e-06, + "loss": 0.1666, + "step": 12540 + }, + { + "epoch": 5.22, + "eval_accuracy": 0.931875, + "eval_loss": 0.2016390711069107, + "eval_runtime": 15.811, + "eval_samples_per_second": 1517.93, + "eval_steps_per_second": 4.744, + "step": 12540 + }, + { + "epoch": 5.23, + "learning_rate": 1e-06, + "loss": 0.1714, + "step": 12560 + }, + { + "epoch": 5.23, + "eval_accuracy": 0.9312916666666666, + "eval_loss": 0.20161676406860352, + "eval_runtime": 16.4914, + "eval_samples_per_second": 1455.303, + "eval_steps_per_second": 4.548, + "step": 12560 + }, + { + "epoch": 5.24, + "learning_rate": 1e-06, + "loss": 0.2023, + "step": 12580 + }, + { + "epoch": 5.24, + "eval_accuracy": 0.930125, + "eval_loss": 0.20450520515441895, + "eval_runtime": 16.0345, + "eval_samples_per_second": 1496.773, + "eval_steps_per_second": 4.677, + "step": 12580 + }, + { + "epoch": 5.25, + "learning_rate": 1e-06, + "loss": 0.1996, + "step": 12600 + }, + { + "epoch": 5.25, + "eval_accuracy": 0.9287916666666667, + "eval_loss": 0.20813672244548798, + "eval_runtime": 15.8781, + "eval_samples_per_second": 1511.512, + "eval_steps_per_second": 4.723, + "step": 12600 + }, + { + "epoch": 5.26, + "learning_rate": 1e-06, + "loss": 0.2113, + "step": 12620 + }, + { + "epoch": 5.26, + "eval_accuracy": 0.931125, + "eval_loss": 0.20202632248401642, + "eval_runtime": 15.9398, + "eval_samples_per_second": 1505.669, + "eval_steps_per_second": 4.705, + "step": 12620 + }, + { + "epoch": 5.27, + "learning_rate": 1e-06, + "loss": 0.1826, + "step": 12640 + }, + { + "epoch": 5.27, + "eval_accuracy": 0.9315416666666667, + "eval_loss": 0.2010980248451233, + "eval_runtime": 15.6531, + "eval_samples_per_second": 1533.238, + "eval_steps_per_second": 4.791, + "step": 12640 + }, + { + "epoch": 5.28, + "learning_rate": 1e-06, + "loss": 0.2069, + "step": 12660 + }, + { + "epoch": 5.28, + "eval_accuracy": 0.9294583333333334, + "eval_loss": 0.20446471869945526, + "eval_runtime": 15.9931, + "eval_samples_per_second": 1500.646, + "eval_steps_per_second": 4.69, + "step": 12660 + }, + { + "epoch": 5.28, + "learning_rate": 1e-06, + "loss": 0.1621, + "step": 12680 + }, + { + "epoch": 5.28, + "eval_accuracy": 0.9308333333333333, + "eval_loss": 0.20204661786556244, + "eval_runtime": 16.4396, + "eval_samples_per_second": 1459.894, + "eval_steps_per_second": 4.562, + "step": 12680 + }, + { + "epoch": 5.29, + "learning_rate": 1e-06, + "loss": 0.16, + "step": 12700 + }, + { + "epoch": 5.29, + "eval_accuracy": 0.93075, + "eval_loss": 0.20316839218139648, + "eval_runtime": 15.9576, + "eval_samples_per_second": 1503.99, + "eval_steps_per_second": 4.7, + "step": 12700 + }, + { + "epoch": 5.3, + "learning_rate": 1e-06, + "loss": 0.165, + "step": 12720 + }, + { + "epoch": 5.3, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.20250557363033295, + "eval_runtime": 15.8087, + "eval_samples_per_second": 1518.148, + "eval_steps_per_second": 4.744, + "step": 12720 + }, + { + "epoch": 5.31, + "learning_rate": 1e-06, + "loss": 0.1894, + "step": 12740 + }, + { + "epoch": 5.31, + "eval_accuracy": 0.931625, + "eval_loss": 0.20032641291618347, + "eval_runtime": 15.4644, + "eval_samples_per_second": 1551.949, + "eval_steps_per_second": 4.85, + "step": 12740 + }, + { + "epoch": 5.32, + "learning_rate": 1e-06, + "loss": 0.1633, + "step": 12760 + }, + { + "epoch": 5.32, + "eval_accuracy": 0.9314166666666667, + "eval_loss": 0.20086656510829926, + "eval_runtime": 16.2759, + "eval_samples_per_second": 1474.571, + "eval_steps_per_second": 4.608, + "step": 12760 + }, + { + "epoch": 5.33, + "learning_rate": 1e-06, + "loss": 0.2289, + "step": 12780 + }, + { + "epoch": 5.33, + "eval_accuracy": 0.932625, + "eval_loss": 0.20015648007392883, + "eval_runtime": 16.3054, + "eval_samples_per_second": 1471.905, + "eval_steps_per_second": 4.6, + "step": 12780 + }, + { + "epoch": 5.33, + "learning_rate": 1e-06, + "loss": 0.1571, + "step": 12800 + }, + { + "epoch": 5.33, + "eval_accuracy": 0.932, + "eval_loss": 0.20201164484024048, + "eval_runtime": 16.2407, + "eval_samples_per_second": 1477.773, + "eval_steps_per_second": 4.618, + "step": 12800 + }, + { + "epoch": 5.34, + "learning_rate": 1e-06, + "loss": 0.2073, + "step": 12820 + }, + { + "epoch": 5.34, + "eval_accuracy": 0.9295416666666667, + "eval_loss": 0.2069837599992752, + "eval_runtime": 17.0384, + "eval_samples_per_second": 1408.581, + "eval_steps_per_second": 4.402, + "step": 12820 + }, + { + "epoch": 5.35, + "learning_rate": 1e-06, + "loss": 0.1605, + "step": 12840 + }, + { + "epoch": 5.35, + "eval_accuracy": 0.93075, + "eval_loss": 0.20616813004016876, + "eval_runtime": 16.8287, + "eval_samples_per_second": 1426.138, + "eval_steps_per_second": 4.457, + "step": 12840 + }, + { + "epoch": 5.36, + "learning_rate": 1e-06, + "loss": 0.1879, + "step": 12860 + }, + { + "epoch": 5.36, + "eval_accuracy": 0.9306666666666666, + "eval_loss": 0.20293764770030975, + "eval_runtime": 17.5436, + "eval_samples_per_second": 1368.02, + "eval_steps_per_second": 4.275, + "step": 12860 + }, + { + "epoch": 5.37, + "learning_rate": 1e-06, + "loss": 0.2104, + "step": 12880 + }, + { + "epoch": 5.37, + "eval_accuracy": 0.9311666666666667, + "eval_loss": 0.20171169936656952, + "eval_runtime": 16.6339, + "eval_samples_per_second": 1442.838, + "eval_steps_per_second": 4.509, + "step": 12880 + }, + { + "epoch": 5.38, + "learning_rate": 1e-06, + "loss": 0.2058, + "step": 12900 + }, + { + "epoch": 5.38, + "eval_accuracy": 0.9315416666666667, + "eval_loss": 0.20113909244537354, + "eval_runtime": 16.0022, + "eval_samples_per_second": 1499.798, + "eval_steps_per_second": 4.687, + "step": 12900 + }, + { + "epoch": 5.38, + "learning_rate": 1e-06, + "loss": 0.1624, + "step": 12920 + }, + { + "epoch": 5.38, + "eval_accuracy": 0.9311666666666667, + "eval_loss": 0.20383241772651672, + "eval_runtime": 15.8073, + "eval_samples_per_second": 1518.289, + "eval_steps_per_second": 4.745, + "step": 12920 + }, + { + "epoch": 5.39, + "learning_rate": 1e-06, + "loss": 0.1765, + "step": 12940 + }, + { + "epoch": 5.39, + "eval_accuracy": 0.9310416666666667, + "eval_loss": 0.20215220749378204, + "eval_runtime": 16.3386, + "eval_samples_per_second": 1468.917, + "eval_steps_per_second": 4.59, + "step": 12940 + }, + { + "epoch": 5.4, + "learning_rate": 1e-06, + "loss": 0.187, + "step": 12960 + }, + { + "epoch": 5.4, + "eval_accuracy": 0.931375, + "eval_loss": 0.20157550275325775, + "eval_runtime": 16.0967, + "eval_samples_per_second": 1490.991, + "eval_steps_per_second": 4.659, + "step": 12960 + }, + { + "epoch": 5.41, + "learning_rate": 1e-06, + "loss": 0.1565, + "step": 12980 + }, + { + "epoch": 5.41, + "eval_accuracy": 0.9317083333333334, + "eval_loss": 0.20059089362621307, + "eval_runtime": 15.7591, + "eval_samples_per_second": 1522.928, + "eval_steps_per_second": 4.759, + "step": 12980 + }, + { + "epoch": 5.42, + "learning_rate": 1e-06, + "loss": 0.1901, + "step": 13000 + }, + { + "epoch": 5.42, + "eval_accuracy": 0.9322916666666666, + "eval_loss": 0.20068366825580597, + "eval_runtime": 15.8354, + "eval_samples_per_second": 1515.591, + "eval_steps_per_second": 4.736, + "step": 13000 + }, + { + "epoch": 5.42, + "learning_rate": 1e-06, + "loss": 0.2473, + "step": 13020 + }, + { + "epoch": 5.42, + "eval_accuracy": 0.9309583333333333, + "eval_loss": 0.20135918259620667, + "eval_runtime": 15.7908, + "eval_samples_per_second": 1519.875, + "eval_steps_per_second": 4.75, + "step": 13020 + }, + { + "epoch": 5.43, + "learning_rate": 1e-06, + "loss": 0.1836, + "step": 13040 + }, + { + "epoch": 5.43, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.20122550427913666, + "eval_runtime": 17.2214, + "eval_samples_per_second": 1393.613, + "eval_steps_per_second": 4.355, + "step": 13040 + }, + { + "epoch": 5.44, + "learning_rate": 1e-06, + "loss": 0.1759, + "step": 13060 + }, + { + "epoch": 5.44, + "eval_accuracy": 0.9310416666666667, + "eval_loss": 0.20236973464488983, + "eval_runtime": 17.2846, + "eval_samples_per_second": 1388.518, + "eval_steps_per_second": 4.339, + "step": 13060 + }, + { + "epoch": 5.45, + "learning_rate": 1e-06, + "loss": 0.2385, + "step": 13080 + }, + { + "epoch": 5.45, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.202514186501503, + "eval_runtime": 17.5809, + "eval_samples_per_second": 1365.119, + "eval_steps_per_second": 4.266, + "step": 13080 + }, + { + "epoch": 5.46, + "learning_rate": 1e-06, + "loss": 0.1586, + "step": 13100 + }, + { + "epoch": 5.46, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.20214223861694336, + "eval_runtime": 17.3368, + "eval_samples_per_second": 1384.339, + "eval_steps_per_second": 4.326, + "step": 13100 + }, + { + "epoch": 5.47, + "learning_rate": 1e-06, + "loss": 0.1734, + "step": 13120 + }, + { + "epoch": 5.47, + "eval_accuracy": 0.9310833333333334, + "eval_loss": 0.20263217389583588, + "eval_runtime": 15.8849, + "eval_samples_per_second": 1510.867, + "eval_steps_per_second": 4.721, + "step": 13120 + }, + { + "epoch": 5.47, + "learning_rate": 1e-06, + "loss": 0.1888, + "step": 13140 + }, + { + "epoch": 5.47, + "eval_accuracy": 0.9295833333333333, + "eval_loss": 0.20405107736587524, + "eval_runtime": 15.9922, + "eval_samples_per_second": 1500.728, + "eval_steps_per_second": 4.69, + "step": 13140 + }, + { + "epoch": 5.48, + "learning_rate": 1e-06, + "loss": 0.1676, + "step": 13160 + }, + { + "epoch": 5.48, + "eval_accuracy": 0.9306666666666666, + "eval_loss": 0.20299804210662842, + "eval_runtime": 15.9348, + "eval_samples_per_second": 1506.14, + "eval_steps_per_second": 4.707, + "step": 13160 + }, + { + "epoch": 5.49, + "learning_rate": 1e-06, + "loss": 0.2462, + "step": 13180 + }, + { + "epoch": 5.49, + "eval_accuracy": 0.9300833333333334, + "eval_loss": 0.20278708636760712, + "eval_runtime": 16.0737, + "eval_samples_per_second": 1493.119, + "eval_steps_per_second": 4.666, + "step": 13180 + }, + { + "epoch": 5.5, + "learning_rate": 1e-06, + "loss": 0.1751, + "step": 13200 + }, + { + "epoch": 5.5, + "eval_accuracy": 0.93125, + "eval_loss": 0.20159900188446045, + "eval_runtime": 15.4631, + "eval_samples_per_second": 1552.08, + "eval_steps_per_second": 4.85, + "step": 13200 + }, + { + "epoch": 5.51, + "learning_rate": 1e-06, + "loss": 0.2113, + "step": 13220 + }, + { + "epoch": 5.51, + "eval_accuracy": 0.929125, + "eval_loss": 0.20587190985679626, + "eval_runtime": 15.9217, + "eval_samples_per_second": 1507.375, + "eval_steps_per_second": 4.711, + "step": 13220 + }, + { + "epoch": 5.52, + "learning_rate": 1e-06, + "loss": 0.1884, + "step": 13240 + }, + { + "epoch": 5.52, + "eval_accuracy": 0.92875, + "eval_loss": 0.20635303854942322, + "eval_runtime": 15.8747, + "eval_samples_per_second": 1511.836, + "eval_steps_per_second": 4.724, + "step": 13240 + }, + { + "epoch": 5.53, + "learning_rate": 1e-06, + "loss": 0.1545, + "step": 13260 + }, + { + "epoch": 5.53, + "eval_accuracy": 0.9289166666666666, + "eval_loss": 0.20559687912464142, + "eval_runtime": 15.9786, + "eval_samples_per_second": 1502.007, + "eval_steps_per_second": 4.694, + "step": 13260 + }, + { + "epoch": 5.53, + "learning_rate": 1e-06, + "loss": 0.209, + "step": 13280 + }, + { + "epoch": 5.53, + "eval_accuracy": 0.9307916666666667, + "eval_loss": 0.20248527824878693, + "eval_runtime": 15.7154, + "eval_samples_per_second": 1527.166, + "eval_steps_per_second": 4.772, + "step": 13280 + }, + { + "epoch": 5.54, + "learning_rate": 1e-06, + "loss": 0.1843, + "step": 13300 + }, + { + "epoch": 5.54, + "eval_accuracy": 0.9307916666666667, + "eval_loss": 0.20135876536369324, + "eval_runtime": 16.3985, + "eval_samples_per_second": 1463.552, + "eval_steps_per_second": 4.574, + "step": 13300 + }, + { + "epoch": 5.55, + "learning_rate": 1e-06, + "loss": 0.2096, + "step": 13320 + }, + { + "epoch": 5.55, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.20034638047218323, + "eval_runtime": 15.8661, + "eval_samples_per_second": 1512.659, + "eval_steps_per_second": 4.727, + "step": 13320 + }, + { + "epoch": 5.56, + "learning_rate": 1e-06, + "loss": 0.1821, + "step": 13340 + }, + { + "epoch": 5.56, + "eval_accuracy": 0.9300416666666667, + "eval_loss": 0.20381678640842438, + "eval_runtime": 16.2244, + "eval_samples_per_second": 1479.257, + "eval_steps_per_second": 4.623, + "step": 13340 + }, + { + "epoch": 5.57, + "learning_rate": 1e-06, + "loss": 0.1898, + "step": 13360 + }, + { + "epoch": 5.57, + "eval_accuracy": 0.9309166666666666, + "eval_loss": 0.20166015625, + "eval_runtime": 15.944, + "eval_samples_per_second": 1505.264, + "eval_steps_per_second": 4.704, + "step": 13360 + }, + { + "epoch": 5.58, + "learning_rate": 1e-06, + "loss": 0.2068, + "step": 13380 + }, + { + "epoch": 5.58, + "eval_accuracy": 0.9309583333333333, + "eval_loss": 0.2012653946876526, + "eval_runtime": 16.2049, + "eval_samples_per_second": 1481.035, + "eval_steps_per_second": 4.628, + "step": 13380 + }, + { + "epoch": 5.58, + "learning_rate": 1e-06, + "loss": 0.1822, + "step": 13400 + }, + { + "epoch": 5.58, + "eval_accuracy": 0.931625, + "eval_loss": 0.20043647289276123, + "eval_runtime": 15.8256, + "eval_samples_per_second": 1516.532, + "eval_steps_per_second": 4.739, + "step": 13400 + }, + { + "epoch": 5.59, + "learning_rate": 1e-06, + "loss": 0.1974, + "step": 13420 + }, + { + "epoch": 5.59, + "eval_accuracy": 0.9320416666666667, + "eval_loss": 0.19964149594306946, + "eval_runtime": 15.6329, + "eval_samples_per_second": 1535.22, + "eval_steps_per_second": 4.798, + "step": 13420 + }, + { + "epoch": 5.6, + "learning_rate": 1e-06, + "loss": 0.1634, + "step": 13440 + }, + { + "epoch": 5.6, + "eval_accuracy": 0.93, + "eval_loss": 0.2027622014284134, + "eval_runtime": 15.6182, + "eval_samples_per_second": 1536.672, + "eval_steps_per_second": 4.802, + "step": 13440 + }, + { + "epoch": 5.61, + "learning_rate": 1e-06, + "loss": 0.2075, + "step": 13460 + }, + { + "epoch": 5.61, + "eval_accuracy": 0.92925, + "eval_loss": 0.2044484168291092, + "eval_runtime": 15.6933, + "eval_samples_per_second": 1529.315, + "eval_steps_per_second": 4.779, + "step": 13460 + }, + { + "epoch": 5.62, + "learning_rate": 1e-06, + "loss": 0.1775, + "step": 13480 + }, + { + "epoch": 5.62, + "eval_accuracy": 0.9289583333333333, + "eval_loss": 0.20532788336277008, + "eval_runtime": 15.7257, + "eval_samples_per_second": 1526.161, + "eval_steps_per_second": 4.769, + "step": 13480 + }, + { + "epoch": 5.62, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 13500 + }, + { + "epoch": 5.62, + "eval_accuracy": 0.9272083333333333, + "eval_loss": 0.2097957581281662, + "eval_runtime": 15.4877, + "eval_samples_per_second": 1549.617, + "eval_steps_per_second": 4.843, + "step": 13500 + }, + { + "epoch": 5.63, + "learning_rate": 1e-06, + "loss": 0.1702, + "step": 13520 + }, + { + "epoch": 5.63, + "eval_accuracy": 0.9284166666666667, + "eval_loss": 0.2071061134338379, + "eval_runtime": 15.7223, + "eval_samples_per_second": 1526.495, + "eval_steps_per_second": 4.77, + "step": 13520 + }, + { + "epoch": 5.64, + "learning_rate": 1e-06, + "loss": 0.1701, + "step": 13540 + }, + { + "epoch": 5.64, + "eval_accuracy": 0.9302083333333333, + "eval_loss": 0.205330029129982, + "eval_runtime": 16.1803, + "eval_samples_per_second": 1483.284, + "eval_steps_per_second": 4.635, + "step": 13540 + }, + { + "epoch": 5.65, + "learning_rate": 1e-06, + "loss": 0.2158, + "step": 13560 + }, + { + "epoch": 5.65, + "eval_accuracy": 0.929375, + "eval_loss": 0.20697665214538574, + "eval_runtime": 15.6178, + "eval_samples_per_second": 1536.706, + "eval_steps_per_second": 4.802, + "step": 13560 + }, + { + "epoch": 5.66, + "learning_rate": 1e-06, + "loss": 0.1622, + "step": 13580 + }, + { + "epoch": 5.66, + "eval_accuracy": 0.9305, + "eval_loss": 0.2030411958694458, + "eval_runtime": 15.9497, + "eval_samples_per_second": 1504.728, + "eval_steps_per_second": 4.702, + "step": 13580 + }, + { + "epoch": 5.67, + "learning_rate": 1e-06, + "loss": 0.1764, + "step": 13600 + }, + { + "epoch": 5.67, + "eval_accuracy": 0.9311666666666667, + "eval_loss": 0.2010955512523651, + "eval_runtime": 15.6626, + "eval_samples_per_second": 1532.316, + "eval_steps_per_second": 4.788, + "step": 13600 + }, + { + "epoch": 5.67, + "learning_rate": 1e-06, + "loss": 0.2191, + "step": 13620 + }, + { + "epoch": 5.67, + "eval_accuracy": 0.93125, + "eval_loss": 0.1993076354265213, + "eval_runtime": 16.3922, + "eval_samples_per_second": 1464.11, + "eval_steps_per_second": 4.575, + "step": 13620 + }, + { + "epoch": 5.68, + "learning_rate": 1e-06, + "loss": 0.1962, + "step": 13640 + }, + { + "epoch": 5.68, + "eval_accuracy": 0.931625, + "eval_loss": 0.19903969764709473, + "eval_runtime": 16.0762, + "eval_samples_per_second": 1492.89, + "eval_steps_per_second": 4.665, + "step": 13640 + }, + { + "epoch": 5.69, + "learning_rate": 1e-06, + "loss": 0.1954, + "step": 13660 + }, + { + "epoch": 5.69, + "eval_accuracy": 0.9315833333333333, + "eval_loss": 0.20040030777454376, + "eval_runtime": 16.0741, + "eval_samples_per_second": 1493.088, + "eval_steps_per_second": 4.666, + "step": 13660 + }, + { + "epoch": 5.7, + "learning_rate": 1e-06, + "loss": 0.1903, + "step": 13680 + }, + { + "epoch": 5.7, + "eval_accuracy": 0.9295, + "eval_loss": 0.20218642055988312, + "eval_runtime": 17.6769, + "eval_samples_per_second": 1357.707, + "eval_steps_per_second": 4.243, + "step": 13680 + }, + { + "epoch": 5.71, + "learning_rate": 1e-06, + "loss": 0.1867, + "step": 13700 + }, + { + "epoch": 5.71, + "eval_accuracy": 0.930375, + "eval_loss": 0.20176221430301666, + "eval_runtime": 16.5988, + "eval_samples_per_second": 1445.891, + "eval_steps_per_second": 4.518, + "step": 13700 + }, + { + "epoch": 5.72, + "learning_rate": 1e-06, + "loss": 0.1991, + "step": 13720 + }, + { + "epoch": 5.72, + "eval_accuracy": 0.93025, + "eval_loss": 0.20237615704536438, + "eval_runtime": 16.9965, + "eval_samples_per_second": 1412.055, + "eval_steps_per_second": 4.413, + "step": 13720 + }, + { + "epoch": 5.72, + "learning_rate": 1e-06, + "loss": 0.1646, + "step": 13740 + }, + { + "epoch": 5.72, + "eval_accuracy": 0.9304583333333334, + "eval_loss": 0.20343907177448273, + "eval_runtime": 17.7322, + "eval_samples_per_second": 1353.468, + "eval_steps_per_second": 4.23, + "step": 13740 + }, + { + "epoch": 5.73, + "learning_rate": 1e-06, + "loss": 0.2051, + "step": 13760 + }, + { + "epoch": 5.73, + "eval_accuracy": 0.9300416666666667, + "eval_loss": 0.2030051052570343, + "eval_runtime": 17.541, + "eval_samples_per_second": 1368.227, + "eval_steps_per_second": 4.276, + "step": 13760 + }, + { + "epoch": 5.74, + "learning_rate": 1e-06, + "loss": 0.1693, + "step": 13780 + }, + { + "epoch": 5.74, + "eval_accuracy": 0.931, + "eval_loss": 0.1998627781867981, + "eval_runtime": 16.6335, + "eval_samples_per_second": 1442.869, + "eval_steps_per_second": 4.509, + "step": 13780 + }, + { + "epoch": 5.75, + "learning_rate": 1e-06, + "loss": 0.1663, + "step": 13800 + }, + { + "epoch": 5.75, + "eval_accuracy": 0.9325, + "eval_loss": 0.1989893913269043, + "eval_runtime": 17.515, + "eval_samples_per_second": 1370.253, + "eval_steps_per_second": 4.282, + "step": 13800 + }, + { + "epoch": 5.76, + "learning_rate": 1e-06, + "loss": 0.1452, + "step": 13820 + }, + { + "epoch": 5.76, + "eval_accuracy": 0.9319583333333333, + "eval_loss": 0.1993263214826584, + "eval_runtime": 17.7237, + "eval_samples_per_second": 1354.116, + "eval_steps_per_second": 4.232, + "step": 13820 + }, + { + "epoch": 5.77, + "learning_rate": 1e-06, + "loss": 0.1535, + "step": 13840 + }, + { + "epoch": 5.77, + "eval_accuracy": 0.932125, + "eval_loss": 0.19914114475250244, + "eval_runtime": 17.0395, + "eval_samples_per_second": 1408.496, + "eval_steps_per_second": 4.402, + "step": 13840 + }, + { + "epoch": 5.78, + "learning_rate": 1e-06, + "loss": 0.2072, + "step": 13860 + }, + { + "epoch": 5.78, + "eval_accuracy": 0.9300833333333334, + "eval_loss": 0.20136070251464844, + "eval_runtime": 17.6045, + "eval_samples_per_second": 1363.289, + "eval_steps_per_second": 4.26, + "step": 13860 + }, + { + "epoch": 5.78, + "learning_rate": 1e-06, + "loss": 0.2069, + "step": 13880 + }, + { + "epoch": 5.78, + "eval_accuracy": 0.9298333333333333, + "eval_loss": 0.2024083137512207, + "eval_runtime": 17.3144, + "eval_samples_per_second": 1386.128, + "eval_steps_per_second": 4.332, + "step": 13880 + }, + { + "epoch": 5.79, + "learning_rate": 1e-06, + "loss": 0.2298, + "step": 13900 + }, + { + "epoch": 5.79, + "eval_accuracy": 0.93225, + "eval_loss": 0.19839055836200714, + "eval_runtime": 18.3284, + "eval_samples_per_second": 1309.44, + "eval_steps_per_second": 4.092, + "step": 13900 + }, + { + "epoch": 5.8, + "learning_rate": 1e-06, + "loss": 0.201, + "step": 13920 + }, + { + "epoch": 5.8, + "eval_accuracy": 0.932125, + "eval_loss": 0.1982399821281433, + "eval_runtime": 18.1938, + "eval_samples_per_second": 1319.131, + "eval_steps_per_second": 4.122, + "step": 13920 + }, + { + "epoch": 5.81, + "learning_rate": 1e-06, + "loss": 0.184, + "step": 13940 + }, + { + "epoch": 5.81, + "eval_accuracy": 0.9317083333333334, + "eval_loss": 0.1989286094903946, + "eval_runtime": 18.406, + "eval_samples_per_second": 1303.923, + "eval_steps_per_second": 4.075, + "step": 13940 + }, + { + "epoch": 5.82, + "learning_rate": 1e-06, + "loss": 0.1942, + "step": 13960 + }, + { + "epoch": 5.82, + "eval_accuracy": 0.9319166666666666, + "eval_loss": 0.19913487136363983, + "eval_runtime": 18.2526, + "eval_samples_per_second": 1314.879, + "eval_steps_per_second": 4.109, + "step": 13960 + }, + { + "epoch": 5.83, + "learning_rate": 1e-06, + "loss": 0.2085, + "step": 13980 + }, + { + "epoch": 5.83, + "eval_accuracy": 0.932375, + "eval_loss": 0.19867363572120667, + "eval_runtime": 17.554, + "eval_samples_per_second": 1367.21, + "eval_steps_per_second": 4.273, + "step": 13980 + }, + { + "epoch": 5.83, + "learning_rate": 1e-06, + "loss": 0.177, + "step": 14000 + }, + { + "epoch": 5.83, + "eval_accuracy": 0.9326666666666666, + "eval_loss": 0.19808466732501984, + "eval_runtime": 16.6018, + "eval_samples_per_second": 1445.627, + "eval_steps_per_second": 4.518, + "step": 14000 + }, + { + "epoch": 5.84, + "learning_rate": 1e-06, + "loss": 0.1746, + "step": 14020 + }, + { + "epoch": 5.84, + "eval_accuracy": 0.9325416666666667, + "eval_loss": 0.19782015681266785, + "eval_runtime": 17.9546, + "eval_samples_per_second": 1336.706, + "eval_steps_per_second": 4.177, + "step": 14020 + }, + { + "epoch": 5.85, + "learning_rate": 1e-06, + "loss": 0.1731, + "step": 14040 + }, + { + "epoch": 5.85, + "eval_accuracy": 0.9310416666666667, + "eval_loss": 0.20012834668159485, + "eval_runtime": 17.9142, + "eval_samples_per_second": 1339.719, + "eval_steps_per_second": 4.187, + "step": 14040 + }, + { + "epoch": 5.86, + "learning_rate": 1e-06, + "loss": 0.1888, + "step": 14060 + }, + { + "epoch": 5.86, + "eval_accuracy": 0.93075, + "eval_loss": 0.20140434801578522, + "eval_runtime": 18.4741, + "eval_samples_per_second": 1299.116, + "eval_steps_per_second": 4.06, + "step": 14060 + }, + { + "epoch": 5.87, + "learning_rate": 1e-06, + "loss": 0.1708, + "step": 14080 + }, + { + "epoch": 5.87, + "eval_accuracy": 0.9315416666666667, + "eval_loss": 0.19907809793949127, + "eval_runtime": 18.0588, + "eval_samples_per_second": 1328.991, + "eval_steps_per_second": 4.153, + "step": 14080 + }, + { + "epoch": 5.88, + "learning_rate": 1e-06, + "loss": 0.1927, + "step": 14100 + }, + { + "epoch": 5.88, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.2014717161655426, + "eval_runtime": 17.5458, + "eval_samples_per_second": 1367.846, + "eval_steps_per_second": 4.275, + "step": 14100 + }, + { + "epoch": 5.88, + "learning_rate": 1e-06, + "loss": 0.1756, + "step": 14120 + }, + { + "epoch": 5.88, + "eval_accuracy": 0.9304583333333334, + "eval_loss": 0.20116497576236725, + "eval_runtime": 16.0925, + "eval_samples_per_second": 1491.379, + "eval_steps_per_second": 4.661, + "step": 14120 + }, + { + "epoch": 5.89, + "learning_rate": 1e-06, + "loss": 0.1829, + "step": 14140 + }, + { + "epoch": 5.89, + "eval_accuracy": 0.9317083333333334, + "eval_loss": 0.2000524401664734, + "eval_runtime": 16.8659, + "eval_samples_per_second": 1422.991, + "eval_steps_per_second": 4.447, + "step": 14140 + }, + { + "epoch": 5.9, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 14160 + }, + { + "epoch": 5.9, + "eval_accuracy": 0.932375, + "eval_loss": 0.19840501248836517, + "eval_runtime": 16.5412, + "eval_samples_per_second": 1450.921, + "eval_steps_per_second": 4.534, + "step": 14160 + }, + { + "epoch": 5.91, + "learning_rate": 1e-06, + "loss": 0.1773, + "step": 14180 + }, + { + "epoch": 5.91, + "eval_accuracy": 0.9315416666666667, + "eval_loss": 0.20062309503555298, + "eval_runtime": 16.2967, + "eval_samples_per_second": 1472.691, + "eval_steps_per_second": 4.602, + "step": 14180 + }, + { + "epoch": 5.92, + "learning_rate": 1e-06, + "loss": 0.1639, + "step": 14200 + }, + { + "epoch": 5.92, + "eval_accuracy": 0.9306666666666666, + "eval_loss": 0.2011023461818695, + "eval_runtime": 16.2819, + "eval_samples_per_second": 1474.03, + "eval_steps_per_second": 4.606, + "step": 14200 + }, + { + "epoch": 5.92, + "learning_rate": 1e-06, + "loss": 0.2275, + "step": 14220 + }, + { + "epoch": 5.92, + "eval_accuracy": 0.9318333333333333, + "eval_loss": 0.19922685623168945, + "eval_runtime": 17.1405, + "eval_samples_per_second": 1400.191, + "eval_steps_per_second": 4.376, + "step": 14220 + }, + { + "epoch": 5.93, + "learning_rate": 1e-06, + "loss": 0.1934, + "step": 14240 + }, + { + "epoch": 5.93, + "eval_accuracy": 0.932125, + "eval_loss": 0.19886505603790283, + "eval_runtime": 16.0782, + "eval_samples_per_second": 1492.703, + "eval_steps_per_second": 4.665, + "step": 14240 + }, + { + "epoch": 5.94, + "learning_rate": 1e-06, + "loss": 0.1847, + "step": 14260 + }, + { + "epoch": 5.94, + "eval_accuracy": 0.9330833333333334, + "eval_loss": 0.19734573364257812, + "eval_runtime": 17.2888, + "eval_samples_per_second": 1388.183, + "eval_steps_per_second": 4.338, + "step": 14260 + }, + { + "epoch": 5.95, + "learning_rate": 1e-06, + "loss": 0.1572, + "step": 14280 + }, + { + "epoch": 5.95, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.1978907287120819, + "eval_runtime": 19.318, + "eval_samples_per_second": 1242.362, + "eval_steps_per_second": 3.882, + "step": 14280 + }, + { + "epoch": 5.96, + "learning_rate": 1e-06, + "loss": 0.2191, + "step": 14300 + }, + { + "epoch": 5.96, + "eval_accuracy": 0.9322083333333333, + "eval_loss": 0.19908125698566437, + "eval_runtime": 17.7053, + "eval_samples_per_second": 1355.53, + "eval_steps_per_second": 4.236, + "step": 14300 + }, + { + "epoch": 5.97, + "learning_rate": 1e-06, + "loss": 0.2022, + "step": 14320 + }, + { + "epoch": 5.97, + "eval_accuracy": 0.9321666666666667, + "eval_loss": 0.19820664823055267, + "eval_runtime": 17.7778, + "eval_samples_per_second": 1350.001, + "eval_steps_per_second": 4.219, + "step": 14320 + }, + { + "epoch": 5.97, + "learning_rate": 1e-06, + "loss": 0.1575, + "step": 14340 + }, + { + "epoch": 5.97, + "eval_accuracy": 0.9321666666666667, + "eval_loss": 0.19856464862823486, + "eval_runtime": 17.5559, + "eval_samples_per_second": 1367.059, + "eval_steps_per_second": 4.272, + "step": 14340 + }, + { + "epoch": 5.98, + "learning_rate": 1e-06, + "loss": 0.1826, + "step": 14360 + }, + { + "epoch": 5.98, + "eval_accuracy": 0.9322916666666666, + "eval_loss": 0.19818070530891418, + "eval_runtime": 17.7144, + "eval_samples_per_second": 1354.827, + "eval_steps_per_second": 4.234, + "step": 14360 + }, + { + "epoch": 5.99, + "learning_rate": 1e-06, + "loss": 0.1855, + "step": 14380 + }, + { + "epoch": 5.99, + "eval_accuracy": 0.9326666666666666, + "eval_loss": 0.19788843393325806, + "eval_runtime": 17.5812, + "eval_samples_per_second": 1365.094, + "eval_steps_per_second": 4.266, + "step": 14380 + }, + { + "epoch": 6.0, + "learning_rate": 1e-06, + "loss": 0.2027, + "step": 14400 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.9319583333333333, + "eval_loss": 0.19935990869998932, + "eval_runtime": 17.5236, + "eval_samples_per_second": 1369.581, + "eval_steps_per_second": 4.28, + "step": 14400 + }, + { + "epoch": 6.01, + "learning_rate": 1e-06, + "loss": 0.1585, + "step": 14420 + }, + { + "epoch": 6.01, + "eval_accuracy": 0.9324583333333333, + "eval_loss": 0.20092272758483887, + "eval_runtime": 17.5852, + "eval_samples_per_second": 1364.781, + "eval_steps_per_second": 4.265, + "step": 14420 + }, + { + "epoch": 6.02, + "learning_rate": 1e-06, + "loss": 0.2092, + "step": 14440 + }, + { + "epoch": 6.02, + "eval_accuracy": 0.9339583333333333, + "eval_loss": 0.1970815807580948, + "eval_runtime": 17.1214, + "eval_samples_per_second": 1401.757, + "eval_steps_per_second": 4.38, + "step": 14440 + }, + { + "epoch": 6.03, + "learning_rate": 1e-06, + "loss": 0.1773, + "step": 14460 + }, + { + "epoch": 6.03, + "eval_accuracy": 0.9334583333333333, + "eval_loss": 0.19676099717617035, + "eval_runtime": 16.7515, + "eval_samples_per_second": 1432.707, + "eval_steps_per_second": 4.477, + "step": 14460 + }, + { + "epoch": 6.03, + "learning_rate": 1e-06, + "loss": 0.186, + "step": 14480 + }, + { + "epoch": 6.03, + "eval_accuracy": 0.9323333333333333, + "eval_loss": 0.19818897545337677, + "eval_runtime": 18.0357, + "eval_samples_per_second": 1330.693, + "eval_steps_per_second": 4.158, + "step": 14480 + }, + { + "epoch": 6.04, + "learning_rate": 1e-06, + "loss": 0.1759, + "step": 14500 + }, + { + "epoch": 6.04, + "eval_accuracy": 0.9315, + "eval_loss": 0.19834274053573608, + "eval_runtime": 16.9189, + "eval_samples_per_second": 1418.535, + "eval_steps_per_second": 4.433, + "step": 14500 + }, + { + "epoch": 6.05, + "learning_rate": 1e-06, + "loss": 0.168, + "step": 14520 + }, + { + "epoch": 6.05, + "eval_accuracy": 0.9335416666666667, + "eval_loss": 0.19659017026424408, + "eval_runtime": 16.8237, + "eval_samples_per_second": 1426.555, + "eval_steps_per_second": 4.458, + "step": 14520 + }, + { + "epoch": 6.06, + "learning_rate": 1e-06, + "loss": 0.168, + "step": 14540 + }, + { + "epoch": 6.06, + "eval_accuracy": 0.9322083333333333, + "eval_loss": 0.1991969645023346, + "eval_runtime": 17.2701, + "eval_samples_per_second": 1389.683, + "eval_steps_per_second": 4.343, + "step": 14540 + }, + { + "epoch": 6.07, + "learning_rate": 1e-06, + "loss": 0.2085, + "step": 14560 + }, + { + "epoch": 6.07, + "eval_accuracy": 0.9324583333333333, + "eval_loss": 0.19824150204658508, + "eval_runtime": 17.4283, + "eval_samples_per_second": 1377.067, + "eval_steps_per_second": 4.303, + "step": 14560 + }, + { + "epoch": 6.08, + "learning_rate": 1e-06, + "loss": 0.1866, + "step": 14580 + }, + { + "epoch": 6.08, + "eval_accuracy": 0.93325, + "eval_loss": 0.19639739394187927, + "eval_runtime": 17.7717, + "eval_samples_per_second": 1350.465, + "eval_steps_per_second": 4.22, + "step": 14580 + }, + { + "epoch": 6.08, + "learning_rate": 1e-06, + "loss": 0.1772, + "step": 14600 + }, + { + "epoch": 6.08, + "eval_accuracy": 0.93325, + "eval_loss": 0.19745591282844543, + "eval_runtime": 16.2574, + "eval_samples_per_second": 1476.246, + "eval_steps_per_second": 4.613, + "step": 14600 + }, + { + "epoch": 6.09, + "learning_rate": 1e-06, + "loss": 0.1772, + "step": 14620 + }, + { + "epoch": 6.09, + "eval_accuracy": 0.933375, + "eval_loss": 0.1968727856874466, + "eval_runtime": 16.8404, + "eval_samples_per_second": 1425.142, + "eval_steps_per_second": 4.454, + "step": 14620 + }, + { + "epoch": 6.1, + "learning_rate": 1e-06, + "loss": 0.1442, + "step": 14640 + }, + { + "epoch": 6.1, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19765588641166687, + "eval_runtime": 16.5146, + "eval_samples_per_second": 1453.262, + "eval_steps_per_second": 4.541, + "step": 14640 + }, + { + "epoch": 6.11, + "learning_rate": 1e-06, + "loss": 0.1531, + "step": 14660 + }, + { + "epoch": 6.11, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19997546076774597, + "eval_runtime": 16.1908, + "eval_samples_per_second": 1482.324, + "eval_steps_per_second": 4.632, + "step": 14660 + }, + { + "epoch": 6.12, + "learning_rate": 1e-06, + "loss": 0.2187, + "step": 14680 + }, + { + "epoch": 6.12, + "eval_accuracy": 0.931625, + "eval_loss": 0.20033469796180725, + "eval_runtime": 16.76, + "eval_samples_per_second": 1431.985, + "eval_steps_per_second": 4.475, + "step": 14680 + }, + { + "epoch": 6.12, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 14700 + }, + { + "epoch": 6.12, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.2029111534357071, + "eval_runtime": 16.3811, + "eval_samples_per_second": 1465.1, + "eval_steps_per_second": 4.578, + "step": 14700 + }, + { + "epoch": 6.13, + "learning_rate": 1e-06, + "loss": 0.2018, + "step": 14720 + }, + { + "epoch": 6.13, + "eval_accuracy": 0.9309583333333333, + "eval_loss": 0.20074012875556946, + "eval_runtime": 16.4407, + "eval_samples_per_second": 1459.794, + "eval_steps_per_second": 4.562, + "step": 14720 + }, + { + "epoch": 6.14, + "learning_rate": 1e-06, + "loss": 0.1636, + "step": 14740 + }, + { + "epoch": 6.14, + "eval_accuracy": 0.9325833333333333, + "eval_loss": 0.1991911381483078, + "eval_runtime": 16.8493, + "eval_samples_per_second": 1424.389, + "eval_steps_per_second": 4.451, + "step": 14740 + }, + { + "epoch": 6.15, + "learning_rate": 1e-06, + "loss": 0.1622, + "step": 14760 + }, + { + "epoch": 6.15, + "eval_accuracy": 0.9322916666666666, + "eval_loss": 0.19850997626781464, + "eval_runtime": 16.3552, + "eval_samples_per_second": 1467.424, + "eval_steps_per_second": 4.586, + "step": 14760 + }, + { + "epoch": 6.16, + "learning_rate": 1e-06, + "loss": 0.1947, + "step": 14780 + }, + { + "epoch": 6.16, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.19700346887111664, + "eval_runtime": 16.8167, + "eval_samples_per_second": 1427.151, + "eval_steps_per_second": 4.46, + "step": 14780 + }, + { + "epoch": 6.17, + "learning_rate": 1e-06, + "loss": 0.198, + "step": 14800 + }, + { + "epoch": 6.17, + "eval_accuracy": 0.933875, + "eval_loss": 0.19635510444641113, + "eval_runtime": 17.4336, + "eval_samples_per_second": 1376.652, + "eval_steps_per_second": 4.302, + "step": 14800 + }, + { + "epoch": 6.17, + "learning_rate": 1e-06, + "loss": 0.2261, + "step": 14820 + }, + { + "epoch": 6.17, + "eval_accuracy": 0.9331666666666667, + "eval_loss": 0.19697488844394684, + "eval_runtime": 16.4872, + "eval_samples_per_second": 1455.677, + "eval_steps_per_second": 4.549, + "step": 14820 + }, + { + "epoch": 6.18, + "learning_rate": 1e-06, + "loss": 0.2095, + "step": 14840 + }, + { + "epoch": 6.18, + "eval_accuracy": 0.9303333333333333, + "eval_loss": 0.2015983760356903, + "eval_runtime": 16.9074, + "eval_samples_per_second": 1419.494, + "eval_steps_per_second": 4.436, + "step": 14840 + }, + { + "epoch": 6.19, + "learning_rate": 1e-06, + "loss": 0.1589, + "step": 14860 + }, + { + "epoch": 6.19, + "eval_accuracy": 0.9330833333333334, + "eval_loss": 0.19708770513534546, + "eval_runtime": 16.2449, + "eval_samples_per_second": 1477.385, + "eval_steps_per_second": 4.617, + "step": 14860 + }, + { + "epoch": 6.2, + "learning_rate": 1e-06, + "loss": 0.191, + "step": 14880 + }, + { + "epoch": 6.2, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19845397770404816, + "eval_runtime": 17.192, + "eval_samples_per_second": 1395.999, + "eval_steps_per_second": 4.362, + "step": 14880 + }, + { + "epoch": 6.21, + "learning_rate": 1e-06, + "loss": 0.1484, + "step": 14900 + }, + { + "epoch": 6.21, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19584734737873077, + "eval_runtime": 17.8255, + "eval_samples_per_second": 1346.386, + "eval_steps_per_second": 4.207, + "step": 14900 + }, + { + "epoch": 6.22, + "learning_rate": 1e-06, + "loss": 0.1791, + "step": 14920 + }, + { + "epoch": 6.22, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19735968112945557, + "eval_runtime": 17.6186, + "eval_samples_per_second": 1362.195, + "eval_steps_per_second": 4.257, + "step": 14920 + }, + { + "epoch": 6.22, + "learning_rate": 1e-06, + "loss": 0.2077, + "step": 14940 + }, + { + "epoch": 6.22, + "eval_accuracy": 0.932375, + "eval_loss": 0.19711482524871826, + "eval_runtime": 17.4097, + "eval_samples_per_second": 1378.545, + "eval_steps_per_second": 4.308, + "step": 14940 + }, + { + "epoch": 6.23, + "learning_rate": 1e-06, + "loss": 0.1918, + "step": 14960 + }, + { + "epoch": 6.23, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19635051488876343, + "eval_runtime": 16.6079, + "eval_samples_per_second": 1445.094, + "eval_steps_per_second": 4.516, + "step": 14960 + }, + { + "epoch": 6.24, + "learning_rate": 1e-06, + "loss": 0.2291, + "step": 14980 + }, + { + "epoch": 6.24, + "eval_accuracy": 0.9330833333333334, + "eval_loss": 0.19559039175510406, + "eval_runtime": 16.5817, + "eval_samples_per_second": 1447.382, + "eval_steps_per_second": 4.523, + "step": 14980 + }, + { + "epoch": 6.25, + "learning_rate": 1e-06, + "loss": 0.1597, + "step": 15000 + }, + { + "epoch": 6.25, + "eval_accuracy": 0.9320833333333334, + "eval_loss": 0.1965901404619217, + "eval_runtime": 16.2198, + "eval_samples_per_second": 1479.673, + "eval_steps_per_second": 4.624, + "step": 15000 + }, + { + "epoch": 6.26, + "learning_rate": 1e-06, + "loss": 0.1844, + "step": 15020 + }, + { + "epoch": 6.26, + "eval_accuracy": 0.9323333333333333, + "eval_loss": 0.19741342961788177, + "eval_runtime": 16.1794, + "eval_samples_per_second": 1483.371, + "eval_steps_per_second": 4.636, + "step": 15020 + }, + { + "epoch": 6.27, + "learning_rate": 1e-06, + "loss": 0.1653, + "step": 15040 + }, + { + "epoch": 6.27, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.1963772028684616, + "eval_runtime": 16.9569, + "eval_samples_per_second": 1415.356, + "eval_steps_per_second": 4.423, + "step": 15040 + }, + { + "epoch": 6.28, + "learning_rate": 1e-06, + "loss": 0.158, + "step": 15060 + }, + { + "epoch": 6.28, + "eval_accuracy": 0.9309583333333333, + "eval_loss": 0.20028692483901978, + "eval_runtime": 16.1902, + "eval_samples_per_second": 1482.38, + "eval_steps_per_second": 4.632, + "step": 15060 + }, + { + "epoch": 6.28, + "learning_rate": 1e-06, + "loss": 0.1602, + "step": 15080 + }, + { + "epoch": 6.28, + "eval_accuracy": 0.932625, + "eval_loss": 0.19667039811611176, + "eval_runtime": 16.4266, + "eval_samples_per_second": 1461.041, + "eval_steps_per_second": 4.566, + "step": 15080 + }, + { + "epoch": 6.29, + "learning_rate": 1e-06, + "loss": 0.1656, + "step": 15100 + }, + { + "epoch": 6.29, + "eval_accuracy": 0.9329166666666666, + "eval_loss": 0.19664861261844635, + "eval_runtime": 17.1479, + "eval_samples_per_second": 1399.589, + "eval_steps_per_second": 4.374, + "step": 15100 + }, + { + "epoch": 6.3, + "learning_rate": 1e-06, + "loss": 0.1691, + "step": 15120 + }, + { + "epoch": 6.3, + "eval_accuracy": 0.9322916666666666, + "eval_loss": 0.19764386117458344, + "eval_runtime": 17.5239, + "eval_samples_per_second": 1369.555, + "eval_steps_per_second": 4.28, + "step": 15120 + }, + { + "epoch": 6.31, + "learning_rate": 1e-06, + "loss": 0.1598, + "step": 15140 + }, + { + "epoch": 6.31, + "eval_accuracy": 0.9333333333333333, + "eval_loss": 0.1968608945608139, + "eval_runtime": 17.6137, + "eval_samples_per_second": 1362.576, + "eval_steps_per_second": 4.258, + "step": 15140 + }, + { + "epoch": 6.32, + "learning_rate": 1e-06, + "loss": 0.1768, + "step": 15160 + }, + { + "epoch": 6.32, + "eval_accuracy": 0.9334583333333333, + "eval_loss": 0.19633811712265015, + "eval_runtime": 16.2937, + "eval_samples_per_second": 1472.96, + "eval_steps_per_second": 4.603, + "step": 15160 + }, + { + "epoch": 6.33, + "learning_rate": 1e-06, + "loss": 0.1653, + "step": 15180 + }, + { + "epoch": 6.33, + "eval_accuracy": 0.9334166666666667, + "eval_loss": 0.1963397115468979, + "eval_runtime": 16.377, + "eval_samples_per_second": 1465.471, + "eval_steps_per_second": 4.58, + "step": 15180 + }, + { + "epoch": 6.33, + "learning_rate": 1e-06, + "loss": 0.1822, + "step": 15200 + }, + { + "epoch": 6.33, + "eval_accuracy": 0.9312916666666666, + "eval_loss": 0.20008063316345215, + "eval_runtime": 16.4032, + "eval_samples_per_second": 1463.126, + "eval_steps_per_second": 4.572, + "step": 15200 + }, + { + "epoch": 6.34, + "learning_rate": 1e-06, + "loss": 0.1895, + "step": 15220 + }, + { + "epoch": 6.34, + "eval_accuracy": 0.9325416666666667, + "eval_loss": 0.19740186631679535, + "eval_runtime": 17.3348, + "eval_samples_per_second": 1384.502, + "eval_steps_per_second": 4.327, + "step": 15220 + }, + { + "epoch": 6.35, + "learning_rate": 1e-06, + "loss": 0.1622, + "step": 15240 + }, + { + "epoch": 6.35, + "eval_accuracy": 0.9315, + "eval_loss": 0.19918110966682434, + "eval_runtime": 16.9401, + "eval_samples_per_second": 1416.753, + "eval_steps_per_second": 4.427, + "step": 15240 + }, + { + "epoch": 6.36, + "learning_rate": 1e-06, + "loss": 0.1982, + "step": 15260 + }, + { + "epoch": 6.36, + "eval_accuracy": 0.93225, + "eval_loss": 0.19843068718910217, + "eval_runtime": 17.3631, + "eval_samples_per_second": 1382.241, + "eval_steps_per_second": 4.32, + "step": 15260 + }, + { + "epoch": 6.37, + "learning_rate": 1e-06, + "loss": 0.172, + "step": 15280 + }, + { + "epoch": 6.37, + "eval_accuracy": 0.9322916666666666, + "eval_loss": 0.19916561245918274, + "eval_runtime": 17.0127, + "eval_samples_per_second": 1410.711, + "eval_steps_per_second": 4.408, + "step": 15280 + }, + { + "epoch": 6.38, + "learning_rate": 1e-06, + "loss": 0.1573, + "step": 15300 + }, + { + "epoch": 6.38, + "eval_accuracy": 0.9306666666666666, + "eval_loss": 0.20143656432628632, + "eval_runtime": 17.4933, + "eval_samples_per_second": 1371.953, + "eval_steps_per_second": 4.287, + "step": 15300 + }, + { + "epoch": 6.38, + "learning_rate": 1e-06, + "loss": 0.158, + "step": 15320 + }, + { + "epoch": 6.38, + "eval_accuracy": 0.9324583333333333, + "eval_loss": 0.19875669479370117, + "eval_runtime": 16.9838, + "eval_samples_per_second": 1413.115, + "eval_steps_per_second": 4.416, + "step": 15320 + }, + { + "epoch": 6.39, + "learning_rate": 1e-06, + "loss": 0.1359, + "step": 15340 + }, + { + "epoch": 6.39, + "eval_accuracy": 0.9317916666666667, + "eval_loss": 0.19885526597499847, + "eval_runtime": 16.3343, + "eval_samples_per_second": 1469.303, + "eval_steps_per_second": 4.592, + "step": 15340 + }, + { + "epoch": 6.4, + "learning_rate": 1e-06, + "loss": 0.2088, + "step": 15360 + }, + { + "epoch": 6.4, + "eval_accuracy": 0.931625, + "eval_loss": 0.19839046895503998, + "eval_runtime": 16.2551, + "eval_samples_per_second": 1476.461, + "eval_steps_per_second": 4.614, + "step": 15360 + }, + { + "epoch": 6.41, + "learning_rate": 1e-06, + "loss": 0.1952, + "step": 15380 + }, + { + "epoch": 6.41, + "eval_accuracy": 0.9315833333333333, + "eval_loss": 0.1983661949634552, + "eval_runtime": 16.7153, + "eval_samples_per_second": 1435.806, + "eval_steps_per_second": 4.487, + "step": 15380 + }, + { + "epoch": 6.42, + "learning_rate": 1e-06, + "loss": 0.1516, + "step": 15400 + }, + { + "epoch": 6.42, + "eval_accuracy": 0.9327083333333334, + "eval_loss": 0.1965571790933609, + "eval_runtime": 16.5815, + "eval_samples_per_second": 1447.4, + "eval_steps_per_second": 4.523, + "step": 15400 + }, + { + "epoch": 6.42, + "learning_rate": 1e-06, + "loss": 0.2063, + "step": 15420 + }, + { + "epoch": 6.42, + "eval_accuracy": 0.9333333333333333, + "eval_loss": 0.19558171927928925, + "eval_runtime": 17.0847, + "eval_samples_per_second": 1404.766, + "eval_steps_per_second": 4.39, + "step": 15420 + }, + { + "epoch": 6.43, + "learning_rate": 1e-06, + "loss": 0.1766, + "step": 15440 + }, + { + "epoch": 6.43, + "eval_accuracy": 0.932875, + "eval_loss": 0.19576992094516754, + "eval_runtime": 17.3112, + "eval_samples_per_second": 1386.388, + "eval_steps_per_second": 4.332, + "step": 15440 + }, + { + "epoch": 6.44, + "learning_rate": 1e-06, + "loss": 0.1711, + "step": 15460 + }, + { + "epoch": 6.44, + "eval_accuracy": 0.93275, + "eval_loss": 0.1965101659297943, + "eval_runtime": 16.7762, + "eval_samples_per_second": 1430.595, + "eval_steps_per_second": 4.471, + "step": 15460 + }, + { + "epoch": 6.45, + "learning_rate": 1e-06, + "loss": 0.1621, + "step": 15480 + }, + { + "epoch": 6.45, + "eval_accuracy": 0.932375, + "eval_loss": 0.19722963869571686, + "eval_runtime": 16.5923, + "eval_samples_per_second": 1446.456, + "eval_steps_per_second": 4.52, + "step": 15480 + }, + { + "epoch": 6.46, + "learning_rate": 1e-06, + "loss": 0.1854, + "step": 15500 + }, + { + "epoch": 6.46, + "eval_accuracy": 0.9300833333333334, + "eval_loss": 0.20203134417533875, + "eval_runtime": 16.701, + "eval_samples_per_second": 1437.04, + "eval_steps_per_second": 4.491, + "step": 15500 + }, + { + "epoch": 6.47, + "learning_rate": 1e-06, + "loss": 0.1731, + "step": 15520 + }, + { + "epoch": 6.47, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.19797883927822113, + "eval_runtime": 17.3415, + "eval_samples_per_second": 1383.962, + "eval_steps_per_second": 4.325, + "step": 15520 + }, + { + "epoch": 6.47, + "learning_rate": 1e-06, + "loss": 0.1948, + "step": 15540 + }, + { + "epoch": 6.47, + "eval_accuracy": 0.9307916666666667, + "eval_loss": 0.20005568861961365, + "eval_runtime": 17.5386, + "eval_samples_per_second": 1368.409, + "eval_steps_per_second": 4.276, + "step": 15540 + }, + { + "epoch": 6.48, + "learning_rate": 1e-06, + "loss": 0.2325, + "step": 15560 + }, + { + "epoch": 6.48, + "eval_accuracy": 0.93075, + "eval_loss": 0.1986798346042633, + "eval_runtime": 17.0336, + "eval_samples_per_second": 1408.979, + "eval_steps_per_second": 4.403, + "step": 15560 + }, + { + "epoch": 6.49, + "learning_rate": 1e-06, + "loss": 0.1913, + "step": 15580 + }, + { + "epoch": 6.49, + "eval_accuracy": 0.9315833333333333, + "eval_loss": 0.19669051468372345, + "eval_runtime": 16.2579, + "eval_samples_per_second": 1476.206, + "eval_steps_per_second": 4.613, + "step": 15580 + }, + { + "epoch": 6.5, + "learning_rate": 1e-06, + "loss": 0.1755, + "step": 15600 + }, + { + "epoch": 6.5, + "eval_accuracy": 0.9321666666666667, + "eval_loss": 0.1981978416442871, + "eval_runtime": 16.3857, + "eval_samples_per_second": 1464.688, + "eval_steps_per_second": 4.577, + "step": 15600 + }, + { + "epoch": 6.51, + "learning_rate": 1e-06, + "loss": 0.1944, + "step": 15620 + }, + { + "epoch": 6.51, + "eval_accuracy": 0.9316666666666666, + "eval_loss": 0.20202693343162537, + "eval_runtime": 16.8881, + "eval_samples_per_second": 1421.117, + "eval_steps_per_second": 4.441, + "step": 15620 + }, + { + "epoch": 6.52, + "learning_rate": 1e-06, + "loss": 0.1487, + "step": 15640 + }, + { + "epoch": 6.52, + "eval_accuracy": 0.9341666666666667, + "eval_loss": 0.19385740160942078, + "eval_runtime": 16.5171, + "eval_samples_per_second": 1453.042, + "eval_steps_per_second": 4.541, + "step": 15640 + }, + { + "epoch": 6.53, + "learning_rate": 1e-06, + "loss": 0.1854, + "step": 15660 + }, + { + "epoch": 6.53, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19531066715717316, + "eval_runtime": 17.796, + "eval_samples_per_second": 1348.616, + "eval_steps_per_second": 4.214, + "step": 15660 + }, + { + "epoch": 6.53, + "learning_rate": 1e-06, + "loss": 0.1331, + "step": 15680 + }, + { + "epoch": 6.53, + "eval_accuracy": 0.933125, + "eval_loss": 0.1954515278339386, + "eval_runtime": 17.1458, + "eval_samples_per_second": 1399.762, + "eval_steps_per_second": 4.374, + "step": 15680 + }, + { + "epoch": 6.54, + "learning_rate": 1e-06, + "loss": 0.2017, + "step": 15700 + }, + { + "epoch": 6.54, + "eval_accuracy": 0.93275, + "eval_loss": 0.19526571035385132, + "eval_runtime": 17.0288, + "eval_samples_per_second": 1409.381, + "eval_steps_per_second": 4.404, + "step": 15700 + }, + { + "epoch": 6.55, + "learning_rate": 1e-06, + "loss": 0.1507, + "step": 15720 + }, + { + "epoch": 6.55, + "eval_accuracy": 0.9327083333333334, + "eval_loss": 0.19576336443424225, + "eval_runtime": 16.53, + "eval_samples_per_second": 1451.904, + "eval_steps_per_second": 4.537, + "step": 15720 + }, + { + "epoch": 6.56, + "learning_rate": 1e-06, + "loss": 0.1459, + "step": 15740 + }, + { + "epoch": 6.56, + "eval_accuracy": 0.9308333333333333, + "eval_loss": 0.19955700635910034, + "eval_runtime": 16.5253, + "eval_samples_per_second": 1452.316, + "eval_steps_per_second": 4.538, + "step": 15740 + }, + { + "epoch": 6.57, + "learning_rate": 1e-06, + "loss": 0.1585, + "step": 15760 + }, + { + "epoch": 6.57, + "eval_accuracy": 0.932625, + "eval_loss": 0.1977219432592392, + "eval_runtime": 16.8034, + "eval_samples_per_second": 1428.285, + "eval_steps_per_second": 4.463, + "step": 15760 + }, + { + "epoch": 6.58, + "learning_rate": 1e-06, + "loss": 0.1814, + "step": 15780 + }, + { + "epoch": 6.58, + "eval_accuracy": 0.93375, + "eval_loss": 0.19515223801136017, + "eval_runtime": 17.0293, + "eval_samples_per_second": 1409.337, + "eval_steps_per_second": 4.404, + "step": 15780 + }, + { + "epoch": 6.58, + "learning_rate": 1e-06, + "loss": 0.1383, + "step": 15800 + }, + { + "epoch": 6.58, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.19543632864952087, + "eval_runtime": 17.0436, + "eval_samples_per_second": 1408.152, + "eval_steps_per_second": 4.4, + "step": 15800 + }, + { + "epoch": 6.59, + "learning_rate": 1e-06, + "loss": 0.1981, + "step": 15820 + }, + { + "epoch": 6.59, + "eval_accuracy": 0.9323333333333333, + "eval_loss": 0.19736984372138977, + "eval_runtime": 16.4411, + "eval_samples_per_second": 1459.753, + "eval_steps_per_second": 4.562, + "step": 15820 + }, + { + "epoch": 6.6, + "learning_rate": 1e-06, + "loss": 0.1715, + "step": 15840 + }, + { + "epoch": 6.6, + "eval_accuracy": 0.93225, + "eval_loss": 0.1977250874042511, + "eval_runtime": 16.9027, + "eval_samples_per_second": 1419.887, + "eval_steps_per_second": 4.437, + "step": 15840 + }, + { + "epoch": 6.61, + "learning_rate": 1e-06, + "loss": 0.1392, + "step": 15860 + }, + { + "epoch": 6.61, + "eval_accuracy": 0.9314166666666667, + "eval_loss": 0.19858193397521973, + "eval_runtime": 16.2175, + "eval_samples_per_second": 1479.884, + "eval_steps_per_second": 4.625, + "step": 15860 + }, + { + "epoch": 6.62, + "learning_rate": 1e-06, + "loss": 0.1587, + "step": 15880 + }, + { + "epoch": 6.62, + "eval_accuracy": 0.93225, + "eval_loss": 0.19824790954589844, + "eval_runtime": 16.4062, + "eval_samples_per_second": 1462.863, + "eval_steps_per_second": 4.571, + "step": 15880 + }, + { + "epoch": 6.62, + "learning_rate": 1e-06, + "loss": 0.1697, + "step": 15900 + }, + { + "epoch": 6.62, + "eval_accuracy": 0.9299583333333333, + "eval_loss": 0.2014349400997162, + "eval_runtime": 16.3856, + "eval_samples_per_second": 1464.702, + "eval_steps_per_second": 4.577, + "step": 15900 + }, + { + "epoch": 6.63, + "learning_rate": 1e-06, + "loss": 0.1861, + "step": 15920 + }, + { + "epoch": 6.63, + "eval_accuracy": 0.9314166666666667, + "eval_loss": 0.1988927125930786, + "eval_runtime": 16.3893, + "eval_samples_per_second": 1464.366, + "eval_steps_per_second": 4.576, + "step": 15920 + }, + { + "epoch": 6.64, + "learning_rate": 1e-06, + "loss": 0.174, + "step": 15940 + }, + { + "epoch": 6.64, + "eval_accuracy": 0.931375, + "eval_loss": 0.1998647302389145, + "eval_runtime": 16.7731, + "eval_samples_per_second": 1430.861, + "eval_steps_per_second": 4.471, + "step": 15940 + }, + { + "epoch": 6.65, + "learning_rate": 1e-06, + "loss": 0.1641, + "step": 15960 + }, + { + "epoch": 6.65, + "eval_accuracy": 0.9314166666666667, + "eval_loss": 0.20076820254325867, + "eval_runtime": 16.2481, + "eval_samples_per_second": 1477.098, + "eval_steps_per_second": 4.616, + "step": 15960 + }, + { + "epoch": 6.66, + "learning_rate": 1e-06, + "loss": 0.1408, + "step": 15980 + }, + { + "epoch": 6.66, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.19549201428890228, + "eval_runtime": 16.2222, + "eval_samples_per_second": 1479.458, + "eval_steps_per_second": 4.623, + "step": 15980 + }, + { + "epoch": 6.67, + "learning_rate": 1e-06, + "loss": 0.1878, + "step": 16000 + }, + { + "epoch": 6.67, + "eval_accuracy": 0.9340833333333334, + "eval_loss": 0.1959511786699295, + "eval_runtime": 16.3546, + "eval_samples_per_second": 1467.474, + "eval_steps_per_second": 4.586, + "step": 16000 + }, + { + "epoch": 6.67, + "learning_rate": 1e-06, + "loss": 0.1425, + "step": 16020 + }, + { + "epoch": 6.67, + "eval_accuracy": 0.9337916666666667, + "eval_loss": 0.1953056901693344, + "eval_runtime": 16.458, + "eval_samples_per_second": 1458.255, + "eval_steps_per_second": 4.557, + "step": 16020 + }, + { + "epoch": 6.68, + "learning_rate": 1e-06, + "loss": 0.184, + "step": 16040 + }, + { + "epoch": 6.68, + "eval_accuracy": 0.931625, + "eval_loss": 0.19988182187080383, + "eval_runtime": 17.0453, + "eval_samples_per_second": 1408.016, + "eval_steps_per_second": 4.4, + "step": 16040 + }, + { + "epoch": 6.69, + "learning_rate": 1e-06, + "loss": 0.1609, + "step": 16060 + }, + { + "epoch": 6.69, + "eval_accuracy": 0.9342916666666666, + "eval_loss": 0.19486786425113678, + "eval_runtime": 16.3548, + "eval_samples_per_second": 1467.457, + "eval_steps_per_second": 4.586, + "step": 16060 + }, + { + "epoch": 6.7, + "learning_rate": 1e-06, + "loss": 0.1757, + "step": 16080 + }, + { + "epoch": 6.7, + "eval_accuracy": 0.93375, + "eval_loss": 0.195814847946167, + "eval_runtime": 16.6686, + "eval_samples_per_second": 1439.835, + "eval_steps_per_second": 4.499, + "step": 16080 + }, + { + "epoch": 6.71, + "learning_rate": 1e-06, + "loss": 0.1657, + "step": 16100 + }, + { + "epoch": 6.71, + "eval_accuracy": 0.9322083333333333, + "eval_loss": 0.19810351729393005, + "eval_runtime": 17.5937, + "eval_samples_per_second": 1364.125, + "eval_steps_per_second": 4.263, + "step": 16100 + }, + { + "epoch": 6.72, + "learning_rate": 1e-06, + "loss": 0.1496, + "step": 16120 + }, + { + "epoch": 6.72, + "eval_accuracy": 0.933875, + "eval_loss": 0.19454562664031982, + "eval_runtime": 17.5068, + "eval_samples_per_second": 1370.9, + "eval_steps_per_second": 4.284, + "step": 16120 + }, + { + "epoch": 6.72, + "learning_rate": 1e-06, + "loss": 0.2053, + "step": 16140 + }, + { + "epoch": 6.72, + "eval_accuracy": 0.9334166666666667, + "eval_loss": 0.19530512392520905, + "eval_runtime": 17.6899, + "eval_samples_per_second": 1356.703, + "eval_steps_per_second": 4.24, + "step": 16140 + }, + { + "epoch": 6.73, + "learning_rate": 1e-06, + "loss": 0.1905, + "step": 16160 + }, + { + "epoch": 6.73, + "eval_accuracy": 0.933375, + "eval_loss": 0.19494038820266724, + "eval_runtime": 17.39, + "eval_samples_per_second": 1380.102, + "eval_steps_per_second": 4.313, + "step": 16160 + }, + { + "epoch": 6.74, + "learning_rate": 1e-06, + "loss": 0.1515, + "step": 16180 + }, + { + "epoch": 6.74, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.19500760734081268, + "eval_runtime": 17.8113, + "eval_samples_per_second": 1347.457, + "eval_steps_per_second": 4.211, + "step": 16180 + }, + { + "epoch": 6.75, + "learning_rate": 1e-06, + "loss": 0.1705, + "step": 16200 + }, + { + "epoch": 6.75, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19622166454792023, + "eval_runtime": 17.3835, + "eval_samples_per_second": 1380.617, + "eval_steps_per_second": 4.314, + "step": 16200 + }, + { + "epoch": 6.76, + "learning_rate": 1e-06, + "loss": 0.1756, + "step": 16220 + }, + { + "epoch": 6.76, + "eval_accuracy": 0.9332916666666666, + "eval_loss": 0.1943114697933197, + "eval_runtime": 17.2208, + "eval_samples_per_second": 1393.664, + "eval_steps_per_second": 4.355, + "step": 16220 + }, + { + "epoch": 6.77, + "learning_rate": 1e-06, + "loss": 0.1729, + "step": 16240 + }, + { + "epoch": 6.77, + "eval_accuracy": 0.9343333333333333, + "eval_loss": 0.19441558420658112, + "eval_runtime": 17.5483, + "eval_samples_per_second": 1367.653, + "eval_steps_per_second": 4.274, + "step": 16240 + }, + { + "epoch": 6.78, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 16260 + }, + { + "epoch": 6.78, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.1942491978406906, + "eval_runtime": 16.3394, + "eval_samples_per_second": 1468.842, + "eval_steps_per_second": 4.59, + "step": 16260 + }, + { + "epoch": 6.78, + "learning_rate": 1e-06, + "loss": 0.1684, + "step": 16280 + }, + { + "epoch": 6.78, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.1948528289794922, + "eval_runtime": 16.0087, + "eval_samples_per_second": 1499.183, + "eval_steps_per_second": 4.685, + "step": 16280 + }, + { + "epoch": 6.79, + "learning_rate": 1e-06, + "loss": 0.1756, + "step": 16300 + }, + { + "epoch": 6.79, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19495390355587006, + "eval_runtime": 16.737, + "eval_samples_per_second": 1433.945, + "eval_steps_per_second": 4.481, + "step": 16300 + }, + { + "epoch": 6.8, + "learning_rate": 1e-06, + "loss": 0.1706, + "step": 16320 + }, + { + "epoch": 6.8, + "eval_accuracy": 0.9318333333333333, + "eval_loss": 0.19855649769306183, + "eval_runtime": 16.285, + "eval_samples_per_second": 1473.752, + "eval_steps_per_second": 4.605, + "step": 16320 + }, + { + "epoch": 6.81, + "learning_rate": 1e-06, + "loss": 0.1655, + "step": 16340 + }, + { + "epoch": 6.81, + "eval_accuracy": 0.9319166666666666, + "eval_loss": 0.19920918345451355, + "eval_runtime": 17.3955, + "eval_samples_per_second": 1379.671, + "eval_steps_per_second": 4.311, + "step": 16340 + }, + { + "epoch": 6.82, + "learning_rate": 1e-06, + "loss": 0.1858, + "step": 16360 + }, + { + "epoch": 6.82, + "eval_accuracy": 0.9325, + "eval_loss": 0.19792011380195618, + "eval_runtime": 17.319, + "eval_samples_per_second": 1385.762, + "eval_steps_per_second": 4.331, + "step": 16360 + }, + { + "epoch": 6.83, + "learning_rate": 1e-06, + "loss": 0.1926, + "step": 16380 + }, + { + "epoch": 6.83, + "eval_accuracy": 0.9332083333333333, + "eval_loss": 0.19599467515945435, + "eval_runtime": 16.5773, + "eval_samples_per_second": 1447.759, + "eval_steps_per_second": 4.524, + "step": 16380 + }, + { + "epoch": 6.83, + "learning_rate": 1e-06, + "loss": 0.162, + "step": 16400 + }, + { + "epoch": 6.83, + "eval_accuracy": 0.9325, + "eval_loss": 0.1975654661655426, + "eval_runtime": 16.0954, + "eval_samples_per_second": 1491.106, + "eval_steps_per_second": 4.66, + "step": 16400 + }, + { + "epoch": 6.84, + "learning_rate": 1e-06, + "loss": 0.2168, + "step": 16420 + }, + { + "epoch": 6.84, + "eval_accuracy": 0.9322083333333333, + "eval_loss": 0.19847214221954346, + "eval_runtime": 16.3923, + "eval_samples_per_second": 1464.098, + "eval_steps_per_second": 4.575, + "step": 16420 + }, + { + "epoch": 6.85, + "learning_rate": 1e-06, + "loss": 0.173, + "step": 16440 + }, + { + "epoch": 6.85, + "eval_accuracy": 0.9312916666666666, + "eval_loss": 0.2017899602651596, + "eval_runtime": 16.2391, + "eval_samples_per_second": 1477.915, + "eval_steps_per_second": 4.618, + "step": 16440 + }, + { + "epoch": 6.86, + "learning_rate": 1e-06, + "loss": 0.1891, + "step": 16460 + }, + { + "epoch": 6.86, + "eval_accuracy": 0.93225, + "eval_loss": 0.19827592372894287, + "eval_runtime": 16.3796, + "eval_samples_per_second": 1465.236, + "eval_steps_per_second": 4.579, + "step": 16460 + }, + { + "epoch": 6.87, + "learning_rate": 1e-06, + "loss": 0.1619, + "step": 16480 + }, + { + "epoch": 6.87, + "eval_accuracy": 0.9332916666666666, + "eval_loss": 0.19625312089920044, + "eval_runtime": 17.7432, + "eval_samples_per_second": 1352.634, + "eval_steps_per_second": 4.227, + "step": 16480 + }, + { + "epoch": 6.88, + "learning_rate": 1e-06, + "loss": 0.1884, + "step": 16500 + }, + { + "epoch": 6.88, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19771744310855865, + "eval_runtime": 16.6178, + "eval_samples_per_second": 1444.233, + "eval_steps_per_second": 4.513, + "step": 16500 + }, + { + "epoch": 6.88, + "learning_rate": 1e-06, + "loss": 0.1735, + "step": 16520 + }, + { + "epoch": 6.88, + "eval_accuracy": 0.9327916666666667, + "eval_loss": 0.19762367010116577, + "eval_runtime": 16.9624, + "eval_samples_per_second": 1414.896, + "eval_steps_per_second": 4.422, + "step": 16520 + }, + { + "epoch": 6.89, + "learning_rate": 1e-06, + "loss": 0.1949, + "step": 16540 + }, + { + "epoch": 6.89, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19725541770458221, + "eval_runtime": 16.0777, + "eval_samples_per_second": 1492.755, + "eval_steps_per_second": 4.665, + "step": 16540 + }, + { + "epoch": 6.9, + "learning_rate": 1e-06, + "loss": 0.2021, + "step": 16560 + }, + { + "epoch": 6.9, + "eval_accuracy": 0.9340833333333334, + "eval_loss": 0.1951657384634018, + "eval_runtime": 16.8714, + "eval_samples_per_second": 1422.525, + "eval_steps_per_second": 4.445, + "step": 16560 + }, + { + "epoch": 6.91, + "learning_rate": 1e-06, + "loss": 0.1742, + "step": 16580 + }, + { + "epoch": 6.91, + "eval_accuracy": 0.9345833333333333, + "eval_loss": 0.19460086524486542, + "eval_runtime": 16.336, + "eval_samples_per_second": 1469.149, + "eval_steps_per_second": 4.591, + "step": 16580 + }, + { + "epoch": 6.92, + "learning_rate": 1e-06, + "loss": 0.1935, + "step": 16600 + }, + { + "epoch": 6.92, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.19422036409378052, + "eval_runtime": 16.245, + "eval_samples_per_second": 1477.38, + "eval_steps_per_second": 4.617, + "step": 16600 + }, + { + "epoch": 6.92, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 16620 + }, + { + "epoch": 6.92, + "eval_accuracy": 0.9335, + "eval_loss": 0.1949995458126068, + "eval_runtime": 16.3728, + "eval_samples_per_second": 1465.845, + "eval_steps_per_second": 4.581, + "step": 16620 + }, + { + "epoch": 6.93, + "learning_rate": 1e-06, + "loss": 0.1675, + "step": 16640 + }, + { + "epoch": 6.93, + "eval_accuracy": 0.9334166666666667, + "eval_loss": 0.19600391387939453, + "eval_runtime": 16.421, + "eval_samples_per_second": 1461.543, + "eval_steps_per_second": 4.567, + "step": 16640 + }, + { + "epoch": 6.94, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 16660 + }, + { + "epoch": 6.94, + "eval_accuracy": 0.9329583333333333, + "eval_loss": 0.19674494862556458, + "eval_runtime": 16.3324, + "eval_samples_per_second": 1469.47, + "eval_steps_per_second": 4.592, + "step": 16660 + }, + { + "epoch": 6.95, + "learning_rate": 1e-06, + "loss": 0.1757, + "step": 16680 + }, + { + "epoch": 6.95, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.1960502564907074, + "eval_runtime": 17.1637, + "eval_samples_per_second": 1398.302, + "eval_steps_per_second": 4.37, + "step": 16680 + }, + { + "epoch": 6.96, + "learning_rate": 1e-06, + "loss": 0.1743, + "step": 16700 + }, + { + "epoch": 6.96, + "eval_accuracy": 0.9329583333333333, + "eval_loss": 0.19483575224876404, + "eval_runtime": 15.8961, + "eval_samples_per_second": 1509.806, + "eval_steps_per_second": 4.718, + "step": 16700 + }, + { + "epoch": 6.97, + "learning_rate": 1e-06, + "loss": 0.17, + "step": 16720 + }, + { + "epoch": 6.97, + "eval_accuracy": 0.9332083333333333, + "eval_loss": 0.19447939097881317, + "eval_runtime": 16.8287, + "eval_samples_per_second": 1426.132, + "eval_steps_per_second": 4.457, + "step": 16720 + }, + { + "epoch": 6.97, + "learning_rate": 1e-06, + "loss": 0.1625, + "step": 16740 + }, + { + "epoch": 6.97, + "eval_accuracy": 0.933875, + "eval_loss": 0.1952148824930191, + "eval_runtime": 16.0124, + "eval_samples_per_second": 1498.839, + "eval_steps_per_second": 4.684, + "step": 16740 + }, + { + "epoch": 6.98, + "learning_rate": 1e-06, + "loss": 0.1802, + "step": 16760 + }, + { + "epoch": 6.98, + "eval_accuracy": 0.9338333333333333, + "eval_loss": 0.19577091932296753, + "eval_runtime": 16.0326, + "eval_samples_per_second": 1496.951, + "eval_steps_per_second": 4.678, + "step": 16760 + }, + { + "epoch": 6.99, + "learning_rate": 1e-06, + "loss": 0.1855, + "step": 16780 + }, + { + "epoch": 6.99, + "eval_accuracy": 0.932625, + "eval_loss": 0.19726844131946564, + "eval_runtime": 17.465, + "eval_samples_per_second": 1374.176, + "eval_steps_per_second": 4.294, + "step": 16780 + }, + { + "epoch": 7.0, + "learning_rate": 1e-06, + "loss": 0.1623, + "step": 16800 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.9333333333333333, + "eval_loss": 0.19629396498203278, + "eval_runtime": 17.5957, + "eval_samples_per_second": 1363.968, + "eval_steps_per_second": 4.262, + "step": 16800 + }, + { + "epoch": 7.01, + "learning_rate": 1e-06, + "loss": 0.1521, + "step": 16820 + }, + { + "epoch": 7.01, + "eval_accuracy": 0.931625, + "eval_loss": 0.1997893750667572, + "eval_runtime": 17.3287, + "eval_samples_per_second": 1384.986, + "eval_steps_per_second": 4.328, + "step": 16820 + }, + { + "epoch": 7.02, + "learning_rate": 1e-06, + "loss": 0.2071, + "step": 16840 + }, + { + "epoch": 7.02, + "eval_accuracy": 0.9344166666666667, + "eval_loss": 0.1939399242401123, + "eval_runtime": 16.6723, + "eval_samples_per_second": 1439.518, + "eval_steps_per_second": 4.498, + "step": 16840 + }, + { + "epoch": 7.03, + "learning_rate": 1e-06, + "loss": 0.1669, + "step": 16860 + }, + { + "epoch": 7.03, + "eval_accuracy": 0.9325833333333333, + "eval_loss": 0.1989278346300125, + "eval_runtime": 16.9163, + "eval_samples_per_second": 1418.746, + "eval_steps_per_second": 4.434, + "step": 16860 + }, + { + "epoch": 7.03, + "learning_rate": 1e-06, + "loss": 0.1695, + "step": 16880 + }, + { + "epoch": 7.03, + "eval_accuracy": 0.9313333333333333, + "eval_loss": 0.20084674656391144, + "eval_runtime": 17.4712, + "eval_samples_per_second": 1373.69, + "eval_steps_per_second": 4.293, + "step": 16880 + }, + { + "epoch": 7.04, + "learning_rate": 1e-06, + "loss": 0.2092, + "step": 16900 + }, + { + "epoch": 7.04, + "eval_accuracy": 0.9324166666666667, + "eval_loss": 0.19925980269908905, + "eval_runtime": 17.5432, + "eval_samples_per_second": 1368.052, + "eval_steps_per_second": 4.275, + "step": 16900 + }, + { + "epoch": 7.05, + "learning_rate": 1e-06, + "loss": 0.1859, + "step": 16920 + }, + { + "epoch": 7.05, + "eval_accuracy": 0.9331666666666667, + "eval_loss": 0.19550496339797974, + "eval_runtime": 17.387, + "eval_samples_per_second": 1380.338, + "eval_steps_per_second": 4.314, + "step": 16920 + }, + { + "epoch": 7.06, + "learning_rate": 1e-06, + "loss": 0.1407, + "step": 16940 + }, + { + "epoch": 7.06, + "eval_accuracy": 0.9350833333333334, + "eval_loss": 0.19279736280441284, + "eval_runtime": 17.6476, + "eval_samples_per_second": 1359.958, + "eval_steps_per_second": 4.25, + "step": 16940 + }, + { + "epoch": 7.07, + "learning_rate": 1e-06, + "loss": 0.1564, + "step": 16960 + }, + { + "epoch": 7.07, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.19368121027946472, + "eval_runtime": 17.8587, + "eval_samples_per_second": 1343.884, + "eval_steps_per_second": 4.2, + "step": 16960 + }, + { + "epoch": 7.08, + "learning_rate": 1e-06, + "loss": 0.1514, + "step": 16980 + }, + { + "epoch": 7.08, + "eval_accuracy": 0.9342083333333333, + "eval_loss": 0.19475901126861572, + "eval_runtime": 17.1608, + "eval_samples_per_second": 1398.536, + "eval_steps_per_second": 4.37, + "step": 16980 + }, + { + "epoch": 7.08, + "learning_rate": 1e-06, + "loss": 0.1425, + "step": 17000 + }, + { + "epoch": 7.08, + "eval_accuracy": 0.932125, + "eval_loss": 0.19865868985652924, + "eval_runtime": 17.0882, + "eval_samples_per_second": 1404.476, + "eval_steps_per_second": 4.389, + "step": 17000 + }, + { + "epoch": 7.09, + "learning_rate": 1e-06, + "loss": 0.1849, + "step": 17020 + }, + { + "epoch": 7.09, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.1954139769077301, + "eval_runtime": 17.452, + "eval_samples_per_second": 1375.197, + "eval_steps_per_second": 4.297, + "step": 17020 + }, + { + "epoch": 7.1, + "learning_rate": 1e-06, + "loss": 0.1662, + "step": 17040 + }, + { + "epoch": 7.1, + "eval_accuracy": 0.934875, + "eval_loss": 0.19401773810386658, + "eval_runtime": 16.9959, + "eval_samples_per_second": 1412.107, + "eval_steps_per_second": 4.413, + "step": 17040 + }, + { + "epoch": 7.11, + "learning_rate": 1e-06, + "loss": 0.1887, + "step": 17060 + }, + { + "epoch": 7.11, + "eval_accuracy": 0.9336666666666666, + "eval_loss": 0.1946951448917389, + "eval_runtime": 16.4323, + "eval_samples_per_second": 1460.542, + "eval_steps_per_second": 4.564, + "step": 17060 + }, + { + "epoch": 7.12, + "learning_rate": 1e-06, + "loss": 0.1704, + "step": 17080 + }, + { + "epoch": 7.12, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.1954944133758545, + "eval_runtime": 17.7938, + "eval_samples_per_second": 1348.781, + "eval_steps_per_second": 4.215, + "step": 17080 + }, + { + "epoch": 7.12, + "learning_rate": 1e-06, + "loss": 0.2087, + "step": 17100 + }, + { + "epoch": 7.12, + "eval_accuracy": 0.9334166666666667, + "eval_loss": 0.19571976363658905, + "eval_runtime": 16.3621, + "eval_samples_per_second": 1466.803, + "eval_steps_per_second": 4.584, + "step": 17100 + }, + { + "epoch": 7.13, + "learning_rate": 1e-06, + "loss": 0.1576, + "step": 17120 + }, + { + "epoch": 7.13, + "eval_accuracy": 0.934875, + "eval_loss": 0.19232991337776184, + "eval_runtime": 17.0875, + "eval_samples_per_second": 1404.537, + "eval_steps_per_second": 4.389, + "step": 17120 + }, + { + "epoch": 7.14, + "learning_rate": 1e-06, + "loss": 0.1837, + "step": 17140 + }, + { + "epoch": 7.14, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19416970014572144, + "eval_runtime": 17.2421, + "eval_samples_per_second": 1391.941, + "eval_steps_per_second": 4.35, + "step": 17140 + }, + { + "epoch": 7.15, + "learning_rate": 1e-06, + "loss": 0.1771, + "step": 17160 + }, + { + "epoch": 7.15, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.19288769364356995, + "eval_runtime": 15.9944, + "eval_samples_per_second": 1500.527, + "eval_steps_per_second": 4.689, + "step": 17160 + }, + { + "epoch": 7.16, + "learning_rate": 1e-06, + "loss": 0.1661, + "step": 17180 + }, + { + "epoch": 7.16, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.19377556443214417, + "eval_runtime": 16.1242, + "eval_samples_per_second": 1488.45, + "eval_steps_per_second": 4.651, + "step": 17180 + }, + { + "epoch": 7.17, + "learning_rate": 1e-06, + "loss": 0.1839, + "step": 17200 + }, + { + "epoch": 7.17, + "eval_accuracy": 0.9345833333333333, + "eval_loss": 0.19333358108997345, + "eval_runtime": 16.31, + "eval_samples_per_second": 1471.488, + "eval_steps_per_second": 4.598, + "step": 17200 + }, + { + "epoch": 7.17, + "learning_rate": 1e-06, + "loss": 0.172, + "step": 17220 + }, + { + "epoch": 7.17, + "eval_accuracy": 0.9345416666666667, + "eval_loss": 0.19157427549362183, + "eval_runtime": 16.7874, + "eval_samples_per_second": 1429.64, + "eval_steps_per_second": 4.468, + "step": 17220 + }, + { + "epoch": 7.18, + "learning_rate": 1e-06, + "loss": 0.1563, + "step": 17240 + }, + { + "epoch": 7.18, + "eval_accuracy": 0.9334583333333333, + "eval_loss": 0.19396959245204926, + "eval_runtime": 16.9407, + "eval_samples_per_second": 1416.708, + "eval_steps_per_second": 4.427, + "step": 17240 + }, + { + "epoch": 7.19, + "learning_rate": 1e-06, + "loss": 0.1835, + "step": 17260 + }, + { + "epoch": 7.19, + "eval_accuracy": 0.9312916666666666, + "eval_loss": 0.20071102678775787, + "eval_runtime": 17.2109, + "eval_samples_per_second": 1394.462, + "eval_steps_per_second": 4.358, + "step": 17260 + }, + { + "epoch": 7.2, + "learning_rate": 1e-06, + "loss": 0.1794, + "step": 17280 + }, + { + "epoch": 7.2, + "eval_accuracy": 0.932375, + "eval_loss": 0.1961183249950409, + "eval_runtime": 16.6977, + "eval_samples_per_second": 1437.327, + "eval_steps_per_second": 4.492, + "step": 17280 + }, + { + "epoch": 7.21, + "learning_rate": 1e-06, + "loss": 0.2048, + "step": 17300 + }, + { + "epoch": 7.21, + "eval_accuracy": 0.934125, + "eval_loss": 0.1937197893857956, + "eval_runtime": 17.0498, + "eval_samples_per_second": 1407.643, + "eval_steps_per_second": 4.399, + "step": 17300 + }, + { + "epoch": 7.22, + "learning_rate": 1e-06, + "loss": 0.1609, + "step": 17320 + }, + { + "epoch": 7.22, + "eval_accuracy": 0.9335, + "eval_loss": 0.1952856183052063, + "eval_runtime": 16.2943, + "eval_samples_per_second": 1472.908, + "eval_steps_per_second": 4.603, + "step": 17320 + }, + { + "epoch": 7.22, + "learning_rate": 1e-06, + "loss": 0.1588, + "step": 17340 + }, + { + "epoch": 7.22, + "eval_accuracy": 0.932375, + "eval_loss": 0.19826608896255493, + "eval_runtime": 16.8162, + "eval_samples_per_second": 1427.195, + "eval_steps_per_second": 4.46, + "step": 17340 + }, + { + "epoch": 7.23, + "learning_rate": 1e-06, + "loss": 0.1796, + "step": 17360 + }, + { + "epoch": 7.23, + "eval_accuracy": 0.932375, + "eval_loss": 0.1968182921409607, + "eval_runtime": 17.6501, + "eval_samples_per_second": 1359.765, + "eval_steps_per_second": 4.249, + "step": 17360 + }, + { + "epoch": 7.24, + "learning_rate": 1e-06, + "loss": 0.1834, + "step": 17380 + }, + { + "epoch": 7.24, + "eval_accuracy": 0.932625, + "eval_loss": 0.19632968306541443, + "eval_runtime": 17.2336, + "eval_samples_per_second": 1392.628, + "eval_steps_per_second": 4.352, + "step": 17380 + }, + { + "epoch": 7.25, + "learning_rate": 1e-06, + "loss": 0.1564, + "step": 17400 + }, + { + "epoch": 7.25, + "eval_accuracy": 0.9337916666666667, + "eval_loss": 0.19396378099918365, + "eval_runtime": 17.7707, + "eval_samples_per_second": 1350.539, + "eval_steps_per_second": 4.22, + "step": 17400 + }, + { + "epoch": 7.26, + "learning_rate": 1e-06, + "loss": 0.1513, + "step": 17420 + }, + { + "epoch": 7.26, + "eval_accuracy": 0.932875, + "eval_loss": 0.19559015333652496, + "eval_runtime": 17.1532, + "eval_samples_per_second": 1399.156, + "eval_steps_per_second": 4.372, + "step": 17420 + }, + { + "epoch": 7.27, + "learning_rate": 1e-06, + "loss": 0.1568, + "step": 17440 + }, + { + "epoch": 7.27, + "eval_accuracy": 0.935125, + "eval_loss": 0.19202855229377747, + "eval_runtime": 17.1051, + "eval_samples_per_second": 1403.089, + "eval_steps_per_second": 4.385, + "step": 17440 + }, + { + "epoch": 7.28, + "learning_rate": 1e-06, + "loss": 0.1748, + "step": 17460 + }, + { + "epoch": 7.28, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.19286368787288666, + "eval_runtime": 15.5136, + "eval_samples_per_second": 1547.025, + "eval_steps_per_second": 4.834, + "step": 17460 + }, + { + "epoch": 7.28, + "learning_rate": 1e-06, + "loss": 0.1578, + "step": 17480 + }, + { + "epoch": 7.28, + "eval_accuracy": 0.9335, + "eval_loss": 0.194309800863266, + "eval_runtime": 16.1632, + "eval_samples_per_second": 1484.852, + "eval_steps_per_second": 4.64, + "step": 17480 + }, + { + "epoch": 7.29, + "learning_rate": 1e-06, + "loss": 0.1321, + "step": 17500 + }, + { + "epoch": 7.29, + "eval_accuracy": 0.9351666666666667, + "eval_loss": 0.19201384484767914, + "eval_runtime": 16.5178, + "eval_samples_per_second": 1452.974, + "eval_steps_per_second": 4.541, + "step": 17500 + }, + { + "epoch": 7.3, + "learning_rate": 1e-06, + "loss": 0.1963, + "step": 17520 + }, + { + "epoch": 7.3, + "eval_accuracy": 0.9339166666666666, + "eval_loss": 0.19566002488136292, + "eval_runtime": 16.995, + "eval_samples_per_second": 1412.184, + "eval_steps_per_second": 4.413, + "step": 17520 + }, + { + "epoch": 7.31, + "learning_rate": 1e-06, + "loss": 0.1927, + "step": 17540 + }, + { + "epoch": 7.31, + "eval_accuracy": 0.9330416666666667, + "eval_loss": 0.19617126882076263, + "eval_runtime": 17.1067, + "eval_samples_per_second": 1402.962, + "eval_steps_per_second": 4.384, + "step": 17540 + }, + { + "epoch": 7.32, + "learning_rate": 1e-06, + "loss": 0.1658, + "step": 17560 + }, + { + "epoch": 7.32, + "eval_accuracy": 0.9332916666666666, + "eval_loss": 0.195390984416008, + "eval_runtime": 17.0629, + "eval_samples_per_second": 1406.565, + "eval_steps_per_second": 4.396, + "step": 17560 + }, + { + "epoch": 7.33, + "learning_rate": 1e-06, + "loss": 0.1452, + "step": 17580 + }, + { + "epoch": 7.33, + "eval_accuracy": 0.9345833333333333, + "eval_loss": 0.19409912824630737, + "eval_runtime": 16.3617, + "eval_samples_per_second": 1466.84, + "eval_steps_per_second": 4.584, + "step": 17580 + }, + { + "epoch": 7.33, + "learning_rate": 1e-06, + "loss": 0.1992, + "step": 17600 + }, + { + "epoch": 7.33, + "eval_accuracy": 0.934125, + "eval_loss": 0.1932896226644516, + "eval_runtime": 16.2659, + "eval_samples_per_second": 1475.476, + "eval_steps_per_second": 4.611, + "step": 17600 + }, + { + "epoch": 7.34, + "learning_rate": 1e-06, + "loss": 0.1824, + "step": 17620 + }, + { + "epoch": 7.34, + "eval_accuracy": 0.934625, + "eval_loss": 0.1922486424446106, + "eval_runtime": 16.3428, + "eval_samples_per_second": 1468.54, + "eval_steps_per_second": 4.589, + "step": 17620 + }, + { + "epoch": 7.35, + "learning_rate": 1e-06, + "loss": 0.1388, + "step": 17640 + }, + { + "epoch": 7.35, + "eval_accuracy": 0.9345, + "eval_loss": 0.19244614243507385, + "eval_runtime": 17.0407, + "eval_samples_per_second": 1408.391, + "eval_steps_per_second": 4.401, + "step": 17640 + }, + { + "epoch": 7.36, + "learning_rate": 1e-06, + "loss": 0.1732, + "step": 17660 + }, + { + "epoch": 7.36, + "eval_accuracy": 0.9343333333333333, + "eval_loss": 0.19296354055404663, + "eval_runtime": 16.3329, + "eval_samples_per_second": 1469.426, + "eval_steps_per_second": 4.592, + "step": 17660 + }, + { + "epoch": 7.37, + "learning_rate": 1e-06, + "loss": 0.1824, + "step": 17680 + }, + { + "epoch": 7.37, + "eval_accuracy": 0.9335, + "eval_loss": 0.19450555741786957, + "eval_runtime": 16.422, + "eval_samples_per_second": 1461.453, + "eval_steps_per_second": 4.567, + "step": 17680 + }, + { + "epoch": 7.38, + "learning_rate": 1e-06, + "loss": 0.1715, + "step": 17700 + }, + { + "epoch": 7.38, + "eval_accuracy": 0.9337916666666667, + "eval_loss": 0.19444973766803741, + "eval_runtime": 17.29, + "eval_samples_per_second": 1388.083, + "eval_steps_per_second": 4.338, + "step": 17700 + }, + { + "epoch": 7.38, + "learning_rate": 1e-06, + "loss": 0.1228, + "step": 17720 + }, + { + "epoch": 7.38, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.1941564828157425, + "eval_runtime": 17.8125, + "eval_samples_per_second": 1347.37, + "eval_steps_per_second": 4.211, + "step": 17720 + }, + { + "epoch": 7.39, + "learning_rate": 1e-06, + "loss": 0.1787, + "step": 17740 + }, + { + "epoch": 7.39, + "eval_accuracy": 0.93425, + "eval_loss": 0.19363898038864136, + "eval_runtime": 16.6565, + "eval_samples_per_second": 1440.875, + "eval_steps_per_second": 4.503, + "step": 17740 + }, + { + "epoch": 7.4, + "learning_rate": 1e-06, + "loss": 0.1422, + "step": 17760 + }, + { + "epoch": 7.4, + "eval_accuracy": 0.9340833333333334, + "eval_loss": 0.19510914385318756, + "eval_runtime": 16.7029, + "eval_samples_per_second": 1436.875, + "eval_steps_per_second": 4.49, + "step": 17760 + }, + { + "epoch": 7.41, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 17780 + }, + { + "epoch": 7.41, + "eval_accuracy": 0.934, + "eval_loss": 0.19492153823375702, + "eval_runtime": 16.117, + "eval_samples_per_second": 1489.115, + "eval_steps_per_second": 4.653, + "step": 17780 + }, + { + "epoch": 7.42, + "learning_rate": 1e-06, + "loss": 0.188, + "step": 17800 + }, + { + "epoch": 7.42, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.1961117386817932, + "eval_runtime": 16.3107, + "eval_samples_per_second": 1471.429, + "eval_steps_per_second": 4.598, + "step": 17800 + }, + { + "epoch": 7.42, + "learning_rate": 1e-06, + "loss": 0.1591, + "step": 17820 + }, + { + "epoch": 7.42, + "eval_accuracy": 0.93475, + "eval_loss": 0.19403943419456482, + "eval_runtime": 16.2708, + "eval_samples_per_second": 1475.037, + "eval_steps_per_second": 4.609, + "step": 17820 + }, + { + "epoch": 7.43, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 17840 + }, + { + "epoch": 7.43, + "eval_accuracy": 0.933, + "eval_loss": 0.19669285416603088, + "eval_runtime": 16.4196, + "eval_samples_per_second": 1461.668, + "eval_steps_per_second": 4.568, + "step": 17840 + }, + { + "epoch": 7.44, + "learning_rate": 1e-06, + "loss": 0.1988, + "step": 17860 + }, + { + "epoch": 7.44, + "eval_accuracy": 0.933, + "eval_loss": 0.19575349986553192, + "eval_runtime": 17.4894, + "eval_samples_per_second": 1372.256, + "eval_steps_per_second": 4.288, + "step": 17860 + }, + { + "epoch": 7.45, + "learning_rate": 1e-06, + "loss": 0.1471, + "step": 17880 + }, + { + "epoch": 7.45, + "eval_accuracy": 0.9339166666666666, + "eval_loss": 0.19493769109249115, + "eval_runtime": 17.3622, + "eval_samples_per_second": 1382.312, + "eval_steps_per_second": 4.32, + "step": 17880 + }, + { + "epoch": 7.46, + "learning_rate": 1e-06, + "loss": 0.1631, + "step": 17900 + }, + { + "epoch": 7.46, + "eval_accuracy": 0.9326666666666666, + "eval_loss": 0.1954115778207779, + "eval_runtime": 16.3935, + "eval_samples_per_second": 1463.997, + "eval_steps_per_second": 4.575, + "step": 17900 + }, + { + "epoch": 7.47, + "learning_rate": 1e-06, + "loss": 0.2076, + "step": 17920 + }, + { + "epoch": 7.47, + "eval_accuracy": 0.9314583333333334, + "eval_loss": 0.20004239678382874, + "eval_runtime": 16.8491, + "eval_samples_per_second": 1424.406, + "eval_steps_per_second": 4.451, + "step": 17920 + }, + { + "epoch": 7.47, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 17940 + }, + { + "epoch": 7.47, + "eval_accuracy": 0.9330833333333334, + "eval_loss": 0.1946871429681778, + "eval_runtime": 16.9255, + "eval_samples_per_second": 1417.978, + "eval_steps_per_second": 4.431, + "step": 17940 + }, + { + "epoch": 7.48, + "learning_rate": 1e-06, + "loss": 0.2057, + "step": 17960 + }, + { + "epoch": 7.48, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.194530189037323, + "eval_runtime": 17.2774, + "eval_samples_per_second": 1389.094, + "eval_steps_per_second": 4.341, + "step": 17960 + }, + { + "epoch": 7.49, + "learning_rate": 1e-06, + "loss": 0.1721, + "step": 17980 + }, + { + "epoch": 7.49, + "eval_accuracy": 0.934, + "eval_loss": 0.19205187261104584, + "eval_runtime": 16.4737, + "eval_samples_per_second": 1456.864, + "eval_steps_per_second": 4.553, + "step": 17980 + }, + { + "epoch": 7.5, + "learning_rate": 1e-06, + "loss": 0.1528, + "step": 18000 + }, + { + "epoch": 7.5, + "eval_accuracy": 0.934375, + "eval_loss": 0.19169993698596954, + "eval_runtime": 16.6059, + "eval_samples_per_second": 1445.27, + "eval_steps_per_second": 4.516, + "step": 18000 + }, + { + "epoch": 7.51, + "learning_rate": 1e-06, + "loss": 0.1758, + "step": 18020 + }, + { + "epoch": 7.51, + "eval_accuracy": 0.9336666666666666, + "eval_loss": 0.19537770748138428, + "eval_runtime": 16.2403, + "eval_samples_per_second": 1477.804, + "eval_steps_per_second": 4.618, + "step": 18020 + }, + { + "epoch": 7.52, + "learning_rate": 1e-06, + "loss": 0.1757, + "step": 18040 + }, + { + "epoch": 7.52, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19484874606132507, + "eval_runtime": 16.9236, + "eval_samples_per_second": 1418.134, + "eval_steps_per_second": 4.432, + "step": 18040 + }, + { + "epoch": 7.53, + "learning_rate": 1e-06, + "loss": 0.1499, + "step": 18060 + }, + { + "epoch": 7.53, + "eval_accuracy": 0.9348333333333333, + "eval_loss": 0.19193783402442932, + "eval_runtime": 16.6031, + "eval_samples_per_second": 1445.509, + "eval_steps_per_second": 4.517, + "step": 18060 + }, + { + "epoch": 7.53, + "learning_rate": 1e-06, + "loss": 0.2056, + "step": 18080 + }, + { + "epoch": 7.53, + "eval_accuracy": 0.9350416666666667, + "eval_loss": 0.1917405128479004, + "eval_runtime": 17.4615, + "eval_samples_per_second": 1374.451, + "eval_steps_per_second": 4.295, + "step": 18080 + }, + { + "epoch": 7.54, + "learning_rate": 1e-06, + "loss": 0.1646, + "step": 18100 + }, + { + "epoch": 7.54, + "eval_accuracy": 0.9340833333333334, + "eval_loss": 0.1928911805152893, + "eval_runtime": 17.1226, + "eval_samples_per_second": 1401.658, + "eval_steps_per_second": 4.38, + "step": 18100 + }, + { + "epoch": 7.55, + "learning_rate": 1e-06, + "loss": 0.1312, + "step": 18120 + }, + { + "epoch": 7.55, + "eval_accuracy": 0.9336666666666666, + "eval_loss": 0.19554804265499115, + "eval_runtime": 16.8658, + "eval_samples_per_second": 1422.998, + "eval_steps_per_second": 4.447, + "step": 18120 + }, + { + "epoch": 7.56, + "learning_rate": 1e-06, + "loss": 0.1759, + "step": 18140 + }, + { + "epoch": 7.56, + "eval_accuracy": 0.9339166666666666, + "eval_loss": 0.1938326209783554, + "eval_runtime": 16.5184, + "eval_samples_per_second": 1452.925, + "eval_steps_per_second": 4.54, + "step": 18140 + }, + { + "epoch": 7.57, + "learning_rate": 1e-06, + "loss": 0.1907, + "step": 18160 + }, + { + "epoch": 7.57, + "eval_accuracy": 0.934125, + "eval_loss": 0.19356830418109894, + "eval_runtime": 16.2934, + "eval_samples_per_second": 1472.985, + "eval_steps_per_second": 4.603, + "step": 18160 + }, + { + "epoch": 7.58, + "learning_rate": 1e-06, + "loss": 0.156, + "step": 18180 + }, + { + "epoch": 7.58, + "eval_accuracy": 0.932625, + "eval_loss": 0.19519713521003723, + "eval_runtime": 16.5186, + "eval_samples_per_second": 1452.905, + "eval_steps_per_second": 4.54, + "step": 18180 + }, + { + "epoch": 7.58, + "learning_rate": 1e-06, + "loss": 0.1656, + "step": 18200 + }, + { + "epoch": 7.58, + "eval_accuracy": 0.934, + "eval_loss": 0.19366022944450378, + "eval_runtime": 17.0576, + "eval_samples_per_second": 1406.998, + "eval_steps_per_second": 4.397, + "step": 18200 + }, + { + "epoch": 7.59, + "learning_rate": 1e-06, + "loss": 0.1321, + "step": 18220 + }, + { + "epoch": 7.59, + "eval_accuracy": 0.9335, + "eval_loss": 0.19374357163906097, + "eval_runtime": 16.6623, + "eval_samples_per_second": 1440.375, + "eval_steps_per_second": 4.501, + "step": 18220 + }, + { + "epoch": 7.6, + "learning_rate": 1e-06, + "loss": 0.1367, + "step": 18240 + }, + { + "epoch": 7.6, + "eval_accuracy": 0.934875, + "eval_loss": 0.1920449137687683, + "eval_runtime": 17.6693, + "eval_samples_per_second": 1358.287, + "eval_steps_per_second": 4.245, + "step": 18240 + }, + { + "epoch": 7.61, + "learning_rate": 1e-06, + "loss": 0.1935, + "step": 18260 + }, + { + "epoch": 7.61, + "eval_accuracy": 0.9331666666666667, + "eval_loss": 0.19506241381168365, + "eval_runtime": 16.8545, + "eval_samples_per_second": 1423.948, + "eval_steps_per_second": 4.45, + "step": 18260 + }, + { + "epoch": 7.62, + "learning_rate": 1e-06, + "loss": 0.1625, + "step": 18280 + }, + { + "epoch": 7.62, + "eval_accuracy": 0.9312083333333333, + "eval_loss": 0.20022885501384735, + "eval_runtime": 15.6569, + "eval_samples_per_second": 1532.875, + "eval_steps_per_second": 4.79, + "step": 18280 + }, + { + "epoch": 7.62, + "learning_rate": 1e-06, + "loss": 0.1959, + "step": 18300 + }, + { + "epoch": 7.62, + "eval_accuracy": 0.932, + "eval_loss": 0.19843071699142456, + "eval_runtime": 17.5133, + "eval_samples_per_second": 1370.39, + "eval_steps_per_second": 4.282, + "step": 18300 + }, + { + "epoch": 7.63, + "learning_rate": 1e-06, + "loss": 0.1523, + "step": 18320 + }, + { + "epoch": 7.63, + "eval_accuracy": 0.9332916666666666, + "eval_loss": 0.19627678394317627, + "eval_runtime": 17.5177, + "eval_samples_per_second": 1370.041, + "eval_steps_per_second": 4.281, + "step": 18320 + }, + { + "epoch": 7.64, + "learning_rate": 1e-06, + "loss": 0.1248, + "step": 18340 + }, + { + "epoch": 7.64, + "eval_accuracy": 0.9340416666666667, + "eval_loss": 0.1947159618139267, + "eval_runtime": 17.1988, + "eval_samples_per_second": 1395.448, + "eval_steps_per_second": 4.361, + "step": 18340 + }, + { + "epoch": 7.65, + "learning_rate": 1e-06, + "loss": 0.1575, + "step": 18360 + }, + { + "epoch": 7.65, + "eval_accuracy": 0.9345416666666667, + "eval_loss": 0.1939171701669693, + "eval_runtime": 16.9923, + "eval_samples_per_second": 1412.401, + "eval_steps_per_second": 4.414, + "step": 18360 + }, + { + "epoch": 7.66, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 18380 + }, + { + "epoch": 7.66, + "eval_accuracy": 0.9348333333333333, + "eval_loss": 0.19376207888126373, + "eval_runtime": 16.8495, + "eval_samples_per_second": 1424.374, + "eval_steps_per_second": 4.451, + "step": 18380 + }, + { + "epoch": 7.67, + "learning_rate": 1e-06, + "loss": 0.1981, + "step": 18400 + }, + { + "epoch": 7.67, + "eval_accuracy": 0.9344166666666667, + "eval_loss": 0.1934199035167694, + "eval_runtime": 16.679, + "eval_samples_per_second": 1438.939, + "eval_steps_per_second": 4.497, + "step": 18400 + }, + { + "epoch": 7.67, + "learning_rate": 1e-06, + "loss": 0.1237, + "step": 18420 + }, + { + "epoch": 7.67, + "eval_accuracy": 0.9329166666666666, + "eval_loss": 0.19676241278648376, + "eval_runtime": 16.3441, + "eval_samples_per_second": 1468.422, + "eval_steps_per_second": 4.589, + "step": 18420 + }, + { + "epoch": 7.68, + "learning_rate": 1e-06, + "loss": 0.1649, + "step": 18440 + }, + { + "epoch": 7.68, + "eval_accuracy": 0.9333333333333333, + "eval_loss": 0.19533593952655792, + "eval_runtime": 16.5476, + "eval_samples_per_second": 1450.361, + "eval_steps_per_second": 4.532, + "step": 18440 + }, + { + "epoch": 7.69, + "learning_rate": 1e-06, + "loss": 0.1832, + "step": 18460 + }, + { + "epoch": 7.69, + "eval_accuracy": 0.9344166666666667, + "eval_loss": 0.19351038336753845, + "eval_runtime": 16.5442, + "eval_samples_per_second": 1450.662, + "eval_steps_per_second": 4.533, + "step": 18460 + }, + { + "epoch": 7.7, + "learning_rate": 1e-06, + "loss": 0.1337, + "step": 18480 + }, + { + "epoch": 7.7, + "eval_accuracy": 0.9326666666666666, + "eval_loss": 0.197190523147583, + "eval_runtime": 16.6092, + "eval_samples_per_second": 1444.979, + "eval_steps_per_second": 4.516, + "step": 18480 + }, + { + "epoch": 7.71, + "learning_rate": 1e-06, + "loss": 0.1407, + "step": 18500 + }, + { + "epoch": 7.71, + "eval_accuracy": 0.9353333333333333, + "eval_loss": 0.19291214644908905, + "eval_runtime": 16.5801, + "eval_samples_per_second": 1447.52, + "eval_steps_per_second": 4.523, + "step": 18500 + }, + { + "epoch": 7.72, + "learning_rate": 1e-06, + "loss": 0.1489, + "step": 18520 + }, + { + "epoch": 7.72, + "eval_accuracy": 0.9338333333333333, + "eval_loss": 0.19552947580814362, + "eval_runtime": 16.9798, + "eval_samples_per_second": 1413.444, + "eval_steps_per_second": 4.417, + "step": 18520 + }, + { + "epoch": 7.72, + "learning_rate": 1e-06, + "loss": 0.1603, + "step": 18540 + }, + { + "epoch": 7.72, + "eval_accuracy": 0.9324583333333333, + "eval_loss": 0.19699038565158844, + "eval_runtime": 17.4729, + "eval_samples_per_second": 1373.558, + "eval_steps_per_second": 4.292, + "step": 18540 + }, + { + "epoch": 7.73, + "learning_rate": 1e-06, + "loss": 0.1468, + "step": 18560 + }, + { + "epoch": 7.73, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19590060412883759, + "eval_runtime": 16.5983, + "eval_samples_per_second": 1445.927, + "eval_steps_per_second": 4.519, + "step": 18560 + }, + { + "epoch": 7.74, + "learning_rate": 1e-06, + "loss": 0.213, + "step": 18580 + }, + { + "epoch": 7.74, + "eval_accuracy": 0.934125, + "eval_loss": 0.1953480988740921, + "eval_runtime": 15.9179, + "eval_samples_per_second": 1507.733, + "eval_steps_per_second": 4.712, + "step": 18580 + }, + { + "epoch": 7.75, + "learning_rate": 1e-06, + "loss": 0.2005, + "step": 18600 + }, + { + "epoch": 7.75, + "eval_accuracy": 0.934875, + "eval_loss": 0.1933307647705078, + "eval_runtime": 16.5952, + "eval_samples_per_second": 1446.199, + "eval_steps_per_second": 4.519, + "step": 18600 + }, + { + "epoch": 7.76, + "learning_rate": 1e-06, + "loss": 0.1741, + "step": 18620 + }, + { + "epoch": 7.76, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.19205695390701294, + "eval_runtime": 16.7425, + "eval_samples_per_second": 1433.48, + "eval_steps_per_second": 4.48, + "step": 18620 + }, + { + "epoch": 7.77, + "learning_rate": 1e-06, + "loss": 0.1618, + "step": 18640 + }, + { + "epoch": 7.77, + "eval_accuracy": 0.93375, + "eval_loss": 0.19280995428562164, + "eval_runtime": 16.6122, + "eval_samples_per_second": 1444.719, + "eval_steps_per_second": 4.515, + "step": 18640 + }, + { + "epoch": 7.78, + "learning_rate": 1e-06, + "loss": 0.2304, + "step": 18660 + }, + { + "epoch": 7.78, + "eval_accuracy": 0.93325, + "eval_loss": 0.1952376365661621, + "eval_runtime": 17.2502, + "eval_samples_per_second": 1391.287, + "eval_steps_per_second": 4.348, + "step": 18660 + }, + { + "epoch": 7.78, + "learning_rate": 1e-06, + "loss": 0.1729, + "step": 18680 + }, + { + "epoch": 7.78, + "eval_accuracy": 0.933875, + "eval_loss": 0.1918487548828125, + "eval_runtime": 16.6725, + "eval_samples_per_second": 1439.495, + "eval_steps_per_second": 4.498, + "step": 18680 + }, + { + "epoch": 7.79, + "learning_rate": 1e-06, + "loss": 0.1632, + "step": 18700 + }, + { + "epoch": 7.79, + "eval_accuracy": 0.9339583333333333, + "eval_loss": 0.193745419383049, + "eval_runtime": 17.0655, + "eval_samples_per_second": 1406.35, + "eval_steps_per_second": 4.395, + "step": 18700 + }, + { + "epoch": 7.8, + "learning_rate": 1e-06, + "loss": 0.1381, + "step": 18720 + }, + { + "epoch": 7.8, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19378027319908142, + "eval_runtime": 16.1642, + "eval_samples_per_second": 1484.763, + "eval_steps_per_second": 4.64, + "step": 18720 + }, + { + "epoch": 7.81, + "learning_rate": 1e-06, + "loss": 0.1334, + "step": 18740 + }, + { + "epoch": 7.81, + "eval_accuracy": 0.9339166666666666, + "eval_loss": 0.19540290534496307, + "eval_runtime": 17.2682, + "eval_samples_per_second": 1389.836, + "eval_steps_per_second": 4.343, + "step": 18740 + }, + { + "epoch": 7.82, + "learning_rate": 1e-06, + "loss": 0.1706, + "step": 18760 + }, + { + "epoch": 7.82, + "eval_accuracy": 0.9347083333333334, + "eval_loss": 0.19170990586280823, + "eval_runtime": 17.5111, + "eval_samples_per_second": 1370.562, + "eval_steps_per_second": 4.283, + "step": 18760 + }, + { + "epoch": 7.83, + "learning_rate": 1e-06, + "loss": 0.1774, + "step": 18780 + }, + { + "epoch": 7.83, + "eval_accuracy": 0.93425, + "eval_loss": 0.19449305534362793, + "eval_runtime": 18.1073, + "eval_samples_per_second": 1325.429, + "eval_steps_per_second": 4.142, + "step": 18780 + }, + { + "epoch": 7.83, + "learning_rate": 1e-06, + "loss": 0.1891, + "step": 18800 + }, + { + "epoch": 7.83, + "eval_accuracy": 0.935125, + "eval_loss": 0.19196221232414246, + "eval_runtime": 16.1253, + "eval_samples_per_second": 1488.341, + "eval_steps_per_second": 4.651, + "step": 18800 + }, + { + "epoch": 7.84, + "learning_rate": 1e-06, + "loss": 0.1949, + "step": 18820 + }, + { + "epoch": 7.84, + "eval_accuracy": 0.9347083333333334, + "eval_loss": 0.1915530413389206, + "eval_runtime": 16.928, + "eval_samples_per_second": 1417.773, + "eval_steps_per_second": 4.431, + "step": 18820 + }, + { + "epoch": 7.85, + "learning_rate": 1e-06, + "loss": 0.1511, + "step": 18840 + }, + { + "epoch": 7.85, + "eval_accuracy": 0.9354583333333333, + "eval_loss": 0.1908595710992813, + "eval_runtime": 16.4669, + "eval_samples_per_second": 1457.473, + "eval_steps_per_second": 4.555, + "step": 18840 + }, + { + "epoch": 7.86, + "learning_rate": 1e-06, + "loss": 0.1501, + "step": 18860 + }, + { + "epoch": 7.86, + "eval_accuracy": 0.9355, + "eval_loss": 0.19139742851257324, + "eval_runtime": 16.7079, + "eval_samples_per_second": 1436.446, + "eval_steps_per_second": 4.489, + "step": 18860 + }, + { + "epoch": 7.87, + "learning_rate": 1e-06, + "loss": 0.1367, + "step": 18880 + }, + { + "epoch": 7.87, + "eval_accuracy": 0.9359166666666666, + "eval_loss": 0.19081765413284302, + "eval_runtime": 16.8019, + "eval_samples_per_second": 1428.413, + "eval_steps_per_second": 4.464, + "step": 18880 + }, + { + "epoch": 7.88, + "learning_rate": 1e-06, + "loss": 0.179, + "step": 18900 + }, + { + "epoch": 7.88, + "eval_accuracy": 0.936, + "eval_loss": 0.1911996752023697, + "eval_runtime": 17.4687, + "eval_samples_per_second": 1373.888, + "eval_steps_per_second": 4.293, + "step": 18900 + }, + { + "epoch": 7.88, + "learning_rate": 1e-06, + "loss": 0.1737, + "step": 18920 + }, + { + "epoch": 7.88, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.19142092764377594, + "eval_runtime": 17.9819, + "eval_samples_per_second": 1334.677, + "eval_steps_per_second": 4.171, + "step": 18920 + }, + { + "epoch": 7.89, + "learning_rate": 1e-06, + "loss": 0.1821, + "step": 18940 + }, + { + "epoch": 7.89, + "eval_accuracy": 0.9349166666666666, + "eval_loss": 0.19184531271457672, + "eval_runtime": 17.6224, + "eval_samples_per_second": 1361.903, + "eval_steps_per_second": 4.256, + "step": 18940 + }, + { + "epoch": 7.9, + "learning_rate": 1e-06, + "loss": 0.1611, + "step": 18960 + }, + { + "epoch": 7.9, + "eval_accuracy": 0.9360416666666667, + "eval_loss": 0.19008147716522217, + "eval_runtime": 16.936, + "eval_samples_per_second": 1417.103, + "eval_steps_per_second": 4.428, + "step": 18960 + }, + { + "epoch": 7.91, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 18980 + }, + { + "epoch": 7.91, + "eval_accuracy": 0.935875, + "eval_loss": 0.1900467723608017, + "eval_runtime": 17.5135, + "eval_samples_per_second": 1370.37, + "eval_steps_per_second": 4.282, + "step": 18980 + }, + { + "epoch": 7.92, + "learning_rate": 1e-06, + "loss": 0.1751, + "step": 19000 + }, + { + "epoch": 7.92, + "eval_accuracy": 0.936875, + "eval_loss": 0.19042351841926575, + "eval_runtime": 17.8277, + "eval_samples_per_second": 1346.22, + "eval_steps_per_second": 4.207, + "step": 19000 + }, + { + "epoch": 7.92, + "learning_rate": 1e-06, + "loss": 0.1955, + "step": 19020 + }, + { + "epoch": 7.92, + "eval_accuracy": 0.9355, + "eval_loss": 0.19257663190364838, + "eval_runtime": 17.5409, + "eval_samples_per_second": 1368.229, + "eval_steps_per_second": 4.276, + "step": 19020 + }, + { + "epoch": 7.93, + "learning_rate": 1e-06, + "loss": 0.1762, + "step": 19040 + }, + { + "epoch": 7.93, + "eval_accuracy": 0.9362083333333333, + "eval_loss": 0.19056767225265503, + "eval_runtime": 17.5352, + "eval_samples_per_second": 1368.676, + "eval_steps_per_second": 4.277, + "step": 19040 + }, + { + "epoch": 7.94, + "learning_rate": 1e-06, + "loss": 0.1417, + "step": 19060 + }, + { + "epoch": 7.94, + "eval_accuracy": 0.935625, + "eval_loss": 0.19115488231182098, + "eval_runtime": 17.4988, + "eval_samples_per_second": 1371.52, + "eval_steps_per_second": 4.286, + "step": 19060 + }, + { + "epoch": 7.95, + "learning_rate": 1e-06, + "loss": 0.1602, + "step": 19080 + }, + { + "epoch": 7.95, + "eval_accuracy": 0.9348333333333333, + "eval_loss": 0.19205690920352936, + "eval_runtime": 17.3557, + "eval_samples_per_second": 1382.829, + "eval_steps_per_second": 4.321, + "step": 19080 + }, + { + "epoch": 7.96, + "learning_rate": 1e-06, + "loss": 0.1355, + "step": 19100 + }, + { + "epoch": 7.96, + "eval_accuracy": 0.9320833333333334, + "eval_loss": 0.1974695324897766, + "eval_runtime": 17.4599, + "eval_samples_per_second": 1374.582, + "eval_steps_per_second": 4.296, + "step": 19100 + }, + { + "epoch": 7.97, + "learning_rate": 1e-06, + "loss": 0.1488, + "step": 19120 + }, + { + "epoch": 7.97, + "eval_accuracy": 0.9339166666666666, + "eval_loss": 0.19483338296413422, + "eval_runtime": 17.4251, + "eval_samples_per_second": 1377.325, + "eval_steps_per_second": 4.304, + "step": 19120 + }, + { + "epoch": 7.97, + "learning_rate": 1e-06, + "loss": 0.2128, + "step": 19140 + }, + { + "epoch": 7.97, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.19111751019954681, + "eval_runtime": 17.5678, + "eval_samples_per_second": 1366.137, + "eval_steps_per_second": 4.269, + "step": 19140 + }, + { + "epoch": 7.98, + "learning_rate": 1e-06, + "loss": 0.1223, + "step": 19160 + }, + { + "epoch": 7.98, + "eval_accuracy": 0.9352083333333333, + "eval_loss": 0.19168196618556976, + "eval_runtime": 17.4393, + "eval_samples_per_second": 1376.199, + "eval_steps_per_second": 4.301, + "step": 19160 + }, + { + "epoch": 7.99, + "learning_rate": 1e-06, + "loss": 0.1564, + "step": 19180 + }, + { + "epoch": 7.99, + "eval_accuracy": 0.934, + "eval_loss": 0.19453711807727814, + "eval_runtime": 16.8108, + "eval_samples_per_second": 1427.651, + "eval_steps_per_second": 4.461, + "step": 19180 + }, + { + "epoch": 8.0, + "learning_rate": 1e-06, + "loss": 0.1961, + "step": 19200 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.934125, + "eval_loss": 0.19330745935440063, + "eval_runtime": 16.8924, + "eval_samples_per_second": 1420.757, + "eval_steps_per_second": 4.44, + "step": 19200 + }, + { + "epoch": 8.01, + "learning_rate": 1e-06, + "loss": 0.1791, + "step": 19220 + }, + { + "epoch": 8.01, + "eval_accuracy": 0.9350416666666667, + "eval_loss": 0.19044369459152222, + "eval_runtime": 16.3873, + "eval_samples_per_second": 1464.552, + "eval_steps_per_second": 4.577, + "step": 19220 + }, + { + "epoch": 8.02, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 19240 + }, + { + "epoch": 8.02, + "eval_accuracy": 0.935875, + "eval_loss": 0.19002403318881989, + "eval_runtime": 16.82, + "eval_samples_per_second": 1426.875, + "eval_steps_per_second": 4.459, + "step": 19240 + }, + { + "epoch": 8.03, + "learning_rate": 1e-06, + "loss": 0.135, + "step": 19260 + }, + { + "epoch": 8.03, + "eval_accuracy": 0.9357083333333334, + "eval_loss": 0.19046556949615479, + "eval_runtime": 16.49, + "eval_samples_per_second": 1455.424, + "eval_steps_per_second": 4.548, + "step": 19260 + }, + { + "epoch": 8.03, + "learning_rate": 1e-06, + "loss": 0.129, + "step": 19280 + }, + { + "epoch": 8.03, + "eval_accuracy": 0.935125, + "eval_loss": 0.1913367062807083, + "eval_runtime": 17.0223, + "eval_samples_per_second": 1409.919, + "eval_steps_per_second": 4.406, + "step": 19280 + }, + { + "epoch": 8.04, + "learning_rate": 1e-06, + "loss": 0.1778, + "step": 19300 + }, + { + "epoch": 8.04, + "eval_accuracy": 0.9329166666666666, + "eval_loss": 0.1969105303287506, + "eval_runtime": 16.1602, + "eval_samples_per_second": 1485.13, + "eval_steps_per_second": 4.641, + "step": 19300 + }, + { + "epoch": 8.05, + "learning_rate": 1e-06, + "loss": 0.1362, + "step": 19320 + }, + { + "epoch": 8.05, + "eval_accuracy": 0.9340416666666667, + "eval_loss": 0.19413326680660248, + "eval_runtime": 17.3298, + "eval_samples_per_second": 1384.896, + "eval_steps_per_second": 4.328, + "step": 19320 + }, + { + "epoch": 8.06, + "learning_rate": 1e-06, + "loss": 0.157, + "step": 19340 + }, + { + "epoch": 8.06, + "eval_accuracy": 0.93275, + "eval_loss": 0.19760599732398987, + "eval_runtime": 17.4331, + "eval_samples_per_second": 1376.69, + "eval_steps_per_second": 4.302, + "step": 19340 + }, + { + "epoch": 8.07, + "learning_rate": 1e-06, + "loss": 0.1544, + "step": 19360 + }, + { + "epoch": 8.07, + "eval_accuracy": 0.935125, + "eval_loss": 0.1911655217409134, + "eval_runtime": 16.3075, + "eval_samples_per_second": 1471.715, + "eval_steps_per_second": 4.599, + "step": 19360 + }, + { + "epoch": 8.07, + "learning_rate": 1e-06, + "loss": 0.1566, + "step": 19380 + }, + { + "epoch": 8.07, + "eval_accuracy": 0.9355, + "eval_loss": 0.19158610701560974, + "eval_runtime": 16.834, + "eval_samples_per_second": 1425.685, + "eval_steps_per_second": 4.455, + "step": 19380 + }, + { + "epoch": 8.08, + "learning_rate": 1e-06, + "loss": 0.1572, + "step": 19400 + }, + { + "epoch": 8.08, + "eval_accuracy": 0.93525, + "eval_loss": 0.19257643818855286, + "eval_runtime": 16.0408, + "eval_samples_per_second": 1496.18, + "eval_steps_per_second": 4.676, + "step": 19400 + }, + { + "epoch": 8.09, + "learning_rate": 1e-06, + "loss": 0.1077, + "step": 19420 + }, + { + "epoch": 8.09, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.1925935298204422, + "eval_runtime": 17.6153, + "eval_samples_per_second": 1362.454, + "eval_steps_per_second": 4.258, + "step": 19420 + }, + { + "epoch": 8.1, + "learning_rate": 1e-06, + "loss": 0.1365, + "step": 19440 + }, + { + "epoch": 8.1, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.19453419744968414, + "eval_runtime": 16.5289, + "eval_samples_per_second": 1451.999, + "eval_steps_per_second": 4.537, + "step": 19440 + }, + { + "epoch": 8.11, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 19460 + }, + { + "epoch": 8.11, + "eval_accuracy": 0.9355833333333333, + "eval_loss": 0.19276762008666992, + "eval_runtime": 16.3532, + "eval_samples_per_second": 1467.604, + "eval_steps_per_second": 4.586, + "step": 19460 + }, + { + "epoch": 8.12, + "learning_rate": 1e-06, + "loss": 0.1794, + "step": 19480 + }, + { + "epoch": 8.12, + "eval_accuracy": 0.9357083333333334, + "eval_loss": 0.19133615493774414, + "eval_runtime": 16.7591, + "eval_samples_per_second": 1432.06, + "eval_steps_per_second": 4.475, + "step": 19480 + }, + { + "epoch": 8.12, + "learning_rate": 1e-06, + "loss": 0.127, + "step": 19500 + }, + { + "epoch": 8.12, + "eval_accuracy": 0.9350833333333334, + "eval_loss": 0.19240260124206543, + "eval_runtime": 17.3908, + "eval_samples_per_second": 1380.04, + "eval_steps_per_second": 4.313, + "step": 19500 + }, + { + "epoch": 8.13, + "learning_rate": 1e-06, + "loss": 0.1636, + "step": 19520 + }, + { + "epoch": 8.13, + "eval_accuracy": 0.9345, + "eval_loss": 0.19413629174232483, + "eval_runtime": 17.1984, + "eval_samples_per_second": 1395.482, + "eval_steps_per_second": 4.361, + "step": 19520 + }, + { + "epoch": 8.14, + "learning_rate": 1e-06, + "loss": 0.1826, + "step": 19540 + }, + { + "epoch": 8.14, + "eval_accuracy": 0.934625, + "eval_loss": 0.19350507855415344, + "eval_runtime": 16.3498, + "eval_samples_per_second": 1467.911, + "eval_steps_per_second": 4.587, + "step": 19540 + }, + { + "epoch": 8.15, + "learning_rate": 1e-06, + "loss": 0.1524, + "step": 19560 + }, + { + "epoch": 8.15, + "eval_accuracy": 0.933375, + "eval_loss": 0.19596153497695923, + "eval_runtime": 16.6776, + "eval_samples_per_second": 1439.057, + "eval_steps_per_second": 4.497, + "step": 19560 + }, + { + "epoch": 8.16, + "learning_rate": 1e-06, + "loss": 0.1208, + "step": 19580 + }, + { + "epoch": 8.16, + "eval_accuracy": 0.9338333333333333, + "eval_loss": 0.19459760189056396, + "eval_runtime": 16.3642, + "eval_samples_per_second": 1466.618, + "eval_steps_per_second": 4.583, + "step": 19580 + }, + { + "epoch": 8.17, + "learning_rate": 1e-06, + "loss": 0.1516, + "step": 19600 + }, + { + "epoch": 8.17, + "eval_accuracy": 0.9324583333333333, + "eval_loss": 0.19873768091201782, + "eval_runtime": 16.1804, + "eval_samples_per_second": 1483.275, + "eval_steps_per_second": 4.635, + "step": 19600 + }, + { + "epoch": 8.18, + "learning_rate": 1e-06, + "loss": 0.1581, + "step": 19620 + }, + { + "epoch": 8.18, + "eval_accuracy": 0.935375, + "eval_loss": 0.19273337721824646, + "eval_runtime": 16.628, + "eval_samples_per_second": 1443.351, + "eval_steps_per_second": 4.51, + "step": 19620 + }, + { + "epoch": 8.18, + "learning_rate": 1e-06, + "loss": 0.1537, + "step": 19640 + }, + { + "epoch": 8.18, + "eval_accuracy": 0.935625, + "eval_loss": 0.1912853717803955, + "eval_runtime": 16.501, + "eval_samples_per_second": 1454.455, + "eval_steps_per_second": 4.545, + "step": 19640 + }, + { + "epoch": 8.19, + "learning_rate": 1e-06, + "loss": 0.1165, + "step": 19660 + }, + { + "epoch": 8.19, + "eval_accuracy": 0.9357083333333334, + "eval_loss": 0.19141745567321777, + "eval_runtime": 16.8771, + "eval_samples_per_second": 1422.043, + "eval_steps_per_second": 4.444, + "step": 19660 + }, + { + "epoch": 8.2, + "learning_rate": 1e-06, + "loss": 0.1374, + "step": 19680 + }, + { + "epoch": 8.2, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.1914907842874527, + "eval_runtime": 15.9508, + "eval_samples_per_second": 1504.626, + "eval_steps_per_second": 4.702, + "step": 19680 + }, + { + "epoch": 8.21, + "learning_rate": 1e-06, + "loss": 0.1806, + "step": 19700 + }, + { + "epoch": 8.21, + "eval_accuracy": 0.9340416666666667, + "eval_loss": 0.1957368105649948, + "eval_runtime": 17.0085, + "eval_samples_per_second": 1411.059, + "eval_steps_per_second": 4.41, + "step": 19700 + }, + { + "epoch": 8.22, + "learning_rate": 1e-06, + "loss": 0.2198, + "step": 19720 + }, + { + "epoch": 8.22, + "eval_accuracy": 0.9335, + "eval_loss": 0.19590935111045837, + "eval_runtime": 17.4075, + "eval_samples_per_second": 1378.713, + "eval_steps_per_second": 4.308, + "step": 19720 + }, + { + "epoch": 8.22, + "learning_rate": 1e-06, + "loss": 0.177, + "step": 19740 + }, + { + "epoch": 8.22, + "eval_accuracy": 0.93525, + "eval_loss": 0.19214990735054016, + "eval_runtime": 17.3221, + "eval_samples_per_second": 1385.511, + "eval_steps_per_second": 4.33, + "step": 19740 + }, + { + "epoch": 8.23, + "learning_rate": 1e-06, + "loss": 0.1417, + "step": 19760 + }, + { + "epoch": 8.23, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.1939082145690918, + "eval_runtime": 17.0832, + "eval_samples_per_second": 1404.892, + "eval_steps_per_second": 4.39, + "step": 19760 + }, + { + "epoch": 8.24, + "learning_rate": 1e-06, + "loss": 0.2049, + "step": 19780 + }, + { + "epoch": 8.24, + "eval_accuracy": 0.9353333333333333, + "eval_loss": 0.19216576218605042, + "eval_runtime": 16.5657, + "eval_samples_per_second": 1448.776, + "eval_steps_per_second": 4.527, + "step": 19780 + }, + { + "epoch": 8.25, + "learning_rate": 1e-06, + "loss": 0.1704, + "step": 19800 + }, + { + "epoch": 8.25, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.1919444501399994, + "eval_runtime": 17.0092, + "eval_samples_per_second": 1410.998, + "eval_steps_per_second": 4.409, + "step": 19800 + }, + { + "epoch": 8.26, + "learning_rate": 1e-06, + "loss": 0.1448, + "step": 19820 + }, + { + "epoch": 8.26, + "eval_accuracy": 0.9349166666666666, + "eval_loss": 0.19339194893836975, + "eval_runtime": 16.9285, + "eval_samples_per_second": 1417.726, + "eval_steps_per_second": 4.43, + "step": 19820 + }, + { + "epoch": 8.27, + "learning_rate": 1e-06, + "loss": 0.1578, + "step": 19840 + }, + { + "epoch": 8.27, + "eval_accuracy": 0.9350833333333334, + "eval_loss": 0.19213108718395233, + "eval_runtime": 16.9637, + "eval_samples_per_second": 1414.783, + "eval_steps_per_second": 4.421, + "step": 19840 + }, + { + "epoch": 8.28, + "learning_rate": 1e-06, + "loss": 0.2108, + "step": 19860 + }, + { + "epoch": 8.28, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.1917588859796524, + "eval_runtime": 16.9832, + "eval_samples_per_second": 1413.159, + "eval_steps_per_second": 4.416, + "step": 19860 + }, + { + "epoch": 8.28, + "learning_rate": 1e-06, + "loss": 0.1945, + "step": 19880 + }, + { + "epoch": 8.28, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.1911618411540985, + "eval_runtime": 17.2548, + "eval_samples_per_second": 1390.917, + "eval_steps_per_second": 4.347, + "step": 19880 + }, + { + "epoch": 8.29, + "learning_rate": 1e-06, + "loss": 0.1808, + "step": 19900 + }, + { + "epoch": 8.29, + "eval_accuracy": 0.935125, + "eval_loss": 0.19140420854091644, + "eval_runtime": 16.7213, + "eval_samples_per_second": 1435.291, + "eval_steps_per_second": 4.485, + "step": 19900 + }, + { + "epoch": 8.3, + "learning_rate": 1e-06, + "loss": 0.153, + "step": 19920 + }, + { + "epoch": 8.3, + "eval_accuracy": 0.9332916666666666, + "eval_loss": 0.19377712905406952, + "eval_runtime": 17.2963, + "eval_samples_per_second": 1387.582, + "eval_steps_per_second": 4.336, + "step": 19920 + }, + { + "epoch": 8.31, + "learning_rate": 1e-06, + "loss": 0.1723, + "step": 19940 + }, + { + "epoch": 8.31, + "eval_accuracy": 0.9352083333333333, + "eval_loss": 0.19146102666854858, + "eval_runtime": 16.9128, + "eval_samples_per_second": 1419.047, + "eval_steps_per_second": 4.435, + "step": 19940 + }, + { + "epoch": 8.32, + "learning_rate": 1e-06, + "loss": 0.1505, + "step": 19960 + }, + { + "epoch": 8.32, + "eval_accuracy": 0.9335, + "eval_loss": 0.1947011649608612, + "eval_runtime": 17.0812, + "eval_samples_per_second": 1405.052, + "eval_steps_per_second": 4.391, + "step": 19960 + }, + { + "epoch": 8.32, + "learning_rate": 1e-06, + "loss": 0.1784, + "step": 19980 + }, + { + "epoch": 8.32, + "eval_accuracy": 0.935, + "eval_loss": 0.19037066400051117, + "eval_runtime": 16.5401, + "eval_samples_per_second": 1451.021, + "eval_steps_per_second": 4.534, + "step": 19980 + }, + { + "epoch": 8.33, + "learning_rate": 1e-06, + "loss": 0.1611, + "step": 20000 + }, + { + "epoch": 8.33, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.1925501823425293, + "eval_runtime": 17.2658, + "eval_samples_per_second": 1390.034, + "eval_steps_per_second": 4.344, + "step": 20000 + }, + { + "epoch": 8.34, + "learning_rate": 1e-06, + "loss": 0.1505, + "step": 20020 + }, + { + "epoch": 8.34, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.19143855571746826, + "eval_runtime": 16.5059, + "eval_samples_per_second": 1454.028, + "eval_steps_per_second": 4.544, + "step": 20020 + }, + { + "epoch": 8.35, + "learning_rate": 1e-06, + "loss": 0.148, + "step": 20040 + }, + { + "epoch": 8.35, + "eval_accuracy": 0.9352083333333333, + "eval_loss": 0.1917656511068344, + "eval_runtime": 16.2857, + "eval_samples_per_second": 1473.686, + "eval_steps_per_second": 4.605, + "step": 20040 + }, + { + "epoch": 8.36, + "learning_rate": 1e-06, + "loss": 0.1958, + "step": 20060 + }, + { + "epoch": 8.36, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.1908416450023651, + "eval_runtime": 16.8917, + "eval_samples_per_second": 1420.819, + "eval_steps_per_second": 4.44, + "step": 20060 + }, + { + "epoch": 8.37, + "learning_rate": 1e-06, + "loss": 0.1007, + "step": 20080 + }, + { + "epoch": 8.37, + "eval_accuracy": 0.93475, + "eval_loss": 0.19260205328464508, + "eval_runtime": 16.2197, + "eval_samples_per_second": 1479.681, + "eval_steps_per_second": 4.624, + "step": 20080 + }, + { + "epoch": 8.38, + "learning_rate": 1e-06, + "loss": 0.1829, + "step": 20100 + }, + { + "epoch": 8.38, + "eval_accuracy": 0.93425, + "eval_loss": 0.19410496950149536, + "eval_runtime": 16.7814, + "eval_samples_per_second": 1430.153, + "eval_steps_per_second": 4.469, + "step": 20100 + }, + { + "epoch": 8.38, + "learning_rate": 1e-06, + "loss": 0.1413, + "step": 20120 + }, + { + "epoch": 8.38, + "eval_accuracy": 0.93475, + "eval_loss": 0.19318026304244995, + "eval_runtime": 16.4709, + "eval_samples_per_second": 1457.115, + "eval_steps_per_second": 4.553, + "step": 20120 + }, + { + "epoch": 8.39, + "learning_rate": 1e-06, + "loss": 0.1769, + "step": 20140 + }, + { + "epoch": 8.39, + "eval_accuracy": 0.9340833333333334, + "eval_loss": 0.19297830760478973, + "eval_runtime": 16.9359, + "eval_samples_per_second": 1417.108, + "eval_steps_per_second": 4.428, + "step": 20140 + }, + { + "epoch": 8.4, + "learning_rate": 1e-06, + "loss": 0.1843, + "step": 20160 + }, + { + "epoch": 8.4, + "eval_accuracy": 0.9353333333333333, + "eval_loss": 0.19084292650222778, + "eval_runtime": 17.2429, + "eval_samples_per_second": 1391.878, + "eval_steps_per_second": 4.35, + "step": 20160 + }, + { + "epoch": 8.41, + "learning_rate": 1e-06, + "loss": 0.2067, + "step": 20180 + }, + { + "epoch": 8.41, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.19021664559841156, + "eval_runtime": 16.663, + "eval_samples_per_second": 1440.319, + "eval_steps_per_second": 4.501, + "step": 20180 + }, + { + "epoch": 8.42, + "learning_rate": 1e-06, + "loss": 0.1183, + "step": 20200 + }, + { + "epoch": 8.42, + "eval_accuracy": 0.9347083333333334, + "eval_loss": 0.19230937957763672, + "eval_runtime": 16.3196, + "eval_samples_per_second": 1470.622, + "eval_steps_per_second": 4.596, + "step": 20200 + }, + { + "epoch": 8.43, + "learning_rate": 1e-06, + "loss": 0.2065, + "step": 20220 + }, + { + "epoch": 8.43, + "eval_accuracy": 0.9350833333333334, + "eval_loss": 0.19128607213497162, + "eval_runtime": 16.6007, + "eval_samples_per_second": 1445.718, + "eval_steps_per_second": 4.518, + "step": 20220 + }, + { + "epoch": 8.43, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 20240 + }, + { + "epoch": 8.43, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19124051928520203, + "eval_runtime": 17.0198, + "eval_samples_per_second": 1410.12, + "eval_steps_per_second": 4.407, + "step": 20240 + }, + { + "epoch": 8.44, + "learning_rate": 1e-06, + "loss": 0.1557, + "step": 20260 + }, + { + "epoch": 8.44, + "eval_accuracy": 0.9355833333333333, + "eval_loss": 0.19068805873394012, + "eval_runtime": 16.4856, + "eval_samples_per_second": 1455.816, + "eval_steps_per_second": 4.549, + "step": 20260 + }, + { + "epoch": 8.45, + "learning_rate": 1e-06, + "loss": 0.1437, + "step": 20280 + }, + { + "epoch": 8.45, + "eval_accuracy": 0.934875, + "eval_loss": 0.1921066790819168, + "eval_runtime": 16.6454, + "eval_samples_per_second": 1441.842, + "eval_steps_per_second": 4.506, + "step": 20280 + }, + { + "epoch": 8.46, + "learning_rate": 1e-06, + "loss": 0.1567, + "step": 20300 + }, + { + "epoch": 8.46, + "eval_accuracy": 0.9345833333333333, + "eval_loss": 0.1937917321920395, + "eval_runtime": 16.4297, + "eval_samples_per_second": 1460.769, + "eval_steps_per_second": 4.565, + "step": 20300 + }, + { + "epoch": 8.47, + "learning_rate": 1e-06, + "loss": 0.1363, + "step": 20320 + }, + { + "epoch": 8.47, + "eval_accuracy": 0.935625, + "eval_loss": 0.19153279066085815, + "eval_runtime": 16.7967, + "eval_samples_per_second": 1428.851, + "eval_steps_per_second": 4.465, + "step": 20320 + }, + { + "epoch": 8.47, + "learning_rate": 1e-06, + "loss": 0.2041, + "step": 20340 + }, + { + "epoch": 8.47, + "eval_accuracy": 0.9349166666666666, + "eval_loss": 0.19226667284965515, + "eval_runtime": 16.8781, + "eval_samples_per_second": 1421.958, + "eval_steps_per_second": 4.444, + "step": 20340 + }, + { + "epoch": 8.48, + "learning_rate": 1e-06, + "loss": 0.1656, + "step": 20360 + }, + { + "epoch": 8.48, + "eval_accuracy": 0.933875, + "eval_loss": 0.19749757647514343, + "eval_runtime": 16.5317, + "eval_samples_per_second": 1451.756, + "eval_steps_per_second": 4.537, + "step": 20360 + }, + { + "epoch": 8.49, + "learning_rate": 1e-06, + "loss": 0.1774, + "step": 20380 + }, + { + "epoch": 8.49, + "eval_accuracy": 0.9335416666666667, + "eval_loss": 0.1959267109632492, + "eval_runtime": 16.9568, + "eval_samples_per_second": 1415.361, + "eval_steps_per_second": 4.423, + "step": 20380 + }, + { + "epoch": 8.5, + "learning_rate": 1e-06, + "loss": 0.1472, + "step": 20400 + }, + { + "epoch": 8.5, + "eval_accuracy": 0.9355833333333333, + "eval_loss": 0.19094142317771912, + "eval_runtime": 16.4125, + "eval_samples_per_second": 1462.3, + "eval_steps_per_second": 4.57, + "step": 20400 + }, + { + "epoch": 8.51, + "learning_rate": 1e-06, + "loss": 0.178, + "step": 20420 + }, + { + "epoch": 8.51, + "eval_accuracy": 0.9337083333333334, + "eval_loss": 0.19597363471984863, + "eval_runtime": 16.6336, + "eval_samples_per_second": 1442.864, + "eval_steps_per_second": 4.509, + "step": 20420 + }, + { + "epoch": 8.52, + "learning_rate": 1e-06, + "loss": 0.1561, + "step": 20440 + }, + { + "epoch": 8.52, + "eval_accuracy": 0.9354583333333333, + "eval_loss": 0.19258776307106018, + "eval_runtime": 17.0606, + "eval_samples_per_second": 1406.748, + "eval_steps_per_second": 4.396, + "step": 20440 + }, + { + "epoch": 8.53, + "learning_rate": 1e-06, + "loss": 0.1726, + "step": 20460 + }, + { + "epoch": 8.53, + "eval_accuracy": 0.9355416666666667, + "eval_loss": 0.19182823598384857, + "eval_runtime": 17.3, + "eval_samples_per_second": 1387.283, + "eval_steps_per_second": 4.335, + "step": 20460 + }, + { + "epoch": 8.53, + "learning_rate": 1e-06, + "loss": 0.1465, + "step": 20480 + }, + { + "epoch": 8.53, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.19172848761081696, + "eval_runtime": 16.287, + "eval_samples_per_second": 1473.569, + "eval_steps_per_second": 4.605, + "step": 20480 + }, + { + "epoch": 8.54, + "learning_rate": 1e-06, + "loss": 0.1504, + "step": 20500 + }, + { + "epoch": 8.54, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19191785156726837, + "eval_runtime": 15.5845, + "eval_samples_per_second": 1539.99, + "eval_steps_per_second": 4.812, + "step": 20500 + }, + { + "epoch": 8.55, + "learning_rate": 1e-06, + "loss": 0.1593, + "step": 20520 + }, + { + "epoch": 8.55, + "eval_accuracy": 0.9358333333333333, + "eval_loss": 0.1908203661441803, + "eval_runtime": 16.4463, + "eval_samples_per_second": 1459.298, + "eval_steps_per_second": 4.56, + "step": 20520 + }, + { + "epoch": 8.56, + "learning_rate": 1e-06, + "loss": 0.1346, + "step": 20540 + }, + { + "epoch": 8.56, + "eval_accuracy": 0.9362916666666666, + "eval_loss": 0.19051861763000488, + "eval_runtime": 15.646, + "eval_samples_per_second": 1533.937, + "eval_steps_per_second": 4.794, + "step": 20540 + }, + { + "epoch": 8.57, + "learning_rate": 1e-06, + "loss": 0.1509, + "step": 20560 + }, + { + "epoch": 8.57, + "eval_accuracy": 0.9364166666666667, + "eval_loss": 0.18970748782157898, + "eval_runtime": 15.4588, + "eval_samples_per_second": 1552.512, + "eval_steps_per_second": 4.852, + "step": 20560 + }, + { + "epoch": 8.57, + "learning_rate": 1e-06, + "loss": 0.1419, + "step": 20580 + }, + { + "epoch": 8.57, + "eval_accuracy": 0.936375, + "eval_loss": 0.18967027962207794, + "eval_runtime": 15.484, + "eval_samples_per_second": 1549.985, + "eval_steps_per_second": 4.844, + "step": 20580 + }, + { + "epoch": 8.58, + "learning_rate": 1e-06, + "loss": 0.1477, + "step": 20600 + }, + { + "epoch": 8.58, + "eval_accuracy": 0.93525, + "eval_loss": 0.1920623481273651, + "eval_runtime": 15.7145, + "eval_samples_per_second": 1527.252, + "eval_steps_per_second": 4.773, + "step": 20600 + }, + { + "epoch": 8.59, + "learning_rate": 1e-06, + "loss": 0.1791, + "step": 20620 + }, + { + "epoch": 8.59, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.19344042241573334, + "eval_runtime": 15.3045, + "eval_samples_per_second": 1568.164, + "eval_steps_per_second": 4.901, + "step": 20620 + }, + { + "epoch": 8.6, + "learning_rate": 1e-06, + "loss": 0.1848, + "step": 20640 + }, + { + "epoch": 8.6, + "eval_accuracy": 0.935, + "eval_loss": 0.1935572475194931, + "eval_runtime": 15.6596, + "eval_samples_per_second": 1532.606, + "eval_steps_per_second": 4.789, + "step": 20640 + }, + { + "epoch": 8.61, + "learning_rate": 1e-06, + "loss": 0.1561, + "step": 20660 + }, + { + "epoch": 8.61, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.19057251513004303, + "eval_runtime": 15.8036, + "eval_samples_per_second": 1518.644, + "eval_steps_per_second": 4.746, + "step": 20660 + }, + { + "epoch": 8.62, + "learning_rate": 1e-06, + "loss": 0.1619, + "step": 20680 + }, + { + "epoch": 8.62, + "eval_accuracy": 0.9355, + "eval_loss": 0.1917581558227539, + "eval_runtime": 15.4829, + "eval_samples_per_second": 1550.099, + "eval_steps_per_second": 4.844, + "step": 20680 + }, + { + "epoch": 8.62, + "learning_rate": 1e-06, + "loss": 0.1778, + "step": 20700 + }, + { + "epoch": 8.62, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.18889638781547546, + "eval_runtime": 15.6817, + "eval_samples_per_second": 1530.446, + "eval_steps_per_second": 4.783, + "step": 20700 + }, + { + "epoch": 8.63, + "learning_rate": 1e-06, + "loss": 0.1892, + "step": 20720 + }, + { + "epoch": 8.63, + "eval_accuracy": 0.9360416666666667, + "eval_loss": 0.18932241201400757, + "eval_runtime": 15.6011, + "eval_samples_per_second": 1538.354, + "eval_steps_per_second": 4.807, + "step": 20720 + }, + { + "epoch": 8.64, + "learning_rate": 1e-06, + "loss": 0.1358, + "step": 20740 + }, + { + "epoch": 8.64, + "eval_accuracy": 0.9362083333333333, + "eval_loss": 0.18881624937057495, + "eval_runtime": 15.4873, + "eval_samples_per_second": 1549.656, + "eval_steps_per_second": 4.843, + "step": 20740 + }, + { + "epoch": 8.65, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 20760 + }, + { + "epoch": 8.65, + "eval_accuracy": 0.935625, + "eval_loss": 0.19135870039463043, + "eval_runtime": 15.7754, + "eval_samples_per_second": 1521.355, + "eval_steps_per_second": 4.754, + "step": 20760 + }, + { + "epoch": 8.66, + "learning_rate": 1e-06, + "loss": 0.1536, + "step": 20780 + }, + { + "epoch": 8.66, + "eval_accuracy": 0.9358333333333333, + "eval_loss": 0.19162118434906006, + "eval_runtime": 15.6911, + "eval_samples_per_second": 1529.529, + "eval_steps_per_second": 4.78, + "step": 20780 + }, + { + "epoch": 8.67, + "learning_rate": 1e-06, + "loss": 0.1417, + "step": 20800 + }, + { + "epoch": 8.67, + "eval_accuracy": 0.9351666666666667, + "eval_loss": 0.19261544942855835, + "eval_runtime": 15.318, + "eval_samples_per_second": 1566.781, + "eval_steps_per_second": 4.896, + "step": 20800 + }, + { + "epoch": 8.68, + "learning_rate": 1e-06, + "loss": 0.1335, + "step": 20820 + }, + { + "epoch": 8.68, + "eval_accuracy": 0.9355, + "eval_loss": 0.19034945964813232, + "eval_runtime": 15.7498, + "eval_samples_per_second": 1523.829, + "eval_steps_per_second": 4.762, + "step": 20820 + }, + { + "epoch": 8.68, + "learning_rate": 1e-06, + "loss": 0.1698, + "step": 20840 + }, + { + "epoch": 8.68, + "eval_accuracy": 0.935, + "eval_loss": 0.19306336343288422, + "eval_runtime": 15.5793, + "eval_samples_per_second": 1540.501, + "eval_steps_per_second": 4.814, + "step": 20840 + }, + { + "epoch": 8.69, + "learning_rate": 1e-06, + "loss": 0.1394, + "step": 20860 + }, + { + "epoch": 8.69, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19079020619392395, + "eval_runtime": 15.4491, + "eval_samples_per_second": 1553.491, + "eval_steps_per_second": 4.855, + "step": 20860 + }, + { + "epoch": 8.7, + "learning_rate": 1e-06, + "loss": 0.1467, + "step": 20880 + }, + { + "epoch": 8.7, + "eval_accuracy": 0.9353333333333333, + "eval_loss": 0.1910465806722641, + "eval_runtime": 15.5826, + "eval_samples_per_second": 1540.175, + "eval_steps_per_second": 4.813, + "step": 20880 + }, + { + "epoch": 8.71, + "learning_rate": 1e-06, + "loss": 0.1307, + "step": 20900 + }, + { + "epoch": 8.71, + "eval_accuracy": 0.9344166666666667, + "eval_loss": 0.1931913048028946, + "eval_runtime": 15.7361, + "eval_samples_per_second": 1525.151, + "eval_steps_per_second": 4.766, + "step": 20900 + }, + { + "epoch": 8.72, + "learning_rate": 1e-06, + "loss": 0.1566, + "step": 20920 + }, + { + "epoch": 8.72, + "eval_accuracy": 0.934375, + "eval_loss": 0.19242696464061737, + "eval_runtime": 16.0207, + "eval_samples_per_second": 1498.065, + "eval_steps_per_second": 4.681, + "step": 20920 + }, + { + "epoch": 8.72, + "learning_rate": 1e-06, + "loss": 0.1916, + "step": 20940 + }, + { + "epoch": 8.72, + "eval_accuracy": 0.9332083333333333, + "eval_loss": 0.19626210629940033, + "eval_runtime": 15.5195, + "eval_samples_per_second": 1546.445, + "eval_steps_per_second": 4.833, + "step": 20940 + }, + { + "epoch": 8.73, + "learning_rate": 1e-06, + "loss": 0.1366, + "step": 20960 + }, + { + "epoch": 8.73, + "eval_accuracy": 0.934375, + "eval_loss": 0.1936860978603363, + "eval_runtime": 16.4371, + "eval_samples_per_second": 1460.113, + "eval_steps_per_second": 4.563, + "step": 20960 + }, + { + "epoch": 8.74, + "learning_rate": 1e-06, + "loss": 0.1532, + "step": 20980 + }, + { + "epoch": 8.74, + "eval_accuracy": 0.934, + "eval_loss": 0.19347819685935974, + "eval_runtime": 16.8336, + "eval_samples_per_second": 1425.717, + "eval_steps_per_second": 4.455, + "step": 20980 + }, + { + "epoch": 8.75, + "learning_rate": 1e-06, + "loss": 0.1524, + "step": 21000 + }, + { + "epoch": 8.75, + "eval_accuracy": 0.9351666666666667, + "eval_loss": 0.19147901237010956, + "eval_runtime": 17.5314, + "eval_samples_per_second": 1368.97, + "eval_steps_per_second": 4.278, + "step": 21000 + }, + { + "epoch": 8.76, + "learning_rate": 1e-06, + "loss": 0.1331, + "step": 21020 + }, + { + "epoch": 8.76, + "eval_accuracy": 0.9347083333333334, + "eval_loss": 0.19231154024600983, + "eval_runtime": 17.9038, + "eval_samples_per_second": 1340.496, + "eval_steps_per_second": 4.189, + "step": 21020 + }, + { + "epoch": 8.77, + "learning_rate": 1e-06, + "loss": 0.1776, + "step": 21040 + }, + { + "epoch": 8.77, + "eval_accuracy": 0.935125, + "eval_loss": 0.19299417734146118, + "eval_runtime": 17.3849, + "eval_samples_per_second": 1380.506, + "eval_steps_per_second": 4.314, + "step": 21040 + }, + { + "epoch": 8.78, + "learning_rate": 1e-06, + "loss": 0.1772, + "step": 21060 + }, + { + "epoch": 8.78, + "eval_accuracy": 0.9354583333333333, + "eval_loss": 0.1907561719417572, + "eval_runtime": 17.0346, + "eval_samples_per_second": 1408.894, + "eval_steps_per_second": 4.403, + "step": 21060 + }, + { + "epoch": 8.78, + "learning_rate": 1e-06, + "loss": 0.1501, + "step": 21080 + }, + { + "epoch": 8.78, + "eval_accuracy": 0.934125, + "eval_loss": 0.19375211000442505, + "eval_runtime": 17.0995, + "eval_samples_per_second": 1403.552, + "eval_steps_per_second": 4.386, + "step": 21080 + }, + { + "epoch": 8.79, + "learning_rate": 1e-06, + "loss": 0.1544, + "step": 21100 + }, + { + "epoch": 8.79, + "eval_accuracy": 0.936125, + "eval_loss": 0.18984746932983398, + "eval_runtime": 16.9332, + "eval_samples_per_second": 1417.338, + "eval_steps_per_second": 4.429, + "step": 21100 + }, + { + "epoch": 8.8, + "learning_rate": 1e-06, + "loss": 0.1852, + "step": 21120 + }, + { + "epoch": 8.8, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.18966814875602722, + "eval_runtime": 16.7173, + "eval_samples_per_second": 1435.64, + "eval_steps_per_second": 4.486, + "step": 21120 + }, + { + "epoch": 8.81, + "learning_rate": 1e-06, + "loss": 0.1483, + "step": 21140 + }, + { + "epoch": 8.81, + "eval_accuracy": 0.9352083333333333, + "eval_loss": 0.19126974046230316, + "eval_runtime": 16.9887, + "eval_samples_per_second": 1412.7, + "eval_steps_per_second": 4.415, + "step": 21140 + }, + { + "epoch": 8.82, + "learning_rate": 1e-06, + "loss": 0.1434, + "step": 21160 + }, + { + "epoch": 8.82, + "eval_accuracy": 0.9355416666666667, + "eval_loss": 0.19145381450653076, + "eval_runtime": 17.2555, + "eval_samples_per_second": 1390.863, + "eval_steps_per_second": 4.346, + "step": 21160 + }, + { + "epoch": 8.82, + "learning_rate": 1e-06, + "loss": 0.2023, + "step": 21180 + }, + { + "epoch": 8.82, + "eval_accuracy": 0.9348333333333333, + "eval_loss": 0.19298754632472992, + "eval_runtime": 16.4729, + "eval_samples_per_second": 1456.937, + "eval_steps_per_second": 4.553, + "step": 21180 + }, + { + "epoch": 8.83, + "learning_rate": 1e-06, + "loss": 0.1471, + "step": 21200 + }, + { + "epoch": 8.83, + "eval_accuracy": 0.933625, + "eval_loss": 0.19501623511314392, + "eval_runtime": 16.7339, + "eval_samples_per_second": 1434.212, + "eval_steps_per_second": 4.482, + "step": 21200 + }, + { + "epoch": 8.84, + "learning_rate": 1e-06, + "loss": 0.1606, + "step": 21220 + }, + { + "epoch": 8.84, + "eval_accuracy": 0.934375, + "eval_loss": 0.19614289700984955, + "eval_runtime": 16.3664, + "eval_samples_per_second": 1466.416, + "eval_steps_per_second": 4.583, + "step": 21220 + }, + { + "epoch": 8.85, + "learning_rate": 1e-06, + "loss": 0.1873, + "step": 21240 + }, + { + "epoch": 8.85, + "eval_accuracy": 0.9364166666666667, + "eval_loss": 0.19085697829723358, + "eval_runtime": 16.7737, + "eval_samples_per_second": 1430.812, + "eval_steps_per_second": 4.471, + "step": 21240 + }, + { + "epoch": 8.86, + "learning_rate": 1e-06, + "loss": 0.1676, + "step": 21260 + }, + { + "epoch": 8.86, + "eval_accuracy": 0.936625, + "eval_loss": 0.18988537788391113, + "eval_runtime": 16.3918, + "eval_samples_per_second": 1464.148, + "eval_steps_per_second": 4.575, + "step": 21260 + }, + { + "epoch": 8.87, + "learning_rate": 1e-06, + "loss": 0.1421, + "step": 21280 + }, + { + "epoch": 8.87, + "eval_accuracy": 0.93625, + "eval_loss": 0.1904492825269699, + "eval_runtime": 16.5295, + "eval_samples_per_second": 1451.948, + "eval_steps_per_second": 4.537, + "step": 21280 + }, + { + "epoch": 8.88, + "learning_rate": 1e-06, + "loss": 0.1475, + "step": 21300 + }, + { + "epoch": 8.88, + "eval_accuracy": 0.935, + "eval_loss": 0.1930071860551834, + "eval_runtime": 16.4595, + "eval_samples_per_second": 1458.128, + "eval_steps_per_second": 4.557, + "step": 21300 + }, + { + "epoch": 8.88, + "learning_rate": 1e-06, + "loss": 0.1928, + "step": 21320 + }, + { + "epoch": 8.88, + "eval_accuracy": 0.934875, + "eval_loss": 0.19229546189308167, + "eval_runtime": 16.6828, + "eval_samples_per_second": 1438.611, + "eval_steps_per_second": 4.496, + "step": 21320 + }, + { + "epoch": 8.89, + "learning_rate": 1e-06, + "loss": 0.1633, + "step": 21340 + }, + { + "epoch": 8.89, + "eval_accuracy": 0.9347916666666667, + "eval_loss": 0.19193056225776672, + "eval_runtime": 16.5324, + "eval_samples_per_second": 1451.694, + "eval_steps_per_second": 4.537, + "step": 21340 + }, + { + "epoch": 8.9, + "learning_rate": 1e-06, + "loss": 0.1504, + "step": 21360 + }, + { + "epoch": 8.9, + "eval_accuracy": 0.935375, + "eval_loss": 0.1920657902956009, + "eval_runtime": 16.6038, + "eval_samples_per_second": 1445.452, + "eval_steps_per_second": 4.517, + "step": 21360 + }, + { + "epoch": 8.91, + "learning_rate": 1e-06, + "loss": 0.1489, + "step": 21380 + }, + { + "epoch": 8.91, + "eval_accuracy": 0.9335833333333333, + "eval_loss": 0.19495506584644318, + "eval_runtime": 17.9072, + "eval_samples_per_second": 1340.243, + "eval_steps_per_second": 4.188, + "step": 21380 + }, + { + "epoch": 8.92, + "learning_rate": 1e-06, + "loss": 0.1646, + "step": 21400 + }, + { + "epoch": 8.92, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.19053585827350616, + "eval_runtime": 17.2542, + "eval_samples_per_second": 1390.963, + "eval_steps_per_second": 4.347, + "step": 21400 + }, + { + "epoch": 8.93, + "learning_rate": 1e-06, + "loss": 0.161, + "step": 21420 + }, + { + "epoch": 8.93, + "eval_accuracy": 0.9343333333333333, + "eval_loss": 0.19274799525737762, + "eval_runtime": 16.8753, + "eval_samples_per_second": 1422.195, + "eval_steps_per_second": 4.444, + "step": 21420 + }, + { + "epoch": 8.93, + "learning_rate": 1e-06, + "loss": 0.1885, + "step": 21440 + }, + { + "epoch": 8.93, + "eval_accuracy": 0.9355833333333333, + "eval_loss": 0.18927617371082306, + "eval_runtime": 17.7571, + "eval_samples_per_second": 1351.572, + "eval_steps_per_second": 4.224, + "step": 21440 + }, + { + "epoch": 8.94, + "learning_rate": 1e-06, + "loss": 0.1789, + "step": 21460 + }, + { + "epoch": 8.94, + "eval_accuracy": 0.9364583333333333, + "eval_loss": 0.18856315314769745, + "eval_runtime": 16.8879, + "eval_samples_per_second": 1421.133, + "eval_steps_per_second": 4.441, + "step": 21460 + }, + { + "epoch": 8.95, + "learning_rate": 1e-06, + "loss": 0.1831, + "step": 21480 + }, + { + "epoch": 8.95, + "eval_accuracy": 0.9367083333333334, + "eval_loss": 0.18908809125423431, + "eval_runtime": 16.5935, + "eval_samples_per_second": 1446.346, + "eval_steps_per_second": 4.52, + "step": 21480 + }, + { + "epoch": 8.96, + "learning_rate": 1e-06, + "loss": 0.1553, + "step": 21500 + }, + { + "epoch": 8.96, + "eval_accuracy": 0.9363333333333334, + "eval_loss": 0.1887378990650177, + "eval_runtime": 17.029, + "eval_samples_per_second": 1409.364, + "eval_steps_per_second": 4.404, + "step": 21500 + }, + { + "epoch": 8.97, + "learning_rate": 1e-06, + "loss": 0.1609, + "step": 21520 + }, + { + "epoch": 8.97, + "eval_accuracy": 0.936625, + "eval_loss": 0.1878993958234787, + "eval_runtime": 17.4532, + "eval_samples_per_second": 1375.109, + "eval_steps_per_second": 4.297, + "step": 21520 + }, + { + "epoch": 8.97, + "learning_rate": 1e-06, + "loss": 0.1616, + "step": 21540 + }, + { + "epoch": 8.97, + "eval_accuracy": 0.93625, + "eval_loss": 0.18860691785812378, + "eval_runtime": 16.6603, + "eval_samples_per_second": 1440.553, + "eval_steps_per_second": 4.502, + "step": 21540 + }, + { + "epoch": 8.98, + "learning_rate": 1e-06, + "loss": 0.1623, + "step": 21560 + }, + { + "epoch": 8.98, + "eval_accuracy": 0.934375, + "eval_loss": 0.19211074709892273, + "eval_runtime": 16.5367, + "eval_samples_per_second": 1451.315, + "eval_steps_per_second": 4.535, + "step": 21560 + }, + { + "epoch": 8.99, + "learning_rate": 1e-06, + "loss": 0.1485, + "step": 21580 + }, + { + "epoch": 8.99, + "eval_accuracy": 0.93575, + "eval_loss": 0.1881760060787201, + "eval_runtime": 16.5469, + "eval_samples_per_second": 1450.422, + "eval_steps_per_second": 4.533, + "step": 21580 + }, + { + "epoch": 9.0, + "learning_rate": 1e-06, + "loss": 0.1922, + "step": 21600 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.9357916666666667, + "eval_loss": 0.18824529647827148, + "eval_runtime": 17.1765, + "eval_samples_per_second": 1397.261, + "eval_steps_per_second": 4.366, + "step": 21600 + }, + { + "epoch": 9.01, + "learning_rate": 1e-06, + "loss": 0.1314, + "step": 21620 + }, + { + "epoch": 9.01, + "eval_accuracy": 0.9365, + "eval_loss": 0.18808509409427643, + "eval_runtime": 16.3714, + "eval_samples_per_second": 1465.972, + "eval_steps_per_second": 4.581, + "step": 21620 + }, + { + "epoch": 9.02, + "learning_rate": 1e-06, + "loss": 0.1579, + "step": 21640 + }, + { + "epoch": 9.02, + "eval_accuracy": 0.935, + "eval_loss": 0.1919311285018921, + "eval_runtime": 16.3749, + "eval_samples_per_second": 1465.66, + "eval_steps_per_second": 4.58, + "step": 21640 + }, + { + "epoch": 9.03, + "learning_rate": 1e-06, + "loss": 0.1449, + "step": 21660 + }, + { + "epoch": 9.03, + "eval_accuracy": 0.9358333333333333, + "eval_loss": 0.19011430442333221, + "eval_runtime": 16.428, + "eval_samples_per_second": 1460.921, + "eval_steps_per_second": 4.565, + "step": 21660 + }, + { + "epoch": 9.03, + "learning_rate": 1e-06, + "loss": 0.1394, + "step": 21680 + }, + { + "epoch": 9.03, + "eval_accuracy": 0.936875, + "eval_loss": 0.19003826379776, + "eval_runtime": 17.008, + "eval_samples_per_second": 1411.103, + "eval_steps_per_second": 4.41, + "step": 21680 + }, + { + "epoch": 9.04, + "learning_rate": 1e-06, + "loss": 0.1411, + "step": 21700 + }, + { + "epoch": 9.04, + "eval_accuracy": 0.9344583333333333, + "eval_loss": 0.19364295899868011, + "eval_runtime": 17.6973, + "eval_samples_per_second": 1356.136, + "eval_steps_per_second": 4.238, + "step": 21700 + }, + { + "epoch": 9.05, + "learning_rate": 1e-06, + "loss": 0.1622, + "step": 21720 + }, + { + "epoch": 9.05, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.190113365650177, + "eval_runtime": 17.0907, + "eval_samples_per_second": 1404.273, + "eval_steps_per_second": 4.388, + "step": 21720 + }, + { + "epoch": 9.06, + "learning_rate": 1e-06, + "loss": 0.1758, + "step": 21740 + }, + { + "epoch": 9.06, + "eval_accuracy": 0.9359166666666666, + "eval_loss": 0.19059090316295624, + "eval_runtime": 17.4852, + "eval_samples_per_second": 1372.587, + "eval_steps_per_second": 4.289, + "step": 21740 + }, + { + "epoch": 9.07, + "learning_rate": 1e-06, + "loss": 0.1469, + "step": 21760 + }, + { + "epoch": 9.07, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.19149552285671234, + "eval_runtime": 15.6633, + "eval_samples_per_second": 1532.24, + "eval_steps_per_second": 4.788, + "step": 21760 + }, + { + "epoch": 9.07, + "learning_rate": 1e-06, + "loss": 0.1543, + "step": 21780 + }, + { + "epoch": 9.07, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.1922057569026947, + "eval_runtime": 15.6817, + "eval_samples_per_second": 1530.451, + "eval_steps_per_second": 4.783, + "step": 21780 + }, + { + "epoch": 9.08, + "learning_rate": 1e-06, + "loss": 0.1411, + "step": 21800 + }, + { + "epoch": 9.08, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19251596927642822, + "eval_runtime": 15.8744, + "eval_samples_per_second": 1511.864, + "eval_steps_per_second": 4.725, + "step": 21800 + }, + { + "epoch": 9.09, + "learning_rate": 1e-06, + "loss": 0.1686, + "step": 21820 + }, + { + "epoch": 9.09, + "eval_accuracy": 0.937125, + "eval_loss": 0.1892337054014206, + "eval_runtime": 16.2971, + "eval_samples_per_second": 1472.653, + "eval_steps_per_second": 4.602, + "step": 21820 + }, + { + "epoch": 9.1, + "learning_rate": 1e-06, + "loss": 0.1332, + "step": 21840 + }, + { + "epoch": 9.1, + "eval_accuracy": 0.9366666666666666, + "eval_loss": 0.1890205591917038, + "eval_runtime": 15.4417, + "eval_samples_per_second": 1554.228, + "eval_steps_per_second": 4.857, + "step": 21840 + }, + { + "epoch": 9.11, + "learning_rate": 1e-06, + "loss": 0.1217, + "step": 21860 + }, + { + "epoch": 9.11, + "eval_accuracy": 0.9357083333333334, + "eval_loss": 0.1898655891418457, + "eval_runtime": 16.1103, + "eval_samples_per_second": 1489.727, + "eval_steps_per_second": 4.655, + "step": 21860 + }, + { + "epoch": 9.12, + "learning_rate": 1e-06, + "loss": 0.1765, + "step": 21880 + }, + { + "epoch": 9.12, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19171109795570374, + "eval_runtime": 16.5889, + "eval_samples_per_second": 1446.753, + "eval_steps_per_second": 4.521, + "step": 21880 + }, + { + "epoch": 9.12, + "learning_rate": 1e-06, + "loss": 0.1733, + "step": 21900 + }, + { + "epoch": 9.12, + "eval_accuracy": 0.9349583333333333, + "eval_loss": 0.19177711009979248, + "eval_runtime": 17.5638, + "eval_samples_per_second": 1366.449, + "eval_steps_per_second": 4.27, + "step": 21900 + }, + { + "epoch": 9.13, + "learning_rate": 1e-06, + "loss": 0.1424, + "step": 21920 + }, + { + "epoch": 9.13, + "eval_accuracy": 0.934625, + "eval_loss": 0.19338135421276093, + "eval_runtime": 17.301, + "eval_samples_per_second": 1387.204, + "eval_steps_per_second": 4.335, + "step": 21920 + }, + { + "epoch": 9.14, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 21940 + }, + { + "epoch": 9.14, + "eval_accuracy": 0.9360416666666667, + "eval_loss": 0.19054777920246124, + "eval_runtime": 16.9906, + "eval_samples_per_second": 1412.546, + "eval_steps_per_second": 4.414, + "step": 21940 + }, + { + "epoch": 9.15, + "learning_rate": 1e-06, + "loss": 0.1426, + "step": 21960 + }, + { + "epoch": 9.15, + "eval_accuracy": 0.9355, + "eval_loss": 0.1908566951751709, + "eval_runtime": 17.0809, + "eval_samples_per_second": 1405.076, + "eval_steps_per_second": 4.391, + "step": 21960 + }, + { + "epoch": 9.16, + "learning_rate": 1e-06, + "loss": 0.1376, + "step": 21980 + }, + { + "epoch": 9.16, + "eval_accuracy": 0.936375, + "eval_loss": 0.18984192609786987, + "eval_runtime": 16.3175, + "eval_samples_per_second": 1470.81, + "eval_steps_per_second": 4.596, + "step": 21980 + }, + { + "epoch": 9.17, + "learning_rate": 1e-06, + "loss": 0.162, + "step": 22000 + }, + { + "epoch": 9.17, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.1910352259874344, + "eval_runtime": 16.1268, + "eval_samples_per_second": 1488.209, + "eval_steps_per_second": 4.651, + "step": 22000 + }, + { + "epoch": 9.18, + "learning_rate": 1e-06, + "loss": 0.1555, + "step": 22020 + }, + { + "epoch": 9.18, + "eval_accuracy": 0.93475, + "eval_loss": 0.19323420524597168, + "eval_runtime": 16.5391, + "eval_samples_per_second": 1451.109, + "eval_steps_per_second": 4.535, + "step": 22020 + }, + { + "epoch": 9.18, + "learning_rate": 1e-06, + "loss": 0.1613, + "step": 22040 + }, + { + "epoch": 9.18, + "eval_accuracy": 0.9364583333333333, + "eval_loss": 0.1894889771938324, + "eval_runtime": 16.1517, + "eval_samples_per_second": 1485.912, + "eval_steps_per_second": 4.643, + "step": 22040 + }, + { + "epoch": 9.19, + "learning_rate": 1e-06, + "loss": 0.1407, + "step": 22060 + }, + { + "epoch": 9.19, + "eval_accuracy": 0.9354583333333333, + "eval_loss": 0.1917874664068222, + "eval_runtime": 17.701, + "eval_samples_per_second": 1355.857, + "eval_steps_per_second": 4.237, + "step": 22060 + }, + { + "epoch": 9.2, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 22080 + }, + { + "epoch": 9.2, + "eval_accuracy": 0.93425, + "eval_loss": 0.19433771073818207, + "eval_runtime": 17.0414, + "eval_samples_per_second": 1408.333, + "eval_steps_per_second": 4.401, + "step": 22080 + }, + { + "epoch": 9.21, + "learning_rate": 1e-06, + "loss": 0.1435, + "step": 22100 + }, + { + "epoch": 9.21, + "eval_accuracy": 0.93575, + "eval_loss": 0.19121021032333374, + "eval_runtime": 17.2755, + "eval_samples_per_second": 1389.249, + "eval_steps_per_second": 4.341, + "step": 22100 + }, + { + "epoch": 9.22, + "learning_rate": 1e-06, + "loss": 0.1778, + "step": 22120 + }, + { + "epoch": 9.22, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.1902051866054535, + "eval_runtime": 16.2731, + "eval_samples_per_second": 1474.822, + "eval_steps_per_second": 4.609, + "step": 22120 + }, + { + "epoch": 9.22, + "learning_rate": 1e-06, + "loss": 0.1413, + "step": 22140 + }, + { + "epoch": 9.22, + "eval_accuracy": 0.9355416666666667, + "eval_loss": 0.1910925656557083, + "eval_runtime": 16.4198, + "eval_samples_per_second": 1461.653, + "eval_steps_per_second": 4.568, + "step": 22140 + }, + { + "epoch": 9.23, + "learning_rate": 1e-06, + "loss": 0.1772, + "step": 22160 + }, + { + "epoch": 9.23, + "eval_accuracy": 0.9365, + "eval_loss": 0.1890135258436203, + "eval_runtime": 16.5257, + "eval_samples_per_second": 1452.285, + "eval_steps_per_second": 4.538, + "step": 22160 + }, + { + "epoch": 9.24, + "learning_rate": 1e-06, + "loss": 0.1545, + "step": 22180 + }, + { + "epoch": 9.24, + "eval_accuracy": 0.9360416666666667, + "eval_loss": 0.19075444340705872, + "eval_runtime": 16.9041, + "eval_samples_per_second": 1419.773, + "eval_steps_per_second": 4.437, + "step": 22180 + }, + { + "epoch": 9.25, + "learning_rate": 1e-06, + "loss": 0.1434, + "step": 22200 + }, + { + "epoch": 9.25, + "eval_accuracy": 0.9331666666666667, + "eval_loss": 0.19980815052986145, + "eval_runtime": 16.2294, + "eval_samples_per_second": 1478.797, + "eval_steps_per_second": 4.621, + "step": 22200 + }, + { + "epoch": 9.26, + "learning_rate": 1e-06, + "loss": 0.1981, + "step": 22220 + }, + { + "epoch": 9.26, + "eval_accuracy": 0.9367916666666667, + "eval_loss": 0.1893180012702942, + "eval_runtime": 16.4467, + "eval_samples_per_second": 1459.263, + "eval_steps_per_second": 4.56, + "step": 22220 + }, + { + "epoch": 9.27, + "learning_rate": 1e-06, + "loss": 0.1585, + "step": 22240 + }, + { + "epoch": 9.27, + "eval_accuracy": 0.936, + "eval_loss": 0.18898563086986542, + "eval_runtime": 16.2988, + "eval_samples_per_second": 1472.504, + "eval_steps_per_second": 4.602, + "step": 22240 + }, + { + "epoch": 9.28, + "learning_rate": 1e-06, + "loss": 0.1361, + "step": 22260 + }, + { + "epoch": 9.28, + "eval_accuracy": 0.9357916666666667, + "eval_loss": 0.1903260499238968, + "eval_runtime": 16.4778, + "eval_samples_per_second": 1456.508, + "eval_steps_per_second": 4.552, + "step": 22260 + }, + { + "epoch": 9.28, + "learning_rate": 1e-06, + "loss": 0.1666, + "step": 22280 + }, + { + "epoch": 9.28, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.19420108199119568, + "eval_runtime": 16.7685, + "eval_samples_per_second": 1431.255, + "eval_steps_per_second": 4.473, + "step": 22280 + }, + { + "epoch": 9.29, + "learning_rate": 1e-06, + "loss": 0.1855, + "step": 22300 + }, + { + "epoch": 9.29, + "eval_accuracy": 0.9355416666666667, + "eval_loss": 0.19144552946090698, + "eval_runtime": 16.1429, + "eval_samples_per_second": 1486.719, + "eval_steps_per_second": 4.646, + "step": 22300 + }, + { + "epoch": 9.3, + "learning_rate": 1e-06, + "loss": 0.126, + "step": 22320 + }, + { + "epoch": 9.3, + "eval_accuracy": 0.935875, + "eval_loss": 0.1904752403497696, + "eval_runtime": 16.2377, + "eval_samples_per_second": 1478.039, + "eval_steps_per_second": 4.619, + "step": 22320 + }, + { + "epoch": 9.31, + "learning_rate": 1e-06, + "loss": 0.1244, + "step": 22340 + }, + { + "epoch": 9.31, + "eval_accuracy": 0.9362916666666666, + "eval_loss": 0.19049489498138428, + "eval_runtime": 16.6183, + "eval_samples_per_second": 1444.194, + "eval_steps_per_second": 4.513, + "step": 22340 + }, + { + "epoch": 9.32, + "learning_rate": 1e-06, + "loss": 0.1737, + "step": 22360 + }, + { + "epoch": 9.32, + "eval_accuracy": 0.93575, + "eval_loss": 0.19159464538097382, + "eval_runtime": 16.5579, + "eval_samples_per_second": 1449.463, + "eval_steps_per_second": 4.53, + "step": 22360 + }, + { + "epoch": 9.32, + "learning_rate": 1e-06, + "loss": 0.1568, + "step": 22380 + }, + { + "epoch": 9.32, + "eval_accuracy": 0.9359166666666666, + "eval_loss": 0.19013747572898865, + "eval_runtime": 16.3587, + "eval_samples_per_second": 1467.107, + "eval_steps_per_second": 4.585, + "step": 22380 + }, + { + "epoch": 9.33, + "learning_rate": 1e-06, + "loss": 0.1501, + "step": 22400 + }, + { + "epoch": 9.33, + "eval_accuracy": 0.936625, + "eval_loss": 0.1898564100265503, + "eval_runtime": 16.4813, + "eval_samples_per_second": 1456.2, + "eval_steps_per_second": 4.551, + "step": 22400 + }, + { + "epoch": 9.34, + "learning_rate": 1e-06, + "loss": 0.133, + "step": 22420 + }, + { + "epoch": 9.34, + "eval_accuracy": 0.935875, + "eval_loss": 0.19129687547683716, + "eval_runtime": 17.9192, + "eval_samples_per_second": 1339.343, + "eval_steps_per_second": 4.185, + "step": 22420 + }, + { + "epoch": 9.35, + "learning_rate": 1e-06, + "loss": 0.1577, + "step": 22440 + }, + { + "epoch": 9.35, + "eval_accuracy": 0.9364583333333333, + "eval_loss": 0.19107209146022797, + "eval_runtime": 17.3207, + "eval_samples_per_second": 1385.625, + "eval_steps_per_second": 4.33, + "step": 22440 + }, + { + "epoch": 9.36, + "learning_rate": 1e-06, + "loss": 0.1463, + "step": 22460 + }, + { + "epoch": 9.36, + "eval_accuracy": 0.935875, + "eval_loss": 0.19182544946670532, + "eval_runtime": 17.9384, + "eval_samples_per_second": 1337.915, + "eval_steps_per_second": 4.181, + "step": 22460 + }, + { + "epoch": 9.37, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 22480 + }, + { + "epoch": 9.37, + "eval_accuracy": 0.93225, + "eval_loss": 0.20088885724544525, + "eval_runtime": 17.7162, + "eval_samples_per_second": 1354.693, + "eval_steps_per_second": 4.233, + "step": 22480 + }, + { + "epoch": 9.38, + "learning_rate": 1e-06, + "loss": 0.1537, + "step": 22500 + }, + { + "epoch": 9.38, + "eval_accuracy": 0.9361666666666667, + "eval_loss": 0.19151511788368225, + "eval_runtime": 17.6, + "eval_samples_per_second": 1363.639, + "eval_steps_per_second": 4.261, + "step": 22500 + }, + { + "epoch": 9.38, + "learning_rate": 1e-06, + "loss": 0.1463, + "step": 22520 + }, + { + "epoch": 9.38, + "eval_accuracy": 0.933875, + "eval_loss": 0.1945749670267105, + "eval_runtime": 15.7657, + "eval_samples_per_second": 1522.29, + "eval_steps_per_second": 4.757, + "step": 22520 + }, + { + "epoch": 9.39, + "learning_rate": 1e-06, + "loss": 0.1763, + "step": 22540 + }, + { + "epoch": 9.39, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.19165822863578796, + "eval_runtime": 17.4199, + "eval_samples_per_second": 1377.737, + "eval_steps_per_second": 4.305, + "step": 22540 + }, + { + "epoch": 9.4, + "learning_rate": 1e-06, + "loss": 0.1375, + "step": 22560 + }, + { + "epoch": 9.4, + "eval_accuracy": 0.935625, + "eval_loss": 0.19137227535247803, + "eval_runtime": 17.3107, + "eval_samples_per_second": 1386.428, + "eval_steps_per_second": 4.333, + "step": 22560 + }, + { + "epoch": 9.41, + "learning_rate": 1e-06, + "loss": 0.1616, + "step": 22580 + }, + { + "epoch": 9.41, + "eval_accuracy": 0.93575, + "eval_loss": 0.19043239951133728, + "eval_runtime": 17.1156, + "eval_samples_per_second": 1402.228, + "eval_steps_per_second": 4.382, + "step": 22580 + }, + { + "epoch": 9.42, + "learning_rate": 1e-06, + "loss": 0.1809, + "step": 22600 + }, + { + "epoch": 9.42, + "eval_accuracy": 0.9346666666666666, + "eval_loss": 0.19357599318027496, + "eval_runtime": 17.8709, + "eval_samples_per_second": 1342.962, + "eval_steps_per_second": 4.197, + "step": 22600 + }, + { + "epoch": 9.43, + "learning_rate": 1e-06, + "loss": 0.1436, + "step": 22620 + }, + { + "epoch": 9.43, + "eval_accuracy": 0.935625, + "eval_loss": 0.18992185592651367, + "eval_runtime": 16.5938, + "eval_samples_per_second": 1446.323, + "eval_steps_per_second": 4.52, + "step": 22620 + }, + { + "epoch": 9.43, + "learning_rate": 1e-06, + "loss": 0.1831, + "step": 22640 + }, + { + "epoch": 9.43, + "eval_accuracy": 0.9361666666666667, + "eval_loss": 0.19047614932060242, + "eval_runtime": 17.0882, + "eval_samples_per_second": 1404.476, + "eval_steps_per_second": 4.389, + "step": 22640 + }, + { + "epoch": 9.44, + "learning_rate": 1e-06, + "loss": 0.135, + "step": 22660 + }, + { + "epoch": 9.44, + "eval_accuracy": 0.9363333333333334, + "eval_loss": 0.1894386261701584, + "eval_runtime": 17.1834, + "eval_samples_per_second": 1396.7, + "eval_steps_per_second": 4.365, + "step": 22660 + }, + { + "epoch": 9.45, + "learning_rate": 1e-06, + "loss": 0.198, + "step": 22680 + }, + { + "epoch": 9.45, + "eval_accuracy": 0.93575, + "eval_loss": 0.18925711512565613, + "eval_runtime": 16.7997, + "eval_samples_per_second": 1428.594, + "eval_steps_per_second": 4.464, + "step": 22680 + }, + { + "epoch": 9.46, + "learning_rate": 1e-06, + "loss": 0.1595, + "step": 22700 + }, + { + "epoch": 9.46, + "eval_accuracy": 0.936, + "eval_loss": 0.19023475050926208, + "eval_runtime": 17.1467, + "eval_samples_per_second": 1399.688, + "eval_steps_per_second": 4.374, + "step": 22700 + }, + { + "epoch": 9.47, + "learning_rate": 1e-06, + "loss": 0.1594, + "step": 22720 + }, + { + "epoch": 9.47, + "eval_accuracy": 0.9362083333333333, + "eval_loss": 0.18937428295612335, + "eval_runtime": 16.3859, + "eval_samples_per_second": 1464.678, + "eval_steps_per_second": 4.577, + "step": 22720 + }, + { + "epoch": 9.47, + "learning_rate": 1e-06, + "loss": 0.1741, + "step": 22740 + }, + { + "epoch": 9.47, + "eval_accuracy": 0.9365416666666667, + "eval_loss": 0.18718321621418, + "eval_runtime": 16.7434, + "eval_samples_per_second": 1433.4, + "eval_steps_per_second": 4.479, + "step": 22740 + }, + { + "epoch": 9.48, + "learning_rate": 1e-06, + "loss": 0.1526, + "step": 22760 + }, + { + "epoch": 9.48, + "eval_accuracy": 0.936125, + "eval_loss": 0.18912020325660706, + "eval_runtime": 16.0636, + "eval_samples_per_second": 1494.062, + "eval_steps_per_second": 4.669, + "step": 22760 + }, + { + "epoch": 9.49, + "learning_rate": 1e-06, + "loss": 0.1785, + "step": 22780 + }, + { + "epoch": 9.49, + "eval_accuracy": 0.9362083333333333, + "eval_loss": 0.1892082542181015, + "eval_runtime": 16.7318, + "eval_samples_per_second": 1434.395, + "eval_steps_per_second": 4.482, + "step": 22780 + }, + { + "epoch": 9.5, + "learning_rate": 1e-06, + "loss": 0.1487, + "step": 22800 + }, + { + "epoch": 9.5, + "eval_accuracy": 0.93375, + "eval_loss": 0.1960740089416504, + "eval_runtime": 16.2354, + "eval_samples_per_second": 1478.256, + "eval_steps_per_second": 4.62, + "step": 22800 + }, + { + "epoch": 9.51, + "learning_rate": 1e-06, + "loss": 0.151, + "step": 22820 + }, + { + "epoch": 9.51, + "eval_accuracy": 0.9355, + "eval_loss": 0.19206097722053528, + "eval_runtime": 16.4046, + "eval_samples_per_second": 1463.004, + "eval_steps_per_second": 4.572, + "step": 22820 + }, + { + "epoch": 9.52, + "learning_rate": 1e-06, + "loss": 0.1641, + "step": 22840 + }, + { + "epoch": 9.52, + "eval_accuracy": 0.9347083333333334, + "eval_loss": 0.1922009140253067, + "eval_runtime": 16.3419, + "eval_samples_per_second": 1468.617, + "eval_steps_per_second": 4.589, + "step": 22840 + }, + { + "epoch": 9.53, + "learning_rate": 1e-06, + "loss": 0.1403, + "step": 22860 + }, + { + "epoch": 9.53, + "eval_accuracy": 0.936625, + "eval_loss": 0.18889163434505463, + "eval_runtime": 16.6017, + "eval_samples_per_second": 1445.631, + "eval_steps_per_second": 4.518, + "step": 22860 + }, + { + "epoch": 9.53, + "learning_rate": 1e-06, + "loss": 0.182, + "step": 22880 + }, + { + "epoch": 9.53, + "eval_accuracy": 0.9352083333333333, + "eval_loss": 0.19116635620594025, + "eval_runtime": 16.607, + "eval_samples_per_second": 1445.171, + "eval_steps_per_second": 4.516, + "step": 22880 + }, + { + "epoch": 9.54, + "learning_rate": 1e-06, + "loss": 0.1655, + "step": 22900 + }, + { + "epoch": 9.54, + "eval_accuracy": 0.9345833333333333, + "eval_loss": 0.1919187605381012, + "eval_runtime": 16.4494, + "eval_samples_per_second": 1459.021, + "eval_steps_per_second": 4.559, + "step": 22900 + }, + { + "epoch": 9.55, + "learning_rate": 1e-06, + "loss": 0.1651, + "step": 22920 + }, + { + "epoch": 9.55, + "eval_accuracy": 0.9361666666666667, + "eval_loss": 0.1887436956167221, + "eval_runtime": 16.2982, + "eval_samples_per_second": 1472.556, + "eval_steps_per_second": 4.602, + "step": 22920 + }, + { + "epoch": 9.56, + "learning_rate": 1e-06, + "loss": 0.1493, + "step": 22940 + }, + { + "epoch": 9.56, + "eval_accuracy": 0.9370416666666667, + "eval_loss": 0.18585175275802612, + "eval_runtime": 16.7173, + "eval_samples_per_second": 1435.637, + "eval_steps_per_second": 4.486, + "step": 22940 + }, + { + "epoch": 9.57, + "learning_rate": 1e-06, + "loss": 0.1563, + "step": 22960 + }, + { + "epoch": 9.57, + "eval_accuracy": 0.9357916666666667, + "eval_loss": 0.18925468623638153, + "eval_runtime": 17.5796, + "eval_samples_per_second": 1365.215, + "eval_steps_per_second": 4.266, + "step": 22960 + }, + { + "epoch": 9.57, + "learning_rate": 1e-06, + "loss": 0.1498, + "step": 22980 + }, + { + "epoch": 9.57, + "eval_accuracy": 0.93575, + "eval_loss": 0.19032344222068787, + "eval_runtime": 15.9593, + "eval_samples_per_second": 1503.826, + "eval_steps_per_second": 4.699, + "step": 22980 + }, + { + "epoch": 9.58, + "learning_rate": 1e-06, + "loss": 0.131, + "step": 23000 + }, + { + "epoch": 9.58, + "eval_accuracy": 0.9360833333333334, + "eval_loss": 0.18864993751049042, + "eval_runtime": 15.7391, + "eval_samples_per_second": 1524.867, + "eval_steps_per_second": 4.765, + "step": 23000 + }, + { + "epoch": 9.59, + "learning_rate": 1e-06, + "loss": 0.1588, + "step": 23020 + }, + { + "epoch": 9.59, + "eval_accuracy": 0.9375416666666667, + "eval_loss": 0.18759743869304657, + "eval_runtime": 82.369, + "eval_samples_per_second": 291.372, + "eval_steps_per_second": 0.911, + "step": 23020 + }, + { + "epoch": 9.6, + "learning_rate": 1e-06, + "loss": 0.1555, + "step": 23040 + }, + { + "epoch": 9.6, + "eval_accuracy": 0.9354583333333333, + "eval_loss": 0.1895967274904251, + "eval_runtime": 16.3955, + "eval_samples_per_second": 1463.818, + "eval_steps_per_second": 4.574, + "step": 23040 + }, + { + "epoch": 9.61, + "learning_rate": 1e-06, + "loss": 0.1625, + "step": 23060 + }, + { + "epoch": 9.61, + "eval_accuracy": 0.9355416666666667, + "eval_loss": 0.1896596997976303, + "eval_runtime": 17.4896, + "eval_samples_per_second": 1372.245, + "eval_steps_per_second": 4.288, + "step": 23060 + }, + { + "epoch": 9.62, + "learning_rate": 1e-06, + "loss": 0.1183, + "step": 23080 + }, + { + "epoch": 9.62, + "eval_accuracy": 0.93425, + "eval_loss": 0.19340163469314575, + "eval_runtime": 17.2768, + "eval_samples_per_second": 1389.142, + "eval_steps_per_second": 4.341, + "step": 23080 + }, + { + "epoch": 9.62, + "learning_rate": 1e-06, + "loss": 0.1668, + "step": 23100 + }, + { + "epoch": 9.62, + "eval_accuracy": 0.93425, + "eval_loss": 0.1944810450077057, + "eval_runtime": 17.3938, + "eval_samples_per_second": 1379.802, + "eval_steps_per_second": 4.312, + "step": 23100 + }, + { + "epoch": 9.63, + "learning_rate": 1e-06, + "loss": 0.1514, + "step": 23120 + }, + { + "epoch": 9.63, + "eval_accuracy": 0.9350416666666667, + "eval_loss": 0.19056230783462524, + "eval_runtime": 17.3545, + "eval_samples_per_second": 1382.923, + "eval_steps_per_second": 4.322, + "step": 23120 + }, + { + "epoch": 9.64, + "learning_rate": 1e-06, + "loss": 0.0979, + "step": 23140 + }, + { + "epoch": 9.64, + "eval_accuracy": 0.935375, + "eval_loss": 0.18961113691329956, + "eval_runtime": 17.9278, + "eval_samples_per_second": 1338.707, + "eval_steps_per_second": 4.183, + "step": 23140 + }, + { + "epoch": 9.65, + "learning_rate": 1e-06, + "loss": 0.1414, + "step": 23160 + }, + { + "epoch": 9.65, + "eval_accuracy": 0.93675, + "eval_loss": 0.18856357038021088, + "eval_runtime": 17.2651, + "eval_samples_per_second": 1390.087, + "eval_steps_per_second": 4.344, + "step": 23160 + }, + { + "epoch": 9.66, + "learning_rate": 1e-06, + "loss": 0.1359, + "step": 23180 + }, + { + "epoch": 9.66, + "eval_accuracy": 0.937625, + "eval_loss": 0.18765127658843994, + "eval_runtime": 17.5512, + "eval_samples_per_second": 1367.431, + "eval_steps_per_second": 4.273, + "step": 23180 + }, + { + "epoch": 9.67, + "learning_rate": 1e-06, + "loss": 0.1342, + "step": 23200 + }, + { + "epoch": 9.67, + "eval_accuracy": 0.9369166666666666, + "eval_loss": 0.1894044429063797, + "eval_runtime": 17.3459, + "eval_samples_per_second": 1383.612, + "eval_steps_per_second": 4.324, + "step": 23200 + }, + { + "epoch": 9.68, + "learning_rate": 1e-06, + "loss": 0.156, + "step": 23220 + }, + { + "epoch": 9.68, + "eval_accuracy": 0.9369166666666666, + "eval_loss": 0.18876871466636658, + "eval_runtime": 18.1165, + "eval_samples_per_second": 1324.757, + "eval_steps_per_second": 4.14, + "step": 23220 + }, + { + "epoch": 9.68, + "learning_rate": 1e-06, + "loss": 0.1527, + "step": 23240 + }, + { + "epoch": 9.68, + "eval_accuracy": 0.937375, + "eval_loss": 0.18748903274536133, + "eval_runtime": 17.4524, + "eval_samples_per_second": 1375.169, + "eval_steps_per_second": 4.297, + "step": 23240 + }, + { + "epoch": 9.69, + "learning_rate": 1e-06, + "loss": 0.1608, + "step": 23260 + }, + { + "epoch": 9.69, + "eval_accuracy": 0.9370416666666667, + "eval_loss": 0.18767432868480682, + "eval_runtime": 17.2496, + "eval_samples_per_second": 1391.339, + "eval_steps_per_second": 4.348, + "step": 23260 + }, + { + "epoch": 9.7, + "learning_rate": 1e-06, + "loss": 0.1576, + "step": 23280 + }, + { + "epoch": 9.7, + "eval_accuracy": 0.9365, + "eval_loss": 0.18888776004314423, + "eval_runtime": 17.5705, + "eval_samples_per_second": 1365.923, + "eval_steps_per_second": 4.269, + "step": 23280 + }, + { + "epoch": 9.71, + "learning_rate": 1e-06, + "loss": 0.1618, + "step": 23300 + }, + { + "epoch": 9.71, + "eval_accuracy": 0.9361666666666667, + "eval_loss": 0.19041982293128967, + "eval_runtime": 16.6527, + "eval_samples_per_second": 1441.209, + "eval_steps_per_second": 4.504, + "step": 23300 + }, + { + "epoch": 9.72, + "learning_rate": 1e-06, + "loss": 0.1551, + "step": 23320 + }, + { + "epoch": 9.72, + "eval_accuracy": 0.9355833333333333, + "eval_loss": 0.19049932062625885, + "eval_runtime": 16.922, + "eval_samples_per_second": 1418.271, + "eval_steps_per_second": 4.432, + "step": 23320 + }, + { + "epoch": 9.72, + "learning_rate": 1e-06, + "loss": 0.1311, + "step": 23340 + }, + { + "epoch": 9.72, + "eval_accuracy": 0.9359166666666666, + "eval_loss": 0.18927621841430664, + "eval_runtime": 16.8033, + "eval_samples_per_second": 1428.291, + "eval_steps_per_second": 4.463, + "step": 23340 + }, + { + "epoch": 9.73, + "learning_rate": 1e-06, + "loss": 0.128, + "step": 23360 + }, + { + "epoch": 9.73, + "eval_accuracy": 0.93625, + "eval_loss": 0.1896050125360489, + "eval_runtime": 16.664, + "eval_samples_per_second": 1440.231, + "eval_steps_per_second": 4.501, + "step": 23360 + }, + { + "epoch": 9.74, + "learning_rate": 1e-06, + "loss": 0.1733, + "step": 23380 + }, + { + "epoch": 9.74, + "eval_accuracy": 0.934375, + "eval_loss": 0.19381864368915558, + "eval_runtime": 17.6268, + "eval_samples_per_second": 1361.562, + "eval_steps_per_second": 4.255, + "step": 23380 + }, + { + "epoch": 9.75, + "learning_rate": 1e-06, + "loss": 0.1546, + "step": 23400 + }, + { + "epoch": 9.75, + "eval_accuracy": 0.93575, + "eval_loss": 0.19130919873714447, + "eval_runtime": 17.4959, + "eval_samples_per_second": 1371.752, + "eval_steps_per_second": 4.287, + "step": 23400 + }, + { + "epoch": 9.76, + "learning_rate": 1e-06, + "loss": 0.2157, + "step": 23420 + }, + { + "epoch": 9.76, + "eval_accuracy": 0.9359166666666666, + "eval_loss": 0.18996796011924744, + "eval_runtime": 17.4322, + "eval_samples_per_second": 1376.762, + "eval_steps_per_second": 4.302, + "step": 23420 + }, + { + "epoch": 9.77, + "learning_rate": 1e-06, + "loss": 0.1345, + "step": 23440 + }, + { + "epoch": 9.77, + "eval_accuracy": 0.93425, + "eval_loss": 0.19452691078186035, + "eval_runtime": 17.4799, + "eval_samples_per_second": 1373.007, + "eval_steps_per_second": 4.291, + "step": 23440 + }, + { + "epoch": 9.78, + "learning_rate": 1e-06, + "loss": 0.1598, + "step": 23460 + }, + { + "epoch": 9.78, + "eval_accuracy": 0.9352916666666666, + "eval_loss": 0.19014814496040344, + "eval_runtime": 16.5871, + "eval_samples_per_second": 1446.91, + "eval_steps_per_second": 4.522, + "step": 23460 + }, + { + "epoch": 9.78, + "learning_rate": 1e-06, + "loss": 0.1189, + "step": 23480 + }, + { + "epoch": 9.78, + "eval_accuracy": 0.936875, + "eval_loss": 0.1875523328781128, + "eval_runtime": 15.4296, + "eval_samples_per_second": 1555.449, + "eval_steps_per_second": 4.861, + "step": 23480 + }, + { + "epoch": 9.79, + "learning_rate": 1e-06, + "loss": 0.1406, + "step": 23500 + }, + { + "epoch": 9.79, + "eval_accuracy": 0.9364583333333333, + "eval_loss": 0.19084198772907257, + "eval_runtime": 15.6993, + "eval_samples_per_second": 1528.734, + "eval_steps_per_second": 4.777, + "step": 23500 + }, + { + "epoch": 9.8, + "learning_rate": 1e-06, + "loss": 0.1624, + "step": 23520 + }, + { + "epoch": 9.8, + "eval_accuracy": 0.9345, + "eval_loss": 0.19372619688510895, + "eval_runtime": 15.5172, + "eval_samples_per_second": 1546.671, + "eval_steps_per_second": 4.833, + "step": 23520 + }, + { + "epoch": 9.81, + "learning_rate": 1e-06, + "loss": 0.1422, + "step": 23540 + }, + { + "epoch": 9.81, + "eval_accuracy": 0.9336666666666666, + "eval_loss": 0.19716989994049072, + "eval_runtime": 15.6871, + "eval_samples_per_second": 1529.92, + "eval_steps_per_second": 4.781, + "step": 23540 + }, + { + "epoch": 9.82, + "learning_rate": 1e-06, + "loss": 0.149, + "step": 23560 + }, + { + "epoch": 9.82, + "eval_accuracy": 0.9368333333333333, + "eval_loss": 0.1898457407951355, + "eval_runtime": 15.5314, + "eval_samples_per_second": 1545.261, + "eval_steps_per_second": 4.829, + "step": 23560 + }, + { + "epoch": 9.82, + "learning_rate": 1e-06, + "loss": 0.1635, + "step": 23580 + }, + { + "epoch": 9.82, + "eval_accuracy": 0.935625, + "eval_loss": 0.19061589241027832, + "eval_runtime": 15.9229, + "eval_samples_per_second": 1507.261, + "eval_steps_per_second": 4.71, + "step": 23580 + }, + { + "epoch": 9.83, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 23600 + }, + { + "epoch": 9.83, + "eval_accuracy": 0.9361666666666667, + "eval_loss": 0.18975524604320526, + "eval_runtime": 15.3175, + "eval_samples_per_second": 1566.837, + "eval_steps_per_second": 4.896, + "step": 23600 + }, + { + "epoch": 9.84, + "learning_rate": 1e-06, + "loss": 0.1577, + "step": 23620 + }, + { + "epoch": 9.84, + "eval_accuracy": 0.9354166666666667, + "eval_loss": 0.1917409747838974, + "eval_runtime": 15.4166, + "eval_samples_per_second": 1556.767, + "eval_steps_per_second": 4.865, + "step": 23620 + }, + { + "epoch": 9.85, + "learning_rate": 1e-06, + "loss": 0.1604, + "step": 23640 + }, + { + "epoch": 9.85, + "eval_accuracy": 0.9365, + "eval_loss": 0.19033245742321014, + "eval_runtime": 15.7074, + "eval_samples_per_second": 1527.939, + "eval_steps_per_second": 4.775, + "step": 23640 + }, + { + "epoch": 9.86, + "learning_rate": 1e-06, + "loss": 0.1477, + "step": 23660 + }, + { + "epoch": 9.86, + "eval_accuracy": 0.9347916666666667, + "eval_loss": 0.19305865466594696, + "eval_runtime": 15.5818, + "eval_samples_per_second": 1540.26, + "eval_steps_per_second": 4.813, + "step": 23660 + }, + { + "epoch": 9.87, + "learning_rate": 1e-06, + "loss": 0.1374, + "step": 23680 + }, + { + "epoch": 9.87, + "eval_accuracy": 0.9369166666666666, + "eval_loss": 0.18935944139957428, + "eval_runtime": 15.6633, + "eval_samples_per_second": 1532.247, + "eval_steps_per_second": 4.788, + "step": 23680 + }, + { + "epoch": 9.88, + "learning_rate": 1e-06, + "loss": 0.1524, + "step": 23700 + }, + { + "epoch": 9.88, + "eval_accuracy": 0.93625, + "eval_loss": 0.19033636152744293, + "eval_runtime": 15.5153, + "eval_samples_per_second": 1546.863, + "eval_steps_per_second": 4.834, + "step": 23700 + }, + { + "epoch": 9.88, + "learning_rate": 1e-06, + "loss": 0.1899, + "step": 23720 + }, + { + "epoch": 9.88, + "eval_accuracy": 0.937, + "eval_loss": 0.18940836191177368, + "eval_runtime": 15.9167, + "eval_samples_per_second": 1507.855, + "eval_steps_per_second": 4.712, + "step": 23720 + }, + { + "epoch": 9.89, + "learning_rate": 1e-06, + "loss": 0.1542, + "step": 23740 + }, + { + "epoch": 9.89, + "eval_accuracy": 0.9356666666666666, + "eval_loss": 0.1927276849746704, + "eval_runtime": 15.6406, + "eval_samples_per_second": 1534.465, + "eval_steps_per_second": 4.795, + "step": 23740 + }, + { + "epoch": 9.9, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 23760 + }, + { + "epoch": 9.9, + "eval_accuracy": 0.936, + "eval_loss": 0.19114099442958832, + "eval_runtime": 16.1487, + "eval_samples_per_second": 1486.188, + "eval_steps_per_second": 4.644, + "step": 23760 + }, + { + "epoch": 9.91, + "learning_rate": 1e-06, + "loss": 0.1555, + "step": 23780 + }, + { + "epoch": 9.91, + "eval_accuracy": 0.93725, + "eval_loss": 0.18706054985523224, + "eval_runtime": 15.697, + "eval_samples_per_second": 1528.954, + "eval_steps_per_second": 4.778, + "step": 23780 + }, + { + "epoch": 9.92, + "learning_rate": 1e-06, + "loss": 0.1723, + "step": 23800 + }, + { + "epoch": 9.92, + "eval_accuracy": 0.9372916666666666, + "eval_loss": 0.18747729063034058, + "eval_runtime": 15.4803, + "eval_samples_per_second": 1550.356, + "eval_steps_per_second": 4.845, + "step": 23800 + }, + { + "epoch": 9.93, + "learning_rate": 1e-06, + "loss": 0.1736, + "step": 23820 + }, + { + "epoch": 9.93, + "eval_accuracy": 0.935875, + "eval_loss": 0.19002044200897217, + "eval_runtime": 15.9218, + "eval_samples_per_second": 1507.368, + "eval_steps_per_second": 4.711, + "step": 23820 + }, + { + "epoch": 9.93, + "learning_rate": 1e-06, + "loss": 0.1354, + "step": 23840 + }, + { + "epoch": 9.93, + "eval_accuracy": 0.9373333333333334, + "eval_loss": 0.18683619797229767, + "eval_runtime": 15.4111, + "eval_samples_per_second": 1557.318, + "eval_steps_per_second": 4.867, + "step": 23840 + }, + { + "epoch": 9.94, + "learning_rate": 1e-06, + "loss": 0.1798, + "step": 23860 + }, + { + "epoch": 9.94, + "eval_accuracy": 0.937375, + "eval_loss": 0.1869436800479889, + "eval_runtime": 16.0901, + "eval_samples_per_second": 1491.604, + "eval_steps_per_second": 4.661, + "step": 23860 + }, + { + "epoch": 9.95, + "learning_rate": 1e-06, + "loss": 0.1258, + "step": 23880 + }, + { + "epoch": 9.95, + "eval_accuracy": 0.937, + "eval_loss": 0.18814752995967865, + "eval_runtime": 15.464, + "eval_samples_per_second": 1551.992, + "eval_steps_per_second": 4.85, + "step": 23880 + }, + { + "epoch": 9.96, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 23900 + }, + { + "epoch": 9.96, + "eval_accuracy": 0.9367083333333334, + "eval_loss": 0.18954576551914215, + "eval_runtime": 15.4402, + "eval_samples_per_second": 1554.384, + "eval_steps_per_second": 4.857, + "step": 23900 + }, + { + "epoch": 9.97, + "learning_rate": 1e-06, + "loss": 0.1307, + "step": 23920 + }, + { + "epoch": 9.97, + "eval_accuracy": 0.93675, + "eval_loss": 0.189622163772583, + "eval_runtime": 15.4543, + "eval_samples_per_second": 1552.963, + "eval_steps_per_second": 4.853, + "step": 23920 + }, + { + "epoch": 9.97, + "learning_rate": 1e-06, + "loss": 0.1579, + "step": 23940 + }, + { + "epoch": 9.97, + "eval_accuracy": 0.937125, + "eval_loss": 0.1884654462337494, + "eval_runtime": 15.6522, + "eval_samples_per_second": 1533.332, + "eval_steps_per_second": 4.792, + "step": 23940 + }, + { + "epoch": 9.98, + "learning_rate": 1e-06, + "loss": 0.1667, + "step": 23960 + }, + { + "epoch": 9.98, + "eval_accuracy": 0.937, + "eval_loss": 0.18847951292991638, + "eval_runtime": 15.9741, + "eval_samples_per_second": 1502.433, + "eval_steps_per_second": 4.695, + "step": 23960 + }, + { + "epoch": 9.99, + "learning_rate": 1e-06, + "loss": 0.1407, + "step": 23980 + }, + { + "epoch": 9.99, + "eval_accuracy": 0.9372916666666666, + "eval_loss": 0.1892939805984497, + "eval_runtime": 15.4305, + "eval_samples_per_second": 1555.363, + "eval_steps_per_second": 4.861, + "step": 23980 + }, + { + "epoch": 10.0, + "learning_rate": 1e-06, + "loss": 0.1624, + "step": 24000 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.9372083333333333, + "eval_loss": 0.18687449395656586, + "eval_runtime": 15.6942, + "eval_samples_per_second": 1529.227, + "eval_steps_per_second": 4.779, + "step": 24000 + } + ], + "max_steps": 24000, + "num_train_epochs": 10, + "total_flos": 3.005791444826016e+16, + "trial_name": null, + "trial_params": null +}