{ "best_metric": 0.0923289805650711, "best_model_checkpoint": "./ViT-NIH-Chest-X-ray-dataset-small/checkpoint-2100", "epoch": 8.0, "eval_steps": 100, "global_step": 2168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03690036900369004, "grad_norm": 0.38048115372657776, "learning_rate": 0.00019907749077490775, "loss": 0.5131, "step": 10 }, { "epoch": 0.07380073800738007, "grad_norm": 0.3416444957256317, "learning_rate": 0.00019815498154981552, "loss": 0.3661, "step": 20 }, { "epoch": 0.11070110701107011, "grad_norm": 0.268430233001709, "learning_rate": 0.00019723247232472326, "loss": 0.3122, "step": 30 }, { "epoch": 0.14760147601476015, "grad_norm": 0.20281535387039185, "learning_rate": 0.000196309963099631, "loss": 0.2862, "step": 40 }, { "epoch": 0.18450184501845018, "grad_norm": 0.17638804018497467, "learning_rate": 0.00019538745387453877, "loss": 0.2582, "step": 50 }, { "epoch": 0.22140221402214022, "grad_norm": 0.13983863592147827, "learning_rate": 0.00019446494464944652, "loss": 0.2621, "step": 60 }, { "epoch": 0.25830258302583026, "grad_norm": 0.1306193470954895, "learning_rate": 0.00019354243542435426, "loss": 0.2361, "step": 70 }, { "epoch": 0.2952029520295203, "grad_norm": 0.12178371101617813, "learning_rate": 0.000192619926199262, "loss": 0.2271, "step": 80 }, { "epoch": 0.33210332103321033, "grad_norm": 0.09625957161188126, "learning_rate": 0.00019169741697416974, "loss": 0.2275, "step": 90 }, { "epoch": 0.36900369003690037, "grad_norm": 0.12183211743831635, "learning_rate": 0.00019077490774907748, "loss": 0.2128, "step": 100 }, { "epoch": 0.36900369003690037, "eval_accuracy": 0.0, "eval_loss": 0.20924170315265656, "eval_runtime": 51.5235, "eval_samples_per_second": 33.577, "eval_steps_per_second": 4.212, "step": 100 }, { "epoch": 0.4059040590405904, "grad_norm": 0.08246201276779175, "learning_rate": 0.00018985239852398525, "loss": 0.2089, "step": 110 }, { "epoch": 0.44280442804428044, "grad_norm": 0.11126121133565903, "learning_rate": 0.000188929889298893, "loss": 0.2147, "step": 120 }, { "epoch": 0.4797047970479705, "grad_norm": 0.11177172511816025, "learning_rate": 0.00018800738007380074, "loss": 0.2044, "step": 130 }, { "epoch": 0.5166051660516605, "grad_norm": 0.10021921247243881, "learning_rate": 0.0001870848708487085, "loss": 0.1839, "step": 140 }, { "epoch": 0.5535055350553506, "grad_norm": 0.185609832406044, "learning_rate": 0.00018616236162361625, "loss": 0.2209, "step": 150 }, { "epoch": 0.5904059040590406, "grad_norm": 0.17880532145500183, "learning_rate": 0.000185239852398524, "loss": 0.2023, "step": 160 }, { "epoch": 0.6273062730627307, "grad_norm": 0.09112340211868286, "learning_rate": 0.00018431734317343173, "loss": 0.1952, "step": 170 }, { "epoch": 0.6642066420664207, "grad_norm": 0.14496631920337677, "learning_rate": 0.0001833948339483395, "loss": 0.2154, "step": 180 }, { "epoch": 0.7011070110701108, "grad_norm": 0.1129971593618393, "learning_rate": 0.00018247232472324724, "loss": 0.1865, "step": 190 }, { "epoch": 0.7380073800738007, "grad_norm": 0.1439884454011917, "learning_rate": 0.00018154981549815499, "loss": 0.1848, "step": 200 }, { "epoch": 0.7380073800738007, "eval_accuracy": 0.38208092485549133, "eval_loss": 0.1909271478652954, "eval_runtime": 51.554, "eval_samples_per_second": 33.557, "eval_steps_per_second": 4.209, "step": 200 }, { "epoch": 0.7749077490774908, "grad_norm": 0.1933569312095642, "learning_rate": 0.00018062730627306276, "loss": 0.1921, "step": 210 }, { "epoch": 0.8118081180811808, "grad_norm": 0.19441623985767365, "learning_rate": 0.0001797047970479705, "loss": 0.2087, "step": 220 }, { "epoch": 0.8487084870848709, "grad_norm": 0.10429559648036957, "learning_rate": 0.00017878228782287824, "loss": 0.2016, "step": 230 }, { "epoch": 0.8856088560885609, "grad_norm": 0.11976602673530579, "learning_rate": 0.00017785977859778598, "loss": 0.1871, "step": 240 }, { "epoch": 0.922509225092251, "grad_norm": 0.13647252321243286, "learning_rate": 0.00017693726937269372, "loss": 0.1951, "step": 250 }, { "epoch": 0.959409594095941, "grad_norm": 0.13491246104240417, "learning_rate": 0.00017601476014760147, "loss": 0.1987, "step": 260 }, { "epoch": 0.996309963099631, "grad_norm": 0.10877779126167297, "learning_rate": 0.00017509225092250923, "loss": 0.2067, "step": 270 }, { "epoch": 1.033210332103321, "grad_norm": 0.10195717215538025, "learning_rate": 0.00017416974169741698, "loss": 0.1829, "step": 280 }, { "epoch": 1.070110701107011, "grad_norm": 0.11287475377321243, "learning_rate": 0.00017324723247232472, "loss": 0.1771, "step": 290 }, { "epoch": 1.1070110701107012, "grad_norm": 0.10177090764045715, "learning_rate": 0.0001723247232472325, "loss": 0.171, "step": 300 }, { "epoch": 1.1070110701107012, "eval_accuracy": 0.5387283236994219, "eval_loss": 0.1967419981956482, "eval_runtime": 50.6773, "eval_samples_per_second": 34.138, "eval_steps_per_second": 4.282, "step": 300 }, { "epoch": 1.1439114391143912, "grad_norm": 0.14216077327728271, "learning_rate": 0.00017140221402214023, "loss": 0.2104, "step": 310 }, { "epoch": 1.1808118081180812, "grad_norm": 0.0753447487950325, "learning_rate": 0.00017047970479704797, "loss": 0.201, "step": 320 }, { "epoch": 1.2177121771217712, "grad_norm": 0.13927125930786133, "learning_rate": 0.00016955719557195574, "loss": 0.1923, "step": 330 }, { "epoch": 1.2546125461254611, "grad_norm": 0.14396004378795624, "learning_rate": 0.00016863468634686348, "loss": 0.1996, "step": 340 }, { "epoch": 1.2915129151291513, "grad_norm": 0.09631673246622086, "learning_rate": 0.00016771217712177123, "loss": 0.1672, "step": 350 }, { "epoch": 1.3284132841328413, "grad_norm": 0.1342993974685669, "learning_rate": 0.00016678966789667897, "loss": 0.1792, "step": 360 }, { "epoch": 1.3653136531365313, "grad_norm": 0.20273268222808838, "learning_rate": 0.00016586715867158674, "loss": 0.1753, "step": 370 }, { "epoch": 1.4022140221402215, "grad_norm": 0.1452128291130066, "learning_rate": 0.00016494464944649448, "loss": 0.1801, "step": 380 }, { "epoch": 1.4391143911439115, "grad_norm": 0.0973893478512764, "learning_rate": 0.00016402214022140222, "loss": 0.2136, "step": 390 }, { "epoch": 1.4760147601476015, "grad_norm": 0.2589876651763916, "learning_rate": 0.00016309963099630996, "loss": 0.1772, "step": 400 }, { "epoch": 1.4760147601476015, "eval_accuracy": 0.5450867052023122, "eval_loss": 0.19317613542079926, "eval_runtime": 50.5026, "eval_samples_per_second": 34.256, "eval_steps_per_second": 4.297, "step": 400 }, { "epoch": 1.5129151291512914, "grad_norm": 0.16841156780719757, "learning_rate": 0.0001621771217712177, "loss": 0.1894, "step": 410 }, { "epoch": 1.5498154981549814, "grad_norm": 0.17766626179218292, "learning_rate": 0.00016125461254612547, "loss": 0.204, "step": 420 }, { "epoch": 1.5867158671586716, "grad_norm": 0.13253839313983917, "learning_rate": 0.00016033210332103322, "loss": 0.2141, "step": 430 }, { "epoch": 1.6236162361623616, "grad_norm": 0.10587523877620697, "learning_rate": 0.00015940959409594096, "loss": 0.2192, "step": 440 }, { "epoch": 1.6605166051660518, "grad_norm": 0.10049675405025482, "learning_rate": 0.0001584870848708487, "loss": 0.1939, "step": 450 }, { "epoch": 1.6974169741697418, "grad_norm": 0.1287400722503662, "learning_rate": 0.00015756457564575647, "loss": 0.1955, "step": 460 }, { "epoch": 1.7343173431734318, "grad_norm": 0.15120339393615723, "learning_rate": 0.0001566420664206642, "loss": 0.1863, "step": 470 }, { "epoch": 1.7712177121771218, "grad_norm": 0.1202373206615448, "learning_rate": 0.00015571955719557195, "loss": 0.1832, "step": 480 }, { "epoch": 1.8081180811808117, "grad_norm": 0.1368759572505951, "learning_rate": 0.00015479704797047972, "loss": 0.2007, "step": 490 }, { "epoch": 1.8450184501845017, "grad_norm": 0.08591968566179276, "learning_rate": 0.00015387453874538746, "loss": 0.1629, "step": 500 }, { "epoch": 1.8450184501845017, "eval_accuracy": 0.4485549132947977, "eval_loss": 0.1842162311077118, "eval_runtime": 50.5861, "eval_samples_per_second": 34.199, "eval_steps_per_second": 4.29, "step": 500 }, { "epoch": 1.881918819188192, "grad_norm": 0.12317466735839844, "learning_rate": 0.0001529520295202952, "loss": 0.213, "step": 510 }, { "epoch": 1.918819188191882, "grad_norm": 0.15092293918132782, "learning_rate": 0.00015202952029520298, "loss": 0.2004, "step": 520 }, { "epoch": 1.9557195571955721, "grad_norm": 0.20343895256519318, "learning_rate": 0.00015110701107011072, "loss": 0.1835, "step": 530 }, { "epoch": 1.992619926199262, "grad_norm": 0.32362422347068787, "learning_rate": 0.00015018450184501846, "loss": 0.1915, "step": 540 }, { "epoch": 2.029520295202952, "grad_norm": 0.14631719887256622, "learning_rate": 0.00014926199261992623, "loss": 0.2113, "step": 550 }, { "epoch": 2.066420664206642, "grad_norm": 0.12011805921792984, "learning_rate": 0.00014833948339483394, "loss": 0.1781, "step": 560 }, { "epoch": 2.103321033210332, "grad_norm": 0.14479252696037292, "learning_rate": 0.00014741697416974169, "loss": 0.182, "step": 570 }, { "epoch": 2.140221402214022, "grad_norm": 0.18806347250938416, "learning_rate": 0.00014649446494464946, "loss": 0.1682, "step": 580 }, { "epoch": 2.177121771217712, "grad_norm": 0.17025235295295715, "learning_rate": 0.0001455719557195572, "loss": 0.1824, "step": 590 }, { "epoch": 2.2140221402214024, "grad_norm": 0.16879422962665558, "learning_rate": 0.00014464944649446494, "loss": 0.1942, "step": 600 }, { "epoch": 2.2140221402214024, "eval_accuracy": 0.41965317919075146, "eval_loss": 0.17699980735778809, "eval_runtime": 50.0976, "eval_samples_per_second": 34.533, "eval_steps_per_second": 4.332, "step": 600 }, { "epoch": 2.2509225092250924, "grad_norm": 0.168411523103714, "learning_rate": 0.0001437269372693727, "loss": 0.1732, "step": 610 }, { "epoch": 2.2878228782287824, "grad_norm": 0.21096496284008026, "learning_rate": 0.00014280442804428045, "loss": 0.1842, "step": 620 }, { "epoch": 2.3247232472324724, "grad_norm": 0.18110381066799164, "learning_rate": 0.0001418819188191882, "loss": 0.1772, "step": 630 }, { "epoch": 2.3616236162361623, "grad_norm": 0.15854766964912415, "learning_rate": 0.00014095940959409593, "loss": 0.1709, "step": 640 }, { "epoch": 2.3985239852398523, "grad_norm": 0.19320182502269745, "learning_rate": 0.0001400369003690037, "loss": 0.2, "step": 650 }, { "epoch": 2.4354243542435423, "grad_norm": 0.16658619046211243, "learning_rate": 0.00013911439114391145, "loss": 0.2061, "step": 660 }, { "epoch": 2.4723247232472323, "grad_norm": 0.14636483788490295, "learning_rate": 0.0001381918819188192, "loss": 0.1946, "step": 670 }, { "epoch": 2.5092250922509223, "grad_norm": 0.1587982028722763, "learning_rate": 0.00013726937269372696, "loss": 0.1963, "step": 680 }, { "epoch": 2.5461254612546127, "grad_norm": 0.2764102518558502, "learning_rate": 0.0001363468634686347, "loss": 0.1702, "step": 690 }, { "epoch": 2.5830258302583027, "grad_norm": 0.14449751377105713, "learning_rate": 0.00013542435424354244, "loss": 0.1714, "step": 700 }, { "epoch": 2.5830258302583027, "eval_accuracy": 0.5023121387283237, "eval_loss": 0.17974236607551575, "eval_runtime": 50.796, "eval_samples_per_second": 34.058, "eval_steps_per_second": 4.272, "step": 700 }, { "epoch": 2.6199261992619927, "grad_norm": 0.15392902493476868, "learning_rate": 0.0001345018450184502, "loss": 0.191, "step": 710 }, { "epoch": 2.6568265682656826, "grad_norm": 0.15529021620750427, "learning_rate": 0.00013357933579335793, "loss": 0.193, "step": 720 }, { "epoch": 2.6937269372693726, "grad_norm": 0.18234789371490479, "learning_rate": 0.00013265682656826567, "loss": 0.1836, "step": 730 }, { "epoch": 2.7306273062730626, "grad_norm": 0.19954174757003784, "learning_rate": 0.00013173431734317344, "loss": 0.2176, "step": 740 }, { "epoch": 2.767527675276753, "grad_norm": 0.13893257081508636, "learning_rate": 0.00013081180811808118, "loss": 0.1699, "step": 750 }, { "epoch": 2.804428044280443, "grad_norm": 0.16896647214889526, "learning_rate": 0.00012988929889298892, "loss": 0.168, "step": 760 }, { "epoch": 2.841328413284133, "grad_norm": 0.20796014368534088, "learning_rate": 0.0001289667896678967, "loss": 0.2141, "step": 770 }, { "epoch": 2.878228782287823, "grad_norm": 0.2690466046333313, "learning_rate": 0.00012804428044280443, "loss": 0.1778, "step": 780 }, { "epoch": 2.915129151291513, "grad_norm": 0.14259500801563263, "learning_rate": 0.00012712177121771217, "loss": 0.1748, "step": 790 }, { "epoch": 2.952029520295203, "grad_norm": 0.14488738775253296, "learning_rate": 0.00012619926199261994, "loss": 0.1832, "step": 800 }, { "epoch": 2.952029520295203, "eval_accuracy": 0.36878612716763004, "eval_loss": 0.17303667962551117, "eval_runtime": 50.3505, "eval_samples_per_second": 34.359, "eval_steps_per_second": 4.31, "step": 800 }, { "epoch": 2.988929889298893, "grad_norm": 0.1963815540075302, "learning_rate": 0.00012527675276752769, "loss": 0.1603, "step": 810 }, { "epoch": 3.025830258302583, "grad_norm": 0.18811728060245514, "learning_rate": 0.00012435424354243543, "loss": 0.1668, "step": 820 }, { "epoch": 3.062730627306273, "grad_norm": 0.3115330636501312, "learning_rate": 0.0001234317343173432, "loss": 0.1764, "step": 830 }, { "epoch": 3.0996309963099633, "grad_norm": 0.15212470293045044, "learning_rate": 0.00012250922509225094, "loss": 0.1668, "step": 840 }, { "epoch": 3.1365313653136533, "grad_norm": 0.17935976386070251, "learning_rate": 0.00012158671586715868, "loss": 0.1807, "step": 850 }, { "epoch": 3.1734317343173433, "grad_norm": 0.23978868126869202, "learning_rate": 0.00012066420664206644, "loss": 0.168, "step": 860 }, { "epoch": 3.2103321033210332, "grad_norm": 0.29603224992752075, "learning_rate": 0.00011974169741697419, "loss": 0.1827, "step": 870 }, { "epoch": 3.2472324723247232, "grad_norm": 0.1385461539030075, "learning_rate": 0.00011881918819188192, "loss": 0.1794, "step": 880 }, { "epoch": 3.284132841328413, "grad_norm": 0.20920993387699127, "learning_rate": 0.00011789667896678966, "loss": 0.1738, "step": 890 }, { "epoch": 3.321033210332103, "grad_norm": 0.31590428948402405, "learning_rate": 0.00011697416974169742, "loss": 0.1766, "step": 900 }, { "epoch": 3.321033210332103, "eval_accuracy": 0.34277456647398846, "eval_loss": 0.17552779614925385, "eval_runtime": 50.5381, "eval_samples_per_second": 34.232, "eval_steps_per_second": 4.294, "step": 900 }, { "epoch": 3.357933579335793, "grad_norm": 0.22194945812225342, "learning_rate": 0.00011605166051660516, "loss": 0.1814, "step": 910 }, { "epoch": 3.3948339483394836, "grad_norm": 0.22071777284145355, "learning_rate": 0.00011512915129151292, "loss": 0.1629, "step": 920 }, { "epoch": 3.4317343173431736, "grad_norm": 0.44112759828567505, "learning_rate": 0.00011420664206642067, "loss": 0.1914, "step": 930 }, { "epoch": 3.4686346863468636, "grad_norm": 0.20971660315990448, "learning_rate": 0.00011328413284132841, "loss": 0.1691, "step": 940 }, { "epoch": 3.5055350553505535, "grad_norm": 0.23813588917255402, "learning_rate": 0.00011236162361623617, "loss": 0.1919, "step": 950 }, { "epoch": 3.5424354243542435, "grad_norm": 0.19610780477523804, "learning_rate": 0.00011143911439114391, "loss": 0.1631, "step": 960 }, { "epoch": 3.5793357933579335, "grad_norm": 0.29578620195388794, "learning_rate": 0.00011051660516605167, "loss": 0.1721, "step": 970 }, { "epoch": 3.6162361623616235, "grad_norm": 0.15876761078834534, "learning_rate": 0.00010959409594095942, "loss": 0.1869, "step": 980 }, { "epoch": 3.6531365313653135, "grad_norm": 0.19575054943561554, "learning_rate": 0.00010867158671586716, "loss": 0.1676, "step": 990 }, { "epoch": 3.6900369003690034, "grad_norm": 0.12657958269119263, "learning_rate": 0.00010774907749077492, "loss": 0.1697, "step": 1000 }, { "epoch": 3.6900369003690034, "eval_accuracy": 0.5167630057803468, "eval_loss": 0.1601094752550125, "eval_runtime": 50.1373, "eval_samples_per_second": 34.505, "eval_steps_per_second": 4.328, "step": 1000 }, { "epoch": 3.726937269372694, "grad_norm": 0.2477671205997467, "learning_rate": 0.00010682656826568268, "loss": 0.1745, "step": 1010 }, { "epoch": 3.763837638376384, "grad_norm": 0.21879136562347412, "learning_rate": 0.00010590405904059042, "loss": 0.1617, "step": 1020 }, { "epoch": 3.800738007380074, "grad_norm": 0.195592001080513, "learning_rate": 0.00010498154981549817, "loss": 0.1534, "step": 1030 }, { "epoch": 3.837638376383764, "grad_norm": 0.35998597741127014, "learning_rate": 0.0001040590405904059, "loss": 0.1606, "step": 1040 }, { "epoch": 3.874538745387454, "grad_norm": 0.30765026807785034, "learning_rate": 0.00010313653136531364, "loss": 0.2019, "step": 1050 }, { "epoch": 3.911439114391144, "grad_norm": 0.16130860149860382, "learning_rate": 0.0001022140221402214, "loss": 0.1738, "step": 1060 }, { "epoch": 3.948339483394834, "grad_norm": 0.1843736171722412, "learning_rate": 0.00010129151291512916, "loss": 0.1941, "step": 1070 }, { "epoch": 3.985239852398524, "grad_norm": 0.21090315282344818, "learning_rate": 0.0001003690036900369, "loss": 0.1695, "step": 1080 }, { "epoch": 4.022140221402214, "grad_norm": 0.19030509889125824, "learning_rate": 9.944649446494465e-05, "loss": 0.1711, "step": 1090 }, { "epoch": 4.059040590405904, "grad_norm": 0.12992843985557556, "learning_rate": 9.85239852398524e-05, "loss": 0.1568, "step": 1100 }, { "epoch": 4.059040590405904, "eval_accuracy": 0.5352601156069364, "eval_loss": 0.15768703818321228, "eval_runtime": 50.6123, "eval_samples_per_second": 34.181, "eval_steps_per_second": 4.287, "step": 1100 }, { "epoch": 4.095940959409594, "grad_norm": 0.23201997578144073, "learning_rate": 9.760147601476015e-05, "loss": 0.1484, "step": 1110 }, { "epoch": 4.132841328413284, "grad_norm": 0.3783067762851715, "learning_rate": 9.66789667896679e-05, "loss": 0.1597, "step": 1120 }, { "epoch": 4.169741697416974, "grad_norm": 0.27165931463241577, "learning_rate": 9.575645756457565e-05, "loss": 0.156, "step": 1130 }, { "epoch": 4.206642066420664, "grad_norm": 0.2932455241680145, "learning_rate": 9.48339483394834e-05, "loss": 0.1353, "step": 1140 }, { "epoch": 4.243542435424354, "grad_norm": 0.27856454253196716, "learning_rate": 9.391143911439116e-05, "loss": 0.1555, "step": 1150 }, { "epoch": 4.280442804428044, "grad_norm": 0.2609305679798126, "learning_rate": 9.298892988929889e-05, "loss": 0.1549, "step": 1160 }, { "epoch": 4.317343173431734, "grad_norm": 0.4013775587081909, "learning_rate": 9.206642066420664e-05, "loss": 0.1555, "step": 1170 }, { "epoch": 4.354243542435424, "grad_norm": 0.24482858180999756, "learning_rate": 9.11439114391144e-05, "loss": 0.1583, "step": 1180 }, { "epoch": 4.391143911439114, "grad_norm": 0.2422870397567749, "learning_rate": 9.022140221402214e-05, "loss": 0.1663, "step": 1190 }, { "epoch": 4.428044280442805, "grad_norm": 0.2710004448890686, "learning_rate": 8.92988929889299e-05, "loss": 0.1484, "step": 1200 }, { "epoch": 4.428044280442805, "eval_accuracy": 0.49190751445086706, "eval_loss": 0.1513577699661255, "eval_runtime": 50.7754, "eval_samples_per_second": 34.072, "eval_steps_per_second": 4.274, "step": 1200 }, { "epoch": 4.464944649446495, "grad_norm": 0.3608151972293854, "learning_rate": 8.837638376383764e-05, "loss": 0.1595, "step": 1210 }, { "epoch": 4.501845018450185, "grad_norm": 0.14578911662101746, "learning_rate": 8.74538745387454e-05, "loss": 0.1841, "step": 1220 }, { "epoch": 4.538745387453875, "grad_norm": 0.2544012665748596, "learning_rate": 8.653136531365315e-05, "loss": 0.1576, "step": 1230 }, { "epoch": 4.575645756457565, "grad_norm": 0.3130911886692047, "learning_rate": 8.560885608856088e-05, "loss": 0.1626, "step": 1240 }, { "epoch": 4.612546125461255, "grad_norm": 0.31136009097099304, "learning_rate": 8.468634686346863e-05, "loss": 0.1715, "step": 1250 }, { "epoch": 4.649446494464945, "grad_norm": 0.20172053575515747, "learning_rate": 8.376383763837639e-05, "loss": 0.1476, "step": 1260 }, { "epoch": 4.686346863468635, "grad_norm": 0.2550618648529053, "learning_rate": 8.284132841328413e-05, "loss": 0.1376, "step": 1270 }, { "epoch": 4.723247232472325, "grad_norm": 0.16149303317070007, "learning_rate": 8.191881918819189e-05, "loss": 0.1461, "step": 1280 }, { "epoch": 4.760147601476015, "grad_norm": 0.27109894156455994, "learning_rate": 8.099630996309964e-05, "loss": 0.1556, "step": 1290 }, { "epoch": 4.797047970479705, "grad_norm": 0.26436206698417664, "learning_rate": 8.007380073800739e-05, "loss": 0.1483, "step": 1300 }, { "epoch": 4.797047970479705, "eval_accuracy": 0.5699421965317919, "eval_loss": 0.14818404614925385, "eval_runtime": 49.9368, "eval_samples_per_second": 34.644, "eval_steps_per_second": 4.345, "step": 1300 }, { "epoch": 4.833948339483395, "grad_norm": 0.39457815885543823, "learning_rate": 7.915129151291514e-05, "loss": 0.1731, "step": 1310 }, { "epoch": 4.870848708487085, "grad_norm": 0.1614658087491989, "learning_rate": 7.822878228782288e-05, "loss": 0.1525, "step": 1320 }, { "epoch": 4.907749077490775, "grad_norm": 0.26091647148132324, "learning_rate": 7.730627306273062e-05, "loss": 0.1854, "step": 1330 }, { "epoch": 4.944649446494465, "grad_norm": 0.33017560839653015, "learning_rate": 7.638376383763838e-05, "loss": 0.1695, "step": 1340 }, { "epoch": 4.9815498154981555, "grad_norm": 0.3637866973876953, "learning_rate": 7.546125461254612e-05, "loss": 0.1666, "step": 1350 }, { "epoch": 5.018450184501845, "grad_norm": 0.3373745083808899, "learning_rate": 7.453874538745388e-05, "loss": 0.1525, "step": 1360 }, { "epoch": 5.055350553505535, "grad_norm": 0.25333917140960693, "learning_rate": 7.361623616236163e-05, "loss": 0.1356, "step": 1370 }, { "epoch": 5.092250922509225, "grad_norm": 0.1722867488861084, "learning_rate": 7.269372693726938e-05, "loss": 0.1357, "step": 1380 }, { "epoch": 5.129151291512915, "grad_norm": 0.13959679007530212, "learning_rate": 7.177121771217713e-05, "loss": 0.1285, "step": 1390 }, { "epoch": 5.166051660516605, "grad_norm": 0.17668481171131134, "learning_rate": 7.084870848708487e-05, "loss": 0.1301, "step": 1400 }, { "epoch": 5.166051660516605, "eval_accuracy": 0.5433526011560693, "eval_loss": 0.13149897754192352, "eval_runtime": 50.8795, "eval_samples_per_second": 34.002, "eval_steps_per_second": 4.265, "step": 1400 }, { "epoch": 5.202952029520295, "grad_norm": 0.1745270937681198, "learning_rate": 6.992619926199262e-05, "loss": 0.1516, "step": 1410 }, { "epoch": 5.239852398523985, "grad_norm": 0.5758349299430847, "learning_rate": 6.900369003690037e-05, "loss": 0.1294, "step": 1420 }, { "epoch": 5.276752767527675, "grad_norm": 0.2458232194185257, "learning_rate": 6.808118081180813e-05, "loss": 0.1385, "step": 1430 }, { "epoch": 5.313653136531365, "grad_norm": 0.3469581604003906, "learning_rate": 6.715867158671587e-05, "loss": 0.1394, "step": 1440 }, { "epoch": 5.350553505535055, "grad_norm": 0.267447292804718, "learning_rate": 6.623616236162362e-05, "loss": 0.1432, "step": 1450 }, { "epoch": 5.387453874538745, "grad_norm": 0.24406275153160095, "learning_rate": 6.531365313653137e-05, "loss": 0.1396, "step": 1460 }, { "epoch": 5.424354243542435, "grad_norm": 0.7067885994911194, "learning_rate": 6.439114391143912e-05, "loss": 0.1456, "step": 1470 }, { "epoch": 5.461254612546125, "grad_norm": 0.2915806174278259, "learning_rate": 6.346863468634686e-05, "loss": 0.1366, "step": 1480 }, { "epoch": 5.498154981549815, "grad_norm": 0.22377534210681915, "learning_rate": 6.25461254612546e-05, "loss": 0.1273, "step": 1490 }, { "epoch": 5.535055350553505, "grad_norm": 0.3705073893070221, "learning_rate": 6.162361623616236e-05, "loss": 0.1149, "step": 1500 }, { "epoch": 5.535055350553505, "eval_accuracy": 0.5583815028901734, "eval_loss": 0.12937474250793457, "eval_runtime": 50.8394, "eval_samples_per_second": 34.029, "eval_steps_per_second": 4.268, "step": 1500 }, { "epoch": 5.571955719557195, "grad_norm": 0.13345371186733246, "learning_rate": 6.070110701107011e-05, "loss": 0.1229, "step": 1510 }, { "epoch": 5.608856088560886, "grad_norm": 0.4122871160507202, "learning_rate": 5.9778597785977866e-05, "loss": 0.1689, "step": 1520 }, { "epoch": 5.645756457564576, "grad_norm": 0.14905782043933868, "learning_rate": 5.8856088560885615e-05, "loss": 0.1365, "step": 1530 }, { "epoch": 5.682656826568266, "grad_norm": 0.21198387444019318, "learning_rate": 5.7933579335793364e-05, "loss": 0.1453, "step": 1540 }, { "epoch": 5.719557195571956, "grad_norm": 0.3941808044910431, "learning_rate": 5.701107011070111e-05, "loss": 0.1584, "step": 1550 }, { "epoch": 5.756457564575646, "grad_norm": 0.1366042047739029, "learning_rate": 5.6088560885608855e-05, "loss": 0.1219, "step": 1560 }, { "epoch": 5.793357933579336, "grad_norm": 0.1590586006641388, "learning_rate": 5.5166051660516604e-05, "loss": 0.1482, "step": 1570 }, { "epoch": 5.830258302583026, "grad_norm": 0.3574014902114868, "learning_rate": 5.424354243542435e-05, "loss": 0.1241, "step": 1580 }, { "epoch": 5.867158671586716, "grad_norm": 0.2934325039386749, "learning_rate": 5.332103321033211e-05, "loss": 0.1397, "step": 1590 }, { "epoch": 5.904059040590406, "grad_norm": 0.2349650263786316, "learning_rate": 5.239852398523986e-05, "loss": 0.1448, "step": 1600 }, { "epoch": 5.904059040590406, "eval_accuracy": 0.5416184971098266, "eval_loss": 0.12662799656391144, "eval_runtime": 50.1064, "eval_samples_per_second": 34.527, "eval_steps_per_second": 4.331, "step": 1600 }, { "epoch": 5.940959409594096, "grad_norm": 0.39207130670547485, "learning_rate": 5.1476014760147606e-05, "loss": 0.1491, "step": 1610 }, { "epoch": 5.977859778597786, "grad_norm": 0.21359127759933472, "learning_rate": 5.0553505535055354e-05, "loss": 0.1367, "step": 1620 }, { "epoch": 6.014760147601476, "grad_norm": 0.17874382436275482, "learning_rate": 4.96309963099631e-05, "loss": 0.1276, "step": 1630 }, { "epoch": 6.051660516605166, "grad_norm": 0.15224817395210266, "learning_rate": 4.870848708487085e-05, "loss": 0.1223, "step": 1640 }, { "epoch": 6.088560885608856, "grad_norm": 0.28657016158103943, "learning_rate": 4.77859778597786e-05, "loss": 0.1327, "step": 1650 }, { "epoch": 6.125461254612546, "grad_norm": 0.16251201927661896, "learning_rate": 4.686346863468635e-05, "loss": 0.1318, "step": 1660 }, { "epoch": 6.162361623616236, "grad_norm": 0.3002704381942749, "learning_rate": 4.59409594095941e-05, "loss": 0.1188, "step": 1670 }, { "epoch": 6.199261992619927, "grad_norm": 0.4188823103904724, "learning_rate": 4.501845018450185e-05, "loss": 0.1004, "step": 1680 }, { "epoch": 6.236162361623617, "grad_norm": 0.18772590160369873, "learning_rate": 4.4095940959409596e-05, "loss": 0.1002, "step": 1690 }, { "epoch": 6.273062730627307, "grad_norm": 0.30921700596809387, "learning_rate": 4.3173431734317345e-05, "loss": 0.1035, "step": 1700 }, { "epoch": 6.273062730627307, "eval_accuracy": 0.6017341040462427, "eval_loss": 0.11507368832826614, "eval_runtime": 50.6734, "eval_samples_per_second": 34.14, "eval_steps_per_second": 4.282, "step": 1700 }, { "epoch": 6.3099630996309966, "grad_norm": 0.42562779784202576, "learning_rate": 4.2250922509225094e-05, "loss": 0.1071, "step": 1710 }, { "epoch": 6.3468634686346865, "grad_norm": 0.36547404527664185, "learning_rate": 4.132841328413284e-05, "loss": 0.12, "step": 1720 }, { "epoch": 6.3837638376383765, "grad_norm": 0.12006784975528717, "learning_rate": 4.040590405904059e-05, "loss": 0.1107, "step": 1730 }, { "epoch": 6.4206642066420665, "grad_norm": 0.1983233392238617, "learning_rate": 3.948339483394834e-05, "loss": 0.1206, "step": 1740 }, { "epoch": 6.4575645756457565, "grad_norm": 0.17691943049430847, "learning_rate": 3.856088560885609e-05, "loss": 0.1252, "step": 1750 }, { "epoch": 6.4944649446494465, "grad_norm": 0.39386728405952454, "learning_rate": 3.763837638376384e-05, "loss": 0.1314, "step": 1760 }, { "epoch": 6.531365313653136, "grad_norm": 0.607455313205719, "learning_rate": 3.6715867158671594e-05, "loss": 0.1095, "step": 1770 }, { "epoch": 6.568265682656826, "grad_norm": 0.21057389676570892, "learning_rate": 3.5793357933579336e-05, "loss": 0.1223, "step": 1780 }, { "epoch": 6.605166051660516, "grad_norm": 0.27539491653442383, "learning_rate": 3.4870848708487085e-05, "loss": 0.1163, "step": 1790 }, { "epoch": 6.642066420664206, "grad_norm": 0.24495290219783783, "learning_rate": 3.3948339483394833e-05, "loss": 0.1048, "step": 1800 }, { "epoch": 6.642066420664206, "eval_accuracy": 0.6046242774566474, "eval_loss": 0.10599144548177719, "eval_runtime": 50.9957, "eval_samples_per_second": 33.924, "eval_steps_per_second": 4.255, "step": 1800 }, { "epoch": 6.678966789667896, "grad_norm": 0.38892611861228943, "learning_rate": 3.302583025830259e-05, "loss": 0.1352, "step": 1810 }, { "epoch": 6.715867158671586, "grad_norm": 0.2850606143474579, "learning_rate": 3.210332103321033e-05, "loss": 0.1153, "step": 1820 }, { "epoch": 6.752767527675276, "grad_norm": 0.16241934895515442, "learning_rate": 3.118081180811808e-05, "loss": 0.1074, "step": 1830 }, { "epoch": 6.789667896678967, "grad_norm": 0.36088794469833374, "learning_rate": 3.0258302583025832e-05, "loss": 0.1219, "step": 1840 }, { "epoch": 6.826568265682657, "grad_norm": 0.21467632055282593, "learning_rate": 2.9335793357933584e-05, "loss": 0.1083, "step": 1850 }, { "epoch": 6.863468634686347, "grad_norm": 0.4730125069618225, "learning_rate": 2.8413284132841326e-05, "loss": 0.1227, "step": 1860 }, { "epoch": 6.900369003690037, "grad_norm": 0.20842638611793518, "learning_rate": 2.749077490774908e-05, "loss": 0.128, "step": 1870 }, { "epoch": 6.937269372693727, "grad_norm": 0.1885102540254593, "learning_rate": 2.6568265682656828e-05, "loss": 0.0923, "step": 1880 }, { "epoch": 6.974169741697417, "grad_norm": 0.48948994278907776, "learning_rate": 2.564575645756458e-05, "loss": 0.1165, "step": 1890 }, { "epoch": 7.011070110701107, "grad_norm": 0.4080180525779724, "learning_rate": 2.472324723247233e-05, "loss": 0.1168, "step": 1900 }, { "epoch": 7.011070110701107, "eval_accuracy": 0.6173410404624278, "eval_loss": 0.10073487460613251, "eval_runtime": 50.8605, "eval_samples_per_second": 34.015, "eval_steps_per_second": 4.267, "step": 1900 }, { "epoch": 7.047970479704797, "grad_norm": 0.19095434248447418, "learning_rate": 2.3800738007380074e-05, "loss": 0.1131, "step": 1910 }, { "epoch": 7.084870848708487, "grad_norm": 0.23603685200214386, "learning_rate": 2.2878228782287826e-05, "loss": 0.089, "step": 1920 }, { "epoch": 7.121771217712177, "grad_norm": 0.09547635912895203, "learning_rate": 2.195571955719557e-05, "loss": 0.1032, "step": 1930 }, { "epoch": 7.158671586715867, "grad_norm": 0.18442951142787933, "learning_rate": 2.1033210332103324e-05, "loss": 0.0909, "step": 1940 }, { "epoch": 7.195571955719557, "grad_norm": 0.2125350534915924, "learning_rate": 2.011070110701107e-05, "loss": 0.0922, "step": 1950 }, { "epoch": 7.232472324723247, "grad_norm": 0.15140217542648315, "learning_rate": 1.918819188191882e-05, "loss": 0.1155, "step": 1960 }, { "epoch": 7.269372693726937, "grad_norm": 0.25905662775039673, "learning_rate": 1.826568265682657e-05, "loss": 0.1194, "step": 1970 }, { "epoch": 7.306273062730627, "grad_norm": 0.18217885494232178, "learning_rate": 1.734317343173432e-05, "loss": 0.1, "step": 1980 }, { "epoch": 7.343173431734318, "grad_norm": 0.21871539950370789, "learning_rate": 1.6420664206642068e-05, "loss": 0.1022, "step": 1990 }, { "epoch": 7.380073800738008, "grad_norm": 0.4127865731716156, "learning_rate": 1.5498154981549817e-05, "loss": 0.1104, "step": 2000 }, { "epoch": 7.380073800738008, "eval_accuracy": 0.6445086705202312, "eval_loss": 0.09489033371210098, "eval_runtime": 50.8042, "eval_samples_per_second": 34.052, "eval_steps_per_second": 4.271, "step": 2000 }, { "epoch": 7.416974169741698, "grad_norm": 0.20000500977039337, "learning_rate": 1.4575645756457566e-05, "loss": 0.1031, "step": 2010 }, { "epoch": 7.453874538745388, "grad_norm": 0.5234202742576599, "learning_rate": 1.3653136531365315e-05, "loss": 0.1197, "step": 2020 }, { "epoch": 7.490774907749078, "grad_norm": 0.16442282497882843, "learning_rate": 1.2730627306273063e-05, "loss": 0.1036, "step": 2030 }, { "epoch": 7.527675276752768, "grad_norm": 0.19210496544837952, "learning_rate": 1.1808118081180812e-05, "loss": 0.0993, "step": 2040 }, { "epoch": 7.564575645756458, "grad_norm": 0.1562729775905609, "learning_rate": 1.0885608856088561e-05, "loss": 0.0941, "step": 2050 }, { "epoch": 7.601476014760148, "grad_norm": 0.29051193594932556, "learning_rate": 9.96309963099631e-06, "loss": 0.092, "step": 2060 }, { "epoch": 7.638376383763838, "grad_norm": 0.21477282047271729, "learning_rate": 9.040590405904059e-06, "loss": 0.1123, "step": 2070 }, { "epoch": 7.675276752767528, "grad_norm": 0.03506307676434517, "learning_rate": 8.118081180811808e-06, "loss": 0.0927, "step": 2080 }, { "epoch": 7.712177121771218, "grad_norm": 0.21280255913734436, "learning_rate": 7.195571955719557e-06, "loss": 0.084, "step": 2090 }, { "epoch": 7.749077490774908, "grad_norm": 0.19547449052333832, "learning_rate": 6.273062730627306e-06, "loss": 0.0873, "step": 2100 }, { "epoch": 7.749077490774908, "eval_accuracy": 0.6526011560693642, "eval_loss": 0.0923289805650711, "eval_runtime": 50.6448, "eval_samples_per_second": 34.159, "eval_steps_per_second": 4.285, "step": 2100 }, { "epoch": 7.785977859778598, "grad_norm": 0.22129392623901367, "learning_rate": 5.350553505535055e-06, "loss": 0.1, "step": 2110 }, { "epoch": 7.822878228782288, "grad_norm": 0.2631789445877075, "learning_rate": 4.428044280442805e-06, "loss": 0.0811, "step": 2120 }, { "epoch": 7.8597785977859775, "grad_norm": 0.16971804201602936, "learning_rate": 3.5055350553505534e-06, "loss": 0.0977, "step": 2130 }, { "epoch": 7.8966789667896675, "grad_norm": 0.10247929394245148, "learning_rate": 2.5830258302583027e-06, "loss": 0.0966, "step": 2140 }, { "epoch": 7.9335793357933575, "grad_norm": 0.15865936875343323, "learning_rate": 1.6605166051660517e-06, "loss": 0.1039, "step": 2150 }, { "epoch": 7.970479704797048, "grad_norm": 0.506331205368042, "learning_rate": 7.380073800738008e-07, "loss": 0.0977, "step": 2160 }, { "epoch": 8.0, "step": 2168, "total_flos": 2.6821552511927255e+18, "train_loss": 0.16216810325304962, "train_runtime": 3143.5173, "train_samples_per_second": 11.009, "train_steps_per_second": 0.69 } ], "logging_steps": 10, "max_steps": 2168, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6821552511927255e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }