|
{ |
|
"best_metric": 0.0923289805650711, |
|
"best_model_checkpoint": "./ViT-NIH-Chest-X-ray-dataset-small/checkpoint-2100", |
|
"epoch": 8.0, |
|
"eval_steps": 100, |
|
"global_step": 2168, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03690036900369004, |
|
"grad_norm": 0.38048115372657776, |
|
"learning_rate": 0.00019907749077490775, |
|
"loss": 0.5131, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07380073800738007, |
|
"grad_norm": 0.3416444957256317, |
|
"learning_rate": 0.00019815498154981552, |
|
"loss": 0.3661, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11070110701107011, |
|
"grad_norm": 0.268430233001709, |
|
"learning_rate": 0.00019723247232472326, |
|
"loss": 0.3122, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.14760147601476015, |
|
"grad_norm": 0.20281535387039185, |
|
"learning_rate": 0.000196309963099631, |
|
"loss": 0.2862, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.18450184501845018, |
|
"grad_norm": 0.17638804018497467, |
|
"learning_rate": 0.00019538745387453877, |
|
"loss": 0.2582, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.22140221402214022, |
|
"grad_norm": 0.13983863592147827, |
|
"learning_rate": 0.00019446494464944652, |
|
"loss": 0.2621, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25830258302583026, |
|
"grad_norm": 0.1306193470954895, |
|
"learning_rate": 0.00019354243542435426, |
|
"loss": 0.2361, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2952029520295203, |
|
"grad_norm": 0.12178371101617813, |
|
"learning_rate": 0.000192619926199262, |
|
"loss": 0.2271, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.33210332103321033, |
|
"grad_norm": 0.09625957161188126, |
|
"learning_rate": 0.00019169741697416974, |
|
"loss": 0.2275, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"grad_norm": 0.12183211743831635, |
|
"learning_rate": 0.00019077490774907748, |
|
"loss": 0.2128, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.36900369003690037, |
|
"eval_accuracy": 0.0, |
|
"eval_loss": 0.20924170315265656, |
|
"eval_runtime": 51.5235, |
|
"eval_samples_per_second": 33.577, |
|
"eval_steps_per_second": 4.212, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4059040590405904, |
|
"grad_norm": 0.08246201276779175, |
|
"learning_rate": 0.00018985239852398525, |
|
"loss": 0.2089, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.44280442804428044, |
|
"grad_norm": 0.11126121133565903, |
|
"learning_rate": 0.000188929889298893, |
|
"loss": 0.2147, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4797047970479705, |
|
"grad_norm": 0.11177172511816025, |
|
"learning_rate": 0.00018800738007380074, |
|
"loss": 0.2044, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5166051660516605, |
|
"grad_norm": 0.10021921247243881, |
|
"learning_rate": 0.0001870848708487085, |
|
"loss": 0.1839, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5535055350553506, |
|
"grad_norm": 0.185609832406044, |
|
"learning_rate": 0.00018616236162361625, |
|
"loss": 0.2209, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5904059040590406, |
|
"grad_norm": 0.17880532145500183, |
|
"learning_rate": 0.000185239852398524, |
|
"loss": 0.2023, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6273062730627307, |
|
"grad_norm": 0.09112340211868286, |
|
"learning_rate": 0.00018431734317343173, |
|
"loss": 0.1952, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6642066420664207, |
|
"grad_norm": 0.14496631920337677, |
|
"learning_rate": 0.0001833948339483395, |
|
"loss": 0.2154, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7011070110701108, |
|
"grad_norm": 0.1129971593618393, |
|
"learning_rate": 0.00018247232472324724, |
|
"loss": 0.1865, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"grad_norm": 0.1439884454011917, |
|
"learning_rate": 0.00018154981549815499, |
|
"loss": 0.1848, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7380073800738007, |
|
"eval_accuracy": 0.38208092485549133, |
|
"eval_loss": 0.1909271478652954, |
|
"eval_runtime": 51.554, |
|
"eval_samples_per_second": 33.557, |
|
"eval_steps_per_second": 4.209, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7749077490774908, |
|
"grad_norm": 0.1933569312095642, |
|
"learning_rate": 0.00018062730627306276, |
|
"loss": 0.1921, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8118081180811808, |
|
"grad_norm": 0.19441623985767365, |
|
"learning_rate": 0.0001797047970479705, |
|
"loss": 0.2087, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8487084870848709, |
|
"grad_norm": 0.10429559648036957, |
|
"learning_rate": 0.00017878228782287824, |
|
"loss": 0.2016, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8856088560885609, |
|
"grad_norm": 0.11976602673530579, |
|
"learning_rate": 0.00017785977859778598, |
|
"loss": 0.1871, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.922509225092251, |
|
"grad_norm": 0.13647252321243286, |
|
"learning_rate": 0.00017693726937269372, |
|
"loss": 0.1951, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.959409594095941, |
|
"grad_norm": 0.13491246104240417, |
|
"learning_rate": 0.00017601476014760147, |
|
"loss": 0.1987, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.996309963099631, |
|
"grad_norm": 0.10877779126167297, |
|
"learning_rate": 0.00017509225092250923, |
|
"loss": 0.2067, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.033210332103321, |
|
"grad_norm": 0.10195717215538025, |
|
"learning_rate": 0.00017416974169741698, |
|
"loss": 0.1829, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.070110701107011, |
|
"grad_norm": 0.11287475377321243, |
|
"learning_rate": 0.00017324723247232472, |
|
"loss": 0.1771, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"grad_norm": 0.10177090764045715, |
|
"learning_rate": 0.0001723247232472325, |
|
"loss": 0.171, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.1070110701107012, |
|
"eval_accuracy": 0.5387283236994219, |
|
"eval_loss": 0.1967419981956482, |
|
"eval_runtime": 50.6773, |
|
"eval_samples_per_second": 34.138, |
|
"eval_steps_per_second": 4.282, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.1439114391143912, |
|
"grad_norm": 0.14216077327728271, |
|
"learning_rate": 0.00017140221402214023, |
|
"loss": 0.2104, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.1808118081180812, |
|
"grad_norm": 0.0753447487950325, |
|
"learning_rate": 0.00017047970479704797, |
|
"loss": 0.201, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.2177121771217712, |
|
"grad_norm": 0.13927125930786133, |
|
"learning_rate": 0.00016955719557195574, |
|
"loss": 0.1923, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2546125461254611, |
|
"grad_norm": 0.14396004378795624, |
|
"learning_rate": 0.00016863468634686348, |
|
"loss": 0.1996, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.2915129151291513, |
|
"grad_norm": 0.09631673246622086, |
|
"learning_rate": 0.00016771217712177123, |
|
"loss": 0.1672, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.3284132841328413, |
|
"grad_norm": 0.1342993974685669, |
|
"learning_rate": 0.00016678966789667897, |
|
"loss": 0.1792, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.3653136531365313, |
|
"grad_norm": 0.20273268222808838, |
|
"learning_rate": 0.00016586715867158674, |
|
"loss": 0.1753, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.4022140221402215, |
|
"grad_norm": 0.1452128291130066, |
|
"learning_rate": 0.00016494464944649448, |
|
"loss": 0.1801, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.4391143911439115, |
|
"grad_norm": 0.0973893478512764, |
|
"learning_rate": 0.00016402214022140222, |
|
"loss": 0.2136, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"grad_norm": 0.2589876651763916, |
|
"learning_rate": 0.00016309963099630996, |
|
"loss": 0.1772, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.4760147601476015, |
|
"eval_accuracy": 0.5450867052023122, |
|
"eval_loss": 0.19317613542079926, |
|
"eval_runtime": 50.5026, |
|
"eval_samples_per_second": 34.256, |
|
"eval_steps_per_second": 4.297, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.5129151291512914, |
|
"grad_norm": 0.16841156780719757, |
|
"learning_rate": 0.0001621771217712177, |
|
"loss": 0.1894, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.5498154981549814, |
|
"grad_norm": 0.17766626179218292, |
|
"learning_rate": 0.00016125461254612547, |
|
"loss": 0.204, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.5867158671586716, |
|
"grad_norm": 0.13253839313983917, |
|
"learning_rate": 0.00016033210332103322, |
|
"loss": 0.2141, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.6236162361623616, |
|
"grad_norm": 0.10587523877620697, |
|
"learning_rate": 0.00015940959409594096, |
|
"loss": 0.2192, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.6605166051660518, |
|
"grad_norm": 0.10049675405025482, |
|
"learning_rate": 0.0001584870848708487, |
|
"loss": 0.1939, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.6974169741697418, |
|
"grad_norm": 0.1287400722503662, |
|
"learning_rate": 0.00015756457564575647, |
|
"loss": 0.1955, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.7343173431734318, |
|
"grad_norm": 0.15120339393615723, |
|
"learning_rate": 0.0001566420664206642, |
|
"loss": 0.1863, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.7712177121771218, |
|
"grad_norm": 0.1202373206615448, |
|
"learning_rate": 0.00015571955719557195, |
|
"loss": 0.1832, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.8081180811808117, |
|
"grad_norm": 0.1368759572505951, |
|
"learning_rate": 0.00015479704797047972, |
|
"loss": 0.2007, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"grad_norm": 0.08591968566179276, |
|
"learning_rate": 0.00015387453874538746, |
|
"loss": 0.1629, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.8450184501845017, |
|
"eval_accuracy": 0.4485549132947977, |
|
"eval_loss": 0.1842162311077118, |
|
"eval_runtime": 50.5861, |
|
"eval_samples_per_second": 34.199, |
|
"eval_steps_per_second": 4.29, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.881918819188192, |
|
"grad_norm": 0.12317466735839844, |
|
"learning_rate": 0.0001529520295202952, |
|
"loss": 0.213, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.918819188191882, |
|
"grad_norm": 0.15092293918132782, |
|
"learning_rate": 0.00015202952029520298, |
|
"loss": 0.2004, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.9557195571955721, |
|
"grad_norm": 0.20343895256519318, |
|
"learning_rate": 0.00015110701107011072, |
|
"loss": 0.1835, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.992619926199262, |
|
"grad_norm": 0.32362422347068787, |
|
"learning_rate": 0.00015018450184501846, |
|
"loss": 0.1915, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.029520295202952, |
|
"grad_norm": 0.14631719887256622, |
|
"learning_rate": 0.00014926199261992623, |
|
"loss": 0.2113, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.066420664206642, |
|
"grad_norm": 0.12011805921792984, |
|
"learning_rate": 0.00014833948339483394, |
|
"loss": 0.1781, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.103321033210332, |
|
"grad_norm": 0.14479252696037292, |
|
"learning_rate": 0.00014741697416974169, |
|
"loss": 0.182, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.140221402214022, |
|
"grad_norm": 0.18806347250938416, |
|
"learning_rate": 0.00014649446494464946, |
|
"loss": 0.1682, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.177121771217712, |
|
"grad_norm": 0.17025235295295715, |
|
"learning_rate": 0.0001455719557195572, |
|
"loss": 0.1824, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"grad_norm": 0.16879422962665558, |
|
"learning_rate": 0.00014464944649446494, |
|
"loss": 0.1942, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.2140221402214024, |
|
"eval_accuracy": 0.41965317919075146, |
|
"eval_loss": 0.17699980735778809, |
|
"eval_runtime": 50.0976, |
|
"eval_samples_per_second": 34.533, |
|
"eval_steps_per_second": 4.332, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.2509225092250924, |
|
"grad_norm": 0.168411523103714, |
|
"learning_rate": 0.0001437269372693727, |
|
"loss": 0.1732, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.2878228782287824, |
|
"grad_norm": 0.21096496284008026, |
|
"learning_rate": 0.00014280442804428045, |
|
"loss": 0.1842, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.3247232472324724, |
|
"grad_norm": 0.18110381066799164, |
|
"learning_rate": 0.0001418819188191882, |
|
"loss": 0.1772, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.3616236162361623, |
|
"grad_norm": 0.15854766964912415, |
|
"learning_rate": 0.00014095940959409593, |
|
"loss": 0.1709, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.3985239852398523, |
|
"grad_norm": 0.19320182502269745, |
|
"learning_rate": 0.0001400369003690037, |
|
"loss": 0.2, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.4354243542435423, |
|
"grad_norm": 0.16658619046211243, |
|
"learning_rate": 0.00013911439114391145, |
|
"loss": 0.2061, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.4723247232472323, |
|
"grad_norm": 0.14636483788490295, |
|
"learning_rate": 0.0001381918819188192, |
|
"loss": 0.1946, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.5092250922509223, |
|
"grad_norm": 0.1587982028722763, |
|
"learning_rate": 0.00013726937269372696, |
|
"loss": 0.1963, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.5461254612546127, |
|
"grad_norm": 0.2764102518558502, |
|
"learning_rate": 0.0001363468634686347, |
|
"loss": 0.1702, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"grad_norm": 0.14449751377105713, |
|
"learning_rate": 0.00013542435424354244, |
|
"loss": 0.1714, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.5830258302583027, |
|
"eval_accuracy": 0.5023121387283237, |
|
"eval_loss": 0.17974236607551575, |
|
"eval_runtime": 50.796, |
|
"eval_samples_per_second": 34.058, |
|
"eval_steps_per_second": 4.272, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.6199261992619927, |
|
"grad_norm": 0.15392902493476868, |
|
"learning_rate": 0.0001345018450184502, |
|
"loss": 0.191, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.6568265682656826, |
|
"grad_norm": 0.15529021620750427, |
|
"learning_rate": 0.00013357933579335793, |
|
"loss": 0.193, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.6937269372693726, |
|
"grad_norm": 0.18234789371490479, |
|
"learning_rate": 0.00013265682656826567, |
|
"loss": 0.1836, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.7306273062730626, |
|
"grad_norm": 0.19954174757003784, |
|
"learning_rate": 0.00013173431734317344, |
|
"loss": 0.2176, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.767527675276753, |
|
"grad_norm": 0.13893257081508636, |
|
"learning_rate": 0.00013081180811808118, |
|
"loss": 0.1699, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.804428044280443, |
|
"grad_norm": 0.16896647214889526, |
|
"learning_rate": 0.00012988929889298892, |
|
"loss": 0.168, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.841328413284133, |
|
"grad_norm": 0.20796014368534088, |
|
"learning_rate": 0.0001289667896678967, |
|
"loss": 0.2141, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.878228782287823, |
|
"grad_norm": 0.2690466046333313, |
|
"learning_rate": 0.00012804428044280443, |
|
"loss": 0.1778, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.915129151291513, |
|
"grad_norm": 0.14259500801563263, |
|
"learning_rate": 0.00012712177121771217, |
|
"loss": 0.1748, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"grad_norm": 0.14488738775253296, |
|
"learning_rate": 0.00012619926199261994, |
|
"loss": 0.1832, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.952029520295203, |
|
"eval_accuracy": 0.36878612716763004, |
|
"eval_loss": 0.17303667962551117, |
|
"eval_runtime": 50.3505, |
|
"eval_samples_per_second": 34.359, |
|
"eval_steps_per_second": 4.31, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.988929889298893, |
|
"grad_norm": 0.1963815540075302, |
|
"learning_rate": 0.00012527675276752769, |
|
"loss": 0.1603, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.025830258302583, |
|
"grad_norm": 0.18811728060245514, |
|
"learning_rate": 0.00012435424354243543, |
|
"loss": 0.1668, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.062730627306273, |
|
"grad_norm": 0.3115330636501312, |
|
"learning_rate": 0.0001234317343173432, |
|
"loss": 0.1764, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.0996309963099633, |
|
"grad_norm": 0.15212470293045044, |
|
"learning_rate": 0.00012250922509225094, |
|
"loss": 0.1668, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.1365313653136533, |
|
"grad_norm": 0.17935976386070251, |
|
"learning_rate": 0.00012158671586715868, |
|
"loss": 0.1807, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.1734317343173433, |
|
"grad_norm": 0.23978868126869202, |
|
"learning_rate": 0.00012066420664206644, |
|
"loss": 0.168, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.2103321033210332, |
|
"grad_norm": 0.29603224992752075, |
|
"learning_rate": 0.00011974169741697419, |
|
"loss": 0.1827, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.2472324723247232, |
|
"grad_norm": 0.1385461539030075, |
|
"learning_rate": 0.00011881918819188192, |
|
"loss": 0.1794, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.284132841328413, |
|
"grad_norm": 0.20920993387699127, |
|
"learning_rate": 0.00011789667896678966, |
|
"loss": 0.1738, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.321033210332103, |
|
"grad_norm": 0.31590428948402405, |
|
"learning_rate": 0.00011697416974169742, |
|
"loss": 0.1766, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.321033210332103, |
|
"eval_accuracy": 0.34277456647398846, |
|
"eval_loss": 0.17552779614925385, |
|
"eval_runtime": 50.5381, |
|
"eval_samples_per_second": 34.232, |
|
"eval_steps_per_second": 4.294, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.357933579335793, |
|
"grad_norm": 0.22194945812225342, |
|
"learning_rate": 0.00011605166051660516, |
|
"loss": 0.1814, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.3948339483394836, |
|
"grad_norm": 0.22071777284145355, |
|
"learning_rate": 0.00011512915129151292, |
|
"loss": 0.1629, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.4317343173431736, |
|
"grad_norm": 0.44112759828567505, |
|
"learning_rate": 0.00011420664206642067, |
|
"loss": 0.1914, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.4686346863468636, |
|
"grad_norm": 0.20971660315990448, |
|
"learning_rate": 0.00011328413284132841, |
|
"loss": 0.1691, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.5055350553505535, |
|
"grad_norm": 0.23813588917255402, |
|
"learning_rate": 0.00011236162361623617, |
|
"loss": 0.1919, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.5424354243542435, |
|
"grad_norm": 0.19610780477523804, |
|
"learning_rate": 0.00011143911439114391, |
|
"loss": 0.1631, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.5793357933579335, |
|
"grad_norm": 0.29578620195388794, |
|
"learning_rate": 0.00011051660516605167, |
|
"loss": 0.1721, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.6162361623616235, |
|
"grad_norm": 0.15876761078834534, |
|
"learning_rate": 0.00010959409594095942, |
|
"loss": 0.1869, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.6531365313653135, |
|
"grad_norm": 0.19575054943561554, |
|
"learning_rate": 0.00010867158671586716, |
|
"loss": 0.1676, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.6900369003690034, |
|
"grad_norm": 0.12657958269119263, |
|
"learning_rate": 0.00010774907749077492, |
|
"loss": 0.1697, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.6900369003690034, |
|
"eval_accuracy": 0.5167630057803468, |
|
"eval_loss": 0.1601094752550125, |
|
"eval_runtime": 50.1373, |
|
"eval_samples_per_second": 34.505, |
|
"eval_steps_per_second": 4.328, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.726937269372694, |
|
"grad_norm": 0.2477671205997467, |
|
"learning_rate": 0.00010682656826568268, |
|
"loss": 0.1745, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.763837638376384, |
|
"grad_norm": 0.21879136562347412, |
|
"learning_rate": 0.00010590405904059042, |
|
"loss": 0.1617, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.800738007380074, |
|
"grad_norm": 0.195592001080513, |
|
"learning_rate": 0.00010498154981549817, |
|
"loss": 0.1534, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.837638376383764, |
|
"grad_norm": 0.35998597741127014, |
|
"learning_rate": 0.0001040590405904059, |
|
"loss": 0.1606, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.874538745387454, |
|
"grad_norm": 0.30765026807785034, |
|
"learning_rate": 0.00010313653136531364, |
|
"loss": 0.2019, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.911439114391144, |
|
"grad_norm": 0.16130860149860382, |
|
"learning_rate": 0.0001022140221402214, |
|
"loss": 0.1738, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.948339483394834, |
|
"grad_norm": 0.1843736171722412, |
|
"learning_rate": 0.00010129151291512916, |
|
"loss": 0.1941, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.985239852398524, |
|
"grad_norm": 0.21090315282344818, |
|
"learning_rate": 0.0001003690036900369, |
|
"loss": 0.1695, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.022140221402214, |
|
"grad_norm": 0.19030509889125824, |
|
"learning_rate": 9.944649446494465e-05, |
|
"loss": 0.1711, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.059040590405904, |
|
"grad_norm": 0.12992843985557556, |
|
"learning_rate": 9.85239852398524e-05, |
|
"loss": 0.1568, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.059040590405904, |
|
"eval_accuracy": 0.5352601156069364, |
|
"eval_loss": 0.15768703818321228, |
|
"eval_runtime": 50.6123, |
|
"eval_samples_per_second": 34.181, |
|
"eval_steps_per_second": 4.287, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.095940959409594, |
|
"grad_norm": 0.23201997578144073, |
|
"learning_rate": 9.760147601476015e-05, |
|
"loss": 0.1484, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.132841328413284, |
|
"grad_norm": 0.3783067762851715, |
|
"learning_rate": 9.66789667896679e-05, |
|
"loss": 0.1597, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.169741697416974, |
|
"grad_norm": 0.27165931463241577, |
|
"learning_rate": 9.575645756457565e-05, |
|
"loss": 0.156, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.206642066420664, |
|
"grad_norm": 0.2932455241680145, |
|
"learning_rate": 9.48339483394834e-05, |
|
"loss": 0.1353, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.243542435424354, |
|
"grad_norm": 0.27856454253196716, |
|
"learning_rate": 9.391143911439116e-05, |
|
"loss": 0.1555, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.280442804428044, |
|
"grad_norm": 0.2609305679798126, |
|
"learning_rate": 9.298892988929889e-05, |
|
"loss": 0.1549, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.317343173431734, |
|
"grad_norm": 0.4013775587081909, |
|
"learning_rate": 9.206642066420664e-05, |
|
"loss": 0.1555, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.354243542435424, |
|
"grad_norm": 0.24482858180999756, |
|
"learning_rate": 9.11439114391144e-05, |
|
"loss": 0.1583, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.391143911439114, |
|
"grad_norm": 0.2422870397567749, |
|
"learning_rate": 9.022140221402214e-05, |
|
"loss": 0.1663, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.428044280442805, |
|
"grad_norm": 0.2710004448890686, |
|
"learning_rate": 8.92988929889299e-05, |
|
"loss": 0.1484, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.428044280442805, |
|
"eval_accuracy": 0.49190751445086706, |
|
"eval_loss": 0.1513577699661255, |
|
"eval_runtime": 50.7754, |
|
"eval_samples_per_second": 34.072, |
|
"eval_steps_per_second": 4.274, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.464944649446495, |
|
"grad_norm": 0.3608151972293854, |
|
"learning_rate": 8.837638376383764e-05, |
|
"loss": 0.1595, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.501845018450185, |
|
"grad_norm": 0.14578911662101746, |
|
"learning_rate": 8.74538745387454e-05, |
|
"loss": 0.1841, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.538745387453875, |
|
"grad_norm": 0.2544012665748596, |
|
"learning_rate": 8.653136531365315e-05, |
|
"loss": 0.1576, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.575645756457565, |
|
"grad_norm": 0.3130911886692047, |
|
"learning_rate": 8.560885608856088e-05, |
|
"loss": 0.1626, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.612546125461255, |
|
"grad_norm": 0.31136009097099304, |
|
"learning_rate": 8.468634686346863e-05, |
|
"loss": 0.1715, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.649446494464945, |
|
"grad_norm": 0.20172053575515747, |
|
"learning_rate": 8.376383763837639e-05, |
|
"loss": 0.1476, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.686346863468635, |
|
"grad_norm": 0.2550618648529053, |
|
"learning_rate": 8.284132841328413e-05, |
|
"loss": 0.1376, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.723247232472325, |
|
"grad_norm": 0.16149303317070007, |
|
"learning_rate": 8.191881918819189e-05, |
|
"loss": 0.1461, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.760147601476015, |
|
"grad_norm": 0.27109894156455994, |
|
"learning_rate": 8.099630996309964e-05, |
|
"loss": 0.1556, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.797047970479705, |
|
"grad_norm": 0.26436206698417664, |
|
"learning_rate": 8.007380073800739e-05, |
|
"loss": 0.1483, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.797047970479705, |
|
"eval_accuracy": 0.5699421965317919, |
|
"eval_loss": 0.14818404614925385, |
|
"eval_runtime": 49.9368, |
|
"eval_samples_per_second": 34.644, |
|
"eval_steps_per_second": 4.345, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.833948339483395, |
|
"grad_norm": 0.39457815885543823, |
|
"learning_rate": 7.915129151291514e-05, |
|
"loss": 0.1731, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.870848708487085, |
|
"grad_norm": 0.1614658087491989, |
|
"learning_rate": 7.822878228782288e-05, |
|
"loss": 0.1525, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.907749077490775, |
|
"grad_norm": 0.26091647148132324, |
|
"learning_rate": 7.730627306273062e-05, |
|
"loss": 0.1854, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.944649446494465, |
|
"grad_norm": 0.33017560839653015, |
|
"learning_rate": 7.638376383763838e-05, |
|
"loss": 0.1695, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.9815498154981555, |
|
"grad_norm": 0.3637866973876953, |
|
"learning_rate": 7.546125461254612e-05, |
|
"loss": 0.1666, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.018450184501845, |
|
"grad_norm": 0.3373745083808899, |
|
"learning_rate": 7.453874538745388e-05, |
|
"loss": 0.1525, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.055350553505535, |
|
"grad_norm": 0.25333917140960693, |
|
"learning_rate": 7.361623616236163e-05, |
|
"loss": 0.1356, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.092250922509225, |
|
"grad_norm": 0.1722867488861084, |
|
"learning_rate": 7.269372693726938e-05, |
|
"loss": 0.1357, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.129151291512915, |
|
"grad_norm": 0.13959679007530212, |
|
"learning_rate": 7.177121771217713e-05, |
|
"loss": 0.1285, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.166051660516605, |
|
"grad_norm": 0.17668481171131134, |
|
"learning_rate": 7.084870848708487e-05, |
|
"loss": 0.1301, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.166051660516605, |
|
"eval_accuracy": 0.5433526011560693, |
|
"eval_loss": 0.13149897754192352, |
|
"eval_runtime": 50.8795, |
|
"eval_samples_per_second": 34.002, |
|
"eval_steps_per_second": 4.265, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.202952029520295, |
|
"grad_norm": 0.1745270937681198, |
|
"learning_rate": 6.992619926199262e-05, |
|
"loss": 0.1516, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.239852398523985, |
|
"grad_norm": 0.5758349299430847, |
|
"learning_rate": 6.900369003690037e-05, |
|
"loss": 0.1294, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.276752767527675, |
|
"grad_norm": 0.2458232194185257, |
|
"learning_rate": 6.808118081180813e-05, |
|
"loss": 0.1385, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.313653136531365, |
|
"grad_norm": 0.3469581604003906, |
|
"learning_rate": 6.715867158671587e-05, |
|
"loss": 0.1394, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.350553505535055, |
|
"grad_norm": 0.267447292804718, |
|
"learning_rate": 6.623616236162362e-05, |
|
"loss": 0.1432, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.387453874538745, |
|
"grad_norm": 0.24406275153160095, |
|
"learning_rate": 6.531365313653137e-05, |
|
"loss": 0.1396, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.424354243542435, |
|
"grad_norm": 0.7067885994911194, |
|
"learning_rate": 6.439114391143912e-05, |
|
"loss": 0.1456, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.461254612546125, |
|
"grad_norm": 0.2915806174278259, |
|
"learning_rate": 6.346863468634686e-05, |
|
"loss": 0.1366, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.498154981549815, |
|
"grad_norm": 0.22377534210681915, |
|
"learning_rate": 6.25461254612546e-05, |
|
"loss": 0.1273, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.535055350553505, |
|
"grad_norm": 0.3705073893070221, |
|
"learning_rate": 6.162361623616236e-05, |
|
"loss": 0.1149, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.535055350553505, |
|
"eval_accuracy": 0.5583815028901734, |
|
"eval_loss": 0.12937474250793457, |
|
"eval_runtime": 50.8394, |
|
"eval_samples_per_second": 34.029, |
|
"eval_steps_per_second": 4.268, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.571955719557195, |
|
"grad_norm": 0.13345371186733246, |
|
"learning_rate": 6.070110701107011e-05, |
|
"loss": 0.1229, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.608856088560886, |
|
"grad_norm": 0.4122871160507202, |
|
"learning_rate": 5.9778597785977866e-05, |
|
"loss": 0.1689, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.645756457564576, |
|
"grad_norm": 0.14905782043933868, |
|
"learning_rate": 5.8856088560885615e-05, |
|
"loss": 0.1365, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.682656826568266, |
|
"grad_norm": 0.21198387444019318, |
|
"learning_rate": 5.7933579335793364e-05, |
|
"loss": 0.1453, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.719557195571956, |
|
"grad_norm": 0.3941808044910431, |
|
"learning_rate": 5.701107011070111e-05, |
|
"loss": 0.1584, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.756457564575646, |
|
"grad_norm": 0.1366042047739029, |
|
"learning_rate": 5.6088560885608855e-05, |
|
"loss": 0.1219, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.793357933579336, |
|
"grad_norm": 0.1590586006641388, |
|
"learning_rate": 5.5166051660516604e-05, |
|
"loss": 0.1482, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.830258302583026, |
|
"grad_norm": 0.3574014902114868, |
|
"learning_rate": 5.424354243542435e-05, |
|
"loss": 0.1241, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.867158671586716, |
|
"grad_norm": 0.2934325039386749, |
|
"learning_rate": 5.332103321033211e-05, |
|
"loss": 0.1397, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.904059040590406, |
|
"grad_norm": 0.2349650263786316, |
|
"learning_rate": 5.239852398523986e-05, |
|
"loss": 0.1448, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.904059040590406, |
|
"eval_accuracy": 0.5416184971098266, |
|
"eval_loss": 0.12662799656391144, |
|
"eval_runtime": 50.1064, |
|
"eval_samples_per_second": 34.527, |
|
"eval_steps_per_second": 4.331, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.940959409594096, |
|
"grad_norm": 0.39207130670547485, |
|
"learning_rate": 5.1476014760147606e-05, |
|
"loss": 0.1491, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.977859778597786, |
|
"grad_norm": 0.21359127759933472, |
|
"learning_rate": 5.0553505535055354e-05, |
|
"loss": 0.1367, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.014760147601476, |
|
"grad_norm": 0.17874382436275482, |
|
"learning_rate": 4.96309963099631e-05, |
|
"loss": 0.1276, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.051660516605166, |
|
"grad_norm": 0.15224817395210266, |
|
"learning_rate": 4.870848708487085e-05, |
|
"loss": 0.1223, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.088560885608856, |
|
"grad_norm": 0.28657016158103943, |
|
"learning_rate": 4.77859778597786e-05, |
|
"loss": 0.1327, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.125461254612546, |
|
"grad_norm": 0.16251201927661896, |
|
"learning_rate": 4.686346863468635e-05, |
|
"loss": 0.1318, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.162361623616236, |
|
"grad_norm": 0.3002704381942749, |
|
"learning_rate": 4.59409594095941e-05, |
|
"loss": 0.1188, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.199261992619927, |
|
"grad_norm": 0.4188823103904724, |
|
"learning_rate": 4.501845018450185e-05, |
|
"loss": 0.1004, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.236162361623617, |
|
"grad_norm": 0.18772590160369873, |
|
"learning_rate": 4.4095940959409596e-05, |
|
"loss": 0.1002, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.273062730627307, |
|
"grad_norm": 0.30921700596809387, |
|
"learning_rate": 4.3173431734317345e-05, |
|
"loss": 0.1035, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.273062730627307, |
|
"eval_accuracy": 0.6017341040462427, |
|
"eval_loss": 0.11507368832826614, |
|
"eval_runtime": 50.6734, |
|
"eval_samples_per_second": 34.14, |
|
"eval_steps_per_second": 4.282, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.3099630996309966, |
|
"grad_norm": 0.42562779784202576, |
|
"learning_rate": 4.2250922509225094e-05, |
|
"loss": 0.1071, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 6.3468634686346865, |
|
"grad_norm": 0.36547404527664185, |
|
"learning_rate": 4.132841328413284e-05, |
|
"loss": 0.12, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 6.3837638376383765, |
|
"grad_norm": 0.12006784975528717, |
|
"learning_rate": 4.040590405904059e-05, |
|
"loss": 0.1107, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 6.4206642066420665, |
|
"grad_norm": 0.1983233392238617, |
|
"learning_rate": 3.948339483394834e-05, |
|
"loss": 0.1206, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 6.4575645756457565, |
|
"grad_norm": 0.17691943049430847, |
|
"learning_rate": 3.856088560885609e-05, |
|
"loss": 0.1252, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 6.4944649446494465, |
|
"grad_norm": 0.39386728405952454, |
|
"learning_rate": 3.763837638376384e-05, |
|
"loss": 0.1314, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.531365313653136, |
|
"grad_norm": 0.607455313205719, |
|
"learning_rate": 3.6715867158671594e-05, |
|
"loss": 0.1095, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.568265682656826, |
|
"grad_norm": 0.21057389676570892, |
|
"learning_rate": 3.5793357933579336e-05, |
|
"loss": 0.1223, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.605166051660516, |
|
"grad_norm": 0.27539491653442383, |
|
"learning_rate": 3.4870848708487085e-05, |
|
"loss": 0.1163, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.642066420664206, |
|
"grad_norm": 0.24495290219783783, |
|
"learning_rate": 3.3948339483394833e-05, |
|
"loss": 0.1048, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.642066420664206, |
|
"eval_accuracy": 0.6046242774566474, |
|
"eval_loss": 0.10599144548177719, |
|
"eval_runtime": 50.9957, |
|
"eval_samples_per_second": 33.924, |
|
"eval_steps_per_second": 4.255, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.678966789667896, |
|
"grad_norm": 0.38892611861228943, |
|
"learning_rate": 3.302583025830259e-05, |
|
"loss": 0.1352, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.715867158671586, |
|
"grad_norm": 0.2850606143474579, |
|
"learning_rate": 3.210332103321033e-05, |
|
"loss": 0.1153, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.752767527675276, |
|
"grad_norm": 0.16241934895515442, |
|
"learning_rate": 3.118081180811808e-05, |
|
"loss": 0.1074, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 6.789667896678967, |
|
"grad_norm": 0.36088794469833374, |
|
"learning_rate": 3.0258302583025832e-05, |
|
"loss": 0.1219, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 6.826568265682657, |
|
"grad_norm": 0.21467632055282593, |
|
"learning_rate": 2.9335793357933584e-05, |
|
"loss": 0.1083, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.863468634686347, |
|
"grad_norm": 0.4730125069618225, |
|
"learning_rate": 2.8413284132841326e-05, |
|
"loss": 0.1227, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.900369003690037, |
|
"grad_norm": 0.20842638611793518, |
|
"learning_rate": 2.749077490774908e-05, |
|
"loss": 0.128, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 6.937269372693727, |
|
"grad_norm": 0.1885102540254593, |
|
"learning_rate": 2.6568265682656828e-05, |
|
"loss": 0.0923, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.974169741697417, |
|
"grad_norm": 0.48948994278907776, |
|
"learning_rate": 2.564575645756458e-05, |
|
"loss": 0.1165, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.011070110701107, |
|
"grad_norm": 0.4080180525779724, |
|
"learning_rate": 2.472324723247233e-05, |
|
"loss": 0.1168, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.011070110701107, |
|
"eval_accuracy": 0.6173410404624278, |
|
"eval_loss": 0.10073487460613251, |
|
"eval_runtime": 50.8605, |
|
"eval_samples_per_second": 34.015, |
|
"eval_steps_per_second": 4.267, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.047970479704797, |
|
"grad_norm": 0.19095434248447418, |
|
"learning_rate": 2.3800738007380074e-05, |
|
"loss": 0.1131, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.084870848708487, |
|
"grad_norm": 0.23603685200214386, |
|
"learning_rate": 2.2878228782287826e-05, |
|
"loss": 0.089, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.121771217712177, |
|
"grad_norm": 0.09547635912895203, |
|
"learning_rate": 2.195571955719557e-05, |
|
"loss": 0.1032, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.158671586715867, |
|
"grad_norm": 0.18442951142787933, |
|
"learning_rate": 2.1033210332103324e-05, |
|
"loss": 0.0909, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.195571955719557, |
|
"grad_norm": 0.2125350534915924, |
|
"learning_rate": 2.011070110701107e-05, |
|
"loss": 0.0922, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.232472324723247, |
|
"grad_norm": 0.15140217542648315, |
|
"learning_rate": 1.918819188191882e-05, |
|
"loss": 0.1155, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.269372693726937, |
|
"grad_norm": 0.25905662775039673, |
|
"learning_rate": 1.826568265682657e-05, |
|
"loss": 0.1194, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.306273062730627, |
|
"grad_norm": 0.18217885494232178, |
|
"learning_rate": 1.734317343173432e-05, |
|
"loss": 0.1, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.343173431734318, |
|
"grad_norm": 0.21871539950370789, |
|
"learning_rate": 1.6420664206642068e-05, |
|
"loss": 0.1022, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 7.380073800738008, |
|
"grad_norm": 0.4127865731716156, |
|
"learning_rate": 1.5498154981549817e-05, |
|
"loss": 0.1104, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.380073800738008, |
|
"eval_accuracy": 0.6445086705202312, |
|
"eval_loss": 0.09489033371210098, |
|
"eval_runtime": 50.8042, |
|
"eval_samples_per_second": 34.052, |
|
"eval_steps_per_second": 4.271, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.416974169741698, |
|
"grad_norm": 0.20000500977039337, |
|
"learning_rate": 1.4575645756457566e-05, |
|
"loss": 0.1031, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 7.453874538745388, |
|
"grad_norm": 0.5234202742576599, |
|
"learning_rate": 1.3653136531365315e-05, |
|
"loss": 0.1197, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 7.490774907749078, |
|
"grad_norm": 0.16442282497882843, |
|
"learning_rate": 1.2730627306273063e-05, |
|
"loss": 0.1036, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 7.527675276752768, |
|
"grad_norm": 0.19210496544837952, |
|
"learning_rate": 1.1808118081180812e-05, |
|
"loss": 0.0993, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 7.564575645756458, |
|
"grad_norm": 0.1562729775905609, |
|
"learning_rate": 1.0885608856088561e-05, |
|
"loss": 0.0941, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.601476014760148, |
|
"grad_norm": 0.29051193594932556, |
|
"learning_rate": 9.96309963099631e-06, |
|
"loss": 0.092, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.638376383763838, |
|
"grad_norm": 0.21477282047271729, |
|
"learning_rate": 9.040590405904059e-06, |
|
"loss": 0.1123, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 7.675276752767528, |
|
"grad_norm": 0.03506307676434517, |
|
"learning_rate": 8.118081180811808e-06, |
|
"loss": 0.0927, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 7.712177121771218, |
|
"grad_norm": 0.21280255913734436, |
|
"learning_rate": 7.195571955719557e-06, |
|
"loss": 0.084, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 7.749077490774908, |
|
"grad_norm": 0.19547449052333832, |
|
"learning_rate": 6.273062730627306e-06, |
|
"loss": 0.0873, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.749077490774908, |
|
"eval_accuracy": 0.6526011560693642, |
|
"eval_loss": 0.0923289805650711, |
|
"eval_runtime": 50.6448, |
|
"eval_samples_per_second": 34.159, |
|
"eval_steps_per_second": 4.285, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.785977859778598, |
|
"grad_norm": 0.22129392623901367, |
|
"learning_rate": 5.350553505535055e-06, |
|
"loss": 0.1, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 7.822878228782288, |
|
"grad_norm": 0.2631789445877075, |
|
"learning_rate": 4.428044280442805e-06, |
|
"loss": 0.0811, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 7.8597785977859775, |
|
"grad_norm": 0.16971804201602936, |
|
"learning_rate": 3.5055350553505534e-06, |
|
"loss": 0.0977, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 7.8966789667896675, |
|
"grad_norm": 0.10247929394245148, |
|
"learning_rate": 2.5830258302583027e-06, |
|
"loss": 0.0966, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 7.9335793357933575, |
|
"grad_norm": 0.15865936875343323, |
|
"learning_rate": 1.6605166051660517e-06, |
|
"loss": 0.1039, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 7.970479704797048, |
|
"grad_norm": 0.506331205368042, |
|
"learning_rate": 7.380073800738008e-07, |
|
"loss": 0.0977, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"step": 2168, |
|
"total_flos": 2.6821552511927255e+18, |
|
"train_loss": 0.16216810325304962, |
|
"train_runtime": 3143.5173, |
|
"train_samples_per_second": 11.009, |
|
"train_steps_per_second": 0.69 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2168, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.6821552511927255e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|