ViT-NIH-Chest-X-ray-dataset-small / trainer_state.json
{
"best_metric": 0.0923289805650711,
"best_model_checkpoint": "./ViT-NIH-Chest-X-ray-dataset-small/checkpoint-2100",
"epoch": 8.0,
"eval_steps": 100,
"global_step": 2168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03690036900369004,
"grad_norm": 0.38048115372657776,
"learning_rate": 0.00019907749077490775,
"loss": 0.5131,
"step": 10
},
{
"epoch": 0.07380073800738007,
"grad_norm": 0.3416444957256317,
"learning_rate": 0.00019815498154981552,
"loss": 0.3661,
"step": 20
},
{
"epoch": 0.11070110701107011,
"grad_norm": 0.268430233001709,
"learning_rate": 0.00019723247232472326,
"loss": 0.3122,
"step": 30
},
{
"epoch": 0.14760147601476015,
"grad_norm": 0.20281535387039185,
"learning_rate": 0.000196309963099631,
"loss": 0.2862,
"step": 40
},
{
"epoch": 0.18450184501845018,
"grad_norm": 0.17638804018497467,
"learning_rate": 0.00019538745387453877,
"loss": 0.2582,
"step": 50
},
{
"epoch": 0.22140221402214022,
"grad_norm": 0.13983863592147827,
"learning_rate": 0.00019446494464944652,
"loss": 0.2621,
"step": 60
},
{
"epoch": 0.25830258302583026,
"grad_norm": 0.1306193470954895,
"learning_rate": 0.00019354243542435426,
"loss": 0.2361,
"step": 70
},
{
"epoch": 0.2952029520295203,
"grad_norm": 0.12178371101617813,
"learning_rate": 0.000192619926199262,
"loss": 0.2271,
"step": 80
},
{
"epoch": 0.33210332103321033,
"grad_norm": 0.09625957161188126,
"learning_rate": 0.00019169741697416974,
"loss": 0.2275,
"step": 90
},
{
"epoch": 0.36900369003690037,
"grad_norm": 0.12183211743831635,
"learning_rate": 0.00019077490774907748,
"loss": 0.2128,
"step": 100
},
{
"epoch": 0.36900369003690037,
"eval_accuracy": 0.0,
"eval_loss": 0.20924170315265656,
"eval_runtime": 51.5235,
"eval_samples_per_second": 33.577,
"eval_steps_per_second": 4.212,
"step": 100
},
{
"epoch": 0.4059040590405904,
"grad_norm": 0.08246201276779175,
"learning_rate": 0.00018985239852398525,
"loss": 0.2089,
"step": 110
},
{
"epoch": 0.44280442804428044,
"grad_norm": 0.11126121133565903,
"learning_rate": 0.000188929889298893,
"loss": 0.2147,
"step": 120
},
{
"epoch": 0.4797047970479705,
"grad_norm": 0.11177172511816025,
"learning_rate": 0.00018800738007380074,
"loss": 0.2044,
"step": 130
},
{
"epoch": 0.5166051660516605,
"grad_norm": 0.10021921247243881,
"learning_rate": 0.0001870848708487085,
"loss": 0.1839,
"step": 140
},
{
"epoch": 0.5535055350553506,
"grad_norm": 0.185609832406044,
"learning_rate": 0.00018616236162361625,
"loss": 0.2209,
"step": 150
},
{
"epoch": 0.5904059040590406,
"grad_norm": 0.17880532145500183,
"learning_rate": 0.000185239852398524,
"loss": 0.2023,
"step": 160
},
{
"epoch": 0.6273062730627307,
"grad_norm": 0.09112340211868286,
"learning_rate": 0.00018431734317343173,
"loss": 0.1952,
"step": 170
},
{
"epoch": 0.6642066420664207,
"grad_norm": 0.14496631920337677,
"learning_rate": 0.0001833948339483395,
"loss": 0.2154,
"step": 180
},
{
"epoch": 0.7011070110701108,
"grad_norm": 0.1129971593618393,
"learning_rate": 0.00018247232472324724,
"loss": 0.1865,
"step": 190
},
{
"epoch": 0.7380073800738007,
"grad_norm": 0.1439884454011917,
"learning_rate": 0.00018154981549815499,
"loss": 0.1848,
"step": 200
},
{
"epoch": 0.7380073800738007,
"eval_accuracy": 0.38208092485549133,
"eval_loss": 0.1909271478652954,
"eval_runtime": 51.554,
"eval_samples_per_second": 33.557,
"eval_steps_per_second": 4.209,
"step": 200
},
{
"epoch": 0.7749077490774908,
"grad_norm": 0.1933569312095642,
"learning_rate": 0.00018062730627306276,
"loss": 0.1921,
"step": 210
},
{
"epoch": 0.8118081180811808,
"grad_norm": 0.19441623985767365,
"learning_rate": 0.0001797047970479705,
"loss": 0.2087,
"step": 220
},
{
"epoch": 0.8487084870848709,
"grad_norm": 0.10429559648036957,
"learning_rate": 0.00017878228782287824,
"loss": 0.2016,
"step": 230
},
{
"epoch": 0.8856088560885609,
"grad_norm": 0.11976602673530579,
"learning_rate": 0.00017785977859778598,
"loss": 0.1871,
"step": 240
},
{
"epoch": 0.922509225092251,
"grad_norm": 0.13647252321243286,
"learning_rate": 0.00017693726937269372,
"loss": 0.1951,
"step": 250
},
{
"epoch": 0.959409594095941,
"grad_norm": 0.13491246104240417,
"learning_rate": 0.00017601476014760147,
"loss": 0.1987,
"step": 260
},
{
"epoch": 0.996309963099631,
"grad_norm": 0.10877779126167297,
"learning_rate": 0.00017509225092250923,
"loss": 0.2067,
"step": 270
},
{
"epoch": 1.033210332103321,
"grad_norm": 0.10195717215538025,
"learning_rate": 0.00017416974169741698,
"loss": 0.1829,
"step": 280
},
{
"epoch": 1.070110701107011,
"grad_norm": 0.11287475377321243,
"learning_rate": 0.00017324723247232472,
"loss": 0.1771,
"step": 290
},
{
"epoch": 1.1070110701107012,
"grad_norm": 0.10177090764045715,
"learning_rate": 0.0001723247232472325,
"loss": 0.171,
"step": 300
},
{
"epoch": 1.1070110701107012,
"eval_accuracy": 0.5387283236994219,
"eval_loss": 0.1967419981956482,
"eval_runtime": 50.6773,
"eval_samples_per_second": 34.138,
"eval_steps_per_second": 4.282,
"step": 300
},
{
"epoch": 1.1439114391143912,
"grad_norm": 0.14216077327728271,
"learning_rate": 0.00017140221402214023,
"loss": 0.2104,
"step": 310
},
{
"epoch": 1.1808118081180812,
"grad_norm": 0.0753447487950325,
"learning_rate": 0.00017047970479704797,
"loss": 0.201,
"step": 320
},
{
"epoch": 1.2177121771217712,
"grad_norm": 0.13927125930786133,
"learning_rate": 0.00016955719557195574,
"loss": 0.1923,
"step": 330
},
{
"epoch": 1.2546125461254611,
"grad_norm": 0.14396004378795624,
"learning_rate": 0.00016863468634686348,
"loss": 0.1996,
"step": 340
},
{
"epoch": 1.2915129151291513,
"grad_norm": 0.09631673246622086,
"learning_rate": 0.00016771217712177123,
"loss": 0.1672,
"step": 350
},
{
"epoch": 1.3284132841328413,
"grad_norm": 0.1342993974685669,
"learning_rate": 0.00016678966789667897,
"loss": 0.1792,
"step": 360
},
{
"epoch": 1.3653136531365313,
"grad_norm": 0.20273268222808838,
"learning_rate": 0.00016586715867158674,
"loss": 0.1753,
"step": 370
},
{
"epoch": 1.4022140221402215,
"grad_norm": 0.1452128291130066,
"learning_rate": 0.00016494464944649448,
"loss": 0.1801,
"step": 380
},
{
"epoch": 1.4391143911439115,
"grad_norm": 0.0973893478512764,
"learning_rate": 0.00016402214022140222,
"loss": 0.2136,
"step": 390
},
{
"epoch": 1.4760147601476015,
"grad_norm": 0.2589876651763916,
"learning_rate": 0.00016309963099630996,
"loss": 0.1772,
"step": 400
},
{
"epoch": 1.4760147601476015,
"eval_accuracy": 0.5450867052023122,
"eval_loss": 0.19317613542079926,
"eval_runtime": 50.5026,
"eval_samples_per_second": 34.256,
"eval_steps_per_second": 4.297,
"step": 400
},
{
"epoch": 1.5129151291512914,
"grad_norm": 0.16841156780719757,
"learning_rate": 0.0001621771217712177,
"loss": 0.1894,
"step": 410
},
{
"epoch": 1.5498154981549814,
"grad_norm": 0.17766626179218292,
"learning_rate": 0.00016125461254612547,
"loss": 0.204,
"step": 420
},
{
"epoch": 1.5867158671586716,
"grad_norm": 0.13253839313983917,
"learning_rate": 0.00016033210332103322,
"loss": 0.2141,
"step": 430
},
{
"epoch": 1.6236162361623616,
"grad_norm": 0.10587523877620697,
"learning_rate": 0.00015940959409594096,
"loss": 0.2192,
"step": 440
},
{
"epoch": 1.6605166051660518,
"grad_norm": 0.10049675405025482,
"learning_rate": 0.0001584870848708487,
"loss": 0.1939,
"step": 450
},
{
"epoch": 1.6974169741697418,
"grad_norm": 0.1287400722503662,
"learning_rate": 0.00015756457564575647,
"loss": 0.1955,
"step": 460
},
{
"epoch": 1.7343173431734318,
"grad_norm": 0.15120339393615723,
"learning_rate": 0.0001566420664206642,
"loss": 0.1863,
"step": 470
},
{
"epoch": 1.7712177121771218,
"grad_norm": 0.1202373206615448,
"learning_rate": 0.00015571955719557195,
"loss": 0.1832,
"step": 480
},
{
"epoch": 1.8081180811808117,
"grad_norm": 0.1368759572505951,
"learning_rate": 0.00015479704797047972,
"loss": 0.2007,
"step": 490
},
{
"epoch": 1.8450184501845017,
"grad_norm": 0.08591968566179276,
"learning_rate": 0.00015387453874538746,
"loss": 0.1629,
"step": 500
},
{
"epoch": 1.8450184501845017,
"eval_accuracy": 0.4485549132947977,
"eval_loss": 0.1842162311077118,
"eval_runtime": 50.5861,
"eval_samples_per_second": 34.199,
"eval_steps_per_second": 4.29,
"step": 500
},
{
"epoch": 1.881918819188192,
"grad_norm": 0.12317466735839844,
"learning_rate": 0.0001529520295202952,
"loss": 0.213,
"step": 510
},
{
"epoch": 1.918819188191882,
"grad_norm": 0.15092293918132782,
"learning_rate": 0.00015202952029520298,
"loss": 0.2004,
"step": 520
},
{
"epoch": 1.9557195571955721,
"grad_norm": 0.20343895256519318,
"learning_rate": 0.00015110701107011072,
"loss": 0.1835,
"step": 530
},
{
"epoch": 1.992619926199262,
"grad_norm": 0.32362422347068787,
"learning_rate": 0.00015018450184501846,
"loss": 0.1915,
"step": 540
},
{
"epoch": 2.029520295202952,
"grad_norm": 0.14631719887256622,
"learning_rate": 0.00014926199261992623,
"loss": 0.2113,
"step": 550
},
{
"epoch": 2.066420664206642,
"grad_norm": 0.12011805921792984,
"learning_rate": 0.00014833948339483394,
"loss": 0.1781,
"step": 560
},
{
"epoch": 2.103321033210332,
"grad_norm": 0.14479252696037292,
"learning_rate": 0.00014741697416974169,
"loss": 0.182,
"step": 570
},
{
"epoch": 2.140221402214022,
"grad_norm": 0.18806347250938416,
"learning_rate": 0.00014649446494464946,
"loss": 0.1682,
"step": 580
},
{
"epoch": 2.177121771217712,
"grad_norm": 0.17025235295295715,
"learning_rate": 0.0001455719557195572,
"loss": 0.1824,
"step": 590
},
{
"epoch": 2.2140221402214024,
"grad_norm": 0.16879422962665558,
"learning_rate": 0.00014464944649446494,
"loss": 0.1942,
"step": 600
},
{
"epoch": 2.2140221402214024,
"eval_accuracy": 0.41965317919075146,
"eval_loss": 0.17699980735778809,
"eval_runtime": 50.0976,
"eval_samples_per_second": 34.533,
"eval_steps_per_second": 4.332,
"step": 600
},
{
"epoch": 2.2509225092250924,
"grad_norm": 0.168411523103714,
"learning_rate": 0.0001437269372693727,
"loss": 0.1732,
"step": 610
},
{
"epoch": 2.2878228782287824,
"grad_norm": 0.21096496284008026,
"learning_rate": 0.00014280442804428045,
"loss": 0.1842,
"step": 620
},
{
"epoch": 2.3247232472324724,
"grad_norm": 0.18110381066799164,
"learning_rate": 0.0001418819188191882,
"loss": 0.1772,
"step": 630
},
{
"epoch": 2.3616236162361623,
"grad_norm": 0.15854766964912415,
"learning_rate": 0.00014095940959409593,
"loss": 0.1709,
"step": 640
},
{
"epoch": 2.3985239852398523,
"grad_norm": 0.19320182502269745,
"learning_rate": 0.0001400369003690037,
"loss": 0.2,
"step": 650
},
{
"epoch": 2.4354243542435423,
"grad_norm": 0.16658619046211243,
"learning_rate": 0.00013911439114391145,
"loss": 0.2061,
"step": 660
},
{
"epoch": 2.4723247232472323,
"grad_norm": 0.14636483788490295,
"learning_rate": 0.0001381918819188192,
"loss": 0.1946,
"step": 670
},
{
"epoch": 2.5092250922509223,
"grad_norm": 0.1587982028722763,
"learning_rate": 0.00013726937269372696,
"loss": 0.1963,
"step": 680
},
{
"epoch": 2.5461254612546127,
"grad_norm": 0.2764102518558502,
"learning_rate": 0.0001363468634686347,
"loss": 0.1702,
"step": 690
},
{
"epoch": 2.5830258302583027,
"grad_norm": 0.14449751377105713,
"learning_rate": 0.00013542435424354244,
"loss": 0.1714,
"step": 700
},
{
"epoch": 2.5830258302583027,
"eval_accuracy": 0.5023121387283237,
"eval_loss": 0.17974236607551575,
"eval_runtime": 50.796,
"eval_samples_per_second": 34.058,
"eval_steps_per_second": 4.272,
"step": 700
},
{
"epoch": 2.6199261992619927,
"grad_norm": 0.15392902493476868,
"learning_rate": 0.0001345018450184502,
"loss": 0.191,
"step": 710
},
{
"epoch": 2.6568265682656826,
"grad_norm": 0.15529021620750427,
"learning_rate": 0.00013357933579335793,
"loss": 0.193,
"step": 720
},
{
"epoch": 2.6937269372693726,
"grad_norm": 0.18234789371490479,
"learning_rate": 0.00013265682656826567,
"loss": 0.1836,
"step": 730
},
{
"epoch": 2.7306273062730626,
"grad_norm": 0.19954174757003784,
"learning_rate": 0.00013173431734317344,
"loss": 0.2176,
"step": 740
},
{
"epoch": 2.767527675276753,
"grad_norm": 0.13893257081508636,
"learning_rate": 0.00013081180811808118,
"loss": 0.1699,
"step": 750
},
{
"epoch": 2.804428044280443,
"grad_norm": 0.16896647214889526,
"learning_rate": 0.00012988929889298892,
"loss": 0.168,
"step": 760
},
{
"epoch": 2.841328413284133,
"grad_norm": 0.20796014368534088,
"learning_rate": 0.0001289667896678967,
"loss": 0.2141,
"step": 770
},
{
"epoch": 2.878228782287823,
"grad_norm": 0.2690466046333313,
"learning_rate": 0.00012804428044280443,
"loss": 0.1778,
"step": 780
},
{
"epoch": 2.915129151291513,
"grad_norm": 0.14259500801563263,
"learning_rate": 0.00012712177121771217,
"loss": 0.1748,
"step": 790
},
{
"epoch": 2.952029520295203,
"grad_norm": 0.14488738775253296,
"learning_rate": 0.00012619926199261994,
"loss": 0.1832,
"step": 800
},
{
"epoch": 2.952029520295203,
"eval_accuracy": 0.36878612716763004,
"eval_loss": 0.17303667962551117,
"eval_runtime": 50.3505,
"eval_samples_per_second": 34.359,
"eval_steps_per_second": 4.31,
"step": 800
},
{
"epoch": 2.988929889298893,
"grad_norm": 0.1963815540075302,
"learning_rate": 0.00012527675276752769,
"loss": 0.1603,
"step": 810
},
{
"epoch": 3.025830258302583,
"grad_norm": 0.18811728060245514,
"learning_rate": 0.00012435424354243543,
"loss": 0.1668,
"step": 820
},
{
"epoch": 3.062730627306273,
"grad_norm": 0.3115330636501312,
"learning_rate": 0.0001234317343173432,
"loss": 0.1764,
"step": 830
},
{
"epoch": 3.0996309963099633,
"grad_norm": 0.15212470293045044,
"learning_rate": 0.00012250922509225094,
"loss": 0.1668,
"step": 840
},
{
"epoch": 3.1365313653136533,
"grad_norm": 0.17935976386070251,
"learning_rate": 0.00012158671586715868,
"loss": 0.1807,
"step": 850
},
{
"epoch": 3.1734317343173433,
"grad_norm": 0.23978868126869202,
"learning_rate": 0.00012066420664206644,
"loss": 0.168,
"step": 860
},
{
"epoch": 3.2103321033210332,
"grad_norm": 0.29603224992752075,
"learning_rate": 0.00011974169741697419,
"loss": 0.1827,
"step": 870
},
{
"epoch": 3.2472324723247232,
"grad_norm": 0.1385461539030075,
"learning_rate": 0.00011881918819188192,
"loss": 0.1794,
"step": 880
},
{
"epoch": 3.284132841328413,
"grad_norm": 0.20920993387699127,
"learning_rate": 0.00011789667896678966,
"loss": 0.1738,
"step": 890
},
{
"epoch": 3.321033210332103,
"grad_norm": 0.31590428948402405,
"learning_rate": 0.00011697416974169742,
"loss": 0.1766,
"step": 900
},
{
"epoch": 3.321033210332103,
"eval_accuracy": 0.34277456647398846,
"eval_loss": 0.17552779614925385,
"eval_runtime": 50.5381,
"eval_samples_per_second": 34.232,
"eval_steps_per_second": 4.294,
"step": 900
},
{
"epoch": 3.357933579335793,
"grad_norm": 0.22194945812225342,
"learning_rate": 0.00011605166051660516,
"loss": 0.1814,
"step": 910
},
{
"epoch": 3.3948339483394836,
"grad_norm": 0.22071777284145355,
"learning_rate": 0.00011512915129151292,
"loss": 0.1629,
"step": 920
},
{
"epoch": 3.4317343173431736,
"grad_norm": 0.44112759828567505,
"learning_rate": 0.00011420664206642067,
"loss": 0.1914,
"step": 930
},
{
"epoch": 3.4686346863468636,
"grad_norm": 0.20971660315990448,
"learning_rate": 0.00011328413284132841,
"loss": 0.1691,
"step": 940
},
{
"epoch": 3.5055350553505535,
"grad_norm": 0.23813588917255402,
"learning_rate": 0.00011236162361623617,
"loss": 0.1919,
"step": 950
},
{
"epoch": 3.5424354243542435,
"grad_norm": 0.19610780477523804,
"learning_rate": 0.00011143911439114391,
"loss": 0.1631,
"step": 960
},
{
"epoch": 3.5793357933579335,
"grad_norm": 0.29578620195388794,
"learning_rate": 0.00011051660516605167,
"loss": 0.1721,
"step": 970
},
{
"epoch": 3.6162361623616235,
"grad_norm": 0.15876761078834534,
"learning_rate": 0.00010959409594095942,
"loss": 0.1869,
"step": 980
},
{
"epoch": 3.6531365313653135,
"grad_norm": 0.19575054943561554,
"learning_rate": 0.00010867158671586716,
"loss": 0.1676,
"step": 990
},
{
"epoch": 3.6900369003690034,
"grad_norm": 0.12657958269119263,
"learning_rate": 0.00010774907749077492,
"loss": 0.1697,
"step": 1000
},
{
"epoch": 3.6900369003690034,
"eval_accuracy": 0.5167630057803468,
"eval_loss": 0.1601094752550125,
"eval_runtime": 50.1373,
"eval_samples_per_second": 34.505,
"eval_steps_per_second": 4.328,
"step": 1000
},
{
"epoch": 3.726937269372694,
"grad_norm": 0.2477671205997467,
"learning_rate": 0.00010682656826568268,
"loss": 0.1745,
"step": 1010
},
{
"epoch": 3.763837638376384,
"grad_norm": 0.21879136562347412,
"learning_rate": 0.00010590405904059042,
"loss": 0.1617,
"step": 1020
},
{
"epoch": 3.800738007380074,
"grad_norm": 0.195592001080513,
"learning_rate": 0.00010498154981549817,
"loss": 0.1534,
"step": 1030
},
{
"epoch": 3.837638376383764,
"grad_norm": 0.35998597741127014,
"learning_rate": 0.0001040590405904059,
"loss": 0.1606,
"step": 1040
},
{
"epoch": 3.874538745387454,
"grad_norm": 0.30765026807785034,
"learning_rate": 0.00010313653136531364,
"loss": 0.2019,
"step": 1050
},
{
"epoch": 3.911439114391144,
"grad_norm": 0.16130860149860382,
"learning_rate": 0.0001022140221402214,
"loss": 0.1738,
"step": 1060
},
{
"epoch": 3.948339483394834,
"grad_norm": 0.1843736171722412,
"learning_rate": 0.00010129151291512916,
"loss": 0.1941,
"step": 1070
},
{
"epoch": 3.985239852398524,
"grad_norm": 0.21090315282344818,
"learning_rate": 0.0001003690036900369,
"loss": 0.1695,
"step": 1080
},
{
"epoch": 4.022140221402214,
"grad_norm": 0.19030509889125824,
"learning_rate": 9.944649446494465e-05,
"loss": 0.1711,
"step": 1090
},
{
"epoch": 4.059040590405904,
"grad_norm": 0.12992843985557556,
"learning_rate": 9.85239852398524e-05,
"loss": 0.1568,
"step": 1100
},
{
"epoch": 4.059040590405904,
"eval_accuracy": 0.5352601156069364,
"eval_loss": 0.15768703818321228,
"eval_runtime": 50.6123,
"eval_samples_per_second": 34.181,
"eval_steps_per_second": 4.287,
"step": 1100
},
{
"epoch": 4.095940959409594,
"grad_norm": 0.23201997578144073,
"learning_rate": 9.760147601476015e-05,
"loss": 0.1484,
"step": 1110
},
{
"epoch": 4.132841328413284,
"grad_norm": 0.3783067762851715,
"learning_rate": 9.66789667896679e-05,
"loss": 0.1597,
"step": 1120
},
{
"epoch": 4.169741697416974,
"grad_norm": 0.27165931463241577,
"learning_rate": 9.575645756457565e-05,
"loss": 0.156,
"step": 1130
},
{
"epoch": 4.206642066420664,
"grad_norm": 0.2932455241680145,
"learning_rate": 9.48339483394834e-05,
"loss": 0.1353,
"step": 1140
},
{
"epoch": 4.243542435424354,
"grad_norm": 0.27856454253196716,
"learning_rate": 9.391143911439116e-05,
"loss": 0.1555,
"step": 1150
},
{
"epoch": 4.280442804428044,
"grad_norm": 0.2609305679798126,
"learning_rate": 9.298892988929889e-05,
"loss": 0.1549,
"step": 1160
},
{
"epoch": 4.317343173431734,
"grad_norm": 0.4013775587081909,
"learning_rate": 9.206642066420664e-05,
"loss": 0.1555,
"step": 1170
},
{
"epoch": 4.354243542435424,
"grad_norm": 0.24482858180999756,
"learning_rate": 9.11439114391144e-05,
"loss": 0.1583,
"step": 1180
},
{
"epoch": 4.391143911439114,
"grad_norm": 0.2422870397567749,
"learning_rate": 9.022140221402214e-05,
"loss": 0.1663,
"step": 1190
},
{
"epoch": 4.428044280442805,
"grad_norm": 0.2710004448890686,
"learning_rate": 8.92988929889299e-05,
"loss": 0.1484,
"step": 1200
},
{
"epoch": 4.428044280442805,
"eval_accuracy": 0.49190751445086706,
"eval_loss": 0.1513577699661255,
"eval_runtime": 50.7754,
"eval_samples_per_second": 34.072,
"eval_steps_per_second": 4.274,
"step": 1200
},
{
"epoch": 4.464944649446495,
"grad_norm": 0.3608151972293854,
"learning_rate": 8.837638376383764e-05,
"loss": 0.1595,
"step": 1210
},
{
"epoch": 4.501845018450185,
"grad_norm": 0.14578911662101746,
"learning_rate": 8.74538745387454e-05,
"loss": 0.1841,
"step": 1220
},
{
"epoch": 4.538745387453875,
"grad_norm": 0.2544012665748596,
"learning_rate": 8.653136531365315e-05,
"loss": 0.1576,
"step": 1230
},
{
"epoch": 4.575645756457565,
"grad_norm": 0.3130911886692047,
"learning_rate": 8.560885608856088e-05,
"loss": 0.1626,
"step": 1240
},
{
"epoch": 4.612546125461255,
"grad_norm": 0.31136009097099304,
"learning_rate": 8.468634686346863e-05,
"loss": 0.1715,
"step": 1250
},
{
"epoch": 4.649446494464945,
"grad_norm": 0.20172053575515747,
"learning_rate": 8.376383763837639e-05,
"loss": 0.1476,
"step": 1260
},
{
"epoch": 4.686346863468635,
"grad_norm": 0.2550618648529053,
"learning_rate": 8.284132841328413e-05,
"loss": 0.1376,
"step": 1270
},
{
"epoch": 4.723247232472325,
"grad_norm": 0.16149303317070007,
"learning_rate": 8.191881918819189e-05,
"loss": 0.1461,
"step": 1280
},
{
"epoch": 4.760147601476015,
"grad_norm": 0.27109894156455994,
"learning_rate": 8.099630996309964e-05,
"loss": 0.1556,
"step": 1290
},
{
"epoch": 4.797047970479705,
"grad_norm": 0.26436206698417664,
"learning_rate": 8.007380073800739e-05,
"loss": 0.1483,
"step": 1300
},
{
"epoch": 4.797047970479705,
"eval_accuracy": 0.5699421965317919,
"eval_loss": 0.14818404614925385,
"eval_runtime": 49.9368,
"eval_samples_per_second": 34.644,
"eval_steps_per_second": 4.345,
"step": 1300
},
{
"epoch": 4.833948339483395,
"grad_norm": 0.39457815885543823,
"learning_rate": 7.915129151291514e-05,
"loss": 0.1731,
"step": 1310
},
{
"epoch": 4.870848708487085,
"grad_norm": 0.1614658087491989,
"learning_rate": 7.822878228782288e-05,
"loss": 0.1525,
"step": 1320
},
{
"epoch": 4.907749077490775,
"grad_norm": 0.26091647148132324,
"learning_rate": 7.730627306273062e-05,
"loss": 0.1854,
"step": 1330
},
{
"epoch": 4.944649446494465,
"grad_norm": 0.33017560839653015,
"learning_rate": 7.638376383763838e-05,
"loss": 0.1695,
"step": 1340
},
{
"epoch": 4.9815498154981555,
"grad_norm": 0.3637866973876953,
"learning_rate": 7.546125461254612e-05,
"loss": 0.1666,
"step": 1350
},
{
"epoch": 5.018450184501845,
"grad_norm": 0.3373745083808899,
"learning_rate": 7.453874538745388e-05,
"loss": 0.1525,
"step": 1360
},
{
"epoch": 5.055350553505535,
"grad_norm": 0.25333917140960693,
"learning_rate": 7.361623616236163e-05,
"loss": 0.1356,
"step": 1370
},
{
"epoch": 5.092250922509225,
"grad_norm": 0.1722867488861084,
"learning_rate": 7.269372693726938e-05,
"loss": 0.1357,
"step": 1380
},
{
"epoch": 5.129151291512915,
"grad_norm": 0.13959679007530212,
"learning_rate": 7.177121771217713e-05,
"loss": 0.1285,
"step": 1390
},
{
"epoch": 5.166051660516605,
"grad_norm": 0.17668481171131134,
"learning_rate": 7.084870848708487e-05,
"loss": 0.1301,
"step": 1400
},
{
"epoch": 5.166051660516605,
"eval_accuracy": 0.5433526011560693,
"eval_loss": 0.13149897754192352,
"eval_runtime": 50.8795,
"eval_samples_per_second": 34.002,
"eval_steps_per_second": 4.265,
"step": 1400
},
{
"epoch": 5.202952029520295,
"grad_norm": 0.1745270937681198,
"learning_rate": 6.992619926199262e-05,
"loss": 0.1516,
"step": 1410
},
{
"epoch": 5.239852398523985,
"grad_norm": 0.5758349299430847,
"learning_rate": 6.900369003690037e-05,
"loss": 0.1294,
"step": 1420
},
{
"epoch": 5.276752767527675,
"grad_norm": 0.2458232194185257,
"learning_rate": 6.808118081180813e-05,
"loss": 0.1385,
"step": 1430
},
{
"epoch": 5.313653136531365,
"grad_norm": 0.3469581604003906,
"learning_rate": 6.715867158671587e-05,
"loss": 0.1394,
"step": 1440
},
{
"epoch": 5.350553505535055,
"grad_norm": 0.267447292804718,
"learning_rate": 6.623616236162362e-05,
"loss": 0.1432,
"step": 1450
},
{
"epoch": 5.387453874538745,
"grad_norm": 0.24406275153160095,
"learning_rate": 6.531365313653137e-05,
"loss": 0.1396,
"step": 1460
},
{
"epoch": 5.424354243542435,
"grad_norm": 0.7067885994911194,
"learning_rate": 6.439114391143912e-05,
"loss": 0.1456,
"step": 1470
},
{
"epoch": 5.461254612546125,
"grad_norm": 0.2915806174278259,
"learning_rate": 6.346863468634686e-05,
"loss": 0.1366,
"step": 1480
},
{
"epoch": 5.498154981549815,
"grad_norm": 0.22377534210681915,
"learning_rate": 6.25461254612546e-05,
"loss": 0.1273,
"step": 1490
},
{
"epoch": 5.535055350553505,
"grad_norm": 0.3705073893070221,
"learning_rate": 6.162361623616236e-05,
"loss": 0.1149,
"step": 1500
},
{
"epoch": 5.535055350553505,
"eval_accuracy": 0.5583815028901734,
"eval_loss": 0.12937474250793457,
"eval_runtime": 50.8394,
"eval_samples_per_second": 34.029,
"eval_steps_per_second": 4.268,
"step": 1500
},
{
"epoch": 5.571955719557195,
"grad_norm": 0.13345371186733246,
"learning_rate": 6.070110701107011e-05,
"loss": 0.1229,
"step": 1510
},
{
"epoch": 5.608856088560886,
"grad_norm": 0.4122871160507202,
"learning_rate": 5.9778597785977866e-05,
"loss": 0.1689,
"step": 1520
},
{
"epoch": 5.645756457564576,
"grad_norm": 0.14905782043933868,
"learning_rate": 5.8856088560885615e-05,
"loss": 0.1365,
"step": 1530
},
{
"epoch": 5.682656826568266,
"grad_norm": 0.21198387444019318,
"learning_rate": 5.7933579335793364e-05,
"loss": 0.1453,
"step": 1540
},
{
"epoch": 5.719557195571956,
"grad_norm": 0.3941808044910431,
"learning_rate": 5.701107011070111e-05,
"loss": 0.1584,
"step": 1550
},
{
"epoch": 5.756457564575646,
"grad_norm": 0.1366042047739029,
"learning_rate": 5.6088560885608855e-05,
"loss": 0.1219,
"step": 1560
},
{
"epoch": 5.793357933579336,
"grad_norm": 0.1590586006641388,
"learning_rate": 5.5166051660516604e-05,
"loss": 0.1482,
"step": 1570
},
{
"epoch": 5.830258302583026,
"grad_norm": 0.3574014902114868,
"learning_rate": 5.424354243542435e-05,
"loss": 0.1241,
"step": 1580
},
{
"epoch": 5.867158671586716,
"grad_norm": 0.2934325039386749,
"learning_rate": 5.332103321033211e-05,
"loss": 0.1397,
"step": 1590
},
{
"epoch": 5.904059040590406,
"grad_norm": 0.2349650263786316,
"learning_rate": 5.239852398523986e-05,
"loss": 0.1448,
"step": 1600
},
{
"epoch": 5.904059040590406,
"eval_accuracy": 0.5416184971098266,
"eval_loss": 0.12662799656391144,
"eval_runtime": 50.1064,
"eval_samples_per_second": 34.527,
"eval_steps_per_second": 4.331,
"step": 1600
},
{
"epoch": 5.940959409594096,
"grad_norm": 0.39207130670547485,
"learning_rate": 5.1476014760147606e-05,
"loss": 0.1491,
"step": 1610
},
{
"epoch": 5.977859778597786,
"grad_norm": 0.21359127759933472,
"learning_rate": 5.0553505535055354e-05,
"loss": 0.1367,
"step": 1620
},
{
"epoch": 6.014760147601476,
"grad_norm": 0.17874382436275482,
"learning_rate": 4.96309963099631e-05,
"loss": 0.1276,
"step": 1630
},
{
"epoch": 6.051660516605166,
"grad_norm": 0.15224817395210266,
"learning_rate": 4.870848708487085e-05,
"loss": 0.1223,
"step": 1640
},
{
"epoch": 6.088560885608856,
"grad_norm": 0.28657016158103943,
"learning_rate": 4.77859778597786e-05,
"loss": 0.1327,
"step": 1650
},
{
"epoch": 6.125461254612546,
"grad_norm": 0.16251201927661896,
"learning_rate": 4.686346863468635e-05,
"loss": 0.1318,
"step": 1660
},
{
"epoch": 6.162361623616236,
"grad_norm": 0.3002704381942749,
"learning_rate": 4.59409594095941e-05,
"loss": 0.1188,
"step": 1670
},
{
"epoch": 6.199261992619927,
"grad_norm": 0.4188823103904724,
"learning_rate": 4.501845018450185e-05,
"loss": 0.1004,
"step": 1680
},
{
"epoch": 6.236162361623617,
"grad_norm": 0.18772590160369873,
"learning_rate": 4.4095940959409596e-05,
"loss": 0.1002,
"step": 1690
},
{
"epoch": 6.273062730627307,
"grad_norm": 0.30921700596809387,
"learning_rate": 4.3173431734317345e-05,
"loss": 0.1035,
"step": 1700
},
{
"epoch": 6.273062730627307,
"eval_accuracy": 0.6017341040462427,
"eval_loss": 0.11507368832826614,
"eval_runtime": 50.6734,
"eval_samples_per_second": 34.14,
"eval_steps_per_second": 4.282,
"step": 1700
},
{
"epoch": 6.3099630996309966,
"grad_norm": 0.42562779784202576,
"learning_rate": 4.2250922509225094e-05,
"loss": 0.1071,
"step": 1710
},
{
"epoch": 6.3468634686346865,
"grad_norm": 0.36547404527664185,
"learning_rate": 4.132841328413284e-05,
"loss": 0.12,
"step": 1720
},
{
"epoch": 6.3837638376383765,
"grad_norm": 0.12006784975528717,
"learning_rate": 4.040590405904059e-05,
"loss": 0.1107,
"step": 1730
},
{
"epoch": 6.4206642066420665,
"grad_norm": 0.1983233392238617,
"learning_rate": 3.948339483394834e-05,
"loss": 0.1206,
"step": 1740
},
{
"epoch": 6.4575645756457565,
"grad_norm": 0.17691943049430847,
"learning_rate": 3.856088560885609e-05,
"loss": 0.1252,
"step": 1750
},
{
"epoch": 6.4944649446494465,
"grad_norm": 0.39386728405952454,
"learning_rate": 3.763837638376384e-05,
"loss": 0.1314,
"step": 1760
},
{
"epoch": 6.531365313653136,
"grad_norm": 0.607455313205719,
"learning_rate": 3.6715867158671594e-05,
"loss": 0.1095,
"step": 1770
},
{
"epoch": 6.568265682656826,
"grad_norm": 0.21057389676570892,
"learning_rate": 3.5793357933579336e-05,
"loss": 0.1223,
"step": 1780
},
{
"epoch": 6.605166051660516,
"grad_norm": 0.27539491653442383,
"learning_rate": 3.4870848708487085e-05,
"loss": 0.1163,
"step": 1790
},
{
"epoch": 6.642066420664206,
"grad_norm": 0.24495290219783783,
"learning_rate": 3.3948339483394833e-05,
"loss": 0.1048,
"step": 1800
},
{
"epoch": 6.642066420664206,
"eval_accuracy": 0.6046242774566474,
"eval_loss": 0.10599144548177719,
"eval_runtime": 50.9957,
"eval_samples_per_second": 33.924,
"eval_steps_per_second": 4.255,
"step": 1800
},
{
"epoch": 6.678966789667896,
"grad_norm": 0.38892611861228943,
"learning_rate": 3.302583025830259e-05,
"loss": 0.1352,
"step": 1810
},
{
"epoch": 6.715867158671586,
"grad_norm": 0.2850606143474579,
"learning_rate": 3.210332103321033e-05,
"loss": 0.1153,
"step": 1820
},
{
"epoch": 6.752767527675276,
"grad_norm": 0.16241934895515442,
"learning_rate": 3.118081180811808e-05,
"loss": 0.1074,
"step": 1830
},
{
"epoch": 6.789667896678967,
"grad_norm": 0.36088794469833374,
"learning_rate": 3.0258302583025832e-05,
"loss": 0.1219,
"step": 1840
},
{
"epoch": 6.826568265682657,
"grad_norm": 0.21467632055282593,
"learning_rate": 2.9335793357933584e-05,
"loss": 0.1083,
"step": 1850
},
{
"epoch": 6.863468634686347,
"grad_norm": 0.4730125069618225,
"learning_rate": 2.8413284132841326e-05,
"loss": 0.1227,
"step": 1860
},
{
"epoch": 6.900369003690037,
"grad_norm": 0.20842638611793518,
"learning_rate": 2.749077490774908e-05,
"loss": 0.128,
"step": 1870
},
{
"epoch": 6.937269372693727,
"grad_norm": 0.1885102540254593,
"learning_rate": 2.6568265682656828e-05,
"loss": 0.0923,
"step": 1880
},
{
"epoch": 6.974169741697417,
"grad_norm": 0.48948994278907776,
"learning_rate": 2.564575645756458e-05,
"loss": 0.1165,
"step": 1890
},
{
"epoch": 7.011070110701107,
"grad_norm": 0.4080180525779724,
"learning_rate": 2.472324723247233e-05,
"loss": 0.1168,
"step": 1900
},
{
"epoch": 7.011070110701107,
"eval_accuracy": 0.6173410404624278,
"eval_loss": 0.10073487460613251,
"eval_runtime": 50.8605,
"eval_samples_per_second": 34.015,
"eval_steps_per_second": 4.267,
"step": 1900
},
{
"epoch": 7.047970479704797,
"grad_norm": 0.19095434248447418,
"learning_rate": 2.3800738007380074e-05,
"loss": 0.1131,
"step": 1910
},
{
"epoch": 7.084870848708487,
"grad_norm": 0.23603685200214386,
"learning_rate": 2.2878228782287826e-05,
"loss": 0.089,
"step": 1920
},
{
"epoch": 7.121771217712177,
"grad_norm": 0.09547635912895203,
"learning_rate": 2.195571955719557e-05,
"loss": 0.1032,
"step": 1930
},
{
"epoch": 7.158671586715867,
"grad_norm": 0.18442951142787933,
"learning_rate": 2.1033210332103324e-05,
"loss": 0.0909,
"step": 1940
},
{
"epoch": 7.195571955719557,
"grad_norm": 0.2125350534915924,
"learning_rate": 2.011070110701107e-05,
"loss": 0.0922,
"step": 1950
},
{
"epoch": 7.232472324723247,
"grad_norm": 0.15140217542648315,
"learning_rate": 1.918819188191882e-05,
"loss": 0.1155,
"step": 1960
},
{
"epoch": 7.269372693726937,
"grad_norm": 0.25905662775039673,
"learning_rate": 1.826568265682657e-05,
"loss": 0.1194,
"step": 1970
},
{
"epoch": 7.306273062730627,
"grad_norm": 0.18217885494232178,
"learning_rate": 1.734317343173432e-05,
"loss": 0.1,
"step": 1980
},
{
"epoch": 7.343173431734318,
"grad_norm": 0.21871539950370789,
"learning_rate": 1.6420664206642068e-05,
"loss": 0.1022,
"step": 1990
},
{
"epoch": 7.380073800738008,
"grad_norm": 0.4127865731716156,
"learning_rate": 1.5498154981549817e-05,
"loss": 0.1104,
"step": 2000
},
{
"epoch": 7.380073800738008,
"eval_accuracy": 0.6445086705202312,
"eval_loss": 0.09489033371210098,
"eval_runtime": 50.8042,
"eval_samples_per_second": 34.052,
"eval_steps_per_second": 4.271,
"step": 2000
},
{
"epoch": 7.416974169741698,
"grad_norm": 0.20000500977039337,
"learning_rate": 1.4575645756457566e-05,
"loss": 0.1031,
"step": 2010
},
{
"epoch": 7.453874538745388,
"grad_norm": 0.5234202742576599,
"learning_rate": 1.3653136531365315e-05,
"loss": 0.1197,
"step": 2020
},
{
"epoch": 7.490774907749078,
"grad_norm": 0.16442282497882843,
"learning_rate": 1.2730627306273063e-05,
"loss": 0.1036,
"step": 2030
},
{
"epoch": 7.527675276752768,
"grad_norm": 0.19210496544837952,
"learning_rate": 1.1808118081180812e-05,
"loss": 0.0993,
"step": 2040
},
{
"epoch": 7.564575645756458,
"grad_norm": 0.1562729775905609,
"learning_rate": 1.0885608856088561e-05,
"loss": 0.0941,
"step": 2050
},
{
"epoch": 7.601476014760148,
"grad_norm": 0.29051193594932556,
"learning_rate": 9.96309963099631e-06,
"loss": 0.092,
"step": 2060
},
{
"epoch": 7.638376383763838,
"grad_norm": 0.21477282047271729,
"learning_rate": 9.040590405904059e-06,
"loss": 0.1123,
"step": 2070
},
{
"epoch": 7.675276752767528,
"grad_norm": 0.03506307676434517,
"learning_rate": 8.118081180811808e-06,
"loss": 0.0927,
"step": 2080
},
{
"epoch": 7.712177121771218,
"grad_norm": 0.21280255913734436,
"learning_rate": 7.195571955719557e-06,
"loss": 0.084,
"step": 2090
},
{
"epoch": 7.749077490774908,
"grad_norm": 0.19547449052333832,
"learning_rate": 6.273062730627306e-06,
"loss": 0.0873,
"step": 2100
},
{
"epoch": 7.749077490774908,
"eval_accuracy": 0.6526011560693642,
"eval_loss": 0.0923289805650711,
"eval_runtime": 50.6448,
"eval_samples_per_second": 34.159,
"eval_steps_per_second": 4.285,
"step": 2100
},
{
"epoch": 7.785977859778598,
"grad_norm": 0.22129392623901367,
"learning_rate": 5.350553505535055e-06,
"loss": 0.1,
"step": 2110
},
{
"epoch": 7.822878228782288,
"grad_norm": 0.2631789445877075,
"learning_rate": 4.428044280442805e-06,
"loss": 0.0811,
"step": 2120
},
{
"epoch": 7.8597785977859775,
"grad_norm": 0.16971804201602936,
"learning_rate": 3.5055350553505534e-06,
"loss": 0.0977,
"step": 2130
},
{
"epoch": 7.8966789667896675,
"grad_norm": 0.10247929394245148,
"learning_rate": 2.5830258302583027e-06,
"loss": 0.0966,
"step": 2140
},
{
"epoch": 7.9335793357933575,
"grad_norm": 0.15865936875343323,
"learning_rate": 1.6605166051660517e-06,
"loss": 0.1039,
"step": 2150
},
{
"epoch": 7.970479704797048,
"grad_norm": 0.506331205368042,
"learning_rate": 7.380073800738008e-07,
"loss": 0.0977,
"step": 2160
},
{
"epoch": 8.0,
"step": 2168,
"total_flos": 2.6821552511927255e+18,
"train_loss": 0.16216810325304962,
"train_runtime": 3143.5173,
"train_samples_per_second": 11.009,
"train_steps_per_second": 0.69
}
],
"logging_steps": 10,
"max_steps": 2168,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.6821552511927255e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
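
The JSON above is the Trainer state that transformers writes alongside the checkpoints. As a minimal sketch (not part of the repository), the snippet below shows one way to load this file and summarize the logged run; it assumes the file is saved locally as trainer_state.json and relies only on the key names that appear above (best_metric, best_model_checkpoint, log_history).

import json

# Load the trainer state written by transformers.Trainer
# (the path is an assumption; point it at wherever this file lives).
with open("trainer_state.json") as f:
    state = json.load(f)

print("best eval_loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# log_history mixes per-step training logs (key "loss") with
# periodic evaluation logs (key "eval_loss"); split them apart.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_logs)} training log entries, {len(eval_logs)} eval entries")
for e in eval_logs:
    print(f'step {e["step"]:>4}  eval_loss={e["eval_loss"]:.4f}  '
          f'eval_accuracy={e["eval_accuracy"]:.4f}')

The same log_history list can be handed to any plotting tool to chart the training-loss and eval-loss curves over the 2168 steps of this run.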