{ "best_metric": 0.8249050225524125, "best_model_checkpoint": "/data/hungnm/unisentiment/roberta-large-sentiment/checkpoint-12296", "epoch": 5.0, "eval_steps": 500, "global_step": 15370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016265452179570592, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.2297, "step": 5 }, { "epoch": 0.0032530904359141183, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.2375, "step": 10 }, { "epoch": 0.004879635653871178, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.2213, "step": 15 }, { "epoch": 0.006506180871828237, "grad_norm": 3.86765718460083, "learning_rate": 9.74025974025974e-07, "loss": 2.2313, "step": 20 }, { "epoch": 0.008132726089785295, "grad_norm": 1.735704779624939, "learning_rate": 2.5974025974025976e-06, "loss": 2.2234, "step": 25 }, { "epoch": 0.009759271307742356, "grad_norm": 1.5706150531768799, "learning_rate": 4.220779220779221e-06, "loss": 2.1965, "step": 30 }, { "epoch": 0.011385816525699415, "grad_norm": 2.025402069091797, "learning_rate": 5.844155844155844e-06, "loss": 2.1842, "step": 35 }, { "epoch": 0.013012361743656473, "grad_norm": 2.8359689712524414, "learning_rate": 7.467532467532468e-06, "loss": 2.1539, "step": 40 }, { "epoch": 0.014638906961613532, "grad_norm": 3.062162399291992, "learning_rate": 9.090909090909091e-06, "loss": 2.1127, "step": 45 }, { "epoch": 0.01626545217957059, "grad_norm": 4.7635908126831055, "learning_rate": 1.0714285714285714e-05, "loss": 2.0376, "step": 50 }, { "epoch": 0.017891997397527653, "grad_norm": 5.502624988555908, "learning_rate": 1.2337662337662339e-05, "loss": 1.853, "step": 55 }, { "epoch": 0.01951854261548471, "grad_norm": 8.860891342163086, "learning_rate": 1.396103896103896e-05, "loss": 1.6277, "step": 60 }, { "epoch": 0.02114508783344177, "grad_norm": 13.536232948303223, "learning_rate": 1.5584415584415583e-05, "loss": 1.4906, "step": 65 }, { "epoch": 0.02277163305139883, "grad_norm": 4.987203598022461, "learning_rate": 1.7207792207792208e-05, "loss": 1.4451, "step": 70 }, { "epoch": 0.024398178269355888, "grad_norm": 6.489861488342285, "learning_rate": 1.8831168831168833e-05, "loss": 1.3596, "step": 75 }, { "epoch": 0.026024723487312947, "grad_norm": 6.009546756744385, "learning_rate": 2.0454545454545457e-05, "loss": 1.3021, "step": 80 }, { "epoch": 0.027651268705270005, "grad_norm": 5.856492042541504, "learning_rate": 2.207792207792208e-05, "loss": 1.2303, "step": 85 }, { "epoch": 0.029277813923227064, "grad_norm": 8.079117774963379, "learning_rate": 2.3701298701298703e-05, "loss": 1.2756, "step": 90 }, { "epoch": 0.030904359141184126, "grad_norm": 9.356736183166504, "learning_rate": 2.5324675324675325e-05, "loss": 1.2231, "step": 95 }, { "epoch": 0.03253090435914118, "grad_norm": 7.978050708770752, "learning_rate": 2.694805194805195e-05, "loss": 1.2171, "step": 100 }, { "epoch": 0.034157449577098244, "grad_norm": 9.788860321044922, "learning_rate": 2.857142857142857e-05, "loss": 1.3057, "step": 105 }, { "epoch": 0.035783994795055306, "grad_norm": 6.810769081115723, "learning_rate": 3.01948051948052e-05, "loss": 1.2105, "step": 110 }, { "epoch": 0.03741054001301236, "grad_norm": 7.437067985534668, "learning_rate": 3.181818181818182e-05, "loss": 1.2065, "step": 115 }, { "epoch": 0.03903708523096942, "grad_norm": 8.67028522491455, "learning_rate": 3.344155844155844e-05, "loss": 1.1346, "step": 120 }, { "epoch": 0.04066363044892648, "grad_norm": 9.523406982421875, "learning_rate": 3.506493506493507e-05, "loss": 1.1662, "step": 125 }, { "epoch": 0.04229017566688354, "grad_norm": 4.4032816886901855, "learning_rate": 3.668831168831169e-05, "loss": 1.1708, "step": 130 }, { "epoch": 0.043916720884840596, "grad_norm": 5.519209861755371, "learning_rate": 3.831168831168831e-05, "loss": 1.1403, "step": 135 }, { "epoch": 0.04554326610279766, "grad_norm": 5.634235382080078, "learning_rate": 3.993506493506494e-05, "loss": 1.1536, "step": 140 }, { "epoch": 0.04716981132075472, "grad_norm": 6.300098419189453, "learning_rate": 4.155844155844156e-05, "loss": 1.1277, "step": 145 }, { "epoch": 0.048796356538711776, "grad_norm": 4.605157375335693, "learning_rate": 4.318181818181819e-05, "loss": 1.1163, "step": 150 }, { "epoch": 0.05042290175666884, "grad_norm": 5.781686305999756, "learning_rate": 4.4805194805194805e-05, "loss": 1.1107, "step": 155 }, { "epoch": 0.05204944697462589, "grad_norm": 4.554281234741211, "learning_rate": 4.642857142857143e-05, "loss": 1.1387, "step": 160 }, { "epoch": 0.053675992192582955, "grad_norm": 10.010477066040039, "learning_rate": 4.8051948051948054e-05, "loss": 1.1531, "step": 165 }, { "epoch": 0.05530253741054001, "grad_norm": 6.158892631530762, "learning_rate": 4.967532467532468e-05, "loss": 1.1396, "step": 170 }, { "epoch": 0.05692908262849707, "grad_norm": 6.746368408203125, "learning_rate": 4.999999520430831e-05, "loss": 1.1243, "step": 175 }, { "epoch": 0.05855562784645413, "grad_norm": 7.575245380401611, "learning_rate": 4.999997389012675e-05, "loss": 1.1174, "step": 180 }, { "epoch": 0.06018217306441119, "grad_norm": 9.75899887084961, "learning_rate": 4.99999232689698e-05, "loss": 1.123, "step": 185 }, { "epoch": 0.06180871828236825, "grad_norm": 6.071502685546875, "learning_rate": 4.9999846005164544e-05, "loss": 1.0794, "step": 190 }, { "epoch": 0.06343526350032531, "grad_norm": 4.638003826141357, "learning_rate": 4.999974209879331e-05, "loss": 1.1059, "step": 195 }, { "epoch": 0.06506180871828236, "grad_norm": 4.056344985961914, "learning_rate": 4.999961154996685e-05, "loss": 1.0373, "step": 200 }, { "epoch": 0.06668835393623943, "grad_norm": 10.81039810180664, "learning_rate": 4.999945435882428e-05, "loss": 1.0483, "step": 205 }, { "epoch": 0.06831489915419649, "grad_norm": 9.315641403198242, "learning_rate": 4.999927052553313e-05, "loss": 1.0723, "step": 210 }, { "epoch": 0.06994144437215355, "grad_norm": 8.322559356689453, "learning_rate": 4.9999060050289286e-05, "loss": 1.1134, "step": 215 }, { "epoch": 0.07156798959011061, "grad_norm": 5.152672290802002, "learning_rate": 4.999882293331708e-05, "loss": 1.0625, "step": 220 }, { "epoch": 0.07319453480806766, "grad_norm": 3.750262498855591, "learning_rate": 4.999855917486921e-05, "loss": 1.025, "step": 225 }, { "epoch": 0.07482108002602472, "grad_norm": 4.812891483306885, "learning_rate": 4.999826877522675e-05, "loss": 1.0774, "step": 230 }, { "epoch": 0.07644762524398178, "grad_norm": 5.638612747192383, "learning_rate": 4.999795173469919e-05, "loss": 1.026, "step": 235 }, { "epoch": 0.07807417046193885, "grad_norm": 4.525393486022949, "learning_rate": 4.99976080536244e-05, "loss": 1.0333, "step": 240 }, { "epoch": 0.0797007156798959, "grad_norm": 14.256878852844238, "learning_rate": 4.9997237732368645e-05, "loss": 1.0749, "step": 245 }, { "epoch": 0.08132726089785296, "grad_norm": 7.804468154907227, "learning_rate": 4.9996840771326584e-05, "loss": 1.0461, "step": 250 }, { "epoch": 0.08295380611581002, "grad_norm": 3.510770559310913, "learning_rate": 4.999641717092126e-05, "loss": 1.0215, "step": 255 }, { "epoch": 0.08458035133376708, "grad_norm": 6.548313140869141, "learning_rate": 4.99959669316041e-05, "loss": 0.9953, "step": 260 }, { "epoch": 0.08620689655172414, "grad_norm": 3.891099691390991, "learning_rate": 4.999549005385494e-05, "loss": 1.0075, "step": 265 }, { "epoch": 0.08783344176968119, "grad_norm": 3.341900587081909, "learning_rate": 4.999498653818199e-05, "loss": 1.0451, "step": 270 }, { "epoch": 0.08945998698763825, "grad_norm": 3.6907620429992676, "learning_rate": 4.999445638512185e-05, "loss": 1.0313, "step": 275 }, { "epoch": 0.09108653220559532, "grad_norm": 4.23262357711792, "learning_rate": 4.99938995952395e-05, "loss": 1.0493, "step": 280 }, { "epoch": 0.09271307742355238, "grad_norm": 3.793381929397583, "learning_rate": 4.9993316169128334e-05, "loss": 1.0103, "step": 285 }, { "epoch": 0.09433962264150944, "grad_norm": 5.088762283325195, "learning_rate": 4.99927061074101e-05, "loss": 1.0206, "step": 290 }, { "epoch": 0.09596616785946649, "grad_norm": 4.209216594696045, "learning_rate": 4.999206941073496e-05, "loss": 1.0018, "step": 295 }, { "epoch": 0.09759271307742355, "grad_norm": 3.5634896755218506, "learning_rate": 4.9991406079781424e-05, "loss": 0.9936, "step": 300 }, { "epoch": 0.09921925829538061, "grad_norm": 6.785153388977051, "learning_rate": 4.999071611525643e-05, "loss": 1.0299, "step": 305 }, { "epoch": 0.10084580351333768, "grad_norm": 4.595156192779541, "learning_rate": 4.998999951789528e-05, "loss": 1.0238, "step": 310 }, { "epoch": 0.10247234873129472, "grad_norm": 4.024753570556641, "learning_rate": 4.998925628846164e-05, "loss": 1.0202, "step": 315 }, { "epoch": 0.10409889394925179, "grad_norm": 4.274604320526123, "learning_rate": 4.9988486427747606e-05, "loss": 1.0169, "step": 320 }, { "epoch": 0.10572543916720885, "grad_norm": 5.742299556732178, "learning_rate": 4.99876899365736e-05, "loss": 0.9837, "step": 325 }, { "epoch": 0.10735198438516591, "grad_norm": 4.468045711517334, "learning_rate": 4.998686681578846e-05, "loss": 1.0671, "step": 330 }, { "epoch": 0.10897852960312297, "grad_norm": 3.6818110942840576, "learning_rate": 4.998601706626938e-05, "loss": 1.0505, "step": 335 }, { "epoch": 0.11060507482108002, "grad_norm": 2.8949925899505615, "learning_rate": 4.9985140688921975e-05, "loss": 1.02, "step": 340 }, { "epoch": 0.11223162003903708, "grad_norm": 3.744718551635742, "learning_rate": 4.9984237684680194e-05, "loss": 1.0275, "step": 345 }, { "epoch": 0.11385816525699415, "grad_norm": 6.084460735321045, "learning_rate": 4.998330805450636e-05, "loss": 0.9692, "step": 350 }, { "epoch": 0.11548471047495121, "grad_norm": 3.124788522720337, "learning_rate": 4.998235179939122e-05, "loss": 0.9594, "step": 355 }, { "epoch": 0.11711125569290826, "grad_norm": 3.5672078132629395, "learning_rate": 4.998136892035382e-05, "loss": 0.9882, "step": 360 }, { "epoch": 0.11873780091086532, "grad_norm": 3.500018835067749, "learning_rate": 4.998035941844167e-05, "loss": 0.9527, "step": 365 }, { "epoch": 0.12036434612882238, "grad_norm": 3.729696035385132, "learning_rate": 4.997932329473058e-05, "loss": 1.0004, "step": 370 }, { "epoch": 0.12199089134677944, "grad_norm": 4.352784156799316, "learning_rate": 4.997826055032476e-05, "loss": 0.9475, "step": 375 }, { "epoch": 0.1236174365647365, "grad_norm": 5.5154523849487305, "learning_rate": 4.99771711863568e-05, "loss": 0.9892, "step": 380 }, { "epoch": 0.12524398178269355, "grad_norm": 5.252265930175781, "learning_rate": 4.997605520398762e-05, "loss": 0.9563, "step": 385 }, { "epoch": 0.12687052700065063, "grad_norm": 3.895646810531616, "learning_rate": 4.9974912604406554e-05, "loss": 1.0064, "step": 390 }, { "epoch": 0.12849707221860768, "grad_norm": 43.6920166015625, "learning_rate": 4.997374338883127e-05, "loss": 0.9715, "step": 395 }, { "epoch": 0.13012361743656473, "grad_norm": 3.9318182468414307, "learning_rate": 4.9972547558507815e-05, "loss": 0.9883, "step": 400 }, { "epoch": 0.1317501626545218, "grad_norm": 9.373114585876465, "learning_rate": 4.99713251147106e-05, "loss": 0.9842, "step": 405 }, { "epoch": 0.13337670787247885, "grad_norm": 4.192168712615967, "learning_rate": 4.997007605874239e-05, "loss": 0.986, "step": 410 }, { "epoch": 0.13500325309043593, "grad_norm": 7.416944980621338, "learning_rate": 4.996880039193431e-05, "loss": 0.965, "step": 415 }, { "epoch": 0.13662979830839297, "grad_norm": 3.9326727390289307, "learning_rate": 4.996749811564586e-05, "loss": 0.9931, "step": 420 }, { "epoch": 0.13825634352635002, "grad_norm": 5.82494592666626, "learning_rate": 4.996616923126488e-05, "loss": 0.9522, "step": 425 }, { "epoch": 0.1398828887443071, "grad_norm": 2.137333869934082, "learning_rate": 4.996481374020759e-05, "loss": 0.9943, "step": 430 }, { "epoch": 0.14150943396226415, "grad_norm": 3.4468958377838135, "learning_rate": 4.996343164391853e-05, "loss": 0.9767, "step": 435 }, { "epoch": 0.14313597918022122, "grad_norm": 19.23995590209961, "learning_rate": 4.9962022943870626e-05, "loss": 0.9562, "step": 440 }, { "epoch": 0.14476252439817827, "grad_norm": 3.0280990600585938, "learning_rate": 4.9960587641565125e-05, "loss": 0.981, "step": 445 }, { "epoch": 0.14638906961613532, "grad_norm": 2.9482223987579346, "learning_rate": 4.995912573853166e-05, "loss": 0.9449, "step": 450 }, { "epoch": 0.1480156148340924, "grad_norm": 5.7567243576049805, "learning_rate": 4.9957637236328195e-05, "loss": 0.9919, "step": 455 }, { "epoch": 0.14964216005204944, "grad_norm": 3.0479254722595215, "learning_rate": 4.995612213654103e-05, "loss": 0.9837, "step": 460 }, { "epoch": 0.1512687052700065, "grad_norm": 3.3615362644195557, "learning_rate": 4.995458044078482e-05, "loss": 0.9584, "step": 465 }, { "epoch": 0.15289525048796357, "grad_norm": 22.171850204467773, "learning_rate": 4.995301215070257e-05, "loss": 0.9984, "step": 470 }, { "epoch": 0.15452179570592062, "grad_norm": 4.525479316711426, "learning_rate": 4.9951417267965626e-05, "loss": 1.0301, "step": 475 }, { "epoch": 0.1561483409238777, "grad_norm": 5.372246265411377, "learning_rate": 4.9949795794273664e-05, "loss": 0.9911, "step": 480 }, { "epoch": 0.15777488614183474, "grad_norm": 2.3989129066467285, "learning_rate": 4.99481477313547e-05, "loss": 0.96, "step": 485 }, { "epoch": 0.1594014313597918, "grad_norm": 3.0757484436035156, "learning_rate": 4.994647308096509e-05, "loss": 0.9738, "step": 490 }, { "epoch": 0.16102797657774887, "grad_norm": 3.538886308670044, "learning_rate": 4.9944771844889524e-05, "loss": 0.9492, "step": 495 }, { "epoch": 0.16265452179570591, "grad_norm": 3.5516517162323, "learning_rate": 4.994304402494104e-05, "loss": 0.9463, "step": 500 }, { "epoch": 0.164281067013663, "grad_norm": 4.078283786773682, "learning_rate": 4.994128962296097e-05, "loss": 0.9378, "step": 505 }, { "epoch": 0.16590761223162004, "grad_norm": 2.795860767364502, "learning_rate": 4.993950864081901e-05, "loss": 0.9979, "step": 510 }, { "epoch": 0.1675341574495771, "grad_norm": 2.6040124893188477, "learning_rate": 4.9937701080413165e-05, "loss": 0.97, "step": 515 }, { "epoch": 0.16916070266753416, "grad_norm": 3.229245185852051, "learning_rate": 4.993586694366977e-05, "loss": 0.9673, "step": 520 }, { "epoch": 0.1707872478854912, "grad_norm": 4.013242244720459, "learning_rate": 4.993400623254347e-05, "loss": 0.9578, "step": 525 }, { "epoch": 0.1724137931034483, "grad_norm": 2.899711847305298, "learning_rate": 4.993249853141837e-05, "loss": 0.959, "step": 530 }, { "epoch": 0.17404033832140534, "grad_norm": 3.2729837894439697, "learning_rate": 4.9930589991419e-05, "loss": 0.9437, "step": 535 }, { "epoch": 0.17566688353936238, "grad_norm": 3.1390457153320312, "learning_rate": 4.992865488266043e-05, "loss": 0.9274, "step": 540 }, { "epoch": 0.17729342875731946, "grad_norm": 2.9977355003356934, "learning_rate": 4.9926693207204925e-05, "loss": 0.9704, "step": 545 }, { "epoch": 0.1789199739752765, "grad_norm": 3.9077603816986084, "learning_rate": 4.9924704967143064e-05, "loss": 0.9489, "step": 550 }, { "epoch": 0.18054651919323358, "grad_norm": 3.813920736312866, "learning_rate": 4.992269016459373e-05, "loss": 0.9854, "step": 555 }, { "epoch": 0.18217306441119063, "grad_norm": 2.9923019409179688, "learning_rate": 4.9920648801704103e-05, "loss": 0.9514, "step": 560 }, { "epoch": 0.18379960962914768, "grad_norm": 3.6704702377319336, "learning_rate": 4.991858088064971e-05, "loss": 0.9483, "step": 565 }, { "epoch": 0.18542615484710476, "grad_norm": 4.187371730804443, "learning_rate": 4.991648640363434e-05, "loss": 0.9206, "step": 570 }, { "epoch": 0.1870527000650618, "grad_norm": 3.446115732192993, "learning_rate": 4.991436537289009e-05, "loss": 0.9495, "step": 575 }, { "epoch": 0.18867924528301888, "grad_norm": 3.385150671005249, "learning_rate": 4.9912217790677365e-05, "loss": 0.9166, "step": 580 }, { "epoch": 0.19030579050097593, "grad_norm": 3.2357215881347656, "learning_rate": 4.991004365928487e-05, "loss": 0.9566, "step": 585 }, { "epoch": 0.19193233571893298, "grad_norm": 5.720197677612305, "learning_rate": 4.990784298102959e-05, "loss": 0.9416, "step": 590 }, { "epoch": 0.19355888093689005, "grad_norm": 3.472399950027466, "learning_rate": 4.99056157582568e-05, "loss": 0.9407, "step": 595 }, { "epoch": 0.1951854261548471, "grad_norm": 3.562246561050415, "learning_rate": 4.9903361993340095e-05, "loss": 0.9659, "step": 600 }, { "epoch": 0.19681197137280415, "grad_norm": 2.6203455924987793, "learning_rate": 4.9901081688681314e-05, "loss": 0.9354, "step": 605 }, { "epoch": 0.19843851659076123, "grad_norm": 2.9699368476867676, "learning_rate": 4.989877484671061e-05, "loss": 0.9478, "step": 610 }, { "epoch": 0.20006506180871828, "grad_norm": 2.1798095703125, "learning_rate": 4.989644146988639e-05, "loss": 0.9689, "step": 615 }, { "epoch": 0.20169160702667535, "grad_norm": 2.909578561782837, "learning_rate": 4.989408156069537e-05, "loss": 0.9187, "step": 620 }, { "epoch": 0.2033181522446324, "grad_norm": 2.7465949058532715, "learning_rate": 4.989169512165253e-05, "loss": 0.9443, "step": 625 }, { "epoch": 0.20494469746258945, "grad_norm": 4.120491981506348, "learning_rate": 4.988928215530111e-05, "loss": 0.9183, "step": 630 }, { "epoch": 0.20657124268054652, "grad_norm": 5.750351428985596, "learning_rate": 4.988684266421263e-05, "loss": 0.9721, "step": 635 }, { "epoch": 0.20819778789850357, "grad_norm": 2.510326862335205, "learning_rate": 4.9884376650986874e-05, "loss": 0.9456, "step": 640 }, { "epoch": 0.20982433311646065, "grad_norm": 3.090721845626831, "learning_rate": 4.988188411825191e-05, "loss": 0.9333, "step": 645 }, { "epoch": 0.2114508783344177, "grad_norm": 2.2550766468048096, "learning_rate": 4.987936506866405e-05, "loss": 0.9336, "step": 650 }, { "epoch": 0.21307742355237475, "grad_norm": 3.066791534423828, "learning_rate": 4.987681950490786e-05, "loss": 0.9349, "step": 655 }, { "epoch": 0.21470396877033182, "grad_norm": 2.512131929397583, "learning_rate": 4.987424742969616e-05, "loss": 0.9353, "step": 660 }, { "epoch": 0.21633051398828887, "grad_norm": 3.988386869430542, "learning_rate": 4.987164884577007e-05, "loss": 0.9514, "step": 665 }, { "epoch": 0.21795705920624595, "grad_norm": 3.2770578861236572, "learning_rate": 4.986902375589889e-05, "loss": 0.9342, "step": 670 }, { "epoch": 0.219583604424203, "grad_norm": 3.2448511123657227, "learning_rate": 4.986637216288021e-05, "loss": 0.9482, "step": 675 }, { "epoch": 0.22121014964216004, "grad_norm": 2.492069721221924, "learning_rate": 4.986369406953988e-05, "loss": 0.9299, "step": 680 }, { "epoch": 0.22283669486011712, "grad_norm": 4.076034069061279, "learning_rate": 4.986098947873195e-05, "loss": 0.9075, "step": 685 }, { "epoch": 0.22446324007807417, "grad_norm": 2.4998836517333984, "learning_rate": 4.985825839333871e-05, "loss": 0.9472, "step": 690 }, { "epoch": 0.22608978529603124, "grad_norm": 7.26285457611084, "learning_rate": 4.985550081627074e-05, "loss": 0.9604, "step": 695 }, { "epoch": 0.2277163305139883, "grad_norm": 3.53657865524292, "learning_rate": 4.985271675046679e-05, "loss": 0.9694, "step": 700 }, { "epoch": 0.22934287573194534, "grad_norm": 2.3773930072784424, "learning_rate": 4.984990619889387e-05, "loss": 0.906, "step": 705 }, { "epoch": 0.23096942094990242, "grad_norm": 3.7686069011688232, "learning_rate": 4.984706916454721e-05, "loss": 0.9665, "step": 710 }, { "epoch": 0.23259596616785946, "grad_norm": 2.416747570037842, "learning_rate": 4.984420565045027e-05, "loss": 0.947, "step": 715 }, { "epoch": 0.2342225113858165, "grad_norm": 3.5830488204956055, "learning_rate": 4.984131565965472e-05, "loss": 0.9246, "step": 720 }, { "epoch": 0.2358490566037736, "grad_norm": 3.2117176055908203, "learning_rate": 4.983839919524045e-05, "loss": 0.942, "step": 725 }, { "epoch": 0.23747560182173064, "grad_norm": 3.1787447929382324, "learning_rate": 4.983545626031555e-05, "loss": 0.9533, "step": 730 }, { "epoch": 0.2391021470396877, "grad_norm": 2.5831379890441895, "learning_rate": 4.983248685801636e-05, "loss": 0.9443, "step": 735 }, { "epoch": 0.24072869225764476, "grad_norm": 7.841883659362793, "learning_rate": 4.982949099150738e-05, "loss": 0.9042, "step": 740 }, { "epoch": 0.2423552374756018, "grad_norm": 2.721116781234741, "learning_rate": 4.982646866398133e-05, "loss": 0.9232, "step": 745 }, { "epoch": 0.24398178269355889, "grad_norm": 2.3474032878875732, "learning_rate": 4.982341987865914e-05, "loss": 0.9731, "step": 750 }, { "epoch": 0.24560832791151593, "grad_norm": 2.8456294536590576, "learning_rate": 4.9820344638789926e-05, "loss": 0.9234, "step": 755 }, { "epoch": 0.247234873129473, "grad_norm": 3.1040987968444824, "learning_rate": 4.981724294765101e-05, "loss": 0.9508, "step": 760 }, { "epoch": 0.24886141834743006, "grad_norm": 4.044954776763916, "learning_rate": 4.9814114808547876e-05, "loss": 0.9593, "step": 765 }, { "epoch": 0.2504879635653871, "grad_norm": 3.3007290363311768, "learning_rate": 4.981096022481422e-05, "loss": 0.9174, "step": 770 }, { "epoch": 0.2521145087833442, "grad_norm": 3.6367475986480713, "learning_rate": 4.980777919981191e-05, "loss": 0.9472, "step": 775 }, { "epoch": 0.25374105400130126, "grad_norm": 18.750322341918945, "learning_rate": 4.980457173693099e-05, "loss": 0.9101, "step": 780 }, { "epoch": 0.2553675992192583, "grad_norm": 9.816338539123535, "learning_rate": 4.980133783958969e-05, "loss": 0.9439, "step": 785 }, { "epoch": 0.25699414443721535, "grad_norm": 2.4749770164489746, "learning_rate": 4.9798077511234396e-05, "loss": 0.9198, "step": 790 }, { "epoch": 0.25862068965517243, "grad_norm": 2.2600533962249756, "learning_rate": 4.979479075533967e-05, "loss": 0.9095, "step": 795 }, { "epoch": 0.26024723487312945, "grad_norm": 3.450012445449829, "learning_rate": 4.9791477575408254e-05, "loss": 0.9263, "step": 800 }, { "epoch": 0.2618737800910865, "grad_norm": 2.614621877670288, "learning_rate": 4.9788137974971006e-05, "loss": 0.9321, "step": 805 }, { "epoch": 0.2635003253090436, "grad_norm": 2.4043772220611572, "learning_rate": 4.9784771957586995e-05, "loss": 0.9241, "step": 810 }, { "epoch": 0.2651268705270006, "grad_norm": 3.927534818649292, "learning_rate": 4.97813795268434e-05, "loss": 0.9428, "step": 815 }, { "epoch": 0.2667534157449577, "grad_norm": 3.2626078128814697, "learning_rate": 4.977796068635558e-05, "loss": 0.936, "step": 820 }, { "epoch": 0.2683799609629148, "grad_norm": 2.7497739791870117, "learning_rate": 4.977451543976701e-05, "loss": 0.9176, "step": 825 }, { "epoch": 0.27000650618087185, "grad_norm": 2.8794078826904297, "learning_rate": 4.9771043790749335e-05, "loss": 0.9346, "step": 830 }, { "epoch": 0.2716330513988289, "grad_norm": 3.0631301403045654, "learning_rate": 4.976754574300231e-05, "loss": 0.9391, "step": 835 }, { "epoch": 0.27325959661678595, "grad_norm": 3.0575807094573975, "learning_rate": 4.9764021300253844e-05, "loss": 0.9176, "step": 840 }, { "epoch": 0.274886141834743, "grad_norm": 2.0632474422454834, "learning_rate": 4.976047046625997e-05, "loss": 0.9371, "step": 845 }, { "epoch": 0.27651268705270005, "grad_norm": 3.8703184127807617, "learning_rate": 4.975689324480484e-05, "loss": 0.9675, "step": 850 }, { "epoch": 0.2781392322706571, "grad_norm": 2.6094343662261963, "learning_rate": 4.975328963970073e-05, "loss": 0.929, "step": 855 }, { "epoch": 0.2797657774886142, "grad_norm": 3.5149149894714355, "learning_rate": 4.9749659654788036e-05, "loss": 0.9091, "step": 860 }, { "epoch": 0.2813923227065712, "grad_norm": 2.295640468597412, "learning_rate": 4.9746003293935275e-05, "loss": 0.9224, "step": 865 }, { "epoch": 0.2830188679245283, "grad_norm": 57.37953186035156, "learning_rate": 4.974232056103906e-05, "loss": 0.9183, "step": 870 }, { "epoch": 0.28464541314248537, "grad_norm": 3.7314531803131104, "learning_rate": 4.973861146002411e-05, "loss": 0.8797, "step": 875 }, { "epoch": 0.28627195836044245, "grad_norm": 12.3214693069458, "learning_rate": 4.973487599484324e-05, "loss": 0.9734, "step": 880 }, { "epoch": 0.28789850357839947, "grad_norm": 3.5420007705688477, "learning_rate": 4.973111416947739e-05, "loss": 0.9167, "step": 885 }, { "epoch": 0.28952504879635654, "grad_norm": 3.150508403778076, "learning_rate": 4.972732598793556e-05, "loss": 0.9057, "step": 890 }, { "epoch": 0.2911515940143136, "grad_norm": 4.188449382781982, "learning_rate": 4.972351145425485e-05, "loss": 0.9125, "step": 895 }, { "epoch": 0.29277813923227064, "grad_norm": 4.404074192047119, "learning_rate": 4.9719670572500444e-05, "loss": 0.9528, "step": 900 }, { "epoch": 0.2944046844502277, "grad_norm": 2.478856325149536, "learning_rate": 4.97158033467656e-05, "loss": 0.9075, "step": 905 }, { "epoch": 0.2960312296681848, "grad_norm": 4.190792560577393, "learning_rate": 4.9711909781171676e-05, "loss": 0.9282, "step": 910 }, { "epoch": 0.2976577748861418, "grad_norm": 3.2189950942993164, "learning_rate": 4.970798987986805e-05, "loss": 0.9494, "step": 915 }, { "epoch": 0.2992843201040989, "grad_norm": 2.2690813541412354, "learning_rate": 4.970404364703222e-05, "loss": 0.9373, "step": 920 }, { "epoch": 0.30091086532205596, "grad_norm": 3.4929933547973633, "learning_rate": 4.970007108686972e-05, "loss": 0.9275, "step": 925 }, { "epoch": 0.302537410540013, "grad_norm": 2.6433072090148926, "learning_rate": 4.9696072203614134e-05, "loss": 0.9124, "step": 930 }, { "epoch": 0.30416395575797006, "grad_norm": 2.269691228866577, "learning_rate": 4.969204700152712e-05, "loss": 0.9081, "step": 935 }, { "epoch": 0.30579050097592714, "grad_norm": 2.6683006286621094, "learning_rate": 4.9687995484898365e-05, "loss": 0.9004, "step": 940 }, { "epoch": 0.3074170461938842, "grad_norm": 6.1091837882995605, "learning_rate": 4.9683917658045606e-05, "loss": 0.9467, "step": 945 }, { "epoch": 0.30904359141184123, "grad_norm": 2.08435320854187, "learning_rate": 4.9679813525314635e-05, "loss": 0.8773, "step": 950 }, { "epoch": 0.3106701366297983, "grad_norm": 2.508965492248535, "learning_rate": 4.967568309107925e-05, "loss": 0.9117, "step": 955 }, { "epoch": 0.3122966818477554, "grad_norm": 2.267766237258911, "learning_rate": 4.967152635974129e-05, "loss": 0.8895, "step": 960 }, { "epoch": 0.3139232270657124, "grad_norm": 2.4633257389068604, "learning_rate": 4.966734333573063e-05, "loss": 0.8829, "step": 965 }, { "epoch": 0.3155497722836695, "grad_norm": 2.442094326019287, "learning_rate": 4.966313402350516e-05, "loss": 0.8979, "step": 970 }, { "epoch": 0.31717631750162656, "grad_norm": 3.596240282058716, "learning_rate": 4.965889842755077e-05, "loss": 0.9094, "step": 975 }, { "epoch": 0.3188028627195836, "grad_norm": 2.0434067249298096, "learning_rate": 4.965463655238139e-05, "loss": 0.9131, "step": 980 }, { "epoch": 0.32042940793754066, "grad_norm": 3.0639724731445312, "learning_rate": 4.965034840253893e-05, "loss": 0.9041, "step": 985 }, { "epoch": 0.32205595315549773, "grad_norm": 3.173212766647339, "learning_rate": 4.964603398259331e-05, "loss": 0.8974, "step": 990 }, { "epoch": 0.3236824983734548, "grad_norm": 2.5176949501037598, "learning_rate": 4.9641693297142455e-05, "loss": 0.9196, "step": 995 }, { "epoch": 0.32530904359141183, "grad_norm": 2.001845359802246, "learning_rate": 4.9637326350812266e-05, "loss": 0.8881, "step": 1000 }, { "epoch": 0.3269355888093689, "grad_norm": 3.1144118309020996, "learning_rate": 4.963293314825663e-05, "loss": 0.8751, "step": 1005 }, { "epoch": 0.328562134027326, "grad_norm": 2.621720552444458, "learning_rate": 4.962851369415744e-05, "loss": 0.9223, "step": 1010 }, { "epoch": 0.330188679245283, "grad_norm": 2.399784564971924, "learning_rate": 4.962406799322454e-05, "loss": 0.9049, "step": 1015 }, { "epoch": 0.3318152244632401, "grad_norm": 2.089038610458374, "learning_rate": 4.961959605019576e-05, "loss": 0.9285, "step": 1020 }, { "epoch": 0.33344176968119715, "grad_norm": 2.1323084831237793, "learning_rate": 4.961509786983689e-05, "loss": 0.9157, "step": 1025 }, { "epoch": 0.3350683148991542, "grad_norm": 2.186891794204712, "learning_rate": 4.961057345694167e-05, "loss": 0.9167, "step": 1030 }, { "epoch": 0.33669486011711125, "grad_norm": 2.111762762069702, "learning_rate": 4.9606022816331824e-05, "loss": 0.907, "step": 1035 }, { "epoch": 0.3383214053350683, "grad_norm": 2.2242021560668945, "learning_rate": 4.960144595285701e-05, "loss": 0.9126, "step": 1040 }, { "epoch": 0.33994795055302535, "grad_norm": 2.575448751449585, "learning_rate": 4.959684287139482e-05, "loss": 0.9559, "step": 1045 }, { "epoch": 0.3415744957709824, "grad_norm": 2.4608261585235596, "learning_rate": 4.959221357685081e-05, "loss": 0.9058, "step": 1050 }, { "epoch": 0.3432010409889395, "grad_norm": 2.738116502761841, "learning_rate": 4.9587558074158464e-05, "loss": 0.9519, "step": 1055 }, { "epoch": 0.3448275862068966, "grad_norm": 2.4760334491729736, "learning_rate": 4.958287636827919e-05, "loss": 0.9204, "step": 1060 }, { "epoch": 0.3464541314248536, "grad_norm": 2.2455074787139893, "learning_rate": 4.9578168464202324e-05, "loss": 0.8996, "step": 1065 }, { "epoch": 0.34808067664281067, "grad_norm": 2.1432058811187744, "learning_rate": 4.9573434366945124e-05, "loss": 0.9308, "step": 1070 }, { "epoch": 0.34970722186076775, "grad_norm": 1.8311374187469482, "learning_rate": 4.956867408155277e-05, "loss": 0.9238, "step": 1075 }, { "epoch": 0.35133376707872477, "grad_norm": 2.4586679935455322, "learning_rate": 4.956388761309832e-05, "loss": 0.8943, "step": 1080 }, { "epoch": 0.35296031229668184, "grad_norm": 2.1968910694122314, "learning_rate": 4.955907496668279e-05, "loss": 0.9492, "step": 1085 }, { "epoch": 0.3545868575146389, "grad_norm": 4.167201519012451, "learning_rate": 4.955423614743503e-05, "loss": 0.9119, "step": 1090 }, { "epoch": 0.35621340273259594, "grad_norm": 3.1522021293640137, "learning_rate": 4.954937116051183e-05, "loss": 0.9077, "step": 1095 }, { "epoch": 0.357839947950553, "grad_norm": 2.009462356567383, "learning_rate": 4.954448001109785e-05, "loss": 0.8913, "step": 1100 }, { "epoch": 0.3594664931685101, "grad_norm": 2.839548110961914, "learning_rate": 4.953956270440563e-05, "loss": 0.9048, "step": 1105 }, { "epoch": 0.36109303838646717, "grad_norm": 2.8811404705047607, "learning_rate": 4.953461924567559e-05, "loss": 0.9697, "step": 1110 }, { "epoch": 0.3627195836044242, "grad_norm": 2.85508394241333, "learning_rate": 4.952964964017602e-05, "loss": 0.9328, "step": 1115 }, { "epoch": 0.36434612882238127, "grad_norm": 2.6920166015625, "learning_rate": 4.952465389320307e-05, "loss": 0.9193, "step": 1120 }, { "epoch": 0.36597267404033834, "grad_norm": 2.714299201965332, "learning_rate": 4.951963201008076e-05, "loss": 0.896, "step": 1125 }, { "epoch": 0.36759921925829536, "grad_norm": 2.70967960357666, "learning_rate": 4.951458399616096e-05, "loss": 0.8776, "step": 1130 }, { "epoch": 0.36922576447625244, "grad_norm": 2.452511787414551, "learning_rate": 4.9509509856823376e-05, "loss": 0.9047, "step": 1135 }, { "epoch": 0.3708523096942095, "grad_norm": 2.4412052631378174, "learning_rate": 4.9504409597475565e-05, "loss": 0.9192, "step": 1140 }, { "epoch": 0.37247885491216653, "grad_norm": 2.8084144592285156, "learning_rate": 4.949928322355293e-05, "loss": 0.8547, "step": 1145 }, { "epoch": 0.3741054001301236, "grad_norm": 3.60684871673584, "learning_rate": 4.949413074051868e-05, "loss": 0.8987, "step": 1150 }, { "epoch": 0.3757319453480807, "grad_norm": 14.40071964263916, "learning_rate": 4.948895215386388e-05, "loss": 0.8831, "step": 1155 }, { "epoch": 0.37735849056603776, "grad_norm": 2.2433459758758545, "learning_rate": 4.948374746910739e-05, "loss": 0.9195, "step": 1160 }, { "epoch": 0.3789850357839948, "grad_norm": 2.2205615043640137, "learning_rate": 4.947851669179589e-05, "loss": 0.9168, "step": 1165 }, { "epoch": 0.38061158100195186, "grad_norm": 2.3830015659332275, "learning_rate": 4.9473259827503865e-05, "loss": 0.9033, "step": 1170 }, { "epoch": 0.38223812621990894, "grad_norm": 2.8027889728546143, "learning_rate": 4.946797688183361e-05, "loss": 0.8746, "step": 1175 }, { "epoch": 0.38386467143786596, "grad_norm": 4.316250324249268, "learning_rate": 4.94626678604152e-05, "loss": 0.888, "step": 1180 }, { "epoch": 0.38549121665582303, "grad_norm": 2.363643169403076, "learning_rate": 4.945733276890652e-05, "loss": 0.8916, "step": 1185 }, { "epoch": 0.3871177618737801, "grad_norm": 2.065160036087036, "learning_rate": 4.945197161299321e-05, "loss": 0.8829, "step": 1190 }, { "epoch": 0.38874430709173713, "grad_norm": 2.287912130355835, "learning_rate": 4.944658439838872e-05, "loss": 0.9027, "step": 1195 }, { "epoch": 0.3903708523096942, "grad_norm": 2.525421142578125, "learning_rate": 4.9441171130834245e-05, "loss": 0.8787, "step": 1200 }, { "epoch": 0.3919973975276513, "grad_norm": 3.762483835220337, "learning_rate": 4.943573181609876e-05, "loss": 0.9301, "step": 1205 }, { "epoch": 0.3936239427456083, "grad_norm": 2.5889155864715576, "learning_rate": 4.943026645997898e-05, "loss": 0.915, "step": 1210 }, { "epoch": 0.3952504879635654, "grad_norm": 2.531707525253296, "learning_rate": 4.9424775068299404e-05, "loss": 0.9419, "step": 1215 }, { "epoch": 0.39687703318152245, "grad_norm": 5.207334041595459, "learning_rate": 4.941925764691224e-05, "loss": 0.8952, "step": 1220 }, { "epoch": 0.39850357839947953, "grad_norm": 2.425893545150757, "learning_rate": 4.941371420169746e-05, "loss": 0.9191, "step": 1225 }, { "epoch": 0.40013012361743655, "grad_norm": 2.961516857147217, "learning_rate": 4.940814473856278e-05, "loss": 0.9196, "step": 1230 }, { "epoch": 0.4017566688353936, "grad_norm": 1.8956583738327026, "learning_rate": 4.940254926344361e-05, "loss": 0.8787, "step": 1235 }, { "epoch": 0.4033832140533507, "grad_norm": 2.169212579727173, "learning_rate": 4.9396927782303105e-05, "loss": 0.9207, "step": 1240 }, { "epoch": 0.4050097592713077, "grad_norm": 2.522149085998535, "learning_rate": 4.939128030113213e-05, "loss": 0.8792, "step": 1245 }, { "epoch": 0.4066363044892648, "grad_norm": 2.043013095855713, "learning_rate": 4.938560682594925e-05, "loss": 0.9033, "step": 1250 }, { "epoch": 0.4082628497072219, "grad_norm": 2.008324384689331, "learning_rate": 4.9379907362800756e-05, "loss": 0.8957, "step": 1255 }, { "epoch": 0.4098893949251789, "grad_norm": 1.997517704963684, "learning_rate": 4.937418191776061e-05, "loss": 0.8688, "step": 1260 }, { "epoch": 0.41151594014313597, "grad_norm": 3.0802457332611084, "learning_rate": 4.936843049693046e-05, "loss": 0.8748, "step": 1265 }, { "epoch": 0.41314248536109305, "grad_norm": 9.878500938415527, "learning_rate": 4.936265310643967e-05, "loss": 0.914, "step": 1270 }, { "epoch": 0.4147690305790501, "grad_norm": 2.237401008605957, "learning_rate": 4.935684975244525e-05, "loss": 0.9122, "step": 1275 }, { "epoch": 0.41639557579700714, "grad_norm": 2.210939407348633, "learning_rate": 4.9351020441131876e-05, "loss": 0.8811, "step": 1280 }, { "epoch": 0.4180221210149642, "grad_norm": 2.0587236881256104, "learning_rate": 4.9345165178711904e-05, "loss": 0.8936, "step": 1285 }, { "epoch": 0.4196486662329213, "grad_norm": 1.8404570817947388, "learning_rate": 4.933928397142536e-05, "loss": 0.9308, "step": 1290 }, { "epoch": 0.4212752114508783, "grad_norm": 1.9954590797424316, "learning_rate": 4.9333376825539864e-05, "loss": 0.8949, "step": 1295 }, { "epoch": 0.4229017566688354, "grad_norm": 2.556403636932373, "learning_rate": 4.932744374735075e-05, "loss": 0.8938, "step": 1300 }, { "epoch": 0.42452830188679247, "grad_norm": 1.964281439781189, "learning_rate": 4.932148474318094e-05, "loss": 0.8739, "step": 1305 }, { "epoch": 0.4261548471047495, "grad_norm": 1.882667899131775, "learning_rate": 4.9315499819381004e-05, "loss": 0.8931, "step": 1310 }, { "epoch": 0.42778139232270657, "grad_norm": 3.4836301803588867, "learning_rate": 4.930948898232912e-05, "loss": 0.9002, "step": 1315 }, { "epoch": 0.42940793754066364, "grad_norm": 2.2451624870300293, "learning_rate": 4.93034522384311e-05, "loss": 0.9097, "step": 1320 }, { "epoch": 0.43103448275862066, "grad_norm": 2.171105146408081, "learning_rate": 4.929738959412037e-05, "loss": 0.9215, "step": 1325 }, { "epoch": 0.43266102797657774, "grad_norm": 2.0800368785858154, "learning_rate": 4.9291301055857916e-05, "loss": 0.901, "step": 1330 }, { "epoch": 0.4342875731945348, "grad_norm": 1.9696561098098755, "learning_rate": 4.9285186630132376e-05, "loss": 0.8899, "step": 1335 }, { "epoch": 0.4359141184124919, "grad_norm": 1.8826249837875366, "learning_rate": 4.9279046323459934e-05, "loss": 0.877, "step": 1340 }, { "epoch": 0.4375406636304489, "grad_norm": 4.076726913452148, "learning_rate": 4.927288014238438e-05, "loss": 0.8946, "step": 1345 }, { "epoch": 0.439167208848406, "grad_norm": 2.221543550491333, "learning_rate": 4.9266688093477066e-05, "loss": 0.8925, "step": 1350 }, { "epoch": 0.44079375406636306, "grad_norm": 2.182318925857544, "learning_rate": 4.926047018333691e-05, "loss": 0.9199, "step": 1355 }, { "epoch": 0.4424202992843201, "grad_norm": 3.5430712699890137, "learning_rate": 4.925422641859041e-05, "loss": 0.9061, "step": 1360 }, { "epoch": 0.44404684450227716, "grad_norm": 3.27327036857605, "learning_rate": 4.92479568058916e-05, "loss": 0.9312, "step": 1365 }, { "epoch": 0.44567338972023424, "grad_norm": 2.458996534347534, "learning_rate": 4.924166135192206e-05, "loss": 0.9184, "step": 1370 }, { "epoch": 0.44729993493819126, "grad_norm": 2.3121581077575684, "learning_rate": 4.923534006339091e-05, "loss": 0.8966, "step": 1375 }, { "epoch": 0.44892648015614833, "grad_norm": 2.582960367202759, "learning_rate": 4.92289929470348e-05, "loss": 0.8672, "step": 1380 }, { "epoch": 0.4505530253741054, "grad_norm": 4.599107265472412, "learning_rate": 4.922262000961793e-05, "loss": 0.8858, "step": 1385 }, { "epoch": 0.4521795705920625, "grad_norm": 2.6778221130371094, "learning_rate": 4.9216221257931984e-05, "loss": 0.8866, "step": 1390 }, { "epoch": 0.4538061158100195, "grad_norm": 2.472867965698242, "learning_rate": 4.920979669879617e-05, "loss": 0.9032, "step": 1395 }, { "epoch": 0.4554326610279766, "grad_norm": 2.3189878463745117, "learning_rate": 4.920334633905721e-05, "loss": 0.8592, "step": 1400 }, { "epoch": 0.45705920624593366, "grad_norm": 2.355407953262329, "learning_rate": 4.9196870185589304e-05, "loss": 0.8722, "step": 1405 }, { "epoch": 0.4586857514638907, "grad_norm": 2.983154296875, "learning_rate": 4.919036824529415e-05, "loss": 0.8677, "step": 1410 }, { "epoch": 0.46031229668184775, "grad_norm": 2.6179659366607666, "learning_rate": 4.918384052510092e-05, "loss": 0.9108, "step": 1415 }, { "epoch": 0.46193884189980483, "grad_norm": 2.4379374980926514, "learning_rate": 4.917728703196628e-05, "loss": 0.9184, "step": 1420 }, { "epoch": 0.46356538711776185, "grad_norm": 2.1980526447296143, "learning_rate": 4.9170707772874324e-05, "loss": 0.8538, "step": 1425 }, { "epoch": 0.4651919323357189, "grad_norm": 2.0506389141082764, "learning_rate": 4.9164102754836655e-05, "loss": 0.8958, "step": 1430 }, { "epoch": 0.466818477553676, "grad_norm": 2.5567352771759033, "learning_rate": 4.915747198489229e-05, "loss": 0.8834, "step": 1435 }, { "epoch": 0.468445022771633, "grad_norm": 2.502176523208618, "learning_rate": 4.915081547010769e-05, "loss": 0.8885, "step": 1440 }, { "epoch": 0.4700715679895901, "grad_norm": 2.5344648361206055, "learning_rate": 4.914413321757679e-05, "loss": 0.9421, "step": 1445 }, { "epoch": 0.4716981132075472, "grad_norm": 1.9389574527740479, "learning_rate": 4.913742523442091e-05, "loss": 0.8703, "step": 1450 }, { "epoch": 0.47332465842550425, "grad_norm": 2.2000226974487305, "learning_rate": 4.913069152778881e-05, "loss": 0.8911, "step": 1455 }, { "epoch": 0.4749512036434613, "grad_norm": 2.386732339859009, "learning_rate": 4.912393210485666e-05, "loss": 0.9334, "step": 1460 }, { "epoch": 0.47657774886141835, "grad_norm": 2.4275944232940674, "learning_rate": 4.911714697282806e-05, "loss": 0.8778, "step": 1465 }, { "epoch": 0.4782042940793754, "grad_norm": 2.639420509338379, "learning_rate": 4.9110336138933964e-05, "loss": 0.8834, "step": 1470 }, { "epoch": 0.47983083929733245, "grad_norm": 1.7982360124588013, "learning_rate": 4.9103499610432744e-05, "loss": 0.8468, "step": 1475 }, { "epoch": 0.4814573845152895, "grad_norm": 2.2488203048706055, "learning_rate": 4.909663739461017e-05, "loss": 0.8559, "step": 1480 }, { "epoch": 0.4830839297332466, "grad_norm": 2.1595704555511475, "learning_rate": 4.908974949877935e-05, "loss": 0.9202, "step": 1485 }, { "epoch": 0.4847104749512036, "grad_norm": 2.029742479324341, "learning_rate": 4.908283593028078e-05, "loss": 0.9053, "step": 1490 }, { "epoch": 0.4863370201691607, "grad_norm": 2.2849221229553223, "learning_rate": 4.907589669648232e-05, "loss": 0.8709, "step": 1495 }, { "epoch": 0.48796356538711777, "grad_norm": 2.050663709640503, "learning_rate": 4.9068931804779175e-05, "loss": 0.8895, "step": 1500 }, { "epoch": 0.48959011060507485, "grad_norm": 1.842971682548523, "learning_rate": 4.9061941262593886e-05, "loss": 0.8773, "step": 1505 }, { "epoch": 0.49121665582303187, "grad_norm": 3.2731707096099854, "learning_rate": 4.905492507737634e-05, "loss": 0.8763, "step": 1510 }, { "epoch": 0.49284320104098894, "grad_norm": 1.898478388786316, "learning_rate": 4.904788325660377e-05, "loss": 0.8569, "step": 1515 }, { "epoch": 0.494469746258946, "grad_norm": 2.5667638778686523, "learning_rate": 4.9040815807780676e-05, "loss": 0.8591, "step": 1520 }, { "epoch": 0.49609629147690304, "grad_norm": 4.3376383781433105, "learning_rate": 4.9033722738438924e-05, "loss": 0.9128, "step": 1525 }, { "epoch": 0.4977228366948601, "grad_norm": 1.9267882108688354, "learning_rate": 4.9026604056137664e-05, "loss": 0.88, "step": 1530 }, { "epoch": 0.4993493819128172, "grad_norm": 1.95156729221344, "learning_rate": 4.901945976846334e-05, "loss": 0.9075, "step": 1535 }, { "epoch": 0.5009759271307742, "grad_norm": 1.811874508857727, "learning_rate": 4.9013725907570775e-05, "loss": 0.8934, "step": 1540 }, { "epoch": 0.5026024723487313, "grad_norm": 2.121002674102783, "learning_rate": 4.900653554942987e-05, "loss": 0.8735, "step": 1545 }, { "epoch": 0.5042290175666884, "grad_norm": 2.8290669918060303, "learning_rate": 4.8999319607303076e-05, "loss": 0.904, "step": 1550 }, { "epoch": 0.5058555627846454, "grad_norm": 1.8206206560134888, "learning_rate": 4.899207808888051e-05, "loss": 0.8922, "step": 1555 }, { "epoch": 0.5074821080026025, "grad_norm": 2.283167839050293, "learning_rate": 4.898481100187953e-05, "loss": 0.8866, "step": 1560 }, { "epoch": 0.5091086532205595, "grad_norm": 3.57720685005188, "learning_rate": 4.897751835404471e-05, "loss": 0.8682, "step": 1565 }, { "epoch": 0.5107351984385166, "grad_norm": 2.4594457149505615, "learning_rate": 4.897020015314791e-05, "loss": 0.8686, "step": 1570 }, { "epoch": 0.5123617436564737, "grad_norm": 2.2869081497192383, "learning_rate": 4.89628564069882e-05, "loss": 0.8681, "step": 1575 }, { "epoch": 0.5139882888744307, "grad_norm": 2.9496114253997803, "learning_rate": 4.8955487123391884e-05, "loss": 0.9034, "step": 1580 }, { "epoch": 0.5156148340923877, "grad_norm": 2.24989652633667, "learning_rate": 4.8948092310212466e-05, "loss": 0.8814, "step": 1585 }, { "epoch": 0.5172413793103449, "grad_norm": 2.880655527114868, "learning_rate": 4.8940671975330674e-05, "loss": 0.8777, "step": 1590 }, { "epoch": 0.5188679245283019, "grad_norm": 1.8517874479293823, "learning_rate": 4.893322612665442e-05, "loss": 0.8696, "step": 1595 }, { "epoch": 0.5204944697462589, "grad_norm": 1.9082527160644531, "learning_rate": 4.8925754772118825e-05, "loss": 0.8732, "step": 1600 }, { "epoch": 0.522121014964216, "grad_norm": 3.6718180179595947, "learning_rate": 4.891825791968617e-05, "loss": 0.9223, "step": 1605 }, { "epoch": 0.523747560182173, "grad_norm": 2.3135855197906494, "learning_rate": 4.8910735577345925e-05, "loss": 0.9122, "step": 1610 }, { "epoch": 0.5253741054001301, "grad_norm": 2.163776397705078, "learning_rate": 4.890318775311471e-05, "loss": 0.9288, "step": 1615 }, { "epoch": 0.5270006506180872, "grad_norm": 3.1857199668884277, "learning_rate": 4.889561445503632e-05, "loss": 0.8805, "step": 1620 }, { "epoch": 0.5286271958360442, "grad_norm": 2.046293258666992, "learning_rate": 4.888801569118169e-05, "loss": 0.879, "step": 1625 }, { "epoch": 0.5302537410540012, "grad_norm": 2.4297502040863037, "learning_rate": 4.888039146964888e-05, "loss": 0.8726, "step": 1630 }, { "epoch": 0.5318802862719584, "grad_norm": 2.176730155944824, "learning_rate": 4.88727417985631e-05, "loss": 0.8869, "step": 1635 }, { "epoch": 0.5335068314899154, "grad_norm": 2.660156011581421, "learning_rate": 4.8865066686076685e-05, "loss": 0.8794, "step": 1640 }, { "epoch": 0.5351333767078725, "grad_norm": 1.9085681438446045, "learning_rate": 4.885736614036907e-05, "loss": 0.8842, "step": 1645 }, { "epoch": 0.5367599219258296, "grad_norm": 2.0674500465393066, "learning_rate": 4.8849640169646784e-05, "loss": 0.9047, "step": 1650 }, { "epoch": 0.5383864671437866, "grad_norm": 1.7518874406814575, "learning_rate": 4.884188878214347e-05, "loss": 0.8731, "step": 1655 }, { "epoch": 0.5400130123617437, "grad_norm": 1.9546101093292236, "learning_rate": 4.883411198611987e-05, "loss": 0.8703, "step": 1660 }, { "epoch": 0.5416395575797007, "grad_norm": 2.0076913833618164, "learning_rate": 4.882630978986377e-05, "loss": 0.8457, "step": 1665 }, { "epoch": 0.5432661027976577, "grad_norm": 1.994197130203247, "learning_rate": 4.881848220169003e-05, "loss": 0.8691, "step": 1670 }, { "epoch": 0.5448926480156149, "grad_norm": 1.8957130908966064, "learning_rate": 4.881062922994061e-05, "loss": 0.9, "step": 1675 }, { "epoch": 0.5465191932335719, "grad_norm": 2.4126741886138916, "learning_rate": 4.8802750882984484e-05, "loss": 0.8792, "step": 1680 }, { "epoch": 0.5481457384515289, "grad_norm": 2.029853105545044, "learning_rate": 4.879484716921767e-05, "loss": 0.8551, "step": 1685 }, { "epoch": 0.549772283669486, "grad_norm": 2.168409585952759, "learning_rate": 4.878691809706324e-05, "loss": 0.8499, "step": 1690 }, { "epoch": 0.5513988288874431, "grad_norm": 1.8969876766204834, "learning_rate": 4.8778963674971276e-05, "loss": 0.8941, "step": 1695 }, { "epoch": 0.5530253741054001, "grad_norm": 2.291750192642212, "learning_rate": 4.877098391141888e-05, "loss": 0.8772, "step": 1700 }, { "epoch": 0.5546519193233572, "grad_norm": 2.1070289611816406, "learning_rate": 4.876297881491015e-05, "loss": 0.8585, "step": 1705 }, { "epoch": 0.5562784645413142, "grad_norm": 2.3090732097625732, "learning_rate": 4.875494839397621e-05, "loss": 0.8563, "step": 1710 }, { "epoch": 0.5579050097592713, "grad_norm": 2.1373162269592285, "learning_rate": 4.874689265717514e-05, "loss": 0.8968, "step": 1715 }, { "epoch": 0.5595315549772284, "grad_norm": 2.126462459564209, "learning_rate": 4.873881161309201e-05, "loss": 0.9171, "step": 1720 }, { "epoch": 0.5611581001951854, "grad_norm": 2.2820169925689697, "learning_rate": 4.8730705270338864e-05, "loss": 0.8692, "step": 1725 }, { "epoch": 0.5627846454131424, "grad_norm": 2.154158353805542, "learning_rate": 4.872257363755471e-05, "loss": 0.8497, "step": 1730 }, { "epoch": 0.5644111906310996, "grad_norm": 2.639125347137451, "learning_rate": 4.871441672340551e-05, "loss": 0.8565, "step": 1735 }, { "epoch": 0.5660377358490566, "grad_norm": 2.3836843967437744, "learning_rate": 4.870623453658416e-05, "loss": 0.9039, "step": 1740 }, { "epoch": 0.5676642810670136, "grad_norm": 2.24796199798584, "learning_rate": 4.869802708581048e-05, "loss": 0.8957, "step": 1745 }, { "epoch": 0.5692908262849707, "grad_norm": 2.960399627685547, "learning_rate": 4.8689794379831244e-05, "loss": 0.8952, "step": 1750 }, { "epoch": 0.5709173715029278, "grad_norm": 2.3349416255950928, "learning_rate": 4.8681536427420106e-05, "loss": 0.8962, "step": 1755 }, { "epoch": 0.5725439167208849, "grad_norm": 1.852830410003662, "learning_rate": 4.867325323737765e-05, "loss": 0.8478, "step": 1760 }, { "epoch": 0.5741704619388419, "grad_norm": 2.357431650161743, "learning_rate": 4.866494481853134e-05, "loss": 0.8773, "step": 1765 }, { "epoch": 0.5757970071567989, "grad_norm": 1.9144153594970703, "learning_rate": 4.865661117973555e-05, "loss": 0.8611, "step": 1770 }, { "epoch": 0.5774235523747561, "grad_norm": 2.0974292755126953, "learning_rate": 4.8648252329871494e-05, "loss": 0.8983, "step": 1775 }, { "epoch": 0.5790500975927131, "grad_norm": 4.729403972625732, "learning_rate": 4.863986827784729e-05, "loss": 0.8712, "step": 1780 }, { "epoch": 0.5806766428106701, "grad_norm": 2.222569227218628, "learning_rate": 4.8631459032597905e-05, "loss": 0.9075, "step": 1785 }, { "epoch": 0.5823031880286272, "grad_norm": 2.1208598613739014, "learning_rate": 4.8623024603085145e-05, "loss": 0.8685, "step": 1790 }, { "epoch": 0.5839297332465843, "grad_norm": 2.058300733566284, "learning_rate": 4.861456499829764e-05, "loss": 0.8901, "step": 1795 }, { "epoch": 0.5855562784645413, "grad_norm": 1.7815157175064087, "learning_rate": 4.86060802272509e-05, "loss": 0.9112, "step": 1800 }, { "epoch": 0.5871828236824984, "grad_norm": 1.6161009073257446, "learning_rate": 4.8597570298987196e-05, "loss": 0.882, "step": 1805 }, { "epoch": 0.5888093689004554, "grad_norm": 2.2587006092071533, "learning_rate": 4.858903522257565e-05, "loss": 0.8494, "step": 1810 }, { "epoch": 0.5904359141184125, "grad_norm": 1.8459244966506958, "learning_rate": 4.858047500711217e-05, "loss": 0.8782, "step": 1815 }, { "epoch": 0.5920624593363696, "grad_norm": 2.19490122795105, "learning_rate": 4.857188966171946e-05, "loss": 0.8794, "step": 1820 }, { "epoch": 0.5936890045543266, "grad_norm": 2.3178391456604004, "learning_rate": 4.856327919554699e-05, "loss": 0.8661, "step": 1825 }, { "epoch": 0.5953155497722836, "grad_norm": 2.1639411449432373, "learning_rate": 4.855464361777102e-05, "loss": 0.888, "step": 1830 }, { "epoch": 0.5969420949902408, "grad_norm": 1.616613507270813, "learning_rate": 4.8545982937594576e-05, "loss": 0.8784, "step": 1835 }, { "epoch": 0.5985686402081978, "grad_norm": 2.395139217376709, "learning_rate": 4.8537297164247405e-05, "loss": 0.8677, "step": 1840 }, { "epoch": 0.6001951854261548, "grad_norm": 6.791170597076416, "learning_rate": 4.8528586306986034e-05, "loss": 0.8745, "step": 1845 }, { "epoch": 0.6018217306441119, "grad_norm": 3.1358447074890137, "learning_rate": 4.8519850375093686e-05, "loss": 0.8636, "step": 1850 }, { "epoch": 0.603448275862069, "grad_norm": 2.2898097038269043, "learning_rate": 4.851108937788034e-05, "loss": 0.9009, "step": 1855 }, { "epoch": 0.605074821080026, "grad_norm": 2.2613158226013184, "learning_rate": 4.850230332468265e-05, "loss": 0.8893, "step": 1860 }, { "epoch": 0.6067013662979831, "grad_norm": 1.6697711944580078, "learning_rate": 4.849349222486402e-05, "loss": 0.8751, "step": 1865 }, { "epoch": 0.6083279115159401, "grad_norm": 1.8398504257202148, "learning_rate": 4.848465608781451e-05, "loss": 0.8845, "step": 1870 }, { "epoch": 0.6099544567338973, "grad_norm": 2.2207603454589844, "learning_rate": 4.8475794922950855e-05, "loss": 0.8973, "step": 1875 }, { "epoch": 0.6115810019518543, "grad_norm": 1.824102520942688, "learning_rate": 4.846690873971651e-05, "loss": 0.8989, "step": 1880 }, { "epoch": 0.6132075471698113, "grad_norm": 2.416241407394409, "learning_rate": 4.8457997547581546e-05, "loss": 0.8822, "step": 1885 }, { "epoch": 0.6148340923877684, "grad_norm": 1.9753471612930298, "learning_rate": 4.84490613560427e-05, "loss": 0.8719, "step": 1890 }, { "epoch": 0.6164606376057254, "grad_norm": 1.9445384740829468, "learning_rate": 4.8440100174623354e-05, "loss": 0.865, "step": 1895 }, { "epoch": 0.6180871828236825, "grad_norm": 1.8253824710845947, "learning_rate": 4.843111401287353e-05, "loss": 0.8734, "step": 1900 }, { "epoch": 0.6197137280416396, "grad_norm": 1.9183560609817505, "learning_rate": 4.842210288036986e-05, "loss": 0.8479, "step": 1905 }, { "epoch": 0.6213402732595966, "grad_norm": 1.9014906883239746, "learning_rate": 4.841306678671558e-05, "loss": 0.898, "step": 1910 }, { "epoch": 0.6229668184775536, "grad_norm": 2.550158739089966, "learning_rate": 4.840400574154056e-05, "loss": 0.8809, "step": 1915 }, { "epoch": 0.6245933636955108, "grad_norm": 6.833774089813232, "learning_rate": 4.8394919754501217e-05, "loss": 0.8755, "step": 1920 }, { "epoch": 0.6262199089134678, "grad_norm": 4.296611309051514, "learning_rate": 4.8385808835280584e-05, "loss": 0.8456, "step": 1925 }, { "epoch": 0.6278464541314248, "grad_norm": 2.215914487838745, "learning_rate": 4.8376672993588245e-05, "loss": 0.8969, "step": 1930 }, { "epoch": 0.629472999349382, "grad_norm": 2.6802151203155518, "learning_rate": 4.8367512239160356e-05, "loss": 0.8768, "step": 1935 }, { "epoch": 0.631099544567339, "grad_norm": 2.843503475189209, "learning_rate": 4.8358326581759615e-05, "loss": 0.8593, "step": 1940 }, { "epoch": 0.632726089785296, "grad_norm": 2.6657474040985107, "learning_rate": 4.834911603117526e-05, "loss": 0.8932, "step": 1945 }, { "epoch": 0.6343526350032531, "grad_norm": 1.8572765588760376, "learning_rate": 4.833988059722306e-05, "loss": 0.8921, "step": 1950 }, { "epoch": 0.6359791802212101, "grad_norm": 3.170466184616089, "learning_rate": 4.833062028974531e-05, "loss": 0.8409, "step": 1955 }, { "epoch": 0.6376057254391672, "grad_norm": 1.8783475160598755, "learning_rate": 4.832133511861079e-05, "loss": 0.8651, "step": 1960 }, { "epoch": 0.6392322706571243, "grad_norm": 2.4108924865722656, "learning_rate": 4.8312025093714806e-05, "loss": 0.8651, "step": 1965 }, { "epoch": 0.6408588158750813, "grad_norm": 3.5263514518737793, "learning_rate": 4.830269022497913e-05, "loss": 0.8509, "step": 1970 }, { "epoch": 0.6424853610930383, "grad_norm": 2.5810863971710205, "learning_rate": 4.829333052235202e-05, "loss": 0.8692, "step": 1975 }, { "epoch": 0.6441119063109955, "grad_norm": 2.0188722610473633, "learning_rate": 4.82839459958082e-05, "loss": 0.8697, "step": 1980 }, { "epoch": 0.6457384515289525, "grad_norm": 2.043989896774292, "learning_rate": 4.8274536655348834e-05, "loss": 0.8836, "step": 1985 }, { "epoch": 0.6473649967469096, "grad_norm": 2.661475419998169, "learning_rate": 4.826510251100155e-05, "loss": 0.8619, "step": 1990 }, { "epoch": 0.6489915419648666, "grad_norm": 2.208104133605957, "learning_rate": 4.825564357282041e-05, "loss": 0.8879, "step": 1995 }, { "epoch": 0.6506180871828237, "grad_norm": 4.05889368057251, "learning_rate": 4.8246159850885883e-05, "loss": 0.8645, "step": 2000 }, { "epoch": 0.6522446324007808, "grad_norm": 2.1343226432800293, "learning_rate": 4.823665135530486e-05, "loss": 0.8617, "step": 2005 }, { "epoch": 0.6538711776187378, "grad_norm": 2.087005615234375, "learning_rate": 4.822711809621063e-05, "loss": 0.8902, "step": 2010 }, { "epoch": 0.6554977228366948, "grad_norm": 4.40689754486084, "learning_rate": 4.821756008376289e-05, "loss": 0.9144, "step": 2015 }, { "epoch": 0.657124268054652, "grad_norm": 3.3655898571014404, "learning_rate": 4.820797732814768e-05, "loss": 0.8924, "step": 2020 }, { "epoch": 0.658750813272609, "grad_norm": 2.2318570613861084, "learning_rate": 4.819836983957744e-05, "loss": 0.8448, "step": 2025 }, { "epoch": 0.660377358490566, "grad_norm": 2.6342506408691406, "learning_rate": 4.818873762829097e-05, "loss": 0.8864, "step": 2030 }, { "epoch": 0.6620039037085231, "grad_norm": 2.1214475631713867, "learning_rate": 4.8179080704553386e-05, "loss": 0.8495, "step": 2035 }, { "epoch": 0.6636304489264802, "grad_norm": 1.7027548551559448, "learning_rate": 4.8169399078656175e-05, "loss": 0.8495, "step": 2040 }, { "epoch": 0.6652569941444372, "grad_norm": 2.5632596015930176, "learning_rate": 4.815969276091713e-05, "loss": 0.8696, "step": 2045 }, { "epoch": 0.6668835393623943, "grad_norm": 2.2075538635253906, "learning_rate": 4.814996176168036e-05, "loss": 0.8568, "step": 2050 }, { "epoch": 0.6685100845803513, "grad_norm": 2.355635404586792, "learning_rate": 4.8140206091316284e-05, "loss": 0.8534, "step": 2055 }, { "epoch": 0.6701366297983083, "grad_norm": 2.9022514820098877, "learning_rate": 4.8130425760221615e-05, "loss": 0.8704, "step": 2060 }, { "epoch": 0.6717631750162655, "grad_norm": 2.1720938682556152, "learning_rate": 4.8120620778819334e-05, "loss": 0.8646, "step": 2065 }, { "epoch": 0.6733897202342225, "grad_norm": 2.2691755294799805, "learning_rate": 4.81107911575587e-05, "loss": 0.8392, "step": 2070 }, { "epoch": 0.6750162654521795, "grad_norm": 1.8914642333984375, "learning_rate": 4.810093690691524e-05, "loss": 0.8764, "step": 2075 }, { "epoch": 0.6766428106701367, "grad_norm": 1.8982810974121094, "learning_rate": 4.80910580373907e-05, "loss": 0.8917, "step": 2080 }, { "epoch": 0.6782693558880937, "grad_norm": 2.189270496368408, "learning_rate": 4.8081154559513106e-05, "loss": 0.9098, "step": 2085 }, { "epoch": 0.6798959011060507, "grad_norm": 2.0608716011047363, "learning_rate": 4.807122648383667e-05, "loss": 0.8694, "step": 2090 }, { "epoch": 0.6815224463240078, "grad_norm": 3.0705621242523193, "learning_rate": 4.8061273820941845e-05, "loss": 0.86, "step": 2095 }, { "epoch": 0.6831489915419648, "grad_norm": 2.9095020294189453, "learning_rate": 4.8051296581435264e-05, "loss": 0.876, "step": 2100 }, { "epoch": 0.684775536759922, "grad_norm": 1.7684783935546875, "learning_rate": 4.804129477594977e-05, "loss": 0.8863, "step": 2105 }, { "epoch": 0.686402081977879, "grad_norm": 2.1000592708587646, "learning_rate": 4.803126841514437e-05, "loss": 0.9342, "step": 2110 }, { "epoch": 0.688028627195836, "grad_norm": 2.359872579574585, "learning_rate": 4.802121750970425e-05, "loss": 0.8656, "step": 2115 }, { "epoch": 0.6896551724137931, "grad_norm": 2.3630592823028564, "learning_rate": 4.8011142070340774e-05, "loss": 0.8587, "step": 2120 }, { "epoch": 0.6912817176317502, "grad_norm": 2.767029285430908, "learning_rate": 4.8001042107791405e-05, "loss": 0.8634, "step": 2125 }, { "epoch": 0.6929082628497072, "grad_norm": 6.400025367736816, "learning_rate": 4.799091763281978e-05, "loss": 0.8751, "step": 2130 }, { "epoch": 0.6945348080676643, "grad_norm": 3.766014814376831, "learning_rate": 4.798076865621564e-05, "loss": 0.8715, "step": 2135 }, { "epoch": 0.6961613532856213, "grad_norm": 1.9491114616394043, "learning_rate": 4.7970595188794846e-05, "loss": 0.8969, "step": 2140 }, { "epoch": 0.6977878985035784, "grad_norm": 2.2208404541015625, "learning_rate": 4.796039724139935e-05, "loss": 0.8848, "step": 2145 }, { "epoch": 0.6994144437215355, "grad_norm": 2.700895309448242, "learning_rate": 4.79501748248972e-05, "loss": 0.8695, "step": 2150 }, { "epoch": 0.7010409889394925, "grad_norm": 2.6097826957702637, "learning_rate": 4.793992795018253e-05, "loss": 0.8604, "step": 2155 }, { "epoch": 0.7026675341574495, "grad_norm": 1.8880006074905396, "learning_rate": 4.792965662817551e-05, "loss": 0.885, "step": 2160 }, { "epoch": 0.7042940793754067, "grad_norm": 3.3072919845581055, "learning_rate": 4.7919360869822394e-05, "loss": 0.9164, "step": 2165 }, { "epoch": 0.7059206245933637, "grad_norm": 2.4248101711273193, "learning_rate": 4.790904068609546e-05, "loss": 0.8677, "step": 2170 }, { "epoch": 0.7075471698113207, "grad_norm": 2.345674991607666, "learning_rate": 4.789869608799304e-05, "loss": 0.837, "step": 2175 }, { "epoch": 0.7091737150292778, "grad_norm": 1.677289605140686, "learning_rate": 4.7888327086539446e-05, "loss": 0.8544, "step": 2180 }, { "epoch": 0.7108002602472349, "grad_norm": 2.025778293609619, "learning_rate": 4.787793369278503e-05, "loss": 0.8867, "step": 2185 }, { "epoch": 0.7124268054651919, "grad_norm": 2.362762689590454, "learning_rate": 4.7867515917806124e-05, "loss": 0.8777, "step": 2190 }, { "epoch": 0.714053350683149, "grad_norm": 5.319698333740234, "learning_rate": 4.785707377270505e-05, "loss": 0.858, "step": 2195 }, { "epoch": 0.715679895901106, "grad_norm": 4.411806106567383, "learning_rate": 4.784660726861009e-05, "loss": 0.8466, "step": 2200 }, { "epoch": 0.7173064411190632, "grad_norm": 2.4684970378875732, "learning_rate": 4.78361164166755e-05, "loss": 0.9027, "step": 2205 }, { "epoch": 0.7189329863370202, "grad_norm": 2.4765100479125977, "learning_rate": 4.7825601228081484e-05, "loss": 0.8433, "step": 2210 }, { "epoch": 0.7205595315549772, "grad_norm": 1.9810550212860107, "learning_rate": 4.781506171403416e-05, "loss": 0.8736, "step": 2215 }, { "epoch": 0.7221860767729343, "grad_norm": 1.7304781675338745, "learning_rate": 4.78044978857656e-05, "loss": 0.8681, "step": 2220 }, { "epoch": 0.7238126219908914, "grad_norm": 1.7415772676467896, "learning_rate": 4.779390975453376e-05, "loss": 0.8731, "step": 2225 }, { "epoch": 0.7254391672088484, "grad_norm": 1.787624716758728, "learning_rate": 4.778329733162251e-05, "loss": 0.8768, "step": 2230 }, { "epoch": 0.7270657124268055, "grad_norm": 2.4273312091827393, "learning_rate": 4.777266062834162e-05, "loss": 0.858, "step": 2235 }, { "epoch": 0.7286922576447625, "grad_norm": 1.8364176750183105, "learning_rate": 4.776199965602671e-05, "loss": 0.892, "step": 2240 }, { "epoch": 0.7303188028627196, "grad_norm": 1.8039227724075317, "learning_rate": 4.7751314426039286e-05, "loss": 0.8345, "step": 2245 }, { "epoch": 0.7319453480806767, "grad_norm": 1.9627631902694702, "learning_rate": 4.7740604949766686e-05, "loss": 0.8597, "step": 2250 }, { "epoch": 0.7335718932986337, "grad_norm": 1.7530760765075684, "learning_rate": 4.77298712386221e-05, "loss": 0.885, "step": 2255 }, { "epoch": 0.7351984385165907, "grad_norm": 2.443120002746582, "learning_rate": 4.771911330404455e-05, "loss": 0.864, "step": 2260 }, { "epoch": 0.7368249837345479, "grad_norm": 1.866398572921753, "learning_rate": 4.7708331157498855e-05, "loss": 0.9056, "step": 2265 }, { "epoch": 0.7384515289525049, "grad_norm": 5.161078929901123, "learning_rate": 4.769752481047566e-05, "loss": 0.8454, "step": 2270 }, { "epoch": 0.7400780741704619, "grad_norm": 1.9637868404388428, "learning_rate": 4.768669427449137e-05, "loss": 0.8917, "step": 2275 }, { "epoch": 0.741704619388419, "grad_norm": 2.199129819869995, "learning_rate": 4.767583956108821e-05, "loss": 0.8515, "step": 2280 }, { "epoch": 0.743331164606376, "grad_norm": 2.3960113525390625, "learning_rate": 4.766496068183413e-05, "loss": 0.8803, "step": 2285 }, { "epoch": 0.7449577098243331, "grad_norm": 2.398287534713745, "learning_rate": 4.765405764832286e-05, "loss": 0.8582, "step": 2290 }, { "epoch": 0.7465842550422902, "grad_norm": 1.8750267028808594, "learning_rate": 4.764313047217386e-05, "loss": 0.8609, "step": 2295 }, { "epoch": 0.7482108002602472, "grad_norm": 2.4340929985046387, "learning_rate": 4.763217916503233e-05, "loss": 0.846, "step": 2300 }, { "epoch": 0.7498373454782042, "grad_norm": 2.125413656234741, "learning_rate": 4.762120373856917e-05, "loss": 0.8741, "step": 2305 }, { "epoch": 0.7514638906961614, "grad_norm": 2.3385329246520996, "learning_rate": 4.761020420448099e-05, "loss": 0.8747, "step": 2310 }, { "epoch": 0.7530904359141184, "grad_norm": 2.125859260559082, "learning_rate": 4.759918057449011e-05, "loss": 0.8934, "step": 2315 }, { "epoch": 0.7547169811320755, "grad_norm": 3.7632899284362793, "learning_rate": 4.758813286034449e-05, "loss": 0.9177, "step": 2320 }, { "epoch": 0.7563435263500325, "grad_norm": 3.737948417663574, "learning_rate": 4.7577061073817806e-05, "loss": 0.9371, "step": 2325 }, { "epoch": 0.7579700715679896, "grad_norm": 3.86966609954834, "learning_rate": 4.7565965226709353e-05, "loss": 0.8809, "step": 2330 }, { "epoch": 0.7595966167859467, "grad_norm": 1.5923842191696167, "learning_rate": 4.7554845330844066e-05, "loss": 0.9081, "step": 2335 }, { "epoch": 0.7612231620039037, "grad_norm": 2.317854166030884, "learning_rate": 4.754370139807254e-05, "loss": 0.8979, "step": 2340 }, { "epoch": 0.7628497072218607, "grad_norm": 1.9626786708831787, "learning_rate": 4.753253344027095e-05, "loss": 0.8509, "step": 2345 }, { "epoch": 0.7644762524398179, "grad_norm": 2.554448127746582, "learning_rate": 4.7521341469341106e-05, "loss": 0.861, "step": 2350 }, { "epoch": 0.7661027976577749, "grad_norm": 2.060730457305908, "learning_rate": 4.7510125497210386e-05, "loss": 0.8527, "step": 2355 }, { "epoch": 0.7677293428757319, "grad_norm": 2.024789571762085, "learning_rate": 4.749888553583175e-05, "loss": 0.8477, "step": 2360 }, { "epoch": 0.769355888093689, "grad_norm": 1.6188427209854126, "learning_rate": 4.7487621597183734e-05, "loss": 0.852, "step": 2365 }, { "epoch": 0.7709824333116461, "grad_norm": 2.30036997795105, "learning_rate": 4.747633369327043e-05, "loss": 0.8654, "step": 2370 }, { "epoch": 0.7726089785296031, "grad_norm": 2.2033796310424805, "learning_rate": 4.746502183612144e-05, "loss": 0.8451, "step": 2375 }, { "epoch": 0.7742355237475602, "grad_norm": 2.1281795501708984, "learning_rate": 4.745368603779193e-05, "loss": 0.9064, "step": 2380 }, { "epoch": 0.7758620689655172, "grad_norm": 2.251528263092041, "learning_rate": 4.7442326310362565e-05, "loss": 0.8481, "step": 2385 }, { "epoch": 0.7774886141834743, "grad_norm": 2.573554039001465, "learning_rate": 4.743094266593951e-05, "loss": 0.8699, "step": 2390 }, { "epoch": 0.7791151594014314, "grad_norm": 2.4464991092681885, "learning_rate": 4.74195351166544e-05, "loss": 0.9089, "step": 2395 }, { "epoch": 0.7807417046193884, "grad_norm": 2.2439992427825928, "learning_rate": 4.74081036746644e-05, "loss": 0.8616, "step": 2400 }, { "epoch": 0.7823682498373454, "grad_norm": 2.5337162017822266, "learning_rate": 4.7396648352152075e-05, "loss": 0.8716, "step": 2405 }, { "epoch": 0.7839947950553026, "grad_norm": 1.8930824995040894, "learning_rate": 4.7385169161325485e-05, "loss": 0.8365, "step": 2410 }, { "epoch": 0.7856213402732596, "grad_norm": 2.614783763885498, "learning_rate": 4.737366611441809e-05, "loss": 0.8967, "step": 2415 }, { "epoch": 0.7872478854912166, "grad_norm": 1.6895397901535034, "learning_rate": 4.736213922368881e-05, "loss": 0.8813, "step": 2420 }, { "epoch": 0.7888744307091737, "grad_norm": 4.0224223136901855, "learning_rate": 4.735058850142196e-05, "loss": 0.8602, "step": 2425 }, { "epoch": 0.7905009759271308, "grad_norm": 3.099029302597046, "learning_rate": 4.733901395992724e-05, "loss": 0.8604, "step": 2430 }, { "epoch": 0.7921275211450879, "grad_norm": 2.1614017486572266, "learning_rate": 4.732741561153974e-05, "loss": 0.87, "step": 2435 }, { "epoch": 0.7937540663630449, "grad_norm": 5.100318908691406, "learning_rate": 4.7315793468619954e-05, "loss": 0.8428, "step": 2440 }, { "epoch": 0.7953806115810019, "grad_norm": 3.786672830581665, "learning_rate": 4.730414754355368e-05, "loss": 0.8319, "step": 2445 }, { "epoch": 0.7970071567989591, "grad_norm": 2.4570653438568115, "learning_rate": 4.729247784875209e-05, "loss": 0.8752, "step": 2450 }, { "epoch": 0.7986337020169161, "grad_norm": 1.931662917137146, "learning_rate": 4.72807843966517e-05, "loss": 0.8473, "step": 2455 }, { "epoch": 0.8002602472348731, "grad_norm": 9.377140045166016, "learning_rate": 4.7269067199714324e-05, "loss": 0.8335, "step": 2460 }, { "epoch": 0.8018867924528302, "grad_norm": 2.5859813690185547, "learning_rate": 4.725732627042707e-05, "loss": 0.8586, "step": 2465 }, { "epoch": 0.8035133376707873, "grad_norm": 2.093609094619751, "learning_rate": 4.724556162130238e-05, "loss": 0.8479, "step": 2470 }, { "epoch": 0.8051398828887443, "grad_norm": 3.501932144165039, "learning_rate": 4.723377326487794e-05, "loss": 0.8601, "step": 2475 }, { "epoch": 0.8067664281067014, "grad_norm": 2.1503453254699707, "learning_rate": 4.7221961213716695e-05, "loss": 0.8555, "step": 2480 }, { "epoch": 0.8083929733246584, "grad_norm": 2.191429376602173, "learning_rate": 4.721012548040688e-05, "loss": 0.862, "step": 2485 }, { "epoch": 0.8100195185426154, "grad_norm": 2.2335188388824463, "learning_rate": 4.7198266077561925e-05, "loss": 0.8459, "step": 2490 }, { "epoch": 0.8116460637605726, "grad_norm": 2.0258047580718994, "learning_rate": 4.7186383017820516e-05, "loss": 0.8639, "step": 2495 }, { "epoch": 0.8132726089785296, "grad_norm": 3.4457333087921143, "learning_rate": 4.717447631384654e-05, "loss": 0.8673, "step": 2500 }, { "epoch": 0.8148991541964866, "grad_norm": 2.7301559448242188, "learning_rate": 4.7162545978329076e-05, "loss": 0.8611, "step": 2505 }, { "epoch": 0.8165256994144438, "grad_norm": 2.1690988540649414, "learning_rate": 4.715059202398239e-05, "loss": 0.8665, "step": 2510 }, { "epoch": 0.8181522446324008, "grad_norm": 2.5224900245666504, "learning_rate": 4.7138614463545924e-05, "loss": 0.8405, "step": 2515 }, { "epoch": 0.8197787898503578, "grad_norm": 2.2839272022247314, "learning_rate": 4.712661330978428e-05, "loss": 0.8488, "step": 2520 }, { "epoch": 0.8214053350683149, "grad_norm": 3.877641439437866, "learning_rate": 4.711458857548719e-05, "loss": 0.883, "step": 2525 }, { "epoch": 0.8230318802862719, "grad_norm": 2.267813205718994, "learning_rate": 4.710254027346953e-05, "loss": 0.8822, "step": 2530 }, { "epoch": 0.824658425504229, "grad_norm": 1.544561743736267, "learning_rate": 4.709046841657129e-05, "loss": 0.8479, "step": 2535 }, { "epoch": 0.8262849707221861, "grad_norm": 1.6922204494476318, "learning_rate": 4.707837301765754e-05, "loss": 0.8411, "step": 2540 }, { "epoch": 0.8279115159401431, "grad_norm": 1.6176890134811401, "learning_rate": 4.706625408961848e-05, "loss": 0.8749, "step": 2545 }, { "epoch": 0.8295380611581002, "grad_norm": 2.7533674240112305, "learning_rate": 4.705411164536935e-05, "loss": 0.8706, "step": 2550 }, { "epoch": 0.8311646063760573, "grad_norm": 1.8867239952087402, "learning_rate": 4.7041945697850466e-05, "loss": 0.8325, "step": 2555 }, { "epoch": 0.8327911515940143, "grad_norm": 2.392642021179199, "learning_rate": 4.7029756260027197e-05, "loss": 0.8796, "step": 2560 }, { "epoch": 0.8344176968119714, "grad_norm": 1.9180896282196045, "learning_rate": 4.701754334488993e-05, "loss": 0.8565, "step": 2565 }, { "epoch": 0.8360442420299284, "grad_norm": 2.039537191390991, "learning_rate": 4.700530696545409e-05, "loss": 0.8845, "step": 2570 }, { "epoch": 0.8376707872478855, "grad_norm": 1.9221045970916748, "learning_rate": 4.699304713476009e-05, "loss": 0.8876, "step": 2575 }, { "epoch": 0.8392973324658426, "grad_norm": 1.6282926797866821, "learning_rate": 4.698076386587335e-05, "loss": 0.8514, "step": 2580 }, { "epoch": 0.8409238776837996, "grad_norm": 2.0738115310668945, "learning_rate": 4.6968457171884274e-05, "loss": 0.8403, "step": 2585 }, { "epoch": 0.8425504229017566, "grad_norm": 2.1336333751678467, "learning_rate": 4.695612706590819e-05, "loss": 0.8634, "step": 2590 }, { "epoch": 0.8441769681197138, "grad_norm": 1.7246681451797485, "learning_rate": 4.6943773561085435e-05, "loss": 0.8568, "step": 2595 }, { "epoch": 0.8458035133376708, "grad_norm": 1.9486162662506104, "learning_rate": 4.693139667058123e-05, "loss": 0.8453, "step": 2600 }, { "epoch": 0.8474300585556278, "grad_norm": 2.4294521808624268, "learning_rate": 4.691899640758576e-05, "loss": 0.8708, "step": 2605 }, { "epoch": 0.8490566037735849, "grad_norm": 1.9123241901397705, "learning_rate": 4.690657278531409e-05, "loss": 0.8639, "step": 2610 }, { "epoch": 0.850683148991542, "grad_norm": 1.5926252603530884, "learning_rate": 4.689412581700618e-05, "loss": 0.8444, "step": 2615 }, { "epoch": 0.852309694209499, "grad_norm": 2.1733107566833496, "learning_rate": 4.6881655515926907e-05, "loss": 0.8815, "step": 2620 }, { "epoch": 0.8539362394274561, "grad_norm": 1.63432776927948, "learning_rate": 4.686916189536594e-05, "loss": 0.8626, "step": 2625 }, { "epoch": 0.8555627846454131, "grad_norm": 2.2323520183563232, "learning_rate": 4.685664496863789e-05, "loss": 0.8533, "step": 2630 }, { "epoch": 0.8571893298633702, "grad_norm": 1.6251574754714966, "learning_rate": 4.6844104749082144e-05, "loss": 0.8621, "step": 2635 }, { "epoch": 0.8588158750813273, "grad_norm": 1.9397878646850586, "learning_rate": 4.683154125006293e-05, "loss": 0.876, "step": 2640 }, { "epoch": 0.8604424202992843, "grad_norm": 1.4942872524261475, "learning_rate": 4.681895448496927e-05, "loss": 0.8755, "step": 2645 }, { "epoch": 0.8620689655172413, "grad_norm": 2.520001173019409, "learning_rate": 4.680634446721501e-05, "loss": 0.8728, "step": 2650 }, { "epoch": 0.8636955107351985, "grad_norm": 1.6082730293273926, "learning_rate": 4.679371121023877e-05, "loss": 0.85, "step": 2655 }, { "epoch": 0.8653220559531555, "grad_norm": 2.3203847408294678, "learning_rate": 4.678105472750391e-05, "loss": 0.8446, "step": 2660 }, { "epoch": 0.8669486011711126, "grad_norm": 2.4071483612060547, "learning_rate": 4.6768375032498577e-05, "loss": 0.882, "step": 2665 }, { "epoch": 0.8685751463890696, "grad_norm": 2.0908102989196777, "learning_rate": 4.675567213873563e-05, "loss": 0.9034, "step": 2670 }, { "epoch": 0.8702016916070267, "grad_norm": 1.8213086128234863, "learning_rate": 4.6742946059752654e-05, "loss": 0.8876, "step": 2675 }, { "epoch": 0.8718282368249838, "grad_norm": 2.208648681640625, "learning_rate": 4.673019680911196e-05, "loss": 0.861, "step": 2680 }, { "epoch": 0.8734547820429408, "grad_norm": 2.068134307861328, "learning_rate": 4.671742440040053e-05, "loss": 0.8674, "step": 2685 }, { "epoch": 0.8750813272608978, "grad_norm": 3.2821731567382812, "learning_rate": 4.670462884723005e-05, "loss": 0.834, "step": 2690 }, { "epoch": 0.876707872478855, "grad_norm": 1.8073285818099976, "learning_rate": 4.6691810163236855e-05, "loss": 0.8688, "step": 2695 }, { "epoch": 0.878334417696812, "grad_norm": 2.1804659366607666, "learning_rate": 4.667896836208192e-05, "loss": 0.8713, "step": 2700 }, { "epoch": 0.879960962914769, "grad_norm": 2.100541830062866, "learning_rate": 4.66661034574509e-05, "loss": 0.8517, "step": 2705 }, { "epoch": 0.8815875081327261, "grad_norm": 1.5606156587600708, "learning_rate": 4.665321546305401e-05, "loss": 0.8812, "step": 2710 }, { "epoch": 0.8832140533506831, "grad_norm": 1.7539852857589722, "learning_rate": 4.664030439262613e-05, "loss": 0.8356, "step": 2715 }, { "epoch": 0.8848405985686402, "grad_norm": 2.1108126640319824, "learning_rate": 4.6627370259926695e-05, "loss": 0.8198, "step": 2720 }, { "epoch": 0.8864671437865973, "grad_norm": 2.120741605758667, "learning_rate": 4.661441307873973e-05, "loss": 0.8813, "step": 2725 }, { "epoch": 0.8880936890045543, "grad_norm": 1.7935503721237183, "learning_rate": 4.660143286287381e-05, "loss": 0.8686, "step": 2730 }, { "epoch": 0.8897202342225113, "grad_norm": 1.799494981765747, "learning_rate": 4.6588429626162095e-05, "loss": 0.8386, "step": 2735 }, { "epoch": 0.8913467794404685, "grad_norm": 1.954468846321106, "learning_rate": 4.657540338246224e-05, "loss": 0.8909, "step": 2740 }, { "epoch": 0.8929733246584255, "grad_norm": 2.7000679969787598, "learning_rate": 4.656235414565644e-05, "loss": 0.8463, "step": 2745 }, { "epoch": 0.8945998698763825, "grad_norm": 1.9565421342849731, "learning_rate": 4.654928192965137e-05, "loss": 0.8411, "step": 2750 }, { "epoch": 0.8962264150943396, "grad_norm": 2.5602779388427734, "learning_rate": 4.653618674837823e-05, "loss": 0.8649, "step": 2755 }, { "epoch": 0.8978529603122967, "grad_norm": 1.341413974761963, "learning_rate": 4.652306861579266e-05, "loss": 0.8423, "step": 2760 }, { "epoch": 0.8994795055302537, "grad_norm": 1.5747458934783936, "learning_rate": 4.6509927545874795e-05, "loss": 0.8435, "step": 2765 }, { "epoch": 0.9011060507482108, "grad_norm": 2.1025686264038086, "learning_rate": 4.6496763552629174e-05, "loss": 0.8145, "step": 2770 }, { "epoch": 0.9027325959661678, "grad_norm": 2.007817506790161, "learning_rate": 4.6483576650084795e-05, "loss": 0.9037, "step": 2775 }, { "epoch": 0.904359141184125, "grad_norm": 1.7015758752822876, "learning_rate": 4.6470366852295057e-05, "loss": 0.8736, "step": 2780 }, { "epoch": 0.905985686402082, "grad_norm": 1.8775756359100342, "learning_rate": 4.645713417333777e-05, "loss": 0.8791, "step": 2785 }, { "epoch": 0.907612231620039, "grad_norm": 1.6114087104797363, "learning_rate": 4.644387862731511e-05, "loss": 0.8418, "step": 2790 }, { "epoch": 0.9092387768379961, "grad_norm": 2.247753620147705, "learning_rate": 4.643060022835365e-05, "loss": 0.8993, "step": 2795 }, { "epoch": 0.9108653220559532, "grad_norm": 1.7884836196899414, "learning_rate": 4.641729899060429e-05, "loss": 0.8357, "step": 2800 }, { "epoch": 0.9124918672739102, "grad_norm": 8.958685874938965, "learning_rate": 4.6403974928242286e-05, "loss": 0.8406, "step": 2805 }, { "epoch": 0.9141184124918673, "grad_norm": 1.895634412765503, "learning_rate": 4.639062805546721e-05, "loss": 0.8973, "step": 2810 }, { "epoch": 0.9157449577098243, "grad_norm": 1.6189725399017334, "learning_rate": 4.6377258386502956e-05, "loss": 0.8472, "step": 2815 }, { "epoch": 0.9173715029277814, "grad_norm": 6.746943950653076, "learning_rate": 4.636386593559769e-05, "loss": 0.8247, "step": 2820 }, { "epoch": 0.9189980481457385, "grad_norm": 2.1239964962005615, "learning_rate": 4.6350450717023886e-05, "loss": 0.8693, "step": 2825 }, { "epoch": 0.9206245933636955, "grad_norm": 1.823723554611206, "learning_rate": 4.6337012745078254e-05, "loss": 0.8896, "step": 2830 }, { "epoch": 0.9222511385816525, "grad_norm": 1.6684235334396362, "learning_rate": 4.6323552034081773e-05, "loss": 0.8505, "step": 2835 }, { "epoch": 0.9238776837996097, "grad_norm": 1.6994017362594604, "learning_rate": 4.631006859837964e-05, "loss": 0.8624, "step": 2840 }, { "epoch": 0.9255042290175667, "grad_norm": 2.0857439041137695, "learning_rate": 4.629656245234127e-05, "loss": 0.8559, "step": 2845 }, { "epoch": 0.9271307742355237, "grad_norm": 2.246621608734131, "learning_rate": 4.628303361036031e-05, "loss": 0.8567, "step": 2850 }, { "epoch": 0.9287573194534808, "grad_norm": 1.9449462890625, "learning_rate": 4.626948208685454e-05, "loss": 0.8432, "step": 2855 }, { "epoch": 0.9303838646714379, "grad_norm": 2.311030864715576, "learning_rate": 4.625590789626597e-05, "loss": 0.8541, "step": 2860 }, { "epoch": 0.9320104098893949, "grad_norm": 2.238831043243408, "learning_rate": 4.6242311053060725e-05, "loss": 0.8271, "step": 2865 }, { "epoch": 0.933636955107352, "grad_norm": 2.0517778396606445, "learning_rate": 4.6228691571729095e-05, "loss": 0.8739, "step": 2870 }, { "epoch": 0.935263500325309, "grad_norm": 1.7728257179260254, "learning_rate": 4.621504946678548e-05, "loss": 0.8727, "step": 2875 }, { "epoch": 0.936890045543266, "grad_norm": 2.1137027740478516, "learning_rate": 4.62013847527684e-05, "loss": 0.8521, "step": 2880 }, { "epoch": 0.9385165907612232, "grad_norm": 2.272860050201416, "learning_rate": 4.6187697444240466e-05, "loss": 0.8431, "step": 2885 }, { "epoch": 0.9401431359791802, "grad_norm": 1.9327101707458496, "learning_rate": 4.6173987555788366e-05, "loss": 0.872, "step": 2890 }, { "epoch": 0.9417696811971373, "grad_norm": 1.739867925643921, "learning_rate": 4.6160255102022865e-05, "loss": 0.8599, "step": 2895 }, { "epoch": 0.9433962264150944, "grad_norm": 1.8884609937667847, "learning_rate": 4.614650009757875e-05, "loss": 0.8507, "step": 2900 }, { "epoch": 0.9450227716330514, "grad_norm": 2.3399882316589355, "learning_rate": 4.613272255711486e-05, "loss": 0.8606, "step": 2905 }, { "epoch": 0.9466493168510085, "grad_norm": 2.2510006427764893, "learning_rate": 4.611892249531405e-05, "loss": 0.8765, "step": 2910 }, { "epoch": 0.9482758620689655, "grad_norm": 1.770300269126892, "learning_rate": 4.610509992688319e-05, "loss": 0.8449, "step": 2915 }, { "epoch": 0.9499024072869225, "grad_norm": 2.5434508323669434, "learning_rate": 4.609125486655308e-05, "loss": 0.863, "step": 2920 }, { "epoch": 0.9515289525048797, "grad_norm": 1.7364468574523926, "learning_rate": 4.607738732907856e-05, "loss": 0.8432, "step": 2925 }, { "epoch": 0.9531554977228367, "grad_norm": 2.0038013458251953, "learning_rate": 4.606349732923837e-05, "loss": 0.8606, "step": 2930 }, { "epoch": 0.9547820429407937, "grad_norm": 1.9203795194625854, "learning_rate": 4.604958488183523e-05, "loss": 0.8497, "step": 2935 }, { "epoch": 0.9564085881587508, "grad_norm": 1.5324605703353882, "learning_rate": 4.603565000169576e-05, "loss": 0.8482, "step": 2940 }, { "epoch": 0.9580351333767079, "grad_norm": 1.6473455429077148, "learning_rate": 4.602169270367048e-05, "loss": 0.844, "step": 2945 }, { "epoch": 0.9596616785946649, "grad_norm": 1.987318992614746, "learning_rate": 4.600771300263381e-05, "loss": 0.8641, "step": 2950 }, { "epoch": 0.961288223812622, "grad_norm": 1.7535080909729004, "learning_rate": 4.5993710913484065e-05, "loss": 0.8919, "step": 2955 }, { "epoch": 0.962914769030579, "grad_norm": 1.718846082687378, "learning_rate": 4.597968645114338e-05, "loss": 0.8473, "step": 2960 }, { "epoch": 0.9645413142485361, "grad_norm": 1.794784426689148, "learning_rate": 4.596563963055775e-05, "loss": 0.8786, "step": 2965 }, { "epoch": 0.9661678594664932, "grad_norm": 1.9121543169021606, "learning_rate": 4.5951570466697034e-05, "loss": 0.846, "step": 2970 }, { "epoch": 0.9677944046844502, "grad_norm": 1.9233866930007935, "learning_rate": 4.593747897455484e-05, "loss": 0.8646, "step": 2975 }, { "epoch": 0.9694209499024072, "grad_norm": 2.0550642013549805, "learning_rate": 4.59233651691486e-05, "loss": 0.8518, "step": 2980 }, { "epoch": 0.9710474951203644, "grad_norm": 1.9614688158035278, "learning_rate": 4.590922906551954e-05, "loss": 0.8722, "step": 2985 }, { "epoch": 0.9726740403383214, "grad_norm": 1.5085740089416504, "learning_rate": 4.5895070678732635e-05, "loss": 0.8485, "step": 2990 }, { "epoch": 0.9743005855562785, "grad_norm": 2.253807306289673, "learning_rate": 4.58808900238766e-05, "loss": 0.8831, "step": 2995 }, { "epoch": 0.9759271307742355, "grad_norm": 2.406322956085205, "learning_rate": 4.586668711606389e-05, "loss": 0.8215, "step": 3000 }, { "epoch": 0.9775536759921926, "grad_norm": 1.6362812519073486, "learning_rate": 4.585246197043068e-05, "loss": 0.8557, "step": 3005 }, { "epoch": 0.9791802212101497, "grad_norm": 2.0213871002197266, "learning_rate": 4.583821460213684e-05, "loss": 0.8706, "step": 3010 }, { "epoch": 0.9808067664281067, "grad_norm": 2.181262254714966, "learning_rate": 4.582394502636591e-05, "loss": 0.8551, "step": 3015 }, { "epoch": 0.9824333116460637, "grad_norm": 1.651312232017517, "learning_rate": 4.580965325832512e-05, "loss": 0.8624, "step": 3020 }, { "epoch": 0.9840598568640209, "grad_norm": 1.6381468772888184, "learning_rate": 4.579533931324533e-05, "loss": 0.8456, "step": 3025 }, { "epoch": 0.9856864020819779, "grad_norm": 1.8208574056625366, "learning_rate": 4.578100320638106e-05, "loss": 0.864, "step": 3030 }, { "epoch": 0.9873129472999349, "grad_norm": 1.6688610315322876, "learning_rate": 4.5766644953010406e-05, "loss": 0.8426, "step": 3035 }, { "epoch": 0.988939492517892, "grad_norm": 2.131165027618408, "learning_rate": 4.5752264568435104e-05, "loss": 0.8771, "step": 3040 }, { "epoch": 0.9905660377358491, "grad_norm": 1.726283311843872, "learning_rate": 4.573786206798046e-05, "loss": 0.8573, "step": 3045 }, { "epoch": 0.9921925829538061, "grad_norm": 2.4873554706573486, "learning_rate": 4.5723437466995364e-05, "loss": 0.8448, "step": 3050 }, { "epoch": 0.9938191281717632, "grad_norm": 2.203801155090332, "learning_rate": 4.570899078085223e-05, "loss": 0.8839, "step": 3055 }, { "epoch": 0.9954456733897202, "grad_norm": 1.5567647218704224, "learning_rate": 4.5694522024947026e-05, "loss": 0.8411, "step": 3060 }, { "epoch": 0.9970722186076773, "grad_norm": 2.006624221801758, "learning_rate": 4.5680031214699245e-05, "loss": 0.8666, "step": 3065 }, { "epoch": 0.9986987638256344, "grad_norm": 2.026901960372925, "learning_rate": 4.5665518365551876e-05, "loss": 0.8677, "step": 3070 }, { "epoch": 1.0, "eval_f1": 0.8113036588681131, "eval_loss": 0.416015625, "eval_precision": 0.8161222961369498, "eval_recall": 0.8094014323999219, "eval_runtime": 1031.3491, "eval_samples_per_second": 381.476, "eval_steps_per_second": 0.746, "step": 3074 }, { "epoch": 1.0003253090435915, "grad_norm": 1.9352390766143799, "learning_rate": 4.565098349297139e-05, "loss": 0.875, "step": 3075 }, { "epoch": 1.0019518542615484, "grad_norm": 1.7438503503799438, "learning_rate": 4.5636426612447735e-05, "loss": 0.8357, "step": 3080 }, { "epoch": 1.0035783994795056, "grad_norm": 1.6971402168273926, "learning_rate": 4.5621847739494315e-05, "loss": 0.8136, "step": 3085 }, { "epoch": 1.0052049446974627, "grad_norm": 1.4777500629425049, "learning_rate": 4.560724688964797e-05, "loss": 0.8161, "step": 3090 }, { "epoch": 1.0068314899154196, "grad_norm": 2.5030698776245117, "learning_rate": 4.559262407846896e-05, "loss": 0.7944, "step": 3095 }, { "epoch": 1.0084580351333767, "grad_norm": 1.8135157823562622, "learning_rate": 4.5577979321540956e-05, "loss": 0.7664, "step": 3100 }, { "epoch": 1.0100845803513339, "grad_norm": 2.1230430603027344, "learning_rate": 4.5563312634471e-05, "loss": 0.815, "step": 3105 }, { "epoch": 1.0117111255692908, "grad_norm": 1.7101008892059326, "learning_rate": 4.5548624032889515e-05, "loss": 0.8123, "step": 3110 }, { "epoch": 1.013337670787248, "grad_norm": 1.7563308477401733, "learning_rate": 4.553391353245028e-05, "loss": 0.8166, "step": 3115 }, { "epoch": 1.014964216005205, "grad_norm": 1.9305217266082764, "learning_rate": 4.551918114883042e-05, "loss": 0.8292, "step": 3120 }, { "epoch": 1.016590761223162, "grad_norm": 1.791092872619629, "learning_rate": 4.550442689773036e-05, "loss": 0.8163, "step": 3125 }, { "epoch": 1.018217306441119, "grad_norm": 1.7695600986480713, "learning_rate": 4.5489650794873853e-05, "loss": 0.7985, "step": 3130 }, { "epoch": 1.0198438516590762, "grad_norm": 2.181884765625, "learning_rate": 4.547485285600791e-05, "loss": 0.7809, "step": 3135 }, { "epoch": 1.0214703968770331, "grad_norm": 1.7219114303588867, "learning_rate": 4.546003309690285e-05, "loss": 0.8179, "step": 3140 }, { "epoch": 1.0230969420949902, "grad_norm": 1.8673876523971558, "learning_rate": 4.544519153335222e-05, "loss": 0.7763, "step": 3145 }, { "epoch": 1.0247234873129474, "grad_norm": 1.8142812252044678, "learning_rate": 4.54303281811728e-05, "loss": 0.8538, "step": 3150 }, { "epoch": 1.0263500325309043, "grad_norm": 1.7296979427337646, "learning_rate": 4.541544305620462e-05, "loss": 0.8141, "step": 3155 }, { "epoch": 1.0279765777488614, "grad_norm": 2.0520095825195312, "learning_rate": 4.5400536174310875e-05, "loss": 0.8305, "step": 3160 }, { "epoch": 1.0296031229668186, "grad_norm": 1.7631771564483643, "learning_rate": 4.538560755137798e-05, "loss": 0.7926, "step": 3165 }, { "epoch": 1.0312296681847755, "grad_norm": 2.381673812866211, "learning_rate": 4.537065720331551e-05, "loss": 0.7962, "step": 3170 }, { "epoch": 1.0328562134027326, "grad_norm": 1.8267555236816406, "learning_rate": 4.535568514605617e-05, "loss": 0.7908, "step": 3175 }, { "epoch": 1.0344827586206897, "grad_norm": 2.032332420349121, "learning_rate": 4.5340691395555835e-05, "loss": 0.8091, "step": 3180 }, { "epoch": 1.0361093038386466, "grad_norm": 1.757824420928955, "learning_rate": 4.532567596779348e-05, "loss": 0.8042, "step": 3185 }, { "epoch": 1.0377358490566038, "grad_norm": 2.0004148483276367, "learning_rate": 4.531063887877118e-05, "loss": 0.8403, "step": 3190 }, { "epoch": 1.039362394274561, "grad_norm": 2.3177530765533447, "learning_rate": 4.52955801445141e-05, "loss": 0.815, "step": 3195 }, { "epoch": 1.0409889394925178, "grad_norm": 1.7317501306533813, "learning_rate": 4.528049978107047e-05, "loss": 0.8243, "step": 3200 }, { "epoch": 1.042615484710475, "grad_norm": 1.8553853034973145, "learning_rate": 4.5265397804511575e-05, "loss": 0.8064, "step": 3205 }, { "epoch": 1.044242029928432, "grad_norm": 1.816109538078308, "learning_rate": 4.525027423093174e-05, "loss": 0.8166, "step": 3210 }, { "epoch": 1.045868575146389, "grad_norm": 1.9716473817825317, "learning_rate": 4.523512907644828e-05, "loss": 0.8125, "step": 3215 }, { "epoch": 1.047495120364346, "grad_norm": 2.2402427196502686, "learning_rate": 4.521996235720154e-05, "loss": 0.7953, "step": 3220 }, { "epoch": 1.0491216655823032, "grad_norm": 2.047614812850952, "learning_rate": 4.520477408935482e-05, "loss": 0.8102, "step": 3225 }, { "epoch": 1.0507482108002602, "grad_norm": 1.8117738962173462, "learning_rate": 4.51895642890944e-05, "loss": 0.8187, "step": 3230 }, { "epoch": 1.0523747560182173, "grad_norm": 1.726954460144043, "learning_rate": 4.517433297262951e-05, "loss": 0.8269, "step": 3235 }, { "epoch": 1.0540013012361744, "grad_norm": 2.0745491981506348, "learning_rate": 4.51590801561923e-05, "loss": 0.83, "step": 3240 }, { "epoch": 1.0556278464541313, "grad_norm": 1.9231599569320679, "learning_rate": 4.514380585603783e-05, "loss": 0.8036, "step": 3245 }, { "epoch": 1.0572543916720885, "grad_norm": 2.1803252696990967, "learning_rate": 4.512851008844408e-05, "loss": 0.8117, "step": 3250 }, { "epoch": 1.0588809368900456, "grad_norm": 1.8261420726776123, "learning_rate": 4.5113192869711867e-05, "loss": 0.8199, "step": 3255 }, { "epoch": 1.0605074821080025, "grad_norm": 1.6798607110977173, "learning_rate": 4.509785421616492e-05, "loss": 0.8419, "step": 3260 }, { "epoch": 1.0621340273259596, "grad_norm": 1.9406812191009521, "learning_rate": 4.508249414414975e-05, "loss": 0.8361, "step": 3265 }, { "epoch": 1.0637605725439168, "grad_norm": 1.4756301641464233, "learning_rate": 4.506711267003575e-05, "loss": 0.8085, "step": 3270 }, { "epoch": 1.065387117761874, "grad_norm": 1.636296272277832, "learning_rate": 4.50517098102151e-05, "loss": 0.8001, "step": 3275 }, { "epoch": 1.0670136629798308, "grad_norm": 3.0141994953155518, "learning_rate": 4.503628558110276e-05, "loss": 0.79, "step": 3280 }, { "epoch": 1.068640208197788, "grad_norm": 1.9245976209640503, "learning_rate": 4.502083999913648e-05, "loss": 0.8144, "step": 3285 }, { "epoch": 1.070266753415745, "grad_norm": 2.0785765647888184, "learning_rate": 4.500537308077675e-05, "loss": 0.8126, "step": 3290 }, { "epoch": 1.071893298633702, "grad_norm": 2.4258196353912354, "learning_rate": 4.498988484250681e-05, "loss": 0.8266, "step": 3295 }, { "epoch": 1.073519843851659, "grad_norm": 1.7474132776260376, "learning_rate": 4.4974375300832614e-05, "loss": 0.8128, "step": 3300 }, { "epoch": 1.0751463890696162, "grad_norm": 1.7980300188064575, "learning_rate": 4.4958844472282845e-05, "loss": 0.8257, "step": 3305 }, { "epoch": 1.0767729342875731, "grad_norm": 1.6487613916397095, "learning_rate": 4.494329237340882e-05, "loss": 0.7875, "step": 3310 }, { "epoch": 1.0783994795055303, "grad_norm": 2.1843864917755127, "learning_rate": 4.4927719020784575e-05, "loss": 0.8347, "step": 3315 }, { "epoch": 1.0800260247234874, "grad_norm": 1.9588794708251953, "learning_rate": 4.491212443100677e-05, "loss": 0.7839, "step": 3320 }, { "epoch": 1.0816525699414443, "grad_norm": 2.564725160598755, "learning_rate": 4.48965086206947e-05, "loss": 0.8277, "step": 3325 }, { "epoch": 1.0832791151594015, "grad_norm": 2.135406255722046, "learning_rate": 4.4880871606490274e-05, "loss": 0.766, "step": 3330 }, { "epoch": 1.0849056603773586, "grad_norm": 1.9069466590881348, "learning_rate": 4.4865213405058014e-05, "loss": 0.8192, "step": 3335 }, { "epoch": 1.0865322055953155, "grad_norm": 1.691408395767212, "learning_rate": 4.4849534033084996e-05, "loss": 0.801, "step": 3340 }, { "epoch": 1.0881587508132726, "grad_norm": 1.6413347721099854, "learning_rate": 4.4833833507280884e-05, "loss": 0.8187, "step": 3345 }, { "epoch": 1.0897852960312298, "grad_norm": 1.7701938152313232, "learning_rate": 4.481811184437786e-05, "loss": 0.8126, "step": 3350 }, { "epoch": 1.0914118412491867, "grad_norm": 2.274620294570923, "learning_rate": 4.480236906113066e-05, "loss": 0.8246, "step": 3355 }, { "epoch": 1.0930383864671438, "grad_norm": 1.8796124458312988, "learning_rate": 4.478660517431649e-05, "loss": 0.8227, "step": 3360 }, { "epoch": 1.094664931685101, "grad_norm": 2.04233455657959, "learning_rate": 4.477082020073509e-05, "loss": 0.8392, "step": 3365 }, { "epoch": 1.0962914769030578, "grad_norm": 1.7941282987594604, "learning_rate": 4.475501415720864e-05, "loss": 0.7835, "step": 3370 }, { "epoch": 1.097918022121015, "grad_norm": 2.058685302734375, "learning_rate": 4.47391870605818e-05, "loss": 0.7946, "step": 3375 }, { "epoch": 1.099544567338972, "grad_norm": 1.6518946886062622, "learning_rate": 4.4723338927721634e-05, "loss": 0.7946, "step": 3380 }, { "epoch": 1.101171112556929, "grad_norm": 2.23441481590271, "learning_rate": 4.4707469775517664e-05, "loss": 0.7921, "step": 3385 }, { "epoch": 1.1027976577748861, "grad_norm": 2.3232648372650146, "learning_rate": 4.469157962088177e-05, "loss": 0.853, "step": 3390 }, { "epoch": 1.1044242029928433, "grad_norm": 2.5304653644561768, "learning_rate": 4.467566848074825e-05, "loss": 0.8243, "step": 3395 }, { "epoch": 1.1060507482108002, "grad_norm": 1.7827421426773071, "learning_rate": 4.4659736372073744e-05, "loss": 0.7872, "step": 3400 }, { "epoch": 1.1076772934287573, "grad_norm": 1.954492211341858, "learning_rate": 4.4643783311837254e-05, "loss": 0.7794, "step": 3405 }, { "epoch": 1.1093038386467144, "grad_norm": 2.82947039604187, "learning_rate": 4.462780931704011e-05, "loss": 0.8311, "step": 3410 }, { "epoch": 1.1109303838646714, "grad_norm": 1.5368119478225708, "learning_rate": 4.461181440470592e-05, "loss": 0.7937, "step": 3415 }, { "epoch": 1.1125569290826285, "grad_norm": 1.84377121925354, "learning_rate": 4.4595798591880635e-05, "loss": 0.7949, "step": 3420 }, { "epoch": 1.1141834743005856, "grad_norm": 2.373713970184326, "learning_rate": 4.457976189563244e-05, "loss": 0.8294, "step": 3425 }, { "epoch": 1.1158100195185425, "grad_norm": 3.3402259349823, "learning_rate": 4.4563704333051795e-05, "loss": 0.8156, "step": 3430 }, { "epoch": 1.1174365647364997, "grad_norm": 2.2353696823120117, "learning_rate": 4.454762592125139e-05, "loss": 0.8355, "step": 3435 }, { "epoch": 1.1190631099544568, "grad_norm": 1.695486068725586, "learning_rate": 4.453152667736614e-05, "loss": 0.8266, "step": 3440 }, { "epoch": 1.1206896551724137, "grad_norm": 1.7904291152954102, "learning_rate": 4.451540661855315e-05, "loss": 0.8075, "step": 3445 }, { "epoch": 1.1223162003903708, "grad_norm": 3.15537166595459, "learning_rate": 4.449926576199173e-05, "loss": 0.8403, "step": 3450 }, { "epoch": 1.123942745608328, "grad_norm": 2.273641586303711, "learning_rate": 4.4483104124883324e-05, "loss": 0.827, "step": 3455 }, { "epoch": 1.1255692908262849, "grad_norm": 2.7670817375183105, "learning_rate": 4.4466921724451535e-05, "loss": 0.8333, "step": 3460 }, { "epoch": 1.127195836044242, "grad_norm": 1.8045217990875244, "learning_rate": 4.4450718577942114e-05, "loss": 0.7822, "step": 3465 }, { "epoch": 1.1288223812621991, "grad_norm": 3.31329345703125, "learning_rate": 4.443449470262289e-05, "loss": 0.8287, "step": 3470 }, { "epoch": 1.130448926480156, "grad_norm": 3.028852939605713, "learning_rate": 4.44182501157838e-05, "loss": 0.8143, "step": 3475 }, { "epoch": 1.1320754716981132, "grad_norm": 1.77473783493042, "learning_rate": 4.440198483473684e-05, "loss": 0.8133, "step": 3480 }, { "epoch": 1.1337020169160703, "grad_norm": 2.183030366897583, "learning_rate": 4.4385698876816095e-05, "loss": 0.7916, "step": 3485 }, { "epoch": 1.1353285621340272, "grad_norm": 1.941740870475769, "learning_rate": 4.436939225937764e-05, "loss": 0.7976, "step": 3490 }, { "epoch": 1.1369551073519844, "grad_norm": 1.8414844274520874, "learning_rate": 4.4353064999799583e-05, "loss": 0.834, "step": 3495 }, { "epoch": 1.1385816525699415, "grad_norm": 3.1121630668640137, "learning_rate": 4.4336717115482044e-05, "loss": 0.7853, "step": 3500 }, { "epoch": 1.1402081977878984, "grad_norm": 2.0056183338165283, "learning_rate": 4.432034862384712e-05, "loss": 0.8098, "step": 3505 }, { "epoch": 1.1418347430058555, "grad_norm": 4.22079610824585, "learning_rate": 4.4303959542338854e-05, "loss": 0.811, "step": 3510 }, { "epoch": 1.1434612882238127, "grad_norm": 8.928543090820312, "learning_rate": 4.428754988842324e-05, "loss": 0.8039, "step": 3515 }, { "epoch": 1.1450878334417696, "grad_norm": 2.357715368270874, "learning_rate": 4.4271119679588204e-05, "loss": 0.8244, "step": 3520 }, { "epoch": 1.1467143786597267, "grad_norm": 2.109389305114746, "learning_rate": 4.4254668933343566e-05, "loss": 0.7981, "step": 3525 }, { "epoch": 1.1483409238776838, "grad_norm": 1.7039803266525269, "learning_rate": 4.423819766722104e-05, "loss": 0.7762, "step": 3530 }, { "epoch": 1.1499674690956407, "grad_norm": 2.0489721298217773, "learning_rate": 4.4221705898774196e-05, "loss": 0.8192, "step": 3535 }, { "epoch": 1.1515940143135979, "grad_norm": 1.818631649017334, "learning_rate": 4.420519364557848e-05, "loss": 0.8002, "step": 3540 }, { "epoch": 1.153220559531555, "grad_norm": 1.7001547813415527, "learning_rate": 4.418866092523114e-05, "loss": 0.8348, "step": 3545 }, { "epoch": 1.1548471047495121, "grad_norm": 2.8383827209472656, "learning_rate": 4.4172107755351256e-05, "loss": 0.8103, "step": 3550 }, { "epoch": 1.156473649967469, "grad_norm": 2.1644821166992188, "learning_rate": 4.415553415357969e-05, "loss": 0.7796, "step": 3555 }, { "epoch": 1.1581001951854262, "grad_norm": 3.3231115341186523, "learning_rate": 4.4138940137579084e-05, "loss": 0.7834, "step": 3560 }, { "epoch": 1.1597267404033833, "grad_norm": 1.8891205787658691, "learning_rate": 4.412232572503383e-05, "loss": 0.8016, "step": 3565 }, { "epoch": 1.1613532856213402, "grad_norm": 1.6741869449615479, "learning_rate": 4.4105690933650055e-05, "loss": 0.8139, "step": 3570 }, { "epoch": 1.1629798308392973, "grad_norm": 1.8224472999572754, "learning_rate": 4.4089035781155624e-05, "loss": 0.805, "step": 3575 }, { "epoch": 1.1646063760572545, "grad_norm": 1.8925952911376953, "learning_rate": 4.4072360285300076e-05, "loss": 0.799, "step": 3580 }, { "epoch": 1.1662329212752114, "grad_norm": 1.6234843730926514, "learning_rate": 4.405566446385464e-05, "loss": 0.8237, "step": 3585 }, { "epoch": 1.1678594664931685, "grad_norm": 1.7579517364501953, "learning_rate": 4.4038948334612215e-05, "loss": 0.7959, "step": 3590 }, { "epoch": 1.1694860117111257, "grad_norm": 1.6992871761322021, "learning_rate": 4.4022211915387324e-05, "loss": 0.7827, "step": 3595 }, { "epoch": 1.1711125569290826, "grad_norm": 1.990018606185913, "learning_rate": 4.400545522401613e-05, "loss": 0.8171, "step": 3600 }, { "epoch": 1.1727391021470397, "grad_norm": 1.8367072343826294, "learning_rate": 4.3988678278356386e-05, "loss": 0.7709, "step": 3605 }, { "epoch": 1.1743656473649968, "grad_norm": 1.8837988376617432, "learning_rate": 4.3971881096287446e-05, "loss": 0.7936, "step": 3610 }, { "epoch": 1.1759921925829537, "grad_norm": 2.361079454421997, "learning_rate": 4.395506369571022e-05, "loss": 0.8119, "step": 3615 }, { "epoch": 1.1776187378009109, "grad_norm": 2.039238214492798, "learning_rate": 4.3938226094547156e-05, "loss": 0.8025, "step": 3620 }, { "epoch": 1.179245283018868, "grad_norm": 1.535110354423523, "learning_rate": 4.392136831074225e-05, "loss": 0.8029, "step": 3625 }, { "epoch": 1.180871828236825, "grad_norm": 1.8643568754196167, "learning_rate": 4.3904490362260994e-05, "loss": 0.8278, "step": 3630 }, { "epoch": 1.182498373454782, "grad_norm": 1.7051054239273071, "learning_rate": 4.388759226709038e-05, "loss": 0.803, "step": 3635 }, { "epoch": 1.1841249186727392, "grad_norm": 1.8260678052902222, "learning_rate": 4.3870674043238866e-05, "loss": 0.8136, "step": 3640 }, { "epoch": 1.185751463890696, "grad_norm": 2.171731948852539, "learning_rate": 4.3853735708736346e-05, "loss": 0.8347, "step": 3645 }, { "epoch": 1.1873780091086532, "grad_norm": 1.6170158386230469, "learning_rate": 4.3836777281634156e-05, "loss": 0.7931, "step": 3650 }, { "epoch": 1.1890045543266103, "grad_norm": 2.0424461364746094, "learning_rate": 4.381979878000506e-05, "loss": 0.8187, "step": 3655 }, { "epoch": 1.1906310995445673, "grad_norm": 1.7460720539093018, "learning_rate": 4.38028002219432e-05, "loss": 0.7913, "step": 3660 }, { "epoch": 1.1922576447625244, "grad_norm": 1.7360622882843018, "learning_rate": 4.378578162556409e-05, "loss": 0.8587, "step": 3665 }, { "epoch": 1.1938841899804815, "grad_norm": 1.6812613010406494, "learning_rate": 4.376874300900462e-05, "loss": 0.7885, "step": 3670 }, { "epoch": 1.1955107351984384, "grad_norm": 1.8114211559295654, "learning_rate": 4.3751684390422984e-05, "loss": 0.8399, "step": 3675 }, { "epoch": 1.1971372804163956, "grad_norm": 2.1626474857330322, "learning_rate": 4.3734605787998715e-05, "loss": 0.8353, "step": 3680 }, { "epoch": 1.1987638256343527, "grad_norm": 2.1298441886901855, "learning_rate": 4.371750721993264e-05, "loss": 0.7979, "step": 3685 }, { "epoch": 1.2003903708523098, "grad_norm": 2.7537834644317627, "learning_rate": 4.3700388704446856e-05, "loss": 0.8168, "step": 3690 }, { "epoch": 1.2020169160702667, "grad_norm": 2.0105204582214355, "learning_rate": 4.368325025978472e-05, "loss": 0.7984, "step": 3695 }, { "epoch": 1.2036434612882239, "grad_norm": 1.7766393423080444, "learning_rate": 4.3666091904210836e-05, "loss": 0.7903, "step": 3700 }, { "epoch": 1.205270006506181, "grad_norm": 1.8659816980361938, "learning_rate": 4.364891365601102e-05, "loss": 0.8235, "step": 3705 }, { "epoch": 1.206896551724138, "grad_norm": 1.7383310794830322, "learning_rate": 4.3631715533492296e-05, "loss": 0.8176, "step": 3710 }, { "epoch": 1.208523096942095, "grad_norm": 1.6841621398925781, "learning_rate": 4.361449755498285e-05, "loss": 0.8118, "step": 3715 }, { "epoch": 1.2101496421600522, "grad_norm": 1.9598522186279297, "learning_rate": 4.359725973883204e-05, "loss": 0.7958, "step": 3720 }, { "epoch": 1.211776187378009, "grad_norm": 2.602379560470581, "learning_rate": 4.3580002103410375e-05, "loss": 0.8183, "step": 3725 }, { "epoch": 1.2134027325959662, "grad_norm": 1.8115839958190918, "learning_rate": 4.356272466710947e-05, "loss": 0.7928, "step": 3730 }, { "epoch": 1.2150292778139233, "grad_norm": 1.8908313512802124, "learning_rate": 4.354542744834204e-05, "loss": 0.8089, "step": 3735 }, { "epoch": 1.2166558230318802, "grad_norm": 2.1180243492126465, "learning_rate": 4.352811046554191e-05, "loss": 0.8151, "step": 3740 }, { "epoch": 1.2182823682498374, "grad_norm": 1.7875014543533325, "learning_rate": 4.351077373716393e-05, "loss": 0.7996, "step": 3745 }, { "epoch": 1.2199089134677945, "grad_norm": 2.1548900604248047, "learning_rate": 4.349341728168402e-05, "loss": 0.7836, "step": 3750 }, { "epoch": 1.2215354586857514, "grad_norm": 1.9586331844329834, "learning_rate": 4.347604111759911e-05, "loss": 0.82, "step": 3755 }, { "epoch": 1.2231620039037086, "grad_norm": 1.666062593460083, "learning_rate": 4.3458645263427146e-05, "loss": 0.8127, "step": 3760 }, { "epoch": 1.2247885491216657, "grad_norm": 1.6794251203536987, "learning_rate": 4.3441229737707034e-05, "loss": 0.8092, "step": 3765 }, { "epoch": 1.2264150943396226, "grad_norm": 2.466075897216797, "learning_rate": 4.3423794558998675e-05, "loss": 0.7803, "step": 3770 }, { "epoch": 1.2280416395575797, "grad_norm": 1.6132360696792603, "learning_rate": 4.340633974588289e-05, "loss": 0.7889, "step": 3775 }, { "epoch": 1.2296681847755369, "grad_norm": 1.8046766519546509, "learning_rate": 4.338886531696145e-05, "loss": 0.8327, "step": 3780 }, { "epoch": 1.2312947299934938, "grad_norm": 2.1257879734039307, "learning_rate": 4.3371371290856996e-05, "loss": 0.8275, "step": 3785 }, { "epoch": 1.232921275211451, "grad_norm": 1.8193806409835815, "learning_rate": 4.335385768621307e-05, "loss": 0.8043, "step": 3790 }, { "epoch": 1.234547820429408, "grad_norm": 1.484757661819458, "learning_rate": 4.3336324521694104e-05, "loss": 0.836, "step": 3795 }, { "epoch": 1.236174365647365, "grad_norm": 1.6302132606506348, "learning_rate": 4.331877181598534e-05, "loss": 0.8157, "step": 3800 }, { "epoch": 1.237800910865322, "grad_norm": 3.4269461631774902, "learning_rate": 4.330119958779285e-05, "loss": 0.8175, "step": 3805 }, { "epoch": 1.2394274560832792, "grad_norm": 1.798671841621399, "learning_rate": 4.328360785584353e-05, "loss": 0.7987, "step": 3810 }, { "epoch": 1.241054001301236, "grad_norm": 4.2418413162231445, "learning_rate": 4.3265996638885054e-05, "loss": 0.8141, "step": 3815 }, { "epoch": 1.2426805465191932, "grad_norm": 1.7764673233032227, "learning_rate": 4.3248365955685856e-05, "loss": 0.8297, "step": 3820 }, { "epoch": 1.2443070917371504, "grad_norm": 1.755462408065796, "learning_rate": 4.32307158250351e-05, "loss": 0.819, "step": 3825 }, { "epoch": 1.2459336369551073, "grad_norm": 2.2177834510803223, "learning_rate": 4.321304626574271e-05, "loss": 0.7973, "step": 3830 }, { "epoch": 1.2475601821730644, "grad_norm": 2.8550915718078613, "learning_rate": 4.319535729663929e-05, "loss": 0.7799, "step": 3835 }, { "epoch": 1.2491867273910215, "grad_norm": 1.7983542680740356, "learning_rate": 4.317764893657616e-05, "loss": 0.7766, "step": 3840 }, { "epoch": 1.2508132726089785, "grad_norm": 2.1239964962005615, "learning_rate": 4.3159921204425246e-05, "loss": 0.8215, "step": 3845 }, { "epoch": 1.2524398178269356, "grad_norm": 2.1900477409362793, "learning_rate": 4.3142174119079175e-05, "loss": 0.8195, "step": 3850 }, { "epoch": 1.2540663630448927, "grad_norm": 1.7720361948013306, "learning_rate": 4.3124407699451174e-05, "loss": 0.8211, "step": 3855 }, { "epoch": 1.2556929082628496, "grad_norm": 2.2531039714813232, "learning_rate": 4.310662196447509e-05, "loss": 0.8173, "step": 3860 }, { "epoch": 1.2573194534808068, "grad_norm": 1.7785147428512573, "learning_rate": 4.3088816933105336e-05, "loss": 0.8015, "step": 3865 }, { "epoch": 1.258945998698764, "grad_norm": 1.6724070310592651, "learning_rate": 4.3070992624316895e-05, "loss": 0.8426, "step": 3870 }, { "epoch": 1.2605725439167208, "grad_norm": 1.6628553867340088, "learning_rate": 4.305314905710531e-05, "loss": 0.8194, "step": 3875 }, { "epoch": 1.262199089134678, "grad_norm": 2.1507434844970703, "learning_rate": 4.3035286250486616e-05, "loss": 0.8368, "step": 3880 }, { "epoch": 1.263825634352635, "grad_norm": 1.816701054573059, "learning_rate": 4.3017404223497385e-05, "loss": 0.8028, "step": 3885 }, { "epoch": 1.265452179570592, "grad_norm": 1.667349100112915, "learning_rate": 4.299950299519465e-05, "loss": 0.835, "step": 3890 }, { "epoch": 1.267078724788549, "grad_norm": 1.932996392250061, "learning_rate": 4.298158258465592e-05, "loss": 0.8239, "step": 3895 }, { "epoch": 1.2687052700065062, "grad_norm": 2.326045513153076, "learning_rate": 4.296364301097914e-05, "loss": 0.7889, "step": 3900 }, { "epoch": 1.2703318152244631, "grad_norm": 2.015550374984741, "learning_rate": 4.2945684293282685e-05, "loss": 0.8212, "step": 3905 }, { "epoch": 1.2719583604424203, "grad_norm": 1.7425355911254883, "learning_rate": 4.2927706450705305e-05, "loss": 0.8129, "step": 3910 }, { "epoch": 1.2735849056603774, "grad_norm": 1.7180896997451782, "learning_rate": 4.290970950240617e-05, "loss": 0.7754, "step": 3915 }, { "epoch": 1.2752114508783343, "grad_norm": 1.7110702991485596, "learning_rate": 4.28916934675648e-05, "loss": 0.8199, "step": 3920 }, { "epoch": 1.2768379960962914, "grad_norm": 1.8523958921432495, "learning_rate": 4.2873658365381026e-05, "loss": 0.8303, "step": 3925 }, { "epoch": 1.2784645413142486, "grad_norm": 1.8597928285598755, "learning_rate": 4.285560421507504e-05, "loss": 0.8032, "step": 3930 }, { "epoch": 1.2800910865322055, "grad_norm": 1.6834561824798584, "learning_rate": 4.2837531035887305e-05, "loss": 0.8038, "step": 3935 }, { "epoch": 1.2817176317501626, "grad_norm": 2.0416834354400635, "learning_rate": 4.281943884707859e-05, "loss": 0.8087, "step": 3940 }, { "epoch": 1.2833441769681198, "grad_norm": 1.819843053817749, "learning_rate": 4.280132766792989e-05, "loss": 0.8153, "step": 3945 }, { "epoch": 1.2849707221860767, "grad_norm": 1.82972252368927, "learning_rate": 4.2783197517742464e-05, "loss": 0.7974, "step": 3950 }, { "epoch": 1.2865972674040338, "grad_norm": 1.898067831993103, "learning_rate": 4.276504841583778e-05, "loss": 0.7882, "step": 3955 }, { "epoch": 1.288223812621991, "grad_norm": 2.071033477783203, "learning_rate": 4.27468803815575e-05, "loss": 0.8017, "step": 3960 }, { "epoch": 1.2898503578399478, "grad_norm": 1.8522893190383911, "learning_rate": 4.2728693434263476e-05, "loss": 0.8115, "step": 3965 }, { "epoch": 1.291476903057905, "grad_norm": 1.7110276222229004, "learning_rate": 4.2710487593337684e-05, "loss": 0.8063, "step": 3970 }, { "epoch": 1.293103448275862, "grad_norm": 1.731191873550415, "learning_rate": 4.269226287818228e-05, "loss": 0.7753, "step": 3975 }, { "epoch": 1.294729993493819, "grad_norm": 2.6229982376098633, "learning_rate": 4.2674019308219484e-05, "loss": 0.8478, "step": 3980 }, { "epoch": 1.2963565387117761, "grad_norm": 1.8522100448608398, "learning_rate": 4.2655756902891665e-05, "loss": 0.8106, "step": 3985 }, { "epoch": 1.2979830839297333, "grad_norm": 2.134326457977295, "learning_rate": 4.2637475681661214e-05, "loss": 0.8006, "step": 3990 }, { "epoch": 1.2996096291476902, "grad_norm": 2.10435152053833, "learning_rate": 4.261917566401061e-05, "loss": 0.857, "step": 3995 }, { "epoch": 1.3012361743656473, "grad_norm": 1.500496745109558, "learning_rate": 4.260085686944235e-05, "loss": 0.8273, "step": 4000 }, { "epoch": 1.3028627195836044, "grad_norm": 1.9462052583694458, "learning_rate": 4.258251931747893e-05, "loss": 0.7947, "step": 4005 }, { "epoch": 1.3044892648015614, "grad_norm": 3.217682361602783, "learning_rate": 4.256416302766286e-05, "loss": 0.7836, "step": 4010 }, { "epoch": 1.3061158100195185, "grad_norm": 1.6719838380813599, "learning_rate": 4.25457880195566e-05, "loss": 0.8, "step": 4015 }, { "epoch": 1.3077423552374756, "grad_norm": 2.3355538845062256, "learning_rate": 4.2527394312742574e-05, "loss": 0.8194, "step": 4020 }, { "epoch": 1.3093689004554325, "grad_norm": 2.0860061645507812, "learning_rate": 4.250898192682311e-05, "loss": 0.8147, "step": 4025 }, { "epoch": 1.3109954456733897, "grad_norm": 1.890384316444397, "learning_rate": 4.249055088142047e-05, "loss": 0.808, "step": 4030 }, { "epoch": 1.3126219908913468, "grad_norm": 2.144073009490967, "learning_rate": 4.247210119617679e-05, "loss": 0.7904, "step": 4035 }, { "epoch": 1.3142485361093037, "grad_norm": 2.1934149265289307, "learning_rate": 4.245363289075406e-05, "loss": 0.8062, "step": 4040 }, { "epoch": 1.3158750813272608, "grad_norm": 1.7943332195281982, "learning_rate": 4.243514598483412e-05, "loss": 0.8102, "step": 4045 }, { "epoch": 1.317501626545218, "grad_norm": 4.055437088012695, "learning_rate": 4.241664049811864e-05, "loss": 0.8231, "step": 4050 }, { "epoch": 1.319128171763175, "grad_norm": 2.4395828247070312, "learning_rate": 4.23981164503291e-05, "loss": 0.7925, "step": 4055 }, { "epoch": 1.320754716981132, "grad_norm": 2.233398675918579, "learning_rate": 4.237957386120674e-05, "loss": 0.8168, "step": 4060 }, { "epoch": 1.3223812621990891, "grad_norm": 2.865018606185913, "learning_rate": 4.236101275051256e-05, "loss": 0.8261, "step": 4065 }, { "epoch": 1.3240078074170463, "grad_norm": 2.397876501083374, "learning_rate": 4.234243313802732e-05, "loss": 0.812, "step": 4070 }, { "epoch": 1.3256343526350032, "grad_norm": 4.140904903411865, "learning_rate": 4.232383504355147e-05, "loss": 0.7997, "step": 4075 }, { "epoch": 1.3272608978529603, "grad_norm": 2.3326687812805176, "learning_rate": 4.230521848690517e-05, "loss": 0.7922, "step": 4080 }, { "epoch": 1.3288874430709174, "grad_norm": 2.385756731033325, "learning_rate": 4.228658348792828e-05, "loss": 0.8036, "step": 4085 }, { "epoch": 1.3305139882888743, "grad_norm": 2.274498224258423, "learning_rate": 4.2267930066480266e-05, "loss": 0.7874, "step": 4090 }, { "epoch": 1.3321405335068315, "grad_norm": 3.2354235649108887, "learning_rate": 4.224925824244025e-05, "loss": 0.8055, "step": 4095 }, { "epoch": 1.3337670787247886, "grad_norm": 1.8239346742630005, "learning_rate": 4.2230568035706987e-05, "loss": 0.8126, "step": 4100 }, { "epoch": 1.3353936239427457, "grad_norm": 1.9903501272201538, "learning_rate": 4.2211859466198785e-05, "loss": 0.7992, "step": 4105 }, { "epoch": 1.3370201691607027, "grad_norm": 1.6283726692199707, "learning_rate": 4.219313255385354e-05, "loss": 0.7975, "step": 4110 }, { "epoch": 1.3386467143786598, "grad_norm": 1.444446086883545, "learning_rate": 4.217438731862871e-05, "loss": 0.7741, "step": 4115 }, { "epoch": 1.340273259596617, "grad_norm": 2.0824830532073975, "learning_rate": 4.2155623780501236e-05, "loss": 0.8521, "step": 4120 }, { "epoch": 1.3418998048145738, "grad_norm": 2.0181331634521484, "learning_rate": 4.213684195946762e-05, "loss": 0.7845, "step": 4125 }, { "epoch": 1.343526350032531, "grad_norm": 2.0623695850372314, "learning_rate": 4.211804187554381e-05, "loss": 0.8115, "step": 4130 }, { "epoch": 1.345152895250488, "grad_norm": 1.8838248252868652, "learning_rate": 4.2099223548765224e-05, "loss": 0.8091, "step": 4135 }, { "epoch": 1.346779440468445, "grad_norm": 2.2108914852142334, "learning_rate": 4.208038699918674e-05, "loss": 0.8139, "step": 4140 }, { "epoch": 1.3484059856864021, "grad_norm": 1.8372282981872559, "learning_rate": 4.206153224688264e-05, "loss": 0.7977, "step": 4145 }, { "epoch": 1.3500325309043593, "grad_norm": 2.2820627689361572, "learning_rate": 4.2042659311946586e-05, "loss": 0.8083, "step": 4150 }, { "epoch": 1.3516590761223162, "grad_norm": 1.6137808561325073, "learning_rate": 4.202376821449167e-05, "loss": 0.8127, "step": 4155 }, { "epoch": 1.3532856213402733, "grad_norm": 1.8549315929412842, "learning_rate": 4.2004858974650285e-05, "loss": 0.8071, "step": 4160 }, { "epoch": 1.3549121665582304, "grad_norm": 1.7504771947860718, "learning_rate": 4.1985931612574186e-05, "loss": 0.8305, "step": 4165 }, { "epoch": 1.3565387117761873, "grad_norm": 1.92816162109375, "learning_rate": 4.196698614843445e-05, "loss": 0.8238, "step": 4170 }, { "epoch": 1.3581652569941445, "grad_norm": 4.2875471115112305, "learning_rate": 4.194802260242141e-05, "loss": 0.8139, "step": 4175 }, { "epoch": 1.3597918022121016, "grad_norm": 2.0751023292541504, "learning_rate": 4.192904099474472e-05, "loss": 0.8061, "step": 4180 }, { "epoch": 1.3614183474300585, "grad_norm": 1.8640562295913696, "learning_rate": 4.191004134563322e-05, "loss": 0.808, "step": 4185 }, { "epoch": 1.3630448926480156, "grad_norm": 1.9962856769561768, "learning_rate": 4.1891023675335044e-05, "loss": 0.8016, "step": 4190 }, { "epoch": 1.3646714378659728, "grad_norm": 1.7219542264938354, "learning_rate": 4.187198800411748e-05, "loss": 0.8172, "step": 4195 }, { "epoch": 1.3662979830839297, "grad_norm": 1.8055989742279053, "learning_rate": 4.1852934352267017e-05, "loss": 0.8162, "step": 4200 }, { "epoch": 1.3679245283018868, "grad_norm": 1.8981190919876099, "learning_rate": 4.183386274008932e-05, "loss": 0.8172, "step": 4205 }, { "epoch": 1.369551073519844, "grad_norm": 2.515291690826416, "learning_rate": 4.181477318790917e-05, "loss": 0.8164, "step": 4210 }, { "epoch": 1.3711776187378009, "grad_norm": 1.5880002975463867, "learning_rate": 4.1795665716070474e-05, "loss": 0.8087, "step": 4215 }, { "epoch": 1.372804163955758, "grad_norm": 1.719185709953308, "learning_rate": 4.177654034493626e-05, "loss": 0.801, "step": 4220 }, { "epoch": 1.3744307091737151, "grad_norm": 2.8107213973999023, "learning_rate": 4.1757397094888594e-05, "loss": 0.7675, "step": 4225 }, { "epoch": 1.376057254391672, "grad_norm": 1.931240200996399, "learning_rate": 4.173823598632862e-05, "loss": 0.824, "step": 4230 }, { "epoch": 1.3776837996096292, "grad_norm": 1.6347918510437012, "learning_rate": 4.1719057039676515e-05, "loss": 0.8057, "step": 4235 }, { "epoch": 1.3793103448275863, "grad_norm": 2.2919232845306396, "learning_rate": 4.1699860275371435e-05, "loss": 0.8068, "step": 4240 }, { "epoch": 1.3809368900455432, "grad_norm": 1.8959335088729858, "learning_rate": 4.168064571387159e-05, "loss": 0.7934, "step": 4245 }, { "epoch": 1.3825634352635003, "grad_norm": 1.9993014335632324, "learning_rate": 4.166141337565407e-05, "loss": 0.8189, "step": 4250 }, { "epoch": 1.3841899804814575, "grad_norm": 2.6744425296783447, "learning_rate": 4.1642163281214984e-05, "loss": 0.8283, "step": 4255 }, { "epoch": 1.3858165256994144, "grad_norm": 2.359461545944214, "learning_rate": 4.162289545106932e-05, "loss": 0.7969, "step": 4260 }, { "epoch": 1.3874430709173715, "grad_norm": 2.145277976989746, "learning_rate": 4.160360990575099e-05, "loss": 0.8002, "step": 4265 }, { "epoch": 1.3890696161353286, "grad_norm": 1.8672142028808594, "learning_rate": 4.1584306665812787e-05, "loss": 0.8007, "step": 4270 }, { "epoch": 1.3906961613532856, "grad_norm": 1.8362537622451782, "learning_rate": 4.156498575182633e-05, "loss": 0.8324, "step": 4275 }, { "epoch": 1.3923227065712427, "grad_norm": 2.507525682449341, "learning_rate": 4.15456471843821e-05, "loss": 0.8186, "step": 4280 }, { "epoch": 1.3939492517891998, "grad_norm": 1.6994293928146362, "learning_rate": 4.152629098408939e-05, "loss": 0.7999, "step": 4285 }, { "epoch": 1.3955757970071567, "grad_norm": 1.9243533611297607, "learning_rate": 4.1506917171576295e-05, "loss": 0.7933, "step": 4290 }, { "epoch": 1.3972023422251139, "grad_norm": 1.678754210472107, "learning_rate": 4.1487525767489635e-05, "loss": 0.7926, "step": 4295 }, { "epoch": 1.398828887443071, "grad_norm": 3.0569796562194824, "learning_rate": 4.146811679249504e-05, "loss": 0.8216, "step": 4300 }, { "epoch": 1.400455432661028, "grad_norm": 1.5882444381713867, "learning_rate": 4.144869026727681e-05, "loss": 0.8101, "step": 4305 }, { "epoch": 1.402081977878985, "grad_norm": 2.770890235900879, "learning_rate": 4.1429246212537974e-05, "loss": 0.7915, "step": 4310 }, { "epoch": 1.4037085230969422, "grad_norm": 4.253058433532715, "learning_rate": 4.140978464900025e-05, "loss": 0.8204, "step": 4315 }, { "epoch": 1.405335068314899, "grad_norm": 2.0877606868743896, "learning_rate": 4.1390305597404e-05, "loss": 0.8163, "step": 4320 }, { "epoch": 1.4069616135328562, "grad_norm": 1.743794560432434, "learning_rate": 4.137080907850823e-05, "loss": 0.8297, "step": 4325 }, { "epoch": 1.4085881587508133, "grad_norm": 1.6247589588165283, "learning_rate": 4.135129511309056e-05, "loss": 0.81, "step": 4330 }, { "epoch": 1.4102147039687702, "grad_norm": 5.552340984344482, "learning_rate": 4.13317637219472e-05, "loss": 0.8039, "step": 4335 }, { "epoch": 1.4118412491867274, "grad_norm": 1.9774831533432007, "learning_rate": 4.131221492589295e-05, "loss": 0.8447, "step": 4340 }, { "epoch": 1.4134677944046845, "grad_norm": 1.8310171365737915, "learning_rate": 4.129264874576111e-05, "loss": 0.7966, "step": 4345 }, { "epoch": 1.4150943396226414, "grad_norm": 2.037260055541992, "learning_rate": 4.127306520240356e-05, "loss": 0.793, "step": 4350 }, { "epoch": 1.4167208848405985, "grad_norm": 1.7230867147445679, "learning_rate": 4.125346431669065e-05, "loss": 0.8316, "step": 4355 }, { "epoch": 1.4183474300585557, "grad_norm": 3.2659876346588135, "learning_rate": 4.123384610951124e-05, "loss": 0.7975, "step": 4360 }, { "epoch": 1.4199739752765126, "grad_norm": 1.837768316268921, "learning_rate": 4.121421060177263e-05, "loss": 0.7825, "step": 4365 }, { "epoch": 1.4216005204944697, "grad_norm": 2.2004690170288086, "learning_rate": 4.1194557814400545e-05, "loss": 0.8181, "step": 4370 }, { "epoch": 1.4232270657124269, "grad_norm": 1.841178059577942, "learning_rate": 4.1174887768339164e-05, "loss": 0.7974, "step": 4375 }, { "epoch": 1.4248536109303838, "grad_norm": 2.3017239570617676, "learning_rate": 4.115520048455102e-05, "loss": 0.8045, "step": 4380 }, { "epoch": 1.426480156148341, "grad_norm": 1.8260972499847412, "learning_rate": 4.113549598401704e-05, "loss": 0.8175, "step": 4385 }, { "epoch": 1.428106701366298, "grad_norm": 2.360365867614746, "learning_rate": 4.111577428773649e-05, "loss": 0.8042, "step": 4390 }, { "epoch": 1.429733246584255, "grad_norm": 1.8616596460342407, "learning_rate": 4.1096035416726966e-05, "loss": 0.7756, "step": 4395 }, { "epoch": 1.431359791802212, "grad_norm": 1.970524787902832, "learning_rate": 4.107627939202435e-05, "loss": 0.8207, "step": 4400 }, { "epoch": 1.4329863370201692, "grad_norm": 1.5485093593597412, "learning_rate": 4.105650623468284e-05, "loss": 0.7761, "step": 4405 }, { "epoch": 1.434612882238126, "grad_norm": 1.799946665763855, "learning_rate": 4.103671596577486e-05, "loss": 0.8384, "step": 4410 }, { "epoch": 1.4362394274560832, "grad_norm": 2.041393518447876, "learning_rate": 4.101690860639108e-05, "loss": 0.8278, "step": 4415 }, { "epoch": 1.4378659726740404, "grad_norm": 1.733588457107544, "learning_rate": 4.09970841776404e-05, "loss": 0.7961, "step": 4420 }, { "epoch": 1.4394925178919973, "grad_norm": 2.301828145980835, "learning_rate": 4.097724270064988e-05, "loss": 0.8102, "step": 4425 }, { "epoch": 1.4411190631099544, "grad_norm": 1.7932064533233643, "learning_rate": 4.0961357258533774e-05, "loss": 0.7817, "step": 4430 }, { "epoch": 1.4427456083279115, "grad_norm": 1.8835421800613403, "learning_rate": 4.0941485148009765e-05, "loss": 0.7893, "step": 4435 }, { "epoch": 1.4443721535458685, "grad_norm": 2.5245280265808105, "learning_rate": 4.0921596048498315e-05, "loss": 0.8149, "step": 4440 }, { "epoch": 1.4459986987638256, "grad_norm": 3.859861135482788, "learning_rate": 4.090168998119542e-05, "loss": 0.814, "step": 4445 }, { "epoch": 1.4476252439817827, "grad_norm": 1.7895616292953491, "learning_rate": 4.088176696731517e-05, "loss": 0.8054, "step": 4450 }, { "epoch": 1.4492517891997396, "grad_norm": 2.096266508102417, "learning_rate": 4.08618270280897e-05, "loss": 0.8064, "step": 4455 }, { "epoch": 1.4508783344176968, "grad_norm": 4.13993501663208, "learning_rate": 4.084187018476918e-05, "loss": 0.7698, "step": 4460 }, { "epoch": 1.452504879635654, "grad_norm": 1.7063169479370117, "learning_rate": 4.0821896458621814e-05, "loss": 0.8107, "step": 4465 }, { "epoch": 1.4541314248536108, "grad_norm": 1.8808252811431885, "learning_rate": 4.0801905870933764e-05, "loss": 0.7758, "step": 4470 }, { "epoch": 1.455757970071568, "grad_norm": 2.3222527503967285, "learning_rate": 4.07818984430092e-05, "loss": 0.771, "step": 4475 }, { "epoch": 1.457384515289525, "grad_norm": 1.7792950868606567, "learning_rate": 4.076187419617024e-05, "loss": 0.8149, "step": 4480 }, { "epoch": 1.459011060507482, "grad_norm": 1.9383238554000854, "learning_rate": 4.074183315175686e-05, "loss": 0.8008, "step": 4485 }, { "epoch": 1.460637605725439, "grad_norm": 1.767006516456604, "learning_rate": 4.072177533112703e-05, "loss": 0.7871, "step": 4490 }, { "epoch": 1.4622641509433962, "grad_norm": 2.0811262130737305, "learning_rate": 4.0701700755656534e-05, "loss": 0.8408, "step": 4495 }, { "epoch": 1.4638906961613534, "grad_norm": 1.673139214515686, "learning_rate": 4.068160944673903e-05, "loss": 0.8747, "step": 4500 }, { "epoch": 1.4655172413793103, "grad_norm": 1.6479978561401367, "learning_rate": 4.066150142578602e-05, "loss": 0.8025, "step": 4505 }, { "epoch": 1.4671437865972674, "grad_norm": 5.902811050415039, "learning_rate": 4.0641376714226795e-05, "loss": 0.8073, "step": 4510 }, { "epoch": 1.4687703318152245, "grad_norm": 1.991217851638794, "learning_rate": 4.062123533350847e-05, "loss": 0.8249, "step": 4515 }, { "epoch": 1.4703968770331814, "grad_norm": 2.3157060146331787, "learning_rate": 4.060107730509587e-05, "loss": 0.8106, "step": 4520 }, { "epoch": 1.4720234222511386, "grad_norm": 1.7831164598464966, "learning_rate": 4.05809026504716e-05, "loss": 0.7792, "step": 4525 }, { "epoch": 1.4736499674690957, "grad_norm": 2.218675136566162, "learning_rate": 4.0560711391135986e-05, "loss": 0.7869, "step": 4530 }, { "epoch": 1.4752765126870526, "grad_norm": 1.9791754484176636, "learning_rate": 4.0540503548607035e-05, "loss": 0.8083, "step": 4535 }, { "epoch": 1.4769030579050098, "grad_norm": 1.804709553718567, "learning_rate": 4.052027914442043e-05, "loss": 0.8167, "step": 4540 }, { "epoch": 1.4785296031229669, "grad_norm": 1.798051357269287, "learning_rate": 4.050003820012948e-05, "loss": 0.8041, "step": 4545 }, { "epoch": 1.480156148340924, "grad_norm": 1.6213825941085815, "learning_rate": 4.047978073730519e-05, "loss": 0.8297, "step": 4550 }, { "epoch": 1.481782693558881, "grad_norm": 1.7084782123565674, "learning_rate": 4.045950677753611e-05, "loss": 0.7996, "step": 4555 }, { "epoch": 1.483409238776838, "grad_norm": 2.0501437187194824, "learning_rate": 4.043921634242836e-05, "loss": 0.7992, "step": 4560 }, { "epoch": 1.4850357839947952, "grad_norm": 1.5576157569885254, "learning_rate": 4.041890945360567e-05, "loss": 0.7909, "step": 4565 }, { "epoch": 1.486662329212752, "grad_norm": 1.9696029424667358, "learning_rate": 4.039858613270927e-05, "loss": 0.8363, "step": 4570 }, { "epoch": 1.4882888744307092, "grad_norm": 1.7343369722366333, "learning_rate": 4.037824640139791e-05, "loss": 0.8187, "step": 4575 }, { "epoch": 1.4899154196486664, "grad_norm": 1.8161100149154663, "learning_rate": 4.035789028134782e-05, "loss": 0.8193, "step": 4580 }, { "epoch": 1.4915419648666233, "grad_norm": 1.7704228162765503, "learning_rate": 4.033751779425272e-05, "loss": 0.7905, "step": 4585 }, { "epoch": 1.4931685100845804, "grad_norm": 1.9006760120391846, "learning_rate": 4.031712896182376e-05, "loss": 0.8017, "step": 4590 }, { "epoch": 1.4947950553025375, "grad_norm": 1.5826624631881714, "learning_rate": 4.029672380578948e-05, "loss": 0.7889, "step": 4595 }, { "epoch": 1.4964216005204944, "grad_norm": 1.7704415321350098, "learning_rate": 4.0276302347895864e-05, "loss": 0.8064, "step": 4600 }, { "epoch": 1.4980481457384516, "grad_norm": 1.7217965126037598, "learning_rate": 4.025586460990625e-05, "loss": 0.8, "step": 4605 }, { "epoch": 1.4996746909564087, "grad_norm": 1.4004994630813599, "learning_rate": 4.023541061360131e-05, "loss": 0.7898, "step": 4610 }, { "epoch": 1.5013012361743656, "grad_norm": 1.8781967163085938, "learning_rate": 4.021494038077907e-05, "loss": 0.7992, "step": 4615 }, { "epoch": 1.5029277813923227, "grad_norm": 2.0038421154022217, "learning_rate": 4.019445393325483e-05, "loss": 0.8055, "step": 4620 }, { "epoch": 1.5045543266102799, "grad_norm": 1.9267133474349976, "learning_rate": 4.01739512928612e-05, "loss": 0.8447, "step": 4625 }, { "epoch": 1.5061808718282368, "grad_norm": 1.6246354579925537, "learning_rate": 4.0153432481448027e-05, "loss": 0.816, "step": 4630 }, { "epoch": 1.507807417046194, "grad_norm": 1.8404632806777954, "learning_rate": 4.01328975208824e-05, "loss": 0.7858, "step": 4635 }, { "epoch": 1.509433962264151, "grad_norm": 1.8644263744354248, "learning_rate": 4.01123464330486e-05, "loss": 0.8302, "step": 4640 }, { "epoch": 1.511060507482108, "grad_norm": 1.5745633840560913, "learning_rate": 4.009177923984812e-05, "loss": 0.7719, "step": 4645 }, { "epoch": 1.512687052700065, "grad_norm": 1.6362204551696777, "learning_rate": 4.007119596319962e-05, "loss": 0.7955, "step": 4650 }, { "epoch": 1.5143135979180222, "grad_norm": 1.4978214502334595, "learning_rate": 4.005059662503888e-05, "loss": 0.8176, "step": 4655 }, { "epoch": 1.5159401431359791, "grad_norm": 1.6588901281356812, "learning_rate": 4.002998124731879e-05, "loss": 0.8118, "step": 4660 }, { "epoch": 1.5175666883539363, "grad_norm": 1.4084504842758179, "learning_rate": 4.000934985200937e-05, "loss": 0.8055, "step": 4665 }, { "epoch": 1.5191932335718934, "grad_norm": 1.8851374387741089, "learning_rate": 3.998870246109767e-05, "loss": 0.7762, "step": 4670 }, { "epoch": 1.5208197787898503, "grad_norm": 2.7642505168914795, "learning_rate": 3.996803909658782e-05, "loss": 0.7927, "step": 4675 }, { "epoch": 1.5224463240078074, "grad_norm": 1.984631896018982, "learning_rate": 3.994735978050094e-05, "loss": 0.8018, "step": 4680 }, { "epoch": 1.5240728692257646, "grad_norm": 1.5648013353347778, "learning_rate": 3.992666453487518e-05, "loss": 0.8135, "step": 4685 }, { "epoch": 1.5256994144437215, "grad_norm": 1.6731303930282593, "learning_rate": 3.990595338176564e-05, "loss": 0.8305, "step": 4690 }, { "epoch": 1.5273259596616786, "grad_norm": 2.1696157455444336, "learning_rate": 3.988522634324441e-05, "loss": 0.7958, "step": 4695 }, { "epoch": 1.5289525048796357, "grad_norm": 1.3574761152267456, "learning_rate": 3.986448344140047e-05, "loss": 0.7748, "step": 4700 }, { "epoch": 1.5305790500975927, "grad_norm": 1.553859829902649, "learning_rate": 3.984372469833972e-05, "loss": 0.794, "step": 4705 }, { "epoch": 1.5322055953155498, "grad_norm": 1.8226816654205322, "learning_rate": 3.9822950136184946e-05, "loss": 0.8153, "step": 4710 }, { "epoch": 1.533832140533507, "grad_norm": 1.8266657590866089, "learning_rate": 3.9802159777075796e-05, "loss": 0.8073, "step": 4715 }, { "epoch": 1.5354586857514638, "grad_norm": 1.4437830448150635, "learning_rate": 3.978135364316874e-05, "loss": 0.775, "step": 4720 }, { "epoch": 1.537085230969421, "grad_norm": 2.241546392440796, "learning_rate": 3.976053175663707e-05, "loss": 0.7779, "step": 4725 }, { "epoch": 1.538711776187378, "grad_norm": 1.7656699419021606, "learning_rate": 3.973969413967086e-05, "loss": 0.8373, "step": 4730 }, { "epoch": 1.540338321405335, "grad_norm": 1.451778531074524, "learning_rate": 3.971884081447695e-05, "loss": 0.8314, "step": 4735 }, { "epoch": 1.5419648666232921, "grad_norm": 1.667837381362915, "learning_rate": 3.9697971803278924e-05, "loss": 0.8282, "step": 4740 }, { "epoch": 1.5435914118412493, "grad_norm": 1.6840225458145142, "learning_rate": 3.967708712831707e-05, "loss": 0.7994, "step": 4745 }, { "epoch": 1.5452179570592062, "grad_norm": 1.5227510929107666, "learning_rate": 3.9656186811848395e-05, "loss": 0.7899, "step": 4750 }, { "epoch": 1.5468445022771633, "grad_norm": 1.7087949514389038, "learning_rate": 3.963527087614655e-05, "loss": 0.8272, "step": 4755 }, { "epoch": 1.5484710474951204, "grad_norm": 1.5343523025512695, "learning_rate": 3.9614339343501836e-05, "loss": 0.796, "step": 4760 }, { "epoch": 1.5500975927130773, "grad_norm": 1.9713935852050781, "learning_rate": 3.9593392236221176e-05, "loss": 0.7851, "step": 4765 }, { "epoch": 1.5517241379310345, "grad_norm": 1.7553633451461792, "learning_rate": 3.9572429576628114e-05, "loss": 0.7687, "step": 4770 }, { "epoch": 1.5533506831489916, "grad_norm": 1.773939847946167, "learning_rate": 3.955145138706273e-05, "loss": 0.8141, "step": 4775 }, { "epoch": 1.5549772283669485, "grad_norm": 1.7146141529083252, "learning_rate": 3.9530457689881684e-05, "loss": 0.7813, "step": 4780 }, { "epoch": 1.5566037735849056, "grad_norm": 2.0186431407928467, "learning_rate": 3.9509448507458146e-05, "loss": 0.7816, "step": 4785 }, { "epoch": 1.5582303188028628, "grad_norm": 1.6115589141845703, "learning_rate": 3.94884238621818e-05, "loss": 0.8232, "step": 4790 }, { "epoch": 1.5598568640208197, "grad_norm": 1.784650444984436, "learning_rate": 3.94673837764588e-05, "loss": 0.8161, "step": 4795 }, { "epoch": 1.5614834092387768, "grad_norm": 1.9830811023712158, "learning_rate": 3.944632827271176e-05, "loss": 0.8087, "step": 4800 }, { "epoch": 1.563109954456734, "grad_norm": 1.8103023767471313, "learning_rate": 3.942525737337973e-05, "loss": 0.7763, "step": 4805 }, { "epoch": 1.5647364996746909, "grad_norm": 1.7412750720977783, "learning_rate": 3.940417110091816e-05, "loss": 0.795, "step": 4810 }, { "epoch": 1.566363044892648, "grad_norm": 1.4648451805114746, "learning_rate": 3.9383069477798886e-05, "loss": 0.8255, "step": 4815 }, { "epoch": 1.5679895901106051, "grad_norm": 1.4472988843917847, "learning_rate": 3.9361952526510085e-05, "loss": 0.8215, "step": 4820 }, { "epoch": 1.569616135328562, "grad_norm": 1.5990488529205322, "learning_rate": 3.93408202695563e-05, "loss": 0.7938, "step": 4825 }, { "epoch": 1.5712426805465192, "grad_norm": 2.124126672744751, "learning_rate": 3.9319672729458376e-05, "loss": 0.8117, "step": 4830 }, { "epoch": 1.5728692257644763, "grad_norm": 1.7679290771484375, "learning_rate": 3.9298509928753434e-05, "loss": 0.7868, "step": 4835 }, { "epoch": 1.5744957709824332, "grad_norm": 2.1029303073883057, "learning_rate": 3.927733188999486e-05, "loss": 0.7573, "step": 4840 }, { "epoch": 1.5761223162003903, "grad_norm": 1.73438560962677, "learning_rate": 3.9256138635752304e-05, "loss": 0.8231, "step": 4845 }, { "epoch": 1.5777488614183475, "grad_norm": 1.4345791339874268, "learning_rate": 3.92349301886116e-05, "loss": 0.7866, "step": 4850 }, { "epoch": 1.5793754066363044, "grad_norm": 2.6696362495422363, "learning_rate": 3.921370657117478e-05, "loss": 0.8242, "step": 4855 }, { "epoch": 1.5810019518542615, "grad_norm": 2.566169261932373, "learning_rate": 3.9192467806060044e-05, "loss": 0.7749, "step": 4860 }, { "epoch": 1.5826284970722186, "grad_norm": 1.6439261436462402, "learning_rate": 3.917121391590176e-05, "loss": 0.8143, "step": 4865 }, { "epoch": 1.5842550422901756, "grad_norm": 1.8413790464401245, "learning_rate": 3.914994492335038e-05, "loss": 0.8113, "step": 4870 }, { "epoch": 1.5858815875081327, "grad_norm": 1.6771591901779175, "learning_rate": 3.912866085107247e-05, "loss": 0.7638, "step": 4875 }, { "epoch": 1.5875081327260898, "grad_norm": 1.6582680940628052, "learning_rate": 3.910736172175066e-05, "loss": 0.8137, "step": 4880 }, { "epoch": 1.5891346779440467, "grad_norm": 1.6781288385391235, "learning_rate": 3.908604755808363e-05, "loss": 0.7764, "step": 4885 }, { "epoch": 1.5907612231620039, "grad_norm": 2.101989507675171, "learning_rate": 3.9064718382786076e-05, "loss": 0.781, "step": 4890 }, { "epoch": 1.592387768379961, "grad_norm": 2.3528003692626953, "learning_rate": 3.90433742185887e-05, "loss": 0.8243, "step": 4895 }, { "epoch": 1.594014313597918, "grad_norm": 1.4553966522216797, "learning_rate": 3.9022015088238174e-05, "loss": 0.7994, "step": 4900 }, { "epoch": 1.595640858815875, "grad_norm": 1.6103534698486328, "learning_rate": 3.9000641014497124e-05, "loss": 0.8042, "step": 4905 }, { "epoch": 1.5972674040338322, "grad_norm": 1.993911623954773, "learning_rate": 3.897925202014409e-05, "loss": 0.8148, "step": 4910 }, { "epoch": 1.598893949251789, "grad_norm": 1.6934844255447388, "learning_rate": 3.895784812797352e-05, "loss": 0.7999, "step": 4915 }, { "epoch": 1.6005204944697464, "grad_norm": 1.602703332901001, "learning_rate": 3.8936429360795745e-05, "loss": 0.8024, "step": 4920 }, { "epoch": 1.6021470396877033, "grad_norm": 1.9552648067474365, "learning_rate": 3.891499574143693e-05, "loss": 0.8171, "step": 4925 }, { "epoch": 1.6037735849056602, "grad_norm": 1.9080613851547241, "learning_rate": 3.88935472927391e-05, "loss": 0.8218, "step": 4930 }, { "epoch": 1.6054001301236176, "grad_norm": 1.7507500648498535, "learning_rate": 3.887208403756005e-05, "loss": 0.8017, "step": 4935 }, { "epoch": 1.6070266753415745, "grad_norm": 1.6852693557739258, "learning_rate": 3.885060599877337e-05, "loss": 0.8115, "step": 4940 }, { "epoch": 1.6086532205595314, "grad_norm": 1.4053925275802612, "learning_rate": 3.8829113199268403e-05, "loss": 0.8327, "step": 4945 }, { "epoch": 1.6102797657774888, "grad_norm": 1.6863847970962524, "learning_rate": 3.880760566195023e-05, "loss": 0.7858, "step": 4950 }, { "epoch": 1.6119063109954457, "grad_norm": 1.752721905708313, "learning_rate": 3.878608340973962e-05, "loss": 0.7895, "step": 4955 }, { "epoch": 1.6135328562134026, "grad_norm": 2.294194221496582, "learning_rate": 3.876454646557305e-05, "loss": 0.7745, "step": 4960 }, { "epoch": 1.61515940143136, "grad_norm": 1.9643068313598633, "learning_rate": 3.8742994852402637e-05, "loss": 0.8077, "step": 4965 }, { "epoch": 1.6167859466493169, "grad_norm": 1.7443420886993408, "learning_rate": 3.872142859319612e-05, "loss": 0.7877, "step": 4970 }, { "epoch": 1.6184124918672738, "grad_norm": 2.0942933559417725, "learning_rate": 3.869984771093687e-05, "loss": 0.8167, "step": 4975 }, { "epoch": 1.6200390370852311, "grad_norm": 1.7708196640014648, "learning_rate": 3.867825222862383e-05, "loss": 0.8007, "step": 4980 }, { "epoch": 1.621665582303188, "grad_norm": 2.0296547412872314, "learning_rate": 3.8656642169271505e-05, "loss": 0.8063, "step": 4985 }, { "epoch": 1.623292127521145, "grad_norm": 1.888785481452942, "learning_rate": 3.863501755590994e-05, "loss": 0.8319, "step": 4990 }, { "epoch": 1.6249186727391023, "grad_norm": 1.4504474401474, "learning_rate": 3.8613378411584665e-05, "loss": 0.8158, "step": 4995 }, { "epoch": 1.6265452179570592, "grad_norm": 2.6302969455718994, "learning_rate": 3.8591724759356734e-05, "loss": 0.81, "step": 5000 }, { "epoch": 1.628171763175016, "grad_norm": 1.873052954673767, "learning_rate": 3.857005662230264e-05, "loss": 0.8089, "step": 5005 }, { "epoch": 1.6297983083929735, "grad_norm": 1.664070725440979, "learning_rate": 3.854837402351431e-05, "loss": 0.8248, "step": 5010 }, { "epoch": 1.6314248536109304, "grad_norm": 1.9076143503189087, "learning_rate": 3.85266769860991e-05, "loss": 0.8255, "step": 5015 }, { "epoch": 1.6330513988288873, "grad_norm": 2.2082643508911133, "learning_rate": 3.8504965533179724e-05, "loss": 0.783, "step": 5020 }, { "epoch": 1.6346779440468446, "grad_norm": 1.6590235233306885, "learning_rate": 3.84832396878943e-05, "loss": 0.7941, "step": 5025 }, { "epoch": 1.6363044892648015, "grad_norm": 1.7328729629516602, "learning_rate": 3.8461499473396246e-05, "loss": 0.8133, "step": 5030 }, { "epoch": 1.6379310344827587, "grad_norm": 3.048574209213257, "learning_rate": 3.843974491285432e-05, "loss": 0.804, "step": 5035 }, { "epoch": 1.6395575797007158, "grad_norm": 1.6420484781265259, "learning_rate": 3.841797602945254e-05, "loss": 0.7681, "step": 5040 }, { "epoch": 1.6411841249186727, "grad_norm": 6.178708553314209, "learning_rate": 3.839619284639022e-05, "loss": 0.7657, "step": 5045 }, { "epoch": 1.6428106701366298, "grad_norm": 1.622899055480957, "learning_rate": 3.837439538688189e-05, "loss": 0.7869, "step": 5050 }, { "epoch": 1.644437215354587, "grad_norm": 1.6254470348358154, "learning_rate": 3.8352583674157314e-05, "loss": 0.8116, "step": 5055 }, { "epoch": 1.6460637605725439, "grad_norm": 1.450502872467041, "learning_rate": 3.833075773146142e-05, "loss": 0.8245, "step": 5060 }, { "epoch": 1.647690305790501, "grad_norm": 1.6301724910736084, "learning_rate": 3.8308917582054324e-05, "loss": 0.77, "step": 5065 }, { "epoch": 1.6493168510084582, "grad_norm": 3.7551519870758057, "learning_rate": 3.828706324921128e-05, "loss": 0.7974, "step": 5070 }, { "epoch": 1.650943396226415, "grad_norm": 2.2062652111053467, "learning_rate": 3.826519475622265e-05, "loss": 0.8043, "step": 5075 }, { "epoch": 1.6525699414443722, "grad_norm": 1.5353881120681763, "learning_rate": 3.824331212639388e-05, "loss": 0.7968, "step": 5080 }, { "epoch": 1.6541964866623293, "grad_norm": 1.8680821657180786, "learning_rate": 3.822141538304549e-05, "loss": 0.7878, "step": 5085 }, { "epoch": 1.6558230318802862, "grad_norm": 1.8406147956848145, "learning_rate": 3.8199504549513055e-05, "loss": 0.7833, "step": 5090 }, { "epoch": 1.6574495770982434, "grad_norm": 1.6851632595062256, "learning_rate": 3.817757964914713e-05, "loss": 0.8122, "step": 5095 }, { "epoch": 1.6590761223162005, "grad_norm": 1.4229167699813843, "learning_rate": 3.81556407053133e-05, "loss": 0.8089, "step": 5100 }, { "epoch": 1.6607026675341574, "grad_norm": 1.7230238914489746, "learning_rate": 3.81336877413921e-05, "loss": 0.8054, "step": 5105 }, { "epoch": 1.6623292127521145, "grad_norm": 1.6220729351043701, "learning_rate": 3.811172078077899e-05, "loss": 0.7732, "step": 5110 }, { "epoch": 1.6639557579700717, "grad_norm": 1.9733960628509521, "learning_rate": 3.808973984688439e-05, "loss": 0.7928, "step": 5115 }, { "epoch": 1.6655823031880286, "grad_norm": 1.9078205823898315, "learning_rate": 3.806774496313355e-05, "loss": 0.8029, "step": 5120 }, { "epoch": 1.6672088484059857, "grad_norm": 2.955739736557007, "learning_rate": 3.8045736152966635e-05, "loss": 0.7885, "step": 5125 }, { "epoch": 1.6688353936239428, "grad_norm": 2.7966153621673584, "learning_rate": 3.802371343983865e-05, "loss": 0.7869, "step": 5130 }, { "epoch": 1.6704619388418998, "grad_norm": 1.9048030376434326, "learning_rate": 3.800167684721938e-05, "loss": 0.7971, "step": 5135 }, { "epoch": 1.6720884840598569, "grad_norm": 1.6827515363693237, "learning_rate": 3.797962639859344e-05, "loss": 0.7805, "step": 5140 }, { "epoch": 1.673715029277814, "grad_norm": 1.5637474060058594, "learning_rate": 3.7957562117460187e-05, "loss": 0.7994, "step": 5145 }, { "epoch": 1.675341574495771, "grad_norm": 1.4919729232788086, "learning_rate": 3.7935484027333746e-05, "loss": 0.807, "step": 5150 }, { "epoch": 1.676968119713728, "grad_norm": 1.6017789840698242, "learning_rate": 3.7913392151742924e-05, "loss": 0.8152, "step": 5155 }, { "epoch": 1.6785946649316852, "grad_norm": 2.5162606239318848, "learning_rate": 3.7891286514231225e-05, "loss": 0.7919, "step": 5160 }, { "epoch": 1.680221210149642, "grad_norm": 1.5875083208084106, "learning_rate": 3.786916713835685e-05, "loss": 0.8212, "step": 5165 }, { "epoch": 1.6818477553675992, "grad_norm": 1.7888031005859375, "learning_rate": 3.784703404769263e-05, "loss": 0.7905, "step": 5170 }, { "epoch": 1.6834743005855564, "grad_norm": 1.5636552572250366, "learning_rate": 3.782488726582598e-05, "loss": 0.7896, "step": 5175 }, { "epoch": 1.6851008458035133, "grad_norm": 2.8158295154571533, "learning_rate": 3.780272681635894e-05, "loss": 0.7746, "step": 5180 }, { "epoch": 1.6867273910214704, "grad_norm": 1.6401851177215576, "learning_rate": 3.77805527229081e-05, "loss": 0.7874, "step": 5185 }, { "epoch": 1.6883539362394275, "grad_norm": 2.0426077842712402, "learning_rate": 3.77583650091046e-05, "loss": 0.7929, "step": 5190 }, { "epoch": 1.6899804814573844, "grad_norm": 1.6837797164916992, "learning_rate": 3.7736163698594094e-05, "loss": 0.8331, "step": 5195 }, { "epoch": 1.6916070266753416, "grad_norm": 1.5901471376419067, "learning_rate": 3.771394881503673e-05, "loss": 0.7882, "step": 5200 }, { "epoch": 1.6932335718932987, "grad_norm": 2.020697593688965, "learning_rate": 3.7691720382107084e-05, "loss": 0.8056, "step": 5205 }, { "epoch": 1.6948601171112556, "grad_norm": 1.6831104755401611, "learning_rate": 3.766947842349423e-05, "loss": 0.8024, "step": 5210 }, { "epoch": 1.6964866623292127, "grad_norm": 1.7317795753479004, "learning_rate": 3.764722296290162e-05, "loss": 0.8069, "step": 5215 }, { "epoch": 1.6981132075471699, "grad_norm": 1.4958361387252808, "learning_rate": 3.76249540240471e-05, "loss": 0.8255, "step": 5220 }, { "epoch": 1.6997397527651268, "grad_norm": 1.7042391300201416, "learning_rate": 3.7602671630662886e-05, "loss": 0.8126, "step": 5225 }, { "epoch": 1.701366297983084, "grad_norm": 1.7775369882583618, "learning_rate": 3.7580375806495524e-05, "loss": 0.7854, "step": 5230 }, { "epoch": 1.702992843201041, "grad_norm": 1.6327508687973022, "learning_rate": 3.755806657530589e-05, "loss": 0.8067, "step": 5235 }, { "epoch": 1.704619388418998, "grad_norm": 1.5772026777267456, "learning_rate": 3.753574396086913e-05, "loss": 0.7861, "step": 5240 }, { "epoch": 1.706245933636955, "grad_norm": 1.642557144165039, "learning_rate": 3.751340798697466e-05, "loss": 0.7904, "step": 5245 }, { "epoch": 1.7078724788549122, "grad_norm": 1.9723272323608398, "learning_rate": 3.7491058677426135e-05, "loss": 0.8274, "step": 5250 }, { "epoch": 1.7094990240728691, "grad_norm": 1.5701051950454712, "learning_rate": 3.7468696056041406e-05, "loss": 0.8231, "step": 5255 }, { "epoch": 1.7111255692908263, "grad_norm": 1.5915799140930176, "learning_rate": 3.7446320146652556e-05, "loss": 0.8006, "step": 5260 }, { "epoch": 1.7127521145087834, "grad_norm": 1.9272072315216064, "learning_rate": 3.7423930973105766e-05, "loss": 0.7917, "step": 5265 }, { "epoch": 1.7143786597267403, "grad_norm": 2.022019386291504, "learning_rate": 3.740152855926139e-05, "loss": 0.8224, "step": 5270 }, { "epoch": 1.7160052049446974, "grad_norm": 1.566528081893921, "learning_rate": 3.7379112928993904e-05, "loss": 0.7587, "step": 5275 }, { "epoch": 1.7176317501626546, "grad_norm": 2.751673936843872, "learning_rate": 3.735668410619183e-05, "loss": 0.7843, "step": 5280 }, { "epoch": 1.7192582953806115, "grad_norm": 2.7237708568573, "learning_rate": 3.73342421147578e-05, "loss": 0.7954, "step": 5285 }, { "epoch": 1.7208848405985686, "grad_norm": 1.8551533222198486, "learning_rate": 3.7311786978608415e-05, "loss": 0.8091, "step": 5290 }, { "epoch": 1.7225113858165257, "grad_norm": 1.67500638961792, "learning_rate": 3.7289318721674346e-05, "loss": 0.8154, "step": 5295 }, { "epoch": 1.7241379310344827, "grad_norm": 1.9620552062988281, "learning_rate": 3.726683736790022e-05, "loss": 0.7943, "step": 5300 }, { "epoch": 1.7257644762524398, "grad_norm": 2.0189971923828125, "learning_rate": 3.72443429412446e-05, "loss": 0.7814, "step": 5305 }, { "epoch": 1.727391021470397, "grad_norm": 1.632895827293396, "learning_rate": 3.7221835465680024e-05, "loss": 0.7814, "step": 5310 }, { "epoch": 1.7290175666883538, "grad_norm": 1.7532051801681519, "learning_rate": 3.719931496519291e-05, "loss": 0.8273, "step": 5315 }, { "epoch": 1.730644111906311, "grad_norm": 1.4986096620559692, "learning_rate": 3.717678146378357e-05, "loss": 0.7931, "step": 5320 }, { "epoch": 1.732270657124268, "grad_norm": 1.7292662858963013, "learning_rate": 3.7154234985466155e-05, "loss": 0.8032, "step": 5325 }, { "epoch": 1.733897202342225, "grad_norm": 2.197148084640503, "learning_rate": 3.7131675554268654e-05, "loss": 0.7974, "step": 5330 }, { "epoch": 1.7355237475601821, "grad_norm": 1.840177297592163, "learning_rate": 3.7109103194232856e-05, "loss": 0.8385, "step": 5335 }, { "epoch": 1.7371502927781393, "grad_norm": 1.728795051574707, "learning_rate": 3.7086517929414346e-05, "loss": 0.7905, "step": 5340 }, { "epoch": 1.7387768379960962, "grad_norm": 3.702511787414551, "learning_rate": 3.706391978388245e-05, "loss": 0.8194, "step": 5345 }, { "epoch": 1.7404033832140533, "grad_norm": 1.6523361206054688, "learning_rate": 3.70413087817202e-05, "loss": 0.8152, "step": 5350 }, { "epoch": 1.7420299284320104, "grad_norm": 1.5317710638046265, "learning_rate": 3.701868494702437e-05, "loss": 0.7935, "step": 5355 }, { "epoch": 1.7436564736499673, "grad_norm": 1.8728350400924683, "learning_rate": 3.699604830390537e-05, "loss": 0.7902, "step": 5360 }, { "epoch": 1.7452830188679245, "grad_norm": 1.674080491065979, "learning_rate": 3.69733988764873e-05, "loss": 0.7796, "step": 5365 }, { "epoch": 1.7469095640858816, "grad_norm": 2.6330764293670654, "learning_rate": 3.695073668890785e-05, "loss": 0.8109, "step": 5370 }, { "epoch": 1.7485361093038385, "grad_norm": 1.6176660060882568, "learning_rate": 3.692806176531832e-05, "loss": 0.7639, "step": 5375 }, { "epoch": 1.7501626545217959, "grad_norm": 1.6828603744506836, "learning_rate": 3.690537412988359e-05, "loss": 0.8128, "step": 5380 }, { "epoch": 1.7517891997397528, "grad_norm": 1.547345757484436, "learning_rate": 3.688267380678208e-05, "loss": 0.852, "step": 5385 }, { "epoch": 1.7534157449577097, "grad_norm": 1.802438735961914, "learning_rate": 3.685996082020574e-05, "loss": 0.7841, "step": 5390 }, { "epoch": 1.755042290175667, "grad_norm": 2.5347976684570312, "learning_rate": 3.683723519436e-05, "loss": 0.8074, "step": 5395 }, { "epoch": 1.756668835393624, "grad_norm": 1.9095966815948486, "learning_rate": 3.681449695346376e-05, "loss": 0.7791, "step": 5400 }, { "epoch": 1.7582953806115809, "grad_norm": 2.8300509452819824, "learning_rate": 3.67917461217494e-05, "loss": 0.7851, "step": 5405 }, { "epoch": 1.7599219258295382, "grad_norm": 2.3183364868164062, "learning_rate": 3.676898272346266e-05, "loss": 0.768, "step": 5410 }, { "epoch": 1.7615484710474951, "grad_norm": 1.687774896621704, "learning_rate": 3.674620678286273e-05, "loss": 0.7968, "step": 5415 }, { "epoch": 1.763175016265452, "grad_norm": 1.9604495763778687, "learning_rate": 3.6723418324222126e-05, "loss": 0.8386, "step": 5420 }, { "epoch": 1.7648015614834094, "grad_norm": 1.6525349617004395, "learning_rate": 3.670061737182672e-05, "loss": 0.8047, "step": 5425 }, { "epoch": 1.7664281067013663, "grad_norm": 1.938249945640564, "learning_rate": 3.667780394997569e-05, "loss": 0.7749, "step": 5430 }, { "epoch": 1.7680546519193232, "grad_norm": 3.016706943511963, "learning_rate": 3.6654978082981514e-05, "loss": 0.8117, "step": 5435 }, { "epoch": 1.7696811971372806, "grad_norm": 1.8364930152893066, "learning_rate": 3.663213979516994e-05, "loss": 0.8016, "step": 5440 }, { "epoch": 1.7713077423552375, "grad_norm": 2.14928936958313, "learning_rate": 3.660928911087993e-05, "loss": 0.796, "step": 5445 }, { "epoch": 1.7729342875731944, "grad_norm": 2.1587629318237305, "learning_rate": 3.658642605446367e-05, "loss": 0.7777, "step": 5450 }, { "epoch": 1.7745608327911517, "grad_norm": 1.730048418045044, "learning_rate": 3.6563550650286526e-05, "loss": 0.7868, "step": 5455 }, { "epoch": 1.7761873780091086, "grad_norm": 1.746994137763977, "learning_rate": 3.6540662922727034e-05, "loss": 0.7828, "step": 5460 }, { "epoch": 1.7778139232270656, "grad_norm": 1.678442358970642, "learning_rate": 3.651776289617685e-05, "loss": 0.8107, "step": 5465 }, { "epoch": 1.779440468445023, "grad_norm": 1.93441641330719, "learning_rate": 3.6494850595040745e-05, "loss": 0.7975, "step": 5470 }, { "epoch": 1.7810670136629798, "grad_norm": 2.052583932876587, "learning_rate": 3.647192604373658e-05, "loss": 0.8173, "step": 5475 }, { "epoch": 1.7826935588809367, "grad_norm": 1.5014948844909668, "learning_rate": 3.644898926669524e-05, "loss": 0.781, "step": 5480 }, { "epoch": 1.784320104098894, "grad_norm": 1.6949294805526733, "learning_rate": 3.6426040288360674e-05, "loss": 0.8263, "step": 5485 }, { "epoch": 1.785946649316851, "grad_norm": 1.567044734954834, "learning_rate": 3.640307913318982e-05, "loss": 0.8007, "step": 5490 }, { "epoch": 1.7875731945348081, "grad_norm": 1.6042593717575073, "learning_rate": 3.638010582565257e-05, "loss": 0.7956, "step": 5495 }, { "epoch": 1.7891997397527653, "grad_norm": 1.9586327075958252, "learning_rate": 3.6357120390231825e-05, "loss": 0.8122, "step": 5500 }, { "epoch": 1.7908262849707222, "grad_norm": 5.827934265136719, "learning_rate": 3.6334122851423344e-05, "loss": 0.8541, "step": 5505 }, { "epoch": 1.7924528301886793, "grad_norm": 1.5693747997283936, "learning_rate": 3.6311113233735836e-05, "loss": 0.8042, "step": 5510 }, { "epoch": 1.7940793754066364, "grad_norm": 1.6549158096313477, "learning_rate": 3.6288091561690855e-05, "loss": 0.8172, "step": 5515 }, { "epoch": 1.7957059206245933, "grad_norm": 1.5341600179672241, "learning_rate": 3.626505785982281e-05, "loss": 0.8073, "step": 5520 }, { "epoch": 1.7973324658425505, "grad_norm": 4.368505954742432, "learning_rate": 3.6242012152678925e-05, "loss": 0.7982, "step": 5525 }, { "epoch": 1.7989590110605076, "grad_norm": 1.8337335586547852, "learning_rate": 3.6218954464819224e-05, "loss": 0.7842, "step": 5530 }, { "epoch": 1.8005855562784645, "grad_norm": 1.8748178482055664, "learning_rate": 3.61958848208165e-05, "loss": 0.7305, "step": 5535 }, { "epoch": 1.8022121014964216, "grad_norm": 1.6605912446975708, "learning_rate": 3.6172803245256284e-05, "loss": 0.7669, "step": 5540 }, { "epoch": 1.8038386467143788, "grad_norm": 1.788952350616455, "learning_rate": 3.614970976273681e-05, "loss": 0.7843, "step": 5545 }, { "epoch": 1.8054651919323357, "grad_norm": 1.8774856328964233, "learning_rate": 3.612660439786904e-05, "loss": 0.7941, "step": 5550 }, { "epoch": 1.8070917371502928, "grad_norm": 1.7823916673660278, "learning_rate": 3.6103487175276564e-05, "loss": 0.7977, "step": 5555 }, { "epoch": 1.80871828236825, "grad_norm": 2.3198554515838623, "learning_rate": 3.608035811959561e-05, "loss": 0.8222, "step": 5560 }, { "epoch": 1.8103448275862069, "grad_norm": 2.542912244796753, "learning_rate": 3.6057217255475034e-05, "loss": 0.7993, "step": 5565 }, { "epoch": 1.811971372804164, "grad_norm": 2.11651349067688, "learning_rate": 3.603406460757627e-05, "loss": 0.8015, "step": 5570 }, { "epoch": 1.8135979180221211, "grad_norm": 2.6846766471862793, "learning_rate": 3.601090020057329e-05, "loss": 0.81, "step": 5575 }, { "epoch": 1.815224463240078, "grad_norm": 1.87996244430542, "learning_rate": 3.598772405915264e-05, "loss": 0.8084, "step": 5580 }, { "epoch": 1.8168510084580352, "grad_norm": 1.4050390720367432, "learning_rate": 3.596453620801334e-05, "loss": 0.7983, "step": 5585 }, { "epoch": 1.8184775536759923, "grad_norm": 1.6851388216018677, "learning_rate": 3.594133667186688e-05, "loss": 0.7948, "step": 5590 }, { "epoch": 1.8201040988939492, "grad_norm": 1.6844091415405273, "learning_rate": 3.591812547543725e-05, "loss": 0.7935, "step": 5595 }, { "epoch": 1.8217306441119063, "grad_norm": 1.704921841621399, "learning_rate": 3.58949026434608e-05, "loss": 0.7882, "step": 5600 }, { "epoch": 1.8233571893298635, "grad_norm": 1.488486409187317, "learning_rate": 3.587166820068635e-05, "loss": 0.8135, "step": 5605 }, { "epoch": 1.8249837345478204, "grad_norm": 1.907758355140686, "learning_rate": 3.584842217187503e-05, "loss": 0.8007, "step": 5610 }, { "epoch": 1.8266102797657775, "grad_norm": 1.9598406553268433, "learning_rate": 3.582516458180036e-05, "loss": 0.78, "step": 5615 }, { "epoch": 1.8282368249837346, "grad_norm": 1.5850437879562378, "learning_rate": 3.580189545524818e-05, "loss": 0.7971, "step": 5620 }, { "epoch": 1.8298633702016915, "grad_norm": 1.4961295127868652, "learning_rate": 3.577861481701659e-05, "loss": 0.8059, "step": 5625 }, { "epoch": 1.8314899154196487, "grad_norm": 1.4719113111495972, "learning_rate": 3.575532269191599e-05, "loss": 0.7722, "step": 5630 }, { "epoch": 1.8331164606376058, "grad_norm": 1.7238733768463135, "learning_rate": 3.573201910476902e-05, "loss": 0.7766, "step": 5635 }, { "epoch": 1.8347430058555627, "grad_norm": 2.2540855407714844, "learning_rate": 3.57087040804105e-05, "loss": 0.8187, "step": 5640 }, { "epoch": 1.8363695510735198, "grad_norm": 1.552124261856079, "learning_rate": 3.568537764368751e-05, "loss": 0.7887, "step": 5645 }, { "epoch": 1.837996096291477, "grad_norm": 1.6686851978302002, "learning_rate": 3.566203981945921e-05, "loss": 0.8184, "step": 5650 }, { "epoch": 1.8396226415094339, "grad_norm": 1.840654730796814, "learning_rate": 3.5638690632596956e-05, "loss": 0.7635, "step": 5655 }, { "epoch": 1.841249186727391, "grad_norm": 1.8260648250579834, "learning_rate": 3.561533010798418e-05, "loss": 0.766, "step": 5660 }, { "epoch": 1.8428757319453482, "grad_norm": 1.6572096347808838, "learning_rate": 3.559195827051641e-05, "loss": 0.8138, "step": 5665 }, { "epoch": 1.844502277163305, "grad_norm": 1.450379490852356, "learning_rate": 3.556857514510123e-05, "loss": 0.7801, "step": 5670 }, { "epoch": 1.8461288223812622, "grad_norm": 1.6766986846923828, "learning_rate": 3.554518075665826e-05, "loss": 0.7615, "step": 5675 }, { "epoch": 1.8477553675992193, "grad_norm": 1.8202241659164429, "learning_rate": 3.5521775130119095e-05, "loss": 0.8211, "step": 5680 }, { "epoch": 1.8493819128171762, "grad_norm": 1.6854424476623535, "learning_rate": 3.549835829042735e-05, "loss": 0.848, "step": 5685 }, { "epoch": 1.8510084580351334, "grad_norm": 1.4499651193618774, "learning_rate": 3.547493026253854e-05, "loss": 0.788, "step": 5690 }, { "epoch": 1.8526350032530905, "grad_norm": 1.5754683017730713, "learning_rate": 3.545149107142016e-05, "loss": 0.7962, "step": 5695 }, { "epoch": 1.8542615484710474, "grad_norm": 1.6627172231674194, "learning_rate": 3.542804074205155e-05, "loss": 0.8025, "step": 5700 }, { "epoch": 1.8558880936890045, "grad_norm": 1.7090286016464233, "learning_rate": 3.5404579299423944e-05, "loss": 0.7991, "step": 5705 }, { "epoch": 1.8575146389069617, "grad_norm": 1.612816572189331, "learning_rate": 3.5381106768540426e-05, "loss": 0.7932, "step": 5710 }, { "epoch": 1.8591411841249186, "grad_norm": 1.7315623760223389, "learning_rate": 3.5357623174415886e-05, "loss": 0.7914, "step": 5715 }, { "epoch": 1.8607677293428757, "grad_norm": 1.6280250549316406, "learning_rate": 3.5334128542077004e-05, "loss": 0.7666, "step": 5720 }, { "epoch": 1.8623942745608328, "grad_norm": 2.159470796585083, "learning_rate": 3.531062289656223e-05, "loss": 0.826, "step": 5725 }, { "epoch": 1.8640208197787898, "grad_norm": 1.5661296844482422, "learning_rate": 3.528710626292174e-05, "loss": 0.8098, "step": 5730 }, { "epoch": 1.8656473649967469, "grad_norm": 1.5078445672988892, "learning_rate": 3.5263578666217426e-05, "loss": 0.8125, "step": 5735 }, { "epoch": 1.867273910214704, "grad_norm": 1.8959999084472656, "learning_rate": 3.5240040131522876e-05, "loss": 0.7917, "step": 5740 }, { "epoch": 1.868900455432661, "grad_norm": 2.221889019012451, "learning_rate": 3.5216490683923306e-05, "loss": 0.7969, "step": 5745 }, { "epoch": 1.870527000650618, "grad_norm": 1.801939845085144, "learning_rate": 3.519293034851559e-05, "loss": 0.8124, "step": 5750 }, { "epoch": 1.8721535458685752, "grad_norm": 1.5591964721679688, "learning_rate": 3.51693591504082e-05, "loss": 0.8293, "step": 5755 }, { "epoch": 1.873780091086532, "grad_norm": 1.4893189668655396, "learning_rate": 3.5150494387658796e-05, "loss": 0.7798, "step": 5760 }, { "epoch": 1.8754066363044892, "grad_norm": 1.6666206121444702, "learning_rate": 3.512690370000224e-05, "loss": 0.7907, "step": 5765 }, { "epoch": 1.8770331815224464, "grad_norm": 1.7753440141677856, "learning_rate": 3.5103302220011216e-05, "loss": 0.7612, "step": 5770 }, { "epoch": 1.8786597267404033, "grad_norm": 1.8589496612548828, "learning_rate": 3.5079689972838033e-05, "loss": 0.797, "step": 5775 }, { "epoch": 1.8802862719583604, "grad_norm": 1.7949668169021606, "learning_rate": 3.505606698364648e-05, "loss": 0.8046, "step": 5780 }, { "epoch": 1.8819128171763175, "grad_norm": 1.6644870042800903, "learning_rate": 3.503243327761179e-05, "loss": 0.7843, "step": 5785 }, { "epoch": 1.8835393623942744, "grad_norm": 1.7566967010498047, "learning_rate": 3.500878887992063e-05, "loss": 0.792, "step": 5790 }, { "epoch": 1.8851659076122316, "grad_norm": 1.666772484779358, "learning_rate": 3.4985133815771037e-05, "loss": 0.792, "step": 5795 }, { "epoch": 1.8867924528301887, "grad_norm": 1.5666133165359497, "learning_rate": 3.4961468110372445e-05, "loss": 0.7709, "step": 5800 }, { "epoch": 1.8884189980481456, "grad_norm": 1.8184480667114258, "learning_rate": 3.4937791788945615e-05, "loss": 0.7961, "step": 5805 }, { "epoch": 1.8900455432661027, "grad_norm": 1.7216947078704834, "learning_rate": 3.49141048767226e-05, "loss": 0.7997, "step": 5810 }, { "epoch": 1.8916720884840599, "grad_norm": 2.7872517108917236, "learning_rate": 3.489040739894679e-05, "loss": 0.7889, "step": 5815 }, { "epoch": 1.8932986337020168, "grad_norm": 1.7658991813659668, "learning_rate": 3.486669938087278e-05, "loss": 0.8139, "step": 5820 }, { "epoch": 1.8949251789199741, "grad_norm": 1.5896073579788208, "learning_rate": 3.484298084776644e-05, "loss": 0.8105, "step": 5825 }, { "epoch": 1.896551724137931, "grad_norm": 1.5800156593322754, "learning_rate": 3.4819251824904814e-05, "loss": 0.7968, "step": 5830 }, { "epoch": 1.898178269355888, "grad_norm": 1.7342997789382935, "learning_rate": 3.479551233757616e-05, "loss": 0.8007, "step": 5835 }, { "epoch": 1.8998048145738453, "grad_norm": 2.0258543491363525, "learning_rate": 3.477176241107985e-05, "loss": 0.778, "step": 5840 }, { "epoch": 1.9014313597918022, "grad_norm": 1.7815632820129395, "learning_rate": 3.47480020707264e-05, "loss": 0.7997, "step": 5845 }, { "epoch": 1.9030579050097591, "grad_norm": 1.4552288055419922, "learning_rate": 3.4724231341837446e-05, "loss": 0.8031, "step": 5850 }, { "epoch": 1.9046844502277165, "grad_norm": 1.822088360786438, "learning_rate": 3.470045024974564e-05, "loss": 0.8377, "step": 5855 }, { "epoch": 1.9063109954456734, "grad_norm": 1.5762214660644531, "learning_rate": 3.467665881979473e-05, "loss": 0.825, "step": 5860 }, { "epoch": 1.9079375406636303, "grad_norm": 1.3905141353607178, "learning_rate": 3.4652857077339465e-05, "loss": 0.8005, "step": 5865 }, { "epoch": 1.9095640858815877, "grad_norm": 2.5645837783813477, "learning_rate": 3.4629045047745566e-05, "loss": 0.7714, "step": 5870 }, { "epoch": 1.9111906310995446, "grad_norm": 1.9059853553771973, "learning_rate": 3.460522275638974e-05, "loss": 0.7933, "step": 5875 }, { "epoch": 1.9128171763175015, "grad_norm": 1.685237169265747, "learning_rate": 3.4581390228659634e-05, "loss": 0.786, "step": 5880 }, { "epoch": 1.9144437215354588, "grad_norm": 1.6190307140350342, "learning_rate": 3.455754748995377e-05, "loss": 0.8037, "step": 5885 }, { "epoch": 1.9160702667534157, "grad_norm": 1.8023563623428345, "learning_rate": 3.453369456568159e-05, "loss": 0.7781, "step": 5890 }, { "epoch": 1.9176968119713727, "grad_norm": 1.6431732177734375, "learning_rate": 3.450983148126337e-05, "loss": 0.8081, "step": 5895 }, { "epoch": 1.91932335718933, "grad_norm": 1.851529836654663, "learning_rate": 3.4485958262130215e-05, "loss": 0.8115, "step": 5900 }, { "epoch": 1.920949902407287, "grad_norm": 1.6920840740203857, "learning_rate": 3.446207493372404e-05, "loss": 0.7958, "step": 5905 }, { "epoch": 1.9225764476252438, "grad_norm": 2.3522324562072754, "learning_rate": 3.4438181521497525e-05, "loss": 0.7921, "step": 5910 }, { "epoch": 1.9242029928432012, "grad_norm": 2.0344767570495605, "learning_rate": 3.44142780509141e-05, "loss": 0.788, "step": 5915 }, { "epoch": 1.925829538061158, "grad_norm": 1.7460458278656006, "learning_rate": 3.439036454744791e-05, "loss": 0.8313, "step": 5920 }, { "epoch": 1.927456083279115, "grad_norm": 1.6961497068405151, "learning_rate": 3.43664410365838e-05, "loss": 0.7826, "step": 5925 }, { "epoch": 1.9290826284970723, "grad_norm": 2.1205198764801025, "learning_rate": 3.434250754381728e-05, "loss": 0.7765, "step": 5930 }, { "epoch": 1.9307091737150293, "grad_norm": 1.5347671508789062, "learning_rate": 3.4318564094654484e-05, "loss": 0.7735, "step": 5935 }, { "epoch": 1.9323357189329864, "grad_norm": 1.7982996702194214, "learning_rate": 3.4294610714612176e-05, "loss": 0.8435, "step": 5940 }, { "epoch": 1.9339622641509435, "grad_norm": 1.7098652124404907, "learning_rate": 3.427064742921768e-05, "loss": 0.7757, "step": 5945 }, { "epoch": 1.9355888093689004, "grad_norm": 1.446061611175537, "learning_rate": 3.42466742640089e-05, "loss": 0.8333, "step": 5950 }, { "epoch": 1.9372153545868576, "grad_norm": 1.6027209758758545, "learning_rate": 3.4222691244534253e-05, "loss": 0.7904, "step": 5955 }, { "epoch": 1.9388418998048147, "grad_norm": 1.6385594606399536, "learning_rate": 3.419869839635267e-05, "loss": 0.7825, "step": 5960 }, { "epoch": 1.9404684450227716, "grad_norm": 1.4547863006591797, "learning_rate": 3.417469574503356e-05, "loss": 0.7726, "step": 5965 }, { "epoch": 1.9420949902407287, "grad_norm": 1.7256838083267212, "learning_rate": 3.415068331615674e-05, "loss": 0.7763, "step": 5970 }, { "epoch": 1.9437215354586859, "grad_norm": 1.7924857139587402, "learning_rate": 3.41266611353125e-05, "loss": 0.8119, "step": 5975 }, { "epoch": 1.9453480806766428, "grad_norm": 1.5046923160552979, "learning_rate": 3.4102629228101494e-05, "loss": 0.7997, "step": 5980 }, { "epoch": 1.9469746258946, "grad_norm": 1.7069694995880127, "learning_rate": 3.407858762013474e-05, "loss": 0.8143, "step": 5985 }, { "epoch": 1.948601171112557, "grad_norm": 2.707512140274048, "learning_rate": 3.405453633703361e-05, "loss": 0.7946, "step": 5990 }, { "epoch": 1.950227716330514, "grad_norm": 1.4022822380065918, "learning_rate": 3.4030475404429776e-05, "loss": 0.7807, "step": 5995 }, { "epoch": 1.951854261548471, "grad_norm": 1.9132970571517944, "learning_rate": 3.4006404847965196e-05, "loss": 0.7888, "step": 6000 }, { "epoch": 1.9534808067664282, "grad_norm": 2.214003324508667, "learning_rate": 3.398232469329207e-05, "loss": 0.7712, "step": 6005 }, { "epoch": 1.9551073519843851, "grad_norm": 3.6037864685058594, "learning_rate": 3.395823496607286e-05, "loss": 0.8125, "step": 6010 }, { "epoch": 1.9567338972023423, "grad_norm": 1.6346657276153564, "learning_rate": 3.39341356919802e-05, "loss": 0.8014, "step": 6015 }, { "epoch": 1.9583604424202994, "grad_norm": 1.836584448814392, "learning_rate": 3.39100268966969e-05, "loss": 0.792, "step": 6020 }, { "epoch": 1.9599869876382563, "grad_norm": 1.9783848524093628, "learning_rate": 3.388590860591595e-05, "loss": 0.7552, "step": 6025 }, { "epoch": 1.9616135328562134, "grad_norm": 1.9278146028518677, "learning_rate": 3.3861780845340406e-05, "loss": 0.7921, "step": 6030 }, { "epoch": 1.9632400780741706, "grad_norm": 2.9828009605407715, "learning_rate": 3.383764364068346e-05, "loss": 0.7936, "step": 6035 }, { "epoch": 1.9648666232921275, "grad_norm": 2.507549524307251, "learning_rate": 3.381349701766835e-05, "loss": 0.7761, "step": 6040 }, { "epoch": 1.9664931685100846, "grad_norm": 2.000004768371582, "learning_rate": 3.3789341002028363e-05, "loss": 0.8025, "step": 6045 }, { "epoch": 1.9681197137280417, "grad_norm": 1.9173941612243652, "learning_rate": 3.376517561950677e-05, "loss": 0.7907, "step": 6050 }, { "epoch": 1.9697462589459986, "grad_norm": 2.1533915996551514, "learning_rate": 3.374100089585685e-05, "loss": 0.7914, "step": 6055 }, { "epoch": 1.9713728041639558, "grad_norm": 1.5206011533737183, "learning_rate": 3.371681685684184e-05, "loss": 0.7902, "step": 6060 }, { "epoch": 1.972999349381913, "grad_norm": 1.5686813592910767, "learning_rate": 3.3692623528234865e-05, "loss": 0.7862, "step": 6065 }, { "epoch": 1.9746258945998698, "grad_norm": 1.5873268842697144, "learning_rate": 3.366842093581901e-05, "loss": 0.7717, "step": 6070 }, { "epoch": 1.976252439817827, "grad_norm": 1.777116060256958, "learning_rate": 3.364420910538718e-05, "loss": 0.7947, "step": 6075 }, { "epoch": 1.977878985035784, "grad_norm": 2.015531301498413, "learning_rate": 3.361998806274214e-05, "loss": 0.785, "step": 6080 }, { "epoch": 1.979505530253741, "grad_norm": 1.6734198331832886, "learning_rate": 3.3595757833696505e-05, "loss": 0.7841, "step": 6085 }, { "epoch": 1.9811320754716981, "grad_norm": 2.073692560195923, "learning_rate": 3.3571518444072626e-05, "loss": 0.7715, "step": 6090 }, { "epoch": 1.9827586206896552, "grad_norm": 1.8034336566925049, "learning_rate": 3.354726991970266e-05, "loss": 0.8163, "step": 6095 }, { "epoch": 1.9843851659076122, "grad_norm": 1.5698885917663574, "learning_rate": 3.352301228642847e-05, "loss": 0.786, "step": 6100 }, { "epoch": 1.9860117111255693, "grad_norm": 1.3378742933273315, "learning_rate": 3.349874557010166e-05, "loss": 0.7908, "step": 6105 }, { "epoch": 1.9876382563435264, "grad_norm": 1.6845659017562866, "learning_rate": 3.3474469796583477e-05, "loss": 0.8112, "step": 6110 }, { "epoch": 1.9892648015614833, "grad_norm": 1.834529995918274, "learning_rate": 3.345018499174482e-05, "loss": 0.7599, "step": 6115 }, { "epoch": 1.9908913467794405, "grad_norm": 1.9633735418319702, "learning_rate": 3.3425891181466275e-05, "loss": 0.825, "step": 6120 }, { "epoch": 1.9925178919973976, "grad_norm": 1.81381094455719, "learning_rate": 3.340158839163794e-05, "loss": 0.7793, "step": 6125 }, { "epoch": 1.9941444372153545, "grad_norm": 1.7863167524337769, "learning_rate": 3.337727664815952e-05, "loss": 0.7549, "step": 6130 }, { "epoch": 1.9957709824333116, "grad_norm": 2.0586788654327393, "learning_rate": 3.335295597694029e-05, "loss": 0.7701, "step": 6135 }, { "epoch": 1.9973975276512688, "grad_norm": 1.8435872793197632, "learning_rate": 3.3328626403899e-05, "loss": 0.7667, "step": 6140 }, { "epoch": 1.9990240728692257, "grad_norm": 1.6310111284255981, "learning_rate": 3.33042879549639e-05, "loss": 0.7994, "step": 6145 }, { "epoch": 2.0, "eval_f1": 0.8204322346195636, "eval_loss": 0.4052734375, "eval_precision": 0.8203424383778088, "eval_recall": 0.8207026062554238, "eval_runtime": 1029.4017, "eval_samples_per_second": 382.198, "eval_steps_per_second": 0.747, "step": 6148 }, { "epoch": 2.000650618087183, "grad_norm": 2.080751419067383, "learning_rate": 3.3279940656072706e-05, "loss": 0.7787, "step": 6150 }, { "epoch": 2.00227716330514, "grad_norm": 2.657672643661499, "learning_rate": 3.325558453317254e-05, "loss": 0.7258, "step": 6155 }, { "epoch": 2.003903708523097, "grad_norm": 2.20680570602417, "learning_rate": 3.323121961221996e-05, "loss": 0.7325, "step": 6160 }, { "epoch": 2.005530253741054, "grad_norm": 2.3011794090270996, "learning_rate": 3.320684591918089e-05, "loss": 0.7722, "step": 6165 }, { "epoch": 2.007156798959011, "grad_norm": 1.6290371417999268, "learning_rate": 3.3182463480030595e-05, "loss": 0.7134, "step": 6170 }, { "epoch": 2.008783344176968, "grad_norm": 1.6625546216964722, "learning_rate": 3.3158072320753664e-05, "loss": 0.737, "step": 6175 }, { "epoch": 2.0104098893949254, "grad_norm": 1.832472562789917, "learning_rate": 3.3133672467343965e-05, "loss": 0.7456, "step": 6180 }, { "epoch": 2.0120364346128823, "grad_norm": 1.8868688344955444, "learning_rate": 3.310926394580468e-05, "loss": 0.7438, "step": 6185 }, { "epoch": 2.013662979830839, "grad_norm": 2.2079997062683105, "learning_rate": 3.308484678214817e-05, "loss": 0.7365, "step": 6190 }, { "epoch": 2.0152895250487965, "grad_norm": 2.1085147857666016, "learning_rate": 3.306042100239602e-05, "loss": 0.7352, "step": 6195 }, { "epoch": 2.0169160702667535, "grad_norm": 2.2749311923980713, "learning_rate": 3.303598663257904e-05, "loss": 0.7529, "step": 6200 }, { "epoch": 2.0185426154847104, "grad_norm": 3.618288516998291, "learning_rate": 3.3011543698737155e-05, "loss": 0.7432, "step": 6205 }, { "epoch": 2.0201691607026677, "grad_norm": 2.5536632537841797, "learning_rate": 3.29870922269194e-05, "loss": 0.7465, "step": 6210 }, { "epoch": 2.0217957059206246, "grad_norm": 1.9676142930984497, "learning_rate": 3.296263224318397e-05, "loss": 0.7615, "step": 6215 }, { "epoch": 2.0234222511385815, "grad_norm": 1.650276780128479, "learning_rate": 3.2938163773598065e-05, "loss": 0.7579, "step": 6220 }, { "epoch": 2.025048796356539, "grad_norm": 1.7215265035629272, "learning_rate": 3.291368684423796e-05, "loss": 0.7333, "step": 6225 }, { "epoch": 2.026675341574496, "grad_norm": 1.7647247314453125, "learning_rate": 3.288920148118896e-05, "loss": 0.7282, "step": 6230 }, { "epoch": 2.0283018867924527, "grad_norm": 1.90190851688385, "learning_rate": 3.286470771054534e-05, "loss": 0.7317, "step": 6235 }, { "epoch": 2.02992843201041, "grad_norm": 1.6041595935821533, "learning_rate": 3.284020555841033e-05, "loss": 0.7003, "step": 6240 }, { "epoch": 2.031554977228367, "grad_norm": 1.7647874355316162, "learning_rate": 3.2815695050896093e-05, "loss": 0.7459, "step": 6245 }, { "epoch": 2.033181522446324, "grad_norm": 1.9271812438964844, "learning_rate": 3.279117621412372e-05, "loss": 0.7299, "step": 6250 }, { "epoch": 2.0348080676642812, "grad_norm": 1.810985803604126, "learning_rate": 3.2766649074223145e-05, "loss": 0.7284, "step": 6255 }, { "epoch": 2.036434612882238, "grad_norm": 1.575209379196167, "learning_rate": 3.274211365733317e-05, "loss": 0.7332, "step": 6260 }, { "epoch": 2.038061158100195, "grad_norm": 2.2633728981018066, "learning_rate": 3.271756998960143e-05, "loss": 0.7421, "step": 6265 }, { "epoch": 2.0396877033181524, "grad_norm": 2.159065008163452, "learning_rate": 3.2693018097184314e-05, "loss": 0.7199, "step": 6270 }, { "epoch": 2.0413142485361093, "grad_norm": 1.732470154762268, "learning_rate": 3.2668458006247e-05, "loss": 0.7401, "step": 6275 }, { "epoch": 2.0429407937540662, "grad_norm": 2.0348281860351562, "learning_rate": 3.2643889742963434e-05, "loss": 0.7274, "step": 6280 }, { "epoch": 2.0445673389720236, "grad_norm": 1.893390417098999, "learning_rate": 3.261931333351621e-05, "loss": 0.7055, "step": 6285 }, { "epoch": 2.0461938841899805, "grad_norm": 2.049177646636963, "learning_rate": 3.259472880409664e-05, "loss": 0.7244, "step": 6290 }, { "epoch": 2.0478204294079374, "grad_norm": 1.857323169708252, "learning_rate": 3.2570136180904696e-05, "loss": 0.7428, "step": 6295 }, { "epoch": 2.0494469746258948, "grad_norm": 1.827819585800171, "learning_rate": 3.2545535490148955e-05, "loss": 0.7575, "step": 6300 }, { "epoch": 2.0510735198438517, "grad_norm": 1.724376916885376, "learning_rate": 3.2520926758046586e-05, "loss": 0.7204, "step": 6305 }, { "epoch": 2.0527000650618086, "grad_norm": 1.9833449125289917, "learning_rate": 3.249631001082336e-05, "loss": 0.7377, "step": 6310 }, { "epoch": 2.054326610279766, "grad_norm": 1.9445111751556396, "learning_rate": 3.247168527471357e-05, "loss": 0.7548, "step": 6315 }, { "epoch": 2.055953155497723, "grad_norm": 1.842955470085144, "learning_rate": 3.244705257596001e-05, "loss": 0.7519, "step": 6320 }, { "epoch": 2.0575797007156797, "grad_norm": 1.5295389890670776, "learning_rate": 3.242241194081398e-05, "loss": 0.7478, "step": 6325 }, { "epoch": 2.059206245933637, "grad_norm": 2.5027713775634766, "learning_rate": 3.2397763395535244e-05, "loss": 0.7513, "step": 6330 }, { "epoch": 2.060832791151594, "grad_norm": 1.6601084470748901, "learning_rate": 3.2373106966391965e-05, "loss": 0.6987, "step": 6335 }, { "epoch": 2.062459336369551, "grad_norm": 2.04126238822937, "learning_rate": 3.234844267966074e-05, "loss": 0.7315, "step": 6340 }, { "epoch": 2.0640858815875083, "grad_norm": 1.687272548675537, "learning_rate": 3.232377056162652e-05, "loss": 0.7296, "step": 6345 }, { "epoch": 2.065712426805465, "grad_norm": 1.9358770847320557, "learning_rate": 3.229909063858261e-05, "loss": 0.7125, "step": 6350 }, { "epoch": 2.067338972023422, "grad_norm": 1.9139622449874878, "learning_rate": 3.2274402936830626e-05, "loss": 0.7454, "step": 6355 }, { "epoch": 2.0689655172413794, "grad_norm": 2.3934218883514404, "learning_rate": 3.224970748268048e-05, "loss": 0.7467, "step": 6360 }, { "epoch": 2.0705920624593364, "grad_norm": 2.055274486541748, "learning_rate": 3.2225004302450354e-05, "loss": 0.7388, "step": 6365 }, { "epoch": 2.0722186076772933, "grad_norm": 1.8648474216461182, "learning_rate": 3.220029342246665e-05, "loss": 0.7241, "step": 6370 }, { "epoch": 2.0738451528952506, "grad_norm": 2.0889534950256348, "learning_rate": 3.217557486906398e-05, "loss": 0.7711, "step": 6375 }, { "epoch": 2.0754716981132075, "grad_norm": 1.6840265989303589, "learning_rate": 3.215084866858513e-05, "loss": 0.7331, "step": 6380 }, { "epoch": 2.0770982433311644, "grad_norm": 1.883111596107483, "learning_rate": 3.212611484738105e-05, "loss": 0.7269, "step": 6385 }, { "epoch": 2.078724788549122, "grad_norm": 1.9425476789474487, "learning_rate": 3.210137343181078e-05, "loss": 0.7518, "step": 6390 }, { "epoch": 2.0803513337670787, "grad_norm": 1.6710432767868042, "learning_rate": 3.20766244482415e-05, "loss": 0.7307, "step": 6395 }, { "epoch": 2.0819778789850356, "grad_norm": 1.703465461730957, "learning_rate": 3.2051867923048415e-05, "loss": 0.7412, "step": 6400 }, { "epoch": 2.083604424202993, "grad_norm": 1.7874739170074463, "learning_rate": 3.2027103882614774e-05, "loss": 0.7208, "step": 6405 }, { "epoch": 2.08523096942095, "grad_norm": 3.441638708114624, "learning_rate": 3.200233235333186e-05, "loss": 0.7439, "step": 6410 }, { "epoch": 2.086857514638907, "grad_norm": 2.2441136837005615, "learning_rate": 3.197755336159891e-05, "loss": 0.721, "step": 6415 }, { "epoch": 2.088484059856864, "grad_norm": 2.098034620285034, "learning_rate": 3.195276693382313e-05, "loss": 0.7125, "step": 6420 }, { "epoch": 2.090110605074821, "grad_norm": 2.6204288005828857, "learning_rate": 3.192797309641964e-05, "loss": 0.7352, "step": 6425 }, { "epoch": 2.091737150292778, "grad_norm": 1.990392804145813, "learning_rate": 3.190317187581146e-05, "loss": 0.7655, "step": 6430 }, { "epoch": 2.0933636955107353, "grad_norm": 1.7657074928283691, "learning_rate": 3.187836329842947e-05, "loss": 0.7603, "step": 6435 }, { "epoch": 2.094990240728692, "grad_norm": 1.7322503328323364, "learning_rate": 3.185354739071242e-05, "loss": 0.7477, "step": 6440 }, { "epoch": 2.096616785946649, "grad_norm": 1.8331769704818726, "learning_rate": 3.182872417910684e-05, "loss": 0.7181, "step": 6445 }, { "epoch": 2.0982433311646065, "grad_norm": 1.7432118654251099, "learning_rate": 3.1803893690067046e-05, "loss": 0.7255, "step": 6450 }, { "epoch": 2.0998698763825634, "grad_norm": 1.8132730722427368, "learning_rate": 3.177905595005514e-05, "loss": 0.7447, "step": 6455 }, { "epoch": 2.1014964216005203, "grad_norm": 1.6996136903762817, "learning_rate": 3.175421098554091e-05, "loss": 0.7374, "step": 6460 }, { "epoch": 2.1031229668184777, "grad_norm": 2.2226970195770264, "learning_rate": 3.172935882300187e-05, "loss": 0.7402, "step": 6465 }, { "epoch": 2.1047495120364346, "grad_norm": 2.5579731464385986, "learning_rate": 3.1704499488923204e-05, "loss": 0.7471, "step": 6470 }, { "epoch": 2.1063760572543915, "grad_norm": 1.8817853927612305, "learning_rate": 3.167963300979773e-05, "loss": 0.7068, "step": 6475 }, { "epoch": 2.108002602472349, "grad_norm": 1.7366390228271484, "learning_rate": 3.165475941212588e-05, "loss": 0.7132, "step": 6480 }, { "epoch": 2.1096291476903057, "grad_norm": 1.832213282585144, "learning_rate": 3.162987872241568e-05, "loss": 0.7295, "step": 6485 }, { "epoch": 2.1112556929082626, "grad_norm": 1.9638258218765259, "learning_rate": 3.160499096718271e-05, "loss": 0.7532, "step": 6490 }, { "epoch": 2.11288223812622, "grad_norm": 1.7910124063491821, "learning_rate": 3.1580096172950076e-05, "loss": 0.718, "step": 6495 }, { "epoch": 2.114508783344177, "grad_norm": 1.870341181755066, "learning_rate": 3.155519436624839e-05, "loss": 0.7315, "step": 6500 }, { "epoch": 2.116135328562134, "grad_norm": 1.8709512948989868, "learning_rate": 3.1530285573615744e-05, "loss": 0.7133, "step": 6505 }, { "epoch": 2.117761873780091, "grad_norm": 1.7289118766784668, "learning_rate": 3.150536982159766e-05, "loss": 0.7378, "step": 6510 }, { "epoch": 2.119388418998048, "grad_norm": 1.747144341468811, "learning_rate": 3.148044713674708e-05, "loss": 0.7342, "step": 6515 }, { "epoch": 2.121014964216005, "grad_norm": 1.8033082485198975, "learning_rate": 3.145551754562435e-05, "loss": 0.7491, "step": 6520 }, { "epoch": 2.1226415094339623, "grad_norm": 1.4740681648254395, "learning_rate": 3.143058107479716e-05, "loss": 0.7274, "step": 6525 }, { "epoch": 2.1242680546519193, "grad_norm": 1.6742298603057861, "learning_rate": 3.1405637750840536e-05, "loss": 0.7452, "step": 6530 }, { "epoch": 2.125894599869876, "grad_norm": 2.113196611404419, "learning_rate": 3.1380687600336804e-05, "loss": 0.7333, "step": 6535 }, { "epoch": 2.1275211450878335, "grad_norm": 2.39341402053833, "learning_rate": 3.135573064987558e-05, "loss": 0.7318, "step": 6540 }, { "epoch": 2.1291476903057904, "grad_norm": 1.6623047590255737, "learning_rate": 3.1330766926053696e-05, "loss": 0.732, "step": 6545 }, { "epoch": 2.130774235523748, "grad_norm": 3.4303982257843018, "learning_rate": 3.130579645547525e-05, "loss": 0.7304, "step": 6550 }, { "epoch": 2.1324007807417047, "grad_norm": 1.9972918033599854, "learning_rate": 3.128081926475148e-05, "loss": 0.7587, "step": 6555 }, { "epoch": 2.1340273259596616, "grad_norm": 2.0030083656311035, "learning_rate": 3.1255835380500816e-05, "loss": 0.7177, "step": 6560 }, { "epoch": 2.1356538711776185, "grad_norm": 1.7061758041381836, "learning_rate": 3.12308448293488e-05, "loss": 0.699, "step": 6565 }, { "epoch": 2.137280416395576, "grad_norm": 2.000803232192993, "learning_rate": 3.1205847637928126e-05, "loss": 0.7215, "step": 6570 }, { "epoch": 2.1389069616135328, "grad_norm": 1.9984855651855469, "learning_rate": 3.11808438328785e-05, "loss": 0.7449, "step": 6575 }, { "epoch": 2.14053350683149, "grad_norm": 2.0136473178863525, "learning_rate": 3.1155833440846715e-05, "loss": 0.7581, "step": 6580 }, { "epoch": 2.142160052049447, "grad_norm": 2.2597501277923584, "learning_rate": 3.113081648848658e-05, "loss": 0.714, "step": 6585 }, { "epoch": 2.143786597267404, "grad_norm": 9.528533935546875, "learning_rate": 3.110579300245889e-05, "loss": 0.7427, "step": 6590 }, { "epoch": 2.145413142485361, "grad_norm": 1.6550788879394531, "learning_rate": 3.10807630094314e-05, "loss": 0.7517, "step": 6595 }, { "epoch": 2.147039687703318, "grad_norm": 1.8862191438674927, "learning_rate": 3.105572653607881e-05, "loss": 0.7572, "step": 6600 }, { "epoch": 2.148666232921275, "grad_norm": 1.7123699188232422, "learning_rate": 3.1030683609082725e-05, "loss": 0.7123, "step": 6605 }, { "epoch": 2.1502927781392325, "grad_norm": 1.5585683584213257, "learning_rate": 3.10056342551316e-05, "loss": 0.7351, "step": 6610 }, { "epoch": 2.1519193233571894, "grad_norm": 2.117034912109375, "learning_rate": 3.098057850092079e-05, "loss": 0.7155, "step": 6615 }, { "epoch": 2.1535458685751463, "grad_norm": 2.3031628131866455, "learning_rate": 3.095551637315243e-05, "loss": 0.7411, "step": 6620 }, { "epoch": 2.1551724137931036, "grad_norm": 1.798038363456726, "learning_rate": 3.093044789853545e-05, "loss": 0.7209, "step": 6625 }, { "epoch": 2.1567989590110606, "grad_norm": 1.8484983444213867, "learning_rate": 3.090537310378558e-05, "loss": 0.7439, "step": 6630 }, { "epoch": 2.1584255042290175, "grad_norm": 2.020235300064087, "learning_rate": 3.088029201562523e-05, "loss": 0.7375, "step": 6635 }, { "epoch": 2.160052049446975, "grad_norm": 1.7193057537078857, "learning_rate": 3.085520466078357e-05, "loss": 0.7456, "step": 6640 }, { "epoch": 2.1616785946649317, "grad_norm": 1.806383490562439, "learning_rate": 3.083011106599641e-05, "loss": 0.7164, "step": 6645 }, { "epoch": 2.1633051398828886, "grad_norm": 1.6970350742340088, "learning_rate": 3.080501125800623e-05, "loss": 0.7252, "step": 6650 }, { "epoch": 2.164931685100846, "grad_norm": 1.8691726922988892, "learning_rate": 3.0779905263562135e-05, "loss": 0.7489, "step": 6655 }, { "epoch": 2.166558230318803, "grad_norm": 1.8269325494766235, "learning_rate": 3.07547931094198e-05, "loss": 0.7649, "step": 6660 }, { "epoch": 2.16818477553676, "grad_norm": 1.6941983699798584, "learning_rate": 3.072967482234148e-05, "loss": 0.7388, "step": 6665 }, { "epoch": 2.169811320754717, "grad_norm": 1.8301239013671875, "learning_rate": 3.070455042909598e-05, "loss": 0.7383, "step": 6670 }, { "epoch": 2.171437865972674, "grad_norm": 2.8997089862823486, "learning_rate": 3.067941995645858e-05, "loss": 0.7449, "step": 6675 }, { "epoch": 2.173064411190631, "grad_norm": 1.7533553838729858, "learning_rate": 3.0654283431211066e-05, "loss": 0.7708, "step": 6680 }, { "epoch": 2.1746909564085883, "grad_norm": 1.7293111085891724, "learning_rate": 3.062914088014167e-05, "loss": 0.7573, "step": 6685 }, { "epoch": 2.1763175016265452, "grad_norm": 1.7983448505401611, "learning_rate": 3.060399233004502e-05, "loss": 0.7403, "step": 6690 }, { "epoch": 2.177944046844502, "grad_norm": 2.226148843765259, "learning_rate": 3.057883780772217e-05, "loss": 0.7125, "step": 6695 }, { "epoch": 2.1795705920624595, "grad_norm": 1.9191621541976929, "learning_rate": 3.055367733998052e-05, "loss": 0.7808, "step": 6700 }, { "epoch": 2.1811971372804164, "grad_norm": 1.9563281536102295, "learning_rate": 3.0528510953633824e-05, "loss": 0.7457, "step": 6705 }, { "epoch": 2.1828236824983733, "grad_norm": 1.8121241331100464, "learning_rate": 3.0503338675502118e-05, "loss": 0.7365, "step": 6710 }, { "epoch": 2.1844502277163307, "grad_norm": 2.0334646701812744, "learning_rate": 3.0478160532411747e-05, "loss": 0.7262, "step": 6715 }, { "epoch": 2.1860767729342876, "grad_norm": 2.2769808769226074, "learning_rate": 3.0452976551195268e-05, "loss": 0.7255, "step": 6720 }, { "epoch": 2.1877033181522445, "grad_norm": 1.9302563667297363, "learning_rate": 3.0427786758691496e-05, "loss": 0.7599, "step": 6725 }, { "epoch": 2.189329863370202, "grad_norm": 1.4247409105300903, "learning_rate": 3.0402591181745428e-05, "loss": 0.7228, "step": 6730 }, { "epoch": 2.1909564085881588, "grad_norm": 1.5811034440994263, "learning_rate": 3.037738984720821e-05, "loss": 0.7201, "step": 6735 }, { "epoch": 2.1925829538061157, "grad_norm": 1.704803466796875, "learning_rate": 3.035218278193714e-05, "loss": 0.7429, "step": 6740 }, { "epoch": 2.194209499024073, "grad_norm": 1.7844103574752808, "learning_rate": 3.0326970012795626e-05, "loss": 0.7436, "step": 6745 }, { "epoch": 2.19583604424203, "grad_norm": 2.1869356632232666, "learning_rate": 3.030175156665314e-05, "loss": 0.75, "step": 6750 }, { "epoch": 2.197462589459987, "grad_norm": 1.7514694929122925, "learning_rate": 3.027652747038522e-05, "loss": 0.7291, "step": 6755 }, { "epoch": 2.199089134677944, "grad_norm": 1.5369763374328613, "learning_rate": 3.0251297750873408e-05, "loss": 0.7379, "step": 6760 }, { "epoch": 2.200715679895901, "grad_norm": 1.9312034845352173, "learning_rate": 3.022606243500526e-05, "loss": 0.7354, "step": 6765 }, { "epoch": 2.202342225113858, "grad_norm": 1.956268548965454, "learning_rate": 3.0200821549674264e-05, "loss": 0.7345, "step": 6770 }, { "epoch": 2.2039687703318154, "grad_norm": 1.5713850259780884, "learning_rate": 3.0175575121779886e-05, "loss": 0.7387, "step": 6775 }, { "epoch": 2.2055953155497723, "grad_norm": 1.7191925048828125, "learning_rate": 3.0150323178227473e-05, "loss": 0.725, "step": 6780 }, { "epoch": 2.207221860767729, "grad_norm": 1.6469066143035889, "learning_rate": 3.012506574592825e-05, "loss": 0.6947, "step": 6785 }, { "epoch": 2.2088484059856865, "grad_norm": 1.6148409843444824, "learning_rate": 3.0099802851799285e-05, "loss": 0.7261, "step": 6790 }, { "epoch": 2.2104749512036435, "grad_norm": 1.8722944259643555, "learning_rate": 3.007453452276349e-05, "loss": 0.7269, "step": 6795 }, { "epoch": 2.2121014964216004, "grad_norm": 1.997087001800537, "learning_rate": 3.0049260785749556e-05, "loss": 0.785, "step": 6800 }, { "epoch": 2.2137280416395577, "grad_norm": 1.833642601966858, "learning_rate": 3.0023981667691926e-05, "loss": 0.755, "step": 6805 }, { "epoch": 2.2153545868575146, "grad_norm": 1.352949619293213, "learning_rate": 2.9998697195530796e-05, "loss": 0.7142, "step": 6810 }, { "epoch": 2.2169811320754715, "grad_norm": 2.393751382827759, "learning_rate": 2.997340739621206e-05, "loss": 0.724, "step": 6815 }, { "epoch": 2.218607677293429, "grad_norm": 1.5960283279418945, "learning_rate": 2.994811229668729e-05, "loss": 0.716, "step": 6820 }, { "epoch": 2.220234222511386, "grad_norm": 1.8506897687911987, "learning_rate": 2.9922811923913714e-05, "loss": 0.7783, "step": 6825 }, { "epoch": 2.2218607677293427, "grad_norm": 1.6366667747497559, "learning_rate": 2.9897506304854157e-05, "loss": 0.7637, "step": 6830 }, { "epoch": 2.2234873129473, "grad_norm": 1.6875860691070557, "learning_rate": 2.9872195466477054e-05, "loss": 0.7217, "step": 6835 }, { "epoch": 2.225113858165257, "grad_norm": 1.6556414365768433, "learning_rate": 2.9846879435756415e-05, "loss": 0.6959, "step": 6840 }, { "epoch": 2.226740403383214, "grad_norm": 2.3893930912017822, "learning_rate": 2.9821558239671744e-05, "loss": 0.7626, "step": 6845 }, { "epoch": 2.2283669486011712, "grad_norm": 2.506157398223877, "learning_rate": 2.9796231905208084e-05, "loss": 0.7388, "step": 6850 }, { "epoch": 2.229993493819128, "grad_norm": 1.7940316200256348, "learning_rate": 2.977090045935594e-05, "loss": 0.7482, "step": 6855 }, { "epoch": 2.231620039037085, "grad_norm": 2.089195728302002, "learning_rate": 2.9745563929111274e-05, "loss": 0.7346, "step": 6860 }, { "epoch": 2.2332465842550424, "grad_norm": 1.5408855676651, "learning_rate": 2.9720222341475445e-05, "loss": 0.7277, "step": 6865 }, { "epoch": 2.2348731294729993, "grad_norm": 2.133648157119751, "learning_rate": 2.969487572345523e-05, "loss": 0.7337, "step": 6870 }, { "epoch": 2.2364996746909562, "grad_norm": 1.9534162282943726, "learning_rate": 2.966952410206275e-05, "loss": 0.7325, "step": 6875 }, { "epoch": 2.2381262199089136, "grad_norm": 1.9022512435913086, "learning_rate": 2.9644167504315458e-05, "loss": 0.6883, "step": 6880 }, { "epoch": 2.2397527651268705, "grad_norm": 1.4984121322631836, "learning_rate": 2.9618805957236113e-05, "loss": 0.7144, "step": 6885 }, { "epoch": 2.2413793103448274, "grad_norm": 1.6198983192443848, "learning_rate": 2.9593439487852753e-05, "loss": 0.7098, "step": 6890 }, { "epoch": 2.2430058555627848, "grad_norm": 1.6259560585021973, "learning_rate": 2.956806812319865e-05, "loss": 0.733, "step": 6895 }, { "epoch": 2.2446324007807417, "grad_norm": 1.8737263679504395, "learning_rate": 2.9542691890312312e-05, "loss": 0.7134, "step": 6900 }, { "epoch": 2.2462589459986986, "grad_norm": 1.9029572010040283, "learning_rate": 2.951731081623742e-05, "loss": 0.7171, "step": 6905 }, { "epoch": 2.247885491216656, "grad_norm": 1.9204849004745483, "learning_rate": 2.9491924928022813e-05, "loss": 0.7217, "step": 6910 }, { "epoch": 2.249512036434613, "grad_norm": 2.126492977142334, "learning_rate": 2.946653425272247e-05, "loss": 0.7355, "step": 6915 }, { "epoch": 2.2511385816525697, "grad_norm": 2.0191283226013184, "learning_rate": 2.944113881739547e-05, "loss": 0.7401, "step": 6920 }, { "epoch": 2.252765126870527, "grad_norm": 1.8509751558303833, "learning_rate": 2.9415738649105963e-05, "loss": 0.7331, "step": 6925 }, { "epoch": 2.254391672088484, "grad_norm": 2.1766791343688965, "learning_rate": 2.9390333774923124e-05, "loss": 0.7606, "step": 6930 }, { "epoch": 2.2560182173064414, "grad_norm": 1.8282063007354736, "learning_rate": 2.9364924221921185e-05, "loss": 0.7153, "step": 6935 }, { "epoch": 2.2576447625243983, "grad_norm": 1.6795986890792847, "learning_rate": 2.9339510017179332e-05, "loss": 0.7377, "step": 6940 }, { "epoch": 2.259271307742355, "grad_norm": 2.486461877822876, "learning_rate": 2.9314091187781715e-05, "loss": 0.7285, "step": 6945 }, { "epoch": 2.260897852960312, "grad_norm": 1.7224416732788086, "learning_rate": 2.9288667760817413e-05, "loss": 0.719, "step": 6950 }, { "epoch": 2.2625243981782694, "grad_norm": 2.0632481575012207, "learning_rate": 2.9263239763380412e-05, "loss": 0.7114, "step": 6955 }, { "epoch": 2.2641509433962264, "grad_norm": 1.746992588043213, "learning_rate": 2.9237807222569558e-05, "loss": 0.7529, "step": 6960 }, { "epoch": 2.2657774886141837, "grad_norm": 1.490057110786438, "learning_rate": 2.921237016548854e-05, "loss": 0.7395, "step": 6965 }, { "epoch": 2.2674040338321406, "grad_norm": 1.6305809020996094, "learning_rate": 2.9186928619245872e-05, "loss": 0.7334, "step": 6970 }, { "epoch": 2.2690305790500975, "grad_norm": 1.6492929458618164, "learning_rate": 2.9161482610954842e-05, "loss": 0.7464, "step": 6975 }, { "epoch": 2.2706571242680544, "grad_norm": 2.410400390625, "learning_rate": 2.913603216773349e-05, "loss": 0.7275, "step": 6980 }, { "epoch": 2.272283669486012, "grad_norm": 2.1574628353118896, "learning_rate": 2.9110577316704602e-05, "loss": 0.7284, "step": 6985 }, { "epoch": 2.2739102147039687, "grad_norm": 1.6114851236343384, "learning_rate": 2.9085118084995626e-05, "loss": 0.7167, "step": 6990 }, { "epoch": 2.275536759921926, "grad_norm": 2.434839963912964, "learning_rate": 2.905965449973871e-05, "loss": 0.7251, "step": 6995 }, { "epoch": 2.277163305139883, "grad_norm": 2.417687177658081, "learning_rate": 2.9034186588070637e-05, "loss": 0.7317, "step": 7000 }, { "epoch": 2.27878985035784, "grad_norm": 2.3264355659484863, "learning_rate": 2.900871437713279e-05, "loss": 0.7514, "step": 7005 }, { "epoch": 2.280416395575797, "grad_norm": 1.8014682531356812, "learning_rate": 2.898323789407113e-05, "loss": 0.7324, "step": 7010 }, { "epoch": 2.282042940793754, "grad_norm": 3.271698236465454, "learning_rate": 2.8957757166036193e-05, "loss": 0.6987, "step": 7015 }, { "epoch": 2.283669486011711, "grad_norm": 1.8290796279907227, "learning_rate": 2.893227222018302e-05, "loss": 0.7477, "step": 7020 }, { "epoch": 2.2852960312296684, "grad_norm": 1.7644013166427612, "learning_rate": 2.890678308367115e-05, "loss": 0.7395, "step": 7025 }, { "epoch": 2.2869225764476253, "grad_norm": 1.9547981023788452, "learning_rate": 2.8881289783664594e-05, "loss": 0.7115, "step": 7030 }, { "epoch": 2.288549121665582, "grad_norm": 1.6306935548782349, "learning_rate": 2.8855792347331793e-05, "loss": 0.732, "step": 7035 }, { "epoch": 2.290175666883539, "grad_norm": 1.7639504671096802, "learning_rate": 2.88302908018456e-05, "loss": 0.7446, "step": 7040 }, { "epoch": 2.2918022121014965, "grad_norm": 1.500830054283142, "learning_rate": 2.8804785174383248e-05, "loss": 0.7675, "step": 7045 }, { "epoch": 2.2934287573194534, "grad_norm": 1.7111210823059082, "learning_rate": 2.8779275492126324e-05, "loss": 0.7285, "step": 7050 }, { "epoch": 2.2950553025374107, "grad_norm": 1.6088038682937622, "learning_rate": 2.8753761782260723e-05, "loss": 0.734, "step": 7055 }, { "epoch": 2.2966818477553677, "grad_norm": 2.570765256881714, "learning_rate": 2.872824407197664e-05, "loss": 0.7273, "step": 7060 }, { "epoch": 2.2983083929733246, "grad_norm": 2.01545786857605, "learning_rate": 2.8702722388468546e-05, "loss": 0.7055, "step": 7065 }, { "epoch": 2.2999349381912815, "grad_norm": 1.9878475666046143, "learning_rate": 2.867719675893512e-05, "loss": 0.7317, "step": 7070 }, { "epoch": 2.301561483409239, "grad_norm": 2.0020320415496826, "learning_rate": 2.8651667210579257e-05, "loss": 0.7307, "step": 7075 }, { "epoch": 2.3031880286271957, "grad_norm": 1.8539988994598389, "learning_rate": 2.8626133770608055e-05, "loss": 0.7746, "step": 7080 }, { "epoch": 2.304814573845153, "grad_norm": 2.5461862087249756, "learning_rate": 2.8600596466232715e-05, "loss": 0.7215, "step": 7085 }, { "epoch": 2.30644111906311, "grad_norm": 2.5611026287078857, "learning_rate": 2.8575055324668583e-05, "loss": 0.7412, "step": 7090 }, { "epoch": 2.308067664281067, "grad_norm": 1.762474536895752, "learning_rate": 2.8549510373135092e-05, "loss": 0.7532, "step": 7095 }, { "epoch": 2.3096942094990243, "grad_norm": 1.908408522605896, "learning_rate": 2.852396163885573e-05, "loss": 0.7474, "step": 7100 }, { "epoch": 2.311320754716981, "grad_norm": 2.2530789375305176, "learning_rate": 2.8498409149058008e-05, "loss": 0.7206, "step": 7105 }, { "epoch": 2.312947299934938, "grad_norm": 1.8963435888290405, "learning_rate": 2.8472852930973464e-05, "loss": 0.7429, "step": 7110 }, { "epoch": 2.3145738451528954, "grad_norm": 1.78444504737854, "learning_rate": 2.8447293011837596e-05, "loss": 0.7125, "step": 7115 }, { "epoch": 2.3162003903708523, "grad_norm": 1.888048768043518, "learning_rate": 2.842172941888983e-05, "loss": 0.7308, "step": 7120 }, { "epoch": 2.3178269355888093, "grad_norm": 1.765034556388855, "learning_rate": 2.8396162179373535e-05, "loss": 0.7448, "step": 7125 }, { "epoch": 2.3194534808067666, "grad_norm": 1.7544866800308228, "learning_rate": 2.837059132053595e-05, "loss": 0.7505, "step": 7130 }, { "epoch": 2.3210800260247235, "grad_norm": 1.7565666437149048, "learning_rate": 2.8345016869628175e-05, "loss": 0.7497, "step": 7135 }, { "epoch": 2.3227065712426804, "grad_norm": 2.0311663150787354, "learning_rate": 2.8319438853905135e-05, "loss": 0.7261, "step": 7140 }, { "epoch": 2.324333116460638, "grad_norm": 1.872420310974121, "learning_rate": 2.8293857300625555e-05, "loss": 0.7257, "step": 7145 }, { "epoch": 2.3259596616785947, "grad_norm": 1.6592035293579102, "learning_rate": 2.826827223705194e-05, "loss": 0.7862, "step": 7150 }, { "epoch": 2.3275862068965516, "grad_norm": 1.7728911638259888, "learning_rate": 2.8242683690450518e-05, "loss": 0.7524, "step": 7155 }, { "epoch": 2.329212752114509, "grad_norm": 2.1312925815582275, "learning_rate": 2.8217091688091253e-05, "loss": 0.7479, "step": 7160 }, { "epoch": 2.330839297332466, "grad_norm": 1.6046788692474365, "learning_rate": 2.8191496257247764e-05, "loss": 0.7141, "step": 7165 }, { "epoch": 2.3324658425504228, "grad_norm": 1.7915352582931519, "learning_rate": 2.816589742519733e-05, "loss": 0.7531, "step": 7170 }, { "epoch": 2.33409238776838, "grad_norm": 2.469411849975586, "learning_rate": 2.814029521922088e-05, "loss": 0.7187, "step": 7175 }, { "epoch": 2.335718932986337, "grad_norm": 2.290977954864502, "learning_rate": 2.8114689666602915e-05, "loss": 0.7171, "step": 7180 }, { "epoch": 2.337345478204294, "grad_norm": 1.7341169118881226, "learning_rate": 2.8089080794631512e-05, "loss": 0.7209, "step": 7185 }, { "epoch": 2.3389720234222513, "grad_norm": 1.7666329145431519, "learning_rate": 2.806346863059827e-05, "loss": 0.7159, "step": 7190 }, { "epoch": 2.340598568640208, "grad_norm": 2.103515625, "learning_rate": 2.803785320179832e-05, "loss": 0.707, "step": 7195 }, { "epoch": 2.342225113858165, "grad_norm": 1.8098862171173096, "learning_rate": 2.801223453553025e-05, "loss": 0.7415, "step": 7200 }, { "epoch": 2.3438516590761225, "grad_norm": 1.7912166118621826, "learning_rate": 2.7986612659096113e-05, "loss": 0.7063, "step": 7205 }, { "epoch": 2.3454782042940794, "grad_norm": 1.7796494960784912, "learning_rate": 2.796098759980138e-05, "loss": 0.7492, "step": 7210 }, { "epoch": 2.3471047495120363, "grad_norm": 1.6385430097579956, "learning_rate": 2.7935359384954914e-05, "loss": 0.7538, "step": 7215 }, { "epoch": 2.3487312947299936, "grad_norm": 1.807198166847229, "learning_rate": 2.7909728041868928e-05, "loss": 0.739, "step": 7220 }, { "epoch": 2.3503578399479506, "grad_norm": 1.8463494777679443, "learning_rate": 2.7884093597858996e-05, "loss": 0.7293, "step": 7225 }, { "epoch": 2.3519843851659075, "grad_norm": 2.101288080215454, "learning_rate": 2.7858456080243972e-05, "loss": 0.7285, "step": 7230 }, { "epoch": 2.353610930383865, "grad_norm": 2.048839569091797, "learning_rate": 2.783281551634599e-05, "loss": 0.7299, "step": 7235 }, { "epoch": 2.3552374756018217, "grad_norm": 1.6900931596755981, "learning_rate": 2.7812300890266442e-05, "loss": 0.7163, "step": 7240 }, { "epoch": 2.3568640208197786, "grad_norm": 2.0370028018951416, "learning_rate": 2.778665491192128e-05, "loss": 0.7452, "step": 7245 }, { "epoch": 2.358490566037736, "grad_norm": 1.7076982259750366, "learning_rate": 2.7761005963812337e-05, "loss": 0.7263, "step": 7250 }, { "epoch": 2.360117111255693, "grad_norm": 1.7953894138336182, "learning_rate": 2.7735354073273926e-05, "loss": 0.7123, "step": 7255 }, { "epoch": 2.36174365647365, "grad_norm": 1.7410740852355957, "learning_rate": 2.7709699267643503e-05, "loss": 0.7263, "step": 7260 }, { "epoch": 2.363370201691607, "grad_norm": 1.8876742124557495, "learning_rate": 2.7684041574261636e-05, "loss": 0.7234, "step": 7265 }, { "epoch": 2.364996746909564, "grad_norm": 2.0356671810150146, "learning_rate": 2.7658381020471964e-05, "loss": 0.7437, "step": 7270 }, { "epoch": 2.366623292127521, "grad_norm": 1.7301623821258545, "learning_rate": 2.7632717633621164e-05, "loss": 0.7301, "step": 7275 }, { "epoch": 2.3682498373454783, "grad_norm": 2.022576332092285, "learning_rate": 2.7607051441058958e-05, "loss": 0.7959, "step": 7280 }, { "epoch": 2.3698763825634352, "grad_norm": 1.5123145580291748, "learning_rate": 2.758138247013804e-05, "loss": 0.7239, "step": 7285 }, { "epoch": 2.371502927781392, "grad_norm": 1.9764145612716675, "learning_rate": 2.7555710748214064e-05, "loss": 0.7279, "step": 7290 }, { "epoch": 2.3731294729993495, "grad_norm": 2.1461610794067383, "learning_rate": 2.7530036302645618e-05, "loss": 0.7559, "step": 7295 }, { "epoch": 2.3747560182173064, "grad_norm": 7.111647129058838, "learning_rate": 2.75043591607942e-05, "loss": 0.7275, "step": 7300 }, { "epoch": 2.3763825634352633, "grad_norm": 2.709261655807495, "learning_rate": 2.7478679350024183e-05, "loss": 0.7627, "step": 7305 }, { "epoch": 2.3780091086532207, "grad_norm": 1.5946375131607056, "learning_rate": 2.7452996897702765e-05, "loss": 0.7428, "step": 7310 }, { "epoch": 2.3796356538711776, "grad_norm": 1.9049484729766846, "learning_rate": 2.7427311831199975e-05, "loss": 0.7153, "step": 7315 }, { "epoch": 2.3812621990891345, "grad_norm": 1.988143801689148, "learning_rate": 2.7401624177888636e-05, "loss": 0.7211, "step": 7320 }, { "epoch": 2.382888744307092, "grad_norm": 1.6142897605895996, "learning_rate": 2.7375933965144296e-05, "loss": 0.7196, "step": 7325 }, { "epoch": 2.3845152895250488, "grad_norm": 1.8634716272354126, "learning_rate": 2.7350241220345274e-05, "loss": 0.7182, "step": 7330 }, { "epoch": 2.3861418347430057, "grad_norm": 2.3685801029205322, "learning_rate": 2.732454597087255e-05, "loss": 0.7148, "step": 7335 }, { "epoch": 2.387768379960963, "grad_norm": 1.7274894714355469, "learning_rate": 2.729884824410979e-05, "loss": 0.7079, "step": 7340 }, { "epoch": 2.38939492517892, "grad_norm": 1.8336807489395142, "learning_rate": 2.72731480674433e-05, "loss": 0.7323, "step": 7345 }, { "epoch": 2.391021470396877, "grad_norm": 1.8585869073867798, "learning_rate": 2.724744546826199e-05, "loss": 0.7227, "step": 7350 }, { "epoch": 2.392648015614834, "grad_norm": 1.6090168952941895, "learning_rate": 2.722174047395737e-05, "loss": 0.7303, "step": 7355 }, { "epoch": 2.394274560832791, "grad_norm": 1.7963923215866089, "learning_rate": 2.719603311192347e-05, "loss": 0.744, "step": 7360 }, { "epoch": 2.395901106050748, "grad_norm": 1.7062233686447144, "learning_rate": 2.7170323409556875e-05, "loss": 0.7137, "step": 7365 }, { "epoch": 2.3975276512687054, "grad_norm": 1.6454732418060303, "learning_rate": 2.7144611394256653e-05, "loss": 0.7312, "step": 7370 }, { "epoch": 2.3991541964866623, "grad_norm": 1.6548445224761963, "learning_rate": 2.7118897093424323e-05, "loss": 0.7121, "step": 7375 }, { "epoch": 2.4007807417046196, "grad_norm": 1.5025266408920288, "learning_rate": 2.7093180534463863e-05, "loss": 0.7235, "step": 7380 }, { "epoch": 2.4024072869225765, "grad_norm": 1.7081060409545898, "learning_rate": 2.7067461744781642e-05, "loss": 0.7204, "step": 7385 }, { "epoch": 2.4040338321405335, "grad_norm": 1.6674851179122925, "learning_rate": 2.7041740751786408e-05, "loss": 0.7278, "step": 7390 }, { "epoch": 2.4056603773584904, "grad_norm": 1.753976583480835, "learning_rate": 2.7016017582889273e-05, "loss": 0.6939, "step": 7395 }, { "epoch": 2.4072869225764477, "grad_norm": 2.3544299602508545, "learning_rate": 2.6990292265503646e-05, "loss": 0.7316, "step": 7400 }, { "epoch": 2.4089134677944046, "grad_norm": 1.823217749595642, "learning_rate": 2.6964564827045224e-05, "loss": 0.7389, "step": 7405 }, { "epoch": 2.410540013012362, "grad_norm": 1.7922483682632446, "learning_rate": 2.6938835294931996e-05, "loss": 0.6885, "step": 7410 }, { "epoch": 2.412166558230319, "grad_norm": 1.7357585430145264, "learning_rate": 2.6913103696584148e-05, "loss": 0.7186, "step": 7415 }, { "epoch": 2.413793103448276, "grad_norm": 1.9426454305648804, "learning_rate": 2.6887370059424078e-05, "loss": 0.7357, "step": 7420 }, { "epoch": 2.4154196486662327, "grad_norm": 1.9222463369369507, "learning_rate": 2.6861634410876367e-05, "loss": 0.7493, "step": 7425 }, { "epoch": 2.41704619388419, "grad_norm": 2.432889938354492, "learning_rate": 2.6835896778367738e-05, "loss": 0.7432, "step": 7430 }, { "epoch": 2.418672739102147, "grad_norm": 1.7843577861785889, "learning_rate": 2.6810157189327007e-05, "loss": 0.7344, "step": 7435 }, { "epoch": 2.4202992843201043, "grad_norm": 1.835652232170105, "learning_rate": 2.6784415671185104e-05, "loss": 0.7489, "step": 7440 }, { "epoch": 2.4219258295380612, "grad_norm": 3.3983778953552246, "learning_rate": 2.6758672251375e-05, "loss": 0.7292, "step": 7445 }, { "epoch": 2.423552374756018, "grad_norm": 1.7670029401779175, "learning_rate": 2.6732926957331688e-05, "loss": 0.7487, "step": 7450 }, { "epoch": 2.425178919973975, "grad_norm": 1.4413880109786987, "learning_rate": 2.6707179816492168e-05, "loss": 0.7441, "step": 7455 }, { "epoch": 2.4268054651919324, "grad_norm": 1.9677038192749023, "learning_rate": 2.668143085629541e-05, "loss": 0.7308, "step": 7460 }, { "epoch": 2.4284320104098893, "grad_norm": 1.7730647325515747, "learning_rate": 2.6655680104182313e-05, "loss": 0.7147, "step": 7465 }, { "epoch": 2.4300585556278467, "grad_norm": 1.8835413455963135, "learning_rate": 2.6629927587595688e-05, "loss": 0.7174, "step": 7470 }, { "epoch": 2.4316851008458036, "grad_norm": 1.521517038345337, "learning_rate": 2.6604173333980237e-05, "loss": 0.7237, "step": 7475 }, { "epoch": 2.4333116460637605, "grad_norm": 2.2355844974517822, "learning_rate": 2.65784173707825e-05, "loss": 0.7137, "step": 7480 }, { "epoch": 2.4349381912817174, "grad_norm": 1.9166332483291626, "learning_rate": 2.6552659725450836e-05, "loss": 0.719, "step": 7485 }, { "epoch": 2.4365647364996748, "grad_norm": 3.5268030166625977, "learning_rate": 2.6526900425435425e-05, "loss": 0.7321, "step": 7490 }, { "epoch": 2.4381912817176317, "grad_norm": 1.6944290399551392, "learning_rate": 2.650113949818817e-05, "loss": 0.7286, "step": 7495 }, { "epoch": 2.439817826935589, "grad_norm": 1.8654463291168213, "learning_rate": 2.6475376971162734e-05, "loss": 0.7352, "step": 7500 }, { "epoch": 2.441444372153546, "grad_norm": 1.7177530527114868, "learning_rate": 2.644961287181449e-05, "loss": 0.7472, "step": 7505 }, { "epoch": 2.443070917371503, "grad_norm": 1.724672794342041, "learning_rate": 2.642384722760046e-05, "loss": 0.7525, "step": 7510 }, { "epoch": 2.4446974625894597, "grad_norm": 1.7894189357757568, "learning_rate": 2.6398080065979346e-05, "loss": 0.7591, "step": 7515 }, { "epoch": 2.446324007807417, "grad_norm": 1.6924527883529663, "learning_rate": 2.6372311414411427e-05, "loss": 0.7397, "step": 7520 }, { "epoch": 2.447950553025374, "grad_norm": 1.7091771364212036, "learning_rate": 2.6346541300358613e-05, "loss": 0.7323, "step": 7525 }, { "epoch": 2.4495770982433314, "grad_norm": 2.6545259952545166, "learning_rate": 2.6320769751284335e-05, "loss": 0.7104, "step": 7530 }, { "epoch": 2.4512036434612883, "grad_norm": 1.6754066944122314, "learning_rate": 2.6294996794653576e-05, "loss": 0.7345, "step": 7535 }, { "epoch": 2.452830188679245, "grad_norm": 1.8702524900436401, "learning_rate": 2.6269222457932824e-05, "loss": 0.7287, "step": 7540 }, { "epoch": 2.4544567338972025, "grad_norm": 1.705138087272644, "learning_rate": 2.6243446768590015e-05, "loss": 0.7213, "step": 7545 }, { "epoch": 2.4560832791151594, "grad_norm": 1.8247156143188477, "learning_rate": 2.621766975409453e-05, "loss": 0.7461, "step": 7550 }, { "epoch": 2.4577098243331164, "grad_norm": 1.8440947532653809, "learning_rate": 2.6191891441917195e-05, "loss": 0.7163, "step": 7555 }, { "epoch": 2.4593363695510737, "grad_norm": 1.5848308801651, "learning_rate": 2.616611185953018e-05, "loss": 0.7156, "step": 7560 }, { "epoch": 2.4609629147690306, "grad_norm": 1.6326448917388916, "learning_rate": 2.6140331034407013e-05, "loss": 0.7358, "step": 7565 }, { "epoch": 2.4625894599869875, "grad_norm": 1.6340683698654175, "learning_rate": 2.6114548994022576e-05, "loss": 0.7263, "step": 7570 }, { "epoch": 2.464216005204945, "grad_norm": 1.7180076837539673, "learning_rate": 2.608876576585302e-05, "loss": 0.7499, "step": 7575 }, { "epoch": 2.465842550422902, "grad_norm": 1.7165942192077637, "learning_rate": 2.6062981377375762e-05, "loss": 0.7535, "step": 7580 }, { "epoch": 2.4674690956408587, "grad_norm": 1.7136976718902588, "learning_rate": 2.6037195856069462e-05, "loss": 0.7404, "step": 7585 }, { "epoch": 2.469095640858816, "grad_norm": 1.7513527870178223, "learning_rate": 2.6011409229414003e-05, "loss": 0.725, "step": 7590 }, { "epoch": 2.470722186076773, "grad_norm": 2.0561563968658447, "learning_rate": 2.598562152489042e-05, "loss": 0.7324, "step": 7595 }, { "epoch": 2.47234873129473, "grad_norm": 2.355729818344116, "learning_rate": 2.59598327699809e-05, "loss": 0.729, "step": 7600 }, { "epoch": 2.4739752765126872, "grad_norm": 1.8387353420257568, "learning_rate": 2.5934042992168767e-05, "loss": 0.7447, "step": 7605 }, { "epoch": 2.475601821730644, "grad_norm": 1.8397676944732666, "learning_rate": 2.5908252218938423e-05, "loss": 0.7583, "step": 7610 }, { "epoch": 2.477228366948601, "grad_norm": 1.6779429912567139, "learning_rate": 2.5882460477775326e-05, "loss": 0.7119, "step": 7615 }, { "epoch": 2.4788549121665584, "grad_norm": 1.7980512380599976, "learning_rate": 2.585666779616598e-05, "loss": 0.746, "step": 7620 }, { "epoch": 2.4804814573845153, "grad_norm": 2.2904257774353027, "learning_rate": 2.5830874201597883e-05, "loss": 0.7102, "step": 7625 }, { "epoch": 2.482108002602472, "grad_norm": 1.5908973217010498, "learning_rate": 2.5805079721559494e-05, "loss": 0.7294, "step": 7630 }, { "epoch": 2.4837345478204296, "grad_norm": 1.8799176216125488, "learning_rate": 2.577928438354024e-05, "loss": 0.7328, "step": 7635 }, { "epoch": 2.4853610930383865, "grad_norm": 1.6054154634475708, "learning_rate": 2.5753488215030448e-05, "loss": 0.7295, "step": 7640 }, { "epoch": 2.4869876382563434, "grad_norm": 1.764978289604187, "learning_rate": 2.5727691243521325e-05, "loss": 0.7409, "step": 7645 }, { "epoch": 2.4886141834743007, "grad_norm": 1.6741787195205688, "learning_rate": 2.5701893496504953e-05, "loss": 0.744, "step": 7650 }, { "epoch": 2.4902407286922577, "grad_norm": 1.697543740272522, "learning_rate": 2.567609500147422e-05, "loss": 0.7269, "step": 7655 }, { "epoch": 2.4918672739102146, "grad_norm": 1.884772539138794, "learning_rate": 2.5650295785922817e-05, "loss": 0.7128, "step": 7660 }, { "epoch": 2.493493819128172, "grad_norm": 1.6686285734176636, "learning_rate": 2.562449587734522e-05, "loss": 0.7619, "step": 7665 }, { "epoch": 2.495120364346129, "grad_norm": 2.351414918899536, "learning_rate": 2.5598695303236615e-05, "loss": 0.7191, "step": 7670 }, { "epoch": 2.4967469095640857, "grad_norm": 1.8593933582305908, "learning_rate": 2.557289409109291e-05, "loss": 0.7325, "step": 7675 }, { "epoch": 2.498373454782043, "grad_norm": 1.7637239694595337, "learning_rate": 2.5547092268410703e-05, "loss": 0.7151, "step": 7680 }, { "epoch": 2.5, "grad_norm": 2.0318996906280518, "learning_rate": 2.5521289862687237e-05, "loss": 0.7238, "step": 7685 }, { "epoch": 2.501626545217957, "grad_norm": 1.8583095073699951, "learning_rate": 2.5495486901420362e-05, "loss": 0.7234, "step": 7690 }, { "epoch": 2.5032530904359143, "grad_norm": 1.5586649179458618, "learning_rate": 2.546968341210853e-05, "loss": 0.7417, "step": 7695 }, { "epoch": 2.504879635653871, "grad_norm": 1.831864833831787, "learning_rate": 2.5443879422250767e-05, "loss": 0.7334, "step": 7700 }, { "epoch": 2.506506180871828, "grad_norm": 1.5912280082702637, "learning_rate": 2.541807495934662e-05, "loss": 0.7356, "step": 7705 }, { "epoch": 2.5081327260897854, "grad_norm": 1.8658093214035034, "learning_rate": 2.539227005089614e-05, "loss": 0.7337, "step": 7710 }, { "epoch": 2.5097592713077423, "grad_norm": 1.8022907972335815, "learning_rate": 2.536646472439986e-05, "loss": 0.7362, "step": 7715 }, { "epoch": 2.5113858165256993, "grad_norm": 1.79240083694458, "learning_rate": 2.5340659007358742e-05, "loss": 0.7518, "step": 7720 }, { "epoch": 2.5130123617436566, "grad_norm": 2.8876941204071045, "learning_rate": 2.531485292727419e-05, "loss": 0.7582, "step": 7725 }, { "epoch": 2.5146389069616135, "grad_norm": 1.7138187885284424, "learning_rate": 2.5289046511647972e-05, "loss": 0.7589, "step": 7730 }, { "epoch": 2.5162654521795704, "grad_norm": 1.4874415397644043, "learning_rate": 2.5263239787982234e-05, "loss": 0.7397, "step": 7735 }, { "epoch": 2.517891997397528, "grad_norm": 1.6347683668136597, "learning_rate": 2.523743278377943e-05, "loss": 0.685, "step": 7740 }, { "epoch": 2.5195185426154847, "grad_norm": 1.623404860496521, "learning_rate": 2.5211625526542313e-05, "loss": 0.7399, "step": 7745 }, { "epoch": 2.5211450878334416, "grad_norm": 2.0945160388946533, "learning_rate": 2.518581804377394e-05, "loss": 0.7458, "step": 7750 }, { "epoch": 2.522771633051399, "grad_norm": 2.3906166553497314, "learning_rate": 2.516001036297756e-05, "loss": 0.7179, "step": 7755 }, { "epoch": 2.524398178269356, "grad_norm": 1.6636216640472412, "learning_rate": 2.5134202511656658e-05, "loss": 0.7228, "step": 7760 }, { "epoch": 2.526024723487313, "grad_norm": 1.6783771514892578, "learning_rate": 2.5108394517314915e-05, "loss": 0.7403, "step": 7765 }, { "epoch": 2.52765126870527, "grad_norm": 1.7963955402374268, "learning_rate": 2.5082586407456134e-05, "loss": 0.7181, "step": 7770 }, { "epoch": 2.529277813923227, "grad_norm": 2.11568021774292, "learning_rate": 2.5056778209584252e-05, "loss": 0.7135, "step": 7775 }, { "epoch": 2.530904359141184, "grad_norm": 2.0192832946777344, "learning_rate": 2.5030969951203316e-05, "loss": 0.7509, "step": 7780 }, { "epoch": 2.5325309043591413, "grad_norm": 1.4360262155532837, "learning_rate": 2.5005161659817418e-05, "loss": 0.7579, "step": 7785 }, { "epoch": 2.534157449577098, "grad_norm": 2.1884679794311523, "learning_rate": 2.4979353362930685e-05, "loss": 0.7375, "step": 7790 }, { "epoch": 2.5357839947950556, "grad_norm": 1.7255921363830566, "learning_rate": 2.4953545088047263e-05, "loss": 0.7177, "step": 7795 }, { "epoch": 2.5374105400130125, "grad_norm": 1.8770041465759277, "learning_rate": 2.492773686267128e-05, "loss": 0.7094, "step": 7800 }, { "epoch": 2.5390370852309694, "grad_norm": 1.7790896892547607, "learning_rate": 2.4901928714306777e-05, "loss": 0.7168, "step": 7805 }, { "epoch": 2.5406636304489263, "grad_norm": 1.711310863494873, "learning_rate": 2.4876120670457754e-05, "loss": 0.7277, "step": 7810 }, { "epoch": 2.5422901756668836, "grad_norm": 1.9501806497573853, "learning_rate": 2.4850312758628062e-05, "loss": 0.7278, "step": 7815 }, { "epoch": 2.5439167208848406, "grad_norm": 1.7433342933654785, "learning_rate": 2.482450500632145e-05, "loss": 0.7233, "step": 7820 }, { "epoch": 2.545543266102798, "grad_norm": 1.9563379287719727, "learning_rate": 2.479869744104146e-05, "loss": 0.7464, "step": 7825 }, { "epoch": 2.547169811320755, "grad_norm": 1.655651330947876, "learning_rate": 2.477289009029147e-05, "loss": 0.7808, "step": 7830 }, { "epoch": 2.5487963565387117, "grad_norm": 1.86355459690094, "learning_rate": 2.4747082981574593e-05, "loss": 0.7108, "step": 7835 }, { "epoch": 2.5504229017566686, "grad_norm": 1.6612073183059692, "learning_rate": 2.4721276142393714e-05, "loss": 0.6986, "step": 7840 }, { "epoch": 2.552049446974626, "grad_norm": 1.7691892385482788, "learning_rate": 2.469546960025141e-05, "loss": 0.7595, "step": 7845 }, { "epoch": 2.553675992192583, "grad_norm": 1.760162353515625, "learning_rate": 2.4669663382649967e-05, "loss": 0.7267, "step": 7850 }, { "epoch": 2.5553025374105403, "grad_norm": 1.884227991104126, "learning_rate": 2.464385751709128e-05, "loss": 0.7314, "step": 7855 }, { "epoch": 2.556929082628497, "grad_norm": 1.6329880952835083, "learning_rate": 2.4618052031076933e-05, "loss": 0.7272, "step": 7860 }, { "epoch": 2.558555627846454, "grad_norm": 1.9245282411575317, "learning_rate": 2.459224695210804e-05, "loss": 0.6923, "step": 7865 }, { "epoch": 2.560182173064411, "grad_norm": 1.7136482000350952, "learning_rate": 2.4566442307685325e-05, "loss": 0.7046, "step": 7870 }, { "epoch": 2.5618087182823683, "grad_norm": 1.7598689794540405, "learning_rate": 2.4540638125309032e-05, "loss": 0.7222, "step": 7875 }, { "epoch": 2.5634352635003252, "grad_norm": 1.9097983837127686, "learning_rate": 2.4514834432478927e-05, "loss": 0.7007, "step": 7880 }, { "epoch": 2.5650618087182826, "grad_norm": 1.859815001487732, "learning_rate": 2.4489031256694212e-05, "loss": 0.7399, "step": 7885 }, { "epoch": 2.5666883539362395, "grad_norm": 1.593043565750122, "learning_rate": 2.4463228625453607e-05, "loss": 0.7125, "step": 7890 }, { "epoch": 2.5683148991541964, "grad_norm": 1.563043475151062, "learning_rate": 2.4437426566255188e-05, "loss": 0.6984, "step": 7895 }, { "epoch": 2.5699414443721533, "grad_norm": 1.6478209495544434, "learning_rate": 2.4411625106596457e-05, "loss": 0.7491, "step": 7900 }, { "epoch": 2.5715679895901107, "grad_norm": 1.6165393590927124, "learning_rate": 2.4385824273974262e-05, "loss": 0.7488, "step": 7905 }, { "epoch": 2.5731945348080676, "grad_norm": 1.757622480392456, "learning_rate": 2.43600240958848e-05, "loss": 0.7273, "step": 7910 }, { "epoch": 2.574821080026025, "grad_norm": 1.9844690561294556, "learning_rate": 2.433422459982354e-05, "loss": 0.7552, "step": 7915 }, { "epoch": 2.576447625243982, "grad_norm": 1.7524667978286743, "learning_rate": 2.4308425813285255e-05, "loss": 0.7179, "step": 7920 }, { "epoch": 2.5780741704619388, "grad_norm": 1.6379843950271606, "learning_rate": 2.428262776376394e-05, "loss": 0.74, "step": 7925 }, { "epoch": 2.5797007156798957, "grad_norm": 1.8496534824371338, "learning_rate": 2.425683047875282e-05, "loss": 0.7194, "step": 7930 }, { "epoch": 2.581327260897853, "grad_norm": 1.599732518196106, "learning_rate": 2.4231033985744305e-05, "loss": 0.7326, "step": 7935 }, { "epoch": 2.58295380611581, "grad_norm": 1.7700289487838745, "learning_rate": 2.420523831222994e-05, "loss": 0.7481, "step": 7940 }, { "epoch": 2.5845803513337673, "grad_norm": 1.5851463079452515, "learning_rate": 2.4179443485700434e-05, "loss": 0.7573, "step": 7945 }, { "epoch": 2.586206896551724, "grad_norm": 1.7170052528381348, "learning_rate": 2.4153649533645545e-05, "loss": 0.7222, "step": 7950 }, { "epoch": 2.587833441769681, "grad_norm": 1.7591465711593628, "learning_rate": 2.412785648355414e-05, "loss": 0.7233, "step": 7955 }, { "epoch": 2.589459986987638, "grad_norm": 2.046952724456787, "learning_rate": 2.4102064362914108e-05, "loss": 0.7176, "step": 7960 }, { "epoch": 2.5910865322055954, "grad_norm": 1.6409968137741089, "learning_rate": 2.4076273199212352e-05, "loss": 0.7139, "step": 7965 }, { "epoch": 2.5927130774235523, "grad_norm": 2.4438061714172363, "learning_rate": 2.4050483019934737e-05, "loss": 0.7431, "step": 7970 }, { "epoch": 2.5943396226415096, "grad_norm": 1.9839977025985718, "learning_rate": 2.4024693852566124e-05, "loss": 0.7305, "step": 7975 }, { "epoch": 2.5959661678594665, "grad_norm": 1.5234110355377197, "learning_rate": 2.3998905724590237e-05, "loss": 0.6977, "step": 7980 }, { "epoch": 2.5975927130774235, "grad_norm": 1.705267310142517, "learning_rate": 2.3973118663489736e-05, "loss": 0.6917, "step": 7985 }, { "epoch": 2.5992192582953804, "grad_norm": 1.8564801216125488, "learning_rate": 2.3947332696746122e-05, "loss": 0.7277, "step": 7990 }, { "epoch": 2.6008458035133377, "grad_norm": 2.0818612575531006, "learning_rate": 2.3921547851839747e-05, "loss": 0.7544, "step": 7995 }, { "epoch": 2.6024723487312946, "grad_norm": 1.8174341917037964, "learning_rate": 2.3895764156249746e-05, "loss": 0.7192, "step": 8000 }, { "epoch": 2.604098893949252, "grad_norm": 1.606379508972168, "learning_rate": 2.3869981637454053e-05, "loss": 0.7222, "step": 8005 }, { "epoch": 2.605725439167209, "grad_norm": 1.8314141035079956, "learning_rate": 2.3844200322929323e-05, "loss": 0.7135, "step": 8010 }, { "epoch": 2.607351984385166, "grad_norm": 1.7894541025161743, "learning_rate": 2.3818420240150947e-05, "loss": 0.7342, "step": 8015 }, { "epoch": 2.6089785296031227, "grad_norm": 2.068234443664551, "learning_rate": 2.3792641416592994e-05, "loss": 0.7033, "step": 8020 }, { "epoch": 2.61060507482108, "grad_norm": 1.7998398542404175, "learning_rate": 2.376686387972821e-05, "loss": 0.7421, "step": 8025 }, { "epoch": 2.612231620039037, "grad_norm": 2.042323589324951, "learning_rate": 2.3741087657027912e-05, "loss": 0.7232, "step": 8030 }, { "epoch": 2.6138581652569943, "grad_norm": 1.5981550216674805, "learning_rate": 2.3715312775962105e-05, "loss": 0.7524, "step": 8035 }, { "epoch": 2.6154847104749512, "grad_norm": 1.6464756727218628, "learning_rate": 2.3689539263999286e-05, "loss": 0.7407, "step": 8040 }, { "epoch": 2.617111255692908, "grad_norm": 4.4229021072387695, "learning_rate": 2.366376714860654e-05, "loss": 0.7646, "step": 8045 }, { "epoch": 2.618737800910865, "grad_norm": 1.9497524499893188, "learning_rate": 2.3637996457249434e-05, "loss": 0.7191, "step": 8050 }, { "epoch": 2.6203643461288224, "grad_norm": 2.853574514389038, "learning_rate": 2.361222721739205e-05, "loss": 0.7229, "step": 8055 }, { "epoch": 2.6219908913467793, "grad_norm": 1.6402703523635864, "learning_rate": 2.3586459456496877e-05, "loss": 0.7226, "step": 8060 }, { "epoch": 2.6236174365647367, "grad_norm": 1.9785535335540771, "learning_rate": 2.3560693202024877e-05, "loss": 0.7016, "step": 8065 }, { "epoch": 2.6252439817826936, "grad_norm": 1.7689776420593262, "learning_rate": 2.3534928481435388e-05, "loss": 0.7107, "step": 8070 }, { "epoch": 2.6268705270006505, "grad_norm": 2.1093742847442627, "learning_rate": 2.3509165322186084e-05, "loss": 0.7173, "step": 8075 }, { "epoch": 2.6284970722186074, "grad_norm": 1.9455536603927612, "learning_rate": 2.348340375173303e-05, "loss": 0.7421, "step": 8080 }, { "epoch": 2.6301236174365648, "grad_norm": 1.7876079082489014, "learning_rate": 2.3457643797530544e-05, "loss": 0.7278, "step": 8085 }, { "epoch": 2.6317501626545217, "grad_norm": 1.8814291954040527, "learning_rate": 2.343188548703127e-05, "loss": 0.7403, "step": 8090 }, { "epoch": 2.633376707872479, "grad_norm": 1.7408051490783691, "learning_rate": 2.340612884768605e-05, "loss": 0.7042, "step": 8095 }, { "epoch": 2.635003253090436, "grad_norm": 2.139902353286743, "learning_rate": 2.3380373906944004e-05, "loss": 0.6996, "step": 8100 }, { "epoch": 2.636629798308393, "grad_norm": 1.5826632976531982, "learning_rate": 2.3354620692252377e-05, "loss": 0.7646, "step": 8105 }, { "epoch": 2.63825634352635, "grad_norm": 2.2483596801757812, "learning_rate": 2.3328869231056627e-05, "loss": 0.742, "step": 8110 }, { "epoch": 2.639882888744307, "grad_norm": 1.6104656457901, "learning_rate": 2.330311955080031e-05, "loss": 0.7283, "step": 8115 }, { "epoch": 2.641509433962264, "grad_norm": 1.8288098573684692, "learning_rate": 2.3277371678925123e-05, "loss": 0.7291, "step": 8120 }, { "epoch": 2.6431359791802214, "grad_norm": 1.7580839395523071, "learning_rate": 2.3251625642870774e-05, "loss": 0.7187, "step": 8125 }, { "epoch": 2.6447625243981783, "grad_norm": 1.7375248670578003, "learning_rate": 2.3225881470075075e-05, "loss": 0.7481, "step": 8130 }, { "epoch": 2.646389069616135, "grad_norm": 1.697384238243103, "learning_rate": 2.3200139187973818e-05, "loss": 0.7052, "step": 8135 }, { "epoch": 2.6480156148340925, "grad_norm": 2.009641647338867, "learning_rate": 2.3174398824000794e-05, "loss": 0.7042, "step": 8140 }, { "epoch": 2.6496421600520494, "grad_norm": 1.9816298484802246, "learning_rate": 2.3148660405587745e-05, "loss": 0.7353, "step": 8145 }, { "epoch": 2.6512687052700064, "grad_norm": 1.696813702583313, "learning_rate": 2.3122923960164346e-05, "loss": 0.723, "step": 8150 }, { "epoch": 2.6528952504879637, "grad_norm": 8.96198844909668, "learning_rate": 2.309718951515815e-05, "loss": 0.7429, "step": 8155 }, { "epoch": 2.6545217957059206, "grad_norm": 1.714002013206482, "learning_rate": 2.3071457097994607e-05, "loss": 0.7447, "step": 8160 }, { "epoch": 2.6561483409238775, "grad_norm": 1.8118575811386108, "learning_rate": 2.3045726736096978e-05, "loss": 0.7204, "step": 8165 }, { "epoch": 2.657774886141835, "grad_norm": 7.933690547943115, "learning_rate": 2.301999845688635e-05, "loss": 0.7226, "step": 8170 }, { "epoch": 2.659401431359792, "grad_norm": 2.298539638519287, "learning_rate": 2.2994272287781593e-05, "loss": 0.7301, "step": 8175 }, { "epoch": 2.6610279765777487, "grad_norm": 1.950821042060852, "learning_rate": 2.2968548256199325e-05, "loss": 0.7278, "step": 8180 }, { "epoch": 2.662654521795706, "grad_norm": 2.008021831512451, "learning_rate": 2.2942826389553865e-05, "loss": 0.7421, "step": 8185 }, { "epoch": 2.664281067013663, "grad_norm": 1.7788177728652954, "learning_rate": 2.291710671525726e-05, "loss": 0.7472, "step": 8190 }, { "epoch": 2.66590761223162, "grad_norm": 1.5617021322250366, "learning_rate": 2.2891389260719197e-05, "loss": 0.7154, "step": 8195 }, { "epoch": 2.6675341574495772, "grad_norm": 2.134261131286621, "learning_rate": 2.2865674053347e-05, "loss": 0.7321, "step": 8200 }, { "epoch": 2.669160702667534, "grad_norm": 1.7520164251327515, "learning_rate": 2.2839961120545614e-05, "loss": 0.7015, "step": 8205 }, { "epoch": 2.6707872478854915, "grad_norm": 1.7123262882232666, "learning_rate": 2.2814250489717536e-05, "loss": 0.7041, "step": 8210 }, { "epoch": 2.6724137931034484, "grad_norm": 3.0392608642578125, "learning_rate": 2.278854218826284e-05, "loss": 0.7253, "step": 8215 }, { "epoch": 2.6740403383214053, "grad_norm": 1.6789202690124512, "learning_rate": 2.2762836243579073e-05, "loss": 0.7321, "step": 8220 }, { "epoch": 2.675666883539362, "grad_norm": 1.9678678512573242, "learning_rate": 2.2737132683061318e-05, "loss": 0.7309, "step": 8225 }, { "epoch": 2.6772934287573196, "grad_norm": 2.1092047691345215, "learning_rate": 2.2711431534102082e-05, "loss": 0.767, "step": 8230 }, { "epoch": 2.6789199739752765, "grad_norm": 1.7379661798477173, "learning_rate": 2.268573282409133e-05, "loss": 0.7189, "step": 8235 }, { "epoch": 2.680546519193234, "grad_norm": 1.8567099571228027, "learning_rate": 2.26600365804164e-05, "loss": 0.7098, "step": 8240 }, { "epoch": 2.6821730644111907, "grad_norm": 2.021911382675171, "learning_rate": 2.263434283046203e-05, "loss": 0.7304, "step": 8245 }, { "epoch": 2.6837996096291477, "grad_norm": 1.7543399333953857, "learning_rate": 2.260865160161027e-05, "loss": 0.7289, "step": 8250 }, { "epoch": 2.6854261548471046, "grad_norm": 1.5903064012527466, "learning_rate": 2.2582962921240507e-05, "loss": 0.7363, "step": 8255 }, { "epoch": 2.687052700065062, "grad_norm": 1.5687363147735596, "learning_rate": 2.2557276816729404e-05, "loss": 0.7124, "step": 8260 }, { "epoch": 2.688679245283019, "grad_norm": 1.7256449460983276, "learning_rate": 2.253159331545089e-05, "loss": 0.7333, "step": 8265 }, { "epoch": 2.690305790500976, "grad_norm": 1.9225983619689941, "learning_rate": 2.2505912444776076e-05, "loss": 0.7302, "step": 8270 }, { "epoch": 2.691932335718933, "grad_norm": 1.7383098602294922, "learning_rate": 2.248023423207334e-05, "loss": 0.7349, "step": 8275 }, { "epoch": 2.69355888093689, "grad_norm": 1.5088884830474854, "learning_rate": 2.2454558704708165e-05, "loss": 0.7425, "step": 8280 }, { "epoch": 2.695185426154847, "grad_norm": 2.35827374458313, "learning_rate": 2.242888589004321e-05, "loss": 0.7031, "step": 8285 }, { "epoch": 2.6968119713728043, "grad_norm": 1.793815016746521, "learning_rate": 2.240321581543822e-05, "loss": 0.7313, "step": 8290 }, { "epoch": 2.698438516590761, "grad_norm": 2.1256535053253174, "learning_rate": 2.2377548508250043e-05, "loss": 0.7174, "step": 8295 }, { "epoch": 2.7000650618087185, "grad_norm": 1.643990397453308, "learning_rate": 2.2351883995832545e-05, "loss": 0.7239, "step": 8300 }, { "epoch": 2.7016916070266754, "grad_norm": 1.457271933555603, "learning_rate": 2.2326222305536653e-05, "loss": 0.7184, "step": 8305 }, { "epoch": 2.7033181522446323, "grad_norm": 1.6957201957702637, "learning_rate": 2.230056346471025e-05, "loss": 0.7152, "step": 8310 }, { "epoch": 2.7049446974625893, "grad_norm": 1.8537200689315796, "learning_rate": 2.2274907500698212e-05, "loss": 0.7005, "step": 8315 }, { "epoch": 2.7065712426805466, "grad_norm": 1.7019157409667969, "learning_rate": 2.2249254440842324e-05, "loss": 0.7497, "step": 8320 }, { "epoch": 2.7081977878985035, "grad_norm": 1.6967856884002686, "learning_rate": 2.222360431248131e-05, "loss": 0.7131, "step": 8325 }, { "epoch": 2.709824333116461, "grad_norm": 1.7513232231140137, "learning_rate": 2.2197957142950714e-05, "loss": 0.6965, "step": 8330 }, { "epoch": 2.711450878334418, "grad_norm": 2.3701894283294678, "learning_rate": 2.2172312959582983e-05, "loss": 0.7165, "step": 8335 }, { "epoch": 2.7130774235523747, "grad_norm": 1.6562988758087158, "learning_rate": 2.214667178970736e-05, "loss": 0.7289, "step": 8340 }, { "epoch": 2.7147039687703316, "grad_norm": 1.782637119293213, "learning_rate": 2.2121033660649863e-05, "loss": 0.7281, "step": 8345 }, { "epoch": 2.716330513988289, "grad_norm": 1.559083342552185, "learning_rate": 2.20953985997333e-05, "loss": 0.7544, "step": 8350 }, { "epoch": 2.717957059206246, "grad_norm": 2.210763454437256, "learning_rate": 2.2069766634277174e-05, "loss": 0.7411, "step": 8355 }, { "epoch": 2.719583604424203, "grad_norm": 1.729476809501648, "learning_rate": 2.2044137791597723e-05, "loss": 0.7414, "step": 8360 }, { "epoch": 2.72121014964216, "grad_norm": 1.8460407257080078, "learning_rate": 2.2018512099007823e-05, "loss": 0.7203, "step": 8365 }, { "epoch": 2.722836694860117, "grad_norm": 1.8685365915298462, "learning_rate": 2.1992889583817023e-05, "loss": 0.7463, "step": 8370 }, { "epoch": 2.724463240078074, "grad_norm": 1.7120425701141357, "learning_rate": 2.1967270273331464e-05, "loss": 0.7575, "step": 8375 }, { "epoch": 2.7260897852960313, "grad_norm": 1.839551329612732, "learning_rate": 2.194165419485389e-05, "loss": 0.735, "step": 8380 }, { "epoch": 2.727716330513988, "grad_norm": 1.5357433557510376, "learning_rate": 2.191604137568358e-05, "loss": 0.7125, "step": 8385 }, { "epoch": 2.7293428757319456, "grad_norm": 1.843185544013977, "learning_rate": 2.1890431843116373e-05, "loss": 0.7247, "step": 8390 }, { "epoch": 2.7309694209499025, "grad_norm": 1.520994782447815, "learning_rate": 2.186482562444456e-05, "loss": 0.731, "step": 8395 }, { "epoch": 2.7325959661678594, "grad_norm": 1.8172752857208252, "learning_rate": 2.1839222746956936e-05, "loss": 0.7369, "step": 8400 }, { "epoch": 2.7342225113858163, "grad_norm": 2.0478515625, "learning_rate": 2.1813623237938714e-05, "loss": 0.7047, "step": 8405 }, { "epoch": 2.7358490566037736, "grad_norm": 1.963808298110962, "learning_rate": 2.178802712467154e-05, "loss": 0.735, "step": 8410 }, { "epoch": 2.7374756018217306, "grad_norm": 2.2454097270965576, "learning_rate": 2.1762434434433414e-05, "loss": 0.7352, "step": 8415 }, { "epoch": 2.739102147039688, "grad_norm": 1.9860196113586426, "learning_rate": 2.173684519449872e-05, "loss": 0.7128, "step": 8420 }, { "epoch": 2.740728692257645, "grad_norm": 1.8477674722671509, "learning_rate": 2.1711259432138118e-05, "loss": 0.7336, "step": 8425 }, { "epoch": 2.7423552374756017, "grad_norm": 1.6550543308258057, "learning_rate": 2.1685677174618615e-05, "loss": 0.7001, "step": 8430 }, { "epoch": 2.7439817826935586, "grad_norm": 1.5890171527862549, "learning_rate": 2.1660098449203446e-05, "loss": 0.7494, "step": 8435 }, { "epoch": 2.745608327911516, "grad_norm": 1.7456623315811157, "learning_rate": 2.1634523283152107e-05, "loss": 0.7574, "step": 8440 }, { "epoch": 2.747234873129473, "grad_norm": 1.7022398710250854, "learning_rate": 2.160895170372026e-05, "loss": 0.7174, "step": 8445 }, { "epoch": 2.7488614183474303, "grad_norm": 1.7472623586654663, "learning_rate": 2.1583383738159812e-05, "loss": 0.7331, "step": 8450 }, { "epoch": 2.750487963565387, "grad_norm": 1.8784606456756592, "learning_rate": 2.1557819413718743e-05, "loss": 0.7302, "step": 8455 }, { "epoch": 2.752114508783344, "grad_norm": 1.8083624839782715, "learning_rate": 2.1532258757641216e-05, "loss": 0.7273, "step": 8460 }, { "epoch": 2.753741054001301, "grad_norm": 2.1513571739196777, "learning_rate": 2.1506701797167435e-05, "loss": 0.7327, "step": 8465 }, { "epoch": 2.7553675992192583, "grad_norm": 6.521982192993164, "learning_rate": 2.14811485595337e-05, "loss": 0.7357, "step": 8470 }, { "epoch": 2.7569941444372152, "grad_norm": 1.629664659500122, "learning_rate": 2.1455599071972344e-05, "loss": 0.7231, "step": 8475 }, { "epoch": 2.7586206896551726, "grad_norm": 1.6649502515792847, "learning_rate": 2.143005336171166e-05, "loss": 0.7108, "step": 8480 }, { "epoch": 2.7602472348731295, "grad_norm": 1.8977936506271362, "learning_rate": 2.1404511455975977e-05, "loss": 0.7402, "step": 8485 }, { "epoch": 2.7618737800910864, "grad_norm": 1.6810303926467896, "learning_rate": 2.137897338198552e-05, "loss": 0.7045, "step": 8490 }, { "epoch": 2.7635003253090433, "grad_norm": 1.9456619024276733, "learning_rate": 2.1353439166956453e-05, "loss": 0.7221, "step": 8495 }, { "epoch": 2.7651268705270007, "grad_norm": 2.0193536281585693, "learning_rate": 2.1327908838100824e-05, "loss": 0.7471, "step": 8500 }, { "epoch": 2.7667534157449576, "grad_norm": 1.7074350118637085, "learning_rate": 2.1302382422626556e-05, "loss": 0.7446, "step": 8505 }, { "epoch": 2.768379960962915, "grad_norm": 1.6121134757995605, "learning_rate": 2.1276859947737356e-05, "loss": 0.7501, "step": 8510 }, { "epoch": 2.770006506180872, "grad_norm": 1.8085863590240479, "learning_rate": 2.1251341440632783e-05, "loss": 0.7112, "step": 8515 }, { "epoch": 2.7716330513988288, "grad_norm": 1.7475433349609375, "learning_rate": 2.122582692850813e-05, "loss": 0.753, "step": 8520 }, { "epoch": 2.7732595966167857, "grad_norm": 1.9420067071914673, "learning_rate": 2.120031643855446e-05, "loss": 0.7381, "step": 8525 }, { "epoch": 2.774886141834743, "grad_norm": 1.8206048011779785, "learning_rate": 2.117480999795853e-05, "loss": 0.7378, "step": 8530 }, { "epoch": 2.7765126870527, "grad_norm": 1.6537567377090454, "learning_rate": 2.114930763390279e-05, "loss": 0.7196, "step": 8535 }, { "epoch": 2.7781392322706573, "grad_norm": 1.623516321182251, "learning_rate": 2.1123809373565342e-05, "loss": 0.7136, "step": 8540 }, { "epoch": 2.779765777488614, "grad_norm": 2.062002182006836, "learning_rate": 2.1098315244119922e-05, "loss": 0.7179, "step": 8545 }, { "epoch": 2.781392322706571, "grad_norm": 1.7511444091796875, "learning_rate": 2.1072825272735848e-05, "loss": 0.7189, "step": 8550 }, { "epoch": 2.7830188679245285, "grad_norm": 1.8162951469421387, "learning_rate": 2.104733948657803e-05, "loss": 0.714, "step": 8555 }, { "epoch": 2.7846454131424854, "grad_norm": 1.6305124759674072, "learning_rate": 2.102185791280689e-05, "loss": 0.7332, "step": 8560 }, { "epoch": 2.7862719583604423, "grad_norm": 1.8210047483444214, "learning_rate": 2.09963805785784e-05, "loss": 0.7362, "step": 8565 }, { "epoch": 2.7878985035783996, "grad_norm": 1.965907096862793, "learning_rate": 2.0970907511043964e-05, "loss": 0.7378, "step": 8570 }, { "epoch": 2.7895250487963565, "grad_norm": 1.8362945318222046, "learning_rate": 2.0945438737350476e-05, "loss": 0.7399, "step": 8575 }, { "epoch": 2.7911515940143135, "grad_norm": 1.7062568664550781, "learning_rate": 2.0919974284640237e-05, "loss": 0.7436, "step": 8580 }, { "epoch": 2.792778139232271, "grad_norm": 1.911207914352417, "learning_rate": 2.0894514180050964e-05, "loss": 0.7419, "step": 8585 }, { "epoch": 2.7944046844502277, "grad_norm": 1.878778338432312, "learning_rate": 2.0869058450715696e-05, "loss": 0.7517, "step": 8590 }, { "epoch": 2.7960312296681846, "grad_norm": 1.5588253736495972, "learning_rate": 2.0843607123762875e-05, "loss": 0.7775, "step": 8595 }, { "epoch": 2.797657774886142, "grad_norm": 1.6011550426483154, "learning_rate": 2.081816022631618e-05, "loss": 0.7463, "step": 8600 }, { "epoch": 2.799284320104099, "grad_norm": 1.6946686506271362, "learning_rate": 2.079271778549462e-05, "loss": 0.7052, "step": 8605 }, { "epoch": 2.800910865322056, "grad_norm": 1.7881314754486084, "learning_rate": 2.0767279828412442e-05, "loss": 0.7566, "step": 8610 }, { "epoch": 2.802537410540013, "grad_norm": 1.9608092308044434, "learning_rate": 2.0741846382179102e-05, "loss": 0.7481, "step": 8615 }, { "epoch": 2.80416395575797, "grad_norm": 1.8728355169296265, "learning_rate": 2.0716417473899268e-05, "loss": 0.7619, "step": 8620 }, { "epoch": 2.805790500975927, "grad_norm": 1.7436429262161255, "learning_rate": 2.069099313067275e-05, "loss": 0.6805, "step": 8625 }, { "epoch": 2.8074170461938843, "grad_norm": 2.551387071609497, "learning_rate": 2.0665573379594516e-05, "loss": 0.7302, "step": 8630 }, { "epoch": 2.8090435914118412, "grad_norm": 1.6762300729751587, "learning_rate": 2.0640158247754614e-05, "loss": 0.7121, "step": 8635 }, { "epoch": 2.810670136629798, "grad_norm": 1.5577950477600098, "learning_rate": 2.06147477622382e-05, "loss": 0.7586, "step": 8640 }, { "epoch": 2.8122966818477555, "grad_norm": 1.7521088123321533, "learning_rate": 2.0589341950125444e-05, "loss": 0.7032, "step": 8645 }, { "epoch": 2.8139232270657124, "grad_norm": 1.9056010246276855, "learning_rate": 2.056394083849158e-05, "loss": 0.7133, "step": 8650 }, { "epoch": 2.8155497722836698, "grad_norm": 1.8002731800079346, "learning_rate": 2.0538544454406776e-05, "loss": 0.7489, "step": 8655 }, { "epoch": 2.8171763175016267, "grad_norm": 1.6665791273117065, "learning_rate": 2.051315282493622e-05, "loss": 0.7431, "step": 8660 }, { "epoch": 2.8188028627195836, "grad_norm": 1.627205729484558, "learning_rate": 2.048776597713998e-05, "loss": 0.7262, "step": 8665 }, { "epoch": 2.8204294079375405, "grad_norm": 2.089062452316284, "learning_rate": 2.0462383938073078e-05, "loss": 0.7246, "step": 8670 }, { "epoch": 2.822055953155498, "grad_norm": 1.521073341369629, "learning_rate": 2.0437006734785365e-05, "loss": 0.7207, "step": 8675 }, { "epoch": 2.8236824983734548, "grad_norm": 2.04318904876709, "learning_rate": 2.0411634394321578e-05, "loss": 0.7386, "step": 8680 }, { "epoch": 2.825309043591412, "grad_norm": 1.7933502197265625, "learning_rate": 2.0386266943721235e-05, "loss": 0.7441, "step": 8685 }, { "epoch": 2.826935588809369, "grad_norm": 1.615450143814087, "learning_rate": 2.0360904410018676e-05, "loss": 0.7017, "step": 8690 }, { "epoch": 2.828562134027326, "grad_norm": 2.150987386703491, "learning_rate": 2.033554682024298e-05, "loss": 0.7638, "step": 8695 }, { "epoch": 2.830188679245283, "grad_norm": 2.078624963760376, "learning_rate": 2.031019420141797e-05, "loss": 0.7263, "step": 8700 }, { "epoch": 2.83181522446324, "grad_norm": 1.6702829599380493, "learning_rate": 2.028484658056216e-05, "loss": 0.7257, "step": 8705 }, { "epoch": 2.833441769681197, "grad_norm": 1.7530449628829956, "learning_rate": 2.025950398468875e-05, "loss": 0.7051, "step": 8710 }, { "epoch": 2.8350683148991545, "grad_norm": 2.530587673187256, "learning_rate": 2.0234166440805562e-05, "loss": 0.7341, "step": 8715 }, { "epoch": 2.8366948601171114, "grad_norm": 1.8676536083221436, "learning_rate": 2.020883397591507e-05, "loss": 0.7295, "step": 8720 }, { "epoch": 2.8383214053350683, "grad_norm": 1.7714043855667114, "learning_rate": 2.0183506617014293e-05, "loss": 0.7247, "step": 8725 }, { "epoch": 2.839947950553025, "grad_norm": 1.6985597610473633, "learning_rate": 2.015818439109485e-05, "loss": 0.6978, "step": 8730 }, { "epoch": 2.8415744957709825, "grad_norm": 1.7291275262832642, "learning_rate": 2.0132867325142858e-05, "loss": 0.7155, "step": 8735 }, { "epoch": 2.8432010409889394, "grad_norm": 1.6741697788238525, "learning_rate": 2.010755544613895e-05, "loss": 0.7253, "step": 8740 }, { "epoch": 2.844827586206897, "grad_norm": 1.7950025796890259, "learning_rate": 2.008224878105824e-05, "loss": 0.7178, "step": 8745 }, { "epoch": 2.8464541314248537, "grad_norm": 1.9313833713531494, "learning_rate": 2.005694735687025e-05, "loss": 0.7128, "step": 8750 }, { "epoch": 2.8480806766428106, "grad_norm": 1.7579280138015747, "learning_rate": 2.0031651200538963e-05, "loss": 0.6925, "step": 8755 }, { "epoch": 2.8497072218607675, "grad_norm": 1.711421251296997, "learning_rate": 2.000636033902271e-05, "loss": 0.7479, "step": 8760 }, { "epoch": 2.851333767078725, "grad_norm": 1.752677321434021, "learning_rate": 1.998107479927421e-05, "loss": 0.7246, "step": 8765 }, { "epoch": 2.852960312296682, "grad_norm": 2.1234564781188965, "learning_rate": 1.995579460824048e-05, "loss": 0.7231, "step": 8770 }, { "epoch": 2.854586857514639, "grad_norm": 1.6792376041412354, "learning_rate": 1.9930519792862873e-05, "loss": 0.6929, "step": 8775 }, { "epoch": 2.856213402732596, "grad_norm": 1.910710096359253, "learning_rate": 1.9905250380076965e-05, "loss": 0.7176, "step": 8780 }, { "epoch": 2.857839947950553, "grad_norm": 1.9245860576629639, "learning_rate": 1.9879986396812623e-05, "loss": 0.7285, "step": 8785 }, { "epoch": 2.85946649316851, "grad_norm": 1.8107945919036865, "learning_rate": 1.985472786999389e-05, "loss": 0.7178, "step": 8790 }, { "epoch": 2.8610930383864672, "grad_norm": 1.5665243864059448, "learning_rate": 1.982947482653903e-05, "loss": 0.7098, "step": 8795 }, { "epoch": 2.862719583604424, "grad_norm": 1.9117822647094727, "learning_rate": 1.980422729336042e-05, "loss": 0.7184, "step": 8800 }, { "epoch": 2.8643461288223815, "grad_norm": 2.126077651977539, "learning_rate": 1.977898529736462e-05, "loss": 0.7414, "step": 8805 }, { "epoch": 2.8659726740403384, "grad_norm": 1.8685696125030518, "learning_rate": 1.9753748865452226e-05, "loss": 0.7074, "step": 8810 }, { "epoch": 2.8675992192582953, "grad_norm": 1.9692431688308716, "learning_rate": 1.9728518024517957e-05, "loss": 0.6982, "step": 8815 }, { "epoch": 2.869225764476252, "grad_norm": 1.534635066986084, "learning_rate": 1.9703292801450536e-05, "loss": 0.7153, "step": 8820 }, { "epoch": 2.8708523096942096, "grad_norm": 1.5757933855056763, "learning_rate": 1.9678073223132737e-05, "loss": 0.7457, "step": 8825 }, { "epoch": 2.8724788549121665, "grad_norm": 1.720250129699707, "learning_rate": 1.9652859316441267e-05, "loss": 0.7032, "step": 8830 }, { "epoch": 2.874105400130124, "grad_norm": 1.6449079513549805, "learning_rate": 1.9627651108246848e-05, "loss": 0.7061, "step": 8835 }, { "epoch": 2.8757319453480807, "grad_norm": 1.6099705696105957, "learning_rate": 1.9602448625414077e-05, "loss": 0.693, "step": 8840 }, { "epoch": 2.8773584905660377, "grad_norm": 1.9806228876113892, "learning_rate": 1.9577251894801488e-05, "loss": 0.7428, "step": 8845 }, { "epoch": 2.8789850357839946, "grad_norm": 1.7147135734558105, "learning_rate": 1.9552060943261456e-05, "loss": 0.7634, "step": 8850 }, { "epoch": 2.880611581001952, "grad_norm": 1.7568204402923584, "learning_rate": 1.9526875797640226e-05, "loss": 0.7062, "step": 8855 }, { "epoch": 2.882238126219909, "grad_norm": 1.8958820104599, "learning_rate": 1.950169648477782e-05, "loss": 0.7187, "step": 8860 }, { "epoch": 2.883864671437866, "grad_norm": 1.7266991138458252, "learning_rate": 1.947652303150808e-05, "loss": 0.7518, "step": 8865 }, { "epoch": 2.885491216655823, "grad_norm": 1.6969014406204224, "learning_rate": 1.945135546465857e-05, "loss": 0.7407, "step": 8870 }, { "epoch": 2.88711776187378, "grad_norm": 1.9077585935592651, "learning_rate": 1.942619381105061e-05, "loss": 0.7169, "step": 8875 }, { "epoch": 2.888744307091737, "grad_norm": 1.9687460660934448, "learning_rate": 1.9401038097499208e-05, "loss": 0.731, "step": 8880 }, { "epoch": 2.8903708523096943, "grad_norm": 2.1020829677581787, "learning_rate": 1.9375888350813026e-05, "loss": 0.7104, "step": 8885 }, { "epoch": 2.891997397527651, "grad_norm": 2.6251018047332764, "learning_rate": 1.9350744597794405e-05, "loss": 0.6999, "step": 8890 }, { "epoch": 2.8936239427456085, "grad_norm": 2.1141345500946045, "learning_rate": 1.9325606865239243e-05, "loss": 0.7493, "step": 8895 }, { "epoch": 2.8952504879635654, "grad_norm": 2.002952814102173, "learning_rate": 1.9300475179937077e-05, "loss": 0.714, "step": 8900 }, { "epoch": 2.8968770331815223, "grad_norm": 2.4000797271728516, "learning_rate": 1.9275349568670957e-05, "loss": 0.7104, "step": 8905 }, { "epoch": 2.8985035783994793, "grad_norm": 1.9128988981246948, "learning_rate": 1.9250230058217496e-05, "loss": 0.7081, "step": 8910 }, { "epoch": 2.9001301236174366, "grad_norm": 1.6900957822799683, "learning_rate": 1.9225116675346776e-05, "loss": 0.7381, "step": 8915 }, { "epoch": 2.9017566688353935, "grad_norm": 1.6366512775421143, "learning_rate": 1.920000944682237e-05, "loss": 0.6969, "step": 8920 }, { "epoch": 2.903383214053351, "grad_norm": 1.9840657711029053, "learning_rate": 1.9174908399401266e-05, "loss": 0.7298, "step": 8925 }, { "epoch": 2.905009759271308, "grad_norm": 1.681493878364563, "learning_rate": 1.9149813559833897e-05, "loss": 0.7284, "step": 8930 }, { "epoch": 2.9066363044892647, "grad_norm": 1.9548931121826172, "learning_rate": 1.912472495486405e-05, "loss": 0.7197, "step": 8935 }, { "epoch": 2.9082628497072216, "grad_norm": 1.92655611038208, "learning_rate": 1.9099642611228896e-05, "loss": 0.7528, "step": 8940 }, { "epoch": 2.909889394925179, "grad_norm": 1.547681212425232, "learning_rate": 1.907456655565891e-05, "loss": 0.7207, "step": 8945 }, { "epoch": 2.911515940143136, "grad_norm": 1.506739616394043, "learning_rate": 1.9049496814877893e-05, "loss": 0.6939, "step": 8950 }, { "epoch": 2.913142485361093, "grad_norm": 1.6843504905700684, "learning_rate": 1.9024433415602872e-05, "loss": 0.7461, "step": 8955 }, { "epoch": 2.91476903057905, "grad_norm": 1.856924295425415, "learning_rate": 1.899937638454416e-05, "loss": 0.7348, "step": 8960 }, { "epoch": 2.916395575797007, "grad_norm": 2.313263177871704, "learning_rate": 1.8974325748405258e-05, "loss": 0.7271, "step": 8965 }, { "epoch": 2.918022121014964, "grad_norm": 1.6064339876174927, "learning_rate": 1.8949281533882864e-05, "loss": 0.7281, "step": 8970 }, { "epoch": 2.9196486662329213, "grad_norm": 2.2871596813201904, "learning_rate": 1.8924243767666823e-05, "loss": 0.745, "step": 8975 }, { "epoch": 2.921275211450878, "grad_norm": 1.5119794607162476, "learning_rate": 1.8899212476440125e-05, "loss": 0.7236, "step": 8980 }, { "epoch": 2.9229017566688356, "grad_norm": 1.7199755907058716, "learning_rate": 1.887418768687883e-05, "loss": 0.7452, "step": 8985 }, { "epoch": 2.9245283018867925, "grad_norm": 1.876117467880249, "learning_rate": 1.8849169425652095e-05, "loss": 0.7237, "step": 8990 }, { "epoch": 2.9261548471047494, "grad_norm": 1.9358798265457153, "learning_rate": 1.8824157719422112e-05, "loss": 0.7311, "step": 8995 }, { "epoch": 2.9277813923227067, "grad_norm": 1.8423490524291992, "learning_rate": 1.8799152594844093e-05, "loss": 0.7247, "step": 9000 }, { "epoch": 2.9294079375406636, "grad_norm": 3.251728057861328, "learning_rate": 1.8774154078566207e-05, "loss": 0.7415, "step": 9005 }, { "epoch": 2.9310344827586206, "grad_norm": 1.8368016481399536, "learning_rate": 1.8749162197229626e-05, "loss": 0.7178, "step": 9010 }, { "epoch": 2.932661027976578, "grad_norm": 1.7899824380874634, "learning_rate": 1.872417697746843e-05, "loss": 0.7046, "step": 9015 }, { "epoch": 2.934287573194535, "grad_norm": 1.997498869895935, "learning_rate": 1.8699198445909572e-05, "loss": 0.7239, "step": 9020 }, { "epoch": 2.9359141184124917, "grad_norm": 1.5661916732788086, "learning_rate": 1.8674226629172925e-05, "loss": 0.7231, "step": 9025 }, { "epoch": 2.937540663630449, "grad_norm": 1.6990458965301514, "learning_rate": 1.8649261553871176e-05, "loss": 0.7206, "step": 9030 }, { "epoch": 2.939167208848406, "grad_norm": 1.7623580694198608, "learning_rate": 1.8624303246609847e-05, "loss": 0.7218, "step": 9035 }, { "epoch": 2.940793754066363, "grad_norm": 1.6274583339691162, "learning_rate": 1.8599351733987205e-05, "loss": 0.7362, "step": 9040 }, { "epoch": 2.9424202992843203, "grad_norm": 2.1584455966949463, "learning_rate": 1.8574407042594348e-05, "loss": 0.7171, "step": 9045 }, { "epoch": 2.944046844502277, "grad_norm": 1.5133984088897705, "learning_rate": 1.854946919901503e-05, "loss": 0.7399, "step": 9050 }, { "epoch": 2.945673389720234, "grad_norm": 1.7558419704437256, "learning_rate": 1.8524538229825757e-05, "loss": 0.7382, "step": 9055 }, { "epoch": 2.9472999349381914, "grad_norm": 1.7353185415267944, "learning_rate": 1.8499614161595685e-05, "loss": 0.7105, "step": 9060 }, { "epoch": 2.9489264801561483, "grad_norm": 2.38606595993042, "learning_rate": 1.8474697020886636e-05, "loss": 0.7605, "step": 9065 }, { "epoch": 2.9505530253741052, "grad_norm": 1.7944279909133911, "learning_rate": 1.8449786834253015e-05, "loss": 0.715, "step": 9070 }, { "epoch": 2.9521795705920626, "grad_norm": 1.5300099849700928, "learning_rate": 1.8424883628241857e-05, "loss": 0.7015, "step": 9075 }, { "epoch": 2.9538061158100195, "grad_norm": 1.6979626417160034, "learning_rate": 1.8399987429392722e-05, "loss": 0.7271, "step": 9080 }, { "epoch": 2.9554326610279764, "grad_norm": 1.8683507442474365, "learning_rate": 1.837509826423773e-05, "loss": 0.687, "step": 9085 }, { "epoch": 2.9570592062459338, "grad_norm": 1.8588496446609497, "learning_rate": 1.8350216159301483e-05, "loss": 0.7267, "step": 9090 }, { "epoch": 2.9586857514638907, "grad_norm": 1.57637619972229, "learning_rate": 1.8325341141101087e-05, "loss": 0.707, "step": 9095 }, { "epoch": 2.960312296681848, "grad_norm": 1.714156985282898, "learning_rate": 1.830047323614606e-05, "loss": 0.7032, "step": 9100 }, { "epoch": 2.961938841899805, "grad_norm": 1.7165101766586304, "learning_rate": 1.827561247093836e-05, "loss": 0.6934, "step": 9105 }, { "epoch": 2.963565387117762, "grad_norm": 1.688585877418518, "learning_rate": 1.8250758871972335e-05, "loss": 0.7076, "step": 9110 }, { "epoch": 2.9651919323357188, "grad_norm": 1.927809476852417, "learning_rate": 1.82259124657347e-05, "loss": 0.7257, "step": 9115 }, { "epoch": 2.966818477553676, "grad_norm": 1.6875810623168945, "learning_rate": 1.8201073278704492e-05, "loss": 0.7341, "step": 9120 }, { "epoch": 2.968445022771633, "grad_norm": 1.7902840375900269, "learning_rate": 1.8176241337353073e-05, "loss": 0.7162, "step": 9125 }, { "epoch": 2.9700715679895904, "grad_norm": 1.5761710405349731, "learning_rate": 1.815141666814405e-05, "loss": 0.7042, "step": 9130 }, { "epoch": 2.9716981132075473, "grad_norm": 1.8276005983352661, "learning_rate": 1.812659929753332e-05, "loss": 0.7388, "step": 9135 }, { "epoch": 2.973324658425504, "grad_norm": 1.7913482189178467, "learning_rate": 1.810178925196897e-05, "loss": 0.7465, "step": 9140 }, { "epoch": 2.974951203643461, "grad_norm": 1.6558361053466797, "learning_rate": 1.80769865578913e-05, "loss": 0.7269, "step": 9145 }, { "epoch": 2.9765777488614185, "grad_norm": 1.754176378250122, "learning_rate": 1.805219124173278e-05, "loss": 0.7365, "step": 9150 }, { "epoch": 2.9782042940793754, "grad_norm": 1.735854148864746, "learning_rate": 1.802740332991799e-05, "loss": 0.7312, "step": 9155 }, { "epoch": 2.9798308392973327, "grad_norm": 1.854060173034668, "learning_rate": 1.800262284886365e-05, "loss": 0.6995, "step": 9160 }, { "epoch": 2.9814573845152896, "grad_norm": 1.8062974214553833, "learning_rate": 1.797784982497853e-05, "loss": 0.7316, "step": 9165 }, { "epoch": 2.9830839297332465, "grad_norm": 1.7298572063446045, "learning_rate": 1.7953084284663486e-05, "loss": 0.7445, "step": 9170 }, { "epoch": 2.9847104749512035, "grad_norm": 1.7473677396774292, "learning_rate": 1.7928326254311363e-05, "loss": 0.7474, "step": 9175 }, { "epoch": 2.986337020169161, "grad_norm": 1.807447910308838, "learning_rate": 1.7903575760307044e-05, "loss": 0.6988, "step": 9180 }, { "epoch": 2.9879635653871177, "grad_norm": 2.072762966156006, "learning_rate": 1.787883282902734e-05, "loss": 0.7396, "step": 9185 }, { "epoch": 2.989590110605075, "grad_norm": 1.9413923025131226, "learning_rate": 1.7854097486841044e-05, "loss": 0.759, "step": 9190 }, { "epoch": 2.991216655823032, "grad_norm": 1.696505069732666, "learning_rate": 1.782936976010881e-05, "loss": 0.7233, "step": 9195 }, { "epoch": 2.992843201040989, "grad_norm": 1.5802931785583496, "learning_rate": 1.7804649675183223e-05, "loss": 0.7215, "step": 9200 }, { "epoch": 2.994469746258946, "grad_norm": 1.557307243347168, "learning_rate": 1.7779937258408685e-05, "loss": 0.7219, "step": 9205 }, { "epoch": 2.996096291476903, "grad_norm": 1.7299686670303345, "learning_rate": 1.7755232536121477e-05, "loss": 0.6935, "step": 9210 }, { "epoch": 2.99772283669486, "grad_norm": 1.9171065092086792, "learning_rate": 1.7730535534649614e-05, "loss": 0.7665, "step": 9215 }, { "epoch": 2.9993493819128174, "grad_norm": 1.767626166343689, "learning_rate": 1.7705846280312948e-05, "loss": 0.687, "step": 9220 }, { "epoch": 3.0, "eval_f1": 0.824066158814795, "eval_loss": 0.403564453125, "eval_precision": 0.8248737220377281, "eval_recall": 0.8234550337582588, "eval_runtime": 1028.979, "eval_samples_per_second": 382.355, "eval_steps_per_second": 0.747, "step": 9222 }, { "epoch": 3.0009759271307743, "grad_norm": 1.4708726406097412, "learning_rate": 1.768116479942303e-05, "loss": 0.6647, "step": 9225 }, { "epoch": 3.0026024723487312, "grad_norm": 1.9929927587509155, "learning_rate": 1.7656491118283135e-05, "loss": 0.6565, "step": 9230 }, { "epoch": 3.004229017566688, "grad_norm": 1.7582905292510986, "learning_rate": 1.7631825263188246e-05, "loss": 0.6723, "step": 9235 }, { "epoch": 3.0058555627846455, "grad_norm": 1.9024964570999146, "learning_rate": 1.760716726042499e-05, "loss": 0.6528, "step": 9240 }, { "epoch": 3.0074821080026024, "grad_norm": 1.9164223670959473, "learning_rate": 1.7582517136271616e-05, "loss": 0.647, "step": 9245 }, { "epoch": 3.0091086532205593, "grad_norm": 1.8600401878356934, "learning_rate": 1.7557874916997996e-05, "loss": 0.6601, "step": 9250 }, { "epoch": 3.0107351984385167, "grad_norm": 1.8095484972000122, "learning_rate": 1.7533240628865567e-05, "loss": 0.6627, "step": 9255 }, { "epoch": 3.0123617436564736, "grad_norm": 2.208045244216919, "learning_rate": 1.7508614298127322e-05, "loss": 0.6739, "step": 9260 }, { "epoch": 3.0139882888744305, "grad_norm": 2.595229387283325, "learning_rate": 1.7483995951027767e-05, "loss": 0.6648, "step": 9265 }, { "epoch": 3.015614834092388, "grad_norm": 1.7185879945755005, "learning_rate": 1.7459385613802903e-05, "loss": 0.6742, "step": 9270 }, { "epoch": 3.0172413793103448, "grad_norm": 1.649437665939331, "learning_rate": 1.743478331268018e-05, "loss": 0.6808, "step": 9275 }, { "epoch": 3.018867924528302, "grad_norm": 2.5946407318115234, "learning_rate": 1.7410189073878513e-05, "loss": 0.6392, "step": 9280 }, { "epoch": 3.020494469746259, "grad_norm": 1.7240699529647827, "learning_rate": 1.7385602923608192e-05, "loss": 0.6396, "step": 9285 }, { "epoch": 3.022121014964216, "grad_norm": 2.009359359741211, "learning_rate": 1.736102488807092e-05, "loss": 0.6712, "step": 9290 }, { "epoch": 3.0237475601821733, "grad_norm": 1.9779503345489502, "learning_rate": 1.7336454993459726e-05, "loss": 0.6742, "step": 9295 }, { "epoch": 3.02537410540013, "grad_norm": 1.8295644521713257, "learning_rate": 1.7311893265958974e-05, "loss": 0.6896, "step": 9300 }, { "epoch": 3.027000650618087, "grad_norm": 1.862802267074585, "learning_rate": 1.7287339731744336e-05, "loss": 0.646, "step": 9305 }, { "epoch": 3.0286271958360445, "grad_norm": 2.442687749862671, "learning_rate": 1.7262794416982716e-05, "loss": 0.673, "step": 9310 }, { "epoch": 3.0302537410540014, "grad_norm": 1.9544475078582764, "learning_rate": 1.72382573478323e-05, "loss": 0.6643, "step": 9315 }, { "epoch": 3.0318802862719583, "grad_norm": 2.301948070526123, "learning_rate": 1.721372855044246e-05, "loss": 0.6631, "step": 9320 }, { "epoch": 3.0335068314899156, "grad_norm": 1.9081635475158691, "learning_rate": 1.7189208050953765e-05, "loss": 0.6859, "step": 9325 }, { "epoch": 3.0351333767078725, "grad_norm": 1.8731703758239746, "learning_rate": 1.7164695875497928e-05, "loss": 0.6686, "step": 9330 }, { "epoch": 3.0367599219258294, "grad_norm": 2.1195106506347656, "learning_rate": 1.714019205019782e-05, "loss": 0.6526, "step": 9335 }, { "epoch": 3.038386467143787, "grad_norm": 2.022167444229126, "learning_rate": 1.711569660116737e-05, "loss": 0.664, "step": 9340 }, { "epoch": 3.0400130123617437, "grad_norm": 1.7911338806152344, "learning_rate": 1.709120955451162e-05, "loss": 0.6774, "step": 9345 }, { "epoch": 3.0416395575797006, "grad_norm": 2.0143144130706787, "learning_rate": 1.706673093632663e-05, "loss": 0.691, "step": 9350 }, { "epoch": 3.043266102797658, "grad_norm": 1.735304594039917, "learning_rate": 1.70422607726995e-05, "loss": 0.6552, "step": 9355 }, { "epoch": 3.044892648015615, "grad_norm": 1.8288079500198364, "learning_rate": 1.7017799089708293e-05, "loss": 0.6673, "step": 9360 }, { "epoch": 3.046519193233572, "grad_norm": 2.4522781372070312, "learning_rate": 1.699334591342207e-05, "loss": 0.6906, "step": 9365 }, { "epoch": 3.048145738451529, "grad_norm": 1.8423187732696533, "learning_rate": 1.696890126990079e-05, "loss": 0.6622, "step": 9370 }, { "epoch": 3.049772283669486, "grad_norm": 2.284074544906616, "learning_rate": 1.694446518519534e-05, "loss": 0.7015, "step": 9375 }, { "epoch": 3.051398828887443, "grad_norm": 1.8895907402038574, "learning_rate": 1.692003768534747e-05, "loss": 0.6525, "step": 9380 }, { "epoch": 3.0530253741054003, "grad_norm": 2.285318613052368, "learning_rate": 1.689561879638982e-05, "loss": 0.6873, "step": 9385 }, { "epoch": 3.0546519193233572, "grad_norm": 1.9741708040237427, "learning_rate": 1.687120854434579e-05, "loss": 0.6642, "step": 9390 }, { "epoch": 3.056278464541314, "grad_norm": 1.918621301651001, "learning_rate": 1.684680695522964e-05, "loss": 0.6556, "step": 9395 }, { "epoch": 3.0579050097592715, "grad_norm": 1.9934749603271484, "learning_rate": 1.682241405504634e-05, "loss": 0.6549, "step": 9400 }, { "epoch": 3.0595315549772284, "grad_norm": 1.994615912437439, "learning_rate": 1.679802986979165e-05, "loss": 0.6503, "step": 9405 }, { "epoch": 3.0611581001951853, "grad_norm": 1.8361835479736328, "learning_rate": 1.6773654425452007e-05, "loss": 0.6581, "step": 9410 }, { "epoch": 3.0627846454131427, "grad_norm": 1.9985431432724, "learning_rate": 1.6749287748004567e-05, "loss": 0.6606, "step": 9415 }, { "epoch": 3.0644111906310996, "grad_norm": 1.8159586191177368, "learning_rate": 1.6724929863417094e-05, "loss": 0.6215, "step": 9420 }, { "epoch": 3.0660377358490565, "grad_norm": 1.887814998626709, "learning_rate": 1.670058079764802e-05, "loss": 0.6597, "step": 9425 }, { "epoch": 3.067664281067014, "grad_norm": 2.240401029586792, "learning_rate": 1.6676240576646387e-05, "loss": 0.6489, "step": 9430 }, { "epoch": 3.0692908262849707, "grad_norm": 1.9252859354019165, "learning_rate": 1.665190922635177e-05, "loss": 0.6683, "step": 9435 }, { "epoch": 3.0709173715029277, "grad_norm": 1.790296196937561, "learning_rate": 1.662758677269432e-05, "loss": 0.6634, "step": 9440 }, { "epoch": 3.072543916720885, "grad_norm": 1.9673635959625244, "learning_rate": 1.66032732415947e-05, "loss": 0.6765, "step": 9445 }, { "epoch": 3.074170461938842, "grad_norm": 1.8681625127792358, "learning_rate": 1.658382885836926e-05, "loss": 0.6526, "step": 9450 }, { "epoch": 3.075797007156799, "grad_norm": 1.5958857536315918, "learning_rate": 1.655953145316344e-05, "loss": 0.6675, "step": 9455 }, { "epoch": 3.077423552374756, "grad_norm": 2.2521891593933105, "learning_rate": 1.6535243043042624e-05, "loss": 0.6506, "step": 9460 }, { "epoch": 3.079050097592713, "grad_norm": 2.3123247623443604, "learning_rate": 1.651096365389121e-05, "loss": 0.63, "step": 9465 }, { "epoch": 3.08067664281067, "grad_norm": 1.9048078060150146, "learning_rate": 1.6486693311583946e-05, "loss": 0.6817, "step": 9470 }, { "epoch": 3.0823031880286273, "grad_norm": 2.893087387084961, "learning_rate": 1.6462432041985988e-05, "loss": 0.6824, "step": 9475 }, { "epoch": 3.0839297332465843, "grad_norm": 1.932302713394165, "learning_rate": 1.6438179870952762e-05, "loss": 0.6573, "step": 9480 }, { "epoch": 3.085556278464541, "grad_norm": 1.8314918279647827, "learning_rate": 1.641393682433005e-05, "loss": 0.6671, "step": 9485 }, { "epoch": 3.0871828236824985, "grad_norm": 1.8099032640457153, "learning_rate": 1.6389702927953876e-05, "loss": 0.6589, "step": 9490 }, { "epoch": 3.0888093689004554, "grad_norm": 2.079960584640503, "learning_rate": 1.6365478207650548e-05, "loss": 0.657, "step": 9495 }, { "epoch": 3.0904359141184123, "grad_norm": 2.1277928352355957, "learning_rate": 1.634126268923655e-05, "loss": 0.6561, "step": 9500 }, { "epoch": 3.0920624593363697, "grad_norm": 2.0239083766937256, "learning_rate": 1.6317056398518603e-05, "loss": 0.6121, "step": 9505 }, { "epoch": 3.0936890045543266, "grad_norm": 2.638597249984741, "learning_rate": 1.6292859361293554e-05, "loss": 0.6291, "step": 9510 }, { "epoch": 3.0953155497722835, "grad_norm": 2.395009994506836, "learning_rate": 1.626867160334843e-05, "loss": 0.6799, "step": 9515 }, { "epoch": 3.096942094990241, "grad_norm": 2.195422410964966, "learning_rate": 1.624449315046032e-05, "loss": 0.6687, "step": 9520 }, { "epoch": 3.0985686402081978, "grad_norm": 2.0896871089935303, "learning_rate": 1.622032402839645e-05, "loss": 0.642, "step": 9525 }, { "epoch": 3.1001951854261547, "grad_norm": 1.827916145324707, "learning_rate": 1.6196164262914064e-05, "loss": 0.6713, "step": 9530 }, { "epoch": 3.101821730644112, "grad_norm": 1.917366623878479, "learning_rate": 1.617201387976045e-05, "loss": 0.673, "step": 9535 }, { "epoch": 3.103448275862069, "grad_norm": 1.6420366764068604, "learning_rate": 1.6147872904672887e-05, "loss": 0.6736, "step": 9540 }, { "epoch": 3.105074821080026, "grad_norm": 1.9722702503204346, "learning_rate": 1.612374136337864e-05, "loss": 0.6429, "step": 9545 }, { "epoch": 3.106701366297983, "grad_norm": 2.036595582962036, "learning_rate": 1.609961928159491e-05, "loss": 0.6805, "step": 9550 }, { "epoch": 3.10832791151594, "grad_norm": 2.1223628520965576, "learning_rate": 1.6075506685028825e-05, "loss": 0.6756, "step": 9555 }, { "epoch": 3.109954456733897, "grad_norm": 1.976130485534668, "learning_rate": 1.6051403599377405e-05, "loss": 0.6489, "step": 9560 }, { "epoch": 3.1115810019518544, "grad_norm": 2.3929290771484375, "learning_rate": 1.6027310050327522e-05, "loss": 0.6823, "step": 9565 }, { "epoch": 3.1132075471698113, "grad_norm": 1.8878233432769775, "learning_rate": 1.6003226063555905e-05, "loss": 0.6805, "step": 9570 }, { "epoch": 3.114834092387768, "grad_norm": 2.0991029739379883, "learning_rate": 1.5979151664729062e-05, "loss": 0.6892, "step": 9575 }, { "epoch": 3.1164606376057256, "grad_norm": 1.9038327932357788, "learning_rate": 1.5955086879503316e-05, "loss": 0.6709, "step": 9580 }, { "epoch": 3.1180871828236825, "grad_norm": 1.7656376361846924, "learning_rate": 1.5931031733524727e-05, "loss": 0.6424, "step": 9585 }, { "epoch": 3.1197137280416394, "grad_norm": 1.930082082748413, "learning_rate": 1.5906986252429087e-05, "loss": 0.6529, "step": 9590 }, { "epoch": 3.1213402732595967, "grad_norm": 2.5808334350585938, "learning_rate": 1.5882950461841872e-05, "loss": 0.6669, "step": 9595 }, { "epoch": 3.1229668184775536, "grad_norm": 2.0705926418304443, "learning_rate": 1.585892438737827e-05, "loss": 0.6482, "step": 9600 }, { "epoch": 3.1245933636955106, "grad_norm": 2.081015110015869, "learning_rate": 1.5834908054643073e-05, "loss": 0.6803, "step": 9605 }, { "epoch": 3.126219908913468, "grad_norm": 1.792731523513794, "learning_rate": 1.581090148923071e-05, "loss": 0.6707, "step": 9610 }, { "epoch": 3.127846454131425, "grad_norm": 2.385056972503662, "learning_rate": 1.5786904716725196e-05, "loss": 0.6334, "step": 9615 }, { "epoch": 3.1294729993493817, "grad_norm": 2.067519187927246, "learning_rate": 1.576291776270013e-05, "loss": 0.6579, "step": 9620 }, { "epoch": 3.131099544567339, "grad_norm": 1.966447353363037, "learning_rate": 1.5738940652718594e-05, "loss": 0.6396, "step": 9625 }, { "epoch": 3.132726089785296, "grad_norm": 2.1981186866760254, "learning_rate": 1.5714973412333257e-05, "loss": 0.6833, "step": 9630 }, { "epoch": 3.134352635003253, "grad_norm": 1.8088887929916382, "learning_rate": 1.56910160670862e-05, "loss": 0.6726, "step": 9635 }, { "epoch": 3.1359791802212102, "grad_norm": 1.9311386346817017, "learning_rate": 1.5667068642508996e-05, "loss": 0.6712, "step": 9640 }, { "epoch": 3.137605725439167, "grad_norm": 1.7471827268600464, "learning_rate": 1.5643131164122626e-05, "loss": 0.6524, "step": 9645 }, { "epoch": 3.139232270657124, "grad_norm": 1.9465330839157104, "learning_rate": 1.5619203657437503e-05, "loss": 0.6627, "step": 9650 }, { "epoch": 3.1408588158750814, "grad_norm": 2.1639490127563477, "learning_rate": 1.5595286147953364e-05, "loss": 0.656, "step": 9655 }, { "epoch": 3.1424853610930383, "grad_norm": 2.0224697589874268, "learning_rate": 1.5571378661159337e-05, "loss": 0.6441, "step": 9660 }, { "epoch": 3.1441119063109952, "grad_norm": 1.8008503913879395, "learning_rate": 1.5547481222533846e-05, "loss": 0.6109, "step": 9665 }, { "epoch": 3.1457384515289526, "grad_norm": 1.9340412616729736, "learning_rate": 1.552359385754461e-05, "loss": 0.7001, "step": 9670 }, { "epoch": 3.1473649967469095, "grad_norm": 2.107783794403076, "learning_rate": 1.549971659164861e-05, "loss": 0.6766, "step": 9675 }, { "epoch": 3.1489915419648664, "grad_norm": 1.9612401723861694, "learning_rate": 1.5475849450292085e-05, "loss": 0.6526, "step": 9680 }, { "epoch": 3.1506180871828238, "grad_norm": 1.7919425964355469, "learning_rate": 1.5451992458910442e-05, "loss": 0.659, "step": 9685 }, { "epoch": 3.1522446324007807, "grad_norm": 2.4329445362091064, "learning_rate": 1.542814564292831e-05, "loss": 0.6392, "step": 9690 }, { "epoch": 3.153871177618738, "grad_norm": 2.0440330505371094, "learning_rate": 1.540430902775946e-05, "loss": 0.6085, "step": 9695 }, { "epoch": 3.155497722836695, "grad_norm": 2.4745633602142334, "learning_rate": 1.5380482638806794e-05, "loss": 0.6549, "step": 9700 }, { "epoch": 3.157124268054652, "grad_norm": 2.1841065883636475, "learning_rate": 1.5356666501462314e-05, "loss": 0.6245, "step": 9705 }, { "epoch": 3.1587508132726088, "grad_norm": 4.957603454589844, "learning_rate": 1.533286064110709e-05, "loss": 0.6846, "step": 9710 }, { "epoch": 3.160377358490566, "grad_norm": 2.1394810676574707, "learning_rate": 1.5309065083111255e-05, "loss": 0.641, "step": 9715 }, { "epoch": 3.162003903708523, "grad_norm": 1.9957211017608643, "learning_rate": 1.5285279852833944e-05, "loss": 0.6611, "step": 9720 }, { "epoch": 3.1636304489264804, "grad_norm": 2.25938081741333, "learning_rate": 1.5261504975623306e-05, "loss": 0.6455, "step": 9725 }, { "epoch": 3.1652569941444373, "grad_norm": 2.0558090209960938, "learning_rate": 1.5237740476816436e-05, "loss": 0.6612, "step": 9730 }, { "epoch": 3.166883539362394, "grad_norm": 1.9984946250915527, "learning_rate": 1.5213986381739393e-05, "loss": 0.6593, "step": 9735 }, { "epoch": 3.168510084580351, "grad_norm": 2.157356023788452, "learning_rate": 1.519024271570712e-05, "loss": 0.6646, "step": 9740 }, { "epoch": 3.1701366297983085, "grad_norm": 1.8496346473693848, "learning_rate": 1.5166509504023473e-05, "loss": 0.6537, "step": 9745 }, { "epoch": 3.1717631750162654, "grad_norm": 2.2275230884552, "learning_rate": 1.5142786771981146e-05, "loss": 0.6445, "step": 9750 }, { "epoch": 3.1733897202342227, "grad_norm": 1.8542896509170532, "learning_rate": 1.5119074544861678e-05, "loss": 0.6578, "step": 9755 }, { "epoch": 3.1750162654521796, "grad_norm": 2.0999529361724854, "learning_rate": 1.5095372847935396e-05, "loss": 0.6538, "step": 9760 }, { "epoch": 3.1766428106701365, "grad_norm": 1.8991894721984863, "learning_rate": 1.5071681706461438e-05, "loss": 0.6359, "step": 9765 }, { "epoch": 3.178269355888094, "grad_norm": 2.260857105255127, "learning_rate": 1.5048001145687646e-05, "loss": 0.6646, "step": 9770 }, { "epoch": 3.179895901106051, "grad_norm": 1.817297339439392, "learning_rate": 1.5024331190850637e-05, "loss": 0.6469, "step": 9775 }, { "epoch": 3.1815224463240077, "grad_norm": 1.993095874786377, "learning_rate": 1.5000671867175678e-05, "loss": 0.6479, "step": 9780 }, { "epoch": 3.183148991541965, "grad_norm": 2.3499114513397217, "learning_rate": 1.4977023199876743e-05, "loss": 0.7044, "step": 9785 }, { "epoch": 3.184775536759922, "grad_norm": 1.8569183349609375, "learning_rate": 1.4953385214156423e-05, "loss": 0.6655, "step": 9790 }, { "epoch": 3.186402081977879, "grad_norm": 1.9735565185546875, "learning_rate": 1.4929757935205951e-05, "loss": 0.6926, "step": 9795 }, { "epoch": 3.1880286271958362, "grad_norm": 2.0960886478424072, "learning_rate": 1.490614138820512e-05, "loss": 0.6899, "step": 9800 }, { "epoch": 3.189655172413793, "grad_norm": 1.7670766115188599, "learning_rate": 1.4882535598322311e-05, "loss": 0.6324, "step": 9805 }, { "epoch": 3.19128171763175, "grad_norm": 1.9267206192016602, "learning_rate": 1.4858940590714427e-05, "loss": 0.6515, "step": 9810 }, { "epoch": 3.1929082628497074, "grad_norm": 1.7379379272460938, "learning_rate": 1.4835356390526888e-05, "loss": 0.6452, "step": 9815 }, { "epoch": 3.1945348080676643, "grad_norm": 2.3884451389312744, "learning_rate": 1.481178302289359e-05, "loss": 0.6574, "step": 9820 }, { "epoch": 3.1961613532856212, "grad_norm": 2.4783401489257812, "learning_rate": 1.478822051293689e-05, "loss": 0.6291, "step": 9825 }, { "epoch": 3.1977878985035786, "grad_norm": 2.2705469131469727, "learning_rate": 1.476466888576758e-05, "loss": 0.6549, "step": 9830 }, { "epoch": 3.1994144437215355, "grad_norm": 1.7577061653137207, "learning_rate": 1.4741128166484824e-05, "loss": 0.6498, "step": 9835 }, { "epoch": 3.2010409889394924, "grad_norm": 2.099090576171875, "learning_rate": 1.4717598380176212e-05, "loss": 0.6521, "step": 9840 }, { "epoch": 3.2026675341574498, "grad_norm": 2.067274570465088, "learning_rate": 1.4694079551917629e-05, "loss": 0.6709, "step": 9845 }, { "epoch": 3.2042940793754067, "grad_norm": 2.2046360969543457, "learning_rate": 1.4670571706773318e-05, "loss": 0.6753, "step": 9850 }, { "epoch": 3.2059206245933636, "grad_norm": 2.7054951190948486, "learning_rate": 1.4647074869795802e-05, "loss": 0.6428, "step": 9855 }, { "epoch": 3.207547169811321, "grad_norm": 1.866786241531372, "learning_rate": 1.462358906602589e-05, "loss": 0.6797, "step": 9860 }, { "epoch": 3.209173715029278, "grad_norm": 2.2875099182128906, "learning_rate": 1.4600114320492594e-05, "loss": 0.67, "step": 9865 }, { "epoch": 3.2108002602472347, "grad_norm": 2.176388740539551, "learning_rate": 1.4576650658213191e-05, "loss": 0.6426, "step": 9870 }, { "epoch": 3.212426805465192, "grad_norm": 1.9909074306488037, "learning_rate": 1.4553198104193094e-05, "loss": 0.6714, "step": 9875 }, { "epoch": 3.214053350683149, "grad_norm": 2.1360368728637695, "learning_rate": 1.452975668342594e-05, "loss": 0.6961, "step": 9880 }, { "epoch": 3.215679895901106, "grad_norm": 1.810542106628418, "learning_rate": 1.450632642089344e-05, "loss": 0.6336, "step": 9885 }, { "epoch": 3.2173064411190633, "grad_norm": 1.868118405342102, "learning_rate": 1.448290734156546e-05, "loss": 0.6469, "step": 9890 }, { "epoch": 3.21893298633702, "grad_norm": 2.1547558307647705, "learning_rate": 1.445949947039991e-05, "loss": 0.6085, "step": 9895 }, { "epoch": 3.220559531554977, "grad_norm": 2.0586001873016357, "learning_rate": 1.443610283234279e-05, "loss": 0.6509, "step": 9900 }, { "epoch": 3.2221860767729344, "grad_norm": 2.112361431121826, "learning_rate": 1.4412717452328084e-05, "loss": 0.6525, "step": 9905 }, { "epoch": 3.2238126219908914, "grad_norm": 2.0411717891693115, "learning_rate": 1.4389343355277852e-05, "loss": 0.6475, "step": 9910 }, { "epoch": 3.2254391672088483, "grad_norm": 2.1603314876556396, "learning_rate": 1.4365980566102044e-05, "loss": 0.6718, "step": 9915 }, { "epoch": 3.2270657124268056, "grad_norm": 1.6672358512878418, "learning_rate": 1.4342629109698627e-05, "loss": 0.6203, "step": 9920 }, { "epoch": 3.2286922576447625, "grad_norm": 2.003319263458252, "learning_rate": 1.431928901095344e-05, "loss": 0.6448, "step": 9925 }, { "epoch": 3.2303188028627194, "grad_norm": 1.9274591207504272, "learning_rate": 1.4295960294740263e-05, "loss": 0.6595, "step": 9930 }, { "epoch": 3.231945348080677, "grad_norm": 2.407855749130249, "learning_rate": 1.4272642985920704e-05, "loss": 0.6695, "step": 9935 }, { "epoch": 3.2335718932986337, "grad_norm": 2.0356640815734863, "learning_rate": 1.4249337109344241e-05, "loss": 0.6617, "step": 9940 }, { "epoch": 3.2351984385165906, "grad_norm": 1.9890923500061035, "learning_rate": 1.4226042689848163e-05, "loss": 0.6702, "step": 9945 }, { "epoch": 3.236824983734548, "grad_norm": 2.0976686477661133, "learning_rate": 1.4202759752257555e-05, "loss": 0.6487, "step": 9950 }, { "epoch": 3.238451528952505, "grad_norm": 2.0105624198913574, "learning_rate": 1.4179488321385243e-05, "loss": 0.6666, "step": 9955 }, { "epoch": 3.240078074170462, "grad_norm": 1.6774218082427979, "learning_rate": 1.415622842203182e-05, "loss": 0.6585, "step": 9960 }, { "epoch": 3.241704619388419, "grad_norm": 2.181556463241577, "learning_rate": 1.4132980078985553e-05, "loss": 0.6487, "step": 9965 }, { "epoch": 3.243331164606376, "grad_norm": 2.0243122577667236, "learning_rate": 1.4109743317022434e-05, "loss": 0.6852, "step": 9970 }, { "epoch": 3.244957709824333, "grad_norm": 2.3831090927124023, "learning_rate": 1.4086518160906084e-05, "loss": 0.6428, "step": 9975 }, { "epoch": 3.2465842550422903, "grad_norm": 2.026235818862915, "learning_rate": 1.4063304635387773e-05, "loss": 0.6523, "step": 9980 }, { "epoch": 3.248210800260247, "grad_norm": 2.0918619632720947, "learning_rate": 1.4040102765206375e-05, "loss": 0.6746, "step": 9985 }, { "epoch": 3.249837345478204, "grad_norm": 1.862228274345398, "learning_rate": 1.4016912575088318e-05, "loss": 0.6756, "step": 9990 }, { "epoch": 3.2514638906961615, "grad_norm": 2.001894235610962, "learning_rate": 1.3993734089747617e-05, "loss": 0.6531, "step": 9995 }, { "epoch": 3.2530904359141184, "grad_norm": 1.8046789169311523, "learning_rate": 1.3970567333885786e-05, "loss": 0.6547, "step": 10000 }, { "epoch": 3.2547169811320753, "grad_norm": 1.9411813020706177, "learning_rate": 1.3947412332191855e-05, "loss": 0.6805, "step": 10005 }, { "epoch": 3.2563435263500327, "grad_norm": 1.9151054620742798, "learning_rate": 1.392426910934232e-05, "loss": 0.6648, "step": 10010 }, { "epoch": 3.2579700715679896, "grad_norm": 1.8788055181503296, "learning_rate": 1.3901137690001137e-05, "loss": 0.6224, "step": 10015 }, { "epoch": 3.2595966167859465, "grad_norm": 2.2758357524871826, "learning_rate": 1.3878018098819657e-05, "loss": 0.6579, "step": 10020 }, { "epoch": 3.261223162003904, "grad_norm": 2.455655813217163, "learning_rate": 1.3854910360436657e-05, "loss": 0.6604, "step": 10025 }, { "epoch": 3.2628497072218607, "grad_norm": 2.1581814289093018, "learning_rate": 1.383181449947825e-05, "loss": 0.688, "step": 10030 }, { "epoch": 3.2644762524398176, "grad_norm": 2.1598403453826904, "learning_rate": 1.3808730540557913e-05, "loss": 0.6521, "step": 10035 }, { "epoch": 3.266102797657775, "grad_norm": 2.160384178161621, "learning_rate": 1.3785658508276436e-05, "loss": 0.6739, "step": 10040 }, { "epoch": 3.267729342875732, "grad_norm": 1.7373710870742798, "learning_rate": 1.3762598427221906e-05, "loss": 0.674, "step": 10045 }, { "epoch": 3.269355888093689, "grad_norm": 2.675269365310669, "learning_rate": 1.3739550321969647e-05, "loss": 0.6825, "step": 10050 }, { "epoch": 3.270982433311646, "grad_norm": 1.9858858585357666, "learning_rate": 1.3716514217082249e-05, "loss": 0.6811, "step": 10055 }, { "epoch": 3.272608978529603, "grad_norm": 2.0622477531433105, "learning_rate": 1.3693490137109485e-05, "loss": 0.633, "step": 10060 }, { "epoch": 3.27423552374756, "grad_norm": 2.0745816230773926, "learning_rate": 1.3670478106588341e-05, "loss": 0.6432, "step": 10065 }, { "epoch": 3.2758620689655173, "grad_norm": 1.9760076999664307, "learning_rate": 1.364747815004295e-05, "loss": 0.6758, "step": 10070 }, { "epoch": 3.2774886141834743, "grad_norm": 1.8305747509002686, "learning_rate": 1.3624490291984582e-05, "loss": 0.6453, "step": 10075 }, { "epoch": 3.279115159401431, "grad_norm": 1.9638245105743408, "learning_rate": 1.3601514556911596e-05, "loss": 0.6833, "step": 10080 }, { "epoch": 3.2807417046193885, "grad_norm": 1.8853123188018799, "learning_rate": 1.3578550969309459e-05, "loss": 0.6502, "step": 10085 }, { "epoch": 3.2823682498373454, "grad_norm": 2.171973466873169, "learning_rate": 1.3555599553650658e-05, "loss": 0.6604, "step": 10090 }, { "epoch": 3.2839947950553023, "grad_norm": 2.070957899093628, "learning_rate": 1.3532660334394742e-05, "loss": 0.6766, "step": 10095 }, { "epoch": 3.2856213402732597, "grad_norm": 1.9634778499603271, "learning_rate": 1.3509733335988245e-05, "loss": 0.6532, "step": 10100 }, { "epoch": 3.2872478854912166, "grad_norm": 2.1095433235168457, "learning_rate": 1.3486818582864678e-05, "loss": 0.6566, "step": 10105 }, { "epoch": 3.288874430709174, "grad_norm": 2.1379847526550293, "learning_rate": 1.3463916099444518e-05, "loss": 0.6971, "step": 10110 }, { "epoch": 3.290500975927131, "grad_norm": 2.246108293533325, "learning_rate": 1.344102591013513e-05, "loss": 0.6653, "step": 10115 }, { "epoch": 3.2921275211450878, "grad_norm": 2.186302661895752, "learning_rate": 1.3418148039330822e-05, "loss": 0.6718, "step": 10120 }, { "epoch": 3.2937540663630447, "grad_norm": 2.1060941219329834, "learning_rate": 1.339528251141273e-05, "loss": 0.6681, "step": 10125 }, { "epoch": 3.295380611581002, "grad_norm": 2.1231400966644287, "learning_rate": 1.3372429350748866e-05, "loss": 0.6763, "step": 10130 }, { "epoch": 3.297007156798959, "grad_norm": 2.142677068710327, "learning_rate": 1.3349588581694058e-05, "loss": 0.6504, "step": 10135 }, { "epoch": 3.2986337020169163, "grad_norm": 2.07317852973938, "learning_rate": 1.332676022858993e-05, "loss": 0.645, "step": 10140 }, { "epoch": 3.300260247234873, "grad_norm": 2.129087448120117, "learning_rate": 1.3303944315764848e-05, "loss": 0.6666, "step": 10145 }, { "epoch": 3.30188679245283, "grad_norm": 1.9616256952285767, "learning_rate": 1.3281140867533962e-05, "loss": 0.6404, "step": 10150 }, { "epoch": 3.303513337670787, "grad_norm": 1.7806838750839233, "learning_rate": 1.3258349908199098e-05, "loss": 0.6373, "step": 10155 }, { "epoch": 3.3051398828887444, "grad_norm": 1.8721908330917358, "learning_rate": 1.3235571462048795e-05, "loss": 0.653, "step": 10160 }, { "epoch": 3.3067664281067013, "grad_norm": 1.8101102113723755, "learning_rate": 1.321280555335826e-05, "loss": 0.6488, "step": 10165 }, { "epoch": 3.3083929733246586, "grad_norm": 1.9928457736968994, "learning_rate": 1.3190052206389337e-05, "loss": 0.631, "step": 10170 }, { "epoch": 3.3100195185426156, "grad_norm": 1.8210963010787964, "learning_rate": 1.3167311445390456e-05, "loss": 0.6227, "step": 10175 }, { "epoch": 3.3116460637605725, "grad_norm": 1.9028152227401733, "learning_rate": 1.314458329459668e-05, "loss": 0.6644, "step": 10180 }, { "epoch": 3.3132726089785294, "grad_norm": 2.116238832473755, "learning_rate": 1.3121867778229588e-05, "loss": 0.653, "step": 10185 }, { "epoch": 3.3148991541964867, "grad_norm": 1.8992266654968262, "learning_rate": 1.309916492049732e-05, "loss": 0.6675, "step": 10190 }, { "epoch": 3.3165256994144436, "grad_norm": 1.8556034564971924, "learning_rate": 1.3076474745594524e-05, "loss": 0.6604, "step": 10195 }, { "epoch": 3.318152244632401, "grad_norm": 1.9362324476242065, "learning_rate": 1.3053797277702339e-05, "loss": 0.6423, "step": 10200 }, { "epoch": 3.319778789850358, "grad_norm": 1.9838426113128662, "learning_rate": 1.3031132540988331e-05, "loss": 0.6687, "step": 10205 }, { "epoch": 3.321405335068315, "grad_norm": 2.070091485977173, "learning_rate": 1.3008480559606534e-05, "loss": 0.676, "step": 10210 }, { "epoch": 3.3230318802862717, "grad_norm": 1.8255711793899536, "learning_rate": 1.2985841357697359e-05, "loss": 0.6741, "step": 10215 }, { "epoch": 3.324658425504229, "grad_norm": 2.1347503662109375, "learning_rate": 1.2963214959387632e-05, "loss": 0.681, "step": 10220 }, { "epoch": 3.326284970722186, "grad_norm": 1.940697193145752, "learning_rate": 1.2940601388790475e-05, "loss": 0.6698, "step": 10225 }, { "epoch": 3.3279115159401433, "grad_norm": 2.2285683155059814, "learning_rate": 1.2918000670005429e-05, "loss": 0.6906, "step": 10230 }, { "epoch": 3.3295380611581002, "grad_norm": 1.9986547231674194, "learning_rate": 1.2895412827118252e-05, "loss": 0.6535, "step": 10235 }, { "epoch": 3.331164606376057, "grad_norm": 1.889677882194519, "learning_rate": 1.2872837884201028e-05, "loss": 0.6673, "step": 10240 }, { "epoch": 3.332791151594014, "grad_norm": 1.8856425285339355, "learning_rate": 1.2850275865312089e-05, "loss": 0.6598, "step": 10245 }, { "epoch": 3.3344176968119714, "grad_norm": 2.092761278152466, "learning_rate": 1.282772679449597e-05, "loss": 0.6555, "step": 10250 }, { "epoch": 3.3360442420299283, "grad_norm": 1.8956481218338013, "learning_rate": 1.2805190695783442e-05, "loss": 0.6418, "step": 10255 }, { "epoch": 3.3376707872478857, "grad_norm": 1.9665729999542236, "learning_rate": 1.2782667593191403e-05, "loss": 0.6459, "step": 10260 }, { "epoch": 3.3392973324658426, "grad_norm": 1.6512507200241089, "learning_rate": 1.276015751072297e-05, "loss": 0.6267, "step": 10265 }, { "epoch": 3.3409238776837995, "grad_norm": 2.1978135108947754, "learning_rate": 1.2737660472367314e-05, "loss": 0.6704, "step": 10270 }, { "epoch": 3.342550422901757, "grad_norm": 1.9107602834701538, "learning_rate": 1.2715176502099755e-05, "loss": 0.6351, "step": 10275 }, { "epoch": 3.3441769681197138, "grad_norm": 2.22953462600708, "learning_rate": 1.2692705623881651e-05, "loss": 0.6736, "step": 10280 }, { "epoch": 3.3458035133376707, "grad_norm": 1.9832690954208374, "learning_rate": 1.2670247861660438e-05, "loss": 0.7022, "step": 10285 }, { "epoch": 3.347430058555628, "grad_norm": 1.9249918460845947, "learning_rate": 1.264780323936954e-05, "loss": 0.6531, "step": 10290 }, { "epoch": 3.349056603773585, "grad_norm": 1.793448567390442, "learning_rate": 1.2625371780928428e-05, "loss": 0.663, "step": 10295 }, { "epoch": 3.350683148991542, "grad_norm": 1.7719804048538208, "learning_rate": 1.2602953510242487e-05, "loss": 0.6372, "step": 10300 }, { "epoch": 3.352309694209499, "grad_norm": 1.9875365495681763, "learning_rate": 1.2580548451203095e-05, "loss": 0.6202, "step": 10305 }, { "epoch": 3.353936239427456, "grad_norm": 1.892668604850769, "learning_rate": 1.2558156627687507e-05, "loss": 0.6498, "step": 10310 }, { "epoch": 3.355562784645413, "grad_norm": 1.960500955581665, "learning_rate": 1.2535778063558917e-05, "loss": 0.6427, "step": 10315 }, { "epoch": 3.3571893298633704, "grad_norm": 2.0554163455963135, "learning_rate": 1.251341278266635e-05, "loss": 0.6934, "step": 10320 }, { "epoch": 3.3588158750813273, "grad_norm": 1.8425451517105103, "learning_rate": 1.2491060808844696e-05, "loss": 0.6604, "step": 10325 }, { "epoch": 3.360442420299284, "grad_norm": 2.2402796745300293, "learning_rate": 1.2468722165914662e-05, "loss": 0.666, "step": 10330 }, { "epoch": 3.3620689655172415, "grad_norm": 2.013751745223999, "learning_rate": 1.2446396877682756e-05, "loss": 0.6825, "step": 10335 }, { "epoch": 3.3636955107351985, "grad_norm": 2.137054204940796, "learning_rate": 1.2424084967941222e-05, "loss": 0.6573, "step": 10340 }, { "epoch": 3.3653220559531554, "grad_norm": 1.8506207466125488, "learning_rate": 1.2401786460468087e-05, "loss": 0.6427, "step": 10345 }, { "epoch": 3.3669486011711127, "grad_norm": 2.129305124282837, "learning_rate": 1.2379501379027059e-05, "loss": 0.6433, "step": 10350 }, { "epoch": 3.3685751463890696, "grad_norm": 2.1928210258483887, "learning_rate": 1.235722974736756e-05, "loss": 0.6897, "step": 10355 }, { "epoch": 3.3702016916070265, "grad_norm": 1.9839317798614502, "learning_rate": 1.2334971589224675e-05, "loss": 0.6797, "step": 10360 }, { "epoch": 3.371828236824984, "grad_norm": 2.428084135055542, "learning_rate": 1.2312726928319138e-05, "loss": 0.6598, "step": 10365 }, { "epoch": 3.373454782042941, "grad_norm": 2.3940176963806152, "learning_rate": 1.2290495788357267e-05, "loss": 0.6366, "step": 10370 }, { "epoch": 3.3750813272608977, "grad_norm": 2.1621506214141846, "learning_rate": 1.2268278193031008e-05, "loss": 0.6703, "step": 10375 }, { "epoch": 3.376707872478855, "grad_norm": 1.8342362642288208, "learning_rate": 1.224607416601786e-05, "loss": 0.6521, "step": 10380 }, { "epoch": 3.378334417696812, "grad_norm": 1.9424413442611694, "learning_rate": 1.2223883730980843e-05, "loss": 0.6687, "step": 10385 }, { "epoch": 3.379960962914769, "grad_norm": 2.1477859020233154, "learning_rate": 1.2201706911568515e-05, "loss": 0.6543, "step": 10390 }, { "epoch": 3.3815875081327262, "grad_norm": 1.9607585668563843, "learning_rate": 1.2179543731414919e-05, "loss": 0.6492, "step": 10395 }, { "epoch": 3.383214053350683, "grad_norm": 2.1383254528045654, "learning_rate": 1.215739421413957e-05, "loss": 0.6451, "step": 10400 }, { "epoch": 3.38484059856864, "grad_norm": 2.1169769763946533, "learning_rate": 1.2135258383347392e-05, "loss": 0.646, "step": 10405 }, { "epoch": 3.3864671437865974, "grad_norm": 1.802030086517334, "learning_rate": 1.211313626262876e-05, "loss": 0.6289, "step": 10410 }, { "epoch": 3.3880936890045543, "grad_norm": 2.1696512699127197, "learning_rate": 1.2091027875559408e-05, "loss": 0.6714, "step": 10415 }, { "epoch": 3.3897202342225112, "grad_norm": 1.7991942167282104, "learning_rate": 1.2068933245700454e-05, "loss": 0.6352, "step": 10420 }, { "epoch": 3.3913467794404686, "grad_norm": 2.010716199874878, "learning_rate": 1.204685239659835e-05, "loss": 0.6629, "step": 10425 }, { "epoch": 3.3929733246584255, "grad_norm": 1.9059813022613525, "learning_rate": 1.2024785351784868e-05, "loss": 0.6728, "step": 10430 }, { "epoch": 3.3945998698763824, "grad_norm": 1.8924747705459595, "learning_rate": 1.2002732134777039e-05, "loss": 0.6636, "step": 10435 }, { "epoch": 3.3962264150943398, "grad_norm": 1.7315809726715088, "learning_rate": 1.1980692769077207e-05, "loss": 0.6646, "step": 10440 }, { "epoch": 3.3978529603122967, "grad_norm": 1.937632441520691, "learning_rate": 1.1958667278172897e-05, "loss": 0.6582, "step": 10445 }, { "epoch": 3.3994795055302536, "grad_norm": 1.9362993240356445, "learning_rate": 1.1936655685536896e-05, "loss": 0.692, "step": 10450 }, { "epoch": 3.401106050748211, "grad_norm": 2.0336852073669434, "learning_rate": 1.1914658014627156e-05, "loss": 0.6641, "step": 10455 }, { "epoch": 3.402732595966168, "grad_norm": 2.393552303314209, "learning_rate": 1.1892674288886807e-05, "loss": 0.6588, "step": 10460 }, { "epoch": 3.4043591411841247, "grad_norm": 1.6933391094207764, "learning_rate": 1.1870704531744093e-05, "loss": 0.6424, "step": 10465 }, { "epoch": 3.405985686402082, "grad_norm": 2.002310037612915, "learning_rate": 1.18487487666124e-05, "loss": 0.6804, "step": 10470 }, { "epoch": 3.407612231620039, "grad_norm": 2.073310375213623, "learning_rate": 1.182680701689017e-05, "loss": 0.6614, "step": 10475 }, { "epoch": 3.409238776837996, "grad_norm": 1.9920084476470947, "learning_rate": 1.1804879305960942e-05, "loss": 0.647, "step": 10480 }, { "epoch": 3.4108653220559533, "grad_norm": 2.400181531906128, "learning_rate": 1.1782965657193277e-05, "loss": 0.644, "step": 10485 }, { "epoch": 3.41249186727391, "grad_norm": 2.0325639247894287, "learning_rate": 1.1761066093940758e-05, "loss": 0.6813, "step": 10490 }, { "epoch": 3.414118412491867, "grad_norm": 2.365372657775879, "learning_rate": 1.1739180639541938e-05, "loss": 0.6757, "step": 10495 }, { "epoch": 3.4157449577098244, "grad_norm": 1.978880763053894, "learning_rate": 1.1717309317320365e-05, "loss": 0.6654, "step": 10500 }, { "epoch": 3.4173715029277814, "grad_norm": 1.876266360282898, "learning_rate": 1.1695452150584484e-05, "loss": 0.6799, "step": 10505 }, { "epoch": 3.4189980481457383, "grad_norm": 2.210228204727173, "learning_rate": 1.1673609162627697e-05, "loss": 0.666, "step": 10510 }, { "epoch": 3.4206245933636956, "grad_norm": 2.094750165939331, "learning_rate": 1.1651780376728269e-05, "loss": 0.6802, "step": 10515 }, { "epoch": 3.4222511385816525, "grad_norm": 2.0837318897247314, "learning_rate": 1.1629965816149343e-05, "loss": 0.6626, "step": 10520 }, { "epoch": 3.4238776837996094, "grad_norm": 1.8837127685546875, "learning_rate": 1.1608165504138904e-05, "loss": 0.6722, "step": 10525 }, { "epoch": 3.425504229017567, "grad_norm": 1.968827486038208, "learning_rate": 1.1586379463929727e-05, "loss": 0.6562, "step": 10530 }, { "epoch": 3.4271307742355237, "grad_norm": 2.7125394344329834, "learning_rate": 1.1564607718739418e-05, "loss": 0.6736, "step": 10535 }, { "epoch": 3.4287573194534806, "grad_norm": 1.7573403120040894, "learning_rate": 1.1542850291770301e-05, "loss": 0.6493, "step": 10540 }, { "epoch": 3.430383864671438, "grad_norm": 1.8931772708892822, "learning_rate": 1.1521107206209478e-05, "loss": 0.6374, "step": 10545 }, { "epoch": 3.432010409889395, "grad_norm": 2.2354066371917725, "learning_rate": 1.1499378485228757e-05, "loss": 0.6826, "step": 10550 }, { "epoch": 3.4336369551073522, "grad_norm": 2.4254794120788574, "learning_rate": 1.1477664151984646e-05, "loss": 0.6592, "step": 10555 }, { "epoch": 3.435263500325309, "grad_norm": 2.335663318634033, "learning_rate": 1.1455964229618287e-05, "loss": 0.6697, "step": 10560 }, { "epoch": 3.436890045543266, "grad_norm": 1.8259519338607788, "learning_rate": 1.1434278741255508e-05, "loss": 0.64, "step": 10565 }, { "epoch": 3.438516590761223, "grad_norm": 1.8743382692337036, "learning_rate": 1.1412607710006717e-05, "loss": 0.6743, "step": 10570 }, { "epoch": 3.4401431359791803, "grad_norm": 2.0380537509918213, "learning_rate": 1.1390951158966936e-05, "loss": 0.6458, "step": 10575 }, { "epoch": 3.441769681197137, "grad_norm": 2.0234997272491455, "learning_rate": 1.1369309111215756e-05, "loss": 0.6314, "step": 10580 }, { "epoch": 3.4433962264150946, "grad_norm": 1.9040913581848145, "learning_rate": 1.1347681589817311e-05, "loss": 0.6265, "step": 10585 }, { "epoch": 3.4450227716330515, "grad_norm": 2.380798578262329, "learning_rate": 1.1326068617820235e-05, "loss": 0.6506, "step": 10590 }, { "epoch": 3.4466493168510084, "grad_norm": 1.9014941453933716, "learning_rate": 1.1304470218257684e-05, "loss": 0.6692, "step": 10595 }, { "epoch": 3.4482758620689653, "grad_norm": 2.294818878173828, "learning_rate": 1.1282886414147261e-05, "loss": 0.6765, "step": 10600 }, { "epoch": 3.4499024072869227, "grad_norm": 2.3812739849090576, "learning_rate": 1.1261317228491035e-05, "loss": 0.6485, "step": 10605 }, { "epoch": 3.4515289525048796, "grad_norm": 2.180023431777954, "learning_rate": 1.1239762684275465e-05, "loss": 0.6834, "step": 10610 }, { "epoch": 3.453155497722837, "grad_norm": 2.2398617267608643, "learning_rate": 1.121822280447146e-05, "loss": 0.6561, "step": 10615 }, { "epoch": 3.454782042940794, "grad_norm": 1.9909019470214844, "learning_rate": 1.119669761203424e-05, "loss": 0.6424, "step": 10620 }, { "epoch": 3.4564085881587507, "grad_norm": 1.8090051412582397, "learning_rate": 1.1175187129903423e-05, "loss": 0.6614, "step": 10625 }, { "epoch": 3.4580351333767076, "grad_norm": 1.932723045349121, "learning_rate": 1.1153691381002904e-05, "loss": 0.6504, "step": 10630 }, { "epoch": 3.459661678594665, "grad_norm": 2.2373321056365967, "learning_rate": 1.1132210388240923e-05, "loss": 0.6272, "step": 10635 }, { "epoch": 3.461288223812622, "grad_norm": 1.8688981533050537, "learning_rate": 1.1110744174509952e-05, "loss": 0.6321, "step": 10640 }, { "epoch": 3.4629147690305793, "grad_norm": 2.219717502593994, "learning_rate": 1.1089292762686732e-05, "loss": 0.6321, "step": 10645 }, { "epoch": 3.464541314248536, "grad_norm": 2.1592214107513428, "learning_rate": 1.1067856175632257e-05, "loss": 0.6473, "step": 10650 }, { "epoch": 3.466167859466493, "grad_norm": 2.246952533721924, "learning_rate": 1.1046434436191669e-05, "loss": 0.6827, "step": 10655 }, { "epoch": 3.46779440468445, "grad_norm": 1.9593287706375122, "learning_rate": 1.1025027567194331e-05, "loss": 0.6698, "step": 10660 }, { "epoch": 3.4694209499024073, "grad_norm": 1.896320104598999, "learning_rate": 1.100363559145372e-05, "loss": 0.6467, "step": 10665 }, { "epoch": 3.4710474951203643, "grad_norm": 1.9459210634231567, "learning_rate": 1.0982258531767484e-05, "loss": 0.67, "step": 10670 }, { "epoch": 3.4726740403383216, "grad_norm": 2.0499391555786133, "learning_rate": 1.096089641091732e-05, "loss": 0.6427, "step": 10675 }, { "epoch": 3.4743005855562785, "grad_norm": 1.976870059967041, "learning_rate": 1.0939549251669079e-05, "loss": 0.6851, "step": 10680 }, { "epoch": 3.4759271307742354, "grad_norm": 2.0898680686950684, "learning_rate": 1.0918217076772591e-05, "loss": 0.6911, "step": 10685 }, { "epoch": 3.4775536759921923, "grad_norm": 2.5856997966766357, "learning_rate": 1.0896899908961775e-05, "loss": 0.6432, "step": 10690 }, { "epoch": 3.4791802212101497, "grad_norm": 2.0004806518554688, "learning_rate": 1.087559777095451e-05, "loss": 0.6553, "step": 10695 }, { "epoch": 3.4808067664281066, "grad_norm": 1.8624881505966187, "learning_rate": 1.0854310685452704e-05, "loss": 0.6234, "step": 10700 }, { "epoch": 3.482433311646064, "grad_norm": 3.014589548110962, "learning_rate": 1.0833038675142176e-05, "loss": 0.6503, "step": 10705 }, { "epoch": 3.484059856864021, "grad_norm": 2.341999053955078, "learning_rate": 1.0811781762692716e-05, "loss": 0.6504, "step": 10710 }, { "epoch": 3.4856864020819778, "grad_norm": 2.056342840194702, "learning_rate": 1.079053997075801e-05, "loss": 0.6369, "step": 10715 }, { "epoch": 3.487312947299935, "grad_norm": 2.085636854171753, "learning_rate": 1.076931332197564e-05, "loss": 0.6712, "step": 10720 }, { "epoch": 3.488939492517892, "grad_norm": 1.8923035860061646, "learning_rate": 1.0748101838967026e-05, "loss": 0.6449, "step": 10725 }, { "epoch": 3.490566037735849, "grad_norm": 1.9910145998001099, "learning_rate": 1.0726905544337454e-05, "loss": 0.6528, "step": 10730 }, { "epoch": 3.4921925829538063, "grad_norm": 2.0261192321777344, "learning_rate": 1.0705724460675994e-05, "loss": 0.6638, "step": 10735 }, { "epoch": 3.493819128171763, "grad_norm": 1.9802298545837402, "learning_rate": 1.0684558610555534e-05, "loss": 0.6565, "step": 10740 }, { "epoch": 3.49544567338972, "grad_norm": 2.180168390274048, "learning_rate": 1.0663408016532708e-05, "loss": 0.6994, "step": 10745 }, { "epoch": 3.4970722186076775, "grad_norm": 1.9455138444900513, "learning_rate": 1.0642272701147915e-05, "loss": 0.6534, "step": 10750 }, { "epoch": 3.4986987638256344, "grad_norm": 2.1108453273773193, "learning_rate": 1.0621152686925234e-05, "loss": 0.6493, "step": 10755 }, { "epoch": 3.5003253090435913, "grad_norm": 1.8773350715637207, "learning_rate": 1.0600047996372474e-05, "loss": 0.6524, "step": 10760 }, { "epoch": 3.5019518542615486, "grad_norm": 2.0078532695770264, "learning_rate": 1.0578958651981089e-05, "loss": 0.6526, "step": 10765 }, { "epoch": 3.5035783994795056, "grad_norm": 1.8449904918670654, "learning_rate": 1.0557884676226184e-05, "loss": 0.665, "step": 10770 }, { "epoch": 3.5052049446974625, "grad_norm": 2.0787408351898193, "learning_rate": 1.0536826091566498e-05, "loss": 0.6343, "step": 10775 }, { "epoch": 3.5068314899154194, "grad_norm": 2.1818220615386963, "learning_rate": 1.0515782920444354e-05, "loss": 0.6438, "step": 10780 }, { "epoch": 3.5084580351333767, "grad_norm": 1.9410792589187622, "learning_rate": 1.0494755185285666e-05, "loss": 0.6586, "step": 10785 }, { "epoch": 3.5100845803513336, "grad_norm": 1.8695874214172363, "learning_rate": 1.0473742908499863e-05, "loss": 0.6635, "step": 10790 }, { "epoch": 3.511711125569291, "grad_norm": 2.154265880584717, "learning_rate": 1.045274611247994e-05, "loss": 0.6409, "step": 10795 }, { "epoch": 3.513337670787248, "grad_norm": 2.1310086250305176, "learning_rate": 1.043176481960236e-05, "loss": 0.643, "step": 10800 }, { "epoch": 3.514964216005205, "grad_norm": 1.9660890102386475, "learning_rate": 1.0410799052227089e-05, "loss": 0.6604, "step": 10805 }, { "epoch": 3.516590761223162, "grad_norm": 2.0470850467681885, "learning_rate": 1.0389848832697536e-05, "loss": 0.6494, "step": 10810 }, { "epoch": 3.518217306441119, "grad_norm": 1.8898303508758545, "learning_rate": 1.0368914183340552e-05, "loss": 0.6563, "step": 10815 }, { "epoch": 3.519843851659076, "grad_norm": 1.815321445465088, "learning_rate": 1.034799512646637e-05, "loss": 0.6487, "step": 10820 }, { "epoch": 3.5214703968770333, "grad_norm": 2.03470778465271, "learning_rate": 1.0327091684368639e-05, "loss": 0.6853, "step": 10825 }, { "epoch": 3.5230969420949902, "grad_norm": 1.9658763408660889, "learning_rate": 1.030620387932433e-05, "loss": 0.6784, "step": 10830 }, { "epoch": 3.524723487312947, "grad_norm": 2.371723175048828, "learning_rate": 1.0285331733593778e-05, "loss": 0.6527, "step": 10835 }, { "epoch": 3.5263500325309045, "grad_norm": 2.1440389156341553, "learning_rate": 1.0264475269420621e-05, "loss": 0.6546, "step": 10840 }, { "epoch": 3.5279765777488614, "grad_norm": 2.305694103240967, "learning_rate": 1.0243634509031794e-05, "loss": 0.6663, "step": 10845 }, { "epoch": 3.5296031229668183, "grad_norm": 2.1012611389160156, "learning_rate": 1.0222809474637471e-05, "loss": 0.6327, "step": 10850 }, { "epoch": 3.5312296681847757, "grad_norm": 2.1110281944274902, "learning_rate": 1.0202000188431097e-05, "loss": 0.6849, "step": 10855 }, { "epoch": 3.5328562134027326, "grad_norm": 2.1224873065948486, "learning_rate": 1.0181206672589305e-05, "loss": 0.6643, "step": 10860 }, { "epoch": 3.5344827586206895, "grad_norm": 1.970705509185791, "learning_rate": 1.016458322947035e-05, "loss": 0.6545, "step": 10865 }, { "epoch": 3.536109303838647, "grad_norm": 2.3876793384552, "learning_rate": 1.0143818156116323e-05, "loss": 0.6431, "step": 10870 }, { "epoch": 3.5377358490566038, "grad_norm": 1.9727391004562378, "learning_rate": 1.0123068915132e-05, "loss": 0.6667, "step": 10875 }, { "epoch": 3.5393623942745607, "grad_norm": 2.115534543991089, "learning_rate": 1.0102335528630061e-05, "loss": 0.6587, "step": 10880 }, { "epoch": 3.540988939492518, "grad_norm": 1.8757723569869995, "learning_rate": 1.008161801870625e-05, "loss": 0.6372, "step": 10885 }, { "epoch": 3.542615484710475, "grad_norm": 2.2348427772521973, "learning_rate": 1.0060916407439413e-05, "loss": 0.6413, "step": 10890 }, { "epoch": 3.544242029928432, "grad_norm": 1.9503233432769775, "learning_rate": 1.0040230716891449e-05, "loss": 0.6424, "step": 10895 }, { "epoch": 3.545868575146389, "grad_norm": 2.2776994705200195, "learning_rate": 1.0019560969107302e-05, "loss": 0.6806, "step": 10900 }, { "epoch": 3.547495120364346, "grad_norm": 1.9576505422592163, "learning_rate": 9.99890718611489e-06, "loss": 0.6223, "step": 10905 }, { "epoch": 3.5491216655823035, "grad_norm": 2.091238498687744, "learning_rate": 9.978269389925157e-06, "loss": 0.6282, "step": 10910 }, { "epoch": 3.5507482108002604, "grad_norm": 2.4088802337646484, "learning_rate": 9.957647602531977e-06, "loss": 0.6532, "step": 10915 }, { "epoch": 3.5523747560182173, "grad_norm": 2.684156894683838, "learning_rate": 9.937041845912188e-06, "loss": 0.6525, "step": 10920 }, { "epoch": 3.554001301236174, "grad_norm": 2.216970205307007, "learning_rate": 9.91645214202553e-06, "loss": 0.6473, "step": 10925 }, { "epoch": 3.5556278464541315, "grad_norm": 1.7419428825378418, "learning_rate": 9.895878512814647e-06, "loss": 0.6489, "step": 10930 }, { "epoch": 3.5572543916720885, "grad_norm": 2.063995838165283, "learning_rate": 9.875320980205046e-06, "loss": 0.6622, "step": 10935 }, { "epoch": 3.558880936890046, "grad_norm": 1.8147531747817993, "learning_rate": 9.854779566105068e-06, "loss": 0.6332, "step": 10940 }, { "epoch": 3.5605074821080027, "grad_norm": 1.9879847764968872, "learning_rate": 9.834254292405901e-06, "loss": 0.6661, "step": 10945 }, { "epoch": 3.5621340273259596, "grad_norm": 1.8721420764923096, "learning_rate": 9.813745180981502e-06, "loss": 0.6182, "step": 10950 }, { "epoch": 3.5637605725439165, "grad_norm": 1.889029622077942, "learning_rate": 9.793252253688626e-06, "loss": 0.6687, "step": 10955 }, { "epoch": 3.565387117761874, "grad_norm": 2.983214855194092, "learning_rate": 9.772775532366774e-06, "loss": 0.6681, "step": 10960 }, { "epoch": 3.567013662979831, "grad_norm": 1.9701480865478516, "learning_rate": 9.75231503883819e-06, "loss": 0.6637, "step": 10965 }, { "epoch": 3.568640208197788, "grad_norm": 2.113018274307251, "learning_rate": 9.731870794907789e-06, "loss": 0.6616, "step": 10970 }, { "epoch": 3.570266753415745, "grad_norm": 1.971614956855774, "learning_rate": 9.711442822363209e-06, "loss": 0.6871, "step": 10975 }, { "epoch": 3.571893298633702, "grad_norm": 1.9272350072860718, "learning_rate": 9.691031142974707e-06, "loss": 0.6643, "step": 10980 }, { "epoch": 3.573519843851659, "grad_norm": 1.8993514776229858, "learning_rate": 9.670635778495213e-06, "loss": 0.6594, "step": 10985 }, { "epoch": 3.5751463890696162, "grad_norm": 1.8925209045410156, "learning_rate": 9.65025675066025e-06, "loss": 0.654, "step": 10990 }, { "epoch": 3.576772934287573, "grad_norm": 1.8996151685714722, "learning_rate": 9.629894081187943e-06, "loss": 0.6517, "step": 10995 }, { "epoch": 3.5783994795055305, "grad_norm": 2.024343252182007, "learning_rate": 9.609547791778964e-06, "loss": 0.6838, "step": 11000 }, { "epoch": 3.5800260247234874, "grad_norm": 1.8595373630523682, "learning_rate": 9.589217904116554e-06, "loss": 0.6708, "step": 11005 }, { "epoch": 3.5816525699414443, "grad_norm": 1.7943567037582397, "learning_rate": 9.568904439866444e-06, "loss": 0.6382, "step": 11010 }, { "epoch": 3.5832791151594012, "grad_norm": 2.03316593170166, "learning_rate": 9.548607420676902e-06, "loss": 0.6587, "step": 11015 }, { "epoch": 3.5849056603773586, "grad_norm": 1.9646462202072144, "learning_rate": 9.528326868178616e-06, "loss": 0.6355, "step": 11020 }, { "epoch": 3.5865322055953155, "grad_norm": 1.768291711807251, "learning_rate": 9.508062803984796e-06, "loss": 0.683, "step": 11025 }, { "epoch": 3.588158750813273, "grad_norm": 1.892801284790039, "learning_rate": 9.487815249691012e-06, "loss": 0.636, "step": 11030 }, { "epoch": 3.5897852960312298, "grad_norm": 1.9367650747299194, "learning_rate": 9.467584226875292e-06, "loss": 0.672, "step": 11035 }, { "epoch": 3.5914118412491867, "grad_norm": 2.049873113632202, "learning_rate": 9.447369757098002e-06, "loss": 0.6736, "step": 11040 }, { "epoch": 3.5930383864671436, "grad_norm": 2.523022413253784, "learning_rate": 9.427171861901903e-06, "loss": 0.6591, "step": 11045 }, { "epoch": 3.594664931685101, "grad_norm": 2.1238725185394287, "learning_rate": 9.406990562812068e-06, "loss": 0.6867, "step": 11050 }, { "epoch": 3.596291476903058, "grad_norm": 2.2790896892547607, "learning_rate": 9.386825881335889e-06, "loss": 0.6294, "step": 11055 }, { "epoch": 3.597918022121015, "grad_norm": 1.9417608976364136, "learning_rate": 9.366677838963078e-06, "loss": 0.6331, "step": 11060 }, { "epoch": 3.599544567338972, "grad_norm": 1.9915907382965088, "learning_rate": 9.34654645716556e-06, "loss": 0.6175, "step": 11065 }, { "epoch": 3.601171112556929, "grad_norm": 2.109654188156128, "learning_rate": 9.32643175739756e-06, "loss": 0.6211, "step": 11070 }, { "epoch": 3.602797657774886, "grad_norm": 4.143826961517334, "learning_rate": 9.306333761095476e-06, "loss": 0.6321, "step": 11075 }, { "epoch": 3.6044242029928433, "grad_norm": 1.9590835571289062, "learning_rate": 9.286252489677944e-06, "loss": 0.652, "step": 11080 }, { "epoch": 3.6060507482108, "grad_norm": 2.243879556655884, "learning_rate": 9.266187964545744e-06, "loss": 0.6638, "step": 11085 }, { "epoch": 3.6076772934287575, "grad_norm": 1.9153443574905396, "learning_rate": 9.246140207081833e-06, "loss": 0.6694, "step": 11090 }, { "epoch": 3.6093038386467144, "grad_norm": 1.8952282667160034, "learning_rate": 9.226109238651293e-06, "loss": 0.6449, "step": 11095 }, { "epoch": 3.6109303838646714, "grad_norm": 1.9768298864364624, "learning_rate": 9.206095080601319e-06, "loss": 0.6292, "step": 11100 }, { "epoch": 3.6125569290826283, "grad_norm": 2.424776077270508, "learning_rate": 9.18609775426116e-06, "loss": 0.6764, "step": 11105 }, { "epoch": 3.6141834743005856, "grad_norm": 1.9179407358169556, "learning_rate": 9.16611728094218e-06, "loss": 0.6874, "step": 11110 }, { "epoch": 3.6158100195185425, "grad_norm": 1.6501758098602295, "learning_rate": 9.146153681937725e-06, "loss": 0.6326, "step": 11115 }, { "epoch": 3.6174365647365, "grad_norm": 2.2321882247924805, "learning_rate": 9.126206978523202e-06, "loss": 0.6673, "step": 11120 }, { "epoch": 3.619063109954457, "grad_norm": 1.9093389511108398, "learning_rate": 9.106277191955992e-06, "loss": 0.6701, "step": 11125 }, { "epoch": 3.6206896551724137, "grad_norm": 2.682499885559082, "learning_rate": 9.086364343475461e-06, "loss": 0.6784, "step": 11130 }, { "epoch": 3.6223162003903706, "grad_norm": 1.8484448194503784, "learning_rate": 9.0664684543029e-06, "loss": 0.6666, "step": 11135 }, { "epoch": 3.623942745608328, "grad_norm": 1.914831280708313, "learning_rate": 9.04658954564156e-06, "loss": 0.6698, "step": 11140 }, { "epoch": 3.625569290826285, "grad_norm": 1.9206961393356323, "learning_rate": 9.026727638676554e-06, "loss": 0.6756, "step": 11145 }, { "epoch": 3.6271958360442422, "grad_norm": 1.8589897155761719, "learning_rate": 9.006882754574914e-06, "loss": 0.6601, "step": 11150 }, { "epoch": 3.628822381262199, "grad_norm": 2.0659871101379395, "learning_rate": 8.98705491448551e-06, "loss": 0.6258, "step": 11155 }, { "epoch": 3.630448926480156, "grad_norm": 2.1568715572357178, "learning_rate": 8.967244139539064e-06, "loss": 0.6161, "step": 11160 }, { "epoch": 3.632075471698113, "grad_norm": 2.5119800567626953, "learning_rate": 8.947450450848086e-06, "loss": 0.663, "step": 11165 }, { "epoch": 3.6337020169160703, "grad_norm": 2.0948991775512695, "learning_rate": 8.927673869506905e-06, "loss": 0.6529, "step": 11170 }, { "epoch": 3.635328562134027, "grad_norm": 1.8586777448654175, "learning_rate": 8.907914416591595e-06, "loss": 0.668, "step": 11175 }, { "epoch": 3.6369551073519846, "grad_norm": 2.1093571186065674, "learning_rate": 8.888172113159989e-06, "loss": 0.6499, "step": 11180 }, { "epoch": 3.6385816525699415, "grad_norm": 2.1958131790161133, "learning_rate": 8.868446980251647e-06, "loss": 0.6409, "step": 11185 }, { "epoch": 3.6402081977878984, "grad_norm": 2.053478479385376, "learning_rate": 8.848739038887822e-06, "loss": 0.668, "step": 11190 }, { "epoch": 3.6418347430058553, "grad_norm": 1.9138593673706055, "learning_rate": 8.829048310071456e-06, "loss": 0.6492, "step": 11195 }, { "epoch": 3.6434612882238127, "grad_norm": 2.214141607284546, "learning_rate": 8.809374814787124e-06, "loss": 0.6749, "step": 11200 }, { "epoch": 3.6450878334417696, "grad_norm": 2.0137126445770264, "learning_rate": 8.789718574001068e-06, "loss": 0.6634, "step": 11205 }, { "epoch": 3.646714378659727, "grad_norm": 2.1497995853424072, "learning_rate": 8.770079608661108e-06, "loss": 0.6469, "step": 11210 }, { "epoch": 3.648340923877684, "grad_norm": 1.8830994367599487, "learning_rate": 8.750457939696677e-06, "loss": 0.6608, "step": 11215 }, { "epoch": 3.6499674690956407, "grad_norm": 1.8222702741622925, "learning_rate": 8.730853588018772e-06, "loss": 0.626, "step": 11220 }, { "epoch": 3.6515940143135976, "grad_norm": 2.0502288341522217, "learning_rate": 8.711266574519935e-06, "loss": 0.6704, "step": 11225 }, { "epoch": 3.653220559531555, "grad_norm": 2.0919625759124756, "learning_rate": 8.691696920074214e-06, "loss": 0.625, "step": 11230 }, { "epoch": 3.654847104749512, "grad_norm": 2.2549009323120117, "learning_rate": 8.67214464553718e-06, "loss": 0.6254, "step": 11235 }, { "epoch": 3.6564736499674693, "grad_norm": 2.0247724056243896, "learning_rate": 8.652609771745862e-06, "loss": 0.6393, "step": 11240 }, { "epoch": 3.658100195185426, "grad_norm": 1.92485773563385, "learning_rate": 8.633092319518763e-06, "loss": 0.642, "step": 11245 }, { "epoch": 3.659726740403383, "grad_norm": 2.0102007389068604, "learning_rate": 8.613592309655804e-06, "loss": 0.6835, "step": 11250 }, { "epoch": 3.6613532856213404, "grad_norm": 2.189568519592285, "learning_rate": 8.594109762938343e-06, "loss": 0.6853, "step": 11255 }, { "epoch": 3.6629798308392973, "grad_norm": 1.7303372621536255, "learning_rate": 8.574644700129087e-06, "loss": 0.6129, "step": 11260 }, { "epoch": 3.6646063760572543, "grad_norm": 2.110283136367798, "learning_rate": 8.555197141972149e-06, "loss": 0.6555, "step": 11265 }, { "epoch": 3.6662329212752116, "grad_norm": 2.2367451190948486, "learning_rate": 8.535767109192955e-06, "loss": 0.6312, "step": 11270 }, { "epoch": 3.6678594664931685, "grad_norm": 2.0181496143341064, "learning_rate": 8.51635462249828e-06, "loss": 0.6359, "step": 11275 }, { "epoch": 3.6694860117111254, "grad_norm": 2.297574281692505, "learning_rate": 8.496959702576187e-06, "loss": 0.6564, "step": 11280 }, { "epoch": 3.671112556929083, "grad_norm": 2.1040380001068115, "learning_rate": 8.477582370096029e-06, "loss": 0.6876, "step": 11285 }, { "epoch": 3.6727391021470397, "grad_norm": 2.0794670581817627, "learning_rate": 8.458222645708394e-06, "loss": 0.6591, "step": 11290 }, { "epoch": 3.6743656473649966, "grad_norm": 2.1442930698394775, "learning_rate": 8.438880550045134e-06, "loss": 0.6566, "step": 11295 }, { "epoch": 3.675992192582954, "grad_norm": 2.2995944023132324, "learning_rate": 8.419556103719279e-06, "loss": 0.6501, "step": 11300 }, { "epoch": 3.677618737800911, "grad_norm": 1.8101574182510376, "learning_rate": 8.40024932732508e-06, "loss": 0.6404, "step": 11305 }, { "epoch": 3.6792452830188678, "grad_norm": 2.0527381896972656, "learning_rate": 8.380960241437947e-06, "loss": 0.6646, "step": 11310 }, { "epoch": 3.680871828236825, "grad_norm": 1.9029465913772583, "learning_rate": 8.361688866614442e-06, "loss": 0.6484, "step": 11315 }, { "epoch": 3.682498373454782, "grad_norm": 2.3560564517974854, "learning_rate": 8.342435223392232e-06, "loss": 0.6355, "step": 11320 }, { "epoch": 3.684124918672739, "grad_norm": 2.5555644035339355, "learning_rate": 8.323199332290108e-06, "loss": 0.6552, "step": 11325 }, { "epoch": 3.6857514638906963, "grad_norm": 1.7349854707717896, "learning_rate": 8.303981213807947e-06, "loss": 0.6521, "step": 11330 }, { "epoch": 3.687378009108653, "grad_norm": 2.144853353500366, "learning_rate": 8.284780888426657e-06, "loss": 0.6434, "step": 11335 }, { "epoch": 3.68900455432661, "grad_norm": 2.3442630767822266, "learning_rate": 8.265598376608211e-06, "loss": 0.6548, "step": 11340 }, { "epoch": 3.6906310995445675, "grad_norm": 2.33579683303833, "learning_rate": 8.246433698795586e-06, "loss": 0.623, "step": 11345 }, { "epoch": 3.6922576447625244, "grad_norm": 2.119555711746216, "learning_rate": 8.227286875412766e-06, "loss": 0.6491, "step": 11350 }, { "epoch": 3.6938841899804813, "grad_norm": 2.2910735607147217, "learning_rate": 8.208157926864677e-06, "loss": 0.6694, "step": 11355 }, { "epoch": 3.6955107351984386, "grad_norm": 1.9945921897888184, "learning_rate": 8.189046873537237e-06, "loss": 0.6241, "step": 11360 }, { "epoch": 3.6971372804163956, "grad_norm": 2.0588414669036865, "learning_rate": 8.169953735797251e-06, "loss": 0.6487, "step": 11365 }, { "epoch": 3.6987638256343525, "grad_norm": 2.145359754562378, "learning_rate": 8.150878533992458e-06, "loss": 0.649, "step": 11370 }, { "epoch": 3.70039037085231, "grad_norm": 1.9804723262786865, "learning_rate": 8.131821288451486e-06, "loss": 0.6523, "step": 11375 }, { "epoch": 3.7020169160702667, "grad_norm": 2.157130002975464, "learning_rate": 8.112782019483813e-06, "loss": 0.6437, "step": 11380 }, { "epoch": 3.703643461288224, "grad_norm": 1.9447418451309204, "learning_rate": 8.093760747379756e-06, "loss": 0.7042, "step": 11385 }, { "epoch": 3.705270006506181, "grad_norm": 2.0443172454833984, "learning_rate": 8.074757492410471e-06, "loss": 0.6875, "step": 11390 }, { "epoch": 3.706896551724138, "grad_norm": 2.231553554534912, "learning_rate": 8.055772274827885e-06, "loss": 0.6733, "step": 11395 }, { "epoch": 3.708523096942095, "grad_norm": 2.4282796382904053, "learning_rate": 8.036805114864736e-06, "loss": 0.6496, "step": 11400 }, { "epoch": 3.710149642160052, "grad_norm": 2.104163646697998, "learning_rate": 8.017856032734484e-06, "loss": 0.6515, "step": 11405 }, { "epoch": 3.711776187378009, "grad_norm": 1.9769706726074219, "learning_rate": 7.998925048631362e-06, "loss": 0.6343, "step": 11410 }, { "epoch": 3.7134027325959664, "grad_norm": 1.9851341247558594, "learning_rate": 7.980012182730273e-06, "loss": 0.6458, "step": 11415 }, { "epoch": 3.7150292778139233, "grad_norm": 2.1585636138916016, "learning_rate": 7.96111745518685e-06, "loss": 0.6622, "step": 11420 }, { "epoch": 3.7166558230318802, "grad_norm": 1.7822608947753906, "learning_rate": 7.942240886137364e-06, "loss": 0.6674, "step": 11425 }, { "epoch": 3.718282368249837, "grad_norm": 1.9461740255355835, "learning_rate": 7.923382495698758e-06, "loss": 0.6429, "step": 11430 }, { "epoch": 3.7199089134677945, "grad_norm": 2.038227081298828, "learning_rate": 7.904542303968585e-06, "loss": 0.6472, "step": 11435 }, { "epoch": 3.7215354586857514, "grad_norm": 1.929215669631958, "learning_rate": 7.88572033102501e-06, "loss": 0.6791, "step": 11440 }, { "epoch": 3.7231620039037088, "grad_norm": 2.0168604850769043, "learning_rate": 7.866916596926784e-06, "loss": 0.6516, "step": 11445 }, { "epoch": 3.7247885491216657, "grad_norm": 2.7941908836364746, "learning_rate": 7.848131121713234e-06, "loss": 0.6328, "step": 11450 }, { "epoch": 3.7264150943396226, "grad_norm": 2.256675958633423, "learning_rate": 7.829363925404195e-06, "loss": 0.6523, "step": 11455 }, { "epoch": 3.7280416395575795, "grad_norm": 1.9694584608078003, "learning_rate": 7.810615028000045e-06, "loss": 0.6609, "step": 11460 }, { "epoch": 3.729668184775537, "grad_norm": 2.0209426879882812, "learning_rate": 7.791884449481668e-06, "loss": 0.6536, "step": 11465 }, { "epoch": 3.7312947299934938, "grad_norm": 2.078065872192383, "learning_rate": 7.773172209810397e-06, "loss": 0.6988, "step": 11470 }, { "epoch": 3.732921275211451, "grad_norm": 2.086122989654541, "learning_rate": 7.754478328928047e-06, "loss": 0.6599, "step": 11475 }, { "epoch": 3.734547820429408, "grad_norm": 2.021228313446045, "learning_rate": 7.735802826756856e-06, "loss": 0.6182, "step": 11480 }, { "epoch": 3.736174365647365, "grad_norm": 2.1194801330566406, "learning_rate": 7.717145723199489e-06, "loss": 0.6752, "step": 11485 }, { "epoch": 3.737800910865322, "grad_norm": 2.1797292232513428, "learning_rate": 7.698507038138978e-06, "loss": 0.6815, "step": 11490 }, { "epoch": 3.739427456083279, "grad_norm": 2.020460844039917, "learning_rate": 7.679886791438754e-06, "loss": 0.6521, "step": 11495 }, { "epoch": 3.741054001301236, "grad_norm": 2.0722012519836426, "learning_rate": 7.661285002942572e-06, "loss": 0.6365, "step": 11500 }, { "epoch": 3.7426805465191935, "grad_norm": 1.9416598081588745, "learning_rate": 7.642701692474535e-06, "loss": 0.6474, "step": 11505 }, { "epoch": 3.7443070917371504, "grad_norm": 2.0658600330352783, "learning_rate": 7.624136879839053e-06, "loss": 0.6549, "step": 11510 }, { "epoch": 3.7459336369551073, "grad_norm": 2.0475194454193115, "learning_rate": 7.605590584820818e-06, "loss": 0.6587, "step": 11515 }, { "epoch": 3.747560182173064, "grad_norm": 1.8753339052200317, "learning_rate": 7.5870628271847765e-06, "loss": 0.67, "step": 11520 }, { "epoch": 3.7491867273910215, "grad_norm": 2.400254726409912, "learning_rate": 7.568553626676145e-06, "loss": 0.6502, "step": 11525 }, { "epoch": 3.7508132726089785, "grad_norm": 1.9283435344696045, "learning_rate": 7.550063003020333e-06, "loss": 0.6634, "step": 11530 }, { "epoch": 3.752439817826936, "grad_norm": 2.002530574798584, "learning_rate": 7.531590975922975e-06, "loss": 0.6555, "step": 11535 }, { "epoch": 3.7540663630448927, "grad_norm": 2.7513580322265625, "learning_rate": 7.5131375650698835e-06, "loss": 0.629, "step": 11540 }, { "epoch": 3.7556929082628496, "grad_norm": 1.9480247497558594, "learning_rate": 7.494702790127031e-06, "loss": 0.6704, "step": 11545 }, { "epoch": 3.7573194534808065, "grad_norm": 1.8581825494766235, "learning_rate": 7.476286670740518e-06, "loss": 0.6269, "step": 11550 }, { "epoch": 3.758945998698764, "grad_norm": 1.9145394563674927, "learning_rate": 7.4578892265365825e-06, "loss": 0.6637, "step": 11555 }, { "epoch": 3.760572543916721, "grad_norm": 1.9342228174209595, "learning_rate": 7.439510477121536e-06, "loss": 0.6881, "step": 11560 }, { "epoch": 3.762199089134678, "grad_norm": 2.0574638843536377, "learning_rate": 7.421150442081787e-06, "loss": 0.6495, "step": 11565 }, { "epoch": 3.763825634352635, "grad_norm": 1.9603220224380493, "learning_rate": 7.402809140983799e-06, "loss": 0.6534, "step": 11570 }, { "epoch": 3.765452179570592, "grad_norm": 2.2657594680786133, "learning_rate": 7.384486593374068e-06, "loss": 0.6883, "step": 11575 }, { "epoch": 3.767078724788549, "grad_norm": 2.123866558074951, "learning_rate": 7.366182818779088e-06, "loss": 0.6349, "step": 11580 }, { "epoch": 3.7687052700065062, "grad_norm": 2.215325117111206, "learning_rate": 7.347897836705378e-06, "loss": 0.6603, "step": 11585 }, { "epoch": 3.770331815224463, "grad_norm": 2.326101064682007, "learning_rate": 7.329631666639392e-06, "loss": 0.6544, "step": 11590 }, { "epoch": 3.7719583604424205, "grad_norm": 2.234374523162842, "learning_rate": 7.311384328047563e-06, "loss": 0.6667, "step": 11595 }, { "epoch": 3.7735849056603774, "grad_norm": 2.0708627700805664, "learning_rate": 7.2931558403762535e-06, "loss": 0.6392, "step": 11600 }, { "epoch": 3.7752114508783343, "grad_norm": 2.1877968311309814, "learning_rate": 7.274946223051723e-06, "loss": 0.6175, "step": 11605 }, { "epoch": 3.7768379960962912, "grad_norm": 1.891137957572937, "learning_rate": 7.256755495480141e-06, "loss": 0.6699, "step": 11610 }, { "epoch": 3.7784645413142486, "grad_norm": 1.8815498352050781, "learning_rate": 7.23858367704752e-06, "loss": 0.6501, "step": 11615 }, { "epoch": 3.7800910865322055, "grad_norm": 2.035649299621582, "learning_rate": 7.220430787119742e-06, "loss": 0.6496, "step": 11620 }, { "epoch": 3.781717631750163, "grad_norm": 2.20412278175354, "learning_rate": 7.2022968450425e-06, "loss": 0.6358, "step": 11625 }, { "epoch": 3.7833441769681198, "grad_norm": 2.2816989421844482, "learning_rate": 7.184181870141307e-06, "loss": 0.5931, "step": 11630 }, { "epoch": 3.7849707221860767, "grad_norm": 2.1493120193481445, "learning_rate": 7.16608588172146e-06, "loss": 0.6476, "step": 11635 }, { "epoch": 3.7865972674040336, "grad_norm": 2.1440515518188477, "learning_rate": 7.148008899068029e-06, "loss": 0.6531, "step": 11640 }, { "epoch": 3.788223812621991, "grad_norm": 2.1033313274383545, "learning_rate": 7.129950941445801e-06, "loss": 0.6207, "step": 11645 }, { "epoch": 3.789850357839948, "grad_norm": 2.0437424182891846, "learning_rate": 7.1119120280993295e-06, "loss": 0.6411, "step": 11650 }, { "epoch": 3.791476903057905, "grad_norm": 2.332766056060791, "learning_rate": 7.093892178252831e-06, "loss": 0.6495, "step": 11655 }, { "epoch": 3.793103448275862, "grad_norm": 2.138261556625366, "learning_rate": 7.0758914111102335e-06, "loss": 0.6373, "step": 11660 }, { "epoch": 3.794729993493819, "grad_norm": 2.130322217941284, "learning_rate": 7.057909745855121e-06, "loss": 0.6225, "step": 11665 }, { "epoch": 3.796356538711776, "grad_norm": 1.8375707864761353, "learning_rate": 7.039947201650726e-06, "loss": 0.6488, "step": 11670 }, { "epoch": 3.7979830839297333, "grad_norm": 2.5891764163970947, "learning_rate": 7.022003797639884e-06, "loss": 0.6388, "step": 11675 }, { "epoch": 3.79960962914769, "grad_norm": 1.9251084327697754, "learning_rate": 7.004079552945062e-06, "loss": 0.6282, "step": 11680 }, { "epoch": 3.8012361743656475, "grad_norm": 2.1112570762634277, "learning_rate": 6.98617448666827e-06, "loss": 0.6383, "step": 11685 }, { "epoch": 3.8028627195836044, "grad_norm": 2.921325922012329, "learning_rate": 6.968288617891116e-06, "loss": 0.6436, "step": 11690 }, { "epoch": 3.8044892648015614, "grad_norm": 2.328986883163452, "learning_rate": 6.950421965674736e-06, "loss": 0.655, "step": 11695 }, { "epoch": 3.8061158100195187, "grad_norm": 1.931818962097168, "learning_rate": 6.932574549059789e-06, "loss": 0.6615, "step": 11700 }, { "epoch": 3.8077423552374756, "grad_norm": 2.4075441360473633, "learning_rate": 6.91474638706642e-06, "loss": 0.6549, "step": 11705 }, { "epoch": 3.8093689004554325, "grad_norm": 2.0539324283599854, "learning_rate": 6.8969374986942784e-06, "loss": 0.6631, "step": 11710 }, { "epoch": 3.81099544567339, "grad_norm": 2.220444440841675, "learning_rate": 6.879147902922442e-06, "loss": 0.6443, "step": 11715 }, { "epoch": 3.812621990891347, "grad_norm": 2.3892598152160645, "learning_rate": 6.861377618709466e-06, "loss": 0.6351, "step": 11720 }, { "epoch": 3.8142485361093037, "grad_norm": 2.2880091667175293, "learning_rate": 6.843626664993286e-06, "loss": 0.6462, "step": 11725 }, { "epoch": 3.815875081327261, "grad_norm": 2.3915302753448486, "learning_rate": 6.825895060691273e-06, "loss": 0.6386, "step": 11730 }, { "epoch": 3.817501626545218, "grad_norm": 2.0178608894348145, "learning_rate": 6.808182824700163e-06, "loss": 0.6214, "step": 11735 }, { "epoch": 3.819128171763175, "grad_norm": 1.9248898029327393, "learning_rate": 6.790489975896033e-06, "loss": 0.6505, "step": 11740 }, { "epoch": 3.8207547169811322, "grad_norm": 1.9928648471832275, "learning_rate": 6.7728165331343355e-06, "loss": 0.6256, "step": 11745 }, { "epoch": 3.822381262199089, "grad_norm": 2.3208022117614746, "learning_rate": 6.755162515249799e-06, "loss": 0.6679, "step": 11750 }, { "epoch": 3.824007807417046, "grad_norm": 2.1799709796905518, "learning_rate": 6.737527941056482e-06, "loss": 0.6561, "step": 11755 }, { "epoch": 3.8256343526350034, "grad_norm": 1.859642744064331, "learning_rate": 6.719912829347716e-06, "loss": 0.6632, "step": 11760 }, { "epoch": 3.8272608978529603, "grad_norm": 1.8891128301620483, "learning_rate": 6.702317198896094e-06, "loss": 0.6263, "step": 11765 }, { "epoch": 3.828887443070917, "grad_norm": 1.9287997484207153, "learning_rate": 6.6847410684534235e-06, "loss": 0.6463, "step": 11770 }, { "epoch": 3.8305139882888746, "grad_norm": 1.865962266921997, "learning_rate": 6.667184456750763e-06, "loss": 0.6368, "step": 11775 }, { "epoch": 3.8321405335068315, "grad_norm": 2.176776647567749, "learning_rate": 6.649647382498345e-06, "loss": 0.6621, "step": 11780 }, { "epoch": 3.8337670787247884, "grad_norm": 2.3224313259124756, "learning_rate": 6.632129864385603e-06, "loss": 0.6716, "step": 11785 }, { "epoch": 3.8353936239427457, "grad_norm": 2.210333824157715, "learning_rate": 6.6146319210810935e-06, "loss": 0.6798, "step": 11790 }, { "epoch": 3.8370201691607027, "grad_norm": 2.4533655643463135, "learning_rate": 6.59715357123257e-06, "loss": 0.6685, "step": 11795 }, { "epoch": 3.8386467143786596, "grad_norm": 2.1346421241760254, "learning_rate": 6.579694833466843e-06, "loss": 0.6522, "step": 11800 }, { "epoch": 3.840273259596617, "grad_norm": 1.7396992444992065, "learning_rate": 6.5622557263898685e-06, "loss": 0.6218, "step": 11805 }, { "epoch": 3.841899804814574, "grad_norm": 2.317690134048462, "learning_rate": 6.5448362685866485e-06, "loss": 0.6804, "step": 11810 }, { "epoch": 3.8435263500325307, "grad_norm": 2.0960299968719482, "learning_rate": 6.527436478621271e-06, "loss": 0.692, "step": 11815 }, { "epoch": 3.845152895250488, "grad_norm": 2.0712311267852783, "learning_rate": 6.510056375036841e-06, "loss": 0.6534, "step": 11820 }, { "epoch": 3.846779440468445, "grad_norm": 2.082179069519043, "learning_rate": 6.4926959763555035e-06, "loss": 0.6433, "step": 11825 }, { "epoch": 3.8484059856864024, "grad_norm": 2.23954439163208, "learning_rate": 6.47535530107839e-06, "loss": 0.653, "step": 11830 }, { "epoch": 3.8500325309043593, "grad_norm": 1.9695318937301636, "learning_rate": 6.458034367685628e-06, "loss": 0.6232, "step": 11835 }, { "epoch": 3.851659076122316, "grad_norm": 2.3786518573760986, "learning_rate": 6.440733194636281e-06, "loss": 0.6841, "step": 11840 }, { "epoch": 3.853285621340273, "grad_norm": 2.1393187046051025, "learning_rate": 6.423451800368382e-06, "loss": 0.6422, "step": 11845 }, { "epoch": 3.8549121665582304, "grad_norm": 1.9716554880142212, "learning_rate": 6.406190203298859e-06, "loss": 0.6578, "step": 11850 }, { "epoch": 3.8565387117761873, "grad_norm": 2.2387914657592773, "learning_rate": 6.388948421823563e-06, "loss": 0.6485, "step": 11855 }, { "epoch": 3.8581652569941447, "grad_norm": 2.0851616859436035, "learning_rate": 6.3717264743172134e-06, "loss": 0.6537, "step": 11860 }, { "epoch": 3.8597918022121016, "grad_norm": 2.284559965133667, "learning_rate": 6.3545243791334055e-06, "loss": 0.6397, "step": 11865 }, { "epoch": 3.8614183474300585, "grad_norm": 2.1620171070098877, "learning_rate": 6.337342154604573e-06, "loss": 0.6347, "step": 11870 }, { "epoch": 3.8630448926480154, "grad_norm": 2.347256898880005, "learning_rate": 6.320179819041958e-06, "loss": 0.6278, "step": 11875 }, { "epoch": 3.864671437865973, "grad_norm": 2.4992942810058594, "learning_rate": 6.303037390735634e-06, "loss": 0.6876, "step": 11880 }, { "epoch": 3.8662979830839297, "grad_norm": 2.0782930850982666, "learning_rate": 6.2859148879544305e-06, "loss": 0.6422, "step": 11885 }, { "epoch": 3.867924528301887, "grad_norm": 1.9557080268859863, "learning_rate": 6.268812328945961e-06, "loss": 0.6436, "step": 11890 }, { "epoch": 3.869551073519844, "grad_norm": 2.2186222076416016, "learning_rate": 6.2517297319365814e-06, "loss": 0.6487, "step": 11895 }, { "epoch": 3.871177618737801, "grad_norm": 2.5874240398406982, "learning_rate": 6.234667115131382e-06, "loss": 0.6418, "step": 11900 }, { "epoch": 3.8728041639557578, "grad_norm": 2.3835012912750244, "learning_rate": 6.217624496714134e-06, "loss": 0.696, "step": 11905 }, { "epoch": 3.874430709173715, "grad_norm": 2.054152488708496, "learning_rate": 6.200601894847324e-06, "loss": 0.681, "step": 11910 }, { "epoch": 3.876057254391672, "grad_norm": 1.9579323530197144, "learning_rate": 6.183599327672088e-06, "loss": 0.653, "step": 11915 }, { "epoch": 3.8776837996096294, "grad_norm": 2.3043148517608643, "learning_rate": 6.166616813308221e-06, "loss": 0.6709, "step": 11920 }, { "epoch": 3.8793103448275863, "grad_norm": 2.012111186981201, "learning_rate": 6.149654369854146e-06, "loss": 0.6519, "step": 11925 }, { "epoch": 3.880936890045543, "grad_norm": 1.9760338068008423, "learning_rate": 6.132712015386902e-06, "loss": 0.667, "step": 11930 }, { "epoch": 3.8825634352635, "grad_norm": 1.9617925882339478, "learning_rate": 6.115789767962099e-06, "loss": 0.6333, "step": 11935 }, { "epoch": 3.8841899804814575, "grad_norm": 1.8881852626800537, "learning_rate": 6.098887645613943e-06, "loss": 0.6315, "step": 11940 }, { "epoch": 3.8858165256994144, "grad_norm": 2.1504695415496826, "learning_rate": 6.08200566635517e-06, "loss": 0.6492, "step": 11945 }, { "epoch": 3.8874430709173717, "grad_norm": 2.1913535594940186, "learning_rate": 6.065143848177066e-06, "loss": 0.6612, "step": 11950 }, { "epoch": 3.8890696161353286, "grad_norm": 1.8416792154312134, "learning_rate": 6.048302209049428e-06, "loss": 0.6473, "step": 11955 }, { "epoch": 3.8906961613532856, "grad_norm": 2.969268798828125, "learning_rate": 6.03148076692055e-06, "loss": 0.6281, "step": 11960 }, { "epoch": 3.8923227065712425, "grad_norm": 2.2109107971191406, "learning_rate": 6.014679539717186e-06, "loss": 0.6618, "step": 11965 }, { "epoch": 3.8939492517892, "grad_norm": 4.5289082527160645, "learning_rate": 5.997898545344571e-06, "loss": 0.6841, "step": 11970 }, { "epoch": 3.8955757970071567, "grad_norm": 1.9927226305007935, "learning_rate": 5.981137801686354e-06, "loss": 0.652, "step": 11975 }, { "epoch": 3.897202342225114, "grad_norm": 2.0749526023864746, "learning_rate": 5.9643973266046145e-06, "loss": 0.6597, "step": 11980 }, { "epoch": 3.898828887443071, "grad_norm": 2.044415235519409, "learning_rate": 5.947677137939839e-06, "loss": 0.673, "step": 11985 }, { "epoch": 3.900455432661028, "grad_norm": 1.8921427726745605, "learning_rate": 5.930977253510886e-06, "loss": 0.6698, "step": 11990 }, { "epoch": 3.902081977878985, "grad_norm": 1.9163424968719482, "learning_rate": 5.914297691114967e-06, "loss": 0.6368, "step": 11995 }, { "epoch": 3.903708523096942, "grad_norm": 2.6371891498565674, "learning_rate": 5.897638468527653e-06, "loss": 0.6321, "step": 12000 }, { "epoch": 3.905335068314899, "grad_norm": 2.0630850791931152, "learning_rate": 5.880999603502832e-06, "loss": 0.6687, "step": 12005 }, { "epoch": 3.9069616135328564, "grad_norm": 1.8964064121246338, "learning_rate": 5.864381113772685e-06, "loss": 0.6321, "step": 12010 }, { "epoch": 3.9085881587508133, "grad_norm": 1.7861818075180054, "learning_rate": 5.847783017047698e-06, "loss": 0.641, "step": 12015 }, { "epoch": 3.9102147039687702, "grad_norm": 1.9545518159866333, "learning_rate": 5.831205331016612e-06, "loss": 0.6499, "step": 12020 }, { "epoch": 3.911841249186727, "grad_norm": 2.174670934677124, "learning_rate": 5.814648073346429e-06, "loss": 0.6666, "step": 12025 }, { "epoch": 3.9134677944046845, "grad_norm": 2.0867092609405518, "learning_rate": 5.798111261682357e-06, "loss": 0.6656, "step": 12030 }, { "epoch": 3.9150943396226414, "grad_norm": 2.1856985092163086, "learning_rate": 5.781594913647842e-06, "loss": 0.6219, "step": 12035 }, { "epoch": 3.9167208848405988, "grad_norm": 2.310839891433716, "learning_rate": 5.765099046844491e-06, "loss": 0.6511, "step": 12040 }, { "epoch": 3.9183474300585557, "grad_norm": 2.0312695503234863, "learning_rate": 5.7486236788521135e-06, "loss": 0.6478, "step": 12045 }, { "epoch": 3.9199739752765126, "grad_norm": 1.90248441696167, "learning_rate": 5.7321688272286596e-06, "loss": 0.6157, "step": 12050 }, { "epoch": 3.9216005204944695, "grad_norm": 1.5778822898864746, "learning_rate": 5.715734509510218e-06, "loss": 0.6536, "step": 12055 }, { "epoch": 3.923227065712427, "grad_norm": 1.9187769889831543, "learning_rate": 5.699320743210984e-06, "loss": 0.6295, "step": 12060 }, { "epoch": 3.9248536109303838, "grad_norm": 2.114119052886963, "learning_rate": 5.682927545823272e-06, "loss": 0.6452, "step": 12065 }, { "epoch": 3.926480156148341, "grad_norm": 1.953281283378601, "learning_rate": 5.666554934817447e-06, "loss": 0.6088, "step": 12070 }, { "epoch": 3.928106701366298, "grad_norm": 1.7641047239303589, "learning_rate": 5.650202927641959e-06, "loss": 0.6072, "step": 12075 }, { "epoch": 3.929733246584255, "grad_norm": 2.1239614486694336, "learning_rate": 5.633871541723295e-06, "loss": 0.6562, "step": 12080 }, { "epoch": 3.931359791802212, "grad_norm": 1.825926423072815, "learning_rate": 5.61756079446597e-06, "loss": 0.6362, "step": 12085 }, { "epoch": 3.932986337020169, "grad_norm": 2.1758463382720947, "learning_rate": 5.601270703252481e-06, "loss": 0.6146, "step": 12090 }, { "epoch": 3.934612882238126, "grad_norm": 3.43782377243042, "learning_rate": 5.585001285443345e-06, "loss": 0.6418, "step": 12095 }, { "epoch": 3.9362394274560835, "grad_norm": 2.19016170501709, "learning_rate": 5.5687525583770135e-06, "loss": 0.6554, "step": 12100 }, { "epoch": 3.9378659726740404, "grad_norm": 2.0171186923980713, "learning_rate": 5.55252453936991e-06, "loss": 0.6284, "step": 12105 }, { "epoch": 3.9394925178919973, "grad_norm": 2.5034661293029785, "learning_rate": 5.536317245716391e-06, "loss": 0.6363, "step": 12110 }, { "epoch": 3.941119063109954, "grad_norm": 2.0403332710266113, "learning_rate": 5.520130694688716e-06, "loss": 0.6521, "step": 12115 }, { "epoch": 3.9427456083279115, "grad_norm": 2.5809988975524902, "learning_rate": 5.503964903537037e-06, "loss": 0.671, "step": 12120 }, { "epoch": 3.9443721535458685, "grad_norm": 2.1316304206848145, "learning_rate": 5.487819889489393e-06, "loss": 0.673, "step": 12125 }, { "epoch": 3.945998698763826, "grad_norm": 2.0063838958740234, "learning_rate": 5.471695669751664e-06, "loss": 0.6632, "step": 12130 }, { "epoch": 3.9476252439817827, "grad_norm": 2.0849430561065674, "learning_rate": 5.455592261507589e-06, "loss": 0.6336, "step": 12135 }, { "epoch": 3.9492517891997396, "grad_norm": 1.7159404754638672, "learning_rate": 5.439509681918703e-06, "loss": 0.6588, "step": 12140 }, { "epoch": 3.9508783344176965, "grad_norm": 2.0891189575195312, "learning_rate": 5.42665862639774e-06, "loss": 0.6209, "step": 12145 }, { "epoch": 3.952504879635654, "grad_norm": 1.9608136415481567, "learning_rate": 5.410613581564464e-06, "loss": 0.631, "step": 12150 }, { "epoch": 3.954131424853611, "grad_norm": 2.2602360248565674, "learning_rate": 5.394589413320589e-06, "loss": 0.6455, "step": 12155 }, { "epoch": 3.955757970071568, "grad_norm": 2.0772197246551514, "learning_rate": 5.378586138743203e-06, "loss": 0.6371, "step": 12160 }, { "epoch": 3.957384515289525, "grad_norm": 2.095057725906372, "learning_rate": 5.3626037748871565e-06, "loss": 0.6596, "step": 12165 }, { "epoch": 3.959011060507482, "grad_norm": 2.1261723041534424, "learning_rate": 5.346642338784985e-06, "loss": 0.6655, "step": 12170 }, { "epoch": 3.9606376057254393, "grad_norm": 2.161123037338257, "learning_rate": 5.330701847446962e-06, "loss": 0.6528, "step": 12175 }, { "epoch": 3.9622641509433962, "grad_norm": 2.0051002502441406, "learning_rate": 5.314782317860998e-06, "loss": 0.6183, "step": 12180 }, { "epoch": 3.963890696161353, "grad_norm": 3.760082721710205, "learning_rate": 5.29888376699269e-06, "loss": 0.6418, "step": 12185 }, { "epoch": 3.9655172413793105, "grad_norm": 2.0798699855804443, "learning_rate": 5.2830062117852654e-06, "loss": 0.6404, "step": 12190 }, { "epoch": 3.9671437865972674, "grad_norm": 1.942769169807434, "learning_rate": 5.267149669159588e-06, "loss": 0.6642, "step": 12195 }, { "epoch": 3.9687703318152243, "grad_norm": 3.208273410797119, "learning_rate": 5.251314156014109e-06, "loss": 0.6344, "step": 12200 }, { "epoch": 3.9703968770331817, "grad_norm": 1.9215033054351807, "learning_rate": 5.235499689224885e-06, "loss": 0.644, "step": 12205 }, { "epoch": 3.9720234222511386, "grad_norm": 2.114232301712036, "learning_rate": 5.219706285645545e-06, "loss": 0.6734, "step": 12210 }, { "epoch": 3.9736499674690955, "grad_norm": 1.9279701709747314, "learning_rate": 5.203933962107266e-06, "loss": 0.6735, "step": 12215 }, { "epoch": 3.975276512687053, "grad_norm": 1.9819085597991943, "learning_rate": 5.1881827354187454e-06, "loss": 0.6419, "step": 12220 }, { "epoch": 3.9769030579050098, "grad_norm": 1.8507076501846313, "learning_rate": 5.172452622366228e-06, "loss": 0.6434, "step": 12225 }, { "epoch": 3.9785296031229667, "grad_norm": 1.9687306880950928, "learning_rate": 5.156743639713421e-06, "loss": 0.6469, "step": 12230 }, { "epoch": 3.980156148340924, "grad_norm": 2.129936933517456, "learning_rate": 5.141055804201541e-06, "loss": 0.6424, "step": 12235 }, { "epoch": 3.981782693558881, "grad_norm": 2.0583765506744385, "learning_rate": 5.12538913254926e-06, "loss": 0.6132, "step": 12240 }, { "epoch": 3.983409238776838, "grad_norm": 2.0079078674316406, "learning_rate": 5.109743641452699e-06, "loss": 0.6286, "step": 12245 }, { "epoch": 3.985035783994795, "grad_norm": 2.323115348815918, "learning_rate": 5.094119347585391e-06, "loss": 0.6553, "step": 12250 }, { "epoch": 3.986662329212752, "grad_norm": 2.1607253551483154, "learning_rate": 5.078516267598299e-06, "loss": 0.6405, "step": 12255 }, { "epoch": 3.988288874430709, "grad_norm": 2.571053981781006, "learning_rate": 5.062934418119761e-06, "loss": 0.6373, "step": 12260 }, { "epoch": 3.9899154196486664, "grad_norm": 2.4664855003356934, "learning_rate": 5.047373815755496e-06, "loss": 0.6689, "step": 12265 }, { "epoch": 3.9915419648666233, "grad_norm": 2.033982038497925, "learning_rate": 5.03183447708859e-06, "loss": 0.644, "step": 12270 }, { "epoch": 3.9931685100845806, "grad_norm": 2.154475212097168, "learning_rate": 5.016316418679454e-06, "loss": 0.6635, "step": 12275 }, { "epoch": 3.9947950553025375, "grad_norm": 3.9255807399749756, "learning_rate": 5.000819657065833e-06, "loss": 0.661, "step": 12280 }, { "epoch": 3.9964216005204944, "grad_norm": 1.9987826347351074, "learning_rate": 4.985344208762757e-06, "loss": 0.642, "step": 12285 }, { "epoch": 3.9980481457384514, "grad_norm": 1.9555718898773193, "learning_rate": 4.9698900902625666e-06, "loss": 0.6492, "step": 12290 }, { "epoch": 3.9996746909564087, "grad_norm": 2.102719783782959, "learning_rate": 4.954457318034841e-06, "loss": 0.6777, "step": 12295 }, { "epoch": 4.0, "eval_f1": 0.8249050225524125, "eval_loss": 0.41357421875, "eval_precision": 0.825624581488519, "eval_recall": 0.8243496742948847, "eval_runtime": 1029.9022, "eval_samples_per_second": 382.012, "eval_steps_per_second": 0.747, "step": 12296 }, { "epoch": 4.001301236174366, "grad_norm": 1.9310343265533447, "learning_rate": 4.939045908526441e-06, "loss": 0.6276, "step": 12300 }, { "epoch": 4.002927781392323, "grad_norm": 1.874909520149231, "learning_rate": 4.92365587816144e-06, "loss": 0.6199, "step": 12305 }, { "epoch": 4.00455432661028, "grad_norm": 1.908257246017456, "learning_rate": 4.908287243341147e-06, "loss": 0.575, "step": 12310 }, { "epoch": 4.006180871828237, "grad_norm": 2.0421009063720703, "learning_rate": 4.892940020444043e-06, "loss": 0.6105, "step": 12315 }, { "epoch": 4.007807417046194, "grad_norm": 1.8622931241989136, "learning_rate": 4.877614225825816e-06, "loss": 0.62, "step": 12320 }, { "epoch": 4.009433962264151, "grad_norm": 2.134507179260254, "learning_rate": 4.862309875819299e-06, "loss": 0.5821, "step": 12325 }, { "epoch": 4.011060507482108, "grad_norm": 2.4682130813598633, "learning_rate": 4.8470269867344764e-06, "loss": 0.616, "step": 12330 }, { "epoch": 4.012687052700065, "grad_norm": 2.3341264724731445, "learning_rate": 4.831765574858471e-06, "loss": 0.6122, "step": 12335 }, { "epoch": 4.014313597918022, "grad_norm": 2.212038278579712, "learning_rate": 4.816525656455512e-06, "loss": 0.6072, "step": 12340 }, { "epoch": 4.015940143135979, "grad_norm": 2.357851505279541, "learning_rate": 4.801307247766912e-06, "loss": 0.5993, "step": 12345 }, { "epoch": 4.017566688353936, "grad_norm": 2.0386996269226074, "learning_rate": 4.786110365011079e-06, "loss": 0.6069, "step": 12350 }, { "epoch": 4.019193233571893, "grad_norm": 2.1316967010498047, "learning_rate": 4.770935024383458e-06, "loss": 0.6128, "step": 12355 }, { "epoch": 4.020819778789851, "grad_norm": 2.2749853134155273, "learning_rate": 4.755781242056556e-06, "loss": 0.6066, "step": 12360 }, { "epoch": 4.022446324007808, "grad_norm": 2.4930074214935303, "learning_rate": 4.740649034179898e-06, "loss": 0.6093, "step": 12365 }, { "epoch": 4.024072869225765, "grad_norm": 2.158831834793091, "learning_rate": 4.7255384168800235e-06, "loss": 0.6197, "step": 12370 }, { "epoch": 4.0256994144437215, "grad_norm": 2.402033567428589, "learning_rate": 4.7104494062604445e-06, "loss": 0.6083, "step": 12375 }, { "epoch": 4.027325959661678, "grad_norm": 3.0678963661193848, "learning_rate": 4.695382018401673e-06, "loss": 0.5798, "step": 12380 }, { "epoch": 4.028952504879635, "grad_norm": 2.1048078536987305, "learning_rate": 4.680336269361146e-06, "loss": 0.6079, "step": 12385 }, { "epoch": 4.030579050097593, "grad_norm": 2.014091730117798, "learning_rate": 4.665312175173267e-06, "loss": 0.6157, "step": 12390 }, { "epoch": 4.03220559531555, "grad_norm": 2.081606864929199, "learning_rate": 4.650309751849349e-06, "loss": 0.6043, "step": 12395 }, { "epoch": 4.033832140533507, "grad_norm": 2.3270719051361084, "learning_rate": 4.635329015377621e-06, "loss": 0.6135, "step": 12400 }, { "epoch": 4.035458685751464, "grad_norm": 2.3290863037109375, "learning_rate": 4.620369981723174e-06, "loss": 0.6371, "step": 12405 }, { "epoch": 4.037085230969421, "grad_norm": 2.0165207386016846, "learning_rate": 4.605432666828002e-06, "loss": 0.621, "step": 12410 }, { "epoch": 4.038711776187378, "grad_norm": 1.9335479736328125, "learning_rate": 4.590517086610938e-06, "loss": 0.588, "step": 12415 }, { "epoch": 4.040338321405335, "grad_norm": 1.9899652004241943, "learning_rate": 4.575623256967646e-06, "loss": 0.6159, "step": 12420 }, { "epoch": 4.041964866623292, "grad_norm": 2.0712404251098633, "learning_rate": 4.560751193770619e-06, "loss": 0.6164, "step": 12425 }, { "epoch": 4.043591411841249, "grad_norm": 1.8574680089950562, "learning_rate": 4.545900912869156e-06, "loss": 0.6229, "step": 12430 }, { "epoch": 4.045217957059206, "grad_norm": 1.8692642450332642, "learning_rate": 4.531072430089339e-06, "loss": 0.5659, "step": 12435 }, { "epoch": 4.046844502277163, "grad_norm": 2.001011371612549, "learning_rate": 4.516265761234012e-06, "loss": 0.6209, "step": 12440 }, { "epoch": 4.04847104749512, "grad_norm": 2.2855708599090576, "learning_rate": 4.501480922082787e-06, "loss": 0.6031, "step": 12445 }, { "epoch": 4.050097592713078, "grad_norm": 2.4015791416168213, "learning_rate": 4.486717928391993e-06, "loss": 0.6152, "step": 12450 }, { "epoch": 4.051724137931035, "grad_norm": 1.9134855270385742, "learning_rate": 4.471976795894692e-06, "loss": 0.5889, "step": 12455 }, { "epoch": 4.053350683148992, "grad_norm": 2.1408133506774902, "learning_rate": 4.457257540300647e-06, "loss": 0.6092, "step": 12460 }, { "epoch": 4.0549772283669485, "grad_norm": 2.091780185699463, "learning_rate": 4.442560177296307e-06, "loss": 0.6578, "step": 12465 }, { "epoch": 4.056603773584905, "grad_norm": 2.202208995819092, "learning_rate": 4.427884722544776e-06, "loss": 0.6078, "step": 12470 }, { "epoch": 4.058230318802862, "grad_norm": 1.9250588417053223, "learning_rate": 4.413231191685838e-06, "loss": 0.583, "step": 12475 }, { "epoch": 4.05985686402082, "grad_norm": 2.2674145698547363, "learning_rate": 4.398599600335876e-06, "loss": 0.6082, "step": 12480 }, { "epoch": 4.061483409238777, "grad_norm": 2.1977386474609375, "learning_rate": 4.383989964087923e-06, "loss": 0.5921, "step": 12485 }, { "epoch": 4.063109954456734, "grad_norm": 2.112382650375366, "learning_rate": 4.369402298511599e-06, "loss": 0.5699, "step": 12490 }, { "epoch": 4.064736499674691, "grad_norm": 2.3108229637145996, "learning_rate": 4.354836619153124e-06, "loss": 0.6073, "step": 12495 }, { "epoch": 4.066363044892648, "grad_norm": 2.145864725112915, "learning_rate": 4.3402929415352625e-06, "loss": 0.6028, "step": 12500 }, { "epoch": 4.067989590110605, "grad_norm": 2.464498996734619, "learning_rate": 4.325771281157356e-06, "loss": 0.633, "step": 12505 }, { "epoch": 4.0696161353285625, "grad_norm": 2.521463394165039, "learning_rate": 4.311271653495261e-06, "loss": 0.6024, "step": 12510 }, { "epoch": 4.071242680546519, "grad_norm": 2.3448150157928467, "learning_rate": 4.296794074001376e-06, "loss": 0.5965, "step": 12515 }, { "epoch": 4.072869225764476, "grad_norm": 2.0570671558380127, "learning_rate": 4.282338558104573e-06, "loss": 0.5832, "step": 12520 }, { "epoch": 4.074495770982433, "grad_norm": 2.1091291904449463, "learning_rate": 4.267905121210253e-06, "loss": 0.6082, "step": 12525 }, { "epoch": 4.07612231620039, "grad_norm": 2.690284252166748, "learning_rate": 4.2534937787002405e-06, "loss": 0.5833, "step": 12530 }, { "epoch": 4.077748861418347, "grad_norm": 2.233856439590454, "learning_rate": 4.239104545932854e-06, "loss": 0.6048, "step": 12535 }, { "epoch": 4.079375406636305, "grad_norm": 2.2243893146514893, "learning_rate": 4.224737438242815e-06, "loss": 0.5929, "step": 12540 }, { "epoch": 4.081001951854262, "grad_norm": 2.082343816757202, "learning_rate": 4.210392470941288e-06, "loss": 0.581, "step": 12545 }, { "epoch": 4.082628497072219, "grad_norm": 2.1762783527374268, "learning_rate": 4.196069659315846e-06, "loss": 0.5979, "step": 12550 }, { "epoch": 4.0842550422901756, "grad_norm": 2.3144097328186035, "learning_rate": 4.181769018630422e-06, "loss": 0.6143, "step": 12555 }, { "epoch": 4.0858815875081325, "grad_norm": 2.2833683490753174, "learning_rate": 4.167490564125362e-06, "loss": 0.5876, "step": 12560 }, { "epoch": 4.087508132726089, "grad_norm": 2.1938295364379883, "learning_rate": 4.153234311017332e-06, "loss": 0.6488, "step": 12565 }, { "epoch": 4.089134677944047, "grad_norm": 2.2685635089874268, "learning_rate": 4.13900027449936e-06, "loss": 0.6016, "step": 12570 }, { "epoch": 4.090761223162004, "grad_norm": 2.142134666442871, "learning_rate": 4.124788469740784e-06, "loss": 0.6206, "step": 12575 }, { "epoch": 4.092387768379961, "grad_norm": 2.7002527713775635, "learning_rate": 4.110598911887259e-06, "loss": 0.6071, "step": 12580 }, { "epoch": 4.094014313597918, "grad_norm": 2.357795000076294, "learning_rate": 4.096431616060717e-06, "loss": 0.609, "step": 12585 }, { "epoch": 4.095640858815875, "grad_norm": 2.264958620071411, "learning_rate": 4.082286597359395e-06, "loss": 0.6337, "step": 12590 }, { "epoch": 4.097267404033832, "grad_norm": 2.031507730484009, "learning_rate": 4.0681638708577495e-06, "loss": 0.5885, "step": 12595 }, { "epoch": 4.0988939492517895, "grad_norm": 2.763190507888794, "learning_rate": 4.054063451606518e-06, "loss": 0.5873, "step": 12600 }, { "epoch": 4.100520494469746, "grad_norm": 2.8204002380371094, "learning_rate": 4.039985354632633e-06, "loss": 0.6147, "step": 12605 }, { "epoch": 4.102147039687703, "grad_norm": 2.5044710636138916, "learning_rate": 4.025929594939262e-06, "loss": 0.6034, "step": 12610 }, { "epoch": 4.10377358490566, "grad_norm": 2.4716875553131104, "learning_rate": 4.0118961875057485e-06, "loss": 0.6319, "step": 12615 }, { "epoch": 4.105400130123617, "grad_norm": 1.859110951423645, "learning_rate": 3.997885147287628e-06, "loss": 0.5791, "step": 12620 }, { "epoch": 4.107026675341574, "grad_norm": 1.9830174446105957, "learning_rate": 3.983896489216596e-06, "loss": 0.5906, "step": 12625 }, { "epoch": 4.108653220559532, "grad_norm": 2.4182615280151367, "learning_rate": 3.969930228200497e-06, "loss": 0.6238, "step": 12630 }, { "epoch": 4.110279765777489, "grad_norm": 2.034052610397339, "learning_rate": 3.955986379123297e-06, "loss": 0.6097, "step": 12635 }, { "epoch": 4.111906310995446, "grad_norm": 2.3397281169891357, "learning_rate": 3.94206495684509e-06, "loss": 0.6108, "step": 12640 }, { "epoch": 4.113532856213403, "grad_norm": 2.510648727416992, "learning_rate": 3.928165976202058e-06, "loss": 0.6007, "step": 12645 }, { "epoch": 4.1151594014313595, "grad_norm": 2.819530963897705, "learning_rate": 3.914289452006478e-06, "loss": 0.6258, "step": 12650 }, { "epoch": 4.116785946649317, "grad_norm": 2.195157766342163, "learning_rate": 3.900435399046684e-06, "loss": 0.5999, "step": 12655 }, { "epoch": 4.118412491867274, "grad_norm": 2.176823616027832, "learning_rate": 3.88660383208708e-06, "loss": 0.6322, "step": 12660 }, { "epoch": 4.120039037085231, "grad_norm": 2.2609336376190186, "learning_rate": 3.872794765868079e-06, "loss": 0.6082, "step": 12665 }, { "epoch": 4.121665582303188, "grad_norm": 2.0738308429718018, "learning_rate": 3.859008215106141e-06, "loss": 0.589, "step": 12670 }, { "epoch": 4.123292127521145, "grad_norm": 1.9815113544464111, "learning_rate": 3.845244194493711e-06, "loss": 0.6007, "step": 12675 }, { "epoch": 4.124918672739102, "grad_norm": 2.2510576248168945, "learning_rate": 3.831502718699237e-06, "loss": 0.6518, "step": 12680 }, { "epoch": 4.126545217957059, "grad_norm": 2.5228381156921387, "learning_rate": 3.817783802367137e-06, "loss": 0.5937, "step": 12685 }, { "epoch": 4.1281717631750166, "grad_norm": 2.113708734512329, "learning_rate": 3.8040874601177862e-06, "loss": 0.6185, "step": 12690 }, { "epoch": 4.1297983083929735, "grad_norm": 2.3992342948913574, "learning_rate": 3.790413706547505e-06, "loss": 0.5963, "step": 12695 }, { "epoch": 4.13142485361093, "grad_norm": 2.4300765991210938, "learning_rate": 3.7767625562285304e-06, "loss": 0.5974, "step": 12700 }, { "epoch": 4.133051398828887, "grad_norm": 2.2891955375671387, "learning_rate": 3.763134023709031e-06, "loss": 0.5951, "step": 12705 }, { "epoch": 4.134677944046844, "grad_norm": 2.182823896408081, "learning_rate": 3.7495281235130465e-06, "loss": 0.5987, "step": 12710 }, { "epoch": 4.136304489264802, "grad_norm": 2.214000940322876, "learning_rate": 3.7359448701405147e-06, "loss": 0.5987, "step": 12715 }, { "epoch": 4.137931034482759, "grad_norm": 2.0059731006622314, "learning_rate": 3.7223842780672367e-06, "loss": 0.6031, "step": 12720 }, { "epoch": 4.139557579700716, "grad_norm": 2.5928211212158203, "learning_rate": 3.7088463617448637e-06, "loss": 0.6273, "step": 12725 }, { "epoch": 4.141184124918673, "grad_norm": 2.49120831489563, "learning_rate": 3.6953311356008657e-06, "loss": 0.6219, "step": 12730 }, { "epoch": 4.14281067013663, "grad_norm": 2.263577938079834, "learning_rate": 3.6818386140385575e-06, "loss": 0.5967, "step": 12735 }, { "epoch": 4.1444372153545865, "grad_norm": 2.0892696380615234, "learning_rate": 3.6683688114370283e-06, "loss": 0.5919, "step": 12740 }, { "epoch": 4.146063760572544, "grad_norm": 2.027967929840088, "learning_rate": 3.6549217421511795e-06, "loss": 0.5978, "step": 12745 }, { "epoch": 4.147690305790501, "grad_norm": 2.1417746543884277, "learning_rate": 3.6414974205116746e-06, "loss": 0.6295, "step": 12750 }, { "epoch": 4.149316851008458, "grad_norm": 2.0953240394592285, "learning_rate": 3.6280958608249456e-06, "loss": 0.6059, "step": 12755 }, { "epoch": 4.150943396226415, "grad_norm": 2.216292381286621, "learning_rate": 3.614717077373145e-06, "loss": 0.6357, "step": 12760 }, { "epoch": 4.152569941444372, "grad_norm": 2.3620471954345703, "learning_rate": 3.601361084414176e-06, "loss": 0.6012, "step": 12765 }, { "epoch": 4.154196486662329, "grad_norm": 2.3432223796844482, "learning_rate": 3.588027896181631e-06, "loss": 0.6326, "step": 12770 }, { "epoch": 4.155823031880287, "grad_norm": 2.8889687061309814, "learning_rate": 3.5747175268848188e-06, "loss": 0.6229, "step": 12775 }, { "epoch": 4.157449577098244, "grad_norm": 2.173004627227783, "learning_rate": 3.5614299907087227e-06, "loss": 0.5868, "step": 12780 }, { "epoch": 4.1590761223162005, "grad_norm": 2.1605019569396973, "learning_rate": 3.5481653018139995e-06, "loss": 0.6088, "step": 12785 }, { "epoch": 4.160702667534157, "grad_norm": 2.0582311153411865, "learning_rate": 3.534923474336932e-06, "loss": 0.5967, "step": 12790 }, { "epoch": 4.162329212752114, "grad_norm": 2.2507846355438232, "learning_rate": 3.521704522389477e-06, "loss": 0.6231, "step": 12795 }, { "epoch": 4.163955757970071, "grad_norm": 2.285806894302368, "learning_rate": 3.508508460059179e-06, "loss": 0.6049, "step": 12800 }, { "epoch": 4.165582303188029, "grad_norm": 2.1297316551208496, "learning_rate": 3.4953353014092057e-06, "loss": 0.5776, "step": 12805 }, { "epoch": 4.167208848405986, "grad_norm": 3.20120906829834, "learning_rate": 3.4821850604783186e-06, "loss": 0.622, "step": 12810 }, { "epoch": 4.168835393623943, "grad_norm": 2.3360631465911865, "learning_rate": 3.469057751280852e-06, "loss": 0.6112, "step": 12815 }, { "epoch": 4.1704619388419, "grad_norm": 2.469560146331787, "learning_rate": 3.45595338780669e-06, "loss": 0.6148, "step": 12820 }, { "epoch": 4.172088484059857, "grad_norm": 2.289069890975952, "learning_rate": 3.4428719840212814e-06, "loss": 0.6116, "step": 12825 }, { "epoch": 4.173715029277814, "grad_norm": 2.0820260047912598, "learning_rate": 3.4298135538656017e-06, "loss": 0.5922, "step": 12830 }, { "epoch": 4.175341574495771, "grad_norm": 2.3774619102478027, "learning_rate": 3.4167781112561275e-06, "loss": 0.6348, "step": 12835 }, { "epoch": 4.176968119713728, "grad_norm": 2.2148120403289795, "learning_rate": 3.40376567008486e-06, "loss": 0.6031, "step": 12840 }, { "epoch": 4.178594664931685, "grad_norm": 2.243387222290039, "learning_rate": 3.3907762442192735e-06, "loss": 0.5964, "step": 12845 }, { "epoch": 4.180221210149642, "grad_norm": 2.0168538093566895, "learning_rate": 3.3778098475023277e-06, "loss": 0.6048, "step": 12850 }, { "epoch": 4.181847755367599, "grad_norm": 1.9758124351501465, "learning_rate": 3.3648664937524165e-06, "loss": 0.5786, "step": 12855 }, { "epoch": 4.183474300585556, "grad_norm": 2.3366825580596924, "learning_rate": 3.351946196763403e-06, "loss": 0.6021, "step": 12860 }, { "epoch": 4.185100845803514, "grad_norm": 2.2335920333862305, "learning_rate": 3.3390489703045593e-06, "loss": 0.5811, "step": 12865 }, { "epoch": 4.186727391021471, "grad_norm": 2.1503963470458984, "learning_rate": 3.326174828120576e-06, "loss": 0.6013, "step": 12870 }, { "epoch": 4.1883539362394275, "grad_norm": 2.215834140777588, "learning_rate": 3.3133237839315546e-06, "loss": 0.5913, "step": 12875 }, { "epoch": 4.189980481457384, "grad_norm": 2.486844778060913, "learning_rate": 3.3004958514329692e-06, "loss": 0.6051, "step": 12880 }, { "epoch": 4.191607026675341, "grad_norm": 2.2572789192199707, "learning_rate": 3.2876910442956573e-06, "loss": 0.5964, "step": 12885 }, { "epoch": 4.193233571893298, "grad_norm": 2.700684070587158, "learning_rate": 3.27490937616583e-06, "loss": 0.6274, "step": 12890 }, { "epoch": 4.194860117111256, "grad_norm": 2.216101884841919, "learning_rate": 3.262150860665017e-06, "loss": 0.5871, "step": 12895 }, { "epoch": 4.196486662329213, "grad_norm": 2.355828285217285, "learning_rate": 3.2494155113901e-06, "loss": 0.6068, "step": 12900 }, { "epoch": 4.19811320754717, "grad_norm": 2.384627342224121, "learning_rate": 3.2367033419132388e-06, "loss": 0.619, "step": 12905 }, { "epoch": 4.199739752765127, "grad_norm": 2.2327868938446045, "learning_rate": 3.224014365781933e-06, "loss": 0.6276, "step": 12910 }, { "epoch": 4.201366297983084, "grad_norm": 2.0765182971954346, "learning_rate": 3.2113485965189205e-06, "loss": 0.6078, "step": 12915 }, { "epoch": 4.202992843201041, "grad_norm": 2.2915468215942383, "learning_rate": 3.1987060476222435e-06, "loss": 0.6005, "step": 12920 }, { "epoch": 4.204619388418998, "grad_norm": 2.486490249633789, "learning_rate": 3.1860867325651717e-06, "loss": 0.6157, "step": 12925 }, { "epoch": 4.206245933636955, "grad_norm": 2.343235731124878, "learning_rate": 3.173490664796233e-06, "loss": 0.5871, "step": 12930 }, { "epoch": 4.207872478854912, "grad_norm": 2.1548774242401123, "learning_rate": 3.160917857739165e-06, "loss": 0.5915, "step": 12935 }, { "epoch": 4.209499024072869, "grad_norm": 2.2372446060180664, "learning_rate": 3.1483683247929275e-06, "loss": 0.5895, "step": 12940 }, { "epoch": 4.211125569290826, "grad_norm": 2.7394518852233887, "learning_rate": 3.1358420793316744e-06, "loss": 0.6046, "step": 12945 }, { "epoch": 4.212752114508783, "grad_norm": 2.1555449962615967, "learning_rate": 3.1233391347047476e-06, "loss": 0.5843, "step": 12950 }, { "epoch": 4.214378659726741, "grad_norm": 6.938313961029053, "learning_rate": 3.110859504236635e-06, "loss": 0.6005, "step": 12955 }, { "epoch": 4.216005204944698, "grad_norm": 2.1898958683013916, "learning_rate": 3.0984032012270043e-06, "loss": 0.6032, "step": 12960 }, { "epoch": 4.217631750162655, "grad_norm": 2.1506869792938232, "learning_rate": 3.085970238950653e-06, "loss": 0.5854, "step": 12965 }, { "epoch": 4.2192582953806115, "grad_norm": 2.2408323287963867, "learning_rate": 3.0735606306574875e-06, "loss": 0.6201, "step": 12970 }, { "epoch": 4.220884840598568, "grad_norm": 2.4407107830047607, "learning_rate": 3.0611743895725686e-06, "loss": 0.5857, "step": 12975 }, { "epoch": 4.222511385816525, "grad_norm": 2.288879871368408, "learning_rate": 3.048811528896006e-06, "loss": 0.6074, "step": 12980 }, { "epoch": 4.224137931034483, "grad_norm": 2.333875894546509, "learning_rate": 3.036472061803025e-06, "loss": 0.6236, "step": 12985 }, { "epoch": 4.22576447625244, "grad_norm": 2.0758614540100098, "learning_rate": 3.024156001443901e-06, "loss": 0.5939, "step": 12990 }, { "epoch": 4.227391021470397, "grad_norm": 17.28084945678711, "learning_rate": 3.011863360943984e-06, "loss": 0.5874, "step": 12995 }, { "epoch": 4.229017566688354, "grad_norm": 2.7625977993011475, "learning_rate": 2.99959415340364e-06, "loss": 0.6448, "step": 13000 }, { "epoch": 4.230644111906311, "grad_norm": 2.0970189571380615, "learning_rate": 2.987348391898284e-06, "loss": 0.5858, "step": 13005 }, { "epoch": 4.232270657124268, "grad_norm": 2.935084104537964, "learning_rate": 2.9751260894783362e-06, "loss": 0.6077, "step": 13010 }, { "epoch": 4.233897202342225, "grad_norm": 2.0353829860687256, "learning_rate": 2.962927259169221e-06, "loss": 0.581, "step": 13015 }, { "epoch": 4.235523747560182, "grad_norm": 2.1475942134857178, "learning_rate": 2.9507519139713364e-06, "loss": 0.578, "step": 13020 }, { "epoch": 4.237150292778139, "grad_norm": 2.0711092948913574, "learning_rate": 2.9386000668600698e-06, "loss": 0.5895, "step": 13025 }, { "epoch": 4.238776837996096, "grad_norm": 2.343844413757324, "learning_rate": 2.926471730785743e-06, "loss": 0.6123, "step": 13030 }, { "epoch": 4.240403383214053, "grad_norm": 2.2693004608154297, "learning_rate": 2.9143669186736445e-06, "loss": 0.5798, "step": 13035 }, { "epoch": 4.24202992843201, "grad_norm": 2.457479238510132, "learning_rate": 2.9022856434239796e-06, "loss": 0.6087, "step": 13040 }, { "epoch": 4.243656473649968, "grad_norm": 2.129124164581299, "learning_rate": 2.8902279179118837e-06, "loss": 0.6415, "step": 13045 }, { "epoch": 4.245283018867925, "grad_norm": 2.264616012573242, "learning_rate": 2.878193754987374e-06, "loss": 0.5968, "step": 13050 }, { "epoch": 4.246909564085882, "grad_norm": 2.3036763668060303, "learning_rate": 2.866183167475378e-06, "loss": 0.5908, "step": 13055 }, { "epoch": 4.2485361093038385, "grad_norm": 2.0123422145843506, "learning_rate": 2.8541961681756795e-06, "loss": 0.6054, "step": 13060 }, { "epoch": 4.250162654521795, "grad_norm": 2.1743271350860596, "learning_rate": 2.8422327698629405e-06, "loss": 0.6075, "step": 13065 }, { "epoch": 4.251789199739752, "grad_norm": 2.558833122253418, "learning_rate": 2.8302929852866644e-06, "loss": 0.5922, "step": 13070 }, { "epoch": 4.25341574495771, "grad_norm": 2.240840435028076, "learning_rate": 2.818376827171193e-06, "loss": 0.6121, "step": 13075 }, { "epoch": 4.255042290175667, "grad_norm": 2.0290110111236572, "learning_rate": 2.8064843082156787e-06, "loss": 0.5783, "step": 13080 }, { "epoch": 4.256668835393624, "grad_norm": 2.0330371856689453, "learning_rate": 2.794615441094095e-06, "loss": 0.5582, "step": 13085 }, { "epoch": 4.258295380611581, "grad_norm": 2.271972894668579, "learning_rate": 2.782770238455193e-06, "loss": 0.6019, "step": 13090 }, { "epoch": 4.259921925829538, "grad_norm": 3.2765743732452393, "learning_rate": 2.770948712922522e-06, "loss": 0.59, "step": 13095 }, { "epoch": 4.261548471047496, "grad_norm": 2.7162623405456543, "learning_rate": 2.759150877094388e-06, "loss": 0.5984, "step": 13100 }, { "epoch": 4.2631750162654525, "grad_norm": 2.14172625541687, "learning_rate": 2.747376743543853e-06, "loss": 0.5702, "step": 13105 }, { "epoch": 4.264801561483409, "grad_norm": 2.3000221252441406, "learning_rate": 2.735626324818727e-06, "loss": 0.5908, "step": 13110 }, { "epoch": 4.266428106701366, "grad_norm": 2.5657973289489746, "learning_rate": 2.7238996334415262e-06, "loss": 0.6329, "step": 13115 }, { "epoch": 4.268054651919323, "grad_norm": 2.254990816116333, "learning_rate": 2.712196681909507e-06, "loss": 0.6069, "step": 13120 }, { "epoch": 4.26968119713728, "grad_norm": 2.045351505279541, "learning_rate": 2.7005174826946004e-06, "loss": 0.5857, "step": 13125 }, { "epoch": 4.271307742355237, "grad_norm": 2.3086118698120117, "learning_rate": 2.688862048243443e-06, "loss": 0.6122, "step": 13130 }, { "epoch": 4.272934287573195, "grad_norm": 2.1348495483398438, "learning_rate": 2.6772303909773366e-06, "loss": 0.5837, "step": 13135 }, { "epoch": 4.274560832791152, "grad_norm": 2.0408337116241455, "learning_rate": 2.6656225232922492e-06, "loss": 0.6114, "step": 13140 }, { "epoch": 4.276187378009109, "grad_norm": 2.258314371109009, "learning_rate": 2.6540384575587885e-06, "loss": 0.5926, "step": 13145 }, { "epoch": 4.2778139232270656, "grad_norm": 2.1348114013671875, "learning_rate": 2.642478206122201e-06, "loss": 0.6269, "step": 13150 }, { "epoch": 4.2794404684450225, "grad_norm": 2.749288558959961, "learning_rate": 2.6309417813023513e-06, "loss": 0.5978, "step": 13155 }, { "epoch": 4.28106701366298, "grad_norm": 2.077808380126953, "learning_rate": 2.619429195393713e-06, "loss": 0.6157, "step": 13160 }, { "epoch": 4.282693558880937, "grad_norm": 2.2650487422943115, "learning_rate": 2.607940460665359e-06, "loss": 0.6376, "step": 13165 }, { "epoch": 4.284320104098894, "grad_norm": 2.7509374618530273, "learning_rate": 2.5964755893609414e-06, "loss": 0.6037, "step": 13170 }, { "epoch": 4.285946649316851, "grad_norm": 3.157712697982788, "learning_rate": 2.585034593698668e-06, "loss": 0.6096, "step": 13175 }, { "epoch": 4.287573194534808, "grad_norm": 2.2161614894866943, "learning_rate": 2.5736174858713267e-06, "loss": 0.6009, "step": 13180 }, { "epoch": 4.289199739752765, "grad_norm": 2.189051389694214, "learning_rate": 2.5622242780462243e-06, "loss": 0.6001, "step": 13185 }, { "epoch": 4.290826284970722, "grad_norm": 2.3055875301361084, "learning_rate": 2.5508549823652114e-06, "loss": 0.6343, "step": 13190 }, { "epoch": 4.2924528301886795, "grad_norm": 2.214721918106079, "learning_rate": 2.5395096109446488e-06, "loss": 0.6182, "step": 13195 }, { "epoch": 4.294079375406636, "grad_norm": 2.2350032329559326, "learning_rate": 2.528188175875412e-06, "loss": 0.6032, "step": 13200 }, { "epoch": 4.295705920624593, "grad_norm": 2.064831018447876, "learning_rate": 2.516890689222845e-06, "loss": 0.6392, "step": 13205 }, { "epoch": 4.29733246584255, "grad_norm": 2.1273586750030518, "learning_rate": 2.5056171630267937e-06, "loss": 0.6279, "step": 13210 }, { "epoch": 4.298959011060507, "grad_norm": 2.605733633041382, "learning_rate": 2.4943676093015513e-06, "loss": 0.6033, "step": 13215 }, { "epoch": 4.300585556278465, "grad_norm": 2.7585792541503906, "learning_rate": 2.483142040035874e-06, "loss": 0.6128, "step": 13220 }, { "epoch": 4.302212101496422, "grad_norm": 2.709352731704712, "learning_rate": 2.471940467192957e-06, "loss": 0.5904, "step": 13225 }, { "epoch": 4.303838646714379, "grad_norm": 2.4799320697784424, "learning_rate": 2.4607629027104147e-06, "loss": 0.6199, "step": 13230 }, { "epoch": 4.305465191932336, "grad_norm": 2.359222412109375, "learning_rate": 2.449609358500288e-06, "loss": 0.5858, "step": 13235 }, { "epoch": 4.307091737150293, "grad_norm": 2.0881259441375732, "learning_rate": 2.4384798464490016e-06, "loss": 0.5912, "step": 13240 }, { "epoch": 4.3087182823682495, "grad_norm": 1.9966338872909546, "learning_rate": 2.427374378417388e-06, "loss": 0.592, "step": 13245 }, { "epoch": 4.310344827586207, "grad_norm": 2.474705457687378, "learning_rate": 2.416292966240641e-06, "loss": 0.6016, "step": 13250 }, { "epoch": 4.311971372804164, "grad_norm": 3.0268406867980957, "learning_rate": 2.405235621728322e-06, "loss": 0.6061, "step": 13255 }, { "epoch": 4.313597918022121, "grad_norm": 4.327364444732666, "learning_rate": 2.394202356664349e-06, "loss": 0.6119, "step": 13260 }, { "epoch": 4.315224463240078, "grad_norm": 2.194871187210083, "learning_rate": 2.383193182806978e-06, "loss": 0.6134, "step": 13265 }, { "epoch": 4.316851008458035, "grad_norm": 2.07330322265625, "learning_rate": 2.3722081118887767e-06, "loss": 0.6216, "step": 13270 }, { "epoch": 4.318477553675992, "grad_norm": 2.8549962043762207, "learning_rate": 2.3612471556166442e-06, "loss": 0.6119, "step": 13275 }, { "epoch": 4.32010409889395, "grad_norm": 2.333425283432007, "learning_rate": 2.3503103256717673e-06, "loss": 0.6029, "step": 13280 }, { "epoch": 4.3217306441119065, "grad_norm": 2.603220224380493, "learning_rate": 2.3393976337096334e-06, "loss": 0.6238, "step": 13285 }, { "epoch": 4.3233571893298635, "grad_norm": 2.699075222015381, "learning_rate": 2.328509091359984e-06, "loss": 0.6103, "step": 13290 }, { "epoch": 4.32498373454782, "grad_norm": 2.9168403148651123, "learning_rate": 2.3176447102268602e-06, "loss": 0.5861, "step": 13295 }, { "epoch": 4.326610279765777, "grad_norm": 2.3375420570373535, "learning_rate": 2.3068045018885153e-06, "loss": 0.623, "step": 13300 }, { "epoch": 4.328236824983734, "grad_norm": 2.315999746322632, "learning_rate": 2.2959884778974735e-06, "loss": 0.5896, "step": 13305 }, { "epoch": 4.329863370201692, "grad_norm": 2.135993242263794, "learning_rate": 2.285196649780455e-06, "loss": 0.6104, "step": 13310 }, { "epoch": 4.331489915419649, "grad_norm": 2.2192249298095703, "learning_rate": 2.2744290290384247e-06, "loss": 0.607, "step": 13315 }, { "epoch": 4.333116460637606, "grad_norm": 2.0429885387420654, "learning_rate": 2.2636856271465194e-06, "loss": 0.6061, "step": 13320 }, { "epoch": 4.334743005855563, "grad_norm": 2.8340260982513428, "learning_rate": 2.252966455554101e-06, "loss": 0.6189, "step": 13325 }, { "epoch": 4.33636955107352, "grad_norm": 2.315480947494507, "learning_rate": 2.242271525684672e-06, "loss": 0.5966, "step": 13330 }, { "epoch": 4.3379960962914765, "grad_norm": 3.0812127590179443, "learning_rate": 2.2316008489359304e-06, "loss": 0.63, "step": 13335 }, { "epoch": 4.339622641509434, "grad_norm": 2.2564315795898438, "learning_rate": 2.2209544366797066e-06, "loss": 0.5944, "step": 13340 }, { "epoch": 4.341249186727391, "grad_norm": 2.032088279724121, "learning_rate": 2.2103323002619857e-06, "loss": 0.5801, "step": 13345 }, { "epoch": 4.342875731945348, "grad_norm": 2.143538236618042, "learning_rate": 2.199734451002869e-06, "loss": 0.6006, "step": 13350 }, { "epoch": 4.344502277163305, "grad_norm": 2.0386180877685547, "learning_rate": 2.189160900196585e-06, "loss": 0.6095, "step": 13355 }, { "epoch": 4.346128822381262, "grad_norm": 2.3832762241363525, "learning_rate": 2.178611659111468e-06, "loss": 0.5936, "step": 13360 }, { "epoch": 4.347755367599219, "grad_norm": 2.4224894046783447, "learning_rate": 2.1680867389899355e-06, "loss": 0.6062, "step": 13365 }, { "epoch": 4.349381912817177, "grad_norm": 2.3058247566223145, "learning_rate": 2.1575861510485017e-06, "loss": 0.6307, "step": 13370 }, { "epoch": 4.351008458035134, "grad_norm": 2.045710563659668, "learning_rate": 2.147109906477726e-06, "loss": 0.6001, "step": 13375 }, { "epoch": 4.3526350032530905, "grad_norm": 2.2343456745147705, "learning_rate": 2.136658016442253e-06, "loss": 0.6363, "step": 13380 }, { "epoch": 4.354261548471047, "grad_norm": 2.8338370323181152, "learning_rate": 2.126230492080744e-06, "loss": 0.624, "step": 13385 }, { "epoch": 4.355888093689004, "grad_norm": 2.3945822715759277, "learning_rate": 2.1158273445059135e-06, "loss": 0.6151, "step": 13390 }, { "epoch": 4.357514638906961, "grad_norm": 2.234694004058838, "learning_rate": 2.1054485848044952e-06, "loss": 0.6072, "step": 13395 }, { "epoch": 4.359141184124919, "grad_norm": 2.2890405654907227, "learning_rate": 2.095094224037228e-06, "loss": 0.6339, "step": 13400 }, { "epoch": 4.360767729342876, "grad_norm": 2.266394853591919, "learning_rate": 2.0847642732388457e-06, "loss": 0.597, "step": 13405 }, { "epoch": 4.362394274560833, "grad_norm": 2.2408626079559326, "learning_rate": 2.0744587434180757e-06, "loss": 0.6224, "step": 13410 }, { "epoch": 4.36402081977879, "grad_norm": 2.1054515838623047, "learning_rate": 2.0641776455576105e-06, "loss": 0.6094, "step": 13415 }, { "epoch": 4.365647364996747, "grad_norm": 2.2183995246887207, "learning_rate": 2.0539209906141167e-06, "loss": 0.5976, "step": 13420 }, { "epoch": 4.367273910214704, "grad_norm": 2.342390775680542, "learning_rate": 2.0436887895182e-06, "loss": 0.5971, "step": 13425 }, { "epoch": 4.368900455432661, "grad_norm": 2.0689449310302734, "learning_rate": 2.0334810531744213e-06, "loss": 0.5908, "step": 13430 }, { "epoch": 4.370527000650618, "grad_norm": 2.089670181274414, "learning_rate": 2.0232977924612457e-06, "loss": 0.5898, "step": 13435 }, { "epoch": 4.372153545868575, "grad_norm": 2.30362606048584, "learning_rate": 2.0131390182310805e-06, "loss": 0.606, "step": 13440 }, { "epoch": 4.373780091086532, "grad_norm": 2.311826467514038, "learning_rate": 2.003004741310216e-06, "loss": 0.6044, "step": 13445 }, { "epoch": 4.375406636304489, "grad_norm": 2.338933229446411, "learning_rate": 1.992894972498846e-06, "loss": 0.6068, "step": 13450 }, { "epoch": 4.377033181522446, "grad_norm": 2.32281756401062, "learning_rate": 1.982809722571047e-06, "loss": 0.6066, "step": 13455 }, { "epoch": 4.378659726740404, "grad_norm": 2.3082902431488037, "learning_rate": 1.972749002274765e-06, "loss": 0.6101, "step": 13460 }, { "epoch": 4.380286271958361, "grad_norm": 2.083157777786255, "learning_rate": 1.9627128223317942e-06, "loss": 0.6151, "step": 13465 }, { "epoch": 4.3819128171763175, "grad_norm": 2.093111753463745, "learning_rate": 1.952701193437792e-06, "loss": 0.6313, "step": 13470 }, { "epoch": 4.383539362394274, "grad_norm": 2.210050582885742, "learning_rate": 1.942714126262238e-06, "loss": 0.6016, "step": 13475 }, { "epoch": 4.385165907612231, "grad_norm": 2.0351722240448, "learning_rate": 1.93275163144844e-06, "loss": 0.6093, "step": 13480 }, { "epoch": 4.386792452830189, "grad_norm": 2.6686933040618896, "learning_rate": 1.9228137196135254e-06, "loss": 0.6177, "step": 13485 }, { "epoch": 4.388418998048146, "grad_norm": 2.5283005237579346, "learning_rate": 1.912900401348422e-06, "loss": 0.6184, "step": 13490 }, { "epoch": 4.390045543266103, "grad_norm": 1.9730428457260132, "learning_rate": 1.9030116872178316e-06, "loss": 0.5938, "step": 13495 }, { "epoch": 4.39167208848406, "grad_norm": 2.1344470977783203, "learning_rate": 1.8931475877602579e-06, "loss": 0.6306, "step": 13500 }, { "epoch": 4.393298633702017, "grad_norm": 1.9898133277893066, "learning_rate": 1.8833081134879637e-06, "loss": 0.611, "step": 13505 }, { "epoch": 4.394925178919974, "grad_norm": 2.810833215713501, "learning_rate": 1.8734932748869588e-06, "loss": 0.5925, "step": 13510 }, { "epoch": 4.396551724137931, "grad_norm": 2.492091655731201, "learning_rate": 1.8637030824170121e-06, "loss": 0.5901, "step": 13515 }, { "epoch": 4.398178269355888, "grad_norm": 2.828150749206543, "learning_rate": 1.8539375465116193e-06, "loss": 0.6431, "step": 13520 }, { "epoch": 4.399804814573845, "grad_norm": 2.4096462726593018, "learning_rate": 1.8441966775780112e-06, "loss": 0.5792, "step": 13525 }, { "epoch": 4.401431359791802, "grad_norm": 2.353559732437134, "learning_rate": 1.834480485997106e-06, "loss": 0.6104, "step": 13530 }, { "epoch": 4.403057905009759, "grad_norm": 1.9907680749893188, "learning_rate": 1.8247889821235543e-06, "loss": 0.5964, "step": 13535 }, { "epoch": 4.404684450227716, "grad_norm": 2.5092782974243164, "learning_rate": 1.815122176285669e-06, "loss": 0.6118, "step": 13540 }, { "epoch": 4.406310995445674, "grad_norm": 2.1358978748321533, "learning_rate": 1.8054800787854569e-06, "loss": 0.5925, "step": 13545 }, { "epoch": 4.407937540663631, "grad_norm": 2.424093008041382, "learning_rate": 1.7958626998985928e-06, "loss": 0.6362, "step": 13550 }, { "epoch": 4.409564085881588, "grad_norm": 2.92620849609375, "learning_rate": 1.7862700498744085e-06, "loss": 0.5897, "step": 13555 }, { "epoch": 4.411190631099545, "grad_norm": 2.4644806385040283, "learning_rate": 1.7767021389358706e-06, "loss": 0.6257, "step": 13560 }, { "epoch": 4.4128171763175015, "grad_norm": 2.018040657043457, "learning_rate": 1.767158977279601e-06, "loss": 0.6118, "step": 13565 }, { "epoch": 4.414443721535458, "grad_norm": 2.133514642715454, "learning_rate": 1.7576405750758224e-06, "loss": 0.5955, "step": 13570 }, { "epoch": 4.416070266753415, "grad_norm": 2.5312135219573975, "learning_rate": 1.74814694246839e-06, "loss": 0.5924, "step": 13575 }, { "epoch": 4.417696811971373, "grad_norm": 2.0397260189056396, "learning_rate": 1.7386780895747578e-06, "loss": 0.6171, "step": 13580 }, { "epoch": 4.41932335718933, "grad_norm": 1.9202146530151367, "learning_rate": 1.729234026485968e-06, "loss": 0.5873, "step": 13585 }, { "epoch": 4.420949902407287, "grad_norm": 3.7298386096954346, "learning_rate": 1.7198147632666416e-06, "loss": 0.587, "step": 13590 }, { "epoch": 4.422576447625244, "grad_norm": 2.167330265045166, "learning_rate": 1.7104203099549827e-06, "loss": 0.5749, "step": 13595 }, { "epoch": 4.424202992843201, "grad_norm": 2.2834017276763916, "learning_rate": 1.701050676562735e-06, "loss": 0.5965, "step": 13600 }, { "epoch": 4.4258295380611585, "grad_norm": 2.037825107574463, "learning_rate": 1.691705873075211e-06, "loss": 0.5955, "step": 13605 }, { "epoch": 4.427456083279115, "grad_norm": 2.448396682739258, "learning_rate": 1.6823859094512507e-06, "loss": 0.624, "step": 13610 }, { "epoch": 4.429082628497072, "grad_norm": 2.062052011489868, "learning_rate": 1.6730907956232306e-06, "loss": 0.5848, "step": 13615 }, { "epoch": 4.430709173715029, "grad_norm": 2.03035306930542, "learning_rate": 1.6638205414970298e-06, "loss": 0.5875, "step": 13620 }, { "epoch": 4.432335718932986, "grad_norm": 2.0660645961761475, "learning_rate": 1.654575156952054e-06, "loss": 0.6194, "step": 13625 }, { "epoch": 4.433962264150943, "grad_norm": 2.2230019569396973, "learning_rate": 1.6453546518411855e-06, "loss": 0.601, "step": 13630 }, { "epoch": 4.4355888093689, "grad_norm": 2.1817357540130615, "learning_rate": 1.636159035990803e-06, "loss": 0.613, "step": 13635 }, { "epoch": 4.437215354586858, "grad_norm": 2.207064390182495, "learning_rate": 1.6269883192007618e-06, "loss": 0.5996, "step": 13640 }, { "epoch": 4.438841899804815, "grad_norm": 2.2545690536499023, "learning_rate": 1.6178425112443774e-06, "loss": 0.6121, "step": 13645 }, { "epoch": 4.440468445022772, "grad_norm": 2.3414602279663086, "learning_rate": 1.608721621868428e-06, "loss": 0.6295, "step": 13650 }, { "epoch": 4.4420949902407285, "grad_norm": 2.3920414447784424, "learning_rate": 1.5996256607931193e-06, "loss": 0.5941, "step": 13655 }, { "epoch": 4.443721535458685, "grad_norm": 2.2873120307922363, "learning_rate": 1.5905546377121077e-06, "loss": 0.6179, "step": 13660 }, { "epoch": 4.445348080676643, "grad_norm": 2.398427963256836, "learning_rate": 1.583315781100353e-06, "loss": 0.5979, "step": 13665 }, { "epoch": 4.4469746258946, "grad_norm": 2.2126176357269287, "learning_rate": 1.5742896707522242e-06, "loss": 0.6173, "step": 13670 }, { "epoch": 4.448601171112557, "grad_norm": 2.4789159297943115, "learning_rate": 1.5652885253991944e-06, "loss": 0.6519, "step": 13675 }, { "epoch": 4.450227716330514, "grad_norm": 2.2594988346099854, "learning_rate": 1.5563123546338572e-06, "loss": 0.5882, "step": 13680 }, { "epoch": 4.451854261548471, "grad_norm": 2.307455062866211, "learning_rate": 1.5473611680222045e-06, "loss": 0.6314, "step": 13685 }, { "epoch": 4.453480806766428, "grad_norm": 2.2615737915039062, "learning_rate": 1.5384349751035948e-06, "loss": 0.6094, "step": 13690 }, { "epoch": 4.455107351984386, "grad_norm": 2.9631905555725098, "learning_rate": 1.5295337853907604e-06, "loss": 0.589, "step": 13695 }, { "epoch": 4.4567338972023425, "grad_norm": 2.044833183288574, "learning_rate": 1.5206576083697687e-06, "loss": 0.5841, "step": 13700 }, { "epoch": 4.458360442420299, "grad_norm": 2.2695491313934326, "learning_rate": 1.5118064535000614e-06, "loss": 0.6207, "step": 13705 }, { "epoch": 4.459986987638256, "grad_norm": 1.9073365926742554, "learning_rate": 1.502980330214379e-06, "loss": 0.5814, "step": 13710 }, { "epoch": 4.461613532856213, "grad_norm": 2.171907424926758, "learning_rate": 1.4941792479188171e-06, "loss": 0.6072, "step": 13715 }, { "epoch": 4.46324007807417, "grad_norm": 2.340442657470703, "learning_rate": 1.4854032159927562e-06, "loss": 0.6231, "step": 13720 }, { "epoch": 4.464866623292128, "grad_norm": 2.614452600479126, "learning_rate": 1.4766522437889035e-06, "loss": 0.5633, "step": 13725 }, { "epoch": 4.466493168510085, "grad_norm": 2.452821969985962, "learning_rate": 1.4679263406332467e-06, "loss": 0.5978, "step": 13730 }, { "epoch": 4.468119713728042, "grad_norm": 2.4186081886291504, "learning_rate": 1.4592255158250605e-06, "loss": 0.5868, "step": 13735 }, { "epoch": 4.469746258945999, "grad_norm": 2.2780089378356934, "learning_rate": 1.450549778636895e-06, "loss": 0.6151, "step": 13740 }, { "epoch": 4.4713728041639555, "grad_norm": 2.662311315536499, "learning_rate": 1.4418991383145675e-06, "loss": 0.6099, "step": 13745 }, { "epoch": 4.4729993493819125, "grad_norm": 2.089160919189453, "learning_rate": 1.43327360407714e-06, "loss": 0.6126, "step": 13750 }, { "epoch": 4.47462589459987, "grad_norm": 2.2200684547424316, "learning_rate": 1.424673185116926e-06, "loss": 0.6217, "step": 13755 }, { "epoch": 4.476252439817827, "grad_norm": 2.120103359222412, "learning_rate": 1.416097890599466e-06, "loss": 0.6153, "step": 13760 }, { "epoch": 4.477878985035784, "grad_norm": 2.2231285572052, "learning_rate": 1.4075477296635359e-06, "loss": 0.6043, "step": 13765 }, { "epoch": 4.479505530253741, "grad_norm": 2.0046818256378174, "learning_rate": 1.3990227114211191e-06, "loss": 0.5882, "step": 13770 }, { "epoch": 4.481132075471698, "grad_norm": 2.097630023956299, "learning_rate": 1.3905228449574066e-06, "loss": 0.5992, "step": 13775 }, { "epoch": 4.482758620689655, "grad_norm": 2.1202268600463867, "learning_rate": 1.3820481393307855e-06, "loss": 0.6002, "step": 13780 }, { "epoch": 4.484385165907613, "grad_norm": 2.250000476837158, "learning_rate": 1.3735986035728232e-06, "loss": 0.5961, "step": 13785 }, { "epoch": 4.4860117111255695, "grad_norm": 2.1143972873687744, "learning_rate": 1.365174246688275e-06, "loss": 0.5937, "step": 13790 }, { "epoch": 4.487638256343526, "grad_norm": 2.0777087211608887, "learning_rate": 1.356775077655048e-06, "loss": 0.5981, "step": 13795 }, { "epoch": 4.489264801561483, "grad_norm": 2.2611429691314697, "learning_rate": 1.3484011054242157e-06, "loss": 0.6002, "step": 13800 }, { "epoch": 4.49089134677944, "grad_norm": 2.1241838932037354, "learning_rate": 1.3400523389199976e-06, "loss": 0.5941, "step": 13805 }, { "epoch": 4.492517891997397, "grad_norm": 2.054589033126831, "learning_rate": 1.3317287870397572e-06, "loss": 0.6068, "step": 13810 }, { "epoch": 4.494144437215355, "grad_norm": 2.5084924697875977, "learning_rate": 1.3234304586539737e-06, "loss": 0.622, "step": 13815 }, { "epoch": 4.495770982433312, "grad_norm": 2.368989944458008, "learning_rate": 1.3151573626062535e-06, "loss": 0.6081, "step": 13820 }, { "epoch": 4.497397527651269, "grad_norm": 2.2150580883026123, "learning_rate": 1.3069095077133108e-06, "loss": 0.5901, "step": 13825 }, { "epoch": 4.499024072869226, "grad_norm": 2.227565050125122, "learning_rate": 1.298686902764959e-06, "loss": 0.5816, "step": 13830 }, { "epoch": 4.500650618087183, "grad_norm": 2.3818137645721436, "learning_rate": 1.2904895565241076e-06, "loss": 0.6138, "step": 13835 }, { "epoch": 4.5022771633051395, "grad_norm": 2.314545154571533, "learning_rate": 1.2823174777267439e-06, "loss": 0.6164, "step": 13840 }, { "epoch": 4.503903708523097, "grad_norm": 2.640153408050537, "learning_rate": 1.2741706750819232e-06, "loss": 0.6004, "step": 13845 }, { "epoch": 4.505530253741054, "grad_norm": 1.9388872385025024, "learning_rate": 1.266049157271773e-06, "loss": 0.5973, "step": 13850 }, { "epoch": 4.507156798959011, "grad_norm": 2.119119882583618, "learning_rate": 1.2579529329514645e-06, "loss": 0.6035, "step": 13855 }, { "epoch": 4.508783344176968, "grad_norm": 2.359867572784424, "learning_rate": 1.2498820107492204e-06, "loss": 0.63, "step": 13860 }, { "epoch": 4.510409889394925, "grad_norm": 2.2548766136169434, "learning_rate": 1.2418363992662997e-06, "loss": 0.5911, "step": 13865 }, { "epoch": 4.512036434612883, "grad_norm": 2.283270835876465, "learning_rate": 1.233816107076985e-06, "loss": 0.5932, "step": 13870 }, { "epoch": 4.51366297983084, "grad_norm": 2.1963112354278564, "learning_rate": 1.2258211427285671e-06, "loss": 0.5878, "step": 13875 }, { "epoch": 4.5152895250487965, "grad_norm": 2.4247801303863525, "learning_rate": 1.2178515147413665e-06, "loss": 0.6076, "step": 13880 }, { "epoch": 4.5169160702667535, "grad_norm": 2.2438313961029053, "learning_rate": 1.2099072316086757e-06, "loss": 0.6221, "step": 13885 }, { "epoch": 4.51854261548471, "grad_norm": 2.403510570526123, "learning_rate": 1.2019883017967943e-06, "loss": 0.6022, "step": 13890 }, { "epoch": 4.520169160702667, "grad_norm": 2.0666253566741943, "learning_rate": 1.194094733745002e-06, "loss": 0.5915, "step": 13895 }, { "epoch": 4.521795705920624, "grad_norm": 2.325934410095215, "learning_rate": 1.1862265358655505e-06, "loss": 0.5797, "step": 13900 }, { "epoch": 4.523422251138582, "grad_norm": 2.2772769927978516, "learning_rate": 1.1783837165436406e-06, "loss": 0.5991, "step": 13905 }, { "epoch": 4.525048796356539, "grad_norm": 2.4774351119995117, "learning_rate": 1.170566284137442e-06, "loss": 0.634, "step": 13910 }, { "epoch": 4.526675341574496, "grad_norm": 4.598024845123291, "learning_rate": 1.1627742469780684e-06, "loss": 0.6306, "step": 13915 }, { "epoch": 4.528301886792453, "grad_norm": 2.2267725467681885, "learning_rate": 1.1550076133695604e-06, "loss": 0.5909, "step": 13920 }, { "epoch": 4.52992843201041, "grad_norm": 2.3685433864593506, "learning_rate": 1.1472663915888888e-06, "loss": 0.5712, "step": 13925 }, { "epoch": 4.531554977228367, "grad_norm": 2.711865186691284, "learning_rate": 1.1395505898859487e-06, "loss": 0.6308, "step": 13930 }, { "epoch": 4.533181522446324, "grad_norm": 2.2160632610321045, "learning_rate": 1.1318602164835434e-06, "loss": 0.6059, "step": 13935 }, { "epoch": 4.534808067664281, "grad_norm": 1.9569048881530762, "learning_rate": 1.1241952795773697e-06, "loss": 0.6007, "step": 13940 }, { "epoch": 4.536434612882238, "grad_norm": 2.2012217044830322, "learning_rate": 1.1165557873360267e-06, "loss": 0.5861, "step": 13945 }, { "epoch": 4.538061158100195, "grad_norm": 2.1481223106384277, "learning_rate": 1.108941747900985e-06, "loss": 0.6111, "step": 13950 }, { "epoch": 4.539687703318152, "grad_norm": 2.3771395683288574, "learning_rate": 1.1013531693865985e-06, "loss": 0.5952, "step": 13955 }, { "epoch": 4.541314248536109, "grad_norm": 2.468219041824341, "learning_rate": 1.0937900598800872e-06, "loss": 0.5997, "step": 13960 }, { "epoch": 4.542940793754067, "grad_norm": 2.475027561187744, "learning_rate": 1.0862524274415282e-06, "loss": 0.6236, "step": 13965 }, { "epoch": 4.544567338972024, "grad_norm": 2.3337695598602295, "learning_rate": 1.0787402801038405e-06, "loss": 0.6248, "step": 13970 }, { "epoch": 4.5461938841899805, "grad_norm": 2.1060009002685547, "learning_rate": 1.071253625872795e-06, "loss": 0.5921, "step": 13975 }, { "epoch": 4.547820429407937, "grad_norm": 2.5564815998077393, "learning_rate": 1.0637924727269822e-06, "loss": 0.5999, "step": 13980 }, { "epoch": 4.549446974625894, "grad_norm": 2.3401384353637695, "learning_rate": 1.0563568286178216e-06, "loss": 0.6002, "step": 13985 }, { "epoch": 4.551073519843852, "grad_norm": 2.1616568565368652, "learning_rate": 1.0489467014695526e-06, "loss": 0.6227, "step": 13990 }, { "epoch": 4.552700065061809, "grad_norm": 2.3356876373291016, "learning_rate": 1.0415620991792135e-06, "loss": 0.6052, "step": 13995 }, { "epoch": 4.554326610279766, "grad_norm": 2.068387269973755, "learning_rate": 1.0342030296166428e-06, "loss": 0.6063, "step": 14000 }, { "epoch": 4.555953155497723, "grad_norm": 2.4070541858673096, "learning_rate": 1.0268695006244695e-06, "loss": 0.6329, "step": 14005 }, { "epoch": 4.55757970071568, "grad_norm": 2.427407741546631, "learning_rate": 1.0195615200180974e-06, "loss": 0.6172, "step": 14010 }, { "epoch": 4.559206245933637, "grad_norm": 2.068614959716797, "learning_rate": 1.0122790955857192e-06, "loss": 0.5901, "step": 14015 }, { "epoch": 4.560832791151594, "grad_norm": 2.4775562286376953, "learning_rate": 1.0050222350882682e-06, "loss": 0.6524, "step": 14020 }, { "epoch": 4.562459336369551, "grad_norm": 2.4976789951324463, "learning_rate": 9.97790946259461e-07, "loss": 0.5967, "step": 14025 }, { "epoch": 4.564085881587508, "grad_norm": 2.4459400177001953, "learning_rate": 9.905852368057383e-07, "loss": 0.6231, "step": 14030 }, { "epoch": 4.565712426805465, "grad_norm": 2.338926076889038, "learning_rate": 9.834051144062994e-07, "loss": 0.5923, "step": 14035 }, { "epoch": 4.567338972023422, "grad_norm": 2.1870245933532715, "learning_rate": 9.762505867130594e-07, "loss": 0.6371, "step": 14040 }, { "epoch": 4.568965517241379, "grad_norm": 2.2032628059387207, "learning_rate": 9.691216613506692e-07, "loss": 0.5831, "step": 14045 }, { "epoch": 4.570592062459337, "grad_norm": 2.1163077354431152, "learning_rate": 9.620183459164878e-07, "loss": 0.6174, "step": 14050 }, { "epoch": 4.572218607677294, "grad_norm": 2.1451101303100586, "learning_rate": 9.549406479805818e-07, "loss": 0.5736, "step": 14055 }, { "epoch": 4.573845152895251, "grad_norm": 2.404646158218384, "learning_rate": 9.478885750857285e-07, "loss": 0.5963, "step": 14060 }, { "epoch": 4.5754716981132075, "grad_norm": 3.9412124156951904, "learning_rate": 9.408621347473751e-07, "loss": 0.6251, "step": 14065 }, { "epoch": 4.577098243331164, "grad_norm": 2.258427381515503, "learning_rate": 9.338613344536701e-07, "loss": 0.5948, "step": 14070 }, { "epoch": 4.578724788549121, "grad_norm": 2.4941704273223877, "learning_rate": 9.26886181665429e-07, "loss": 0.6027, "step": 14075 }, { "epoch": 4.580351333767078, "grad_norm": 2.2300431728363037, "learning_rate": 9.199366838161389e-07, "loss": 0.6096, "step": 14080 }, { "epoch": 4.581977878985036, "grad_norm": 2.5056817531585693, "learning_rate": 9.130128483119366e-07, "loss": 0.6497, "step": 14085 }, { "epoch": 4.583604424202993, "grad_norm": 2.3211681842803955, "learning_rate": 9.06114682531628e-07, "loss": 0.5903, "step": 14090 }, { "epoch": 4.58523096942095, "grad_norm": 2.2497894763946533, "learning_rate": 8.992421938266438e-07, "loss": 0.6077, "step": 14095 }, { "epoch": 4.586857514638907, "grad_norm": 2.4197893142700195, "learning_rate": 8.923953895210612e-07, "loss": 0.6446, "step": 14100 }, { "epoch": 4.588484059856864, "grad_norm": 2.1274402141571045, "learning_rate": 8.855742769115799e-07, "loss": 0.6106, "step": 14105 }, { "epoch": 4.5901106050748215, "grad_norm": 2.331447124481201, "learning_rate": 8.787788632675293e-07, "loss": 0.6173, "step": 14110 }, { "epoch": 4.591737150292778, "grad_norm": 2.1276087760925293, "learning_rate": 8.720091558308357e-07, "loss": 0.5834, "step": 14115 }, { "epoch": 4.593363695510735, "grad_norm": 2.276212453842163, "learning_rate": 8.652651618160424e-07, "loss": 0.561, "step": 14120 }, { "epoch": 4.594990240728692, "grad_norm": 2.1035265922546387, "learning_rate": 8.58546888410286e-07, "loss": 0.5967, "step": 14125 }, { "epoch": 4.596616785946649, "grad_norm": 2.2376317977905273, "learning_rate": 8.51854342773295e-07, "loss": 0.5995, "step": 14130 }, { "epoch": 4.598243331164606, "grad_norm": 2.7903850078582764, "learning_rate": 8.451875320373698e-07, "loss": 0.6026, "step": 14135 }, { "epoch": 4.599869876382563, "grad_norm": 2.252089262008667, "learning_rate": 8.385464633074019e-07, "loss": 0.6072, "step": 14140 }, { "epoch": 4.601496421600521, "grad_norm": 2.2325665950775146, "learning_rate": 8.319311436608301e-07, "loss": 0.6099, "step": 14145 }, { "epoch": 4.603122966818478, "grad_norm": 2.1295957565307617, "learning_rate": 8.25341580147665e-07, "loss": 0.5926, "step": 14150 }, { "epoch": 4.604749512036435, "grad_norm": 2.0349044799804688, "learning_rate": 8.187777797904639e-07, "loss": 0.6037, "step": 14155 }, { "epoch": 4.6063760572543915, "grad_norm": 2.2015886306762695, "learning_rate": 8.122397495843343e-07, "loss": 0.6203, "step": 14160 }, { "epoch": 4.608002602472348, "grad_norm": 2.2389609813690186, "learning_rate": 8.057274964969108e-07, "loss": 0.6152, "step": 14165 }, { "epoch": 4.609629147690306, "grad_norm": 2.4615068435668945, "learning_rate": 7.992410274683615e-07, "loss": 0.5913, "step": 14170 }, { "epoch": 4.611255692908263, "grad_norm": 2.589155912399292, "learning_rate": 7.927803494113761e-07, "loss": 0.5855, "step": 14175 }, { "epoch": 4.61288223812622, "grad_norm": 2.780627965927124, "learning_rate": 7.863454692111583e-07, "loss": 0.6109, "step": 14180 }, { "epoch": 4.614508783344177, "grad_norm": 2.3839402198791504, "learning_rate": 7.799363937254195e-07, "loss": 0.6059, "step": 14185 }, { "epoch": 4.616135328562134, "grad_norm": 2.354104518890381, "learning_rate": 7.735531297843713e-07, "loss": 0.6084, "step": 14190 }, { "epoch": 4.617761873780091, "grad_norm": 2.4791300296783447, "learning_rate": 7.671956841907218e-07, "loss": 0.597, "step": 14195 }, { "epoch": 4.6193884189980485, "grad_norm": 2.4160773754119873, "learning_rate": 7.60864063719649e-07, "loss": 0.6093, "step": 14200 }, { "epoch": 4.621014964216005, "grad_norm": 2.403437376022339, "learning_rate": 7.545582751188274e-07, "loss": 0.5975, "step": 14205 }, { "epoch": 4.622641509433962, "grad_norm": 2.3289949893951416, "learning_rate": 7.482783251083869e-07, "loss": 0.5875, "step": 14210 }, { "epoch": 4.624268054651919, "grad_norm": 2.0737967491149902, "learning_rate": 7.420242203809325e-07, "loss": 0.5932, "step": 14215 }, { "epoch": 4.625894599869876, "grad_norm": 2.1003692150115967, "learning_rate": 7.357959676015214e-07, "loss": 0.5966, "step": 14220 }, { "epoch": 4.627521145087833, "grad_norm": 3.097031831741333, "learning_rate": 7.295935734076609e-07, "loss": 0.6009, "step": 14225 }, { "epoch": 4.629147690305791, "grad_norm": 2.105490207672119, "learning_rate": 7.234170444092942e-07, "loss": 0.5875, "step": 14230 }, { "epoch": 4.630774235523748, "grad_norm": 2.537712335586548, "learning_rate": 7.172663871888113e-07, "loss": 0.6117, "step": 14235 }, { "epoch": 4.632400780741705, "grad_norm": 2.247049570083618, "learning_rate": 7.11141608301022e-07, "loss": 0.6105, "step": 14240 }, { "epoch": 4.634027325959662, "grad_norm": 2.3542373180389404, "learning_rate": 7.050427142731547e-07, "loss": 0.6041, "step": 14245 }, { "epoch": 4.6356538711776185, "grad_norm": 2.3531222343444824, "learning_rate": 6.989697116048633e-07, "loss": 0.6171, "step": 14250 }, { "epoch": 4.637280416395575, "grad_norm": 2.226870536804199, "learning_rate": 6.929226067682037e-07, "loss": 0.5884, "step": 14255 }, { "epoch": 4.638906961613533, "grad_norm": 3.1148924827575684, "learning_rate": 6.86901406207624e-07, "loss": 0.6381, "step": 14260 }, { "epoch": 4.64053350683149, "grad_norm": 1.998335599899292, "learning_rate": 6.809061163399827e-07, "loss": 0.5953, "step": 14265 }, { "epoch": 4.642160052049447, "grad_norm": 2.1844255924224854, "learning_rate": 6.749367435545024e-07, "loss": 0.5795, "step": 14270 }, { "epoch": 4.643786597267404, "grad_norm": 2.270549774169922, "learning_rate": 6.689932942128108e-07, "loss": 0.5996, "step": 14275 }, { "epoch": 4.645413142485361, "grad_norm": 2.049288511276245, "learning_rate": 6.630757746488886e-07, "loss": 0.6136, "step": 14280 }, { "epoch": 4.647039687703318, "grad_norm": 2.2355549335479736, "learning_rate": 6.571841911690968e-07, "loss": 0.6488, "step": 14285 }, { "epoch": 4.648666232921276, "grad_norm": 3.0274786949157715, "learning_rate": 6.513185500521463e-07, "loss": 0.5846, "step": 14290 }, { "epoch": 4.6502927781392325, "grad_norm": 2.0298526287078857, "learning_rate": 6.454788575491061e-07, "loss": 0.5942, "step": 14295 }, { "epoch": 4.651919323357189, "grad_norm": 2.31168794631958, "learning_rate": 6.396651198833897e-07, "loss": 0.5852, "step": 14300 }, { "epoch": 4.653545868575146, "grad_norm": 2.6897263526916504, "learning_rate": 6.338773432507494e-07, "loss": 0.6254, "step": 14305 }, { "epoch": 4.655172413793103, "grad_norm": 2.1427431106567383, "learning_rate": 6.281155338192762e-07, "loss": 0.5588, "step": 14310 }, { "epoch": 4.656798959011061, "grad_norm": 4.211636543273926, "learning_rate": 6.223796977293777e-07, "loss": 0.5825, "step": 14315 }, { "epoch": 4.658425504229018, "grad_norm": 2.302633047103882, "learning_rate": 6.166698410937949e-07, "loss": 0.6081, "step": 14320 }, { "epoch": 4.660052049446975, "grad_norm": 2.098952054977417, "learning_rate": 6.109859699975684e-07, "loss": 0.5957, "step": 14325 }, { "epoch": 4.661678594664932, "grad_norm": 2.4573733806610107, "learning_rate": 6.053280904980557e-07, "loss": 0.5685, "step": 14330 }, { "epoch": 4.663305139882889, "grad_norm": 2.3456218242645264, "learning_rate": 5.996962086249058e-07, "loss": 0.6169, "step": 14335 }, { "epoch": 4.6649316851008455, "grad_norm": 2.059295415878296, "learning_rate": 5.940903303800705e-07, "loss": 0.5839, "step": 14340 }, { "epoch": 4.6665582303188025, "grad_norm": 2.1461198329925537, "learning_rate": 5.885104617377873e-07, "loss": 0.6013, "step": 14345 }, { "epoch": 4.66818477553676, "grad_norm": 2.2912466526031494, "learning_rate": 5.829566086445721e-07, "loss": 0.63, "step": 14350 }, { "epoch": 4.669811320754717, "grad_norm": 2.241825819015503, "learning_rate": 5.774287770192149e-07, "loss": 0.596, "step": 14355 }, { "epoch": 4.671437865972674, "grad_norm": 1.978074550628662, "learning_rate": 5.719269727527843e-07, "loss": 0.5816, "step": 14360 }, { "epoch": 4.673064411190631, "grad_norm": 2.453749895095825, "learning_rate": 5.664512017085926e-07, "loss": 0.6131, "step": 14365 }, { "epoch": 4.674690956408588, "grad_norm": 2.2468841075897217, "learning_rate": 5.610014697222249e-07, "loss": 0.6201, "step": 14370 }, { "epoch": 4.676317501626546, "grad_norm": 2.2596335411071777, "learning_rate": 5.555777826015129e-07, "loss": 0.6454, "step": 14375 }, { "epoch": 4.677944046844503, "grad_norm": 2.49043607711792, "learning_rate": 5.501801461265304e-07, "loss": 0.5676, "step": 14380 }, { "epoch": 4.6795705920624595, "grad_norm": 2.2760651111602783, "learning_rate": 5.448085660495816e-07, "loss": 0.5935, "step": 14385 }, { "epoch": 4.681197137280416, "grad_norm": 1.9554202556610107, "learning_rate": 5.394630480952178e-07, "loss": 0.5756, "step": 14390 }, { "epoch": 4.682823682498373, "grad_norm": 2.103641986846924, "learning_rate": 5.341435979601988e-07, "loss": 0.5702, "step": 14395 }, { "epoch": 4.68445022771633, "grad_norm": 5.318946361541748, "learning_rate": 5.288502213135149e-07, "loss": 0.6155, "step": 14400 }, { "epoch": 4.686076772934287, "grad_norm": 2.606043815612793, "learning_rate": 5.235829237963646e-07, "loss": 0.6106, "step": 14405 }, { "epoch": 4.687703318152245, "grad_norm": 2.7256171703338623, "learning_rate": 5.183417110221606e-07, "loss": 0.6194, "step": 14410 }, { "epoch": 4.689329863370202, "grad_norm": 2.1221957206726074, "learning_rate": 5.131265885765041e-07, "loss": 0.6355, "step": 14415 }, { "epoch": 4.690956408588159, "grad_norm": 2.3277482986450195, "learning_rate": 5.07937562017205e-07, "loss": 0.5896, "step": 14420 }, { "epoch": 4.692582953806116, "grad_norm": 2.2037417888641357, "learning_rate": 5.027746368742536e-07, "loss": 0.6172, "step": 14425 }, { "epoch": 4.694209499024073, "grad_norm": 2.131485939025879, "learning_rate": 4.976378186498293e-07, "loss": 0.6077, "step": 14430 }, { "epoch": 4.69583604424203, "grad_norm": 2.2293078899383545, "learning_rate": 4.925271128182807e-07, "loss": 0.5721, "step": 14435 }, { "epoch": 4.697462589459987, "grad_norm": 2.614671468734741, "learning_rate": 4.874425248261428e-07, "loss": 0.6042, "step": 14440 }, { "epoch": 4.699089134677944, "grad_norm": 2.4178030490875244, "learning_rate": 4.823840600921003e-07, "loss": 0.5984, "step": 14445 }, { "epoch": 4.700715679895901, "grad_norm": 2.6082773208618164, "learning_rate": 4.773517240070108e-07, "loss": 0.5976, "step": 14450 }, { "epoch": 4.702342225113858, "grad_norm": 2.058941602706909, "learning_rate": 4.7234552193387846e-07, "loss": 0.6062, "step": 14455 }, { "epoch": 4.703968770331815, "grad_norm": 2.080263614654541, "learning_rate": 4.67365459207858e-07, "loss": 0.5993, "step": 14460 }, { "epoch": 4.705595315549772, "grad_norm": 2.656578779220581, "learning_rate": 4.624115411362512e-07, "loss": 0.6043, "step": 14465 }, { "epoch": 4.70722186076773, "grad_norm": 2.144035577774048, "learning_rate": 4.5748377299849045e-07, "loss": 0.613, "step": 14470 }, { "epoch": 4.7088484059856865, "grad_norm": 2.266422748565674, "learning_rate": 4.525821600461472e-07, "loss": 0.5928, "step": 14475 }, { "epoch": 4.7104749512036435, "grad_norm": 2.155153512954712, "learning_rate": 4.477067075029123e-07, "loss": 0.6138, "step": 14480 }, { "epoch": 4.7121014964216, "grad_norm": 2.283352851867676, "learning_rate": 4.428574205646047e-07, "loss": 0.593, "step": 14485 }, { "epoch": 4.713728041639557, "grad_norm": 2.3388490676879883, "learning_rate": 4.3803430439915137e-07, "loss": 0.5753, "step": 14490 }, { "epoch": 4.715354586857515, "grad_norm": 1.8969876766204834, "learning_rate": 4.332373641465909e-07, "loss": 0.5931, "step": 14495 }, { "epoch": 4.716981132075472, "grad_norm": 2.4692556858062744, "learning_rate": 4.284666049190644e-07, "loss": 0.5988, "step": 14500 }, { "epoch": 4.718607677293429, "grad_norm": 2.4041595458984375, "learning_rate": 4.2372203180081893e-07, "loss": 0.6155, "step": 14505 }, { "epoch": 4.720234222511386, "grad_norm": 2.1963233947753906, "learning_rate": 4.1900364984818754e-07, "loss": 0.6022, "step": 14510 }, { "epoch": 4.721860767729343, "grad_norm": 2.479945421218872, "learning_rate": 4.143114640895951e-07, "loss": 0.613, "step": 14515 }, { "epoch": 4.7234873129473, "grad_norm": 2.1995420455932617, "learning_rate": 4.0964547952554443e-07, "loss": 0.5877, "step": 14520 }, { "epoch": 4.7251138581652565, "grad_norm": 2.2108662128448486, "learning_rate": 4.05005701128619e-07, "loss": 0.6426, "step": 14525 }, { "epoch": 4.726740403383214, "grad_norm": 1.9551113843917847, "learning_rate": 4.0039213384347187e-07, "loss": 0.6173, "step": 14530 }, { "epoch": 4.728366948601171, "grad_norm": 2.2449254989624023, "learning_rate": 3.958047825868283e-07, "loss": 0.5923, "step": 14535 }, { "epoch": 4.729993493819128, "grad_norm": 2.076014518737793, "learning_rate": 3.912436522474666e-07, "loss": 0.6095, "step": 14540 }, { "epoch": 4.731620039037085, "grad_norm": 2.1760308742523193, "learning_rate": 3.867087476862291e-07, "loss": 0.6125, "step": 14545 }, { "epoch": 4.733246584255042, "grad_norm": 2.2484352588653564, "learning_rate": 3.822000737360026e-07, "loss": 0.5837, "step": 14550 }, { "epoch": 4.734873129473, "grad_norm": 2.1155903339385986, "learning_rate": 3.777176352017242e-07, "loss": 0.6028, "step": 14555 }, { "epoch": 4.736499674690957, "grad_norm": 2.0362231731414795, "learning_rate": 3.7326143686036706e-07, "loss": 0.6188, "step": 14560 }, { "epoch": 4.738126219908914, "grad_norm": 2.6431407928466797, "learning_rate": 3.6883148346094356e-07, "loss": 0.6163, "step": 14565 }, { "epoch": 4.7397527651268705, "grad_norm": 2.1116814613342285, "learning_rate": 3.644277797244966e-07, "loss": 0.6308, "step": 14570 }, { "epoch": 4.741379310344827, "grad_norm": 3.778200149536133, "learning_rate": 3.600503303440972e-07, "loss": 0.6118, "step": 14575 }, { "epoch": 4.743005855562784, "grad_norm": 2.127812147140503, "learning_rate": 3.556991399848275e-07, "loss": 0.5945, "step": 14580 }, { "epoch": 4.744632400780741, "grad_norm": 2.4827187061309814, "learning_rate": 3.5137421328379493e-07, "loss": 0.5995, "step": 14585 }, { "epoch": 4.746258945998699, "grad_norm": 2.650378465652466, "learning_rate": 3.4707555485011533e-07, "loss": 0.6254, "step": 14590 }, { "epoch": 4.747885491216656, "grad_norm": 2.0619797706604004, "learning_rate": 3.4280316926490196e-07, "loss": 0.5765, "step": 14595 }, { "epoch": 4.749512036434613, "grad_norm": 2.2239131927490234, "learning_rate": 3.385570610812794e-07, "loss": 0.6025, "step": 14600 }, { "epoch": 4.75113858165257, "grad_norm": 2.138410806655884, "learning_rate": 3.3433723482436676e-07, "loss": 0.5962, "step": 14605 }, { "epoch": 4.752765126870527, "grad_norm": 2.1161324977874756, "learning_rate": 3.3014369499126675e-07, "loss": 0.5921, "step": 14610 }, { "epoch": 4.7543916720884845, "grad_norm": 2.4537761211395264, "learning_rate": 3.259764460510767e-07, "loss": 0.5996, "step": 14615 }, { "epoch": 4.756018217306441, "grad_norm": 2.0539655685424805, "learning_rate": 3.218354924448719e-07, "loss": 0.5926, "step": 14620 }, { "epoch": 4.757644762524398, "grad_norm": 1.9618831872940063, "learning_rate": 3.177208385857028e-07, "loss": 0.5867, "step": 14625 }, { "epoch": 4.759271307742355, "grad_norm": 3.0551681518554688, "learning_rate": 3.1363248885859506e-07, "loss": 0.5933, "step": 14630 }, { "epoch": 4.760897852960312, "grad_norm": 2.0586209297180176, "learning_rate": 3.0957044762054133e-07, "loss": 0.6, "step": 14635 }, { "epoch": 4.762524398178269, "grad_norm": 2.6311097145080566, "learning_rate": 3.055347192004954e-07, "loss": 0.628, "step": 14640 }, { "epoch": 4.764150943396227, "grad_norm": 2.4645638465881348, "learning_rate": 3.0152530789936963e-07, "loss": 0.5861, "step": 14645 }, { "epoch": 4.765777488614184, "grad_norm": 2.747060775756836, "learning_rate": 2.9754221799003503e-07, "loss": 0.6072, "step": 14650 }, { "epoch": 4.767404033832141, "grad_norm": 2.1690328121185303, "learning_rate": 2.9358545371729883e-07, "loss": 0.5802, "step": 14655 }, { "epoch": 4.7690305790500975, "grad_norm": 2.304227828979492, "learning_rate": 2.8965501929792695e-07, "loss": 0.6088, "step": 14660 }, { "epoch": 4.770657124268054, "grad_norm": 2.5094473361968994, "learning_rate": 2.857509189206187e-07, "loss": 0.6202, "step": 14665 }, { "epoch": 4.772283669486011, "grad_norm": 2.518124580383301, "learning_rate": 2.818731567460098e-07, "loss": 0.6064, "step": 14670 }, { "epoch": 4.773910214703969, "grad_norm": 2.467226266860962, "learning_rate": 2.7802173690666676e-07, "loss": 0.5852, "step": 14675 }, { "epoch": 4.775536759921926, "grad_norm": 2.188849925994873, "learning_rate": 2.741966635070842e-07, "loss": 0.5984, "step": 14680 }, { "epoch": 4.777163305139883, "grad_norm": 2.122866153717041, "learning_rate": 2.7039794062367616e-07, "loss": 0.5949, "step": 14685 }, { "epoch": 4.77878985035784, "grad_norm": 2.236489772796631, "learning_rate": 2.6662557230477667e-07, "loss": 0.5542, "step": 14690 }, { "epoch": 4.780416395575797, "grad_norm": 2.2290842533111572, "learning_rate": 2.6287956257063374e-07, "loss": 0.5766, "step": 14695 }, { "epoch": 4.782042940793754, "grad_norm": 2.478623151779175, "learning_rate": 2.5915991541340667e-07, "loss": 0.612, "step": 14700 }, { "epoch": 4.7836694860117115, "grad_norm": 2.169400691986084, "learning_rate": 2.5546663479715236e-07, "loss": 0.6124, "step": 14705 }, { "epoch": 4.785296031229668, "grad_norm": 2.0891544818878174, "learning_rate": 2.5179972465784186e-07, "loss": 0.6029, "step": 14710 }, { "epoch": 4.786922576447625, "grad_norm": 2.379319667816162, "learning_rate": 2.481591889033269e-07, "loss": 0.579, "step": 14715 }, { "epoch": 4.788549121665582, "grad_norm": 1.8330259323120117, "learning_rate": 2.4454503141336513e-07, "loss": 0.5935, "step": 14720 }, { "epoch": 4.790175666883539, "grad_norm": 2.2244858741760254, "learning_rate": 2.409572560395951e-07, "loss": 0.635, "step": 14725 }, { "epoch": 4.791802212101496, "grad_norm": 4.858855724334717, "learning_rate": 2.3739586660554148e-07, "loss": 0.5882, "step": 14730 }, { "epoch": 4.793428757319454, "grad_norm": 1.9892836809158325, "learning_rate": 2.338608669066128e-07, "loss": 0.5708, "step": 14735 }, { "epoch": 4.795055302537411, "grad_norm": 2.171619176864624, "learning_rate": 2.3035226071008997e-07, "loss": 0.6061, "step": 14740 }, { "epoch": 4.796681847755368, "grad_norm": 2.2100746631622314, "learning_rate": 2.2687005175512642e-07, "loss": 0.5932, "step": 14745 }, { "epoch": 4.798308392973325, "grad_norm": 2.541048765182495, "learning_rate": 2.2341424375274256e-07, "loss": 0.5977, "step": 14750 }, { "epoch": 4.7999349381912815, "grad_norm": 2.1693179607391357, "learning_rate": 2.1998484038582567e-07, "loss": 0.5917, "step": 14755 }, { "epoch": 4.801561483409239, "grad_norm": 2.8264858722686768, "learning_rate": 2.165818453091245e-07, "loss": 0.6237, "step": 14760 }, { "epoch": 4.803188028627196, "grad_norm": 3.8728349208831787, "learning_rate": 2.1320526214924086e-07, "loss": 0.5953, "step": 14765 }, { "epoch": 4.804814573845153, "grad_norm": 2.0309038162231445, "learning_rate": 2.098550945046268e-07, "loss": 0.6002, "step": 14770 }, { "epoch": 4.80644111906311, "grad_norm": 2.2680821418762207, "learning_rate": 2.0653134594559586e-07, "loss": 0.5958, "step": 14775 }, { "epoch": 4.808067664281067, "grad_norm": 2.4042727947235107, "learning_rate": 2.0323402001428682e-07, "loss": 0.6018, "step": 14780 }, { "epoch": 4.809694209499024, "grad_norm": 2.323235511779785, "learning_rate": 1.999631202246971e-07, "loss": 0.6215, "step": 14785 }, { "epoch": 4.811320754716981, "grad_norm": 10.879158020019531, "learning_rate": 1.9671865006265223e-07, "loss": 0.6268, "step": 14790 }, { "epoch": 4.8129472999349385, "grad_norm": 2.1472039222717285, "learning_rate": 1.935006129858169e-07, "loss": 0.6221, "step": 14795 }, { "epoch": 4.814573845152895, "grad_norm": 2.1684577465057373, "learning_rate": 1.9030901242367837e-07, "loss": 0.5702, "step": 14800 }, { "epoch": 4.816200390370852, "grad_norm": 2.1772913932800293, "learning_rate": 1.8714385177756032e-07, "loss": 0.592, "step": 14805 }, { "epoch": 4.817826935588809, "grad_norm": 2.1808815002441406, "learning_rate": 1.8400513442059786e-07, "loss": 0.5781, "step": 14810 }, { "epoch": 4.819453480806766, "grad_norm": 2.3493869304656982, "learning_rate": 1.8089286369775415e-07, "loss": 0.6637, "step": 14815 }, { "epoch": 4.821080026024724, "grad_norm": 2.1450183391571045, "learning_rate": 1.7780704292580107e-07, "loss": 0.5833, "step": 14820 }, { "epoch": 4.822706571242681, "grad_norm": 2.0887880325317383, "learning_rate": 1.7474767539333302e-07, "loss": 0.6156, "step": 14825 }, { "epoch": 4.824333116460638, "grad_norm": 2.41434383392334, "learning_rate": 1.717147643607392e-07, "loss": 0.603, "step": 14830 }, { "epoch": 4.825959661678595, "grad_norm": 2.320181369781494, "learning_rate": 1.687083130602257e-07, "loss": 0.6061, "step": 14835 }, { "epoch": 4.827586206896552, "grad_norm": 2.6415679454803467, "learning_rate": 1.6572832469579357e-07, "loss": 0.6161, "step": 14840 }, { "epoch": 4.8292127521145085, "grad_norm": 2.5681235790252686, "learning_rate": 1.6277480244324127e-07, "loss": 0.5878, "step": 14845 }, { "epoch": 4.830839297332465, "grad_norm": 2.113208055496216, "learning_rate": 1.5984774945017044e-07, "loss": 0.59, "step": 14850 }, { "epoch": 4.832465842550423, "grad_norm": 2.5020604133605957, "learning_rate": 1.5694716883596083e-07, "loss": 0.6092, "step": 14855 }, { "epoch": 4.83409238776838, "grad_norm": 2.2251505851745605, "learning_rate": 1.540730636917953e-07, "loss": 0.6021, "step": 14860 }, { "epoch": 4.835718932986337, "grad_norm": 2.315005302429199, "learning_rate": 1.512254370806293e-07, "loss": 0.6022, "step": 14865 }, { "epoch": 4.837345478204294, "grad_norm": 2.2444543838500977, "learning_rate": 1.4840429203720752e-07, "loss": 0.607, "step": 14870 }, { "epoch": 4.838972023422251, "grad_norm": 2.1719343662261963, "learning_rate": 1.4560963156804997e-07, "loss": 0.6111, "step": 14875 }, { "epoch": 4.840598568640209, "grad_norm": 2.1927409172058105, "learning_rate": 1.4284145865144928e-07, "loss": 0.6029, "step": 14880 }, { "epoch": 4.842225113858166, "grad_norm": 1.9452162981033325, "learning_rate": 1.4009977623747617e-07, "loss": 0.5633, "step": 14885 }, { "epoch": 4.8438516590761225, "grad_norm": 2.175806999206543, "learning_rate": 1.3738458724796288e-07, "loss": 0.6247, "step": 14890 }, { "epoch": 4.845478204294079, "grad_norm": 1.9696511030197144, "learning_rate": 1.3469589457651422e-07, "loss": 0.5686, "step": 14895 }, { "epoch": 4.847104749512036, "grad_norm": 2.063143253326416, "learning_rate": 1.3203370108849644e-07, "loss": 0.5636, "step": 14900 }, { "epoch": 4.848731294729993, "grad_norm": 2.154536724090576, "learning_rate": 1.2939800962103176e-07, "loss": 0.5839, "step": 14905 }, { "epoch": 4.85035783994795, "grad_norm": 1.9221045970916748, "learning_rate": 1.2678882298299833e-07, "loss": 0.5907, "step": 14910 }, { "epoch": 4.851984385165908, "grad_norm": 2.1464858055114746, "learning_rate": 1.2420614395503294e-07, "loss": 0.6062, "step": 14915 }, { "epoch": 4.853610930383865, "grad_norm": 2.47495174407959, "learning_rate": 1.2164997528952004e-07, "loss": 0.5889, "step": 14920 }, { "epoch": 4.855237475601822, "grad_norm": 2.2816359996795654, "learning_rate": 1.1912031971059168e-07, "loss": 0.6131, "step": 14925 }, { "epoch": 4.856864020819779, "grad_norm": 2.2610902786254883, "learning_rate": 1.1661717991412746e-07, "loss": 0.5917, "step": 14930 }, { "epoch": 4.8584905660377355, "grad_norm": 2.1258704662323, "learning_rate": 1.1414055856774075e-07, "loss": 0.6128, "step": 14935 }, { "epoch": 4.860117111255693, "grad_norm": 2.082951784133911, "learning_rate": 1.1169045831079805e-07, "loss": 0.6119, "step": 14940 }, { "epoch": 4.86174365647365, "grad_norm": 2.3723201751708984, "learning_rate": 1.0926688175438571e-07, "loss": 0.5465, "step": 14945 }, { "epoch": 4.863370201691607, "grad_norm": 2.3645992279052734, "learning_rate": 1.0686983148133489e-07, "loss": 0.5899, "step": 14950 }, { "epoch": 4.864996746909564, "grad_norm": 2.1711809635162354, "learning_rate": 1.0449931004620495e-07, "loss": 0.5932, "step": 14955 }, { "epoch": 4.866623292127521, "grad_norm": 2.588343381881714, "learning_rate": 1.0215531997528338e-07, "loss": 0.6134, "step": 14960 }, { "epoch": 4.868249837345478, "grad_norm": 2.0764243602752686, "learning_rate": 9.983786376657755e-08, "loss": 0.5932, "step": 14965 }, { "epoch": 4.869876382563435, "grad_norm": 2.267519235610962, "learning_rate": 9.754694388982854e-08, "loss": 0.6049, "step": 14970 }, { "epoch": 4.871502927781393, "grad_norm": 2.4809203147888184, "learning_rate": 9.52825627864834e-08, "loss": 0.6334, "step": 14975 }, { "epoch": 4.8731294729993495, "grad_norm": 2.4457435607910156, "learning_rate": 9.304472286971733e-08, "loss": 0.5986, "step": 14980 }, { "epoch": 4.874756018217306, "grad_norm": 11.295981407165527, "learning_rate": 9.083342652441706e-08, "loss": 0.5956, "step": 14985 }, { "epoch": 4.876382563435263, "grad_norm": 1.9970314502716064, "learning_rate": 8.864867610718363e-08, "loss": 0.6116, "step": 14990 }, { "epoch": 4.87800910865322, "grad_norm": 4.389446258544922, "learning_rate": 8.649047394632126e-08, "loss": 0.6008, "step": 14995 }, { "epoch": 4.879635653871178, "grad_norm": 2.124135732650757, "learning_rate": 8.435882234184844e-08, "loss": 0.5881, "step": 15000 }, { "epoch": 4.881262199089135, "grad_norm": 2.077362298965454, "learning_rate": 8.225372356548689e-08, "loss": 0.5928, "step": 15005 }, { "epoch": 4.882888744307092, "grad_norm": 2.21614408493042, "learning_rate": 8.017517986065593e-08, "loss": 0.6198, "step": 15010 }, { "epoch": 4.884515289525049, "grad_norm": 2.278759479522705, "learning_rate": 7.812319344248365e-08, "loss": 0.6096, "step": 15015 }, { "epoch": 4.886141834743006, "grad_norm": 2.567445993423462, "learning_rate": 7.609776649778466e-08, "loss": 0.6011, "step": 15020 }, { "epoch": 4.887768379960963, "grad_norm": 2.1544313430786133, "learning_rate": 7.409890118508234e-08, "loss": 0.6016, "step": 15025 }, { "epoch": 4.8893949251789195, "grad_norm": 1.9982365369796753, "learning_rate": 7.212659963458101e-08, "loss": 0.6215, "step": 15030 }, { "epoch": 4.891021470396877, "grad_norm": 2.2209675312042236, "learning_rate": 7.018086394817991e-08, "loss": 0.577, "step": 15035 }, { "epoch": 4.892648015614834, "grad_norm": 2.1577842235565186, "learning_rate": 6.826169619947032e-08, "loss": 0.5865, "step": 15040 }, { "epoch": 4.894274560832791, "grad_norm": 1.9539618492126465, "learning_rate": 6.63690984337273e-08, "loss": 0.5889, "step": 15045 }, { "epoch": 4.895901106050748, "grad_norm": 2.0748941898345947, "learning_rate": 6.45030726679069e-08, "loss": 0.5853, "step": 15050 }, { "epoch": 4.897527651268705, "grad_norm": 2.1897246837615967, "learning_rate": 6.266362089065448e-08, "loss": 0.6013, "step": 15055 }, { "epoch": 4.899154196486663, "grad_norm": 4.679499626159668, "learning_rate": 6.085074506228528e-08, "loss": 0.615, "step": 15060 }, { "epoch": 4.90078074170462, "grad_norm": 2.0859599113464355, "learning_rate": 5.90644471147983e-08, "loss": 0.5736, "step": 15065 }, { "epoch": 4.9024072869225765, "grad_norm": 2.305415153503418, "learning_rate": 5.730472895187355e-08, "loss": 0.6139, "step": 15070 }, { "epoch": 4.9040338321405335, "grad_norm": 2.4055914878845215, "learning_rate": 5.557159244885257e-08, "loss": 0.6172, "step": 15075 }, { "epoch": 4.90566037735849, "grad_norm": 4.010295867919922, "learning_rate": 5.386503945275789e-08, "loss": 0.615, "step": 15080 }, { "epoch": 4.907286922576447, "grad_norm": 1.898116946220398, "learning_rate": 5.2185071782276385e-08, "loss": 0.5578, "step": 15085 }, { "epoch": 4.908913467794405, "grad_norm": 2.1695468425750732, "learning_rate": 5.053169122776757e-08, "loss": 0.6188, "step": 15090 }, { "epoch": 4.910540013012362, "grad_norm": 2.376075506210327, "learning_rate": 4.8904899551255326e-08, "loss": 0.608, "step": 15095 }, { "epoch": 4.912166558230319, "grad_norm": 2.4588913917541504, "learning_rate": 4.730469848642505e-08, "loss": 0.5981, "step": 15100 }, { "epoch": 4.913793103448276, "grad_norm": 2.42128849029541, "learning_rate": 4.573108973862095e-08, "loss": 0.6244, "step": 15105 }, { "epoch": 4.915419648666233, "grad_norm": 2.012868642807007, "learning_rate": 4.418407498485988e-08, "loss": 0.585, "step": 15110 }, { "epoch": 4.91704619388419, "grad_norm": 2.3483657836914062, "learning_rate": 4.2663655873806385e-08, "loss": 0.6012, "step": 15115 }, { "epoch": 4.918672739102147, "grad_norm": 2.9775454998016357, "learning_rate": 4.116983402578656e-08, "loss": 0.5937, "step": 15120 }, { "epoch": 4.920299284320104, "grad_norm": 2.1648337841033936, "learning_rate": 3.9702611032776946e-08, "loss": 0.6045, "step": 15125 }, { "epoch": 4.921925829538061, "grad_norm": 2.8775291442871094, "learning_rate": 3.826198845841289e-08, "loss": 0.5842, "step": 15130 }, { "epoch": 4.923552374756018, "grad_norm": 2.1625959873199463, "learning_rate": 3.684796783798017e-08, "loss": 0.6006, "step": 15135 }, { "epoch": 4.925178919973975, "grad_norm": 2.0495049953460693, "learning_rate": 3.546055067840948e-08, "loss": 0.5935, "step": 15140 }, { "epoch": 4.926805465191932, "grad_norm": 2.2851064205169678, "learning_rate": 3.409973845829029e-08, "loss": 0.6197, "step": 15145 }, { "epoch": 4.92843201040989, "grad_norm": 2.15488338470459, "learning_rate": 3.2765532627845874e-08, "loss": 0.6117, "step": 15150 }, { "epoch": 4.930058555627847, "grad_norm": 1.9095077514648438, "learning_rate": 3.145793460895552e-08, "loss": 0.6169, "step": 15155 }, { "epoch": 4.931685100845804, "grad_norm": 2.2632343769073486, "learning_rate": 3.017694579514063e-08, "loss": 0.638, "step": 15160 }, { "epoch": 4.9333116460637605, "grad_norm": 3.9604759216308594, "learning_rate": 2.8922567551556424e-08, "loss": 0.6118, "step": 15165 }, { "epoch": 4.934938191281717, "grad_norm": 2.236368417739868, "learning_rate": 2.7694801215011333e-08, "loss": 0.6115, "step": 15170 }, { "epoch": 4.936564736499674, "grad_norm": 2.2910823822021484, "learning_rate": 2.6493648093942058e-08, "loss": 0.6125, "step": 15175 }, { "epoch": 4.938191281717632, "grad_norm": 2.1260986328125, "learning_rate": 2.531910946843574e-08, "loss": 0.605, "step": 15180 }, { "epoch": 4.939817826935589, "grad_norm": 2.2453088760375977, "learning_rate": 2.4171186590202233e-08, "loss": 0.6114, "step": 15185 }, { "epoch": 4.941444372153546, "grad_norm": 2.0122714042663574, "learning_rate": 2.3049880682593504e-08, "loss": 0.6324, "step": 15190 }, { "epoch": 4.943070917371503, "grad_norm": 2.417142629623413, "learning_rate": 2.1955192940600887e-08, "loss": 0.6188, "step": 15195 }, { "epoch": 4.94469746258946, "grad_norm": 2.1205084323883057, "learning_rate": 2.0887124530841183e-08, "loss": 0.609, "step": 15200 }, { "epoch": 4.9463240078074175, "grad_norm": 2.5445995330810547, "learning_rate": 1.9845676591559446e-08, "loss": 0.6005, "step": 15205 }, { "epoch": 4.9479505530253745, "grad_norm": 2.3689987659454346, "learning_rate": 1.8830850232645636e-08, "loss": 0.6141, "step": 15210 }, { "epoch": 4.949577098243331, "grad_norm": 2.399352788925171, "learning_rate": 1.7842646535601305e-08, "loss": 0.6107, "step": 15215 }, { "epoch": 4.951203643461288, "grad_norm": 1.9902371168136597, "learning_rate": 1.688106655356736e-08, "loss": 0.5823, "step": 15220 }, { "epoch": 4.952830188679245, "grad_norm": 2.2933974266052246, "learning_rate": 1.5946111311310186e-08, "loss": 0.5969, "step": 15225 }, { "epoch": 4.954456733897202, "grad_norm": 2.3006935119628906, "learning_rate": 1.5037781805218863e-08, "loss": 0.6092, "step": 15230 }, { "epoch": 4.956083279115159, "grad_norm": 1.9131032228469849, "learning_rate": 1.4156079003307953e-08, "loss": 0.5738, "step": 15235 }, { "epoch": 4.957709824333117, "grad_norm": 2.205113172531128, "learning_rate": 1.330100384521471e-08, "loss": 0.6286, "step": 15240 }, { "epoch": 4.959336369551074, "grad_norm": 2.259881019592285, "learning_rate": 1.247255724220464e-08, "loss": 0.5911, "step": 15245 }, { "epoch": 4.960962914769031, "grad_norm": 3.1508960723876953, "learning_rate": 1.1670740077157626e-08, "loss": 0.6078, "step": 15250 }, { "epoch": 4.9625894599869875, "grad_norm": 2.3667948246002197, "learning_rate": 1.0895553204579024e-08, "loss": 0.6068, "step": 15255 }, { "epoch": 4.964216005204944, "grad_norm": 2.3552937507629395, "learning_rate": 1.0146997450591333e-08, "loss": 0.6335, "step": 15260 }, { "epoch": 4.965842550422902, "grad_norm": 2.3416175842285156, "learning_rate": 9.42507361293976e-09, "loss": 0.6005, "step": 15265 }, { "epoch": 4.967469095640859, "grad_norm": 2.283876419067383, "learning_rate": 8.729782460981106e-09, "loss": 0.5736, "step": 15270 }, { "epoch": 4.969095640858816, "grad_norm": 2.2261712551116943, "learning_rate": 8.061124735697645e-09, "loss": 0.5841, "step": 15275 }, { "epoch": 4.970722186076773, "grad_norm": 2.4273598194122314, "learning_rate": 7.4191011496832495e-09, "loss": 0.6333, "step": 15280 }, { "epoch": 4.97234873129473, "grad_norm": 2.3165061473846436, "learning_rate": 6.803712387146166e-09, "loss": 0.6144, "step": 15285 }, { "epoch": 4.973975276512687, "grad_norm": 2.3885746002197266, "learning_rate": 6.2149591039145635e-09, "loss": 0.6086, "step": 15290 }, { "epoch": 4.975601821730644, "grad_norm": 2.53422212600708, "learning_rate": 5.652841927425434e-09, "loss": 0.5898, "step": 15295 }, { "epoch": 4.9772283669486015, "grad_norm": 2.120922327041626, "learning_rate": 5.117361456735692e-09, "loss": 0.6188, "step": 15300 }, { "epoch": 4.978854912166558, "grad_norm": 1.9496095180511475, "learning_rate": 4.7081558927991594e-09, "loss": 0.5899, "step": 15305 }, { "epoch": 4.980481457384515, "grad_norm": 2.3398711681365967, "learning_rate": 4.220622911546568e-09, "loss": 0.59, "step": 15310 }, { "epoch": 4.982108002602472, "grad_norm": 2.346818685531616, "learning_rate": 3.759728162422427e-09, "loss": 0.5875, "step": 15315 }, { "epoch": 4.983734547820429, "grad_norm": 2.2996296882629395, "learning_rate": 3.3254721366032805e-09, "loss": 0.5574, "step": 15320 }, { "epoch": 4.985361093038387, "grad_norm": 2.248450756072998, "learning_rate": 2.9178552968800454e-09, "loss": 0.5798, "step": 15325 }, { "epoch": 4.986987638256344, "grad_norm": 2.0372941493988037, "learning_rate": 2.536878077655236e-09, "loss": 0.5893, "step": 15330 }, { "epoch": 4.988614183474301, "grad_norm": 2.27799654006958, "learning_rate": 2.1825408849401873e-09, "loss": 0.6261, "step": 15335 }, { "epoch": 4.990240728692258, "grad_norm": 2.2353765964508057, "learning_rate": 1.8548440963522818e-09, "loss": 0.5939, "step": 15340 }, { "epoch": 4.991867273910215, "grad_norm": 1.94557523727417, "learning_rate": 1.5537880611260491e-09, "loss": 0.6064, "step": 15345 }, { "epoch": 4.9934938191281715, "grad_norm": 2.407500743865967, "learning_rate": 1.2793731000937393e-09, "loss": 0.6106, "step": 15350 }, { "epoch": 4.995120364346128, "grad_norm": 2.1297216415405273, "learning_rate": 1.0315995057075256e-09, "loss": 0.6218, "step": 15355 }, { "epoch": 4.996746909564086, "grad_norm": 2.2248427867889404, "learning_rate": 8.104675420173014e-10, "loss": 0.6253, "step": 15360 }, { "epoch": 4.998373454782043, "grad_norm": 2.113090753555298, "learning_rate": 6.159774446901079e-10, "loss": 0.607, "step": 15365 }, { "epoch": 5.0, "grad_norm": 3.143172025680542, "learning_rate": 4.481294209907061e-10, "loss": 0.5905, "step": 15370 }, { "epoch": 5.0, "eval_f1": 0.8242242626847417, "eval_loss": 0.43017578125, "eval_precision": 0.8247019556006198, "eval_recall": 0.8238432612482821, "eval_runtime": 1050.5502, "eval_samples_per_second": 374.504, "eval_steps_per_second": 0.732, "step": 15370 }, { "epoch": 5.0, "step": 15370, "total_flos": 1.4665425515846828e+19, "train_loss": 0.7460857761162166, "train_runtime": 176390.3873, "train_samples_per_second": 89.219, "train_steps_per_second": 0.087 } ], "logging_steps": 5, "max_steps": 15370, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4665425515846828e+19, "train_batch_size": 512, "trial_name": null, "trial_params": null }