{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2976, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006720430107526882, "grad_norm": 48.78054272427667, "learning_rate": 6.711409395973155e-08, "loss": 2.3825, "step": 1 }, { "epoch": 0.003360215053763441, "grad_norm": 48.47840626370759, "learning_rate": 3.3557046979865777e-07, "loss": 2.376, "step": 5 }, { "epoch": 0.006720430107526882, "grad_norm": 51.80697175894909, "learning_rate": 6.711409395973155e-07, "loss": 2.3226, "step": 10 }, { "epoch": 0.010080645161290322, "grad_norm": 13.624773264265663, "learning_rate": 1.006711409395973e-06, "loss": 2.1053, "step": 15 }, { "epoch": 0.013440860215053764, "grad_norm": 6.262298470265524, "learning_rate": 1.342281879194631e-06, "loss": 1.9272, "step": 20 }, { "epoch": 0.016801075268817203, "grad_norm": 7.548559183569387, "learning_rate": 1.6778523489932889e-06, "loss": 1.7186, "step": 25 }, { "epoch": 0.020161290322580645, "grad_norm": 2.6440968174788626, "learning_rate": 2.013422818791946e-06, "loss": 1.5223, "step": 30 }, { "epoch": 0.023521505376344086, "grad_norm": 3.58492629574647, "learning_rate": 2.348993288590604e-06, "loss": 1.3232, "step": 35 }, { "epoch": 0.026881720430107527, "grad_norm": 1.6886536264742815, "learning_rate": 2.684563758389262e-06, "loss": 1.1928, "step": 40 }, { "epoch": 0.03024193548387097, "grad_norm": 1.217304086312586, "learning_rate": 3.02013422818792e-06, "loss": 1.0767, "step": 45 }, { "epoch": 0.033602150537634407, "grad_norm": 1.1946964624516874, "learning_rate": 3.3557046979865777e-06, "loss": 1.0086, "step": 50 }, { "epoch": 0.03696236559139785, "grad_norm": 1.3676335716854129, "learning_rate": 3.6912751677852355e-06, "loss": 0.9521, "step": 55 }, { "epoch": 0.04032258064516129, "grad_norm": 1.15160172673339, "learning_rate": 4.026845637583892e-06, "loss": 0.9144, "step": 60 }, { "epoch": 0.043682795698924734, "grad_norm": 1.1898281923774547, "learning_rate": 4.362416107382551e-06, "loss": 0.8891, "step": 65 }, { "epoch": 0.04704301075268817, "grad_norm": 1.2290162400825384, "learning_rate": 4.697986577181208e-06, "loss": 0.8684, "step": 70 }, { "epoch": 0.05040322580645161, "grad_norm": 1.1047127902421139, "learning_rate": 5.033557046979867e-06, "loss": 0.852, "step": 75 }, { "epoch": 0.053763440860215055, "grad_norm": 1.3842650719759613, "learning_rate": 5.369127516778524e-06, "loss": 0.8421, "step": 80 }, { "epoch": 0.05712365591397849, "grad_norm": 1.1630054678494466, "learning_rate": 5.704697986577181e-06, "loss": 0.8293, "step": 85 }, { "epoch": 0.06048387096774194, "grad_norm": 1.2895915993551423, "learning_rate": 6.04026845637584e-06, "loss": 0.8155, "step": 90 }, { "epoch": 0.06384408602150538, "grad_norm": 1.3357187881014583, "learning_rate": 6.375838926174497e-06, "loss": 0.8094, "step": 95 }, { "epoch": 0.06720430107526881, "grad_norm": 1.4294338181391608, "learning_rate": 6.711409395973155e-06, "loss": 0.807, "step": 100 }, { "epoch": 0.07056451612903226, "grad_norm": 0.9685428191519895, "learning_rate": 7.046979865771813e-06, "loss": 0.7963, "step": 105 }, { "epoch": 0.0739247311827957, "grad_norm": 1.0685231337896743, "learning_rate": 7.382550335570471e-06, "loss": 0.7903, "step": 110 }, { "epoch": 0.07728494623655914, "grad_norm": 1.3129982717287054, "learning_rate": 7.718120805369127e-06, "loss": 0.7932, "step": 115 }, { "epoch": 0.08064516129032258, "grad_norm": 1.6648712375867527, "learning_rate": 8.053691275167785e-06, "loss": 0.7865, "step": 120 }, { "epoch": 0.08400537634408602, "grad_norm": 1.2705854884448164, "learning_rate": 8.389261744966444e-06, "loss": 0.7799, "step": 125 }, { "epoch": 0.08736559139784947, "grad_norm": 1.3845304392408557, "learning_rate": 8.724832214765101e-06, "loss": 0.7794, "step": 130 }, { "epoch": 0.0907258064516129, "grad_norm": 1.0490487686523224, "learning_rate": 9.060402684563759e-06, "loss": 0.7762, "step": 135 }, { "epoch": 0.09408602150537634, "grad_norm": 1.1349198286002737, "learning_rate": 9.395973154362416e-06, "loss": 0.7704, "step": 140 }, { "epoch": 0.09744623655913978, "grad_norm": 1.311025567661312, "learning_rate": 9.731543624161075e-06, "loss": 0.7624, "step": 145 }, { "epoch": 0.10080645161290322, "grad_norm": 1.0240371131291772, "learning_rate": 1.0067114093959734e-05, "loss": 0.7645, "step": 150 }, { "epoch": 0.10416666666666667, "grad_norm": 1.0576784893797677, "learning_rate": 1.040268456375839e-05, "loss": 0.7612, "step": 155 }, { "epoch": 0.10752688172043011, "grad_norm": 0.9872090722303354, "learning_rate": 1.0738255033557049e-05, "loss": 0.762, "step": 160 }, { "epoch": 0.11088709677419355, "grad_norm": 1.1448278866865749, "learning_rate": 1.1073825503355706e-05, "loss": 0.7595, "step": 165 }, { "epoch": 0.11424731182795698, "grad_norm": 0.9099648734864951, "learning_rate": 1.1409395973154362e-05, "loss": 0.7545, "step": 170 }, { "epoch": 0.11760752688172044, "grad_norm": 1.2385558169257114, "learning_rate": 1.174496644295302e-05, "loss": 0.7559, "step": 175 }, { "epoch": 0.12096774193548387, "grad_norm": 0.9417818627828941, "learning_rate": 1.208053691275168e-05, "loss": 0.7533, "step": 180 }, { "epoch": 0.12432795698924731, "grad_norm": 1.4888584917162164, "learning_rate": 1.2416107382550337e-05, "loss": 0.75, "step": 185 }, { "epoch": 0.12768817204301075, "grad_norm": 1.6094338507124168, "learning_rate": 1.2751677852348994e-05, "loss": 0.7516, "step": 190 }, { "epoch": 0.1310483870967742, "grad_norm": 1.1484532669834084, "learning_rate": 1.3087248322147652e-05, "loss": 0.7528, "step": 195 }, { "epoch": 0.13440860215053763, "grad_norm": 0.9606481759674919, "learning_rate": 1.342281879194631e-05, "loss": 0.7432, "step": 200 }, { "epoch": 0.13776881720430106, "grad_norm": 0.90666325947728, "learning_rate": 1.3758389261744966e-05, "loss": 0.7456, "step": 205 }, { "epoch": 0.14112903225806453, "grad_norm": 0.9003130723406445, "learning_rate": 1.4093959731543626e-05, "loss": 0.741, "step": 210 }, { "epoch": 0.14448924731182797, "grad_norm": 0.9046243900141376, "learning_rate": 1.4429530201342283e-05, "loss": 0.7413, "step": 215 }, { "epoch": 0.1478494623655914, "grad_norm": 0.9495916559973592, "learning_rate": 1.4765100671140942e-05, "loss": 0.7396, "step": 220 }, { "epoch": 0.15120967741935484, "grad_norm": 0.8392501968484852, "learning_rate": 1.5100671140939598e-05, "loss": 0.7451, "step": 225 }, { "epoch": 0.15456989247311828, "grad_norm": 1.0111465864476745, "learning_rate": 1.5436241610738255e-05, "loss": 0.7434, "step": 230 }, { "epoch": 0.15793010752688172, "grad_norm": 0.9025884088477472, "learning_rate": 1.5771812080536916e-05, "loss": 0.7354, "step": 235 }, { "epoch": 0.16129032258064516, "grad_norm": 0.9720296761608475, "learning_rate": 1.610738255033557e-05, "loss": 0.7371, "step": 240 }, { "epoch": 0.1646505376344086, "grad_norm": 0.9947001241550435, "learning_rate": 1.644295302013423e-05, "loss": 0.7288, "step": 245 }, { "epoch": 0.16801075268817203, "grad_norm": 0.9174436365138352, "learning_rate": 1.6778523489932888e-05, "loss": 0.7347, "step": 250 }, { "epoch": 0.17137096774193547, "grad_norm": 1.0267733434401298, "learning_rate": 1.7114093959731545e-05, "loss": 0.7375, "step": 255 }, { "epoch": 0.17473118279569894, "grad_norm": 0.8397895938406165, "learning_rate": 1.7449664429530202e-05, "loss": 0.7341, "step": 260 }, { "epoch": 0.17809139784946237, "grad_norm": 0.76727099531327, "learning_rate": 1.778523489932886e-05, "loss": 0.7309, "step": 265 }, { "epoch": 0.1814516129032258, "grad_norm": 0.7412065721447615, "learning_rate": 1.8120805369127517e-05, "loss": 0.7269, "step": 270 }, { "epoch": 0.18481182795698925, "grad_norm": 0.7810498280270748, "learning_rate": 1.8456375838926174e-05, "loss": 0.7257, "step": 275 }, { "epoch": 0.1881720430107527, "grad_norm": 0.8613709749529885, "learning_rate": 1.8791946308724832e-05, "loss": 0.7315, "step": 280 }, { "epoch": 0.19153225806451613, "grad_norm": 0.9739010230796492, "learning_rate": 1.9127516778523493e-05, "loss": 0.7272, "step": 285 }, { "epoch": 0.19489247311827956, "grad_norm": 0.8646191524355271, "learning_rate": 1.946308724832215e-05, "loss": 0.7289, "step": 290 }, { "epoch": 0.198252688172043, "grad_norm": 0.9363534800514449, "learning_rate": 1.9798657718120807e-05, "loss": 0.7258, "step": 295 }, { "epoch": 0.20161290322580644, "grad_norm": 0.9701273350687407, "learning_rate": 1.9999972476199807e-05, "loss": 0.7317, "step": 300 }, { "epoch": 0.2049731182795699, "grad_norm": 0.7773609368388674, "learning_rate": 1.999966283518764e-05, "loss": 0.7227, "step": 305 }, { "epoch": 0.20833333333333334, "grad_norm": 0.8956665591253581, "learning_rate": 1.999900915910167e-05, "loss": 0.7231, "step": 310 }, { "epoch": 0.21169354838709678, "grad_norm": 0.6269021652498205, "learning_rate": 1.99980114704314e-05, "loss": 0.7216, "step": 315 }, { "epoch": 0.21505376344086022, "grad_norm": 1.011329580641999, "learning_rate": 1.9996669803501982e-05, "loss": 0.7276, "step": 320 }, { "epoch": 0.21841397849462366, "grad_norm": 1.080429434985579, "learning_rate": 1.999498420447302e-05, "loss": 0.7236, "step": 325 }, { "epoch": 0.2217741935483871, "grad_norm": 0.8583297562008422, "learning_rate": 1.9992954731336992e-05, "loss": 0.7225, "step": 330 }, { "epoch": 0.22513440860215053, "grad_norm": 0.8080172656804933, "learning_rate": 1.9990581453917234e-05, "loss": 0.7209, "step": 335 }, { "epoch": 0.22849462365591397, "grad_norm": 0.8049987876429506, "learning_rate": 1.998786445386558e-05, "loss": 0.7178, "step": 340 }, { "epoch": 0.2318548387096774, "grad_norm": 0.7233007018856564, "learning_rate": 1.9984803824659504e-05, "loss": 0.7181, "step": 345 }, { "epoch": 0.23521505376344087, "grad_norm": 0.6435752626797113, "learning_rate": 1.998139967159894e-05, "loss": 0.7124, "step": 350 }, { "epoch": 0.2385752688172043, "grad_norm": 0.6988189980904659, "learning_rate": 1.997765211180264e-05, "loss": 0.7138, "step": 355 }, { "epoch": 0.24193548387096775, "grad_norm": 0.8449192477747994, "learning_rate": 1.9973561274204153e-05, "loss": 0.7164, "step": 360 }, { "epoch": 0.2452956989247312, "grad_norm": 0.6199401123601755, "learning_rate": 1.9969127299547387e-05, "loss": 0.711, "step": 365 }, { "epoch": 0.24865591397849462, "grad_norm": 0.6508335116554123, "learning_rate": 1.9964350340381763e-05, "loss": 0.7099, "step": 370 }, { "epoch": 0.25201612903225806, "grad_norm": 0.7240178041828755, "learning_rate": 1.995923056105697e-05, "loss": 0.7108, "step": 375 }, { "epoch": 0.2553763440860215, "grad_norm": 0.7146640916604361, "learning_rate": 1.9953768137717323e-05, "loss": 0.714, "step": 380 }, { "epoch": 0.25873655913978494, "grad_norm": 0.7235799398485985, "learning_rate": 1.9947963258295665e-05, "loss": 0.7141, "step": 385 }, { "epoch": 0.2620967741935484, "grad_norm": 0.7085278247346875, "learning_rate": 1.9941816122506958e-05, "loss": 0.7123, "step": 390 }, { "epoch": 0.2654569892473118, "grad_norm": 0.8336879205562984, "learning_rate": 1.9935326941841348e-05, "loss": 0.7118, "step": 395 }, { "epoch": 0.26881720430107525, "grad_norm": 0.6472707436647601, "learning_rate": 1.9928495939556952e-05, "loss": 0.7112, "step": 400 }, { "epoch": 0.2721774193548387, "grad_norm": 0.5404676396301624, "learning_rate": 1.9921323350672123e-05, "loss": 0.7023, "step": 405 }, { "epoch": 0.27553763440860213, "grad_norm": 0.5867567569524356, "learning_rate": 1.9913809421957395e-05, "loss": 0.7061, "step": 410 }, { "epoch": 0.27889784946236557, "grad_norm": 0.5310275193870236, "learning_rate": 1.9905954411926992e-05, "loss": 0.7097, "step": 415 }, { "epoch": 0.28225806451612906, "grad_norm": 0.5428907392759953, "learning_rate": 1.9897758590829915e-05, "loss": 0.7046, "step": 420 }, { "epoch": 0.2856182795698925, "grad_norm": 0.6360587388135657, "learning_rate": 1.988922224064067e-05, "loss": 0.7059, "step": 425 }, { "epoch": 0.28897849462365593, "grad_norm": 0.5538060039668187, "learning_rate": 1.988034565504954e-05, "loss": 0.7055, "step": 430 }, { "epoch": 0.2923387096774194, "grad_norm": 0.7074777520721658, "learning_rate": 1.9871129139452502e-05, "loss": 0.7023, "step": 435 }, { "epoch": 0.2956989247311828, "grad_norm": 0.6033039358656708, "learning_rate": 1.986157301094071e-05, "loss": 0.702, "step": 440 }, { "epoch": 0.29905913978494625, "grad_norm": 0.7020301366143326, "learning_rate": 1.9851677598289588e-05, "loss": 0.7072, "step": 445 }, { "epoch": 0.3024193548387097, "grad_norm": 0.6735214131146494, "learning_rate": 1.9841443241947515e-05, "loss": 0.7046, "step": 450 }, { "epoch": 0.3057795698924731, "grad_norm": 0.5846703075665397, "learning_rate": 1.983087029402411e-05, "loss": 0.7004, "step": 455 }, { "epoch": 0.30913978494623656, "grad_norm": 0.558983961781733, "learning_rate": 1.9819959118278144e-05, "loss": 0.7046, "step": 460 }, { "epoch": 0.3125, "grad_norm": 0.6880041036412101, "learning_rate": 1.9808710090104983e-05, "loss": 0.7037, "step": 465 }, { "epoch": 0.31586021505376344, "grad_norm": 0.622425101525314, "learning_rate": 1.9797123596523692e-05, "loss": 0.6984, "step": 470 }, { "epoch": 0.3192204301075269, "grad_norm": 0.6341081550172605, "learning_rate": 1.978520003616374e-05, "loss": 0.6985, "step": 475 }, { "epoch": 0.3225806451612903, "grad_norm": 0.659287705673367, "learning_rate": 1.9772939819251247e-05, "loss": 0.6954, "step": 480 }, { "epoch": 0.32594086021505375, "grad_norm": 0.5715874934602246, "learning_rate": 1.976034336759491e-05, "loss": 0.6988, "step": 485 }, { "epoch": 0.3293010752688172, "grad_norm": 0.8050941273557731, "learning_rate": 1.9747411114571445e-05, "loss": 0.6993, "step": 490 }, { "epoch": 0.3326612903225806, "grad_norm": 0.6666660914014055, "learning_rate": 1.973414350511072e-05, "loss": 0.7018, "step": 495 }, { "epoch": 0.33602150537634407, "grad_norm": 0.5898448667662453, "learning_rate": 1.9720540995680428e-05, "loss": 0.697, "step": 500 }, { "epoch": 0.3393817204301075, "grad_norm": 0.7204191081412994, "learning_rate": 1.9706604054270376e-05, "loss": 0.6966, "step": 505 }, { "epoch": 0.34274193548387094, "grad_norm": 0.6353645462526023, "learning_rate": 1.9692333160376407e-05, "loss": 0.693, "step": 510 }, { "epoch": 0.34610215053763443, "grad_norm": 0.5663739741433981, "learning_rate": 1.967772880498387e-05, "loss": 0.6985, "step": 515 }, { "epoch": 0.34946236559139787, "grad_norm": 0.541549492600514, "learning_rate": 1.9662791490550755e-05, "loss": 0.6933, "step": 520 }, { "epoch": 0.3528225806451613, "grad_norm": 0.5766179836769963, "learning_rate": 1.9647521730990408e-05, "loss": 0.6996, "step": 525 }, { "epoch": 0.35618279569892475, "grad_norm": 0.528070730439876, "learning_rate": 1.9631920051653813e-05, "loss": 0.6967, "step": 530 }, { "epoch": 0.3595430107526882, "grad_norm": 0.590209921037558, "learning_rate": 1.9615986989311567e-05, "loss": 0.6956, "step": 535 }, { "epoch": 0.3629032258064516, "grad_norm": 0.5291818662889545, "learning_rate": 1.9599723092135376e-05, "loss": 0.695, "step": 540 }, { "epoch": 0.36626344086021506, "grad_norm": 0.4970212649639191, "learning_rate": 1.9583128919679218e-05, "loss": 0.6917, "step": 545 }, { "epoch": 0.3696236559139785, "grad_norm": 0.6225127520386264, "learning_rate": 1.956620504286007e-05, "loss": 0.6935, "step": 550 }, { "epoch": 0.37298387096774194, "grad_norm": 0.6214865530844041, "learning_rate": 1.9548952043938286e-05, "loss": 0.6894, "step": 555 }, { "epoch": 0.3763440860215054, "grad_norm": 0.6009400925112226, "learning_rate": 1.9531370516497562e-05, "loss": 0.6888, "step": 560 }, { "epoch": 0.3797043010752688, "grad_norm": 0.6054051937463671, "learning_rate": 1.95134610654245e-05, "loss": 0.6904, "step": 565 }, { "epoch": 0.38306451612903225, "grad_norm": 0.5850716552121316, "learning_rate": 1.9495224306887797e-05, "loss": 0.6924, "step": 570 }, { "epoch": 0.3864247311827957, "grad_norm": 0.5796507375628339, "learning_rate": 1.9476660868317076e-05, "loss": 0.69, "step": 575 }, { "epoch": 0.3897849462365591, "grad_norm": 0.72632335015395, "learning_rate": 1.945777138838126e-05, "loss": 0.6877, "step": 580 }, { "epoch": 0.39314516129032256, "grad_norm": 0.5476131150629957, "learning_rate": 1.943855651696663e-05, "loss": 0.6902, "step": 585 }, { "epoch": 0.396505376344086, "grad_norm": 0.5140382069241621, "learning_rate": 1.941901691515444e-05, "loss": 0.6929, "step": 590 }, { "epoch": 0.39986559139784944, "grad_norm": 0.4839340720292055, "learning_rate": 1.9399153255198193e-05, "loss": 0.694, "step": 595 }, { "epoch": 0.4032258064516129, "grad_norm": 0.4677177786124492, "learning_rate": 1.9378966220500503e-05, "loss": 0.6952, "step": 600 }, { "epoch": 0.40658602150537637, "grad_norm": 0.5207299081688349, "learning_rate": 1.9358456505589586e-05, "loss": 0.6932, "step": 605 }, { "epoch": 0.4099462365591398, "grad_norm": 0.5396909713942541, "learning_rate": 1.933762481609536e-05, "loss": 0.684, "step": 610 }, { "epoch": 0.41330645161290325, "grad_norm": 0.49142294486557386, "learning_rate": 1.9316471868725167e-05, "loss": 0.6948, "step": 615 }, { "epoch": 0.4166666666666667, "grad_norm": 0.5408353090528122, "learning_rate": 1.9294998391239133e-05, "loss": 0.6858, "step": 620 }, { "epoch": 0.4200268817204301, "grad_norm": 0.5233730262407599, "learning_rate": 1.9273205122425108e-05, "loss": 0.6887, "step": 625 }, { "epoch": 0.42338709677419356, "grad_norm": 0.5715799297785072, "learning_rate": 1.925109281207326e-05, "loss": 0.6872, "step": 630 }, { "epoch": 0.426747311827957, "grad_norm": 0.5328961169706027, "learning_rate": 1.922866222095026e-05, "loss": 0.6899, "step": 635 }, { "epoch": 0.43010752688172044, "grad_norm": 0.4887114084575751, "learning_rate": 1.9205914120773146e-05, "loss": 0.6906, "step": 640 }, { "epoch": 0.4334677419354839, "grad_norm": 0.5209483295062493, "learning_rate": 1.9182849294182734e-05, "loss": 0.6879, "step": 645 }, { "epoch": 0.4368279569892473, "grad_norm": 0.5695163664455944, "learning_rate": 1.915946853471671e-05, "loss": 0.6888, "step": 650 }, { "epoch": 0.44018817204301075, "grad_norm": 0.5383685605458758, "learning_rate": 1.913577264678233e-05, "loss": 0.6866, "step": 655 }, { "epoch": 0.4435483870967742, "grad_norm": 0.5016268706035367, "learning_rate": 1.9111762445628738e-05, "loss": 0.6828, "step": 660 }, { "epoch": 0.4469086021505376, "grad_norm": 0.49415639069338374, "learning_rate": 1.908743875731891e-05, "loss": 0.6902, "step": 665 }, { "epoch": 0.45026881720430106, "grad_norm": 0.5339722588835791, "learning_rate": 1.906280241870126e-05, "loss": 0.685, "step": 670 }, { "epoch": 0.4536290322580645, "grad_norm": 0.6037780346276747, "learning_rate": 1.903785427738082e-05, "loss": 0.6837, "step": 675 }, { "epoch": 0.45698924731182794, "grad_norm": 0.5163759102323392, "learning_rate": 1.9012595191690096e-05, "loss": 0.6864, "step": 680 }, { "epoch": 0.4603494623655914, "grad_norm": 0.5540520547432817, "learning_rate": 1.8987026030659527e-05, "loss": 0.6868, "step": 685 }, { "epoch": 0.4637096774193548, "grad_norm": 0.5141253497235312, "learning_rate": 1.8961147673987598e-05, "loss": 0.685, "step": 690 }, { "epoch": 0.46706989247311825, "grad_norm": 0.47087600726350604, "learning_rate": 1.893496101201056e-05, "loss": 0.6879, "step": 695 }, { "epoch": 0.47043010752688175, "grad_norm": 0.4745813202969137, "learning_rate": 1.8908466945671805e-05, "loss": 0.6873, "step": 700 }, { "epoch": 0.4737903225806452, "grad_norm": 0.5512208302577559, "learning_rate": 1.8881666386490874e-05, "loss": 0.6816, "step": 705 }, { "epoch": 0.4771505376344086, "grad_norm": 0.49728239853688544, "learning_rate": 1.8854560256532098e-05, "loss": 0.6839, "step": 710 }, { "epoch": 0.48051075268817206, "grad_norm": 0.529373520548778, "learning_rate": 1.882714948837286e-05, "loss": 0.6824, "step": 715 }, { "epoch": 0.4838709677419355, "grad_norm": 0.5220674061051042, "learning_rate": 1.8799435025071515e-05, "loss": 0.6786, "step": 720 }, { "epoch": 0.48723118279569894, "grad_norm": 0.5130522433448756, "learning_rate": 1.8771417820134963e-05, "loss": 0.6822, "step": 725 }, { "epoch": 0.4905913978494624, "grad_norm": 0.5290689359475228, "learning_rate": 1.8743098837485813e-05, "loss": 0.6865, "step": 730 }, { "epoch": 0.4939516129032258, "grad_norm": 0.6064255629442339, "learning_rate": 1.871447905142925e-05, "loss": 0.6806, "step": 735 }, { "epoch": 0.49731182795698925, "grad_norm": 0.5012647709893052, "learning_rate": 1.868555944661949e-05, "loss": 0.6835, "step": 740 }, { "epoch": 0.5006720430107527, "grad_norm": 0.644030287593773, "learning_rate": 1.865634101802592e-05, "loss": 0.6833, "step": 745 }, { "epoch": 0.5040322580645161, "grad_norm": 0.5147725717983542, "learning_rate": 1.8626824770898856e-05, "loss": 0.6869, "step": 750 }, { "epoch": 0.5073924731182796, "grad_norm": 0.6476710537127688, "learning_rate": 1.859701172073496e-05, "loss": 0.6798, "step": 755 }, { "epoch": 0.510752688172043, "grad_norm": 0.5471024984874042, "learning_rate": 1.856690289324231e-05, "loss": 0.6809, "step": 760 }, { "epoch": 0.5141129032258065, "grad_norm": 0.5523881151273712, "learning_rate": 1.8536499324305102e-05, "loss": 0.6818, "step": 765 }, { "epoch": 0.5174731182795699, "grad_norm": 0.5050686616013327, "learning_rate": 1.8505802059948012e-05, "loss": 0.6843, "step": 770 }, { "epoch": 0.5208333333333334, "grad_norm": 0.5438632142417479, "learning_rate": 1.847481215630021e-05, "loss": 0.685, "step": 775 }, { "epoch": 0.5241935483870968, "grad_norm": 0.4941593674742346, "learning_rate": 1.844353067955902e-05, "loss": 0.6808, "step": 780 }, { "epoch": 0.5275537634408602, "grad_norm": 0.4754694994792113, "learning_rate": 1.8411958705953248e-05, "loss": 0.6816, "step": 785 }, { "epoch": 0.5309139784946236, "grad_norm": 0.4805660656514537, "learning_rate": 1.838009732170614e-05, "loss": 0.6796, "step": 790 }, { "epoch": 0.5342741935483871, "grad_norm": 0.44882280902701865, "learning_rate": 1.834794762299803e-05, "loss": 0.6835, "step": 795 }, { "epoch": 0.5376344086021505, "grad_norm": 0.4955511641250313, "learning_rate": 1.8315510715928607e-05, "loss": 0.6847, "step": 800 }, { "epoch": 0.540994623655914, "grad_norm": 0.4852311864762166, "learning_rate": 1.828278771647887e-05, "loss": 0.681, "step": 805 }, { "epoch": 0.5443548387096774, "grad_norm": 0.5005675657439985, "learning_rate": 1.8249779750472725e-05, "loss": 0.6832, "step": 810 }, { "epoch": 0.5477150537634409, "grad_norm": 0.5102495993533148, "learning_rate": 1.821648795353827e-05, "loss": 0.6817, "step": 815 }, { "epoch": 0.5510752688172043, "grad_norm": 0.4943223187596057, "learning_rate": 1.81829134710687e-05, "loss": 0.6791, "step": 820 }, { "epoch": 0.5544354838709677, "grad_norm": 0.542801548624785, "learning_rate": 1.8149057458182924e-05, "loss": 0.6832, "step": 825 }, { "epoch": 0.5577956989247311, "grad_norm": 0.45148280962815884, "learning_rate": 1.81149210796858e-05, "loss": 0.6784, "step": 830 }, { "epoch": 0.5611559139784946, "grad_norm": 0.4566100659521107, "learning_rate": 1.8080505510028073e-05, "loss": 0.6811, "step": 835 }, { "epoch": 0.5645161290322581, "grad_norm": 0.45139497231587655, "learning_rate": 1.8045811933265973e-05, "loss": 0.6782, "step": 840 }, { "epoch": 0.5678763440860215, "grad_norm": 0.5039358231725258, "learning_rate": 1.8010841543020472e-05, "loss": 0.6776, "step": 845 }, { "epoch": 0.571236559139785, "grad_norm": 0.44244553735827763, "learning_rate": 1.7975595542436207e-05, "loss": 0.6795, "step": 850 }, { "epoch": 0.5745967741935484, "grad_norm": 0.513649210570403, "learning_rate": 1.794007514414011e-05, "loss": 0.6777, "step": 855 }, { "epoch": 0.5779569892473119, "grad_norm": 0.4779895963476636, "learning_rate": 1.790428157019967e-05, "loss": 0.6807, "step": 860 }, { "epoch": 0.5813172043010753, "grad_norm": 0.5317771601563221, "learning_rate": 1.7868216052080898e-05, "loss": 0.6774, "step": 865 }, { "epoch": 0.5846774193548387, "grad_norm": 0.4387347378891155, "learning_rate": 1.783187983060594e-05, "loss": 0.6771, "step": 870 }, { "epoch": 0.5880376344086021, "grad_norm": 0.47208773953042626, "learning_rate": 1.7795274155910408e-05, "loss": 0.6795, "step": 875 }, { "epoch": 0.5913978494623656, "grad_norm": 0.4919880987398928, "learning_rate": 1.7758400287400372e-05, "loss": 0.6764, "step": 880 }, { "epoch": 0.594758064516129, "grad_norm": 0.4746143436746516, "learning_rate": 1.772125949370901e-05, "loss": 0.6777, "step": 885 }, { "epoch": 0.5981182795698925, "grad_norm": 0.4866530917728411, "learning_rate": 1.7683853052652974e-05, "loss": 0.6778, "step": 890 }, { "epoch": 0.6014784946236559, "grad_norm": 0.45607894178261754, "learning_rate": 1.764618225118843e-05, "loss": 0.675, "step": 895 }, { "epoch": 0.6048387096774194, "grad_norm": 0.4623134720756107, "learning_rate": 1.7608248385366774e-05, "loss": 0.6724, "step": 900 }, { "epoch": 0.6081989247311828, "grad_norm": 0.41997040926221346, "learning_rate": 1.7570052760290036e-05, "loss": 0.6749, "step": 905 }, { "epoch": 0.6115591397849462, "grad_norm": 0.4352461962298591, "learning_rate": 1.7531596690065998e-05, "loss": 0.673, "step": 910 }, { "epoch": 0.6149193548387096, "grad_norm": 0.47410360513736394, "learning_rate": 1.7492881497762958e-05, "loss": 0.6758, "step": 915 }, { "epoch": 0.6182795698924731, "grad_norm": 0.45269850953936747, "learning_rate": 1.7453908515364238e-05, "loss": 0.6802, "step": 920 }, { "epoch": 0.6216397849462365, "grad_norm": 0.46524887071623716, "learning_rate": 1.741467908372233e-05, "loss": 0.674, "step": 925 }, { "epoch": 0.625, "grad_norm": 0.4739090399223664, "learning_rate": 1.7375194552512777e-05, "loss": 0.6743, "step": 930 }, { "epoch": 0.6283602150537635, "grad_norm": 0.50057753354274, "learning_rate": 1.7335456280187752e-05, "loss": 0.678, "step": 935 }, { "epoch": 0.6317204301075269, "grad_norm": 0.4342277157697679, "learning_rate": 1.729546563392929e-05, "loss": 0.6766, "step": 940 }, { "epoch": 0.6350806451612904, "grad_norm": 0.5421833972178155, "learning_rate": 1.7255223989602277e-05, "loss": 0.6725, "step": 945 }, { "epoch": 0.6384408602150538, "grad_norm": 0.5180088295892716, "learning_rate": 1.72147327317071e-05, "loss": 0.6766, "step": 950 }, { "epoch": 0.6418010752688172, "grad_norm": 0.4835576817093598, "learning_rate": 1.7173993253332016e-05, "loss": 0.6754, "step": 955 }, { "epoch": 0.6451612903225806, "grad_norm": 0.4566946543115902, "learning_rate": 1.7133006956105237e-05, "loss": 0.6751, "step": 960 }, { "epoch": 0.6485215053763441, "grad_norm": 0.43744247420203414, "learning_rate": 1.7091775250146678e-05, "loss": 0.6708, "step": 965 }, { "epoch": 0.6518817204301075, "grad_norm": 0.4237915807278061, "learning_rate": 1.7050299554019466e-05, "loss": 0.6702, "step": 970 }, { "epoch": 0.655241935483871, "grad_norm": 0.4159411791640258, "learning_rate": 1.700858129468114e-05, "loss": 0.6738, "step": 975 }, { "epoch": 0.6586021505376344, "grad_norm": 0.47229925238109904, "learning_rate": 1.696662190743453e-05, "loss": 0.6731, "step": 980 }, { "epoch": 0.6619623655913979, "grad_norm": 0.4357051029703012, "learning_rate": 1.69244228358784e-05, "loss": 0.6732, "step": 985 }, { "epoch": 0.6653225806451613, "grad_norm": 0.47900841918884185, "learning_rate": 1.688198553185777e-05, "loss": 0.6742, "step": 990 }, { "epoch": 0.6686827956989247, "grad_norm": 0.529861867014197, "learning_rate": 1.683931145541397e-05, "loss": 0.6753, "step": 995 }, { "epoch": 0.6720430107526881, "grad_norm": 0.49314598912759483, "learning_rate": 1.6796402074734404e-05, "loss": 0.6694, "step": 1000 }, { "epoch": 0.6754032258064516, "grad_norm": 0.47658141232504353, "learning_rate": 1.6753258866102047e-05, "loss": 0.6733, "step": 1005 }, { "epoch": 0.678763440860215, "grad_norm": 0.4555180579717891, "learning_rate": 1.6709883313844634e-05, "loss": 0.6771, "step": 1010 }, { "epoch": 0.6821236559139785, "grad_norm": 0.4804224902104754, "learning_rate": 1.6666276910283623e-05, "loss": 0.6754, "step": 1015 }, { "epoch": 0.6854838709677419, "grad_norm": 0.4676965423125691, "learning_rate": 1.662244115568282e-05, "loss": 0.6697, "step": 1020 }, { "epoch": 0.6888440860215054, "grad_norm": 0.4309666314092742, "learning_rate": 1.657837755819678e-05, "loss": 0.6682, "step": 1025 }, { "epoch": 0.6922043010752689, "grad_norm": 0.4526083077842183, "learning_rate": 1.6534087633818914e-05, "loss": 0.6738, "step": 1030 }, { "epoch": 0.6955645161290323, "grad_norm": 0.42731232075306474, "learning_rate": 1.6489572906329345e-05, "loss": 0.6699, "step": 1035 }, { "epoch": 0.6989247311827957, "grad_norm": 0.4584418364845978, "learning_rate": 1.644483490724247e-05, "loss": 0.671, "step": 1040 }, { "epoch": 0.7022849462365591, "grad_norm": 0.45883989932174113, "learning_rate": 1.6399875175754258e-05, "loss": 0.6696, "step": 1045 }, { "epoch": 0.7056451612903226, "grad_norm": 0.44771182993617314, "learning_rate": 1.635469525868932e-05, "loss": 0.6721, "step": 1050 }, { "epoch": 0.709005376344086, "grad_norm": 0.4588720172063543, "learning_rate": 1.6309296710447674e-05, "loss": 0.6697, "step": 1055 }, { "epoch": 0.7123655913978495, "grad_norm": 0.45001838732164606, "learning_rate": 1.626368109295128e-05, "loss": 0.667, "step": 1060 }, { "epoch": 0.7157258064516129, "grad_norm": 0.42641910570356634, "learning_rate": 1.6217849975590275e-05, "loss": 0.6708, "step": 1065 }, { "epoch": 0.7190860215053764, "grad_norm": 0.422498792435904, "learning_rate": 1.617180493516901e-05, "loss": 0.6711, "step": 1070 }, { "epoch": 0.7224462365591398, "grad_norm": 0.4415726107421213, "learning_rate": 1.6125547555851787e-05, "loss": 0.6685, "step": 1075 }, { "epoch": 0.7258064516129032, "grad_norm": 0.4534706255885532, "learning_rate": 1.6079079429108357e-05, "loss": 0.6696, "step": 1080 }, { "epoch": 0.7291666666666666, "grad_norm": 0.4029176160192046, "learning_rate": 1.6032402153659164e-05, "loss": 0.6726, "step": 1085 }, { "epoch": 0.7325268817204301, "grad_norm": 0.42621354076663304, "learning_rate": 1.5985517335420346e-05, "loss": 0.6666, "step": 1090 }, { "epoch": 0.7358870967741935, "grad_norm": 0.44234432560741715, "learning_rate": 1.5938426587448473e-05, "loss": 0.6741, "step": 1095 }, { "epoch": 0.739247311827957, "grad_norm": 0.4473272678937478, "learning_rate": 1.589113152988507e-05, "loss": 0.6709, "step": 1100 }, { "epoch": 0.7426075268817204, "grad_norm": 0.44595646225862023, "learning_rate": 1.5843633789900862e-05, "loss": 0.6712, "step": 1105 }, { "epoch": 0.7459677419354839, "grad_norm": 0.4324555524707172, "learning_rate": 1.5795935001639796e-05, "loss": 0.6658, "step": 1110 }, { "epoch": 0.7493279569892473, "grad_norm": 0.46394662933500824, "learning_rate": 1.5748036806162816e-05, "loss": 0.6684, "step": 1115 }, { "epoch": 0.7526881720430108, "grad_norm": 0.4947844886498111, "learning_rate": 1.56999408513914e-05, "loss": 0.6722, "step": 1120 }, { "epoch": 0.7560483870967742, "grad_norm": 0.4250235043595764, "learning_rate": 1.5651648792050886e-05, "loss": 0.6686, "step": 1125 }, { "epoch": 0.7594086021505376, "grad_norm": 0.4159591266629867, "learning_rate": 1.5603162289613503e-05, "loss": 0.669, "step": 1130 }, { "epoch": 0.7627688172043011, "grad_norm": 0.4095733043870104, "learning_rate": 1.555448301224124e-05, "loss": 0.6692, "step": 1135 }, { "epoch": 0.7661290322580645, "grad_norm": 0.43896150432589165, "learning_rate": 1.550561263472845e-05, "loss": 0.6657, "step": 1140 }, { "epoch": 0.769489247311828, "grad_norm": 0.47253071219496107, "learning_rate": 1.5456552838444215e-05, "loss": 0.6695, "step": 1145 }, { "epoch": 0.7728494623655914, "grad_norm": 0.43201524742454317, "learning_rate": 1.5407305311274502e-05, "loss": 0.6639, "step": 1150 }, { "epoch": 0.7762096774193549, "grad_norm": 0.46875979315386157, "learning_rate": 1.5357871747564108e-05, "loss": 0.6696, "step": 1155 }, { "epoch": 0.7795698924731183, "grad_norm": 0.4458966308423493, "learning_rate": 1.530825384805835e-05, "loss": 0.6671, "step": 1160 }, { "epoch": 0.7829301075268817, "grad_norm": 0.4045923602073338, "learning_rate": 1.5258453319844555e-05, "loss": 0.6679, "step": 1165 }, { "epoch": 0.7862903225806451, "grad_norm": 0.46081178192946914, "learning_rate": 1.5208471876293334e-05, "loss": 0.6653, "step": 1170 }, { "epoch": 0.7896505376344086, "grad_norm": 0.4412474732597341, "learning_rate": 1.5158311236999622e-05, "loss": 0.6696, "step": 1175 }, { "epoch": 0.793010752688172, "grad_norm": 0.4561523660741418, "learning_rate": 1.5107973127723537e-05, "loss": 0.6662, "step": 1180 }, { "epoch": 0.7963709677419355, "grad_norm": 0.45370412836150925, "learning_rate": 1.5057459280330984e-05, "loss": 0.6668, "step": 1185 }, { "epoch": 0.7997311827956989, "grad_norm": 0.4641379862022367, "learning_rate": 1.500677143273408e-05, "loss": 0.6618, "step": 1190 }, { "epoch": 0.8030913978494624, "grad_norm": 0.4228354205291783, "learning_rate": 1.4955911328831357e-05, "loss": 0.6689, "step": 1195 }, { "epoch": 0.8064516129032258, "grad_norm": 0.45194187006117376, "learning_rate": 1.4904880718447781e-05, "loss": 0.6654, "step": 1200 }, { "epoch": 0.8098118279569892, "grad_norm": 0.41370408468731323, "learning_rate": 1.4853681357274522e-05, "loss": 0.6656, "step": 1205 }, { "epoch": 0.8131720430107527, "grad_norm": 0.39658836438427153, "learning_rate": 1.4802315006808578e-05, "loss": 0.6642, "step": 1210 }, { "epoch": 0.8165322580645161, "grad_norm": 0.39180055538376923, "learning_rate": 1.4750783434292147e-05, "loss": 0.6669, "step": 1215 }, { "epoch": 0.8198924731182796, "grad_norm": 0.4140385959248494, "learning_rate": 1.4699088412651841e-05, "loss": 0.6677, "step": 1220 }, { "epoch": 0.823252688172043, "grad_norm": 0.4135853126184148, "learning_rate": 1.4647231720437687e-05, "loss": 0.6691, "step": 1225 }, { "epoch": 0.8266129032258065, "grad_norm": 0.3886901463807677, "learning_rate": 1.4595215141761934e-05, "loss": 0.6612, "step": 1230 }, { "epoch": 0.8299731182795699, "grad_norm": 0.427261482590677, "learning_rate": 1.4543040466237662e-05, "loss": 0.6635, "step": 1235 }, { "epoch": 0.8333333333333334, "grad_norm": 0.41557585081944015, "learning_rate": 1.4490709488917239e-05, "loss": 0.6642, "step": 1240 }, { "epoch": 0.8366935483870968, "grad_norm": 0.41396481934719226, "learning_rate": 1.4438224010230526e-05, "loss": 0.6642, "step": 1245 }, { "epoch": 0.8400537634408602, "grad_norm": 0.4000957409618339, "learning_rate": 1.4385585835922962e-05, "loss": 0.6679, "step": 1250 }, { "epoch": 0.8434139784946236, "grad_norm": 0.5093831922663081, "learning_rate": 1.4332796776993423e-05, "loss": 0.6665, "step": 1255 }, { "epoch": 0.8467741935483871, "grad_norm": 0.39777324014388976, "learning_rate": 1.427985864963193e-05, "loss": 0.6608, "step": 1260 }, { "epoch": 0.8501344086021505, "grad_norm": 0.4123671465949071, "learning_rate": 1.4226773275157142e-05, "loss": 0.6679, "step": 1265 }, { "epoch": 0.853494623655914, "grad_norm": 0.4178878559263764, "learning_rate": 1.4173542479953712e-05, "loss": 0.6629, "step": 1270 }, { "epoch": 0.8568548387096774, "grad_norm": 0.4151008943507023, "learning_rate": 1.412016809540944e-05, "loss": 0.6708, "step": 1275 }, { "epoch": 0.8602150537634409, "grad_norm": 0.39420851761758596, "learning_rate": 1.406665195785228e-05, "loss": 0.66, "step": 1280 }, { "epoch": 0.8635752688172043, "grad_norm": 0.39995681473664046, "learning_rate": 1.401299590848714e-05, "loss": 0.6631, "step": 1285 }, { "epoch": 0.8669354838709677, "grad_norm": 0.43024431364313487, "learning_rate": 1.3959201793332554e-05, "loss": 0.6648, "step": 1290 }, { "epoch": 0.8702956989247311, "grad_norm": 0.3823413783572042, "learning_rate": 1.3905271463157153e-05, "loss": 0.6668, "step": 1295 }, { "epoch": 0.8736559139784946, "grad_norm": 0.4168656965051171, "learning_rate": 1.385120677341602e-05, "loss": 0.6584, "step": 1300 }, { "epoch": 0.8770161290322581, "grad_norm": 0.3967459217329594, "learning_rate": 1.3797009584186818e-05, "loss": 0.6627, "step": 1305 }, { "epoch": 0.8803763440860215, "grad_norm": 0.4090750420770741, "learning_rate": 1.3742681760105814e-05, "loss": 0.6616, "step": 1310 }, { "epoch": 0.883736559139785, "grad_norm": 0.4175005433442616, "learning_rate": 1.3688225170303727e-05, "loss": 0.6668, "step": 1315 }, { "epoch": 0.8870967741935484, "grad_norm": 0.425478340178616, "learning_rate": 1.3633641688341413e-05, "loss": 0.6613, "step": 1320 }, { "epoch": 0.8904569892473119, "grad_norm": 0.39081095331783994, "learning_rate": 1.3578933192145421e-05, "loss": 0.6631, "step": 1325 }, { "epoch": 0.8938172043010753, "grad_norm": 0.460628551115726, "learning_rate": 1.3524101563943356e-05, "loss": 0.6617, "step": 1330 }, { "epoch": 0.8971774193548387, "grad_norm": 0.4558498194690748, "learning_rate": 1.3469148690199157e-05, "loss": 0.665, "step": 1335 }, { "epoch": 0.9005376344086021, "grad_norm": 0.4182583525353485, "learning_rate": 1.3414076461548162e-05, "loss": 0.662, "step": 1340 }, { "epoch": 0.9038978494623656, "grad_norm": 0.4438115831427305, "learning_rate": 1.3358886772732085e-05, "loss": 0.6638, "step": 1345 }, { "epoch": 0.907258064516129, "grad_norm": 0.45301099015215857, "learning_rate": 1.3303581522533806e-05, "loss": 0.6602, "step": 1350 }, { "epoch": 0.9106182795698925, "grad_norm": 0.4353757710607073, "learning_rate": 1.3248162613712066e-05, "loss": 0.6614, "step": 1355 }, { "epoch": 0.9139784946236559, "grad_norm": 0.3966488654029928, "learning_rate": 1.319263195293599e-05, "loss": 0.661, "step": 1360 }, { "epoch": 0.9173387096774194, "grad_norm": 0.43451849890336813, "learning_rate": 1.3136991450719493e-05, "loss": 0.6632, "step": 1365 }, { "epoch": 0.9206989247311828, "grad_norm": 0.4331957813206352, "learning_rate": 1.3081243021355542e-05, "loss": 0.6618, "step": 1370 }, { "epoch": 0.9240591397849462, "grad_norm": 0.42461868845233375, "learning_rate": 1.3025388582850311e-05, "loss": 0.661, "step": 1375 }, { "epoch": 0.9274193548387096, "grad_norm": 0.4270005545440327, "learning_rate": 1.2969430056857177e-05, "loss": 0.6619, "step": 1380 }, { "epoch": 0.9307795698924731, "grad_norm": 0.40199632811605207, "learning_rate": 1.2913369368610618e-05, "loss": 0.6625, "step": 1385 }, { "epoch": 0.9341397849462365, "grad_norm": 0.40238063432322, "learning_rate": 1.285720844685996e-05, "loss": 0.6619, "step": 1390 }, { "epoch": 0.9375, "grad_norm": 0.4063919100845298, "learning_rate": 1.2800949223803039e-05, "loss": 0.6593, "step": 1395 }, { "epoch": 0.9408602150537635, "grad_norm": 0.42658950759070957, "learning_rate": 1.274459363501971e-05, "loss": 0.6663, "step": 1400 }, { "epoch": 0.9442204301075269, "grad_norm": 0.40879880761812026, "learning_rate": 1.2688143619405266e-05, "loss": 0.6626, "step": 1405 }, { "epoch": 0.9475806451612904, "grad_norm": 0.4136553833868636, "learning_rate": 1.2631601119103714e-05, "loss": 0.6595, "step": 1410 }, { "epoch": 0.9509408602150538, "grad_norm": 0.4566408970103592, "learning_rate": 1.2574968079440969e-05, "loss": 0.6612, "step": 1415 }, { "epoch": 0.9543010752688172, "grad_norm": 0.4707155898476047, "learning_rate": 1.251824644885792e-05, "loss": 0.661, "step": 1420 }, { "epoch": 0.9576612903225806, "grad_norm": 0.44843086084440054, "learning_rate": 1.2461438178843409e-05, "loss": 0.6603, "step": 1425 }, { "epoch": 0.9610215053763441, "grad_norm": 0.3887174931118394, "learning_rate": 1.2404545223867067e-05, "loss": 0.6648, "step": 1430 }, { "epoch": 0.9643817204301075, "grad_norm": 0.3844627071853656, "learning_rate": 1.2347569541312086e-05, "loss": 0.661, "step": 1435 }, { "epoch": 0.967741935483871, "grad_norm": 0.38712969083554144, "learning_rate": 1.2290513091407871e-05, "loss": 0.6626, "step": 1440 }, { "epoch": 0.9711021505376344, "grad_norm": 0.3765260995061332, "learning_rate": 1.223337783716261e-05, "loss": 0.658, "step": 1445 }, { "epoch": 0.9744623655913979, "grad_norm": 0.38880975825103575, "learning_rate": 1.2176165744295716e-05, "loss": 0.6625, "step": 1450 }, { "epoch": 0.9778225806451613, "grad_norm": 0.38891299522851835, "learning_rate": 1.2118878781170213e-05, "loss": 0.6571, "step": 1455 }, { "epoch": 0.9811827956989247, "grad_norm": 0.3883707422936783, "learning_rate": 1.2061518918725019e-05, "loss": 0.6554, "step": 1460 }, { "epoch": 0.9845430107526881, "grad_norm": 0.4326330855587226, "learning_rate": 1.2004088130407114e-05, "loss": 0.6617, "step": 1465 }, { "epoch": 0.9879032258064516, "grad_norm": 0.40177893100437934, "learning_rate": 1.1946588392103678e-05, "loss": 0.6601, "step": 1470 }, { "epoch": 0.991263440860215, "grad_norm": 0.4069182709848244, "learning_rate": 1.1889021682074074e-05, "loss": 0.6578, "step": 1475 }, { "epoch": 0.9946236559139785, "grad_norm": 0.371562924814927, "learning_rate": 1.1831389980881815e-05, "loss": 0.6595, "step": 1480 }, { "epoch": 0.9979838709677419, "grad_norm": 0.40260684742734143, "learning_rate": 1.1773695271326413e-05, "loss": 0.6598, "step": 1485 }, { "epoch": 1.0, "eval_loss": 0.6596925854682922, "eval_runtime": 38.4699, "eval_samples_per_second": 75.8, "eval_steps_per_second": 1.196, "step": 1488 }, { "epoch": 1.0013440860215055, "grad_norm": 0.43923520164769597, "learning_rate": 1.1715939538375159e-05, "loss": 0.6515, "step": 1490 }, { "epoch": 1.0047043010752688, "grad_norm": 0.4093115648975466, "learning_rate": 1.1658124769094834e-05, "loss": 0.6335, "step": 1495 }, { "epoch": 1.0080645161290323, "grad_norm": 0.44488404164651246, "learning_rate": 1.160025295258335e-05, "loss": 0.6351, "step": 1500 }, { "epoch": 1.0114247311827957, "grad_norm": 0.41541519893053236, "learning_rate": 1.1542326079901296e-05, "loss": 0.6363, "step": 1505 }, { "epoch": 1.0147849462365592, "grad_norm": 0.4120083737414266, "learning_rate": 1.1484346144003467e-05, "loss": 0.629, "step": 1510 }, { "epoch": 1.0181451612903225, "grad_norm": 0.4134775783414065, "learning_rate": 1.1426315139670268e-05, "loss": 0.6358, "step": 1515 }, { "epoch": 1.021505376344086, "grad_norm": 0.414634455881357, "learning_rate": 1.1368235063439103e-05, "loss": 0.6316, "step": 1520 }, { "epoch": 1.0248655913978495, "grad_norm": 0.4254446985228589, "learning_rate": 1.1310107913535677e-05, "loss": 0.6323, "step": 1525 }, { "epoch": 1.028225806451613, "grad_norm": 0.4034820728921977, "learning_rate": 1.1251935689805249e-05, "loss": 0.6348, "step": 1530 }, { "epoch": 1.0315860215053763, "grad_norm": 0.41791569897419, "learning_rate": 1.1193720393643826e-05, "loss": 0.6319, "step": 1535 }, { "epoch": 1.0349462365591398, "grad_norm": 0.41814029821287946, "learning_rate": 1.1135464027929306e-05, "loss": 0.6384, "step": 1540 }, { "epoch": 1.0383064516129032, "grad_norm": 0.42081017954332256, "learning_rate": 1.1077168596952579e-05, "loss": 0.6378, "step": 1545 }, { "epoch": 1.0416666666666667, "grad_norm": 0.39427585600241666, "learning_rate": 1.1018836106348558e-05, "loss": 0.6308, "step": 1550 }, { "epoch": 1.04502688172043, "grad_norm": 0.41657992186432496, "learning_rate": 1.0960468563027178e-05, "loss": 0.6359, "step": 1555 }, { "epoch": 1.0483870967741935, "grad_norm": 0.42812446527859066, "learning_rate": 1.0902067975104354e-05, "loss": 0.6375, "step": 1560 }, { "epoch": 1.051747311827957, "grad_norm": 0.4250184457384185, "learning_rate": 1.0843636351832888e-05, "loss": 0.6335, "step": 1565 }, { "epoch": 1.0551075268817205, "grad_norm": 0.4039339824459457, "learning_rate": 1.0785175703533341e-05, "loss": 0.6354, "step": 1570 }, { "epoch": 1.0584677419354838, "grad_norm": 0.4350937331558841, "learning_rate": 1.072668804152488e-05, "loss": 0.6346, "step": 1575 }, { "epoch": 1.0618279569892473, "grad_norm": 0.43413177405165077, "learning_rate": 1.0668175378056053e-05, "loss": 0.6328, "step": 1580 }, { "epoch": 1.0651881720430108, "grad_norm": 0.4543315336438448, "learning_rate": 1.0609639726235592e-05, "loss": 0.6291, "step": 1585 }, { "epoch": 1.0685483870967742, "grad_norm": 0.4639621657382721, "learning_rate": 1.0551083099963125e-05, "loss": 0.6347, "step": 1590 }, { "epoch": 1.0719086021505377, "grad_norm": 0.38636625101348104, "learning_rate": 1.0492507513859904e-05, "loss": 0.6329, "step": 1595 }, { "epoch": 1.075268817204301, "grad_norm": 0.43112374827834826, "learning_rate": 1.043391498319948e-05, "loss": 0.6334, "step": 1600 }, { "epoch": 1.0786290322580645, "grad_norm": 0.4083117563136851, "learning_rate": 1.037530752383839e-05, "loss": 0.6312, "step": 1605 }, { "epoch": 1.081989247311828, "grad_norm": 0.4092678104422559, "learning_rate": 1.0316687152146774e-05, "loss": 0.6341, "step": 1610 }, { "epoch": 1.0853494623655915, "grad_norm": 0.4236890040624566, "learning_rate": 1.0258055884939023e-05, "loss": 0.6342, "step": 1615 }, { "epoch": 1.0887096774193548, "grad_norm": 0.4129609373942744, "learning_rate": 1.0199415739404381e-05, "loss": 0.6337, "step": 1620 }, { "epoch": 1.0920698924731183, "grad_norm": 0.45620399681821466, "learning_rate": 1.0140768733037558e-05, "loss": 0.6355, "step": 1625 }, { "epoch": 1.0954301075268817, "grad_norm": 0.3996898071123565, "learning_rate": 1.0082116883569295e-05, "loss": 0.6353, "step": 1630 }, { "epoch": 1.0987903225806452, "grad_norm": 0.42181670497621626, "learning_rate": 1.0023462208896967e-05, "loss": 0.6368, "step": 1635 }, { "epoch": 1.1021505376344085, "grad_norm": 0.4170971985416292, "learning_rate": 9.964806727015144e-06, "loss": 0.6369, "step": 1640 }, { "epoch": 1.105510752688172, "grad_norm": 0.3850003704964035, "learning_rate": 9.906152455946176e-06, "loss": 0.6316, "step": 1645 }, { "epoch": 1.1088709677419355, "grad_norm": 0.3959138602164967, "learning_rate": 9.847501413670742e-06, "loss": 0.6329, "step": 1650 }, { "epoch": 1.112231182795699, "grad_norm": 0.38653649392484757, "learning_rate": 9.788855618058446e-06, "loss": 0.6315, "step": 1655 }, { "epoch": 1.1155913978494623, "grad_norm": 0.434626659914302, "learning_rate": 9.730217086798387e-06, "loss": 0.6329, "step": 1660 }, { "epoch": 1.1189516129032258, "grad_norm": 0.4043062586269176, "learning_rate": 9.671587837329717e-06, "loss": 0.6314, "step": 1665 }, { "epoch": 1.1223118279569892, "grad_norm": 0.3912022835272916, "learning_rate": 9.612969886772272e-06, "loss": 0.6347, "step": 1670 }, { "epoch": 1.1256720430107527, "grad_norm": 0.3945451917100905, "learning_rate": 9.554365251857141e-06, "loss": 0.6334, "step": 1675 }, { "epoch": 1.129032258064516, "grad_norm": 0.39115203980879226, "learning_rate": 9.495775948857292e-06, "loss": 0.631, "step": 1680 }, { "epoch": 1.1323924731182795, "grad_norm": 0.3899923876549679, "learning_rate": 9.437203993518214e-06, "loss": 0.6304, "step": 1685 }, { "epoch": 1.135752688172043, "grad_norm": 0.4002252410564929, "learning_rate": 9.37865140098854e-06, "loss": 0.6313, "step": 1690 }, { "epoch": 1.1391129032258065, "grad_norm": 0.43477646973135253, "learning_rate": 9.320120185750745e-06, "loss": 0.6319, "step": 1695 }, { "epoch": 1.14247311827957, "grad_norm": 0.4090247302348437, "learning_rate": 9.261612361551827e-06, "loss": 0.6317, "step": 1700 }, { "epoch": 1.1458333333333333, "grad_norm": 0.45307609961969436, "learning_rate": 9.203129941334004e-06, "loss": 0.6333, "step": 1705 }, { "epoch": 1.1491935483870968, "grad_norm": 0.41729176695078596, "learning_rate": 9.144674937165503e-06, "loss": 0.6351, "step": 1710 }, { "epoch": 1.1525537634408602, "grad_norm": 0.40602215785080276, "learning_rate": 9.086249360171291e-06, "loss": 0.6343, "step": 1715 }, { "epoch": 1.1559139784946237, "grad_norm": 0.45074003009366553, "learning_rate": 9.027855220463915e-06, "loss": 0.6335, "step": 1720 }, { "epoch": 1.159274193548387, "grad_norm": 0.4134807886527817, "learning_rate": 8.969494527074333e-06, "loss": 0.6324, "step": 1725 }, { "epoch": 1.1626344086021505, "grad_norm": 0.4342008270842711, "learning_rate": 8.91116928788278e-06, "loss": 0.6272, "step": 1730 }, { "epoch": 1.165994623655914, "grad_norm": 0.4390573307122215, "learning_rate": 8.852881509549716e-06, "loss": 0.6319, "step": 1735 }, { "epoch": 1.1693548387096775, "grad_norm": 0.39093522707469097, "learning_rate": 8.79463319744677e-06, "loss": 0.6339, "step": 1740 }, { "epoch": 1.172715053763441, "grad_norm": 0.3954417150387851, "learning_rate": 8.73642635558774e-06, "loss": 0.6323, "step": 1745 }, { "epoch": 1.1760752688172043, "grad_norm": 0.40100373608359324, "learning_rate": 8.678262986559667e-06, "loss": 0.6269, "step": 1750 }, { "epoch": 1.1794354838709677, "grad_norm": 0.39734268517939386, "learning_rate": 8.620145091453907e-06, "loss": 0.6332, "step": 1755 }, { "epoch": 1.1827956989247312, "grad_norm": 0.39585129412547304, "learning_rate": 8.562074669797313e-06, "loss": 0.6292, "step": 1760 }, { "epoch": 1.1861559139784945, "grad_norm": 0.4037432361209901, "learning_rate": 8.504053719483433e-06, "loss": 0.6311, "step": 1765 }, { "epoch": 1.189516129032258, "grad_norm": 0.3975358417069771, "learning_rate": 8.446084236703758e-06, "loss": 0.6292, "step": 1770 }, { "epoch": 1.1928763440860215, "grad_norm": 0.4254328638432713, "learning_rate": 8.38816821587906e-06, "loss": 0.6294, "step": 1775 }, { "epoch": 1.196236559139785, "grad_norm": 0.4130515802670977, "learning_rate": 8.330307649590782e-06, "loss": 0.6304, "step": 1780 }, { "epoch": 1.1995967741935485, "grad_norm": 0.417130552856278, "learning_rate": 8.272504528512448e-06, "loss": 0.6329, "step": 1785 }, { "epoch": 1.2029569892473118, "grad_norm": 0.40653243786568827, "learning_rate": 8.214760841341223e-06, "loss": 0.6343, "step": 1790 }, { "epoch": 1.2063172043010753, "grad_norm": 0.40605875241786166, "learning_rate": 8.157078574729451e-06, "loss": 0.6302, "step": 1795 }, { "epoch": 1.2096774193548387, "grad_norm": 0.38683973621773765, "learning_rate": 8.099459713216331e-06, "loss": 0.6266, "step": 1800 }, { "epoch": 1.2130376344086022, "grad_norm": 0.39898560454739485, "learning_rate": 8.041906239159636e-06, "loss": 0.6303, "step": 1805 }, { "epoch": 1.2163978494623655, "grad_norm": 0.4377517352185253, "learning_rate": 7.984420132667483e-06, "loss": 0.63, "step": 1810 }, { "epoch": 1.219758064516129, "grad_norm": 0.39945365735612615, "learning_rate": 7.927003371530254e-06, "loss": 0.6291, "step": 1815 }, { "epoch": 1.2231182795698925, "grad_norm": 0.4030460272109659, "learning_rate": 7.869657931152522e-06, "loss": 0.6262, "step": 1820 }, { "epoch": 1.226478494623656, "grad_norm": 0.3958516566003825, "learning_rate": 7.812385784485079e-06, "loss": 0.631, "step": 1825 }, { "epoch": 1.2298387096774193, "grad_norm": 0.4234150471780541, "learning_rate": 7.755188901957093e-06, "loss": 0.6271, "step": 1830 }, { "epoch": 1.2331989247311828, "grad_norm": 0.43581653936134584, "learning_rate": 7.698069251408271e-06, "loss": 0.6289, "step": 1835 }, { "epoch": 1.2365591397849462, "grad_norm": 0.871860809323723, "learning_rate": 7.641028798021197e-06, "loss": 0.6339, "step": 1840 }, { "epoch": 1.2399193548387097, "grad_norm": 0.41654508596227274, "learning_rate": 7.584069504253703e-06, "loss": 0.6316, "step": 1845 }, { "epoch": 1.243279569892473, "grad_norm": 0.4186537736756235, "learning_rate": 7.527193329771334e-06, "loss": 0.629, "step": 1850 }, { "epoch": 1.2466397849462365, "grad_norm": 0.40684069462084127, "learning_rate": 7.470402231379961e-06, "loss": 0.6294, "step": 1855 }, { "epoch": 1.25, "grad_norm": 0.4491543755620313, "learning_rate": 7.41369816295844e-06, "loss": 0.6283, "step": 1860 }, { "epoch": 1.2533602150537635, "grad_norm": 0.43910414887550725, "learning_rate": 7.357083075391373e-06, "loss": 0.6297, "step": 1865 }, { "epoch": 1.256720430107527, "grad_norm": 0.4242078353209094, "learning_rate": 7.300558916502028e-06, "loss": 0.6274, "step": 1870 }, { "epoch": 1.2600806451612903, "grad_norm": 0.39181603159793815, "learning_rate": 7.24412763098528e-06, "loss": 0.6268, "step": 1875 }, { "epoch": 1.2634408602150538, "grad_norm": 0.4093720135553839, "learning_rate": 7.1877911603407446e-06, "loss": 0.6314, "step": 1880 }, { "epoch": 1.2668010752688172, "grad_norm": 0.4542985585511893, "learning_rate": 7.131551442805957e-06, "loss": 0.6257, "step": 1885 }, { "epoch": 1.2701612903225805, "grad_norm": 0.39320303904121556, "learning_rate": 7.075410413289687e-06, "loss": 0.632, "step": 1890 }, { "epoch": 1.273521505376344, "grad_norm": 0.43112292294597837, "learning_rate": 7.019370003305383e-06, "loss": 0.6258, "step": 1895 }, { "epoch": 1.2768817204301075, "grad_norm": 0.4121211812097676, "learning_rate": 6.963432140904718e-06, "loss": 0.6256, "step": 1900 }, { "epoch": 1.280241935483871, "grad_norm": 0.4366695143108501, "learning_rate": 6.9075987506112305e-06, "loss": 0.6306, "step": 1905 }, { "epoch": 1.2836021505376345, "grad_norm": 0.420907190197836, "learning_rate": 6.851871753354154e-06, "loss": 0.6314, "step": 1910 }, { "epoch": 1.286962365591398, "grad_norm": 0.4473858705160112, "learning_rate": 6.796253066402282e-06, "loss": 0.627, "step": 1915 }, { "epoch": 1.2903225806451613, "grad_norm": 0.38612233770180776, "learning_rate": 6.740744603298046e-06, "loss": 0.629, "step": 1920 }, { "epoch": 1.2936827956989247, "grad_norm": 0.39974399253863463, "learning_rate": 6.685348273791661e-06, "loss": 0.6287, "step": 1925 }, { "epoch": 1.2970430107526882, "grad_norm": 0.3898544394591236, "learning_rate": 6.630065983775406e-06, "loss": 0.6292, "step": 1930 }, { "epoch": 1.3004032258064515, "grad_norm": 0.40401698025117533, "learning_rate": 6.574899635218091e-06, "loss": 0.6304, "step": 1935 }, { "epoch": 1.303763440860215, "grad_norm": 0.41519450649319134, "learning_rate": 6.519851126099586e-06, "loss": 0.6287, "step": 1940 }, { "epoch": 1.3071236559139785, "grad_norm": 0.381670315636576, "learning_rate": 6.464922350345534e-06, "loss": 0.6278, "step": 1945 }, { "epoch": 1.310483870967742, "grad_norm": 0.4365713923302317, "learning_rate": 6.4101151977622015e-06, "loss": 0.6258, "step": 1950 }, { "epoch": 1.3138440860215055, "grad_norm": 0.41198291276336774, "learning_rate": 6.3554315539714364e-06, "loss": 0.6261, "step": 1955 }, { "epoch": 1.3172043010752688, "grad_norm": 0.4118460369714954, "learning_rate": 6.300873300345819e-06, "loss": 0.6245, "step": 1960 }, { "epoch": 1.3205645161290323, "grad_norm": 0.38775383959228366, "learning_rate": 6.246442313943917e-06, "loss": 0.6258, "step": 1965 }, { "epoch": 1.3239247311827957, "grad_norm": 0.4088425446696767, "learning_rate": 6.192140467445712e-06, "loss": 0.6291, "step": 1970 }, { "epoch": 1.327284946236559, "grad_norm": 0.368749879948908, "learning_rate": 6.137969629088174e-06, "loss": 0.629, "step": 1975 }, { "epoch": 1.3306451612903225, "grad_norm": 0.3907700123306008, "learning_rate": 6.083931662600977e-06, "loss": 0.6287, "step": 1980 }, { "epoch": 1.334005376344086, "grad_norm": 0.3975024031852529, "learning_rate": 6.0300284271423834e-06, "loss": 0.6227, "step": 1985 }, { "epoch": 1.3373655913978495, "grad_norm": 0.4075706328227891, "learning_rate": 5.976261777235282e-06, "loss": 0.6248, "step": 1990 }, { "epoch": 1.340725806451613, "grad_norm": 0.38742274042231606, "learning_rate": 5.922633562703375e-06, "loss": 0.6263, "step": 1995 }, { "epoch": 1.3440860215053765, "grad_norm": 0.4225129189994307, "learning_rate": 5.869145628607551e-06, "loss": 0.6263, "step": 2000 }, { "epoch": 1.3474462365591398, "grad_norm": 0.3905401995286742, "learning_rate": 5.815799815182393e-06, "loss": 0.6293, "step": 2005 }, { "epoch": 1.3508064516129032, "grad_norm": 0.40752390323741944, "learning_rate": 5.762597957772856e-06, "loss": 0.6287, "step": 2010 }, { "epoch": 1.3541666666666667, "grad_norm": 0.4096029958781252, "learning_rate": 5.709541886771167e-06, "loss": 0.6266, "step": 2015 }, { "epoch": 1.35752688172043, "grad_norm": 0.38367545152375937, "learning_rate": 5.656633427553784e-06, "loss": 0.6303, "step": 2020 }, { "epoch": 1.3608870967741935, "grad_norm": 0.3848396403700666, "learning_rate": 5.60387440041866e-06, "loss": 0.6274, "step": 2025 }, { "epoch": 1.364247311827957, "grad_norm": 0.3871009890350265, "learning_rate": 5.55126662052257e-06, "loss": 0.6224, "step": 2030 }, { "epoch": 1.3676075268817205, "grad_norm": 0.4065087733067794, "learning_rate": 5.498811897818685e-06, "loss": 0.6268, "step": 2035 }, { "epoch": 1.370967741935484, "grad_norm": 0.38515450629994064, "learning_rate": 5.446512036994287e-06, "loss": 0.6248, "step": 2040 }, { "epoch": 1.3743279569892473, "grad_norm": 0.39646022406843767, "learning_rate": 5.394368837408705e-06, "loss": 0.6249, "step": 2045 }, { "epoch": 1.3776881720430108, "grad_norm": 0.4140801017591075, "learning_rate": 5.342384093031361e-06, "loss": 0.6264, "step": 2050 }, { "epoch": 1.3810483870967742, "grad_norm": 0.41497129669099586, "learning_rate": 5.290559592380111e-06, "loss": 0.6249, "step": 2055 }, { "epoch": 1.3844086021505375, "grad_norm": 0.3924624828028891, "learning_rate": 5.238897118459644e-06, "loss": 0.6258, "step": 2060 }, { "epoch": 1.387768817204301, "grad_norm": 0.3702574417131231, "learning_rate": 5.187398448700205e-06, "loss": 0.6284, "step": 2065 }, { "epoch": 1.3911290322580645, "grad_norm": 0.41653378611493, "learning_rate": 5.136065354896393e-06, "loss": 0.6263, "step": 2070 }, { "epoch": 1.394489247311828, "grad_norm": 0.3895372191639466, "learning_rate": 5.084899603146228e-06, "loss": 0.629, "step": 2075 }, { "epoch": 1.3978494623655915, "grad_norm": 0.3906095795336125, "learning_rate": 5.033902953790375e-06, "loss": 0.6258, "step": 2080 }, { "epoch": 1.4012096774193548, "grad_norm": 0.39681528851984266, "learning_rate": 4.983077161351606e-06, "loss": 0.6282, "step": 2085 }, { "epoch": 1.4045698924731183, "grad_norm": 0.40072163095580204, "learning_rate": 4.932423974474386e-06, "loss": 0.6288, "step": 2090 }, { "epoch": 1.4079301075268817, "grad_norm": 0.41393522692230517, "learning_rate": 4.8819451358647806e-06, "loss": 0.6269, "step": 2095 }, { "epoch": 1.4112903225806452, "grad_norm": 0.41100607384229376, "learning_rate": 4.831642382230424e-06, "loss": 0.6237, "step": 2100 }, { "epoch": 1.4146505376344085, "grad_norm": 0.3869191806592057, "learning_rate": 4.781517444220836e-06, "loss": 0.6262, "step": 2105 }, { "epoch": 1.418010752688172, "grad_norm": 0.40790781380796975, "learning_rate": 4.731572046367832e-06, "loss": 0.629, "step": 2110 }, { "epoch": 1.4213709677419355, "grad_norm": 0.39391109017587794, "learning_rate": 4.681807907026214e-06, "loss": 0.6241, "step": 2115 }, { "epoch": 1.424731182795699, "grad_norm": 0.40179170742485937, "learning_rate": 4.632226738314638e-06, "loss": 0.6255, "step": 2120 }, { "epoch": 1.4280913978494625, "grad_norm": 0.3858060561361162, "learning_rate": 4.582830246056735e-06, "loss": 0.6236, "step": 2125 }, { "epoch": 1.4314516129032258, "grad_norm": 0.38374047682817425, "learning_rate": 4.533620129722376e-06, "loss": 0.6204, "step": 2130 }, { "epoch": 1.4348118279569892, "grad_norm": 0.3973135891040321, "learning_rate": 4.4845980823692605e-06, "loss": 0.6238, "step": 2135 }, { "epoch": 1.4381720430107527, "grad_norm": 0.3883947152151204, "learning_rate": 4.435765790584605e-06, "loss": 0.6245, "step": 2140 }, { "epoch": 1.441532258064516, "grad_norm": 0.3991192638455965, "learning_rate": 4.387124934427181e-06, "loss": 0.6235, "step": 2145 }, { "epoch": 1.4448924731182795, "grad_norm": 0.38536161760649323, "learning_rate": 4.338677187369458e-06, "loss": 0.6239, "step": 2150 }, { "epoch": 1.448252688172043, "grad_norm": 0.38959778197902006, "learning_rate": 4.290424216240062e-06, "loss": 0.624, "step": 2155 }, { "epoch": 1.4516129032258065, "grad_norm": 0.3932900684900842, "learning_rate": 4.242367681166414e-06, "loss": 0.6261, "step": 2160 }, { "epoch": 1.45497311827957, "grad_norm": 0.39269029723244625, "learning_rate": 4.19450923551762e-06, "loss": 0.6217, "step": 2165 }, { "epoch": 1.4583333333333333, "grad_norm": 0.3849808642498751, "learning_rate": 4.1468505258475785e-06, "loss": 0.6205, "step": 2170 }, { "epoch": 1.4616935483870968, "grad_norm": 0.37433760170956354, "learning_rate": 4.0993931918383556e-06, "loss": 0.6181, "step": 2175 }, { "epoch": 1.4650537634408602, "grad_norm": 0.38330212314619344, "learning_rate": 4.0521388662437285e-06, "loss": 0.6243, "step": 2180 }, { "epoch": 1.4684139784946235, "grad_norm": 0.3934565346806046, "learning_rate": 4.0050891748330636e-06, "loss": 0.6253, "step": 2185 }, { "epoch": 1.471774193548387, "grad_norm": 0.3829851450390039, "learning_rate": 3.958245736335337e-06, "loss": 0.625, "step": 2190 }, { "epoch": 1.4751344086021505, "grad_norm": 0.38802181960954035, "learning_rate": 3.911610162383475e-06, "loss": 0.6265, "step": 2195 }, { "epoch": 1.478494623655914, "grad_norm": 0.410711397472092, "learning_rate": 3.865184057458883e-06, "loss": 0.6232, "step": 2200 }, { "epoch": 1.4818548387096775, "grad_norm": 0.3894505260206867, "learning_rate": 3.8189690188362615e-06, "loss": 0.628, "step": 2205 }, { "epoch": 1.485215053763441, "grad_norm": 0.385060920155747, "learning_rate": 3.772966636528641e-06, "loss": 0.6186, "step": 2210 }, { "epoch": 1.4885752688172043, "grad_norm": 0.39063304189411735, "learning_rate": 3.727178493232685e-06, "loss": 0.6236, "step": 2215 }, { "epoch": 1.4919354838709677, "grad_norm": 0.3690141904300633, "learning_rate": 3.6816061642742294e-06, "loss": 0.6251, "step": 2220 }, { "epoch": 1.4952956989247312, "grad_norm": 0.3915588795933579, "learning_rate": 3.6362512175541008e-06, "loss": 0.6231, "step": 2225 }, { "epoch": 1.4986559139784945, "grad_norm": 0.3823928248559548, "learning_rate": 3.591115213494153e-06, "loss": 0.6243, "step": 2230 }, { "epoch": 1.502016129032258, "grad_norm": 0.375431342074985, "learning_rate": 3.546199704983592e-06, "loss": 0.619, "step": 2235 }, { "epoch": 1.5053763440860215, "grad_norm": 0.3964171676918981, "learning_rate": 3.501506237325547e-06, "loss": 0.6221, "step": 2240 }, { "epoch": 1.508736559139785, "grad_norm": 0.3850008299468121, "learning_rate": 3.4570363481839097e-06, "loss": 0.6223, "step": 2245 }, { "epoch": 1.5120967741935485, "grad_norm": 0.39821907737468476, "learning_rate": 3.4127915675304236e-06, "loss": 0.6251, "step": 2250 }, { "epoch": 1.515456989247312, "grad_norm": 0.4127390828177173, "learning_rate": 3.3687734175920505e-06, "loss": 0.6235, "step": 2255 }, { "epoch": 1.5188172043010753, "grad_norm": 0.3900524242402812, "learning_rate": 3.324983412798597e-06, "loss": 0.6228, "step": 2260 }, { "epoch": 1.5221774193548387, "grad_norm": 0.39215462329521156, "learning_rate": 3.281423059730612e-06, "loss": 0.6272, "step": 2265 }, { "epoch": 1.525537634408602, "grad_norm": 0.37938495139611367, "learning_rate": 3.238093857067558e-06, "loss": 0.624, "step": 2270 }, { "epoch": 1.5288978494623655, "grad_norm": 0.38603540777357676, "learning_rate": 3.1949972955362383e-06, "loss": 0.6239, "step": 2275 }, { "epoch": 1.532258064516129, "grad_norm": 0.3863102533573958, "learning_rate": 3.1521348578595178e-06, "loss": 0.6242, "step": 2280 }, { "epoch": 1.5356182795698925, "grad_norm": 0.38583036503521356, "learning_rate": 3.1095080187053084e-06, "loss": 0.62, "step": 2285 }, { "epoch": 1.538978494623656, "grad_norm": 0.38861760389469036, "learning_rate": 3.067118244635833e-06, "loss": 0.6211, "step": 2290 }, { "epoch": 1.5423387096774195, "grad_norm": 0.3858262856892166, "learning_rate": 3.024966994057168e-06, "loss": 0.6193, "step": 2295 }, { "epoch": 1.5456989247311828, "grad_norm": 0.39743062676203483, "learning_rate": 2.98305571716907e-06, "loss": 0.6218, "step": 2300 }, { "epoch": 1.5490591397849462, "grad_norm": 0.3883508569759508, "learning_rate": 2.9413858559150776e-06, "loss": 0.6217, "step": 2305 }, { "epoch": 1.5524193548387095, "grad_norm": 0.38516305909935, "learning_rate": 2.899958843932915e-06, "loss": 0.6202, "step": 2310 }, { "epoch": 1.555779569892473, "grad_norm": 0.39366530353726675, "learning_rate": 2.858776106505139e-06, "loss": 0.6194, "step": 2315 }, { "epoch": 1.5591397849462365, "grad_norm": 0.3773278542481648, "learning_rate": 2.8178390605101414e-06, "loss": 0.6223, "step": 2320 }, { "epoch": 1.5625, "grad_norm": 0.4009756258267962, "learning_rate": 2.777149114373371e-06, "loss": 0.6195, "step": 2325 }, { "epoch": 1.5658602150537635, "grad_norm": 0.3858950731460861, "learning_rate": 2.73670766801889e-06, "loss": 0.6217, "step": 2330 }, { "epoch": 1.569220430107527, "grad_norm": 0.3971612319470807, "learning_rate": 2.696516112821208e-06, "loss": 0.622, "step": 2335 }, { "epoch": 1.5725806451612905, "grad_norm": 0.38420387771892894, "learning_rate": 2.6565758315574152e-06, "loss": 0.6201, "step": 2340 }, { "epoch": 1.5759408602150538, "grad_norm": 0.3933992088451066, "learning_rate": 2.6168881983595994e-06, "loss": 0.6226, "step": 2345 }, { "epoch": 1.5793010752688172, "grad_norm": 0.36479667118510656, "learning_rate": 2.5774545786675887e-06, "loss": 0.6175, "step": 2350 }, { "epoch": 1.5826612903225805, "grad_norm": 0.37711761279306893, "learning_rate": 2.538276329181942e-06, "loss": 0.6206, "step": 2355 }, { "epoch": 1.586021505376344, "grad_norm": 0.3816373703635669, "learning_rate": 2.499354797817312e-06, "loss": 0.6205, "step": 2360 }, { "epoch": 1.5893817204301075, "grad_norm": 0.38654799097580955, "learning_rate": 2.4606913236560283e-06, "loss": 0.6245, "step": 2365 }, { "epoch": 1.592741935483871, "grad_norm": 0.37621383637459765, "learning_rate": 2.4222872369020676e-06, "loss": 0.6237, "step": 2370 }, { "epoch": 1.5961021505376345, "grad_norm": 0.38574402970547833, "learning_rate": 2.384143858835258e-06, "loss": 0.6198, "step": 2375 }, { "epoch": 1.599462365591398, "grad_norm": 0.41266081486593603, "learning_rate": 2.3462625017658356e-06, "loss": 0.6203, "step": 2380 }, { "epoch": 1.6028225806451613, "grad_norm": 0.39723116620215, "learning_rate": 2.3086444689892872e-06, "loss": 0.6224, "step": 2385 }, { "epoch": 1.6061827956989247, "grad_norm": 0.39889771375855315, "learning_rate": 2.2712910547415266e-06, "loss": 0.6228, "step": 2390 }, { "epoch": 1.609543010752688, "grad_norm": 0.38397107249589707, "learning_rate": 2.234203544154335e-06, "loss": 0.6202, "step": 2395 }, { "epoch": 1.6129032258064515, "grad_norm": 0.3938406367129855, "learning_rate": 2.1973832132111906e-06, "loss": 0.6194, "step": 2400 }, { "epoch": 1.616263440860215, "grad_norm": 0.37138500127297974, "learning_rate": 2.16083132870332e-06, "loss": 0.6226, "step": 2405 }, { "epoch": 1.6196236559139785, "grad_norm": 0.386156003115288, "learning_rate": 2.1245491481861615e-06, "loss": 0.6174, "step": 2410 }, { "epoch": 1.622983870967742, "grad_norm": 0.3747547614429102, "learning_rate": 2.0885379199360646e-06, "loss": 0.6228, "step": 2415 }, { "epoch": 1.6263440860215055, "grad_norm": 0.3784295571335933, "learning_rate": 2.0527988829073587e-06, "loss": 0.6208, "step": 2420 }, { "epoch": 1.629704301075269, "grad_norm": 0.3741384831092738, "learning_rate": 2.0173332666897227e-06, "loss": 0.6205, "step": 2425 }, { "epoch": 1.6330645161290323, "grad_norm": 0.38786521686382086, "learning_rate": 1.982142291465896e-06, "loss": 0.6177, "step": 2430 }, { "epoch": 1.6364247311827957, "grad_norm": 0.38553674654791287, "learning_rate": 1.947227167969663e-06, "loss": 0.6219, "step": 2435 }, { "epoch": 1.639784946236559, "grad_norm": 0.37179654564247894, "learning_rate": 1.9125890974442475e-06, "loss": 0.62, "step": 2440 }, { "epoch": 1.6431451612903225, "grad_norm": 0.3705380721255442, "learning_rate": 1.878229271600931e-06, "loss": 0.6212, "step": 2445 }, { "epoch": 1.646505376344086, "grad_norm": 0.37577075286504236, "learning_rate": 1.8441488725781043e-06, "loss": 0.6219, "step": 2450 }, { "epoch": 1.6498655913978495, "grad_norm": 0.37950348342322227, "learning_rate": 1.8103490729005546e-06, "loss": 0.6168, "step": 2455 }, { "epoch": 1.653225806451613, "grad_norm": 0.3763349869752889, "learning_rate": 1.7768310354391505e-06, "loss": 0.6221, "step": 2460 }, { "epoch": 1.6565860215053765, "grad_norm": 0.3815949507602912, "learning_rate": 1.7435959133708169e-06, "loss": 0.6175, "step": 2465 }, { "epoch": 1.6599462365591398, "grad_norm": 0.38010076668823534, "learning_rate": 1.7106448501388827e-06, "loss": 0.6176, "step": 2470 }, { "epoch": 1.6633064516129032, "grad_norm": 0.3691580309508141, "learning_rate": 1.677978979413708e-06, "loss": 0.6164, "step": 2475 }, { "epoch": 1.6666666666666665, "grad_norm": 0.38980910582544764, "learning_rate": 1.645599425053721e-06, "loss": 0.6192, "step": 2480 }, { "epoch": 1.67002688172043, "grad_norm": 0.3915443888876836, "learning_rate": 1.6135073010667091e-06, "loss": 0.6178, "step": 2485 }, { "epoch": 1.6733870967741935, "grad_norm": 0.38991481296941843, "learning_rate": 1.5817037115715307e-06, "loss": 0.6192, "step": 2490 }, { "epoch": 1.676747311827957, "grad_norm": 0.3792230038765017, "learning_rate": 1.5501897507601016e-06, "loss": 0.6218, "step": 2495 }, { "epoch": 1.6801075268817205, "grad_norm": 0.37563408321675024, "learning_rate": 1.5189665028597622e-06, "loss": 0.6201, "step": 2500 }, { "epoch": 1.683467741935484, "grad_norm": 0.3782912838928693, "learning_rate": 1.4880350420959678e-06, "loss": 0.6148, "step": 2505 }, { "epoch": 1.6868279569892473, "grad_norm": 0.37676394416952275, "learning_rate": 1.4573964326553447e-06, "loss": 0.6213, "step": 2510 }, { "epoch": 1.6901881720430108, "grad_norm": 0.4026075731771035, "learning_rate": 1.4270517286490526e-06, "loss": 0.6212, "step": 2515 }, { "epoch": 1.6935483870967742, "grad_norm": 0.3740041929756673, "learning_rate": 1.397001974076546e-06, "loss": 0.6173, "step": 2520 }, { "epoch": 1.6969086021505375, "grad_norm": 0.37487781663955083, "learning_rate": 1.3672482027896295e-06, "loss": 0.6157, "step": 2525 }, { "epoch": 1.700268817204301, "grad_norm": 0.3658110013443917, "learning_rate": 1.3377914384569124e-06, "loss": 0.6197, "step": 2530 }, { "epoch": 1.7036290322580645, "grad_norm": 0.3698169716895195, "learning_rate": 1.3086326945285721e-06, "loss": 0.6172, "step": 2535 }, { "epoch": 1.706989247311828, "grad_norm": 0.3726837636042825, "learning_rate": 1.2797729742014952e-06, "loss": 0.6239, "step": 2540 }, { "epoch": 1.7103494623655915, "grad_norm": 0.3776863339195105, "learning_rate": 1.251213270384759e-06, "loss": 0.6166, "step": 2545 }, { "epoch": 1.713709677419355, "grad_norm": 0.3701247417408879, "learning_rate": 1.2229545656654784e-06, "loss": 0.6195, "step": 2550 }, { "epoch": 1.7170698924731183, "grad_norm": 0.37292321840424403, "learning_rate": 1.1949978322749833e-06, "loss": 0.62, "step": 2555 }, { "epoch": 1.7204301075268817, "grad_norm": 0.36380895900137356, "learning_rate": 1.1673440320553941e-06, "loss": 0.6208, "step": 2560 }, { "epoch": 1.723790322580645, "grad_norm": 0.3753481673103304, "learning_rate": 1.1399941164265016e-06, "loss": 0.6197, "step": 2565 }, { "epoch": 1.7271505376344085, "grad_norm": 0.38841007090968127, "learning_rate": 1.112949026353063e-06, "loss": 0.6197, "step": 2570 }, { "epoch": 1.730510752688172, "grad_norm": 0.4024753571663661, "learning_rate": 1.0862096923124032e-06, "loss": 0.6195, "step": 2575 }, { "epoch": 1.7338709677419355, "grad_norm": 0.40698102024501104, "learning_rate": 1.0597770342624169e-06, "loss": 0.6212, "step": 2580 }, { "epoch": 1.737231182795699, "grad_norm": 0.38275795281843294, "learning_rate": 1.0336519616099127e-06, "loss": 0.6164, "step": 2585 }, { "epoch": 1.7405913978494625, "grad_norm": 0.3849617861720321, "learning_rate": 1.0078353731793245e-06, "loss": 0.621, "step": 2590 }, { "epoch": 1.7439516129032258, "grad_norm": 0.37357311882407007, "learning_rate": 9.823281571817888e-07, "loss": 0.6154, "step": 2595 }, { "epoch": 1.7473118279569892, "grad_norm": 0.3700469086083545, "learning_rate": 9.571311911845938e-07, "loss": 0.6168, "step": 2600 }, { "epoch": 1.7506720430107527, "grad_norm": 0.3720192733243154, "learning_rate": 9.322453420809663e-07, "loss": 0.6188, "step": 2605 }, { "epoch": 1.754032258064516, "grad_norm": 0.3759460278402678, "learning_rate": 9.076714660602726e-07, "loss": 0.6188, "step": 2610 }, { "epoch": 1.7573924731182795, "grad_norm": 0.37780059312461417, "learning_rate": 8.834104085785411e-07, "loss": 0.619, "step": 2615 }, { "epoch": 1.760752688172043, "grad_norm": 0.3644735938402259, "learning_rate": 8.594630043293862e-07, "loss": 0.6166, "step": 2620 }, { "epoch": 1.7641129032258065, "grad_norm": 0.40102016373518057, "learning_rate": 8.35830077215285e-07, "loss": 0.6205, "step": 2625 }, { "epoch": 1.76747311827957, "grad_norm": 0.36805712689178754, "learning_rate": 8.125124403192353e-07, "loss": 0.6178, "step": 2630 }, { "epoch": 1.7708333333333335, "grad_norm": 0.3855167813689446, "learning_rate": 7.89510895876775e-07, "loss": 0.6187, "step": 2635 }, { "epoch": 1.7741935483870968, "grad_norm": 0.37050943347616344, "learning_rate": 7.668262352483957e-07, "loss": 0.6175, "step": 2640 }, { "epoch": 1.7775537634408602, "grad_norm": 0.37029783353120194, "learning_rate": 7.444592388922955e-07, "loss": 0.6184, "step": 2645 }, { "epoch": 1.7809139784946235, "grad_norm": 0.3842300371858157, "learning_rate": 7.224106763375493e-07, "loss": 0.6147, "step": 2650 }, { "epoch": 1.784274193548387, "grad_norm": 0.3700714291963043, "learning_rate": 7.006813061576145e-07, "loss": 0.6124, "step": 2655 }, { "epoch": 1.7876344086021505, "grad_norm": 0.3760694332290439, "learning_rate": 6.792718759442474e-07, "loss": 0.6194, "step": 2660 }, { "epoch": 1.790994623655914, "grad_norm": 0.3671308826023182, "learning_rate": 6.581831222817714e-07, "loss": 0.6149, "step": 2665 }, { "epoch": 1.7943548387096775, "grad_norm": 0.3746985320397756, "learning_rate": 6.374157707217421e-07, "loss": 0.6152, "step": 2670 }, { "epoch": 1.797715053763441, "grad_norm": 0.38527941649428066, "learning_rate": 6.169705357579813e-07, "loss": 0.6206, "step": 2675 }, { "epoch": 1.8010752688172043, "grad_norm": 0.3703868711176167, "learning_rate": 5.968481208020016e-07, "loss": 0.6185, "step": 2680 }, { "epoch": 1.8044354838709677, "grad_norm": 0.37853482566243507, "learning_rate": 5.770492181587906e-07, "loss": 0.6148, "step": 2685 }, { "epoch": 1.807795698924731, "grad_norm": 0.3580208698159626, "learning_rate": 5.575745090030138e-07, "loss": 0.6168, "step": 2690 }, { "epoch": 1.8111559139784945, "grad_norm": 0.3822688397822238, "learning_rate": 5.38424663355559e-07, "loss": 0.6188, "step": 2695 }, { "epoch": 1.814516129032258, "grad_norm": 0.37115606254622296, "learning_rate": 5.196003400604977e-07, "loss": 0.6159, "step": 2700 }, { "epoch": 1.8178763440860215, "grad_norm": 0.37498937385752057, "learning_rate": 5.01102186762411e-07, "loss": 0.6193, "step": 2705 }, { "epoch": 1.821236559139785, "grad_norm": 0.36638337590912584, "learning_rate": 4.829308398841104e-07, "loss": 0.6177, "step": 2710 }, { "epoch": 1.8245967741935485, "grad_norm": 0.3720335399527678, "learning_rate": 4.650869246047407e-07, "loss": 0.616, "step": 2715 }, { "epoch": 1.827956989247312, "grad_norm": 0.3883742285519788, "learning_rate": 4.475710548382717e-07, "loss": 0.6183, "step": 2720 }, { "epoch": 1.8313172043010753, "grad_norm": 0.3660996354157979, "learning_rate": 4.303838332123766e-07, "loss": 0.6181, "step": 2725 }, { "epoch": 1.8346774193548387, "grad_norm": 0.3832317742511122, "learning_rate": 4.1352585104770136e-07, "loss": 0.6173, "step": 2730 }, { "epoch": 1.838037634408602, "grad_norm": 0.3733758015066586, "learning_rate": 3.969976883375126e-07, "loss": 0.6212, "step": 2735 }, { "epoch": 1.8413978494623655, "grad_norm": 0.371152480421909, "learning_rate": 3.807999137277507e-07, "loss": 0.6154, "step": 2740 }, { "epoch": 1.844758064516129, "grad_norm": 0.38345844939628254, "learning_rate": 3.6493308449746525e-07, "loss": 0.6189, "step": 2745 }, { "epoch": 1.8481182795698925, "grad_norm": 0.36762462712757643, "learning_rate": 3.4939774653963587e-07, "loss": 0.6203, "step": 2750 }, { "epoch": 1.851478494623656, "grad_norm": 0.375536271265564, "learning_rate": 3.3419443434240083e-07, "loss": 0.6172, "step": 2755 }, { "epoch": 1.8548387096774195, "grad_norm": 0.3660221694767357, "learning_rate": 3.19323670970656e-07, "loss": 0.6181, "step": 2760 }, { "epoch": 1.8581989247311828, "grad_norm": 0.3599790898532119, "learning_rate": 3.0478596804807246e-07, "loss": 0.62, "step": 2765 }, { "epoch": 1.8615591397849462, "grad_norm": 0.3709615221652835, "learning_rate": 2.905818257394799e-07, "loss": 0.6159, "step": 2770 }, { "epoch": 1.8649193548387095, "grad_norm": 0.3726522140490793, "learning_rate": 2.7671173273367323e-07, "loss": 0.6192, "step": 2775 }, { "epoch": 1.868279569892473, "grad_norm": 0.3677308860539174, "learning_rate": 2.631761662265875e-07, "loss": 0.6188, "step": 2780 }, { "epoch": 1.8716397849462365, "grad_norm": 0.37507182923605303, "learning_rate": 2.499755919048863e-07, "loss": 0.6236, "step": 2785 }, { "epoch": 1.875, "grad_norm": 0.36561243105086355, "learning_rate": 2.371104639299393e-07, "loss": 0.616, "step": 2790 }, { "epoch": 1.8783602150537635, "grad_norm": 0.3727929463786164, "learning_rate": 2.2458122492219458e-07, "loss": 0.6186, "step": 2795 }, { "epoch": 1.881720430107527, "grad_norm": 0.3715493910859094, "learning_rate": 2.1238830594595195e-07, "loss": 0.619, "step": 2800 }, { "epoch": 1.8850806451612905, "grad_norm": 0.3701421554724482, "learning_rate": 2.005321264945348e-07, "loss": 0.615, "step": 2805 }, { "epoch": 1.8884408602150538, "grad_norm": 0.38162958713764294, "learning_rate": 1.890130944758528e-07, "loss": 0.6189, "step": 2810 }, { "epoch": 1.8918010752688172, "grad_norm": 0.36878364983672696, "learning_rate": 1.7783160619837202e-07, "loss": 0.6157, "step": 2815 }, { "epoch": 1.8951612903225805, "grad_norm": 0.3702088013665396, "learning_rate": 1.669880463574758e-07, "loss": 0.6136, "step": 2820 }, { "epoch": 1.898521505376344, "grad_norm": 0.37799049217494324, "learning_rate": 1.5648278802223526e-07, "loss": 0.6145, "step": 2825 }, { "epoch": 1.9018817204301075, "grad_norm": 0.37016035977530204, "learning_rate": 1.463161926225687e-07, "loss": 0.6202, "step": 2830 }, { "epoch": 1.905241935483871, "grad_norm": 0.3683670258810398, "learning_rate": 1.3648860993680903e-07, "loss": 0.6146, "step": 2835 }, { "epoch": 1.9086021505376345, "grad_norm": 0.3645705769628613, "learning_rate": 1.2700037807967026e-07, "loss": 0.6158, "step": 2840 }, { "epoch": 1.911962365591398, "grad_norm": 0.3646260231379924, "learning_rate": 1.1785182349061342e-07, "loss": 0.6151, "step": 2845 }, { "epoch": 1.9153225806451613, "grad_norm": 0.36357216498889616, "learning_rate": 1.0904326092261441e-07, "loss": 0.6173, "step": 2850 }, { "epoch": 1.9186827956989247, "grad_norm": 0.36891941735522826, "learning_rate": 1.0057499343134269e-07, "loss": 0.617, "step": 2855 }, { "epoch": 1.922043010752688, "grad_norm": 0.3618844540766792, "learning_rate": 9.24473123647196e-08, "loss": 0.6187, "step": 2860 }, { "epoch": 1.9254032258064515, "grad_norm": 0.3829664112005493, "learning_rate": 8.466049735291415e-08, "loss": 0.6194, "step": 2865 }, { "epoch": 1.928763440860215, "grad_norm": 0.36781144520431924, "learning_rate": 7.721481629870076e-08, "loss": 0.6169, "step": 2870 }, { "epoch": 1.9321236559139785, "grad_norm": 0.3853416687052536, "learning_rate": 7.011052536826435e-08, "loss": 0.6205, "step": 2875 }, { "epoch": 1.935483870967742, "grad_norm": 0.37397490558359964, "learning_rate": 6.334786898237078e-08, "loss": 0.6154, "step": 2880 }, { "epoch": 1.9388440860215055, "grad_norm": 0.36691620687136894, "learning_rate": 5.69270798079613e-08, "loss": 0.6196, "step": 2885 }, { "epoch": 1.942204301075269, "grad_norm": 0.3658247784366582, "learning_rate": 5.084837875015347e-08, "loss": 0.6132, "step": 2890 }, { "epoch": 1.9455645161290323, "grad_norm": 0.36412582119114073, "learning_rate": 4.511197494463493e-08, "loss": 0.6172, "step": 2895 }, { "epoch": 1.9489247311827957, "grad_norm": 0.3663620362729526, "learning_rate": 3.971806575047033e-08, "loss": 0.618, "step": 2900 }, { "epoch": 1.952284946236559, "grad_norm": 0.3675765196570775, "learning_rate": 3.466683674331228e-08, "loss": 0.6166, "step": 2905 }, { "epoch": 1.9556451612903225, "grad_norm": 0.3591512141413379, "learning_rate": 2.995846170901428e-08, "loss": 0.6156, "step": 2910 }, { "epoch": 1.959005376344086, "grad_norm": 0.36646296059037736, "learning_rate": 2.5593102637652136e-08, "loss": 0.6145, "step": 2915 }, { "epoch": 1.9623655913978495, "grad_norm": 0.36873568843189847, "learning_rate": 2.1570909717955058e-08, "loss": 0.6183, "step": 2920 }, { "epoch": 1.965725806451613, "grad_norm": 0.3643607047509, "learning_rate": 1.789202133212986e-08, "loss": 0.6144, "step": 2925 }, { "epoch": 1.9690860215053765, "grad_norm": 0.3696768969448687, "learning_rate": 1.4556564051110278e-08, "loss": 0.6221, "step": 2930 }, { "epoch": 1.9724462365591398, "grad_norm": 0.3739771169389071, "learning_rate": 1.1564652630192686e-08, "loss": 0.6171, "step": 2935 }, { "epoch": 1.9758064516129032, "grad_norm": 0.36410850268700906, "learning_rate": 8.916390005095921e-09, "loss": 0.619, "step": 2940 }, { "epoch": 1.9791666666666665, "grad_norm": 0.3635016376462389, "learning_rate": 6.61186728841412e-09, "loss": 0.6121, "step": 2945 }, { "epoch": 1.98252688172043, "grad_norm": 0.36237306270652886, "learning_rate": 4.651163766484779e-09, "loss": 0.6164, "step": 2950 }, { "epoch": 1.9858870967741935, "grad_norm": 0.3732498548310209, "learning_rate": 3.0343468966598234e-09, "loss": 0.6169, "step": 2955 }, { "epoch": 1.989247311827957, "grad_norm": 0.3679867274420493, "learning_rate": 1.761472304987466e-09, "loss": 0.618, "step": 2960 }, { "epoch": 1.9926075268817205, "grad_norm": 0.37009481264365335, "learning_rate": 8.325837842926288e-10, "loss": 0.6203, "step": 2965 }, { "epoch": 1.995967741935484, "grad_norm": 0.3760102338533851, "learning_rate": 2.4771329267703206e-10, "loss": 0.621, "step": 2970 }, { "epoch": 1.9993279569892473, "grad_norm": 0.3689799921255622, "learning_rate": 6.880952415633246e-12, "loss": 0.6204, "step": 2975 }, { "epoch": 2.0, "eval_loss": 0.642189085483551, "eval_runtime": 38.454, "eval_samples_per_second": 75.831, "eval_steps_per_second": 1.196, "step": 2976 }, { "epoch": 2.0, "step": 2976, "total_flos": 623113855303680.0, "train_loss": 0.6763581057950374, "train_runtime": 8673.6172, "train_samples_per_second": 21.958, "train_steps_per_second": 0.343 } ], "logging_steps": 5, "max_steps": 2976, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 623113855303680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }