diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4230 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 2976, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006720430107526882, + "grad_norm": 48.78054272427667, + "learning_rate": 6.711409395973155e-08, + "loss": 2.3825, + "step": 1 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 48.47840626370759, + "learning_rate": 3.3557046979865777e-07, + "loss": 2.376, + "step": 5 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 51.80697175894909, + "learning_rate": 6.711409395973155e-07, + "loss": 2.3226, + "step": 10 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 13.624773264265663, + "learning_rate": 1.006711409395973e-06, + "loss": 2.1053, + "step": 15 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 6.262298470265524, + "learning_rate": 1.342281879194631e-06, + "loss": 1.9272, + "step": 20 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 7.548559183569387, + "learning_rate": 1.6778523489932889e-06, + "loss": 1.7186, + "step": 25 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.6440968174788626, + "learning_rate": 2.013422818791946e-06, + "loss": 1.5223, + "step": 30 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 3.58492629574647, + "learning_rate": 2.348993288590604e-06, + "loss": 1.3232, + "step": 35 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 1.6886536264742815, + "learning_rate": 2.684563758389262e-06, + "loss": 1.1928, + "step": 40 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 1.217304086312586, + "learning_rate": 3.02013422818792e-06, + "loss": 1.0767, + "step": 45 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 1.1946964624516874, + "learning_rate": 3.3557046979865777e-06, + "loss": 1.0086, + "step": 50 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 1.3676335716854129, + "learning_rate": 3.6912751677852355e-06, + "loss": 0.9521, + "step": 55 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 1.15160172673339, + "learning_rate": 4.026845637583892e-06, + "loss": 0.9144, + "step": 60 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 1.1898281923774547, + "learning_rate": 4.362416107382551e-06, + "loss": 0.8891, + "step": 65 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.2290162400825384, + "learning_rate": 4.697986577181208e-06, + "loss": 0.8684, + "step": 70 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.1047127902421139, + "learning_rate": 5.033557046979867e-06, + "loss": 0.852, + "step": 75 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.3842650719759613, + "learning_rate": 5.369127516778524e-06, + "loss": 0.8421, + "step": 80 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.1630054678494466, + "learning_rate": 5.704697986577181e-06, + "loss": 0.8293, + "step": 85 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 1.2895915993551423, + "learning_rate": 6.04026845637584e-06, + "loss": 0.8155, + "step": 90 + }, + { + "epoch": 0.06384408602150538, + "grad_norm": 1.3357187881014583, + "learning_rate": 6.375838926174497e-06, + "loss": 0.8094, + "step": 95 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 1.4294338181391608, + "learning_rate": 6.711409395973155e-06, + "loss": 0.807, + "step": 100 + }, + { + "epoch": 0.07056451612903226, + 
"grad_norm": 0.9685428191519895, + "learning_rate": 7.046979865771813e-06, + "loss": 0.7963, + "step": 105 + }, + { + "epoch": 0.0739247311827957, + "grad_norm": 1.0685231337896743, + "learning_rate": 7.382550335570471e-06, + "loss": 0.7903, + "step": 110 + }, + { + "epoch": 0.07728494623655914, + "grad_norm": 1.3129982717287054, + "learning_rate": 7.718120805369127e-06, + "loss": 0.7932, + "step": 115 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.6648712375867527, + "learning_rate": 8.053691275167785e-06, + "loss": 0.7865, + "step": 120 + }, + { + "epoch": 0.08400537634408602, + "grad_norm": 1.2705854884448164, + "learning_rate": 8.389261744966444e-06, + "loss": 0.7799, + "step": 125 + }, + { + "epoch": 0.08736559139784947, + "grad_norm": 1.3845304392408557, + "learning_rate": 8.724832214765101e-06, + "loss": 0.7794, + "step": 130 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 1.0490487686523224, + "learning_rate": 9.060402684563759e-06, + "loss": 0.7762, + "step": 135 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 1.1349198286002737, + "learning_rate": 9.395973154362416e-06, + "loss": 0.7704, + "step": 140 + }, + { + "epoch": 0.09744623655913978, + "grad_norm": 1.311025567661312, + "learning_rate": 9.731543624161075e-06, + "loss": 0.7624, + "step": 145 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.0240371131291772, + "learning_rate": 1.0067114093959734e-05, + "loss": 0.7645, + "step": 150 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 1.0576784893797677, + "learning_rate": 1.040268456375839e-05, + "loss": 0.7612, + "step": 155 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 0.9872090722303354, + "learning_rate": 1.0738255033557049e-05, + "loss": 0.762, + "step": 160 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 1.1448278866865749, + "learning_rate": 1.1073825503355706e-05, + "loss": 0.7595, + "step": 165 + }, + { + "epoch": 0.11424731182795698, + "grad_norm": 0.9099648734864951, + "learning_rate": 1.1409395973154362e-05, + "loss": 0.7545, + "step": 170 + }, + { + "epoch": 0.11760752688172044, + "grad_norm": 1.2385558169257114, + "learning_rate": 1.174496644295302e-05, + "loss": 0.7559, + "step": 175 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 0.9417818627828941, + "learning_rate": 1.208053691275168e-05, + "loss": 0.7533, + "step": 180 + }, + { + "epoch": 0.12432795698924731, + "grad_norm": 1.4888584917162164, + "learning_rate": 1.2416107382550337e-05, + "loss": 0.75, + "step": 185 + }, + { + "epoch": 0.12768817204301075, + "grad_norm": 1.6094338507124168, + "learning_rate": 1.2751677852348994e-05, + "loss": 0.7516, + "step": 190 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 1.1484532669834084, + "learning_rate": 1.3087248322147652e-05, + "loss": 0.7528, + "step": 195 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 0.9606481759674919, + "learning_rate": 1.342281879194631e-05, + "loss": 0.7432, + "step": 200 + }, + { + "epoch": 0.13776881720430106, + "grad_norm": 0.90666325947728, + "learning_rate": 1.3758389261744966e-05, + "loss": 0.7456, + "step": 205 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 0.9003130723406445, + "learning_rate": 1.4093959731543626e-05, + "loss": 0.741, + "step": 210 + }, + { + "epoch": 0.14448924731182797, + "grad_norm": 0.9046243900141376, + "learning_rate": 1.4429530201342283e-05, + "loss": 0.7413, + "step": 215 + }, + { + "epoch": 0.1478494623655914, + "grad_norm": 0.9495916559973592, + "learning_rate": 1.4765100671140942e-05, + "loss": 0.7396, + "step": 
220 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 0.8392501968484852, + "learning_rate": 1.5100671140939598e-05, + "loss": 0.7451, + "step": 225 + }, + { + "epoch": 0.15456989247311828, + "grad_norm": 1.0111465864476745, + "learning_rate": 1.5436241610738255e-05, + "loss": 0.7434, + "step": 230 + }, + { + "epoch": 0.15793010752688172, + "grad_norm": 0.9025884088477472, + "learning_rate": 1.5771812080536916e-05, + "loss": 0.7354, + "step": 235 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.9720296761608475, + "learning_rate": 1.610738255033557e-05, + "loss": 0.7371, + "step": 240 + }, + { + "epoch": 0.1646505376344086, + "grad_norm": 0.9947001241550435, + "learning_rate": 1.644295302013423e-05, + "loss": 0.7288, + "step": 245 + }, + { + "epoch": 0.16801075268817203, + "grad_norm": 0.9174436365138352, + "learning_rate": 1.6778523489932888e-05, + "loss": 0.7347, + "step": 250 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 1.0267733434401298, + "learning_rate": 1.7114093959731545e-05, + "loss": 0.7375, + "step": 255 + }, + { + "epoch": 0.17473118279569894, + "grad_norm": 0.8397895938406165, + "learning_rate": 1.7449664429530202e-05, + "loss": 0.7341, + "step": 260 + }, + { + "epoch": 0.17809139784946237, + "grad_norm": 0.76727099531327, + "learning_rate": 1.778523489932886e-05, + "loss": 0.7309, + "step": 265 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 0.7412065721447615, + "learning_rate": 1.8120805369127517e-05, + "loss": 0.7269, + "step": 270 + }, + { + "epoch": 0.18481182795698925, + "grad_norm": 0.7810498280270748, + "learning_rate": 1.8456375838926174e-05, + "loss": 0.7257, + "step": 275 + }, + { + "epoch": 0.1881720430107527, + "grad_norm": 0.8613709749529885, + "learning_rate": 1.8791946308724832e-05, + "loss": 0.7315, + "step": 280 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 0.9739010230796492, + "learning_rate": 1.9127516778523493e-05, + "loss": 0.7272, + "step": 285 + }, + { + "epoch": 0.19489247311827956, + "grad_norm": 0.8646191524355271, + "learning_rate": 1.946308724832215e-05, + "loss": 0.7289, + "step": 290 + }, + { + "epoch": 0.198252688172043, + "grad_norm": 0.9363534800514449, + "learning_rate": 1.9798657718120807e-05, + "loss": 0.7258, + "step": 295 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 0.9701273350687407, + "learning_rate": 1.9999972476199807e-05, + "loss": 0.7317, + "step": 300 + }, + { + "epoch": 0.2049731182795699, + "grad_norm": 0.7773609368388674, + "learning_rate": 1.999966283518764e-05, + "loss": 0.7227, + "step": 305 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.8956665591253581, + "learning_rate": 1.999900915910167e-05, + "loss": 0.7231, + "step": 310 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 0.6269021652498205, + "learning_rate": 1.99980114704314e-05, + "loss": 0.7216, + "step": 315 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 1.011329580641999, + "learning_rate": 1.9996669803501982e-05, + "loss": 0.7276, + "step": 320 + }, + { + "epoch": 0.21841397849462366, + "grad_norm": 1.080429434985579, + "learning_rate": 1.999498420447302e-05, + "loss": 0.7236, + "step": 325 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 0.8583297562008422, + "learning_rate": 1.9992954731336992e-05, + "loss": 0.7225, + "step": 330 + }, + { + "epoch": 0.22513440860215053, + "grad_norm": 0.8080172656804933, + "learning_rate": 1.9990581453917234e-05, + "loss": 0.7209, + "step": 335 + }, + { + "epoch": 0.22849462365591397, + "grad_norm": 0.8049987876429506, + "learning_rate": 
1.998786445386558e-05, + "loss": 0.7178, + "step": 340 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 0.7233007018856564, + "learning_rate": 1.9984803824659504e-05, + "loss": 0.7181, + "step": 345 + }, + { + "epoch": 0.23521505376344087, + "grad_norm": 0.6435752626797113, + "learning_rate": 1.998139967159894e-05, + "loss": 0.7124, + "step": 350 + }, + { + "epoch": 0.2385752688172043, + "grad_norm": 0.6988189980904659, + "learning_rate": 1.997765211180264e-05, + "loss": 0.7138, + "step": 355 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 0.8449192477747994, + "learning_rate": 1.9973561274204153e-05, + "loss": 0.7164, + "step": 360 + }, + { + "epoch": 0.2452956989247312, + "grad_norm": 0.6199401123601755, + "learning_rate": 1.9969127299547387e-05, + "loss": 0.711, + "step": 365 + }, + { + "epoch": 0.24865591397849462, + "grad_norm": 0.6508335116554123, + "learning_rate": 1.9964350340381763e-05, + "loss": 0.7099, + "step": 370 + }, + { + "epoch": 0.25201612903225806, + "grad_norm": 0.7240178041828755, + "learning_rate": 1.995923056105697e-05, + "loss": 0.7108, + "step": 375 + }, + { + "epoch": 0.2553763440860215, + "grad_norm": 0.7146640916604361, + "learning_rate": 1.9953768137717323e-05, + "loss": 0.714, + "step": 380 + }, + { + "epoch": 0.25873655913978494, + "grad_norm": 0.7235799398485985, + "learning_rate": 1.9947963258295665e-05, + "loss": 0.7141, + "step": 385 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 0.7085278247346875, + "learning_rate": 1.9941816122506958e-05, + "loss": 0.7123, + "step": 390 + }, + { + "epoch": 0.2654569892473118, + "grad_norm": 0.8336879205562984, + "learning_rate": 1.9935326941841348e-05, + "loss": 0.7118, + "step": 395 + }, + { + "epoch": 0.26881720430107525, + "grad_norm": 0.6472707436647601, + "learning_rate": 1.9928495939556952e-05, + "loss": 0.7112, + "step": 400 + }, + { + "epoch": 0.2721774193548387, + "grad_norm": 0.5404676396301624, + "learning_rate": 1.9921323350672123e-05, + "loss": 0.7023, + "step": 405 + }, + { + "epoch": 0.27553763440860213, + "grad_norm": 0.5867567569524356, + "learning_rate": 1.9913809421957395e-05, + "loss": 0.7061, + "step": 410 + }, + { + "epoch": 0.27889784946236557, + "grad_norm": 0.5310275193870236, + "learning_rate": 1.9905954411926992e-05, + "loss": 0.7097, + "step": 415 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 0.5428907392759953, + "learning_rate": 1.9897758590829915e-05, + "loss": 0.7046, + "step": 420 + }, + { + "epoch": 0.2856182795698925, + "grad_norm": 0.6360587388135657, + "learning_rate": 1.988922224064067e-05, + "loss": 0.7059, + "step": 425 + }, + { + "epoch": 0.28897849462365593, + "grad_norm": 0.5538060039668187, + "learning_rate": 1.988034565504954e-05, + "loss": 0.7055, + "step": 430 + }, + { + "epoch": 0.2923387096774194, + "grad_norm": 0.7074777520721658, + "learning_rate": 1.9871129139452502e-05, + "loss": 0.7023, + "step": 435 + }, + { + "epoch": 0.2956989247311828, + "grad_norm": 0.6033039358656708, + "learning_rate": 1.986157301094071e-05, + "loss": 0.702, + "step": 440 + }, + { + "epoch": 0.29905913978494625, + "grad_norm": 0.7020301366143326, + "learning_rate": 1.9851677598289588e-05, + "loss": 0.7072, + "step": 445 + }, + { + "epoch": 0.3024193548387097, + "grad_norm": 0.6735214131146494, + "learning_rate": 1.9841443241947515e-05, + "loss": 0.7046, + "step": 450 + }, + { + "epoch": 0.3057795698924731, + "grad_norm": 0.5846703075665397, + "learning_rate": 1.983087029402411e-05, + "loss": 0.7004, + "step": 455 + }, + { + "epoch": 0.30913978494623656, + 
"grad_norm": 0.558983961781733, + "learning_rate": 1.9819959118278144e-05, + "loss": 0.7046, + "step": 460 + }, + { + "epoch": 0.3125, + "grad_norm": 0.6880041036412101, + "learning_rate": 1.9808710090104983e-05, + "loss": 0.7037, + "step": 465 + }, + { + "epoch": 0.31586021505376344, + "grad_norm": 0.622425101525314, + "learning_rate": 1.9797123596523692e-05, + "loss": 0.6984, + "step": 470 + }, + { + "epoch": 0.3192204301075269, + "grad_norm": 0.6341081550172605, + "learning_rate": 1.978520003616374e-05, + "loss": 0.6985, + "step": 475 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.659287705673367, + "learning_rate": 1.9772939819251247e-05, + "loss": 0.6954, + "step": 480 + }, + { + "epoch": 0.32594086021505375, + "grad_norm": 0.5715874934602246, + "learning_rate": 1.976034336759491e-05, + "loss": 0.6988, + "step": 485 + }, + { + "epoch": 0.3293010752688172, + "grad_norm": 0.8050941273557731, + "learning_rate": 1.9747411114571445e-05, + "loss": 0.6993, + "step": 490 + }, + { + "epoch": 0.3326612903225806, + "grad_norm": 0.6666660914014055, + "learning_rate": 1.973414350511072e-05, + "loss": 0.7018, + "step": 495 + }, + { + "epoch": 0.33602150537634407, + "grad_norm": 0.5898448667662453, + "learning_rate": 1.9720540995680428e-05, + "loss": 0.697, + "step": 500 + }, + { + "epoch": 0.3393817204301075, + "grad_norm": 0.7204191081412994, + "learning_rate": 1.9706604054270376e-05, + "loss": 0.6966, + "step": 505 + }, + { + "epoch": 0.34274193548387094, + "grad_norm": 0.6353645462526023, + "learning_rate": 1.9692333160376407e-05, + "loss": 0.693, + "step": 510 + }, + { + "epoch": 0.34610215053763443, + "grad_norm": 0.5663739741433981, + "learning_rate": 1.967772880498387e-05, + "loss": 0.6985, + "step": 515 + }, + { + "epoch": 0.34946236559139787, + "grad_norm": 0.541549492600514, + "learning_rate": 1.9662791490550755e-05, + "loss": 0.6933, + "step": 520 + }, + { + "epoch": 0.3528225806451613, + "grad_norm": 0.5766179836769963, + "learning_rate": 1.9647521730990408e-05, + "loss": 0.6996, + "step": 525 + }, + { + "epoch": 0.35618279569892475, + "grad_norm": 0.528070730439876, + "learning_rate": 1.9631920051653813e-05, + "loss": 0.6967, + "step": 530 + }, + { + "epoch": 0.3595430107526882, + "grad_norm": 0.590209921037558, + "learning_rate": 1.9615986989311567e-05, + "loss": 0.6956, + "step": 535 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 0.5291818662889545, + "learning_rate": 1.9599723092135376e-05, + "loss": 0.695, + "step": 540 + }, + { + "epoch": 0.36626344086021506, + "grad_norm": 0.4970212649639191, + "learning_rate": 1.9583128919679218e-05, + "loss": 0.6917, + "step": 545 + }, + { + "epoch": 0.3696236559139785, + "grad_norm": 0.6225127520386264, + "learning_rate": 1.956620504286007e-05, + "loss": 0.6935, + "step": 550 + }, + { + "epoch": 0.37298387096774194, + "grad_norm": 0.6214865530844041, + "learning_rate": 1.9548952043938286e-05, + "loss": 0.6894, + "step": 555 + }, + { + "epoch": 0.3763440860215054, + "grad_norm": 0.6009400925112226, + "learning_rate": 1.9531370516497562e-05, + "loss": 0.6888, + "step": 560 + }, + { + "epoch": 0.3797043010752688, + "grad_norm": 0.6054051937463671, + "learning_rate": 1.95134610654245e-05, + "loss": 0.6904, + "step": 565 + }, + { + "epoch": 0.38306451612903225, + "grad_norm": 0.5850716552121316, + "learning_rate": 1.9495224306887797e-05, + "loss": 0.6924, + "step": 570 + }, + { + "epoch": 0.3864247311827957, + "grad_norm": 0.5796507375628339, + "learning_rate": 1.9476660868317076e-05, + "loss": 0.69, + "step": 575 + }, + { + 
"epoch": 0.3897849462365591, + "grad_norm": 0.72632335015395, + "learning_rate": 1.945777138838126e-05, + "loss": 0.6877, + "step": 580 + }, + { + "epoch": 0.39314516129032256, + "grad_norm": 0.5476131150629957, + "learning_rate": 1.943855651696663e-05, + "loss": 0.6902, + "step": 585 + }, + { + "epoch": 0.396505376344086, + "grad_norm": 0.5140382069241621, + "learning_rate": 1.941901691515444e-05, + "loss": 0.6929, + "step": 590 + }, + { + "epoch": 0.39986559139784944, + "grad_norm": 0.4839340720292055, + "learning_rate": 1.9399153255198193e-05, + "loss": 0.694, + "step": 595 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 0.4677177786124492, + "learning_rate": 1.9378966220500503e-05, + "loss": 0.6952, + "step": 600 + }, + { + "epoch": 0.40658602150537637, + "grad_norm": 0.5207299081688349, + "learning_rate": 1.9358456505589586e-05, + "loss": 0.6932, + "step": 605 + }, + { + "epoch": 0.4099462365591398, + "grad_norm": 0.5396909713942541, + "learning_rate": 1.933762481609536e-05, + "loss": 0.684, + "step": 610 + }, + { + "epoch": 0.41330645161290325, + "grad_norm": 0.49142294486557386, + "learning_rate": 1.9316471868725167e-05, + "loss": 0.6948, + "step": 615 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.5408353090528122, + "learning_rate": 1.9294998391239133e-05, + "loss": 0.6858, + "step": 620 + }, + { + "epoch": 0.4200268817204301, + "grad_norm": 0.5233730262407599, + "learning_rate": 1.9273205122425108e-05, + "loss": 0.6887, + "step": 625 + }, + { + "epoch": 0.42338709677419356, + "grad_norm": 0.5715799297785072, + "learning_rate": 1.925109281207326e-05, + "loss": 0.6872, + "step": 630 + }, + { + "epoch": 0.426747311827957, + "grad_norm": 0.5328961169706027, + "learning_rate": 1.922866222095026e-05, + "loss": 0.6899, + "step": 635 + }, + { + "epoch": 0.43010752688172044, + "grad_norm": 0.4887114084575751, + "learning_rate": 1.9205914120773146e-05, + "loss": 0.6906, + "step": 640 + }, + { + "epoch": 0.4334677419354839, + "grad_norm": 0.5209483295062493, + "learning_rate": 1.9182849294182734e-05, + "loss": 0.6879, + "step": 645 + }, + { + "epoch": 0.4368279569892473, + "grad_norm": 0.5695163664455944, + "learning_rate": 1.915946853471671e-05, + "loss": 0.6888, + "step": 650 + }, + { + "epoch": 0.44018817204301075, + "grad_norm": 0.5383685605458758, + "learning_rate": 1.913577264678233e-05, + "loss": 0.6866, + "step": 655 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 0.5016268706035367, + "learning_rate": 1.9111762445628738e-05, + "loss": 0.6828, + "step": 660 + }, + { + "epoch": 0.4469086021505376, + "grad_norm": 0.49415639069338374, + "learning_rate": 1.908743875731891e-05, + "loss": 0.6902, + "step": 665 + }, + { + "epoch": 0.45026881720430106, + "grad_norm": 0.5339722588835791, + "learning_rate": 1.906280241870126e-05, + "loss": 0.685, + "step": 670 + }, + { + "epoch": 0.4536290322580645, + "grad_norm": 0.6037780346276747, + "learning_rate": 1.903785427738082e-05, + "loss": 0.6837, + "step": 675 + }, + { + "epoch": 0.45698924731182794, + "grad_norm": 0.5163759102323392, + "learning_rate": 1.9012595191690096e-05, + "loss": 0.6864, + "step": 680 + }, + { + "epoch": 0.4603494623655914, + "grad_norm": 0.5540520547432817, + "learning_rate": 1.8987026030659527e-05, + "loss": 0.6868, + "step": 685 + }, + { + "epoch": 0.4637096774193548, + "grad_norm": 0.5141253497235312, + "learning_rate": 1.8961147673987598e-05, + "loss": 0.685, + "step": 690 + }, + { + "epoch": 0.46706989247311825, + "grad_norm": 0.47087600726350604, + "learning_rate": 1.893496101201056e-05, + 
"loss": 0.6879, + "step": 695 + }, + { + "epoch": 0.47043010752688175, + "grad_norm": 0.4745813202969137, + "learning_rate": 1.8908466945671805e-05, + "loss": 0.6873, + "step": 700 + }, + { + "epoch": 0.4737903225806452, + "grad_norm": 0.5512208302577559, + "learning_rate": 1.8881666386490874e-05, + "loss": 0.6816, + "step": 705 + }, + { + "epoch": 0.4771505376344086, + "grad_norm": 0.49728239853688544, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.6839, + "step": 710 + }, + { + "epoch": 0.48051075268817206, + "grad_norm": 0.529373520548778, + "learning_rate": 1.882714948837286e-05, + "loss": 0.6824, + "step": 715 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.5220674061051042, + "learning_rate": 1.8799435025071515e-05, + "loss": 0.6786, + "step": 720 + }, + { + "epoch": 0.48723118279569894, + "grad_norm": 0.5130522433448756, + "learning_rate": 1.8771417820134963e-05, + "loss": 0.6822, + "step": 725 + }, + { + "epoch": 0.4905913978494624, + "grad_norm": 0.5290689359475228, + "learning_rate": 1.8743098837485813e-05, + "loss": 0.6865, + "step": 730 + }, + { + "epoch": 0.4939516129032258, + "grad_norm": 0.6064255629442339, + "learning_rate": 1.871447905142925e-05, + "loss": 0.6806, + "step": 735 + }, + { + "epoch": 0.49731182795698925, + "grad_norm": 0.5012647709893052, + "learning_rate": 1.868555944661949e-05, + "loss": 0.6835, + "step": 740 + }, + { + "epoch": 0.5006720430107527, + "grad_norm": 0.644030287593773, + "learning_rate": 1.865634101802592e-05, + "loss": 0.6833, + "step": 745 + }, + { + "epoch": 0.5040322580645161, + "grad_norm": 0.5147725717983542, + "learning_rate": 1.8626824770898856e-05, + "loss": 0.6869, + "step": 750 + }, + { + "epoch": 0.5073924731182796, + "grad_norm": 0.6476710537127688, + "learning_rate": 1.859701172073496e-05, + "loss": 0.6798, + "step": 755 + }, + { + "epoch": 0.510752688172043, + "grad_norm": 0.5471024984874042, + "learning_rate": 1.856690289324231e-05, + "loss": 0.6809, + "step": 760 + }, + { + "epoch": 0.5141129032258065, + "grad_norm": 0.5523881151273712, + "learning_rate": 1.8536499324305102e-05, + "loss": 0.6818, + "step": 765 + }, + { + "epoch": 0.5174731182795699, + "grad_norm": 0.5050686616013327, + "learning_rate": 1.8505802059948012e-05, + "loss": 0.6843, + "step": 770 + }, + { + "epoch": 0.5208333333333334, + "grad_norm": 0.5438632142417479, + "learning_rate": 1.847481215630021e-05, + "loss": 0.685, + "step": 775 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 0.4941593674742346, + "learning_rate": 1.844353067955902e-05, + "loss": 0.6808, + "step": 780 + }, + { + "epoch": 0.5275537634408602, + "grad_norm": 0.4754694994792113, + "learning_rate": 1.8411958705953248e-05, + "loss": 0.6816, + "step": 785 + }, + { + "epoch": 0.5309139784946236, + "grad_norm": 0.4805660656514537, + "learning_rate": 1.838009732170614e-05, + "loss": 0.6796, + "step": 790 + }, + { + "epoch": 0.5342741935483871, + "grad_norm": 0.44882280902701865, + "learning_rate": 1.834794762299803e-05, + "loss": 0.6835, + "step": 795 + }, + { + "epoch": 0.5376344086021505, + "grad_norm": 0.4955511641250313, + "learning_rate": 1.8315510715928607e-05, + "loss": 0.6847, + "step": 800 + }, + { + "epoch": 0.540994623655914, + "grad_norm": 0.4852311864762166, + "learning_rate": 1.828278771647887e-05, + "loss": 0.681, + "step": 805 + }, + { + "epoch": 0.5443548387096774, + "grad_norm": 0.5005675657439985, + "learning_rate": 1.8249779750472725e-05, + "loss": 0.6832, + "step": 810 + }, + { + "epoch": 0.5477150537634409, + "grad_norm": 0.5102495993533148, + 
"learning_rate": 1.821648795353827e-05, + "loss": 0.6817, + "step": 815 + }, + { + "epoch": 0.5510752688172043, + "grad_norm": 0.4943223187596057, + "learning_rate": 1.81829134710687e-05, + "loss": 0.6791, + "step": 820 + }, + { + "epoch": 0.5544354838709677, + "grad_norm": 0.542801548624785, + "learning_rate": 1.8149057458182924e-05, + "loss": 0.6832, + "step": 825 + }, + { + "epoch": 0.5577956989247311, + "grad_norm": 0.45148280962815884, + "learning_rate": 1.81149210796858e-05, + "loss": 0.6784, + "step": 830 + }, + { + "epoch": 0.5611559139784946, + "grad_norm": 0.4566100659521107, + "learning_rate": 1.8080505510028073e-05, + "loss": 0.6811, + "step": 835 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.45139497231587655, + "learning_rate": 1.8045811933265973e-05, + "loss": 0.6782, + "step": 840 + }, + { + "epoch": 0.5678763440860215, + "grad_norm": 0.5039358231725258, + "learning_rate": 1.8010841543020472e-05, + "loss": 0.6776, + "step": 845 + }, + { + "epoch": 0.571236559139785, + "grad_norm": 0.44244553735827763, + "learning_rate": 1.7975595542436207e-05, + "loss": 0.6795, + "step": 850 + }, + { + "epoch": 0.5745967741935484, + "grad_norm": 0.513649210570403, + "learning_rate": 1.794007514414011e-05, + "loss": 0.6777, + "step": 855 + }, + { + "epoch": 0.5779569892473119, + "grad_norm": 0.4779895963476636, + "learning_rate": 1.790428157019967e-05, + "loss": 0.6807, + "step": 860 + }, + { + "epoch": 0.5813172043010753, + "grad_norm": 0.5317771601563221, + "learning_rate": 1.7868216052080898e-05, + "loss": 0.6774, + "step": 865 + }, + { + "epoch": 0.5846774193548387, + "grad_norm": 0.4387347378891155, + "learning_rate": 1.783187983060594e-05, + "loss": 0.6771, + "step": 870 + }, + { + "epoch": 0.5880376344086021, + "grad_norm": 0.47208773953042626, + "learning_rate": 1.7795274155910408e-05, + "loss": 0.6795, + "step": 875 + }, + { + "epoch": 0.5913978494623656, + "grad_norm": 0.4919880987398928, + "learning_rate": 1.7758400287400372e-05, + "loss": 0.6764, + "step": 880 + }, + { + "epoch": 0.594758064516129, + "grad_norm": 0.4746143436746516, + "learning_rate": 1.772125949370901e-05, + "loss": 0.6777, + "step": 885 + }, + { + "epoch": 0.5981182795698925, + "grad_norm": 0.4866530917728411, + "learning_rate": 1.7683853052652974e-05, + "loss": 0.6778, + "step": 890 + }, + { + "epoch": 0.6014784946236559, + "grad_norm": 0.45607894178261754, + "learning_rate": 1.764618225118843e-05, + "loss": 0.675, + "step": 895 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 0.4623134720756107, + "learning_rate": 1.7608248385366774e-05, + "loss": 0.6724, + "step": 900 + }, + { + "epoch": 0.6081989247311828, + "grad_norm": 0.41997040926221346, + "learning_rate": 1.7570052760290036e-05, + "loss": 0.6749, + "step": 905 + }, + { + "epoch": 0.6115591397849462, + "grad_norm": 0.4352461962298591, + "learning_rate": 1.7531596690065998e-05, + "loss": 0.673, + "step": 910 + }, + { + "epoch": 0.6149193548387096, + "grad_norm": 0.47410360513736394, + "learning_rate": 1.7492881497762958e-05, + "loss": 0.6758, + "step": 915 + }, + { + "epoch": 0.6182795698924731, + "grad_norm": 0.45269850953936747, + "learning_rate": 1.7453908515364238e-05, + "loss": 0.6802, + "step": 920 + }, + { + "epoch": 0.6216397849462365, + "grad_norm": 0.46524887071623716, + "learning_rate": 1.741467908372233e-05, + "loss": 0.674, + "step": 925 + }, + { + "epoch": 0.625, + "grad_norm": 0.4739090399223664, + "learning_rate": 1.7375194552512777e-05, + "loss": 0.6743, + "step": 930 + }, + { + "epoch": 0.6283602150537635, + 
"grad_norm": 0.50057753354274, + "learning_rate": 1.7335456280187752e-05, + "loss": 0.678, + "step": 935 + }, + { + "epoch": 0.6317204301075269, + "grad_norm": 0.4342277157697679, + "learning_rate": 1.729546563392929e-05, + "loss": 0.6766, + "step": 940 + }, + { + "epoch": 0.6350806451612904, + "grad_norm": 0.5421833972178155, + "learning_rate": 1.7255223989602277e-05, + "loss": 0.6725, + "step": 945 + }, + { + "epoch": 0.6384408602150538, + "grad_norm": 0.5180088295892716, + "learning_rate": 1.72147327317071e-05, + "loss": 0.6766, + "step": 950 + }, + { + "epoch": 0.6418010752688172, + "grad_norm": 0.4835576817093598, + "learning_rate": 1.7173993253332016e-05, + "loss": 0.6754, + "step": 955 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.4566946543115902, + "learning_rate": 1.7133006956105237e-05, + "loss": 0.6751, + "step": 960 + }, + { + "epoch": 0.6485215053763441, + "grad_norm": 0.43744247420203414, + "learning_rate": 1.7091775250146678e-05, + "loss": 0.6708, + "step": 965 + }, + { + "epoch": 0.6518817204301075, + "grad_norm": 0.4237915807278061, + "learning_rate": 1.7050299554019466e-05, + "loss": 0.6702, + "step": 970 + }, + { + "epoch": 0.655241935483871, + "grad_norm": 0.4159411791640258, + "learning_rate": 1.700858129468114e-05, + "loss": 0.6738, + "step": 975 + }, + { + "epoch": 0.6586021505376344, + "grad_norm": 0.47229925238109904, + "learning_rate": 1.696662190743453e-05, + "loss": 0.6731, + "step": 980 + }, + { + "epoch": 0.6619623655913979, + "grad_norm": 0.4357051029703012, + "learning_rate": 1.69244228358784e-05, + "loss": 0.6732, + "step": 985 + }, + { + "epoch": 0.6653225806451613, + "grad_norm": 0.47900841918884185, + "learning_rate": 1.688198553185777e-05, + "loss": 0.6742, + "step": 990 + }, + { + "epoch": 0.6686827956989247, + "grad_norm": 0.529861867014197, + "learning_rate": 1.683931145541397e-05, + "loss": 0.6753, + "step": 995 + }, + { + "epoch": 0.6720430107526881, + "grad_norm": 0.49314598912759483, + "learning_rate": 1.6796402074734404e-05, + "loss": 0.6694, + "step": 1000 + }, + { + "epoch": 0.6754032258064516, + "grad_norm": 0.47658141232504353, + "learning_rate": 1.6753258866102047e-05, + "loss": 0.6733, + "step": 1005 + }, + { + "epoch": 0.678763440860215, + "grad_norm": 0.4555180579717891, + "learning_rate": 1.6709883313844634e-05, + "loss": 0.6771, + "step": 1010 + }, + { + "epoch": 0.6821236559139785, + "grad_norm": 0.4804224902104754, + "learning_rate": 1.6666276910283623e-05, + "loss": 0.6754, + "step": 1015 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 0.4676965423125691, + "learning_rate": 1.662244115568282e-05, + "loss": 0.6697, + "step": 1020 + }, + { + "epoch": 0.6888440860215054, + "grad_norm": 0.4309666314092742, + "learning_rate": 1.657837755819678e-05, + "loss": 0.6682, + "step": 1025 + }, + { + "epoch": 0.6922043010752689, + "grad_norm": 0.4526083077842183, + "learning_rate": 1.6534087633818914e-05, + "loss": 0.6738, + "step": 1030 + }, + { + "epoch": 0.6955645161290323, + "grad_norm": 0.42731232075306474, + "learning_rate": 1.6489572906329345e-05, + "loss": 0.6699, + "step": 1035 + }, + { + "epoch": 0.6989247311827957, + "grad_norm": 0.4584418364845978, + "learning_rate": 1.644483490724247e-05, + "loss": 0.671, + "step": 1040 + }, + { + "epoch": 0.7022849462365591, + "grad_norm": 0.45883989932174113, + "learning_rate": 1.6399875175754258e-05, + "loss": 0.6696, + "step": 1045 + }, + { + "epoch": 0.7056451612903226, + "grad_norm": 0.44771182993617314, + "learning_rate": 1.635469525868932e-05, + "loss": 0.6721, + "step": 
1050 + }, + { + "epoch": 0.709005376344086, + "grad_norm": 0.4588720172063543, + "learning_rate": 1.6309296710447674e-05, + "loss": 0.6697, + "step": 1055 + }, + { + "epoch": 0.7123655913978495, + "grad_norm": 0.45001838732164606, + "learning_rate": 1.626368109295128e-05, + "loss": 0.667, + "step": 1060 + }, + { + "epoch": 0.7157258064516129, + "grad_norm": 0.42641910570356634, + "learning_rate": 1.6217849975590275e-05, + "loss": 0.6708, + "step": 1065 + }, + { + "epoch": 0.7190860215053764, + "grad_norm": 0.422498792435904, + "learning_rate": 1.617180493516901e-05, + "loss": 0.6711, + "step": 1070 + }, + { + "epoch": 0.7224462365591398, + "grad_norm": 0.4415726107421213, + "learning_rate": 1.6125547555851787e-05, + "loss": 0.6685, + "step": 1075 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 0.4534706255885532, + "learning_rate": 1.6079079429108357e-05, + "loss": 0.6696, + "step": 1080 + }, + { + "epoch": 0.7291666666666666, + "grad_norm": 0.4029176160192046, + "learning_rate": 1.6032402153659164e-05, + "loss": 0.6726, + "step": 1085 + }, + { + "epoch": 0.7325268817204301, + "grad_norm": 0.42621354076663304, + "learning_rate": 1.5985517335420346e-05, + "loss": 0.6666, + "step": 1090 + }, + { + "epoch": 0.7358870967741935, + "grad_norm": 0.44234432560741715, + "learning_rate": 1.5938426587448473e-05, + "loss": 0.6741, + "step": 1095 + }, + { + "epoch": 0.739247311827957, + "grad_norm": 0.4473272678937478, + "learning_rate": 1.589113152988507e-05, + "loss": 0.6709, + "step": 1100 + }, + { + "epoch": 0.7426075268817204, + "grad_norm": 0.44595646225862023, + "learning_rate": 1.5843633789900862e-05, + "loss": 0.6712, + "step": 1105 + }, + { + "epoch": 0.7459677419354839, + "grad_norm": 0.4324555524707172, + "learning_rate": 1.5795935001639796e-05, + "loss": 0.6658, + "step": 1110 + }, + { + "epoch": 0.7493279569892473, + "grad_norm": 0.46394662933500824, + "learning_rate": 1.5748036806162816e-05, + "loss": 0.6684, + "step": 1115 + }, + { + "epoch": 0.7526881720430108, + "grad_norm": 0.4947844886498111, + "learning_rate": 1.56999408513914e-05, + "loss": 0.6722, + "step": 1120 + }, + { + "epoch": 0.7560483870967742, + "grad_norm": 0.4250235043595764, + "learning_rate": 1.5651648792050886e-05, + "loss": 0.6686, + "step": 1125 + }, + { + "epoch": 0.7594086021505376, + "grad_norm": 0.4159591266629867, + "learning_rate": 1.5603162289613503e-05, + "loss": 0.669, + "step": 1130 + }, + { + "epoch": 0.7627688172043011, + "grad_norm": 0.4095733043870104, + "learning_rate": 1.555448301224124e-05, + "loss": 0.6692, + "step": 1135 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 0.43896150432589165, + "learning_rate": 1.550561263472845e-05, + "loss": 0.6657, + "step": 1140 + }, + { + "epoch": 0.769489247311828, + "grad_norm": 0.47253071219496107, + "learning_rate": 1.5456552838444215e-05, + "loss": 0.6695, + "step": 1145 + }, + { + "epoch": 0.7728494623655914, + "grad_norm": 0.43201524742454317, + "learning_rate": 1.5407305311274502e-05, + "loss": 0.6639, + "step": 1150 + }, + { + "epoch": 0.7762096774193549, + "grad_norm": 0.46875979315386157, + "learning_rate": 1.5357871747564108e-05, + "loss": 0.6696, + "step": 1155 + }, + { + "epoch": 0.7795698924731183, + "grad_norm": 0.4458966308423493, + "learning_rate": 1.530825384805835e-05, + "loss": 0.6671, + "step": 1160 + }, + { + "epoch": 0.7829301075268817, + "grad_norm": 0.4045923602073338, + "learning_rate": 1.5258453319844555e-05, + "loss": 0.6679, + "step": 1165 + }, + { + "epoch": 0.7862903225806451, + "grad_norm": 0.46081178192946914, + 
"learning_rate": 1.5208471876293334e-05, + "loss": 0.6653, + "step": 1170 + }, + { + "epoch": 0.7896505376344086, + "grad_norm": 0.4412474732597341, + "learning_rate": 1.5158311236999622e-05, + "loss": 0.6696, + "step": 1175 + }, + { + "epoch": 0.793010752688172, + "grad_norm": 0.4561523660741418, + "learning_rate": 1.5107973127723537e-05, + "loss": 0.6662, + "step": 1180 + }, + { + "epoch": 0.7963709677419355, + "grad_norm": 0.45370412836150925, + "learning_rate": 1.5057459280330984e-05, + "loss": 0.6668, + "step": 1185 + }, + { + "epoch": 0.7997311827956989, + "grad_norm": 0.4641379862022367, + "learning_rate": 1.500677143273408e-05, + "loss": 0.6618, + "step": 1190 + }, + { + "epoch": 0.8030913978494624, + "grad_norm": 0.4228354205291783, + "learning_rate": 1.4955911328831357e-05, + "loss": 0.6689, + "step": 1195 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.45194187006117376, + "learning_rate": 1.4904880718447781e-05, + "loss": 0.6654, + "step": 1200 + }, + { + "epoch": 0.8098118279569892, + "grad_norm": 0.41370408468731323, + "learning_rate": 1.4853681357274522e-05, + "loss": 0.6656, + "step": 1205 + }, + { + "epoch": 0.8131720430107527, + "grad_norm": 0.39658836438427153, + "learning_rate": 1.4802315006808578e-05, + "loss": 0.6642, + "step": 1210 + }, + { + "epoch": 0.8165322580645161, + "grad_norm": 0.39180055538376923, + "learning_rate": 1.4750783434292147e-05, + "loss": 0.6669, + "step": 1215 + }, + { + "epoch": 0.8198924731182796, + "grad_norm": 0.4140385959248494, + "learning_rate": 1.4699088412651841e-05, + "loss": 0.6677, + "step": 1220 + }, + { + "epoch": 0.823252688172043, + "grad_norm": 0.4135853126184148, + "learning_rate": 1.4647231720437687e-05, + "loss": 0.6691, + "step": 1225 + }, + { + "epoch": 0.8266129032258065, + "grad_norm": 0.3886901463807677, + "learning_rate": 1.4595215141761934e-05, + "loss": 0.6612, + "step": 1230 + }, + { + "epoch": 0.8299731182795699, + "grad_norm": 0.427261482590677, + "learning_rate": 1.4543040466237662e-05, + "loss": 0.6635, + "step": 1235 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.41557585081944015, + "learning_rate": 1.4490709488917239e-05, + "loss": 0.6642, + "step": 1240 + }, + { + "epoch": 0.8366935483870968, + "grad_norm": 0.41396481934719226, + "learning_rate": 1.4438224010230526e-05, + "loss": 0.6642, + "step": 1245 + }, + { + "epoch": 0.8400537634408602, + "grad_norm": 0.4000957409618339, + "learning_rate": 1.4385585835922962e-05, + "loss": 0.6679, + "step": 1250 + }, + { + "epoch": 0.8434139784946236, + "grad_norm": 0.5093831922663081, + "learning_rate": 1.4332796776993423e-05, + "loss": 0.6665, + "step": 1255 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 0.39777324014388976, + "learning_rate": 1.427985864963193e-05, + "loss": 0.6608, + "step": 1260 + }, + { + "epoch": 0.8501344086021505, + "grad_norm": 0.4123671465949071, + "learning_rate": 1.4226773275157142e-05, + "loss": 0.6679, + "step": 1265 + }, + { + "epoch": 0.853494623655914, + "grad_norm": 0.4178878559263764, + "learning_rate": 1.4173542479953712e-05, + "loss": 0.6629, + "step": 1270 + }, + { + "epoch": 0.8568548387096774, + "grad_norm": 0.4151008943507023, + "learning_rate": 1.412016809540944e-05, + "loss": 0.6708, + "step": 1275 + }, + { + "epoch": 0.8602150537634409, + "grad_norm": 0.39420851761758596, + "learning_rate": 1.406665195785228e-05, + "loss": 0.66, + "step": 1280 + }, + { + "epoch": 0.8635752688172043, + "grad_norm": 0.39995681473664046, + "learning_rate": 1.401299590848714e-05, + "loss": 0.6631, + "step": 1285 + }, + { 
+ "epoch": 0.8669354838709677, + "grad_norm": 0.43024431364313487, + "learning_rate": 1.3959201793332554e-05, + "loss": 0.6648, + "step": 1290 + }, + { + "epoch": 0.8702956989247311, + "grad_norm": 0.3823413783572042, + "learning_rate": 1.3905271463157153e-05, + "loss": 0.6668, + "step": 1295 + }, + { + "epoch": 0.8736559139784946, + "grad_norm": 0.4168656965051171, + "learning_rate": 1.385120677341602e-05, + "loss": 0.6584, + "step": 1300 + }, + { + "epoch": 0.8770161290322581, + "grad_norm": 0.3967459217329594, + "learning_rate": 1.3797009584186818e-05, + "loss": 0.6627, + "step": 1305 + }, + { + "epoch": 0.8803763440860215, + "grad_norm": 0.4090750420770741, + "learning_rate": 1.3742681760105814e-05, + "loss": 0.6616, + "step": 1310 + }, + { + "epoch": 0.883736559139785, + "grad_norm": 0.4175005433442616, + "learning_rate": 1.3688225170303727e-05, + "loss": 0.6668, + "step": 1315 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.425478340178616, + "learning_rate": 1.3633641688341413e-05, + "loss": 0.6613, + "step": 1320 + }, + { + "epoch": 0.8904569892473119, + "grad_norm": 0.39081095331783994, + "learning_rate": 1.3578933192145421e-05, + "loss": 0.6631, + "step": 1325 + }, + { + "epoch": 0.8938172043010753, + "grad_norm": 0.460628551115726, + "learning_rate": 1.3524101563943356e-05, + "loss": 0.6617, + "step": 1330 + }, + { + "epoch": 0.8971774193548387, + "grad_norm": 0.4558498194690748, + "learning_rate": 1.3469148690199157e-05, + "loss": 0.665, + "step": 1335 + }, + { + "epoch": 0.9005376344086021, + "grad_norm": 0.4182583525353485, + "learning_rate": 1.3414076461548162e-05, + "loss": 0.662, + "step": 1340 + }, + { + "epoch": 0.9038978494623656, + "grad_norm": 0.4438115831427305, + "learning_rate": 1.3358886772732085e-05, + "loss": 0.6638, + "step": 1345 + }, + { + "epoch": 0.907258064516129, + "grad_norm": 0.45301099015215857, + "learning_rate": 1.3303581522533806e-05, + "loss": 0.6602, + "step": 1350 + }, + { + "epoch": 0.9106182795698925, + "grad_norm": 0.4353757710607073, + "learning_rate": 1.3248162613712066e-05, + "loss": 0.6614, + "step": 1355 + }, + { + "epoch": 0.9139784946236559, + "grad_norm": 0.3966488654029928, + "learning_rate": 1.319263195293599e-05, + "loss": 0.661, + "step": 1360 + }, + { + "epoch": 0.9173387096774194, + "grad_norm": 0.43451849890336813, + "learning_rate": 1.3136991450719493e-05, + "loss": 0.6632, + "step": 1365 + }, + { + "epoch": 0.9206989247311828, + "grad_norm": 0.4331957813206352, + "learning_rate": 1.3081243021355542e-05, + "loss": 0.6618, + "step": 1370 + }, + { + "epoch": 0.9240591397849462, + "grad_norm": 0.42461868845233375, + "learning_rate": 1.3025388582850311e-05, + "loss": 0.661, + "step": 1375 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 0.4270005545440327, + "learning_rate": 1.2969430056857177e-05, + "loss": 0.6619, + "step": 1380 + }, + { + "epoch": 0.9307795698924731, + "grad_norm": 0.40199632811605207, + "learning_rate": 1.2913369368610618e-05, + "loss": 0.6625, + "step": 1385 + }, + { + "epoch": 0.9341397849462365, + "grad_norm": 0.40238063432322, + "learning_rate": 1.285720844685996e-05, + "loss": 0.6619, + "step": 1390 + }, + { + "epoch": 0.9375, + "grad_norm": 0.4063919100845298, + "learning_rate": 1.2800949223803039e-05, + "loss": 0.6593, + "step": 1395 + }, + { + "epoch": 0.9408602150537635, + "grad_norm": 0.42658950759070957, + "learning_rate": 1.274459363501971e-05, + "loss": 0.6663, + "step": 1400 + }, + { + "epoch": 0.9442204301075269, + "grad_norm": 0.40879880761812026, + "learning_rate": 
1.2688143619405266e-05, + "loss": 0.6626, + "step": 1405 + }, + { + "epoch": 0.9475806451612904, + "grad_norm": 0.4136553833868636, + "learning_rate": 1.2631601119103714e-05, + "loss": 0.6595, + "step": 1410 + }, + { + "epoch": 0.9509408602150538, + "grad_norm": 0.4566408970103592, + "learning_rate": 1.2574968079440969e-05, + "loss": 0.6612, + "step": 1415 + }, + { + "epoch": 0.9543010752688172, + "grad_norm": 0.4707155898476047, + "learning_rate": 1.251824644885792e-05, + "loss": 0.661, + "step": 1420 + }, + { + "epoch": 0.9576612903225806, + "grad_norm": 0.44843086084440054, + "learning_rate": 1.2461438178843409e-05, + "loss": 0.6603, + "step": 1425 + }, + { + "epoch": 0.9610215053763441, + "grad_norm": 0.3887174931118394, + "learning_rate": 1.2404545223867067e-05, + "loss": 0.6648, + "step": 1430 + }, + { + "epoch": 0.9643817204301075, + "grad_norm": 0.3844627071853656, + "learning_rate": 1.2347569541312086e-05, + "loss": 0.661, + "step": 1435 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.38712969083554144, + "learning_rate": 1.2290513091407871e-05, + "loss": 0.6626, + "step": 1440 + }, + { + "epoch": 0.9711021505376344, + "grad_norm": 0.3765260995061332, + "learning_rate": 1.223337783716261e-05, + "loss": 0.658, + "step": 1445 + }, + { + "epoch": 0.9744623655913979, + "grad_norm": 0.38880975825103575, + "learning_rate": 1.2176165744295716e-05, + "loss": 0.6625, + "step": 1450 + }, + { + "epoch": 0.9778225806451613, + "grad_norm": 0.38891299522851835, + "learning_rate": 1.2118878781170213e-05, + "loss": 0.6571, + "step": 1455 + }, + { + "epoch": 0.9811827956989247, + "grad_norm": 0.3883707422936783, + "learning_rate": 1.2061518918725019e-05, + "loss": 0.6554, + "step": 1460 + }, + { + "epoch": 0.9845430107526881, + "grad_norm": 0.4326330855587226, + "learning_rate": 1.2004088130407114e-05, + "loss": 0.6617, + "step": 1465 + }, + { + "epoch": 0.9879032258064516, + "grad_norm": 0.40177893100437934, + "learning_rate": 1.1946588392103678e-05, + "loss": 0.6601, + "step": 1470 + }, + { + "epoch": 0.991263440860215, + "grad_norm": 0.4069182709848244, + "learning_rate": 1.1889021682074074e-05, + "loss": 0.6578, + "step": 1475 + }, + { + "epoch": 0.9946236559139785, + "grad_norm": 0.371562924814927, + "learning_rate": 1.1831389980881815e-05, + "loss": 0.6595, + "step": 1480 + }, + { + "epoch": 0.9979838709677419, + "grad_norm": 0.40260684742734143, + "learning_rate": 1.1773695271326413e-05, + "loss": 0.6598, + "step": 1485 + }, + { + "epoch": 1.0, + "eval_loss": 0.6596925854682922, + "eval_runtime": 38.4699, + "eval_samples_per_second": 75.8, + "eval_steps_per_second": 1.196, + "step": 1488 + }, + { + "epoch": 1.0013440860215055, + "grad_norm": 0.43923520164769597, + "learning_rate": 1.1715939538375159e-05, + "loss": 0.6515, + "step": 1490 + }, + { + "epoch": 1.0047043010752688, + "grad_norm": 0.4093115648975466, + "learning_rate": 1.1658124769094834e-05, + "loss": 0.6335, + "step": 1495 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 0.44488404164651246, + "learning_rate": 1.160025295258335e-05, + "loss": 0.6351, + "step": 1500 + }, + { + "epoch": 1.0114247311827957, + "grad_norm": 0.41541519893053236, + "learning_rate": 1.1542326079901296e-05, + "loss": 0.6363, + "step": 1505 + }, + { + "epoch": 1.0147849462365592, + "grad_norm": 0.4120083737414266, + "learning_rate": 1.1484346144003467e-05, + "loss": 0.629, + "step": 1510 + }, + { + "epoch": 1.0181451612903225, + "grad_norm": 0.4134775783414065, + "learning_rate": 1.1426315139670268e-05, + "loss": 0.6358, + "step": 1515 + }, 
+ { + "epoch": 1.021505376344086, + "grad_norm": 0.414634455881357, + "learning_rate": 1.1368235063439103e-05, + "loss": 0.6316, + "step": 1520 + }, + { + "epoch": 1.0248655913978495, + "grad_norm": 0.4254446985228589, + "learning_rate": 1.1310107913535677e-05, + "loss": 0.6323, + "step": 1525 + }, + { + "epoch": 1.028225806451613, + "grad_norm": 0.4034820728921977, + "learning_rate": 1.1251935689805249e-05, + "loss": 0.6348, + "step": 1530 + }, + { + "epoch": 1.0315860215053763, + "grad_norm": 0.41791569897419, + "learning_rate": 1.1193720393643826e-05, + "loss": 0.6319, + "step": 1535 + }, + { + "epoch": 1.0349462365591398, + "grad_norm": 0.41814029821287946, + "learning_rate": 1.1135464027929306e-05, + "loss": 0.6384, + "step": 1540 + }, + { + "epoch": 1.0383064516129032, + "grad_norm": 0.42081017954332256, + "learning_rate": 1.1077168596952579e-05, + "loss": 0.6378, + "step": 1545 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.39427585600241666, + "learning_rate": 1.1018836106348558e-05, + "loss": 0.6308, + "step": 1550 + }, + { + "epoch": 1.04502688172043, + "grad_norm": 0.41657992186432496, + "learning_rate": 1.0960468563027178e-05, + "loss": 0.6359, + "step": 1555 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 0.42812446527859066, + "learning_rate": 1.0902067975104354e-05, + "loss": 0.6375, + "step": 1560 + }, + { + "epoch": 1.051747311827957, + "grad_norm": 0.4250184457384185, + "learning_rate": 1.0843636351832888e-05, + "loss": 0.6335, + "step": 1565 + }, + { + "epoch": 1.0551075268817205, + "grad_norm": 0.4039339824459457, + "learning_rate": 1.0785175703533341e-05, + "loss": 0.6354, + "step": 1570 + }, + { + "epoch": 1.0584677419354838, + "grad_norm": 0.4350937331558841, + "learning_rate": 1.072668804152488e-05, + "loss": 0.6346, + "step": 1575 + }, + { + "epoch": 1.0618279569892473, + "grad_norm": 0.43413177405165077, + "learning_rate": 1.0668175378056053e-05, + "loss": 0.6328, + "step": 1580 + }, + { + "epoch": 1.0651881720430108, + "grad_norm": 0.4543315336438448, + "learning_rate": 1.0609639726235592e-05, + "loss": 0.6291, + "step": 1585 + }, + { + "epoch": 1.0685483870967742, + "grad_norm": 0.4639621657382721, + "learning_rate": 1.0551083099963125e-05, + "loss": 0.6347, + "step": 1590 + }, + { + "epoch": 1.0719086021505377, + "grad_norm": 0.38636625101348104, + "learning_rate": 1.0492507513859904e-05, + "loss": 0.6329, + "step": 1595 + }, + { + "epoch": 1.075268817204301, + "grad_norm": 0.43112374827834826, + "learning_rate": 1.043391498319948e-05, + "loss": 0.6334, + "step": 1600 + }, + { + "epoch": 1.0786290322580645, + "grad_norm": 0.4083117563136851, + "learning_rate": 1.037530752383839e-05, + "loss": 0.6312, + "step": 1605 + }, + { + "epoch": 1.081989247311828, + "grad_norm": 0.4092678104422559, + "learning_rate": 1.0316687152146774e-05, + "loss": 0.6341, + "step": 1610 + }, + { + "epoch": 1.0853494623655915, + "grad_norm": 0.4236890040624566, + "learning_rate": 1.0258055884939023e-05, + "loss": 0.6342, + "step": 1615 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 0.4129609373942744, + "learning_rate": 1.0199415739404381e-05, + "loss": 0.6337, + "step": 1620 + }, + { + "epoch": 1.0920698924731183, + "grad_norm": 0.45620399681821466, + "learning_rate": 1.0140768733037558e-05, + "loss": 0.6355, + "step": 1625 + }, + { + "epoch": 1.0954301075268817, + "grad_norm": 0.3996898071123565, + "learning_rate": 1.0082116883569295e-05, + "loss": 0.6353, + "step": 1630 + }, + { + "epoch": 1.0987903225806452, + "grad_norm": 0.42181670497621626, + 
"learning_rate": 1.0023462208896967e-05, + "loss": 0.6368, + "step": 1635 + }, + { + "epoch": 1.1021505376344085, + "grad_norm": 0.4170971985416292, + "learning_rate": 9.964806727015144e-06, + "loss": 0.6369, + "step": 1640 + }, + { + "epoch": 1.105510752688172, + "grad_norm": 0.3850003704964035, + "learning_rate": 9.906152455946176e-06, + "loss": 0.6316, + "step": 1645 + }, + { + "epoch": 1.1088709677419355, + "grad_norm": 0.3959138602164967, + "learning_rate": 9.847501413670742e-06, + "loss": 0.6329, + "step": 1650 + }, + { + "epoch": 1.112231182795699, + "grad_norm": 0.38653649392484757, + "learning_rate": 9.788855618058446e-06, + "loss": 0.6315, + "step": 1655 + }, + { + "epoch": 1.1155913978494623, + "grad_norm": 0.434626659914302, + "learning_rate": 9.730217086798387e-06, + "loss": 0.6329, + "step": 1660 + }, + { + "epoch": 1.1189516129032258, + "grad_norm": 0.4043062586269176, + "learning_rate": 9.671587837329717e-06, + "loss": 0.6314, + "step": 1665 + }, + { + "epoch": 1.1223118279569892, + "grad_norm": 0.3912022835272916, + "learning_rate": 9.612969886772272e-06, + "loss": 0.6347, + "step": 1670 + }, + { + "epoch": 1.1256720430107527, + "grad_norm": 0.3945451917100905, + "learning_rate": 9.554365251857141e-06, + "loss": 0.6334, + "step": 1675 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.39115203980879226, + "learning_rate": 9.495775948857292e-06, + "loss": 0.631, + "step": 1680 + }, + { + "epoch": 1.1323924731182795, + "grad_norm": 0.3899923876549679, + "learning_rate": 9.437203993518214e-06, + "loss": 0.6304, + "step": 1685 + }, + { + "epoch": 1.135752688172043, + "grad_norm": 0.4002252410564929, + "learning_rate": 9.37865140098854e-06, + "loss": 0.6313, + "step": 1690 + }, + { + "epoch": 1.1391129032258065, + "grad_norm": 0.43477646973135253, + "learning_rate": 9.320120185750745e-06, + "loss": 0.6319, + "step": 1695 + }, + { + "epoch": 1.14247311827957, + "grad_norm": 0.4090247302348437, + "learning_rate": 9.261612361551827e-06, + "loss": 0.6317, + "step": 1700 + }, + { + "epoch": 1.1458333333333333, + "grad_norm": 0.45307609961969436, + "learning_rate": 9.203129941334004e-06, + "loss": 0.6333, + "step": 1705 + }, + { + "epoch": 1.1491935483870968, + "grad_norm": 0.41729176695078596, + "learning_rate": 9.144674937165503e-06, + "loss": 0.6351, + "step": 1710 + }, + { + "epoch": 1.1525537634408602, + "grad_norm": 0.40602215785080276, + "learning_rate": 9.086249360171291e-06, + "loss": 0.6343, + "step": 1715 + }, + { + "epoch": 1.1559139784946237, + "grad_norm": 0.45074003009366553, + "learning_rate": 9.027855220463915e-06, + "loss": 0.6335, + "step": 1720 + }, + { + "epoch": 1.159274193548387, + "grad_norm": 0.4134807886527817, + "learning_rate": 8.969494527074333e-06, + "loss": 0.6324, + "step": 1725 + }, + { + "epoch": 1.1626344086021505, + "grad_norm": 0.4342008270842711, + "learning_rate": 8.91116928788278e-06, + "loss": 0.6272, + "step": 1730 + }, + { + "epoch": 1.165994623655914, + "grad_norm": 0.4390573307122215, + "learning_rate": 8.852881509549716e-06, + "loss": 0.6319, + "step": 1735 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 0.39093522707469097, + "learning_rate": 8.79463319744677e-06, + "loss": 0.6339, + "step": 1740 + }, + { + "epoch": 1.172715053763441, + "grad_norm": 0.3954417150387851, + "learning_rate": 8.73642635558774e-06, + "loss": 0.6323, + "step": 1745 + }, + { + "epoch": 1.1760752688172043, + "grad_norm": 0.40100373608359324, + "learning_rate": 8.678262986559667e-06, + "loss": 0.6269, + "step": 1750 + }, + { + "epoch": 
1.1794354838709677, + "grad_norm": 0.39734268517939386, + "learning_rate": 8.620145091453907e-06, + "loss": 0.6332, + "step": 1755 + }, + { + "epoch": 1.1827956989247312, + "grad_norm": 0.39585129412547304, + "learning_rate": 8.562074669797313e-06, + "loss": 0.6292, + "step": 1760 + }, + { + "epoch": 1.1861559139784945, + "grad_norm": 0.4037432361209901, + "learning_rate": 8.504053719483433e-06, + "loss": 0.6311, + "step": 1765 + }, + { + "epoch": 1.189516129032258, + "grad_norm": 0.3975358417069771, + "learning_rate": 8.446084236703758e-06, + "loss": 0.6292, + "step": 1770 + }, + { + "epoch": 1.1928763440860215, + "grad_norm": 0.4254328638432713, + "learning_rate": 8.38816821587906e-06, + "loss": 0.6294, + "step": 1775 + }, + { + "epoch": 1.196236559139785, + "grad_norm": 0.4130515802670977, + "learning_rate": 8.330307649590782e-06, + "loss": 0.6304, + "step": 1780 + }, + { + "epoch": 1.1995967741935485, + "grad_norm": 0.417130552856278, + "learning_rate": 8.272504528512448e-06, + "loss": 0.6329, + "step": 1785 + }, + { + "epoch": 1.2029569892473118, + "grad_norm": 0.40653243786568827, + "learning_rate": 8.214760841341223e-06, + "loss": 0.6343, + "step": 1790 + }, + { + "epoch": 1.2063172043010753, + "grad_norm": 0.40605875241786166, + "learning_rate": 8.157078574729451e-06, + "loss": 0.6302, + "step": 1795 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 0.38683973621773765, + "learning_rate": 8.099459713216331e-06, + "loss": 0.6266, + "step": 1800 + }, + { + "epoch": 1.2130376344086022, + "grad_norm": 0.39898560454739485, + "learning_rate": 8.041906239159636e-06, + "loss": 0.6303, + "step": 1805 + }, + { + "epoch": 1.2163978494623655, + "grad_norm": 0.4377517352185253, + "learning_rate": 7.984420132667483e-06, + "loss": 0.63, + "step": 1810 + }, + { + "epoch": 1.219758064516129, + "grad_norm": 0.39945365735612615, + "learning_rate": 7.927003371530254e-06, + "loss": 0.6291, + "step": 1815 + }, + { + "epoch": 1.2231182795698925, + "grad_norm": 0.4030460272109659, + "learning_rate": 7.869657931152522e-06, + "loss": 0.6262, + "step": 1820 + }, + { + "epoch": 1.226478494623656, + "grad_norm": 0.3958516566003825, + "learning_rate": 7.812385784485079e-06, + "loss": 0.631, + "step": 1825 + }, + { + "epoch": 1.2298387096774193, + "grad_norm": 0.4234150471780541, + "learning_rate": 7.755188901957093e-06, + "loss": 0.6271, + "step": 1830 + }, + { + "epoch": 1.2331989247311828, + "grad_norm": 0.43581653936134584, + "learning_rate": 7.698069251408271e-06, + "loss": 0.6289, + "step": 1835 + }, + { + "epoch": 1.2365591397849462, + "grad_norm": 0.871860809323723, + "learning_rate": 7.641028798021197e-06, + "loss": 0.6339, + "step": 1840 + }, + { + "epoch": 1.2399193548387097, + "grad_norm": 0.41654508596227274, + "learning_rate": 7.584069504253703e-06, + "loss": 0.6316, + "step": 1845 + }, + { + "epoch": 1.243279569892473, + "grad_norm": 0.4186537736756235, + "learning_rate": 7.527193329771334e-06, + "loss": 0.629, + "step": 1850 + }, + { + "epoch": 1.2466397849462365, + "grad_norm": 0.40684069462084127, + "learning_rate": 7.470402231379961e-06, + "loss": 0.6294, + "step": 1855 + }, + { + "epoch": 1.25, + "grad_norm": 0.4491543755620313, + "learning_rate": 7.41369816295844e-06, + "loss": 0.6283, + "step": 1860 + }, + { + "epoch": 1.2533602150537635, + "grad_norm": 0.43910414887550725, + "learning_rate": 7.357083075391373e-06, + "loss": 0.6297, + "step": 1865 + }, + { + "epoch": 1.256720430107527, + "grad_norm": 0.4242078353209094, + "learning_rate": 7.300558916502028e-06, + "loss": 0.6274, + 
"step": 1870 + }, + { + "epoch": 1.2600806451612903, + "grad_norm": 0.39181603159793815, + "learning_rate": 7.24412763098528e-06, + "loss": 0.6268, + "step": 1875 + }, + { + "epoch": 1.2634408602150538, + "grad_norm": 0.4093720135553839, + "learning_rate": 7.1877911603407446e-06, + "loss": 0.6314, + "step": 1880 + }, + { + "epoch": 1.2668010752688172, + "grad_norm": 0.4542985585511893, + "learning_rate": 7.131551442805957e-06, + "loss": 0.6257, + "step": 1885 + }, + { + "epoch": 1.2701612903225805, + "grad_norm": 0.39320303904121556, + "learning_rate": 7.075410413289687e-06, + "loss": 0.632, + "step": 1890 + }, + { + "epoch": 1.273521505376344, + "grad_norm": 0.43112292294597837, + "learning_rate": 7.019370003305383e-06, + "loss": 0.6258, + "step": 1895 + }, + { + "epoch": 1.2768817204301075, + "grad_norm": 0.4121211812097676, + "learning_rate": 6.963432140904718e-06, + "loss": 0.6256, + "step": 1900 + }, + { + "epoch": 1.280241935483871, + "grad_norm": 0.4366695143108501, + "learning_rate": 6.9075987506112305e-06, + "loss": 0.6306, + "step": 1905 + }, + { + "epoch": 1.2836021505376345, + "grad_norm": 0.420907190197836, + "learning_rate": 6.851871753354154e-06, + "loss": 0.6314, + "step": 1910 + }, + { + "epoch": 1.286962365591398, + "grad_norm": 0.4473858705160112, + "learning_rate": 6.796253066402282e-06, + "loss": 0.627, + "step": 1915 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 0.38612233770180776, + "learning_rate": 6.740744603298046e-06, + "loss": 0.629, + "step": 1920 + }, + { + "epoch": 1.2936827956989247, + "grad_norm": 0.39974399253863463, + "learning_rate": 6.685348273791661e-06, + "loss": 0.6287, + "step": 1925 + }, + { + "epoch": 1.2970430107526882, + "grad_norm": 0.3898544394591236, + "learning_rate": 6.630065983775406e-06, + "loss": 0.6292, + "step": 1930 + }, + { + "epoch": 1.3004032258064515, + "grad_norm": 0.40401698025117533, + "learning_rate": 6.574899635218091e-06, + "loss": 0.6304, + "step": 1935 + }, + { + "epoch": 1.303763440860215, + "grad_norm": 0.41519450649319134, + "learning_rate": 6.519851126099586e-06, + "loss": 0.6287, + "step": 1940 + }, + { + "epoch": 1.3071236559139785, + "grad_norm": 0.381670315636576, + "learning_rate": 6.464922350345534e-06, + "loss": 0.6278, + "step": 1945 + }, + { + "epoch": 1.310483870967742, + "grad_norm": 0.4365713923302317, + "learning_rate": 6.4101151977622015e-06, + "loss": 0.6258, + "step": 1950 + }, + { + "epoch": 1.3138440860215055, + "grad_norm": 0.41198291276336774, + "learning_rate": 6.3554315539714364e-06, + "loss": 0.6261, + "step": 1955 + }, + { + "epoch": 1.3172043010752688, + "grad_norm": 0.4118460369714954, + "learning_rate": 6.300873300345819e-06, + "loss": 0.6245, + "step": 1960 + }, + { + "epoch": 1.3205645161290323, + "grad_norm": 0.38775383959228366, + "learning_rate": 6.246442313943917e-06, + "loss": 0.6258, + "step": 1965 + }, + { + "epoch": 1.3239247311827957, + "grad_norm": 0.4088425446696767, + "learning_rate": 6.192140467445712e-06, + "loss": 0.6291, + "step": 1970 + }, + { + "epoch": 1.327284946236559, + "grad_norm": 0.368749879948908, + "learning_rate": 6.137969629088174e-06, + "loss": 0.629, + "step": 1975 + }, + { + "epoch": 1.3306451612903225, + "grad_norm": 0.3907700123306008, + "learning_rate": 6.083931662600977e-06, + "loss": 0.6287, + "step": 1980 + }, + { + "epoch": 1.334005376344086, + "grad_norm": 0.3975024031852529, + "learning_rate": 6.0300284271423834e-06, + "loss": 0.6227, + "step": 1985 + }, + { + "epoch": 1.3373655913978495, + "grad_norm": 0.4075706328227891, + 
"learning_rate": 5.976261777235282e-06, + "loss": 0.6248, + "step": 1990 + }, + { + "epoch": 1.340725806451613, + "grad_norm": 0.38742274042231606, + "learning_rate": 5.922633562703375e-06, + "loss": 0.6263, + "step": 1995 + }, + { + "epoch": 1.3440860215053765, + "grad_norm": 0.4225129189994307, + "learning_rate": 5.869145628607551e-06, + "loss": 0.6263, + "step": 2000 + }, + { + "epoch": 1.3474462365591398, + "grad_norm": 0.3905401995286742, + "learning_rate": 5.815799815182393e-06, + "loss": 0.6293, + "step": 2005 + }, + { + "epoch": 1.3508064516129032, + "grad_norm": 0.40752390323741944, + "learning_rate": 5.762597957772856e-06, + "loss": 0.6287, + "step": 2010 + }, + { + "epoch": 1.3541666666666667, + "grad_norm": 0.4096029958781252, + "learning_rate": 5.709541886771167e-06, + "loss": 0.6266, + "step": 2015 + }, + { + "epoch": 1.35752688172043, + "grad_norm": 0.38367545152375937, + "learning_rate": 5.656633427553784e-06, + "loss": 0.6303, + "step": 2020 + }, + { + "epoch": 1.3608870967741935, + "grad_norm": 0.3848396403700666, + "learning_rate": 5.60387440041866e-06, + "loss": 0.6274, + "step": 2025 + }, + { + "epoch": 1.364247311827957, + "grad_norm": 0.3871009890350265, + "learning_rate": 5.55126662052257e-06, + "loss": 0.6224, + "step": 2030 + }, + { + "epoch": 1.3676075268817205, + "grad_norm": 0.4065087733067794, + "learning_rate": 5.498811897818685e-06, + "loss": 0.6268, + "step": 2035 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 0.38515450629994064, + "learning_rate": 5.446512036994287e-06, + "loss": 0.6248, + "step": 2040 + }, + { + "epoch": 1.3743279569892473, + "grad_norm": 0.39646022406843767, + "learning_rate": 5.394368837408705e-06, + "loss": 0.6249, + "step": 2045 + }, + { + "epoch": 1.3776881720430108, + "grad_norm": 0.4140801017591075, + "learning_rate": 5.342384093031361e-06, + "loss": 0.6264, + "step": 2050 + }, + { + "epoch": 1.3810483870967742, + "grad_norm": 0.41497129669099586, + "learning_rate": 5.290559592380111e-06, + "loss": 0.6249, + "step": 2055 + }, + { + "epoch": 1.3844086021505375, + "grad_norm": 0.3924624828028891, + "learning_rate": 5.238897118459644e-06, + "loss": 0.6258, + "step": 2060 + }, + { + "epoch": 1.387768817204301, + "grad_norm": 0.3702574417131231, + "learning_rate": 5.187398448700205e-06, + "loss": 0.6284, + "step": 2065 + }, + { + "epoch": 1.3911290322580645, + "grad_norm": 0.41653378611493, + "learning_rate": 5.136065354896393e-06, + "loss": 0.6263, + "step": 2070 + }, + { + "epoch": 1.394489247311828, + "grad_norm": 0.3895372191639466, + "learning_rate": 5.084899603146228e-06, + "loss": 0.629, + "step": 2075 + }, + { + "epoch": 1.3978494623655915, + "grad_norm": 0.3906095795336125, + "learning_rate": 5.033902953790375e-06, + "loss": 0.6258, + "step": 2080 + }, + { + "epoch": 1.4012096774193548, + "grad_norm": 0.39681528851984266, + "learning_rate": 4.983077161351606e-06, + "loss": 0.6282, + "step": 2085 + }, + { + "epoch": 1.4045698924731183, + "grad_norm": 0.40072163095580204, + "learning_rate": 4.932423974474386e-06, + "loss": 0.6288, + "step": 2090 + }, + { + "epoch": 1.4079301075268817, + "grad_norm": 0.41393522692230517, + "learning_rate": 4.8819451358647806e-06, + "loss": 0.6269, + "step": 2095 + }, + { + "epoch": 1.4112903225806452, + "grad_norm": 0.41100607384229376, + "learning_rate": 4.831642382230424e-06, + "loss": 0.6237, + "step": 2100 + }, + { + "epoch": 1.4146505376344085, + "grad_norm": 0.3869191806592057, + "learning_rate": 4.781517444220836e-06, + "loss": 0.6262, + "step": 2105 + }, + { + "epoch": 
1.418010752688172, + "grad_norm": 0.40790781380796975, + "learning_rate": 4.731572046367832e-06, + "loss": 0.629, + "step": 2110 + }, + { + "epoch": 1.4213709677419355, + "grad_norm": 0.39391109017587794, + "learning_rate": 4.681807907026214e-06, + "loss": 0.6241, + "step": 2115 + }, + { + "epoch": 1.424731182795699, + "grad_norm": 0.40179170742485937, + "learning_rate": 4.632226738314638e-06, + "loss": 0.6255, + "step": 2120 + }, + { + "epoch": 1.4280913978494625, + "grad_norm": 0.3858060561361162, + "learning_rate": 4.582830246056735e-06, + "loss": 0.6236, + "step": 2125 + }, + { + "epoch": 1.4314516129032258, + "grad_norm": 0.38374047682817425, + "learning_rate": 4.533620129722376e-06, + "loss": 0.6204, + "step": 2130 + }, + { + "epoch": 1.4348118279569892, + "grad_norm": 0.3973135891040321, + "learning_rate": 4.4845980823692605e-06, + "loss": 0.6238, + "step": 2135 + }, + { + "epoch": 1.4381720430107527, + "grad_norm": 0.3883947152151204, + "learning_rate": 4.435765790584605e-06, + "loss": 0.6245, + "step": 2140 + }, + { + "epoch": 1.441532258064516, + "grad_norm": 0.3991192638455965, + "learning_rate": 4.387124934427181e-06, + "loss": 0.6235, + "step": 2145 + }, + { + "epoch": 1.4448924731182795, + "grad_norm": 0.38536161760649323, + "learning_rate": 4.338677187369458e-06, + "loss": 0.6239, + "step": 2150 + }, + { + "epoch": 1.448252688172043, + "grad_norm": 0.38959778197902006, + "learning_rate": 4.290424216240062e-06, + "loss": 0.624, + "step": 2155 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 0.3932900684900842, + "learning_rate": 4.242367681166414e-06, + "loss": 0.6261, + "step": 2160 + }, + { + "epoch": 1.45497311827957, + "grad_norm": 0.39269029723244625, + "learning_rate": 4.19450923551762e-06, + "loss": 0.6217, + "step": 2165 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 0.3849808642498751, + "learning_rate": 4.1468505258475785e-06, + "loss": 0.6205, + "step": 2170 + }, + { + "epoch": 1.4616935483870968, + "grad_norm": 0.37433760170956354, + "learning_rate": 4.0993931918383556e-06, + "loss": 0.6181, + "step": 2175 + }, + { + "epoch": 1.4650537634408602, + "grad_norm": 0.38330212314619344, + "learning_rate": 4.0521388662437285e-06, + "loss": 0.6243, + "step": 2180 + }, + { + "epoch": 1.4684139784946235, + "grad_norm": 0.3934565346806046, + "learning_rate": 4.0050891748330636e-06, + "loss": 0.6253, + "step": 2185 + }, + { + "epoch": 1.471774193548387, + "grad_norm": 0.3829851450390039, + "learning_rate": 3.958245736335337e-06, + "loss": 0.625, + "step": 2190 + }, + { + "epoch": 1.4751344086021505, + "grad_norm": 0.38802181960954035, + "learning_rate": 3.911610162383475e-06, + "loss": 0.6265, + "step": 2195 + }, + { + "epoch": 1.478494623655914, + "grad_norm": 0.410711397472092, + "learning_rate": 3.865184057458883e-06, + "loss": 0.6232, + "step": 2200 + }, + { + "epoch": 1.4818548387096775, + "grad_norm": 0.3894505260206867, + "learning_rate": 3.8189690188362615e-06, + "loss": 0.628, + "step": 2205 + }, + { + "epoch": 1.485215053763441, + "grad_norm": 0.385060920155747, + "learning_rate": 3.772966636528641e-06, + "loss": 0.6186, + "step": 2210 + }, + { + "epoch": 1.4885752688172043, + "grad_norm": 0.39063304189411735, + "learning_rate": 3.727178493232685e-06, + "loss": 0.6236, + "step": 2215 + }, + { + "epoch": 1.4919354838709677, + "grad_norm": 0.3690141904300633, + "learning_rate": 3.6816061642742294e-06, + "loss": 0.6251, + "step": 2220 + }, + { + "epoch": 1.4952956989247312, + "grad_norm": 0.3915588795933579, + "learning_rate": 3.6362512175541008e-06, + 
"loss": 0.6231, + "step": 2225 + }, + { + "epoch": 1.4986559139784945, + "grad_norm": 0.3823928248559548, + "learning_rate": 3.591115213494153e-06, + "loss": 0.6243, + "step": 2230 + }, + { + "epoch": 1.502016129032258, + "grad_norm": 0.375431342074985, + "learning_rate": 3.546199704983592e-06, + "loss": 0.619, + "step": 2235 + }, + { + "epoch": 1.5053763440860215, + "grad_norm": 0.3964171676918981, + "learning_rate": 3.501506237325547e-06, + "loss": 0.6221, + "step": 2240 + }, + { + "epoch": 1.508736559139785, + "grad_norm": 0.3850008299468121, + "learning_rate": 3.4570363481839097e-06, + "loss": 0.6223, + "step": 2245 + }, + { + "epoch": 1.5120967741935485, + "grad_norm": 0.39821907737468476, + "learning_rate": 3.4127915675304236e-06, + "loss": 0.6251, + "step": 2250 + }, + { + "epoch": 1.515456989247312, + "grad_norm": 0.4127390828177173, + "learning_rate": 3.3687734175920505e-06, + "loss": 0.6235, + "step": 2255 + }, + { + "epoch": 1.5188172043010753, + "grad_norm": 0.3900524242402812, + "learning_rate": 3.324983412798597e-06, + "loss": 0.6228, + "step": 2260 + }, + { + "epoch": 1.5221774193548387, + "grad_norm": 0.39215462329521156, + "learning_rate": 3.281423059730612e-06, + "loss": 0.6272, + "step": 2265 + }, + { + "epoch": 1.525537634408602, + "grad_norm": 0.37938495139611367, + "learning_rate": 3.238093857067558e-06, + "loss": 0.624, + "step": 2270 + }, + { + "epoch": 1.5288978494623655, + "grad_norm": 0.38603540777357676, + "learning_rate": 3.1949972955362383e-06, + "loss": 0.6239, + "step": 2275 + }, + { + "epoch": 1.532258064516129, + "grad_norm": 0.3863102533573958, + "learning_rate": 3.1521348578595178e-06, + "loss": 0.6242, + "step": 2280 + }, + { + "epoch": 1.5356182795698925, + "grad_norm": 0.38583036503521356, + "learning_rate": 3.1095080187053084e-06, + "loss": 0.62, + "step": 2285 + }, + { + "epoch": 1.538978494623656, + "grad_norm": 0.38861760389469036, + "learning_rate": 3.067118244635833e-06, + "loss": 0.6211, + "step": 2290 + }, + { + "epoch": 1.5423387096774195, + "grad_norm": 0.3858262856892166, + "learning_rate": 3.024966994057168e-06, + "loss": 0.6193, + "step": 2295 + }, + { + "epoch": 1.5456989247311828, + "grad_norm": 0.39743062676203483, + "learning_rate": 2.98305571716907e-06, + "loss": 0.6218, + "step": 2300 + }, + { + "epoch": 1.5490591397849462, + "grad_norm": 0.3883508569759508, + "learning_rate": 2.9413858559150776e-06, + "loss": 0.6217, + "step": 2305 + }, + { + "epoch": 1.5524193548387095, + "grad_norm": 0.38516305909935, + "learning_rate": 2.899958843932915e-06, + "loss": 0.6202, + "step": 2310 + }, + { + "epoch": 1.555779569892473, + "grad_norm": 0.39366530353726675, + "learning_rate": 2.858776106505139e-06, + "loss": 0.6194, + "step": 2315 + }, + { + "epoch": 1.5591397849462365, + "grad_norm": 0.3773278542481648, + "learning_rate": 2.8178390605101414e-06, + "loss": 0.6223, + "step": 2320 + }, + { + "epoch": 1.5625, + "grad_norm": 0.4009756258267962, + "learning_rate": 2.777149114373371e-06, + "loss": 0.6195, + "step": 2325 + }, + { + "epoch": 1.5658602150537635, + "grad_norm": 0.3858950731460861, + "learning_rate": 2.73670766801889e-06, + "loss": 0.6217, + "step": 2330 + }, + { + "epoch": 1.569220430107527, + "grad_norm": 0.3971612319470807, + "learning_rate": 2.696516112821208e-06, + "loss": 0.622, + "step": 2335 + }, + { + "epoch": 1.5725806451612905, + "grad_norm": 0.38420387771892894, + "learning_rate": 2.6565758315574152e-06, + "loss": 0.6201, + "step": 2340 + }, + { + "epoch": 1.5759408602150538, + "grad_norm": 0.3933992088451066, + 
"learning_rate": 2.6168881983595994e-06, + "loss": 0.6226, + "step": 2345 + }, + { + "epoch": 1.5793010752688172, + "grad_norm": 0.36479667118510656, + "learning_rate": 2.5774545786675887e-06, + "loss": 0.6175, + "step": 2350 + }, + { + "epoch": 1.5826612903225805, + "grad_norm": 0.37711761279306893, + "learning_rate": 2.538276329181942e-06, + "loss": 0.6206, + "step": 2355 + }, + { + "epoch": 1.586021505376344, + "grad_norm": 0.3816373703635669, + "learning_rate": 2.499354797817312e-06, + "loss": 0.6205, + "step": 2360 + }, + { + "epoch": 1.5893817204301075, + "grad_norm": 0.38654799097580955, + "learning_rate": 2.4606913236560283e-06, + "loss": 0.6245, + "step": 2365 + }, + { + "epoch": 1.592741935483871, + "grad_norm": 0.37621383637459765, + "learning_rate": 2.4222872369020676e-06, + "loss": 0.6237, + "step": 2370 + }, + { + "epoch": 1.5961021505376345, + "grad_norm": 0.38574402970547833, + "learning_rate": 2.384143858835258e-06, + "loss": 0.6198, + "step": 2375 + }, + { + "epoch": 1.599462365591398, + "grad_norm": 0.41266081486593603, + "learning_rate": 2.3462625017658356e-06, + "loss": 0.6203, + "step": 2380 + }, + { + "epoch": 1.6028225806451613, + "grad_norm": 0.39723116620215, + "learning_rate": 2.3086444689892872e-06, + "loss": 0.6224, + "step": 2385 + }, + { + "epoch": 1.6061827956989247, + "grad_norm": 0.39889771375855315, + "learning_rate": 2.2712910547415266e-06, + "loss": 0.6228, + "step": 2390 + }, + { + "epoch": 1.609543010752688, + "grad_norm": 0.38397107249589707, + "learning_rate": 2.234203544154335e-06, + "loss": 0.6202, + "step": 2395 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 0.3938406367129855, + "learning_rate": 2.1973832132111906e-06, + "loss": 0.6194, + "step": 2400 + }, + { + "epoch": 1.616263440860215, + "grad_norm": 0.37138500127297974, + "learning_rate": 2.16083132870332e-06, + "loss": 0.6226, + "step": 2405 + }, + { + "epoch": 1.6196236559139785, + "grad_norm": 0.386156003115288, + "learning_rate": 2.1245491481861615e-06, + "loss": 0.6174, + "step": 2410 + }, + { + "epoch": 1.622983870967742, + "grad_norm": 0.3747547614429102, + "learning_rate": 2.0885379199360646e-06, + "loss": 0.6228, + "step": 2415 + }, + { + "epoch": 1.6263440860215055, + "grad_norm": 0.3784295571335933, + "learning_rate": 2.0527988829073587e-06, + "loss": 0.6208, + "step": 2420 + }, + { + "epoch": 1.629704301075269, + "grad_norm": 0.3741384831092738, + "learning_rate": 2.0173332666897227e-06, + "loss": 0.6205, + "step": 2425 + }, + { + "epoch": 1.6330645161290323, + "grad_norm": 0.38786521686382086, + "learning_rate": 1.982142291465896e-06, + "loss": 0.6177, + "step": 2430 + }, + { + "epoch": 1.6364247311827957, + "grad_norm": 0.38553674654791287, + "learning_rate": 1.947227167969663e-06, + "loss": 0.6219, + "step": 2435 + }, + { + "epoch": 1.639784946236559, + "grad_norm": 0.37179654564247894, + "learning_rate": 1.9125890974442475e-06, + "loss": 0.62, + "step": 2440 + }, + { + "epoch": 1.6431451612903225, + "grad_norm": 0.3705380721255442, + "learning_rate": 1.878229271600931e-06, + "loss": 0.6212, + "step": 2445 + }, + { + "epoch": 1.646505376344086, + "grad_norm": 0.37577075286504236, + "learning_rate": 1.8441488725781043e-06, + "loss": 0.6219, + "step": 2450 + }, + { + "epoch": 1.6498655913978495, + "grad_norm": 0.37950348342322227, + "learning_rate": 1.8103490729005546e-06, + "loss": 0.6168, + "step": 2455 + }, + { + "epoch": 1.653225806451613, + "grad_norm": 0.3763349869752889, + "learning_rate": 1.7768310354391505e-06, + "loss": 0.6221, + "step": 2460 + }, + { + 
"epoch": 1.6565860215053765, + "grad_norm": 0.3815949507602912, + "learning_rate": 1.7435959133708169e-06, + "loss": 0.6175, + "step": 2465 + }, + { + "epoch": 1.6599462365591398, + "grad_norm": 0.38010076668823534, + "learning_rate": 1.7106448501388827e-06, + "loss": 0.6176, + "step": 2470 + }, + { + "epoch": 1.6633064516129032, + "grad_norm": 0.3691580309508141, + "learning_rate": 1.677978979413708e-06, + "loss": 0.6164, + "step": 2475 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.38980910582544764, + "learning_rate": 1.645599425053721e-06, + "loss": 0.6192, + "step": 2480 + }, + { + "epoch": 1.67002688172043, + "grad_norm": 0.3915443888876836, + "learning_rate": 1.6135073010667091e-06, + "loss": 0.6178, + "step": 2485 + }, + { + "epoch": 1.6733870967741935, + "grad_norm": 0.38991481296941843, + "learning_rate": 1.5817037115715307e-06, + "loss": 0.6192, + "step": 2490 + }, + { + "epoch": 1.676747311827957, + "grad_norm": 0.3792230038765017, + "learning_rate": 1.5501897507601016e-06, + "loss": 0.6218, + "step": 2495 + }, + { + "epoch": 1.6801075268817205, + "grad_norm": 0.37563408321675024, + "learning_rate": 1.5189665028597622e-06, + "loss": 0.6201, + "step": 2500 + }, + { + "epoch": 1.683467741935484, + "grad_norm": 0.3782912838928693, + "learning_rate": 1.4880350420959678e-06, + "loss": 0.6148, + "step": 2505 + }, + { + "epoch": 1.6868279569892473, + "grad_norm": 0.37676394416952275, + "learning_rate": 1.4573964326553447e-06, + "loss": 0.6213, + "step": 2510 + }, + { + "epoch": 1.6901881720430108, + "grad_norm": 0.4026075731771035, + "learning_rate": 1.4270517286490526e-06, + "loss": 0.6212, + "step": 2515 + }, + { + "epoch": 1.6935483870967742, + "grad_norm": 0.3740041929756673, + "learning_rate": 1.397001974076546e-06, + "loss": 0.6173, + "step": 2520 + }, + { + "epoch": 1.6969086021505375, + "grad_norm": 0.37487781663955083, + "learning_rate": 1.3672482027896295e-06, + "loss": 0.6157, + "step": 2525 + }, + { + "epoch": 1.700268817204301, + "grad_norm": 0.3658110013443917, + "learning_rate": 1.3377914384569124e-06, + "loss": 0.6197, + "step": 2530 + }, + { + "epoch": 1.7036290322580645, + "grad_norm": 0.3698169716895195, + "learning_rate": 1.3086326945285721e-06, + "loss": 0.6172, + "step": 2535 + }, + { + "epoch": 1.706989247311828, + "grad_norm": 0.3726837636042825, + "learning_rate": 1.2797729742014952e-06, + "loss": 0.6239, + "step": 2540 + }, + { + "epoch": 1.7103494623655915, + "grad_norm": 0.3776863339195105, + "learning_rate": 1.251213270384759e-06, + "loss": 0.6166, + "step": 2545 + }, + { + "epoch": 1.713709677419355, + "grad_norm": 0.3701247417408879, + "learning_rate": 1.2229545656654784e-06, + "loss": 0.6195, + "step": 2550 + }, + { + "epoch": 1.7170698924731183, + "grad_norm": 0.37292321840424403, + "learning_rate": 1.1949978322749833e-06, + "loss": 0.62, + "step": 2555 + }, + { + "epoch": 1.7204301075268817, + "grad_norm": 0.36380895900137356, + "learning_rate": 1.1673440320553941e-06, + "loss": 0.6208, + "step": 2560 + }, + { + "epoch": 1.723790322580645, + "grad_norm": 0.3753481673103304, + "learning_rate": 1.1399941164265016e-06, + "loss": 0.6197, + "step": 2565 + }, + { + "epoch": 1.7271505376344085, + "grad_norm": 0.38841007090968127, + "learning_rate": 1.112949026353063e-06, + "loss": 0.6197, + "step": 2570 + }, + { + "epoch": 1.730510752688172, + "grad_norm": 0.4024753571663661, + "learning_rate": 1.0862096923124032e-06, + "loss": 0.6195, + "step": 2575 + }, + { + "epoch": 1.7338709677419355, + "grad_norm": 0.40698102024501104, + "learning_rate": 
1.0597770342624169e-06, + "loss": 0.6212, + "step": 2580 + }, + { + "epoch": 1.737231182795699, + "grad_norm": 0.38275795281843294, + "learning_rate": 1.0336519616099127e-06, + "loss": 0.6164, + "step": 2585 + }, + { + "epoch": 1.7405913978494625, + "grad_norm": 0.3849617861720321, + "learning_rate": 1.0078353731793245e-06, + "loss": 0.621, + "step": 2590 + }, + { + "epoch": 1.7439516129032258, + "grad_norm": 0.37357311882407007, + "learning_rate": 9.823281571817888e-07, + "loss": 0.6154, + "step": 2595 + }, + { + "epoch": 1.7473118279569892, + "grad_norm": 0.3700469086083545, + "learning_rate": 9.571311911845938e-07, + "loss": 0.6168, + "step": 2600 + }, + { + "epoch": 1.7506720430107527, + "grad_norm": 0.3720192733243154, + "learning_rate": 9.322453420809663e-07, + "loss": 0.6188, + "step": 2605 + }, + { + "epoch": 1.754032258064516, + "grad_norm": 0.3759460278402678, + "learning_rate": 9.076714660602726e-07, + "loss": 0.6188, + "step": 2610 + }, + { + "epoch": 1.7573924731182795, + "grad_norm": 0.37780059312461417, + "learning_rate": 8.834104085785411e-07, + "loss": 0.619, + "step": 2615 + }, + { + "epoch": 1.760752688172043, + "grad_norm": 0.3644735938402259, + "learning_rate": 8.594630043293862e-07, + "loss": 0.6166, + "step": 2620 + }, + { + "epoch": 1.7641129032258065, + "grad_norm": 0.40102016373518057, + "learning_rate": 8.35830077215285e-07, + "loss": 0.6205, + "step": 2625 + }, + { + "epoch": 1.76747311827957, + "grad_norm": 0.36805712689178754, + "learning_rate": 8.125124403192353e-07, + "loss": 0.6178, + "step": 2630 + }, + { + "epoch": 1.7708333333333335, + "grad_norm": 0.3855167813689446, + "learning_rate": 7.89510895876775e-07, + "loss": 0.6187, + "step": 2635 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.37050943347616344, + "learning_rate": 7.668262352483957e-07, + "loss": 0.6175, + "step": 2640 + }, + { + "epoch": 1.7775537634408602, + "grad_norm": 0.37029783353120194, + "learning_rate": 7.444592388922955e-07, + "loss": 0.6184, + "step": 2645 + }, + { + "epoch": 1.7809139784946235, + "grad_norm": 0.3842300371858157, + "learning_rate": 7.224106763375493e-07, + "loss": 0.6147, + "step": 2650 + }, + { + "epoch": 1.784274193548387, + "grad_norm": 0.3700714291963043, + "learning_rate": 7.006813061576145e-07, + "loss": 0.6124, + "step": 2655 + }, + { + "epoch": 1.7876344086021505, + "grad_norm": 0.3760694332290439, + "learning_rate": 6.792718759442474e-07, + "loss": 0.6194, + "step": 2660 + }, + { + "epoch": 1.790994623655914, + "grad_norm": 0.3671308826023182, + "learning_rate": 6.581831222817714e-07, + "loss": 0.6149, + "step": 2665 + }, + { + "epoch": 1.7943548387096775, + "grad_norm": 0.3746985320397756, + "learning_rate": 6.374157707217421e-07, + "loss": 0.6152, + "step": 2670 + }, + { + "epoch": 1.797715053763441, + "grad_norm": 0.38527941649428066, + "learning_rate": 6.169705357579813e-07, + "loss": 0.6206, + "step": 2675 + }, + { + "epoch": 1.8010752688172043, + "grad_norm": 0.3703868711176167, + "learning_rate": 5.968481208020016e-07, + "loss": 0.6185, + "step": 2680 + }, + { + "epoch": 1.8044354838709677, + "grad_norm": 0.37853482566243507, + "learning_rate": 5.770492181587906e-07, + "loss": 0.6148, + "step": 2685 + }, + { + "epoch": 1.807795698924731, + "grad_norm": 0.3580208698159626, + "learning_rate": 5.575745090030138e-07, + "loss": 0.6168, + "step": 2690 + }, + { + "epoch": 1.8111559139784945, + "grad_norm": 0.3822688397822238, + "learning_rate": 5.38424663355559e-07, + "loss": 0.6188, + "step": 2695 + }, + { + "epoch": 1.814516129032258, + 
"grad_norm": 0.37115606254622296, + "learning_rate": 5.196003400604977e-07, + "loss": 0.6159, + "step": 2700 + }, + { + "epoch": 1.8178763440860215, + "grad_norm": 0.37498937385752057, + "learning_rate": 5.01102186762411e-07, + "loss": 0.6193, + "step": 2705 + }, + { + "epoch": 1.821236559139785, + "grad_norm": 0.36638337590912584, + "learning_rate": 4.829308398841104e-07, + "loss": 0.6177, + "step": 2710 + }, + { + "epoch": 1.8245967741935485, + "grad_norm": 0.3720335399527678, + "learning_rate": 4.650869246047407e-07, + "loss": 0.616, + "step": 2715 + }, + { + "epoch": 1.827956989247312, + "grad_norm": 0.3883742285519788, + "learning_rate": 4.475710548382717e-07, + "loss": 0.6183, + "step": 2720 + }, + { + "epoch": 1.8313172043010753, + "grad_norm": 0.3660996354157979, + "learning_rate": 4.303838332123766e-07, + "loss": 0.6181, + "step": 2725 + }, + { + "epoch": 1.8346774193548387, + "grad_norm": 0.3832317742511122, + "learning_rate": 4.1352585104770136e-07, + "loss": 0.6173, + "step": 2730 + }, + { + "epoch": 1.838037634408602, + "grad_norm": 0.3733758015066586, + "learning_rate": 3.969976883375126e-07, + "loss": 0.6212, + "step": 2735 + }, + { + "epoch": 1.8413978494623655, + "grad_norm": 0.371152480421909, + "learning_rate": 3.807999137277507e-07, + "loss": 0.6154, + "step": 2740 + }, + { + "epoch": 1.844758064516129, + "grad_norm": 0.38345844939628254, + "learning_rate": 3.6493308449746525e-07, + "loss": 0.6189, + "step": 2745 + }, + { + "epoch": 1.8481182795698925, + "grad_norm": 0.36762462712757643, + "learning_rate": 3.4939774653963587e-07, + "loss": 0.6203, + "step": 2750 + }, + { + "epoch": 1.851478494623656, + "grad_norm": 0.375536271265564, + "learning_rate": 3.3419443434240083e-07, + "loss": 0.6172, + "step": 2755 + }, + { + "epoch": 1.8548387096774195, + "grad_norm": 0.3660221694767357, + "learning_rate": 3.19323670970656e-07, + "loss": 0.6181, + "step": 2760 + }, + { + "epoch": 1.8581989247311828, + "grad_norm": 0.3599790898532119, + "learning_rate": 3.0478596804807246e-07, + "loss": 0.62, + "step": 2765 + }, + { + "epoch": 1.8615591397849462, + "grad_norm": 0.3709615221652835, + "learning_rate": 2.905818257394799e-07, + "loss": 0.6159, + "step": 2770 + }, + { + "epoch": 1.8649193548387095, + "grad_norm": 0.3726522140490793, + "learning_rate": 2.7671173273367323e-07, + "loss": 0.6192, + "step": 2775 + }, + { + "epoch": 1.868279569892473, + "grad_norm": 0.3677308860539174, + "learning_rate": 2.631761662265875e-07, + "loss": 0.6188, + "step": 2780 + }, + { + "epoch": 1.8716397849462365, + "grad_norm": 0.37507182923605303, + "learning_rate": 2.499755919048863e-07, + "loss": 0.6236, + "step": 2785 + }, + { + "epoch": 1.875, + "grad_norm": 0.36561243105086355, + "learning_rate": 2.371104639299393e-07, + "loss": 0.616, + "step": 2790 + }, + { + "epoch": 1.8783602150537635, + "grad_norm": 0.3727929463786164, + "learning_rate": 2.2458122492219458e-07, + "loss": 0.6186, + "step": 2795 + }, + { + "epoch": 1.881720430107527, + "grad_norm": 0.3715493910859094, + "learning_rate": 2.1238830594595195e-07, + "loss": 0.619, + "step": 2800 + }, + { + "epoch": 1.8850806451612905, + "grad_norm": 0.3701421554724482, + "learning_rate": 2.005321264945348e-07, + "loss": 0.615, + "step": 2805 + }, + { + "epoch": 1.8884408602150538, + "grad_norm": 0.38162958713764294, + "learning_rate": 1.890130944758528e-07, + "loss": 0.6189, + "step": 2810 + }, + { + "epoch": 1.8918010752688172, + "grad_norm": 0.36878364983672696, + "learning_rate": 1.7783160619837202e-07, + "loss": 0.6157, + "step": 2815 + }, + { 
+ "epoch": 1.8951612903225805, + "grad_norm": 0.3702088013665396, + "learning_rate": 1.669880463574758e-07, + "loss": 0.6136, + "step": 2820 + }, + { + "epoch": 1.898521505376344, + "grad_norm": 0.37799049217494324, + "learning_rate": 1.5648278802223526e-07, + "loss": 0.6145, + "step": 2825 + }, + { + "epoch": 1.9018817204301075, + "grad_norm": 0.37016035977530204, + "learning_rate": 1.463161926225687e-07, + "loss": 0.6202, + "step": 2830 + }, + { + "epoch": 1.905241935483871, + "grad_norm": 0.3683670258810398, + "learning_rate": 1.3648860993680903e-07, + "loss": 0.6146, + "step": 2835 + }, + { + "epoch": 1.9086021505376345, + "grad_norm": 0.3645705769628613, + "learning_rate": 1.2700037807967026e-07, + "loss": 0.6158, + "step": 2840 + }, + { + "epoch": 1.911962365591398, + "grad_norm": 0.3646260231379924, + "learning_rate": 1.1785182349061342e-07, + "loss": 0.6151, + "step": 2845 + }, + { + "epoch": 1.9153225806451613, + "grad_norm": 0.36357216498889616, + "learning_rate": 1.0904326092261441e-07, + "loss": 0.6173, + "step": 2850 + }, + { + "epoch": 1.9186827956989247, + "grad_norm": 0.36891941735522826, + "learning_rate": 1.0057499343134269e-07, + "loss": 0.617, + "step": 2855 + }, + { + "epoch": 1.922043010752688, + "grad_norm": 0.3618844540766792, + "learning_rate": 9.24473123647196e-08, + "loss": 0.6187, + "step": 2860 + }, + { + "epoch": 1.9254032258064515, + "grad_norm": 0.3829664112005493, + "learning_rate": 8.466049735291415e-08, + "loss": 0.6194, + "step": 2865 + }, + { + "epoch": 1.928763440860215, + "grad_norm": 0.36781144520431924, + "learning_rate": 7.721481629870076e-08, + "loss": 0.6169, + "step": 2870 + }, + { + "epoch": 1.9321236559139785, + "grad_norm": 0.3853416687052536, + "learning_rate": 7.011052536826435e-08, + "loss": 0.6205, + "step": 2875 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 0.37397490558359964, + "learning_rate": 6.334786898237078e-08, + "loss": 0.6154, + "step": 2880 + }, + { + "epoch": 1.9388440860215055, + "grad_norm": 0.36691620687136894, + "learning_rate": 5.69270798079613e-08, + "loss": 0.6196, + "step": 2885 + }, + { + "epoch": 1.942204301075269, + "grad_norm": 0.3658247784366582, + "learning_rate": 5.084837875015347e-08, + "loss": 0.6132, + "step": 2890 + }, + { + "epoch": 1.9455645161290323, + "grad_norm": 0.36412582119114073, + "learning_rate": 4.511197494463493e-08, + "loss": 0.6172, + "step": 2895 + }, + { + "epoch": 1.9489247311827957, + "grad_norm": 0.3663620362729526, + "learning_rate": 3.971806575047033e-08, + "loss": 0.618, + "step": 2900 + }, + { + "epoch": 1.952284946236559, + "grad_norm": 0.3675765196570775, + "learning_rate": 3.466683674331228e-08, + "loss": 0.6166, + "step": 2905 + }, + { + "epoch": 1.9556451612903225, + "grad_norm": 0.3591512141413379, + "learning_rate": 2.995846170901428e-08, + "loss": 0.6156, + "step": 2910 + }, + { + "epoch": 1.959005376344086, + "grad_norm": 0.36646296059037736, + "learning_rate": 2.5593102637652136e-08, + "loss": 0.6145, + "step": 2915 + }, + { + "epoch": 1.9623655913978495, + "grad_norm": 0.36873568843189847, + "learning_rate": 2.1570909717955058e-08, + "loss": 0.6183, + "step": 2920 + }, + { + "epoch": 1.965725806451613, + "grad_norm": 0.3643607047509, + "learning_rate": 1.789202133212986e-08, + "loss": 0.6144, + "step": 2925 + }, + { + "epoch": 1.9690860215053765, + "grad_norm": 0.3696768969448687, + "learning_rate": 1.4556564051110278e-08, + "loss": 0.6221, + "step": 2930 + }, + { + "epoch": 1.9724462365591398, + "grad_norm": 0.3739771169389071, + "learning_rate": 
1.1564652630192686e-08, + "loss": 0.6171, + "step": 2935 + }, + { + "epoch": 1.9758064516129032, + "grad_norm": 0.36410850268700906, + "learning_rate": 8.916390005095921e-09, + "loss": 0.619, + "step": 2940 + }, + { + "epoch": 1.9791666666666665, + "grad_norm": 0.3635016376462389, + "learning_rate": 6.61186728841412e-09, + "loss": 0.6121, + "step": 2945 + }, + { + "epoch": 1.98252688172043, + "grad_norm": 0.36237306270652886, + "learning_rate": 4.651163766484779e-09, + "loss": 0.6164, + "step": 2950 + }, + { + "epoch": 1.9858870967741935, + "grad_norm": 0.3732498548310209, + "learning_rate": 3.0343468966598234e-09, + "loss": 0.6169, + "step": 2955 + }, + { + "epoch": 1.989247311827957, + "grad_norm": 0.3679867274420493, + "learning_rate": 1.761472304987466e-09, + "loss": 0.618, + "step": 2960 + }, + { + "epoch": 1.9926075268817205, + "grad_norm": 0.37009481264365335, + "learning_rate": 8.325837842926288e-10, + "loss": 0.6203, + "step": 2965 + }, + { + "epoch": 1.995967741935484, + "grad_norm": 0.3760102338533851, + "learning_rate": 2.4771329267703206e-10, + "loss": 0.621, + "step": 2970 + }, + { + "epoch": 1.9993279569892473, + "grad_norm": 0.3689799921255622, + "learning_rate": 6.880952415633246e-12, + "loss": 0.6204, + "step": 2975 + }, + { + "epoch": 2.0, + "eval_loss": 0.642189085483551, + "eval_runtime": 38.454, + "eval_samples_per_second": 75.831, + "eval_steps_per_second": 1.196, + "step": 2976 + }, + { + "epoch": 2.0, + "step": 2976, + "total_flos": 623113855303680.0, + "train_loss": 0.6763581057950374, + "train_runtime": 8673.6172, + "train_samples_per_second": 21.958, + "train_steps_per_second": 0.343 + } + ], + "logging_steps": 5, + "max_steps": 2976, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 623113855303680.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}