{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008764241893076249, "grad_norm": 3.7227299213409424, "learning_rate": 9.999654593547529e-05, "loss": 0.8928, "step": 10 }, { "epoch": 0.017528483786152498, "grad_norm": 2.602724552154541, "learning_rate": 9.998460664009619e-05, "loss": 0.7339, "step": 20 }, { "epoch": 0.026292725679228746, "grad_norm": 2.0315356254577637, "learning_rate": 9.996414173332143e-05, "loss": 0.8093, "step": 30 }, { "epoch": 0.035056967572304996, "grad_norm": 2.265153408050537, "learning_rate": 9.993515509371871e-05, "loss": 0.7516, "step": 40 }, { "epoch": 0.04382120946538125, "grad_norm": 1.9205143451690674, "learning_rate": 9.989765221491895e-05, "loss": 0.7691, "step": 50 }, { "epoch": 0.05258545135845749, "grad_norm": 2.219170331954956, "learning_rate": 9.985164020457504e-05, "loss": 0.7539, "step": 60 }, { "epoch": 0.06134969325153374, "grad_norm": 2.3345134258270264, "learning_rate": 9.979712778301489e-05, "loss": 0.6932, "step": 70 }, { "epoch": 0.07011393514460999, "grad_norm": 1.9078369140625, "learning_rate": 9.973412528158862e-05, "loss": 0.7058, "step": 80 }, { "epoch": 0.07887817703768624, "grad_norm": 2.3402259349823, "learning_rate": 9.966264464071064e-05, "loss": 0.7181, "step": 90 }, { "epoch": 0.0876424189307625, "grad_norm": 1.6266059875488281, "learning_rate": 9.958269940759659e-05, "loss": 0.6715, "step": 100 }, { "epoch": 0.09640666082383874, "grad_norm": 1.8910496234893799, "learning_rate": 9.94943047336958e-05, "loss": 0.6983, "step": 110 }, { "epoch": 0.10517090271691498, "grad_norm": 1.970109224319458, "learning_rate": 9.939747737181993e-05, "loss": 0.7639, "step": 120 }, { "epoch": 0.11393514460999124, "grad_norm": 1.9310839176177979, "learning_rate": 9.929223567296766e-05, "loss": 0.6289, "step": 130 }, { "epoch": 
0.12269938650306748, "grad_norm": 1.5292866230010986, "learning_rate": 9.917859958284699e-05, "loss": 0.6575, "step": 140 }, { "epoch": 0.13146362839614373, "grad_norm": 1.3497545719146729, "learning_rate": 9.905659063809492e-05, "loss": 0.6986, "step": 150 }, { "epoch": 0.14022787028921999, "grad_norm": 2.1464974880218506, "learning_rate": 9.892623196219586e-05, "loss": 0.7055, "step": 160 }, { "epoch": 0.14899211218229624, "grad_norm": 2.5107662677764893, "learning_rate": 9.878754826109915e-05, "loss": 0.7421, "step": 170 }, { "epoch": 0.15775635407537247, "grad_norm": 1.9968948364257812, "learning_rate": 9.864056581853674e-05, "loss": 0.6956, "step": 180 }, { "epoch": 0.16652059596844873, "grad_norm": 1.5380990505218506, "learning_rate": 9.84853124910418e-05, "loss": 0.7439, "step": 190 }, { "epoch": 0.175284837861525, "grad_norm": 1.8293559551239014, "learning_rate": 9.832181770266927e-05, "loss": 0.6924, "step": 200 }, { "epoch": 0.18404907975460122, "grad_norm": 1.5421850681304932, "learning_rate": 9.815011243941939e-05, "loss": 0.6443, "step": 210 }, { "epoch": 0.19281332164767748, "grad_norm": 1.7461329698562622, "learning_rate": 9.797022924336504e-05, "loss": 0.7049, "step": 220 }, { "epoch": 0.20157756354075373, "grad_norm": 1.9567017555236816, "learning_rate": 9.778220220648439e-05, "loss": 0.6797, "step": 230 }, { "epoch": 0.21034180543382996, "grad_norm": 2.222736358642578, "learning_rate": 9.75860669641996e-05, "loss": 0.6958, "step": 240 }, { "epoch": 0.21910604732690622, "grad_norm": 1.6702378988265991, "learning_rate": 9.738186068862311e-05, "loss": 0.6895, "step": 250 }, { "epoch": 0.22787028921998248, "grad_norm": 1.9190706014633179, "learning_rate": 9.716962208151269e-05, "loss": 0.6747, "step": 260 }, { "epoch": 0.2366345311130587, "grad_norm": 2.277719020843506, "learning_rate": 9.69493913669366e-05, "loss": 0.704, "step": 270 }, { "epoch": 0.24539877300613497, "grad_norm": 1.6245454549789429, "learning_rate": 9.672121028365014e-05, "loss": 
0.6023, "step": 280 }, { "epoch": 0.2541630148992112, "grad_norm": 1.7889143228530884, "learning_rate": 9.648512207718532e-05, "loss": 0.7218, "step": 290 }, { "epoch": 0.26292725679228746, "grad_norm": 2.422360897064209, "learning_rate": 9.624117149165466e-05, "loss": 0.6986, "step": 300 }, { "epoch": 0.27169149868536374, "grad_norm": 2.14021372795105, "learning_rate": 9.598940476127131e-05, "loss": 0.6476, "step": 310 }, { "epoch": 0.28045574057843997, "grad_norm": 1.9130802154541016, "learning_rate": 9.57298696015866e-05, "loss": 0.7056, "step": 320 }, { "epoch": 0.2892199824715162, "grad_norm": 2.4466943740844727, "learning_rate": 9.546261520044675e-05, "loss": 0.6542, "step": 330 }, { "epoch": 0.2979842243645925, "grad_norm": 1.6736353635787964, "learning_rate": 9.518769220867076e-05, "loss": 0.6648, "step": 340 }, { "epoch": 0.3067484662576687, "grad_norm": 1.745496153831482, "learning_rate": 9.490515273045085e-05, "loss": 0.6411, "step": 350 }, { "epoch": 0.31551270815074495, "grad_norm": 2.82519268989563, "learning_rate": 9.461505031347753e-05, "loss": 0.6597, "step": 360 }, { "epoch": 0.32427695004382123, "grad_norm": 2.2791597843170166, "learning_rate": 9.431743993879119e-05, "loss": 0.6931, "step": 370 }, { "epoch": 0.33304119193689746, "grad_norm": 1.6820751428604126, "learning_rate": 9.401237801036176e-05, "loss": 0.6529, "step": 380 }, { "epoch": 0.3418054338299737, "grad_norm": 1.7358545064926147, "learning_rate": 9.369992234439899e-05, "loss": 0.7001, "step": 390 }, { "epoch": 0.35056967572305, "grad_norm": 1.7271596193313599, "learning_rate": 9.338013215839495e-05, "loss": 0.6801, "step": 400 }, { "epoch": 0.3593339176161262, "grad_norm": 1.7952642440795898, "learning_rate": 9.305306805990093e-05, "loss": 0.7023, "step": 410 }, { "epoch": 0.36809815950920244, "grad_norm": 2.275848865509033, "learning_rate": 9.271879203504094e-05, "loss": 0.6641, "step": 420 }, { "epoch": 0.3768624014022787, "grad_norm": 1.435059666633606, "learning_rate": 
9.237736743676386e-05, "loss": 0.6739, "step": 430 }, { "epoch": 0.38562664329535495, "grad_norm": 1.6760456562042236, "learning_rate": 9.202885897283674e-05, "loss": 0.6245, "step": 440 }, { "epoch": 0.3943908851884312, "grad_norm": 1.5440309047698975, "learning_rate": 9.167333269358109e-05, "loss": 0.6436, "step": 450 }, { "epoch": 0.40315512708150747, "grad_norm": 1.798532485961914, "learning_rate": 9.131085597935487e-05, "loss": 0.6097, "step": 460 }, { "epoch": 0.4119193689745837, "grad_norm": 1.5643911361694336, "learning_rate": 9.094149752778233e-05, "loss": 0.7032, "step": 470 }, { "epoch": 0.42068361086765993, "grad_norm": 1.9498308897018433, "learning_rate": 9.056532734073434e-05, "loss": 0.6429, "step": 480 }, { "epoch": 0.4294478527607362, "grad_norm": 2.7362070083618164, "learning_rate": 9.018241671106134e-05, "loss": 0.6518, "step": 490 }, { "epoch": 0.43821209465381245, "grad_norm": 1.755315899848938, "learning_rate": 8.979283820908174e-05, "loss": 0.6145, "step": 500 }, { "epoch": 0.4469763365468887, "grad_norm": 1.6948930025100708, "learning_rate": 8.939666566882821e-05, "loss": 0.6809, "step": 510 }, { "epoch": 0.45574057843996496, "grad_norm": 1.9926345348358154, "learning_rate": 8.899397417405442e-05, "loss": 0.6811, "step": 520 }, { "epoch": 0.4645048203330412, "grad_norm": 2.0141072273254395, "learning_rate": 8.858484004400496e-05, "loss": 0.6326, "step": 530 }, { "epoch": 0.4732690622261174, "grad_norm": 1.5361511707305908, "learning_rate": 8.816934081895105e-05, "loss": 0.6316, "step": 540 }, { "epoch": 0.4820333041191937, "grad_norm": 2.0940563678741455, "learning_rate": 8.774755524549503e-05, "loss": 0.7468, "step": 550 }, { "epoch": 0.49079754601226994, "grad_norm": 2.6678926944732666, "learning_rate": 8.731956326164591e-05, "loss": 0.6853, "step": 560 }, { "epoch": 0.49956178790534617, "grad_norm": 2.2798871994018555, "learning_rate": 8.688544598166935e-05, "loss": 0.6488, "step": 570 }, { "epoch": 0.5083260297984225, "grad_norm": 
2.1741902828216553, "learning_rate": 8.644528568071472e-05, "loss": 0.7044, "step": 580 }, { "epoch": 0.5170902716914987, "grad_norm": 1.9935109615325928, "learning_rate": 8.599916577922198e-05, "loss": 0.6689, "step": 590 }, { "epoch": 0.5258545135845749, "grad_norm": 2.2509982585906982, "learning_rate": 8.554717082711164e-05, "loss": 0.6697, "step": 600 }, { "epoch": 0.5346187554776511, "grad_norm": 1.5349043607711792, "learning_rate": 8.508938648776062e-05, "loss": 0.6741, "step": 610 }, { "epoch": 0.5433829973707275, "grad_norm": 1.8080490827560425, "learning_rate": 8.462589952176709e-05, "loss": 0.6448, "step": 620 }, { "epoch": 0.5521472392638037, "grad_norm": 1.4415456056594849, "learning_rate": 8.415679777050735e-05, "loss": 0.6585, "step": 630 }, { "epoch": 0.5609114811568799, "grad_norm": 1.9184411764144897, "learning_rate": 8.368217013948786e-05, "loss": 0.6934, "step": 640 }, { "epoch": 0.5696757230499562, "grad_norm": 2.0871620178222656, "learning_rate": 8.320210658149562e-05, "loss": 0.642, "step": 650 }, { "epoch": 0.5784399649430324, "grad_norm": 1.835964322090149, "learning_rate": 8.271669807955007e-05, "loss": 0.7067, "step": 660 }, { "epoch": 0.5872042068361086, "grad_norm": 2.068668842315674, "learning_rate": 8.222603662965974e-05, "loss": 0.6759, "step": 670 }, { "epoch": 0.595968448729185, "grad_norm": 1.837086796760559, "learning_rate": 8.173021522338687e-05, "loss": 0.6301, "step": 680 }, { "epoch": 0.6047326906222612, "grad_norm": 1.995373010635376, "learning_rate": 8.122932783022342e-05, "loss": 0.688, "step": 690 }, { "epoch": 0.6134969325153374, "grad_norm": 1.736932635307312, "learning_rate": 8.072346937978168e-05, "loss": 0.6538, "step": 700 }, { "epoch": 0.6222611744084137, "grad_norm": 2.074141025543213, "learning_rate": 8.02127357438029e-05, "loss": 0.7275, "step": 710 }, { "epoch": 0.6310254163014899, "grad_norm": 2.758039951324463, "learning_rate": 7.969722371798753e-05, "loss": 0.7139, "step": 720 }, { "epoch": 
0.6397896581945661, "grad_norm": 1.8415740728378296, "learning_rate": 7.917703100365005e-05, "loss": 0.5433, "step": 730 }, { "epoch": 0.6485539000876425, "grad_norm": 2.0133821964263916, "learning_rate": 7.865225618920248e-05, "loss": 0.6369, "step": 740 }, { "epoch": 0.6573181419807187, "grad_norm": 1.943287968635559, "learning_rate": 7.812299873146955e-05, "loss": 0.6168, "step": 750 }, { "epoch": 0.6660823838737949, "grad_norm": 1.7971194982528687, "learning_rate": 7.758935893683939e-05, "loss": 0.6518, "step": 760 }, { "epoch": 0.6748466257668712, "grad_norm": 1.5026414394378662, "learning_rate": 7.705143794225315e-05, "loss": 0.6498, "step": 770 }, { "epoch": 0.6836108676599474, "grad_norm": 1.8013416528701782, "learning_rate": 7.65093376960372e-05, "loss": 0.6598, "step": 780 }, { "epoch": 0.6923751095530236, "grad_norm": 1.8364579677581787, "learning_rate": 7.596316093858172e-05, "loss": 0.6934, "step": 790 }, { "epoch": 0.7011393514461, "grad_norm": 1.7535738945007324, "learning_rate": 7.541301118286894e-05, "loss": 0.5668, "step": 800 }, { "epoch": 0.7099035933391762, "grad_norm": 1.9401272535324097, "learning_rate": 7.485899269485506e-05, "loss": 0.6852, "step": 810 }, { "epoch": 0.7186678352322524, "grad_norm": 1.886604905128479, "learning_rate": 7.430121047370955e-05, "loss": 0.6845, "step": 820 }, { "epoch": 0.7274320771253286, "grad_norm": 1.9620131254196167, "learning_rate": 7.37397702319153e-05, "loss": 0.6344, "step": 830 }, { "epoch": 0.7361963190184049, "grad_norm": 1.949866771697998, "learning_rate": 7.32314343370074e-05, "loss": 0.6247, "step": 840 }, { "epoch": 0.7449605609114811, "grad_norm": 1.5609701871871948, "learning_rate": 7.266333756059938e-05, "loss": 0.604, "step": 850 }, { "epoch": 0.7537248028045574, "grad_norm": 2.5070557594299316, "learning_rate": 7.209189317790467e-05, "loss": 0.6262, "step": 860 }, { "epoch": 0.7624890446976337, "grad_norm": 1.672145128250122, "learning_rate": 7.151720949069814e-05, "loss": 0.6188, "step": 870 
}, { "epoch": 0.7712532865907099, "grad_norm": 1.456264615058899, "learning_rate": 7.093939541467697e-05, "loss": 0.6026, "step": 880 }, { "epoch": 0.7800175284837861, "grad_norm": 1.967781901359558, "learning_rate": 7.035856045881851e-05, "loss": 0.6649, "step": 890 }, { "epoch": 0.7887817703768624, "grad_norm": 1.6206531524658203, "learning_rate": 6.977481470462593e-05, "loss": 0.6018, "step": 900 }, { "epoch": 0.7975460122699386, "grad_norm": 2.240525960922241, "learning_rate": 6.918826878526527e-05, "loss": 0.5912, "step": 910 }, { "epoch": 0.8063102541630149, "grad_norm": 1.8358842134475708, "learning_rate": 6.859903386459781e-05, "loss": 0.6435, "step": 920 }, { "epoch": 0.8150744960560912, "grad_norm": 2.574652671813965, "learning_rate": 6.80072216161121e-05, "loss": 0.6789, "step": 930 }, { "epoch": 0.8238387379491674, "grad_norm": 1.668445348739624, "learning_rate": 6.741294420175927e-05, "loss": 0.6103, "step": 940 }, { "epoch": 0.8326029798422436, "grad_norm": 1.7513461112976074, "learning_rate": 6.681631425069566e-05, "loss": 0.5696, "step": 950 }, { "epoch": 0.8413672217353199, "grad_norm": 2.508164405822754, "learning_rate": 6.621744483793715e-05, "loss": 0.653, "step": 960 }, { "epoch": 0.8501314636283961, "grad_norm": 1.9786324501037598, "learning_rate": 6.56164494629288e-05, "loss": 0.635, "step": 970 }, { "epoch": 0.8588957055214724, "grad_norm": 1.8781009912490845, "learning_rate": 6.501344202803414e-05, "loss": 0.7071, "step": 980 }, { "epoch": 0.8676599474145487, "grad_norm": 1.819648027420044, "learning_rate": 6.440853681694801e-05, "loss": 0.5952, "step": 990 }, { "epoch": 0.8764241893076249, "grad_norm": 1.8697113990783691, "learning_rate": 6.380184847303727e-05, "loss": 0.6179, "step": 1000 }, { "epoch": 0.8851884312007011, "grad_norm": 1.6153459548950195, "learning_rate": 6.319349197761317e-05, "loss": 0.6071, "step": 1010 }, { "epoch": 0.8939526730937774, "grad_norm": 1.5980494022369385, "learning_rate": 6.25835826281398e-05, "loss": 
0.588, "step": 1020 }, { "epoch": 0.9027169149868537, "grad_norm": 1.509323239326477, "learning_rate": 6.197223601638266e-05, "loss": 0.6636, "step": 1030 }, { "epoch": 0.9114811568799299, "grad_norm": 1.91567862033844, "learning_rate": 6.135956800650128e-05, "loss": 0.6727, "step": 1040 }, { "epoch": 0.9202453987730062, "grad_norm": 1.7963509559631348, "learning_rate": 6.074569471309032e-05, "loss": 0.5768, "step": 1050 }, { "epoch": 0.9290096406660824, "grad_norm": 1.8217496871948242, "learning_rate": 6.013073247917326e-05, "loss": 0.6243, "step": 1060 }, { "epoch": 0.9377738825591586, "grad_norm": 1.648887276649475, "learning_rate": 5.951479785415266e-05, "loss": 0.6717, "step": 1070 }, { "epoch": 0.9465381244522348, "grad_norm": 1.668747067451477, "learning_rate": 5.889800757172146e-05, "loss": 0.5823, "step": 1080 }, { "epoch": 0.9553023663453112, "grad_norm": 2.224945545196533, "learning_rate": 5.8280478527739235e-05, "loss": 0.6203, "step": 1090 }, { "epoch": 0.9640666082383874, "grad_norm": null, "learning_rate": 5.7724167474641534e-05, "loss": 0.6271, "step": 1100 }, { "epoch": 0.9728308501314636, "grad_norm": 2.2267634868621826, "learning_rate": 5.7105557315385284e-05, "loss": 0.6614, "step": 1110 }, { "epoch": 0.9815950920245399, "grad_norm": 1.7963807582855225, "learning_rate": 5.6486548104880555e-05, "loss": 0.7113, "step": 1120 }, { "epoch": 0.9903593339176161, "grad_norm": 2.0616729259490967, "learning_rate": 5.586725715952452e-05, "loss": 0.6077, "step": 1130 }, { "epoch": 0.9991235758106923, "grad_norm": 1.7366951704025269, "learning_rate": 5.5247801849109526e-05, "loss": 0.634, "step": 1140 }, { "epoch": 1.0078878177037687, "grad_norm": 1.4753001928329468, "learning_rate": 5.462829957457888e-05, "loss": 0.44, "step": 1150 }, { "epoch": 1.016652059596845, "grad_norm": 1.493257999420166, "learning_rate": 5.400886774577667e-05, "loss": 0.3635, "step": 1160 }, { "epoch": 1.0254163014899211, "grad_norm": 1.7157448530197144, "learning_rate": 
5.338962375919589e-05, "loss": 0.3525, "step": 1170 }, { "epoch": 1.0341805433829974, "grad_norm": 1.7190569639205933, "learning_rate": 5.277068497572914e-05, "loss": 0.3751, "step": 1180 }, { "epoch": 1.0429447852760736, "grad_norm": 1.5113599300384521, "learning_rate": 5.215216869842604e-05, "loss": 0.3857, "step": 1190 }, { "epoch": 1.0517090271691498, "grad_norm": 1.6809605360031128, "learning_rate": 5.1534192150261676e-05, "loss": 0.3611, "step": 1200 }, { "epoch": 1.060473269062226, "grad_norm": 1.6623985767364502, "learning_rate": 5.091687245192006e-05, "loss": 0.3558, "step": 1210 }, { "epoch": 1.0692375109553023, "grad_norm": 2.009783983230591, "learning_rate": 5.030032659959722e-05, "loss": 0.3654, "step": 1220 }, { "epoch": 1.0780017528483785, "grad_norm": 1.6117240190505981, "learning_rate": 4.968467144282759e-05, "loss": 0.3715, "step": 1230 }, { "epoch": 1.086765994741455, "grad_norm": 1.3283199071884155, "learning_rate": 4.9070023662338523e-05, "loss": 0.3736, "step": 1240 }, { "epoch": 1.0955302366345312, "grad_norm": 1.8791778087615967, "learning_rate": 4.8456499747936465e-05, "loss": 0.4082, "step": 1250 }, { "epoch": 1.1042944785276074, "grad_norm": 1.7714508771896362, "learning_rate": 4.7844215976429576e-05, "loss": 0.3743, "step": 1260 }, { "epoch": 1.1130587204206837, "grad_norm": 1.773903489112854, "learning_rate": 4.723328838959057e-05, "loss": 0.3858, "step": 1270 }, { "epoch": 1.1218229623137599, "grad_norm": 1.4871599674224854, "learning_rate": 4.662383277216418e-05, "loss": 0.3486, "step": 1280 }, { "epoch": 1.1305872042068361, "grad_norm": 1.3975831270217896, "learning_rate": 4.601596462992326e-05, "loss": 0.3657, "step": 1290 }, { "epoch": 1.1393514460999123, "grad_norm": 1.5370299816131592, "learning_rate": 4.540979916777783e-05, "loss": 0.3371, "step": 1300 }, { "epoch": 1.1481156879929886, "grad_norm": 1.5495011806488037, "learning_rate": 4.480545126794115e-05, "loss": 0.3357, "step": 1310 }, { "epoch": 1.1568799298860648, 
"grad_norm": 1.694589376449585, "learning_rate": 4.420303546815678e-05, "loss": 0.3645, "step": 1320 }, { "epoch": 1.165644171779141, "grad_norm": 1.985825777053833, "learning_rate": 4.360266593999124e-05, "loss": 0.3546, "step": 1330 }, { "epoch": 1.1744084136722173, "grad_norm": 1.5808460712432861, "learning_rate": 4.300445646719573e-05, "loss": 0.3638, "step": 1340 }, { "epoch": 1.1831726555652935, "grad_norm": 1.7759652137756348, "learning_rate": 4.240852042414162e-05, "loss": 0.4059, "step": 1350 }, { "epoch": 1.19193689745837, "grad_norm": 1.7563869953155518, "learning_rate": 4.181497075433334e-05, "loss": 0.3446, "step": 1360 }, { "epoch": 1.2007011393514462, "grad_norm": 1.9143909215927124, "learning_rate": 4.1223919949003045e-05, "loss": 0.3487, "step": 1370 }, { "epoch": 1.2094653812445224, "grad_norm": 1.8407511711120605, "learning_rate": 4.0635480025790926e-05, "loss": 0.3649, "step": 1380 }, { "epoch": 1.2182296231375986, "grad_norm": 1.7706880569458008, "learning_rate": 4.0049762507515355e-05, "loss": 0.3612, "step": 1390 }, { "epoch": 1.2269938650306749, "grad_norm": 1.0870561599731445, "learning_rate": 3.9466878401036686e-05, "loss": 0.3401, "step": 1400 }, { "epoch": 1.235758106923751, "grad_norm": 1.91828453540802, "learning_rate": 3.8886938176219024e-05, "loss": 0.3327, "step": 1410 }, { "epoch": 1.2445223488168273, "grad_norm": 1.61056649684906, "learning_rate": 3.8310051744993514e-05, "loss": 0.3386, "step": 1420 }, { "epoch": 1.2532865907099036, "grad_norm": 2.071869373321533, "learning_rate": 3.773632844052767e-05, "loss": 0.363, "step": 1430 }, { "epoch": 1.2620508326029798, "grad_norm": 1.671288251876831, "learning_rate": 3.7165876996504125e-05, "loss": 0.3828, "step": 1440 }, { "epoch": 1.270815074496056, "grad_norm": 1.8811005353927612, "learning_rate": 3.659880552651317e-05, "loss": 0.3551, "step": 1450 }, { "epoch": 1.2795793163891322, "grad_norm": 1.5208740234375, "learning_rate": 3.6035221503562775e-05, "loss": 0.3566, "step": 1460 }, 
{ "epoch": 1.2883435582822087, "grad_norm": 1.7736235857009888, "learning_rate": 3.547523173970989e-05, "loss": 0.3629, "step": 1470 }, { "epoch": 1.2971078001752847, "grad_norm": 1.6049748659133911, "learning_rate": 3.491894236581728e-05, "loss": 0.3359, "step": 1480 }, { "epoch": 1.3058720420683612, "grad_norm": 1.4260120391845703, "learning_rate": 3.436645881143918e-05, "loss": 0.3758, "step": 1490 }, { "epoch": 1.3146362839614374, "grad_norm": 1.9971890449523926, "learning_rate": 3.3817885784839986e-05, "loss": 0.314, "step": 1500 }, { "epoch": 1.3234005258545136, "grad_norm": 1.7832164764404297, "learning_rate": 3.327332725314974e-05, "loss": 0.3901, "step": 1510 }, { "epoch": 1.3321647677475899, "grad_norm": 1.6533173322677612, "learning_rate": 3.273288642265985e-05, "loss": 0.3324, "step": 1520 }, { "epoch": 1.340929009640666, "grad_norm": 1.5945855379104614, "learning_rate": 3.2196665719263266e-05, "loss": 0.3435, "step": 1530 }, { "epoch": 1.3496932515337423, "grad_norm": 1.51680588722229, "learning_rate": 3.166476676904235e-05, "loss": 0.3714, "step": 1540 }, { "epoch": 1.3584574934268185, "grad_norm": 1.6684399843215942, "learning_rate": 3.113729037900843e-05, "loss": 0.333, "step": 1550 }, { "epoch": 1.3672217353198948, "grad_norm": 1.5105247497558594, "learning_rate": 3.0614336517996576e-05, "loss": 0.3615, "step": 1560 }, { "epoch": 1.375985977212971, "grad_norm": 2.113157272338867, "learning_rate": 3.0096004297719205e-05, "loss": 0.3002, "step": 1570 }, { "epoch": 1.3847502191060475, "grad_norm": 1.1173641681671143, "learning_rate": 2.958239195398217e-05, "loss": 0.3571, "step": 1580 }, { "epoch": 1.3935144609991235, "grad_norm": 1.4548070430755615, "learning_rate": 2.90735968280668e-05, "loss": 0.3252, "step": 1590 }, { "epoch": 1.4022787028922, "grad_norm": 1.7521406412124634, "learning_rate": 2.8569715348281547e-05, "loss": 0.3409, "step": 1600 }, { "epoch": 1.4110429447852761, "grad_norm": 1.7292704582214355, "learning_rate": 
2.807084301168652e-05, "loss": 0.3282, "step": 1610 }, { "epoch": 1.4198071866783524, "grad_norm": 1.7453776597976685, "learning_rate": 2.7577074365994747e-05, "loss": 0.3217, "step": 1620 }, { "epoch": 1.4285714285714286, "grad_norm": 2.02482271194458, "learning_rate": 2.70885029916531e-05, "loss": 0.3355, "step": 1630 }, { "epoch": 1.4373356704645048, "grad_norm": 1.927150845527649, "learning_rate": 2.660522148410675e-05, "loss": 0.3528, "step": 1640 }, { "epoch": 1.446099912357581, "grad_norm": 1.5125665664672852, "learning_rate": 2.6127321436250117e-05, "loss": 0.3355, "step": 1650 }, { "epoch": 1.4548641542506573, "grad_norm": 1.771646499633789, "learning_rate": 2.565489342106805e-05, "loss": 0.3375, "step": 1660 }, { "epoch": 1.4636283961437335, "grad_norm": 2.540931463241577, "learning_rate": 2.518802697447003e-05, "loss": 0.3601, "step": 1670 }, { "epoch": 1.4723926380368098, "grad_norm": 1.5853700637817383, "learning_rate": 2.472681057832121e-05, "loss": 0.3207, "step": 1680 }, { "epoch": 1.481156879929886, "grad_norm": 2.11691951751709, "learning_rate": 2.427133164367296e-05, "loss": 0.3357, "step": 1690 }, { "epoch": 1.4899211218229622, "grad_norm": 1.55061674118042, "learning_rate": 2.3821676494196572e-05, "loss": 0.3314, "step": 1700 }, { "epoch": 1.4986853637160387, "grad_norm": 1.5113292932510376, "learning_rate": 2.3377930349822856e-05, "loss": 0.3162, "step": 1710 }, { "epoch": 1.5074496056091147, "grad_norm": 2.0338950157165527, "learning_rate": 2.2940177310591113e-05, "loss": 0.3693, "step": 1720 }, { "epoch": 1.5162138475021911, "grad_norm": 2.0127060413360596, "learning_rate": 2.250850034071016e-05, "loss": 0.324, "step": 1730 }, { "epoch": 1.5249780893952674, "grad_norm": 1.9195280075073242, "learning_rate": 2.20829812528348e-05, "loss": 0.3559, "step": 1740 }, { "epoch": 1.5337423312883436, "grad_norm": 1.6400375366210938, "learning_rate": 2.1663700692560373e-05, "loss": 0.3371, "step": 1750 }, { "epoch": 1.5425065731814198, "grad_norm": 
1.9201463460922241, "learning_rate": 2.1250738123138665e-05, "loss": 0.3536, "step": 1760 }, { "epoch": 1.551270815074496, "grad_norm": 1.8780487775802612, "learning_rate": 2.084417181041769e-05, "loss": 0.3829, "step": 1770 }, { "epoch": 1.5600350569675723, "grad_norm": 1.8463397026062012, "learning_rate": 2.0444078808008655e-05, "loss": 0.3431, "step": 1780 }, { "epoch": 1.5687992988606485, "grad_norm": 1.8080838918685913, "learning_rate": 2.005053494268241e-05, "loss": 0.3748, "step": 1790 }, { "epoch": 1.577563540753725, "grad_norm": 1.9387633800506592, "learning_rate": 1.9663614799998635e-05, "loss": 0.3586, "step": 1800 }, { "epoch": 1.586327782646801, "grad_norm": 1.7084999084472656, "learning_rate": 1.928339171017015e-05, "loss": 0.357, "step": 1810 }, { "epoch": 1.5950920245398774, "grad_norm": 2.1515519618988037, "learning_rate": 1.8909937734165107e-05, "loss": 0.3333, "step": 1820 }, { "epoch": 1.6038562664329534, "grad_norm": 1.4635021686553955, "learning_rate": 1.8543323650049864e-05, "loss": 0.3337, "step": 1830 }, { "epoch": 1.6126205083260299, "grad_norm": 1.7444740533828735, "learning_rate": 1.8183618939574904e-05, "loss": 0.3686, "step": 1840 }, { "epoch": 1.6213847502191059, "grad_norm": 1.587889552116394, "learning_rate": 1.7830891775006396e-05, "loss": 0.3167, "step": 1850 }, { "epoch": 1.6301489921121823, "grad_norm": 1.9329102039337158, "learning_rate": 1.748520900620609e-05, "loss": 0.3349, "step": 1860 }, { "epoch": 1.6389132340052586, "grad_norm": 1.610293984413147, "learning_rate": 1.714663614796167e-05, "loss": 0.3841, "step": 1870 }, { "epoch": 1.6476774758983348, "grad_norm": 1.9500341415405273, "learning_rate": 1.6815237367570197e-05, "loss": 0.3575, "step": 1880 }, { "epoch": 1.656441717791411, "grad_norm": 1.717809796333313, "learning_rate": 1.6491075472677016e-05, "loss": 0.2917, "step": 1890 }, { "epoch": 1.6652059596844873, "grad_norm": 1.2370789051055908, "learning_rate": 1.6174211899372175e-05, "loss": 0.3535, "step": 1900 }, { 
"epoch": 1.6739702015775635, "grad_norm": 1.6636922359466553, "learning_rate": 1.5864706700546955e-05, "loss": 0.3242, "step": 1910 }, { "epoch": 1.6827344434706397, "grad_norm": 1.3746954202651978, "learning_rate": 1.5562618534512428e-05, "loss": 0.3463, "step": 1920 }, { "epoch": 1.6914986853637162, "grad_norm": 1.9031116962432861, "learning_rate": 1.5268004653882406e-05, "loss": 0.3448, "step": 1930 }, { "epoch": 1.7002629272567922, "grad_norm": 1.8231595754623413, "learning_rate": 1.4980920894722692e-05, "loss": 0.3327, "step": 1940 }, { "epoch": 1.7090271691498686, "grad_norm": 2.2330541610717773, "learning_rate": 1.4701421665969001e-05, "loss": 0.344, "step": 1950 }, { "epoch": 1.7177914110429446, "grad_norm": 1.5624206066131592, "learning_rate": 1.442955993911505e-05, "loss": 0.3194, "step": 1960 }, { "epoch": 1.726555652936021, "grad_norm": 1.7255851030349731, "learning_rate": 1.4165387238173399e-05, "loss": 0.3122, "step": 1970 }, { "epoch": 1.7353198948290973, "grad_norm": 1.7800745964050293, "learning_rate": 1.3908953629910376e-05, "loss": 0.3005, "step": 1980 }, { "epoch": 1.7440841367221736, "grad_norm": 2.472658395767212, "learning_rate": 1.3660307714357338e-05, "loss": 0.3571, "step": 1990 }, { "epoch": 1.7528483786152498, "grad_norm": 1.6620064973831177, "learning_rate": 1.3419496615599805e-05, "loss": 0.3522, "step": 2000 }, { "epoch": 1.761612620508326, "grad_norm": 1.5490673780441284, "learning_rate": 1.318656597284643e-05, "loss": 0.2967, "step": 2010 }, { "epoch": 1.7703768624014022, "grad_norm": 2.1536474227905273, "learning_rate": 1.2961559931779257e-05, "loss": 0.3703, "step": 2020 }, { "epoch": 1.7791411042944785, "grad_norm": 1.490907073020935, "learning_rate": 1.274452113618716e-05, "loss": 0.3076, "step": 2030 }, { "epoch": 1.787905346187555, "grad_norm": 1.4357167482376099, "learning_rate": 1.2535490719883835e-05, "loss": 0.3265, "step": 2040 }, { "epoch": 1.796669588080631, "grad_norm": 1.5666704177856445, "learning_rate": 
1.233450829891203e-05, "loss": 0.324, "step": 2050 }, { "epoch": 1.8054338299737074, "grad_norm": 1.3453813791275024, "learning_rate": 1.2141611964035366e-05, "loss": 0.2977, "step": 2060 }, { "epoch": 1.8141980718667834, "grad_norm": 1.782593011856079, "learning_rate": 1.195683827351931e-05, "loss": 0.2795, "step": 2070 }, { "epoch": 1.8229623137598598, "grad_norm": 1.9441533088684082, "learning_rate": 1.1780222246202494e-05, "loss": 0.3166, "step": 2080 }, { "epoch": 1.831726555652936, "grad_norm": 1.778911828994751, "learning_rate": 1.1611797354859892e-05, "loss": 0.3493, "step": 2090 }, { "epoch": 1.8404907975460123, "grad_norm": 1.7141963243484497, "learning_rate": 1.145159551985894e-05, "loss": 0.3313, "step": 2100 }, { "epoch": 1.8492550394390885, "grad_norm": 3.3165926933288574, "learning_rate": 1.1299647103109908e-05, "loss": 0.356, "step": 2110 }, { "epoch": 1.8580192813321648, "grad_norm": 1.787851095199585, "learning_rate": 1.11559809023116e-05, "loss": 0.3219, "step": 2120 }, { "epoch": 1.866783523225241, "grad_norm": 1.7090057134628296, "learning_rate": 1.1020624145493572e-05, "loss": 0.3445, "step": 2130 }, { "epoch": 1.8755477651183172, "grad_norm": 1.5748744010925293, "learning_rate": 1.0893602485855766e-05, "loss": 0.3253, "step": 2140 }, { "epoch": 1.8843120070113937, "grad_norm": 1.9979685544967651, "learning_rate": 1.0774939996906644e-05, "loss": 0.3177, "step": 2150 }, { "epoch": 1.8930762489044697, "grad_norm": 1.936296820640564, "learning_rate": 1.0664659167900723e-05, "loss": 0.3304, "step": 2160 }, { "epoch": 1.9018404907975461, "grad_norm": 1.8121618032455444, "learning_rate": 1.0562780899576344e-05, "loss": 0.3493, "step": 2170 }, { "epoch": 1.9106047326906221, "grad_norm": 1.6870180368423462, "learning_rate": 1.046932450019448e-05, "loss": 0.328, "step": 2180 }, { "epoch": 1.9193689745836986, "grad_norm": 2.0619869232177734, "learning_rate": 1.0384307681879428e-05, "loss": 0.3845, "step": 2190 }, { "epoch": 1.9281332164767746, 
"grad_norm": 2.1429038047790527, "learning_rate": 1.030774655726191e-05, "loss": 0.3143, "step": 2200 }, { "epoch": 1.936897458369851, "grad_norm": 1.9444646835327148, "learning_rate": 1.0239655636425374e-05, "loss": 0.3135, "step": 2210 }, { "epoch": 1.9456617002629273, "grad_norm": 1.6065791845321655, "learning_rate": 1.0180047824156011e-05, "loss": 0.3142, "step": 2220 }, { "epoch": 1.9544259421560035, "grad_norm": 2.215041160583496, "learning_rate": 1.0128934417497004e-05, "loss": 0.3234, "step": 2230 }, { "epoch": 1.9631901840490797, "grad_norm": 1.766499638557434, "learning_rate": 1.008632510360747e-05, "loss": 0.3395, "step": 2240 }, { "epoch": 1.971954425942156, "grad_norm": 2.355278491973877, "learning_rate": 1.0052227957926518e-05, "loss": 0.3476, "step": 2250 }, { "epoch": 1.9807186678352322, "grad_norm": 1.6923573017120361, "learning_rate": 1.0026649442642785e-05, "loss": 0.386, "step": 2260 }, { "epoch": 1.9894829097283084, "grad_norm": 1.454087495803833, "learning_rate": 1.0009594405469695e-05, "loss": 0.3059, "step": 2270 }, { "epoch": 1.9982471516213849, "grad_norm": 1.5868600606918335, "learning_rate": 1.0001066078726703e-05, "loss": 0.3474, "step": 2280 } ], "logging_steps": 10, "max_steps": 2282, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.7769146165323366e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }