{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3202, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006246096189881324, "grad_norm": 0.47147321701049805, "learning_rate": 4.968769519050593e-05, "loss": 4.3946, "step": 20 }, { "epoch": 0.012492192379762648, "grad_norm": 1.0567554235458374, "learning_rate": 4.937539038101187e-05, "loss": 4.4724, "step": 40 }, { "epoch": 0.018738288569643973, "grad_norm": 1.647952675819397, "learning_rate": 4.90630855715178e-05, "loss": 4.162, "step": 60 }, { "epoch": 0.024984384759525295, "grad_norm": 1.286374568939209, "learning_rate": 4.875078076202374e-05, "loss": 3.7767, "step": 80 }, { "epoch": 0.03123048094940662, "grad_norm": 1.465066909790039, "learning_rate": 4.843847595252967e-05, "loss": 3.9672, "step": 100 }, { "epoch": 0.037476577139287946, "grad_norm": 1.7631100416183472, "learning_rate": 4.81261711430356e-05, "loss": 3.6736, "step": 120 }, { "epoch": 0.04372267332916927, "grad_norm": 1.5231457948684692, "learning_rate": 4.781386633354154e-05, "loss": 3.4314, "step": 140 }, { "epoch": 0.04996876951905059, "grad_norm": 3.9318079948425293, "learning_rate": 4.750156152404747e-05, "loss": 3.501, "step": 160 }, { "epoch": 0.056214865708931916, "grad_norm": 2.3082265853881836, "learning_rate": 4.71892567145534e-05, "loss": 3.3423, "step": 180 }, { "epoch": 0.06246096189881324, "grad_norm": 2.0189499855041504, "learning_rate": 4.6876951905059344e-05, "loss": 3.4936, "step": 200 }, { "epoch": 0.06870705808869457, "grad_norm": 2.6035380363464355, "learning_rate": 4.656464709556527e-05, "loss": 2.9782, "step": 220 }, { "epoch": 0.07495315427857589, "grad_norm": 1.5224251747131348, "learning_rate": 4.625234228607121e-05, "loss": 3.0411, "step": 240 }, { "epoch": 0.08119925046845722, "grad_norm": 3.759772777557373, "learning_rate": 4.5940037476577143e-05, "loss": 3.0212, "step": 260 }, { "epoch": 0.08744534665833854, "grad_norm": 3.752288341522217, "learning_rate": 4.562773266708307e-05, "loss": 3.0461, "step": 280 }, { "epoch": 0.09369144284821987, "grad_norm": 4.104545593261719, "learning_rate": 4.531542785758901e-05, "loss": 3.0462, "step": 300 }, { "epoch": 0.09993753903810118, "grad_norm": 4.520786762237549, "learning_rate": 4.500312304809494e-05, "loss": 2.8036, "step": 320 }, { "epoch": 0.1061836352279825, "grad_norm": 2.0589022636413574, "learning_rate": 4.469081823860088e-05, "loss": 2.9551, "step": 340 }, { "epoch": 0.11242973141786383, "grad_norm": 3.8759829998016357, "learning_rate": 4.437851342910681e-05, "loss": 2.5833, "step": 360 }, { "epoch": 0.11867582760774516, "grad_norm": 3.9548556804656982, "learning_rate": 4.406620861961274e-05, "loss": 3.0385, "step": 380 }, { "epoch": 0.12492192379762648, "grad_norm": 2.6232974529266357, "learning_rate": 4.375390381011868e-05, "loss": 2.7124, "step": 400 }, { "epoch": 0.1311680199875078, "grad_norm": 2.080956220626831, "learning_rate": 4.3441599000624614e-05, "loss": 2.8504, "step": 420 }, { "epoch": 0.13741411617738913, "grad_norm": 2.893707036972046, "learning_rate": 4.312929419113055e-05, "loss": 2.8075, "step": 440 }, { "epoch": 0.14366021236727045, "grad_norm": 3.174916982650757, "learning_rate": 4.281698938163648e-05, "loss": 2.6756, "step": 460 }, { "epoch": 0.14990630855715179, "grad_norm": 2.7715773582458496, "learning_rate": 4.2504684572142414e-05, "loss": 2.6994, "step": 480 }, { "epoch": 0.1561524047470331, "grad_norm": 3.1229937076568604, "learning_rate": 4.219237976264835e-05, "loss": 2.5, "step": 500 }, { "epoch": 0.16239850093691444, "grad_norm": 3.7433629035949707, "learning_rate": 4.188007495315428e-05, "loss": 2.7145, "step": 520 }, { "epoch": 0.16864459712679575, "grad_norm": 4.687792778015137, "learning_rate": 4.156777014366021e-05, "loss": 2.7268, "step": 540 }, { "epoch": 0.1748906933166771, "grad_norm": 4.73487663269043, "learning_rate": 4.125546533416615e-05, "loss": 3.1233, "step": 560 }, { "epoch": 0.1811367895065584, "grad_norm": 3.7153172492980957, "learning_rate": 4.094316052467208e-05, "loss": 2.7226, "step": 580 }, { "epoch": 0.18738288569643974, "grad_norm": 4.160110950469971, "learning_rate": 4.063085571517802e-05, "loss": 2.6628, "step": 600 }, { "epoch": 0.19362898188632105, "grad_norm": 3.126046657562256, "learning_rate": 4.031855090568395e-05, "loss": 3.0178, "step": 620 }, { "epoch": 0.19987507807620236, "grad_norm": 2.5289721488952637, "learning_rate": 4.0006246096189884e-05, "loss": 2.6422, "step": 640 }, { "epoch": 0.2061211742660837, "grad_norm": 3.9148857593536377, "learning_rate": 3.969394128669582e-05, "loss": 2.6393, "step": 660 }, { "epoch": 0.212367270455965, "grad_norm": 2.8504717350006104, "learning_rate": 3.938163647720175e-05, "loss": 2.9792, "step": 680 }, { "epoch": 0.21861336664584635, "grad_norm": 3.4905455112457275, "learning_rate": 3.906933166770769e-05, "loss": 2.8159, "step": 700 }, { "epoch": 0.22485946283572766, "grad_norm": 3.08007550239563, "learning_rate": 3.875702685821362e-05, "loss": 2.6937, "step": 720 }, { "epoch": 0.231105559025609, "grad_norm": 2.3973348140716553, "learning_rate": 3.844472204871955e-05, "loss": 2.9008, "step": 740 }, { "epoch": 0.23735165521549031, "grad_norm": 2.920398473739624, "learning_rate": 3.813241723922549e-05, "loss": 2.7197, "step": 760 }, { "epoch": 0.24359775140537165, "grad_norm": 4.2394280433654785, "learning_rate": 3.782011242973142e-05, "loss": 2.5608, "step": 780 }, { "epoch": 0.24984384759525297, "grad_norm": 4.320163249969482, "learning_rate": 3.750780762023735e-05, "loss": 2.7325, "step": 800 }, { "epoch": 0.2560899437851343, "grad_norm": 7.346144199371338, "learning_rate": 3.719550281074329e-05, "loss": 2.7655, "step": 820 }, { "epoch": 0.2623360399750156, "grad_norm": 3.444692850112915, "learning_rate": 3.688319800124922e-05, "loss": 2.6974, "step": 840 }, { "epoch": 0.26858213616489696, "grad_norm": 2.8311755657196045, "learning_rate": 3.6570893191755154e-05, "loss": 2.571, "step": 860 }, { "epoch": 0.27482823235477827, "grad_norm": 3.064018726348877, "learning_rate": 3.625858838226109e-05, "loss": 2.8051, "step": 880 }, { "epoch": 0.2810743285446596, "grad_norm": 2.4530911445617676, "learning_rate": 3.594628357276702e-05, "loss": 2.4469, "step": 900 }, { "epoch": 0.2873204247345409, "grad_norm": 3.4664344787597656, "learning_rate": 3.563397876327296e-05, "loss": 2.7295, "step": 920 }, { "epoch": 0.29356652092442226, "grad_norm": 4.130941867828369, "learning_rate": 3.532167395377889e-05, "loss": 2.8405, "step": 940 }, { "epoch": 0.29981261711430357, "grad_norm": 2.76470685005188, "learning_rate": 3.5009369144284825e-05, "loss": 2.7108, "step": 960 }, { "epoch": 0.3060587133041849, "grad_norm": 4.205817222595215, "learning_rate": 3.469706433479076e-05, "loss": 2.659, "step": 980 }, { "epoch": 0.3123048094940662, "grad_norm": 3.306403160095215, "learning_rate": 3.438475952529669e-05, "loss": 2.7492, "step": 1000 }, { "epoch": 0.3185509056839475, "grad_norm": 2.0320382118225098, "learning_rate": 3.4072454715802624e-05, "loss": 2.5899, "step": 1020 }, { "epoch": 0.32479700187382887, "grad_norm": 4.874257564544678, "learning_rate": 3.376014990630856e-05, "loss": 2.8844, "step": 1040 }, { "epoch": 0.3310430980637102, "grad_norm": 3.102703809738159, "learning_rate": 3.3447845096814495e-05, "loss": 2.6927, "step": 1060 }, { "epoch": 0.3372891942535915, "grad_norm": 3.8374264240264893, "learning_rate": 3.3135540287320424e-05, "loss": 2.7193, "step": 1080 }, { "epoch": 0.3435352904434728, "grad_norm": 4.128782749176025, "learning_rate": 3.282323547782636e-05, "loss": 2.7898, "step": 1100 }, { "epoch": 0.3497813866333542, "grad_norm": 2.540234088897705, "learning_rate": 3.2510930668332295e-05, "loss": 2.492, "step": 1120 }, { "epoch": 0.3560274828232355, "grad_norm": 5.775226593017578, "learning_rate": 3.219862585883823e-05, "loss": 2.7778, "step": 1140 }, { "epoch": 0.3622735790131168, "grad_norm": 3.100594997406006, "learning_rate": 3.188632104934416e-05, "loss": 2.2936, "step": 1160 }, { "epoch": 0.3685196752029981, "grad_norm": 8.619044303894043, "learning_rate": 3.1574016239850095e-05, "loss": 2.5594, "step": 1180 }, { "epoch": 0.3747657713928795, "grad_norm": 3.1239614486694336, "learning_rate": 3.126171143035603e-05, "loss": 2.8581, "step": 1200 }, { "epoch": 0.3810118675827608, "grad_norm": 3.2376351356506348, "learning_rate": 3.0949406620861966e-05, "loss": 2.4649, "step": 1220 }, { "epoch": 0.3872579637726421, "grad_norm": 2.9151909351348877, "learning_rate": 3.0637101811367894e-05, "loss": 2.6663, "step": 1240 }, { "epoch": 0.3935040599625234, "grad_norm": 4.819116592407227, "learning_rate": 3.0324797001873826e-05, "loss": 2.5575, "step": 1260 }, { "epoch": 0.3997501561524047, "grad_norm": 5.1033430099487305, "learning_rate": 3.0012492192379765e-05, "loss": 2.5924, "step": 1280 }, { "epoch": 0.4059962523422861, "grad_norm": 3.506040096282959, "learning_rate": 2.9700187382885697e-05, "loss": 2.6526, "step": 1300 }, { "epoch": 0.4122423485321674, "grad_norm": 5.795025825500488, "learning_rate": 2.9387882573391633e-05, "loss": 2.8599, "step": 1320 }, { "epoch": 0.4184884447220487, "grad_norm": 3.329371452331543, "learning_rate": 2.9075577763897565e-05, "loss": 2.5105, "step": 1340 }, { "epoch": 0.42473454091193, "grad_norm": 2.9050350189208984, "learning_rate": 2.8763272954403497e-05, "loss": 2.4808, "step": 1360 }, { "epoch": 0.4309806371018114, "grad_norm": 5.538816928863525, "learning_rate": 2.8450968144909436e-05, "loss": 2.6096, "step": 1380 }, { "epoch": 0.4372267332916927, "grad_norm": 4.255838394165039, "learning_rate": 2.8138663335415365e-05, "loss": 2.6359, "step": 1400 }, { "epoch": 0.443472829481574, "grad_norm": 6.425328731536865, "learning_rate": 2.7826358525921297e-05, "loss": 2.3929, "step": 1420 }, { "epoch": 0.4497189256714553, "grad_norm": 4.074633598327637, "learning_rate": 2.7514053716427236e-05, "loss": 2.3325, "step": 1440 }, { "epoch": 0.45596502186133664, "grad_norm": 4.7358245849609375, "learning_rate": 2.7201748906933168e-05, "loss": 2.4945, "step": 1460 }, { "epoch": 0.462211118051218, "grad_norm": 4.505492210388184, "learning_rate": 2.6889444097439103e-05, "loss": 2.5168, "step": 1480 }, { "epoch": 0.4684572142410993, "grad_norm": 6.606976509094238, "learning_rate": 2.6577139287945035e-05, "loss": 2.5106, "step": 1500 }, { "epoch": 0.47470331043098063, "grad_norm": 4.111032962799072, "learning_rate": 2.6264834478450968e-05, "loss": 2.7365, "step": 1520 }, { "epoch": 0.48094940662086194, "grad_norm": 6.724081993103027, "learning_rate": 2.5952529668956903e-05, "loss": 2.2786, "step": 1540 }, { "epoch": 0.4871955028107433, "grad_norm": 4.5777268409729, "learning_rate": 2.5640224859462835e-05, "loss": 2.1842, "step": 1560 }, { "epoch": 0.4934415990006246, "grad_norm": 5.641876697540283, "learning_rate": 2.5327920049968774e-05, "loss": 2.2779, "step": 1580 }, { "epoch": 0.49968769519050593, "grad_norm": 7.818845748901367, "learning_rate": 2.5015615240474706e-05, "loss": 2.1692, "step": 1600 }, { "epoch": 0.5059337913803873, "grad_norm": 5.147799968719482, "learning_rate": 2.4703310430980638e-05, "loss": 2.2715, "step": 1620 }, { "epoch": 0.5121798875702686, "grad_norm": 10.44902229309082, "learning_rate": 2.4391005621486574e-05, "loss": 2.4603, "step": 1640 }, { "epoch": 0.5184259837601499, "grad_norm": 3.7985732555389404, "learning_rate": 2.4078700811992506e-05, "loss": 2.3849, "step": 1660 }, { "epoch": 0.5246720799500312, "grad_norm": 5.865043640136719, "learning_rate": 2.3766396002498438e-05, "loss": 2.4521, "step": 1680 }, { "epoch": 0.5309181761399125, "grad_norm": 6.91799783706665, "learning_rate": 2.3454091193004373e-05, "loss": 2.159, "step": 1700 }, { "epoch": 0.5371642723297939, "grad_norm": 4.18131685256958, "learning_rate": 2.314178638351031e-05, "loss": 2.1075, "step": 1720 }, { "epoch": 0.5434103685196752, "grad_norm": 5.681297779083252, "learning_rate": 2.282948157401624e-05, "loss": 2.3221, "step": 1740 }, { "epoch": 0.5496564647095565, "grad_norm": 5.769021034240723, "learning_rate": 2.2517176764522173e-05, "loss": 2.3497, "step": 1760 }, { "epoch": 0.5559025608994379, "grad_norm": 9.655105590820312, "learning_rate": 2.220487195502811e-05, "loss": 2.5534, "step": 1780 }, { "epoch": 0.5621486570893192, "grad_norm": 8.151744842529297, "learning_rate": 2.1892567145534044e-05, "loss": 2.1035, "step": 1800 }, { "epoch": 0.5683947532792005, "grad_norm": 5.74683141708374, "learning_rate": 2.1580262336039976e-05, "loss": 2.154, "step": 1820 }, { "epoch": 0.5746408494690818, "grad_norm": 3.5920357704162598, "learning_rate": 2.1267957526545908e-05, "loss": 2.0646, "step": 1840 }, { "epoch": 0.5808869456589631, "grad_norm": 10.239977836608887, "learning_rate": 2.0955652717051844e-05, "loss": 2.198, "step": 1860 }, { "epoch": 0.5871330418488445, "grad_norm": 3.631121873855591, "learning_rate": 2.064334790755778e-05, "loss": 2.1863, "step": 1880 }, { "epoch": 0.5933791380387258, "grad_norm": 9.566389083862305, "learning_rate": 2.033104309806371e-05, "loss": 1.9127, "step": 1900 }, { "epoch": 0.5996252342286071, "grad_norm": 7.540945529937744, "learning_rate": 2.0018738288569643e-05, "loss": 2.2545, "step": 1920 }, { "epoch": 0.6058713304184884, "grad_norm": 5.457769393920898, "learning_rate": 1.970643347907558e-05, "loss": 1.7829, "step": 1940 }, { "epoch": 0.6121174266083698, "grad_norm": 4.6898088455200195, "learning_rate": 1.939412866958151e-05, "loss": 2.092, "step": 1960 }, { "epoch": 0.6183635227982511, "grad_norm": 4.707203388214111, "learning_rate": 1.9081823860087447e-05, "loss": 2.0059, "step": 1980 }, { "epoch": 0.6246096189881324, "grad_norm": 5.7073469161987305, "learning_rate": 1.876951905059338e-05, "loss": 2.0865, "step": 2000 }, { "epoch": 0.6308557151780138, "grad_norm": 6.401088237762451, "learning_rate": 1.8457214241099314e-05, "loss": 1.7292, "step": 2020 }, { "epoch": 0.637101811367895, "grad_norm": 8.511351585388184, "learning_rate": 1.8144909431605246e-05, "loss": 2.2739, "step": 2040 }, { "epoch": 0.6433479075577764, "grad_norm": 6.70497465133667, "learning_rate": 1.7832604622111182e-05, "loss": 2.042, "step": 2060 }, { "epoch": 0.6495940037476577, "grad_norm": 5.246578693389893, "learning_rate": 1.7520299812617117e-05, "loss": 2.0974, "step": 2080 }, { "epoch": 0.655840099937539, "grad_norm": 11.206995964050293, "learning_rate": 1.720799500312305e-05, "loss": 2.0204, "step": 2100 }, { "epoch": 0.6620861961274204, "grad_norm": 6.476535320281982, "learning_rate": 1.689569019362898e-05, "loss": 2.3546, "step": 2120 }, { "epoch": 0.6683322923173017, "grad_norm": 2.9628567695617676, "learning_rate": 1.6583385384134917e-05, "loss": 2.0813, "step": 2140 }, { "epoch": 0.674578388507183, "grad_norm": 5.9349188804626465, "learning_rate": 1.6271080574640852e-05, "loss": 2.2729, "step": 2160 }, { "epoch": 0.6808244846970644, "grad_norm": 6.260815143585205, "learning_rate": 1.5958775765146785e-05, "loss": 1.9583, "step": 2180 }, { "epoch": 0.6870705808869456, "grad_norm": 5.544328689575195, "learning_rate": 1.5646470955652717e-05, "loss": 2.0548, "step": 2200 }, { "epoch": 0.693316677076827, "grad_norm": 11.215269088745117, "learning_rate": 1.5334166146158652e-05, "loss": 1.7925, "step": 2220 }, { "epoch": 0.6995627732667083, "grad_norm": 7.354444980621338, "learning_rate": 1.5021861336664586e-05, "loss": 2.2688, "step": 2240 }, { "epoch": 0.7058088694565896, "grad_norm": 6.186635971069336, "learning_rate": 1.470955652717052e-05, "loss": 1.8723, "step": 2260 }, { "epoch": 0.712054965646471, "grad_norm": 6.673027992248535, "learning_rate": 1.4397251717676452e-05, "loss": 2.2833, "step": 2280 }, { "epoch": 0.7183010618363522, "grad_norm": 6.882096290588379, "learning_rate": 1.4084946908182387e-05, "loss": 1.8425, "step": 2300 }, { "epoch": 0.7245471580262336, "grad_norm": 5.77081823348999, "learning_rate": 1.3772642098688321e-05, "loss": 2.2437, "step": 2320 }, { "epoch": 0.730793254216115, "grad_norm": 4.107306957244873, "learning_rate": 1.3460337289194255e-05, "loss": 1.7821, "step": 2340 }, { "epoch": 0.7370393504059962, "grad_norm": 6.597945690155029, "learning_rate": 1.3148032479700187e-05, "loss": 2.0115, "step": 2360 }, { "epoch": 0.7432854465958776, "grad_norm": 13.545961380004883, "learning_rate": 1.283572767020612e-05, "loss": 2.325, "step": 2380 }, { "epoch": 0.749531542785759, "grad_norm": 5.746728420257568, "learning_rate": 1.2523422860712056e-05, "loss": 2.0989, "step": 2400 }, { "epoch": 0.7557776389756402, "grad_norm": 7.114035606384277, "learning_rate": 1.2211118051217988e-05, "loss": 1.8291, "step": 2420 }, { "epoch": 0.7620237351655216, "grad_norm": 6.319833278656006, "learning_rate": 1.1898813241723924e-05, "loss": 2.0398, "step": 2440 }, { "epoch": 0.7682698313554028, "grad_norm": 8.076285362243652, "learning_rate": 1.1586508432229858e-05, "loss": 2.365, "step": 2460 }, { "epoch": 0.7745159275452842, "grad_norm": 5.4164628982543945, "learning_rate": 1.127420362273579e-05, "loss": 2.2452, "step": 2480 }, { "epoch": 0.7807620237351656, "grad_norm": 8.143169403076172, "learning_rate": 1.0961898813241725e-05, "loss": 2.3809, "step": 2500 }, { "epoch": 0.7870081199250468, "grad_norm": 8.056733131408691, "learning_rate": 1.0649594003747657e-05, "loss": 2.2368, "step": 2520 }, { "epoch": 0.7932542161149282, "grad_norm": 7.611058712005615, "learning_rate": 1.0337289194253593e-05, "loss": 2.0644, "step": 2540 }, { "epoch": 0.7995003123048094, "grad_norm": 7.84691858291626, "learning_rate": 1.0024984384759525e-05, "loss": 1.8021, "step": 2560 }, { "epoch": 0.8057464084946908, "grad_norm": 5.361764430999756, "learning_rate": 9.71267957526546e-06, "loss": 1.9417, "step": 2580 }, { "epoch": 0.8119925046845722, "grad_norm": 6.095186233520508, "learning_rate": 9.400374765771393e-06, "loss": 2.3096, "step": 2600 }, { "epoch": 0.8182386008744534, "grad_norm": 7.828145980834961, "learning_rate": 9.088069956277328e-06, "loss": 2.2737, "step": 2620 }, { "epoch": 0.8244846970643348, "grad_norm": 5.065859794616699, "learning_rate": 8.77576514678326e-06, "loss": 2.182, "step": 2640 }, { "epoch": 0.8307307932542161, "grad_norm": 5.14509391784668, "learning_rate": 8.463460337289194e-06, "loss": 2.1187, "step": 2660 }, { "epoch": 0.8369768894440974, "grad_norm": 8.84304428100586, "learning_rate": 8.15115552779513e-06, "loss": 1.8987, "step": 2680 }, { "epoch": 0.8432229856339788, "grad_norm": 6.434870719909668, "learning_rate": 7.838850718301062e-06, "loss": 1.9413, "step": 2700 }, { "epoch": 0.84946908182386, "grad_norm": 4.600275039672852, "learning_rate": 7.526545908806996e-06, "loss": 2.1107, "step": 2720 }, { "epoch": 0.8557151780137414, "grad_norm": 9.691720962524414, "learning_rate": 7.214241099312929e-06, "loss": 1.8678, "step": 2740 }, { "epoch": 0.8619612742036228, "grad_norm": 5.024956226348877, "learning_rate": 6.901936289818864e-06, "loss": 2.1413, "step": 2760 }, { "epoch": 0.868207370393504, "grad_norm": 4.373764991760254, "learning_rate": 6.589631480324797e-06, "loss": 2.2157, "step": 2780 }, { "epoch": 0.8744534665833854, "grad_norm": 5.964173316955566, "learning_rate": 6.277326670830731e-06, "loss": 1.9203, "step": 2800 }, { "epoch": 0.8806995627732667, "grad_norm": 4.583627223968506, "learning_rate": 5.965021861336665e-06, "loss": 1.7377, "step": 2820 }, { "epoch": 0.886945658963148, "grad_norm": 5.218972206115723, "learning_rate": 5.652717051842599e-06, "loss": 2.0339, "step": 2840 }, { "epoch": 0.8931917551530294, "grad_norm": 4.644482135772705, "learning_rate": 5.340412242348533e-06, "loss": 1.9294, "step": 2860 }, { "epoch": 0.8994378513429107, "grad_norm": 3.316526412963867, "learning_rate": 5.028107432854466e-06, "loss": 1.9216, "step": 2880 }, { "epoch": 0.905683947532792, "grad_norm": 3.9178998470306396, "learning_rate": 4.7158026233603995e-06, "loss": 2.1137, "step": 2900 }, { "epoch": 0.9119300437226733, "grad_norm": 5.773944854736328, "learning_rate": 4.403497813866333e-06, "loss": 2.3483, "step": 2920 }, { "epoch": 0.9181761399125546, "grad_norm": 6.133700847625732, "learning_rate": 4.091193004372267e-06, "loss": 2.0656, "step": 2940 }, { "epoch": 0.924422236102436, "grad_norm": 3.0790977478027344, "learning_rate": 3.7788881948782017e-06, "loss": 1.9562, "step": 2960 }, { "epoch": 0.9306683322923173, "grad_norm": 4.9188103675842285, "learning_rate": 3.4665833853841355e-06, "loss": 2.4327, "step": 2980 }, { "epoch": 0.9369144284821986, "grad_norm": 4.5354509353637695, "learning_rate": 3.154278575890069e-06, "loss": 2.0456, "step": 3000 }, { "epoch": 0.94316052467208, "grad_norm": 4.375840187072754, "learning_rate": 2.8419737663960027e-06, "loss": 1.9895, "step": 3020 }, { "epoch": 0.9494066208619613, "grad_norm": 5.6817307472229, "learning_rate": 2.5296689569019365e-06, "loss": 1.9656, "step": 3040 }, { "epoch": 0.9556527170518426, "grad_norm": 6.242130756378174, "learning_rate": 2.2173641474078703e-06, "loss": 2.0195, "step": 3060 }, { "epoch": 0.9618988132417239, "grad_norm": 6.264340877532959, "learning_rate": 1.9050593379138039e-06, "loss": 1.826, "step": 3080 }, { "epoch": 0.9681449094316052, "grad_norm": 6.317758083343506, "learning_rate": 1.5927545284197377e-06, "loss": 2.2237, "step": 3100 }, { "epoch": 0.9743910056214866, "grad_norm": 7.347826957702637, "learning_rate": 1.2804497189256715e-06, "loss": 2.0525, "step": 3120 }, { "epoch": 0.9806371018113679, "grad_norm": 4.710007667541504, "learning_rate": 9.681449094316053e-07, "loss": 2.1825, "step": 3140 }, { "epoch": 0.9868831980012492, "grad_norm": 3.5454931259155273, "learning_rate": 6.558400999375391e-07, "loss": 2.1784, "step": 3160 }, { "epoch": 0.9931292941911305, "grad_norm": 4.577638149261475, "learning_rate": 3.435352904434728e-07, "loss": 1.703, "step": 3180 }, { "epoch": 0.9993753903810119, "grad_norm": 8.630655288696289, "learning_rate": 3.123048094940662e-08, "loss": 2.0709, "step": 3200 } ], "logging_steps": 20, "max_steps": 3202, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2326032083828736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }