{ "best_metric": 1.5737380981445312, "best_model_checkpoint": "fat5-fr-small_v1/checkpoint-1545000", "epoch": 6.518048374103387, "eval_steps": 1000, "global_step": 1600000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00040737802338146166, "grad_norm": 2.0072696208953857, "learning_rate": 0.002512499999999994, "loss": 24.7291, "step": 100 }, { "epoch": 0.0008147560467629233, "grad_norm": 1.5881823301315308, "learning_rate": 0.0025249999999999904, "loss": 20.4769, "step": 200 }, { "epoch": 0.001222134070144385, "grad_norm": 2.2315056324005127, "learning_rate": 0.00253749999999999, "loss": 19.0568, "step": 300 }, { "epoch": 0.0016295120935258466, "grad_norm": 2.400899887084961, "learning_rate": 0.0025499999999999885, "loss": 18.3718, "step": 400 }, { "epoch": 0.0020368901169073085, "grad_norm": 2.5054872035980225, "learning_rate": 0.0025624999999999862, "loss": 17.9085, "step": 500 }, { "epoch": 0.00244426814028877, "grad_norm": 3.132996082305908, "learning_rate": 0.0025749999999999827, "loss": 17.5639, "step": 600 }, { "epoch": 0.0028516461636702317, "grad_norm": 3.571537971496582, "learning_rate": 0.0025874999999999813, "loss": 17.2831, "step": 700 }, { "epoch": 0.0032590241870516933, "grad_norm": 3.905449390411377, "learning_rate": 0.0025999999999999795, "loss": 16.9897, "step": 800 }, { "epoch": 0.003666402210433155, "grad_norm": 3.9679980278015137, "learning_rate": 0.0026124999999999764, "loss": 16.8025, "step": 900 }, { "epoch": 0.004073780233814617, "grad_norm": 6.748215675354004, "learning_rate": 0.00262499999999997, "loss": 16.5832, "step": 1000 }, { "epoch": 0.004073780233814617, "eval_MaskedAccuracy": 0.21481029309245273, "eval_loss": 3.4091806411743164, "eval_runtime": 210.9409, "eval_samples_per_second": 300.918, "eval_steps_per_second": 1.176, "step": 1000 }, { "epoch": 0.0044811582571960785, "grad_norm": 3.9621713161468506, "learning_rate": 0.002637499999999968, "loss": 16.4083, "step": 1100 }, { "epoch": 0.00488853628057754, "grad_norm": 4.407429218292236, "learning_rate": 0.002649999999999965, "loss": 16.2311, "step": 1200 }, { "epoch": 0.005295914303959002, "grad_norm": 3.0638349056243896, "learning_rate": 0.0026624999999999635, "loss": 16.0918, "step": 1300 }, { "epoch": 0.005703292327340463, "grad_norm": 3.7296924591064453, "learning_rate": 0.002674999999999961, "loss": 15.9018, "step": 1400 }, { "epoch": 0.006110670350721925, "grad_norm": 5.068863391876221, "learning_rate": 0.0026874999999999564, "loss": 15.721, "step": 1500 }, { "epoch": 0.0065180483741033865, "grad_norm": 3.687300682067871, "learning_rate": 0.002699999999999955, "loss": 15.6485, "step": 1600 }, { "epoch": 0.006925426397484848, "grad_norm": 3.4417190551757812, "learning_rate": 0.002712499999999955, "loss": 15.4687, "step": 1700 }, { "epoch": 0.00733280442086631, "grad_norm": 3.9998209476470947, "learning_rate": 0.0027249999999999514, "loss": 15.3952, "step": 1800 }, { "epoch": 0.007740182444247771, "grad_norm": 3.1600570678710938, "learning_rate": 0.002737499999999953, "loss": 15.2506, "step": 1900 }, { "epoch": 0.008147560467629234, "grad_norm": 3.644062042236328, "learning_rate": 0.0027499999999999495, "loss": 15.1622, "step": 2000 }, { "epoch": 0.008147560467629234, "eval_MaskedAccuracy": 0.24587377267058683, "eval_loss": 3.1341230869293213, "eval_runtime": 1685.5139, "eval_samples_per_second": 37.66, "eval_steps_per_second": 0.147, "step": 2000 }, { "epoch": 0.008554938491010695, "grad_norm": 4.730564117431641, "learning_rate": 0.0027624999999999425, "loss": 15.0285, "step": 2100 }, { "epoch": 0.008962316514392157, "grad_norm": 4.022442817687988, "learning_rate": 0.0027749999999999394, "loss": 14.9321, "step": 2200 }, { "epoch": 0.009369694537773619, "grad_norm": 2.78922700881958, "learning_rate": 0.0027874999999999363, "loss": 14.8595, "step": 2300 }, { "epoch": 0.00977707256115508, "grad_norm": 3.2824580669403076, "learning_rate": 0.0027999999999999345, "loss": 14.7518, "step": 2400 }, { "epoch": 0.010184450584536542, "grad_norm": 2.8441643714904785, "learning_rate": 0.0028124999999999322, "loss": 14.6065, "step": 2500 }, { "epoch": 0.010591828607918003, "grad_norm": 3.85910701751709, "learning_rate": 0.00282499999999993, "loss": 14.5693, "step": 2600 }, { "epoch": 0.010999206631299465, "grad_norm": 3.6006782054901123, "learning_rate": 0.0028374999999999286, "loss": 14.5064, "step": 2700 }, { "epoch": 0.011406584654680927, "grad_norm": 3.207846164703369, "learning_rate": 0.002849999999999924, "loss": 14.4506, "step": 2800 }, { "epoch": 0.011813962678062388, "grad_norm": 4.087400436401367, "learning_rate": 0.0028624999999999224, "loss": 14.314, "step": 2900 }, { "epoch": 0.01222134070144385, "grad_norm": 2.7313225269317627, "learning_rate": 0.002874999999999916, "loss": 14.2647, "step": 3000 }, { "epoch": 0.01222134070144385, "eval_MaskedAccuracy": 0.2672389482733827, "eval_loss": 2.9623844623565674, "eval_runtime": 156.1547, "eval_samples_per_second": 406.494, "eval_steps_per_second": 1.588, "step": 3000 }, { "epoch": 0.012628718724825311, "grad_norm": 2.6286871433258057, "learning_rate": 0.0028874999999999123, "loss": 14.2328, "step": 3100 }, { "epoch": 0.013036096748206773, "grad_norm": 3.34944486618042, "learning_rate": 0.0028999999999999113, "loss": 14.0831, "step": 3200 }, { "epoch": 0.013443474771588235, "grad_norm": 2.468932628631592, "learning_rate": 0.002912499999999909, "loss": 14.0768, "step": 3300 }, { "epoch": 0.013850852794969696, "grad_norm": 2.6487114429473877, "learning_rate": 0.0029249999999999042, "loss": 13.9294, "step": 3400 }, { "epoch": 0.014258230818351158, "grad_norm": 2.071166753768921, "learning_rate": 0.002937499999999899, "loss": 13.9091, "step": 3500 }, { "epoch": 0.01466560884173262, "grad_norm": 3.003756284713745, "learning_rate": 0.002949999999999894, "loss": 13.7766, "step": 3600 }, { "epoch": 0.015072986865114081, "grad_norm": 3.022969961166382, "learning_rate": 0.002962499999999894, "loss": 13.7965, "step": 3700 }, { "epoch": 0.015480364888495543, "grad_norm": 2.5271995067596436, "learning_rate": 0.002974999999999889, "loss": 13.7016, "step": 3800 }, { "epoch": 0.015887742911877004, "grad_norm": 2.9642605781555176, "learning_rate": 0.002987499999999884, "loss": 13.6264, "step": 3900 }, { "epoch": 0.016295120935258468, "grad_norm": 2.866058111190796, "learning_rate": 0.002999999999999876, "loss": 13.5886, "step": 4000 }, { "epoch": 0.016295120935258468, "eval_MaskedAccuracy": 0.2823220328091172, "eval_loss": 2.8647966384887695, "eval_runtime": 583.093, "eval_samples_per_second": 108.861, "eval_steps_per_second": 0.425, "step": 4000 }, { "epoch": 0.016702498958639928, "grad_norm": 3.0533554553985596, "learning_rate": 0.003012499999999872, "loss": 13.51, "step": 4100 }, { "epoch": 0.01710987698202139, "grad_norm": 3.9694809913635254, "learning_rate": 0.0030249999999998685, "loss": 13.4058, "step": 4200 }, { "epoch": 0.01751725500540285, "grad_norm": 3.5330469608306885, "learning_rate": 0.003037499999999864, "loss": 13.3674, "step": 4300 }, { "epoch": 0.017924633028784314, "grad_norm": 4.143810272216797, "learning_rate": 0.0030499999999998627, "loss": 13.2921, "step": 4400 }, { "epoch": 0.018332011052165774, "grad_norm": 3.4098572731018066, "learning_rate": 0.003062499999999858, "loss": 13.2422, "step": 4500 }, { "epoch": 0.018739389075547237, "grad_norm": 2.815377950668335, "learning_rate": 0.0030749999999998565, "loss": 13.1686, "step": 4600 }, { "epoch": 0.019146767098928697, "grad_norm": 2.2003087997436523, "learning_rate": 0.0030874999999998525, "loss": 13.1174, "step": 4700 }, { "epoch": 0.01955414512231016, "grad_norm": 3.600919723510742, "learning_rate": 0.003099999999999849, "loss": 13.0667, "step": 4800 }, { "epoch": 0.01996152314569162, "grad_norm": 3.6641998291015625, "learning_rate": 0.0031124999999998476, "loss": 13.0321, "step": 4900 }, { "epoch": 0.020368901169073084, "grad_norm": 2.7134268283843994, "learning_rate": 0.003124999999999845, "loss": 12.997, "step": 5000 }, { "epoch": 0.020368901169073084, "eval_MaskedAccuracy": 0.297161426868168, "eval_loss": 2.7633774280548096, "eval_runtime": 497.1203, "eval_samples_per_second": 127.687, "eval_steps_per_second": 0.499, "step": 5000 }, { "epoch": 0.020776279192454544, "grad_norm": 5.596516132354736, "learning_rate": 0.003137499999999841, "loss": 12.9323, "step": 5100 }, { "epoch": 0.021183657215836007, "grad_norm": 6.993663311004639, "learning_rate": 0.003149999999999838, "loss": 12.8955, "step": 5200 }, { "epoch": 0.021591035239217467, "grad_norm": 2.8680942058563232, "learning_rate": 0.003162499999999835, "loss": 12.8421, "step": 5300 }, { "epoch": 0.02199841326259893, "grad_norm": 3.843177556991577, "learning_rate": 0.003174999999999833, "loss": 12.8343, "step": 5400 }, { "epoch": 0.02240579128598039, "grad_norm": 10.011335372924805, "learning_rate": 0.0031874999999998307, "loss": 12.7531, "step": 5500 }, { "epoch": 0.022813169309361853, "grad_norm": 3.5039470195770264, "learning_rate": 0.003199999999999828, "loss": 12.7002, "step": 5600 }, { "epoch": 0.023220547332743313, "grad_norm": 3.037332773208618, "learning_rate": 0.0032124999999998288, "loss": 12.6757, "step": 5700 }, { "epoch": 0.023627925356124777, "grad_norm": 3.4344160556793213, "learning_rate": 0.0032249999999998287, "loss": 12.6907, "step": 5800 }, { "epoch": 0.024035303379506236, "grad_norm": 3.7940402030944824, "learning_rate": 0.003237499999999828, "loss": 12.6524, "step": 5900 }, { "epoch": 0.0244426814028877, "grad_norm": 3.782240152359009, "learning_rate": 0.0032499999999998216, "loss": 12.5874, "step": 6000 }, { "epoch": 0.0244426814028877, "eval_MaskedAccuracy": 0.3055268174984478, "eval_loss": 2.7101807594299316, "eval_runtime": 642.9482, "eval_samples_per_second": 98.726, "eval_steps_per_second": 0.386, "step": 6000 }, { "epoch": 0.02485005942626916, "grad_norm": 3.1856234073638916, "learning_rate": 0.0032624999999998194, "loss": 12.5659, "step": 6100 }, { "epoch": 0.025257437449650623, "grad_norm": 2.2912404537200928, "learning_rate": 0.0032749999999998154, "loss": 12.5519, "step": 6200 }, { "epoch": 0.025664815473032083, "grad_norm": 2.196720838546753, "learning_rate": 0.003287499999999813, "loss": 12.4975, "step": 6300 }, { "epoch": 0.026072193496413546, "grad_norm": 2.9122323989868164, "learning_rate": 0.003299999999999811, "loss": 12.4262, "step": 6400 }, { "epoch": 0.026479571519795006, "grad_norm": 2.5234193801879883, "learning_rate": 0.0033124999999998082, "loss": 12.4096, "step": 6500 }, { "epoch": 0.02688694954317647, "grad_norm": 4.733991622924805, "learning_rate": 0.0033249999999998042, "loss": 12.3823, "step": 6600 }, { "epoch": 0.02729432756655793, "grad_norm": 2.9852294921875, "learning_rate": 0.0033374999999997994, "loss": 12.3526, "step": 6700 }, { "epoch": 0.027701705589939393, "grad_norm": 3.347761392593384, "learning_rate": 0.0033499999999997932, "loss": 12.3387, "step": 6800 }, { "epoch": 0.028109083613320852, "grad_norm": 2.850923538208008, "learning_rate": 0.003362499999999787, "loss": 12.3076, "step": 6900 }, { "epoch": 0.028516461636702316, "grad_norm": 2.5441689491271973, "learning_rate": 0.0033749999999997853, "loss": 12.2344, "step": 7000 }, { "epoch": 0.028516461636702316, "eval_MaskedAccuracy": 0.31737777984121823, "eval_loss": 2.6147336959838867, "eval_runtime": 523.4424, "eval_samples_per_second": 121.266, "eval_steps_per_second": 0.474, "step": 7000 }, { "epoch": 0.028923839660083776, "grad_norm": 2.6782543659210205, "learning_rate": 0.0033874999999997804, "loss": 12.1975, "step": 7100 }, { "epoch": 0.02933121768346524, "grad_norm": 4.042718410491943, "learning_rate": 0.0033999999999997713, "loss": 12.1332, "step": 7200 }, { "epoch": 0.029738595706846702, "grad_norm": 2.6437010765075684, "learning_rate": 0.0034124999999997647, "loss": 12.0517, "step": 7300 }, { "epoch": 0.030145973730228162, "grad_norm": 3.0328099727630615, "learning_rate": 0.003424999999999764, "loss": 11.9627, "step": 7400 }, { "epoch": 0.030553351753609626, "grad_norm": 2.895012855529785, "learning_rate": 0.0034374999999997606, "loss": 11.9723, "step": 7500 }, { "epoch": 0.030960729776991085, "grad_norm": 8.878241539001465, "learning_rate": 0.003449999999999758, "loss": 11.9015, "step": 7600 }, { "epoch": 0.031368107800372545, "grad_norm": 2.7825474739074707, "learning_rate": 0.003462499999999754, "loss": 11.8462, "step": 7700 }, { "epoch": 0.03177548582375401, "grad_norm": 2.184889316558838, "learning_rate": 0.003474999999999749, "loss": 11.7993, "step": 7800 }, { "epoch": 0.03218286384713547, "grad_norm": 2.5624606609344482, "learning_rate": 0.003487499999999749, "loss": 11.7754, "step": 7900 }, { "epoch": 0.032590241870516935, "grad_norm": 2.3846676349639893, "learning_rate": 0.003499999999999745, "loss": 11.7032, "step": 8000 }, { "epoch": 0.032590241870516935, "eval_MaskedAccuracy": 0.3393313736809394, "eval_loss": 2.4879164695739746, "eval_runtime": 483.8698, "eval_samples_per_second": 131.184, "eval_steps_per_second": 0.513, "step": 8000 }, { "epoch": 0.03299761989389839, "grad_norm": 3.5283403396606445, "learning_rate": 0.0035124999999997415, "loss": 11.6798, "step": 8100 }, { "epoch": 0.033404997917279855, "grad_norm": 2.359555959701538, "learning_rate": 0.003524999999999741, "loss": 11.5984, "step": 8200 }, { "epoch": 0.03381237594066132, "grad_norm": 1.9014986753463745, "learning_rate": 0.0035374999999997353, "loss": 11.5199, "step": 8300 }, { "epoch": 0.03421975396404278, "grad_norm": 4.172168254852295, "learning_rate": 0.0035499999999997313, "loss": 11.4863, "step": 8400 }, { "epoch": 0.03462713198742424, "grad_norm": 2.5342233180999756, "learning_rate": 0.00356249999999973, "loss": 11.4043, "step": 8500 }, { "epoch": 0.0350345100108057, "grad_norm": 8.684643745422363, "learning_rate": 0.0035749999999997255, "loss": 11.3848, "step": 8600 }, { "epoch": 0.035441888034187165, "grad_norm": 11.089255332946777, "learning_rate": 0.0035874999999997237, "loss": 11.3441, "step": 8700 }, { "epoch": 0.03584926605756863, "grad_norm": 2.411904811859131, "learning_rate": 0.0035999999999997236, "loss": 11.3067, "step": 8800 }, { "epoch": 0.036256644080950085, "grad_norm": 4.111713409423828, "learning_rate": 0.003612499999999721, "loss": 11.2171, "step": 8900 }, { "epoch": 0.03666402210433155, "grad_norm": 5.4018354415893555, "learning_rate": 0.00362499999999972, "loss": 11.242, "step": 9000 }, { "epoch": 0.03666402210433155, "eval_MaskedAccuracy": 0.36379460285274207, "eval_loss": 2.403564929962158, "eval_runtime": 502.7075, "eval_samples_per_second": 126.268, "eval_steps_per_second": 0.493, "step": 9000 }, { "epoch": 0.03707140012771301, "grad_norm": 2.967698812484741, "learning_rate": 0.0036374999999997113, "loss": 11.1676, "step": 9100 }, { "epoch": 0.037478778151094475, "grad_norm": 1.8247146606445312, "learning_rate": 0.0036499999999997073, "loss": 11.151, "step": 9200 }, { "epoch": 0.03788615617447593, "grad_norm": 2.4903030395507812, "learning_rate": 0.003662499999999711, "loss": 11.1003, "step": 9300 }, { "epoch": 0.038293534197857394, "grad_norm": 3.0097227096557617, "learning_rate": 0.0036749999999997097, "loss": 11.0943, "step": 9400 }, { "epoch": 0.03870091222123886, "grad_norm": 5.106104373931885, "learning_rate": 0.0036874999999997097, "loss": 11.0692, "step": 9500 }, { "epoch": 0.03910829024462032, "grad_norm": 2.8577446937561035, "learning_rate": 0.0036999999999997118, "loss": 11.02, "step": 9600 }, { "epoch": 0.03951566826800178, "grad_norm": 2.462996244430542, "learning_rate": 0.0037124999999997074, "loss": 10.9984, "step": 9700 }, { "epoch": 0.03992304629138324, "grad_norm": 3.696761131286621, "learning_rate": 0.0037249999999997034, "loss": 10.94, "step": 9800 }, { "epoch": 0.040330424314764704, "grad_norm": 6.5250983238220215, "learning_rate": 0.0037374999999996972, "loss": 10.9396, "step": 9900 }, { "epoch": 0.04073780233814617, "grad_norm": 1.9880354404449463, "learning_rate": 0.003749999999999697, "loss": 10.9231, "step": 10000 }, { "epoch": 0.04073780233814617, "eval_MaskedAccuracy": 0.3768304916518788, "eval_loss": 2.3210949897766113, "eval_runtime": 560.8588, "eval_samples_per_second": 113.176, "eval_steps_per_second": 0.442, "step": 10000 }, { "epoch": 0.041145180361527624, "grad_norm": 3.661884307861328, "learning_rate": 0.003762499999999692, "loss": 10.9047, "step": 10100 }, { "epoch": 0.04155255838490909, "grad_norm": 4.004960060119629, "learning_rate": 0.0037749999999996914, "loss": 10.9156, "step": 10200 }, { "epoch": 0.04195993640829055, "grad_norm": 2.3835198879241943, "learning_rate": 0.003787499999999695, "loss": 10.8281, "step": 10300 }, { "epoch": 0.042367314431672014, "grad_norm": 5.180380821228027, "learning_rate": 0.0037999999999996873, "loss": 10.8489, "step": 10400 }, { "epoch": 0.04277469245505347, "grad_norm": 3.258373975753784, "learning_rate": 0.003812499999999687, "loss": 10.8193, "step": 10500 }, { "epoch": 0.043182070478434934, "grad_norm": 2.7809267044067383, "learning_rate": 0.0038249999999996833, "loss": 10.7944, "step": 10600 }, { "epoch": 0.0435894485018164, "grad_norm": 3.132779121398926, "learning_rate": 0.0038374999999996823, "loss": 10.8097, "step": 10700 }, { "epoch": 0.04399682652519786, "grad_norm": 3.2742056846618652, "learning_rate": 0.0038499999999996775, "loss": 10.7187, "step": 10800 }, { "epoch": 0.044404204548579324, "grad_norm": 2.2164018154144287, "learning_rate": 0.0038624999999996735, "loss": 10.7667, "step": 10900 }, { "epoch": 0.04481158257196078, "grad_norm": 2.96410870552063, "learning_rate": 0.0038749999999996712, "loss": 10.7232, "step": 11000 }, { "epoch": 0.04481158257196078, "eval_MaskedAccuracy": 0.3850977976517477, "eval_loss": 2.2640016078948975, "eval_runtime": 545.8539, "eval_samples_per_second": 116.288, "eval_steps_per_second": 0.454, "step": 11000 }, { "epoch": 0.04521896059534224, "grad_norm": 4.5663957595825195, "learning_rate": 0.0038874999999996686, "loss": 10.7144, "step": 11100 }, { "epoch": 0.04562633861872371, "grad_norm": 2.1517741680145264, "learning_rate": 0.003899999999999665, "loss": 10.6831, "step": 11200 }, { "epoch": 0.04603371664210517, "grad_norm": 2.739794969558716, "learning_rate": 0.003912499999999662, "loss": 10.6677, "step": 11300 }, { "epoch": 0.046441094665486626, "grad_norm": 1.7963323593139648, "learning_rate": 0.003924999999999659, "loss": 10.6297, "step": 11400 }, { "epoch": 0.04684847268886809, "grad_norm": 4.963115215301514, "learning_rate": 0.0039374999999996566, "loss": 10.6341, "step": 11500 }, { "epoch": 0.04725585071224955, "grad_norm": 4.522215366363525, "learning_rate": 0.0039499999999996595, "loss": 10.5891, "step": 11600 }, { "epoch": 0.047663228735631016, "grad_norm": 1.9597105979919434, "learning_rate": 0.00396249999999966, "loss": 10.572, "step": 11700 }, { "epoch": 0.04807060675901247, "grad_norm": 3.4971799850463867, "learning_rate": 0.003974999999999655, "loss": 10.5531, "step": 11800 }, { "epoch": 0.048477984782393936, "grad_norm": 3.4723501205444336, "learning_rate": 0.003987499999999656, "loss": 10.527, "step": 11900 }, { "epoch": 0.0488853628057754, "grad_norm": 3.51145339012146, "learning_rate": 0.003999999999999657, "loss": 10.5652, "step": 12000 }, { "epoch": 0.0488853628057754, "eval_MaskedAccuracy": 0.3894803210237721, "eval_loss": 2.252936840057373, "eval_runtime": 492.0122, "eval_samples_per_second": 129.013, "eval_steps_per_second": 0.504, "step": 12000 }, { "epoch": 0.04929274082915686, "grad_norm": 4.758820056915283, "learning_rate": 0.004012499999999649, "loss": 10.5526, "step": 12100 }, { "epoch": 0.04970011885253832, "grad_norm": 4.174615383148193, "learning_rate": 0.00402499999999965, "loss": 10.5421, "step": 12200 }, { "epoch": 0.05010749687591978, "grad_norm": 3.07735013961792, "learning_rate": 0.004037499999999652, "loss": 10.4709, "step": 12300 }, { "epoch": 0.050514874899301246, "grad_norm": 2.38267183303833, "learning_rate": 0.00404999999999965, "loss": 10.487, "step": 12400 }, { "epoch": 0.05092225292268271, "grad_norm": 9.329970359802246, "learning_rate": 0.004062499999999646, "loss": 10.4827, "step": 12500 }, { "epoch": 0.051329630946064166, "grad_norm": 1.8156367540359497, "learning_rate": 0.004074999999999641, "loss": 10.4427, "step": 12600 }, { "epoch": 0.05173700896944563, "grad_norm": 2.2023582458496094, "learning_rate": 0.004087499999999641, "loss": 10.4732, "step": 12700 }, { "epoch": 0.05214438699282709, "grad_norm": 6.574010372161865, "learning_rate": 0.004099999999999645, "loss": 10.4458, "step": 12800 }, { "epoch": 0.052551765016208556, "grad_norm": 2.2283875942230225, "learning_rate": 0.004112499999999646, "loss": 10.4411, "step": 12900 }, { "epoch": 0.05295914303959001, "grad_norm": 2.2073442935943604, "learning_rate": 0.004124999999999645, "loss": 10.4139, "step": 13000 }, { "epoch": 0.05295914303959001, "eval_MaskedAccuracy": 0.3959563478200268, "eval_loss": 2.1902289390563965, "eval_runtime": 503.6059, "eval_samples_per_second": 126.043, "eval_steps_per_second": 0.492, "step": 13000 }, { "epoch": 0.053366521062971475, "grad_norm": 1.9398807287216187, "learning_rate": 0.0041374999999996415, "loss": 10.3892, "step": 13100 }, { "epoch": 0.05377389908635294, "grad_norm": 1.7884042263031006, "learning_rate": 0.004149999999999641, "loss": 10.4056, "step": 13200 }, { "epoch": 0.0541812771097344, "grad_norm": 1.9226492643356323, "learning_rate": 0.004162499999999636, "loss": 10.3725, "step": 13300 }, { "epoch": 0.05458865513311586, "grad_norm": 3.9229259490966797, "learning_rate": 0.004174999999999633, "loss": 10.3876, "step": 13400 }, { "epoch": 0.05499603315649732, "grad_norm": 3.9717321395874023, "learning_rate": 0.004187499999999628, "loss": 10.3371, "step": 13500 }, { "epoch": 0.055403411179878785, "grad_norm": 2.2701687812805176, "learning_rate": 0.004199999999999622, "loss": 10.3493, "step": 13600 }, { "epoch": 0.05581078920326025, "grad_norm": 2.833211898803711, "learning_rate": 0.004212499999999617, "loss": 10.3324, "step": 13700 }, { "epoch": 0.056218167226641705, "grad_norm": 2.876721143722534, "learning_rate": 0.004224999999999617, "loss": 10.3139, "step": 13800 }, { "epoch": 0.05662554525002317, "grad_norm": 2.396615982055664, "learning_rate": 0.004237499999999612, "loss": 10.2732, "step": 13900 }, { "epoch": 0.05703292327340463, "grad_norm": 2.2113237380981445, "learning_rate": 0.004249999999999605, "loss": 10.2791, "step": 14000 }, { "epoch": 0.05703292327340463, "eval_MaskedAccuracy": 0.3979616627594577, "eval_loss": 2.1914055347442627, "eval_runtime": 428.8843, "eval_samples_per_second": 148.003, "eval_steps_per_second": 0.578, "step": 14000 }, { "epoch": 0.057440301296786095, "grad_norm": 2.713885545730591, "learning_rate": 0.004262499999999602, "loss": 10.2823, "step": 14100 }, { "epoch": 0.05784767932016755, "grad_norm": 6.887316703796387, "learning_rate": 0.004274999999999597, "loss": 10.3201, "step": 14200 }, { "epoch": 0.058255057343549015, "grad_norm": 2.308656930923462, "learning_rate": 0.004287499999999591, "loss": 10.2849, "step": 14300 }, { "epoch": 0.05866243536693048, "grad_norm": 3.651346206665039, "learning_rate": 0.004299999999999582, "loss": 10.2324, "step": 14400 }, { "epoch": 0.05906981339031194, "grad_norm": 2.9673140048980713, "learning_rate": 0.004312499999999577, "loss": 10.2551, "step": 14500 }, { "epoch": 0.059477191413693405, "grad_norm": 2.1455464363098145, "learning_rate": 0.004324999999999574, "loss": 10.2204, "step": 14600 }, { "epoch": 0.05988456943707486, "grad_norm": 2.9998409748077393, "learning_rate": 0.004337499999999567, "loss": 10.1696, "step": 14700 }, { "epoch": 0.060291947460456324, "grad_norm": 2.5606961250305176, "learning_rate": 0.004349999999999561, "loss": 10.1846, "step": 14800 }, { "epoch": 0.06069932548383779, "grad_norm": 3.6551334857940674, "learning_rate": 0.004362499999999559, "loss": 10.1903, "step": 14900 }, { "epoch": 0.06110670350721925, "grad_norm": 3.281557559967041, "learning_rate": 0.0043749999999995555, "loss": 10.1966, "step": 15000 }, { "epoch": 0.06110670350721925, "eval_MaskedAccuracy": 0.40231697911507147, "eval_loss": 2.163734197616577, "eval_runtime": 525.3812, "eval_samples_per_second": 120.819, "eval_steps_per_second": 0.472, "step": 15000 }, { "epoch": 0.06151408153060071, "grad_norm": 2.2004635334014893, "learning_rate": 0.004387499999999551, "loss": 10.1657, "step": 15100 }, { "epoch": 0.06192145955398217, "grad_norm": 4.036062717437744, "learning_rate": 0.004399999999999546, "loss": 10.2044, "step": 15200 }, { "epoch": 0.062328837577363634, "grad_norm": 3.3084022998809814, "learning_rate": 0.004412499999999543, "loss": 10.1388, "step": 15300 }, { "epoch": 0.06273621560074509, "grad_norm": 5.075527191162109, "learning_rate": 0.004424999999999538, "loss": 10.1232, "step": 15400 }, { "epoch": 0.06314359362412655, "grad_norm": 2.4572486877441406, "learning_rate": 0.004437499999999531, "loss": 10.1193, "step": 15500 }, { "epoch": 0.06355097164750802, "grad_norm": 2.4803900718688965, "learning_rate": 0.004449999999999525, "loss": 10.1271, "step": 15600 }, { "epoch": 0.06395834967088948, "grad_norm": 2.1690118312835693, "learning_rate": 0.004462499999999522, "loss": 10.089, "step": 15700 }, { "epoch": 0.06436572769427094, "grad_norm": 2.363934278488159, "learning_rate": 0.004474999999999518, "loss": 10.0945, "step": 15800 }, { "epoch": 0.06477310571765241, "grad_norm": 3.3021128177642822, "learning_rate": 0.004487499999999511, "loss": 10.1015, "step": 15900 }, { "epoch": 0.06518048374103387, "grad_norm": 3.0676352977752686, "learning_rate": 0.00449999999999951, "loss": 10.055, "step": 16000 }, { "epoch": 0.06518048374103387, "eval_MaskedAccuracy": 0.4079402003894328, "eval_loss": 2.1536672115325928, "eval_runtime": 538.1235, "eval_samples_per_second": 117.958, "eval_steps_per_second": 0.461, "step": 16000 }, { "epoch": 0.06558786176441532, "grad_norm": 2.4352786540985107, "learning_rate": 0.004512499999999503, "loss": 10.0648, "step": 16100 }, { "epoch": 0.06599523978779678, "grad_norm": 3.645604133605957, "learning_rate": 0.004524999999999504, "loss": 10.026, "step": 16200 }, { "epoch": 0.06640261781117825, "grad_norm": 2.6965138912200928, "learning_rate": 0.004537499999999495, "loss": 10.0569, "step": 16300 }, { "epoch": 0.06680999583455971, "grad_norm": 2.6536505222320557, "learning_rate": 0.004549999999999481, "loss": 10.0487, "step": 16400 }, { "epoch": 0.06721737385794117, "grad_norm": 2.891232490539551, "learning_rate": 0.0045624999999994715, "loss": 10.0514, "step": 16500 }, { "epoch": 0.06762475188132264, "grad_norm": 2.675612688064575, "learning_rate": 0.0045749999999994675, "loss": 10.016, "step": 16600 }, { "epoch": 0.0680321299047041, "grad_norm": 4.82465934753418, "learning_rate": 0.004587499999999466, "loss": 10.0081, "step": 16700 }, { "epoch": 0.06843950792808556, "grad_norm": 2.8379580974578857, "learning_rate": 0.004599999999999462, "loss": 10.0316, "step": 16800 }, { "epoch": 0.06884688595146701, "grad_norm": 2.4882845878601074, "learning_rate": 0.0046124999999994564, "loss": 9.991, "step": 16900 }, { "epoch": 0.06925426397484848, "grad_norm": 1.8132230043411255, "learning_rate": 0.004624999999999449, "loss": 9.9832, "step": 17000 }, { "epoch": 0.06925426397484848, "eval_MaskedAccuracy": 0.41064153600548414, "eval_loss": 2.115696430206299, "eval_runtime": 591.8965, "eval_samples_per_second": 107.242, "eval_steps_per_second": 0.419, "step": 17000 }, { "epoch": 0.06966164199822994, "grad_norm": 2.6878249645233154, "learning_rate": 0.004637499999999442, "loss": 9.9764, "step": 17100 }, { "epoch": 0.0700690200216114, "grad_norm": 1.909752607345581, "learning_rate": 0.00464999999999944, "loss": 10.0211, "step": 17200 }, { "epoch": 0.07047639804499287, "grad_norm": 2.974292516708374, "learning_rate": 0.00466249999999944, "loss": 9.9357, "step": 17300 }, { "epoch": 0.07088377606837433, "grad_norm": 2.8411359786987305, "learning_rate": 0.004674999999999435, "loss": 9.9466, "step": 17400 }, { "epoch": 0.07129115409175579, "grad_norm": 1.8513596057891846, "learning_rate": 0.004687499999999431, "loss": 9.9352, "step": 17500 }, { "epoch": 0.07169853211513726, "grad_norm": 2.7194032669067383, "learning_rate": 0.004699999999999424, "loss": 9.9283, "step": 17600 }, { "epoch": 0.0721059101385187, "grad_norm": 3.459972858428955, "learning_rate": 0.004712499999999414, "loss": 9.9118, "step": 17700 }, { "epoch": 0.07251328816190017, "grad_norm": 1.906203031539917, "learning_rate": 0.004724999999999409, "loss": 9.9149, "step": 17800 }, { "epoch": 0.07292066618528163, "grad_norm": 2.151893377304077, "learning_rate": 0.0047374999999993984, "loss": 9.9155, "step": 17900 }, { "epoch": 0.0733280442086631, "grad_norm": 3.4184906482696533, "learning_rate": 0.004749999999999393, "loss": 9.8929, "step": 18000 }, { "epoch": 0.0733280442086631, "eval_MaskedAccuracy": 0.41359918387247874, "eval_loss": 2.0913889408111572, "eval_runtime": 477.7729, "eval_samples_per_second": 132.858, "eval_steps_per_second": 0.519, "step": 18000 }, { "epoch": 0.07373542223204456, "grad_norm": 3.0248520374298096, "learning_rate": 0.004762499999999384, "loss": 9.9187, "step": 18100 }, { "epoch": 0.07414280025542602, "grad_norm": 3.433152675628662, "learning_rate": 0.004774999999999384, "loss": 9.9119, "step": 18200 }, { "epoch": 0.07455017827880749, "grad_norm": 1.9969996213912964, "learning_rate": 0.00478749999999938, "loss": 9.8964, "step": 18300 }, { "epoch": 0.07495755630218895, "grad_norm": 2.542823553085327, "learning_rate": 0.004799999999999372, "loss": 9.9127, "step": 18400 }, { "epoch": 0.07536493432557041, "grad_norm": 2.1811461448669434, "learning_rate": 0.004812499999999362, "loss": 9.9061, "step": 18500 }, { "epoch": 0.07577231234895186, "grad_norm": 2.4389021396636963, "learning_rate": 0.004824999999999358, "loss": 9.8594, "step": 18600 }, { "epoch": 0.07617969037233333, "grad_norm": 1.6460280418395996, "learning_rate": 0.004837499999999354, "loss": 9.8418, "step": 18700 }, { "epoch": 0.07658706839571479, "grad_norm": 3.33437442779541, "learning_rate": 0.00484999999999935, "loss": 9.8456, "step": 18800 }, { "epoch": 0.07699444641909625, "grad_norm": 2.103670120239258, "learning_rate": 0.004862499999999345, "loss": 9.8474, "step": 18900 }, { "epoch": 0.07740182444247772, "grad_norm": 19.984230041503906, "learning_rate": 0.004874999999999335, "loss": 9.8543, "step": 19000 }, { "epoch": 0.07740182444247772, "eval_MaskedAccuracy": 0.41617825892081933, "eval_loss": 2.082752227783203, "eval_runtime": 497.0849, "eval_samples_per_second": 127.697, "eval_steps_per_second": 0.499, "step": 19000 }, { "epoch": 0.07780920246585918, "grad_norm": 3.1915500164031982, "learning_rate": 0.004887499999999324, "loss": 9.8487, "step": 19100 }, { "epoch": 0.07821658048924064, "grad_norm": 3.868563652038574, "learning_rate": 0.004899999999999312, "loss": 9.8177, "step": 19200 }, { "epoch": 0.0786239585126221, "grad_norm": 1.9810203313827515, "learning_rate": 0.0049124999999993054, "loss": 9.8262, "step": 19300 }, { "epoch": 0.07903133653600355, "grad_norm": 8.838945388793945, "learning_rate": 0.004924999999999296, "loss": 9.8182, "step": 19400 }, { "epoch": 0.07943871455938502, "grad_norm": 2.2315778732299805, "learning_rate": 0.004937499999999295, "loss": 9.8063, "step": 19500 }, { "epoch": 0.07984609258276648, "grad_norm": 3.6076674461364746, "learning_rate": 0.004949999999999293, "loss": 9.7827, "step": 19600 }, { "epoch": 0.08025347060614794, "grad_norm": 1.9593068361282349, "learning_rate": 0.004962499999999282, "loss": 9.772, "step": 19700 }, { "epoch": 0.08066084862952941, "grad_norm": 5.378314018249512, "learning_rate": 0.004974999999999273, "loss": 9.8082, "step": 19800 }, { "epoch": 0.08106822665291087, "grad_norm": 1.7450096607208252, "learning_rate": 0.004987499999999261, "loss": 9.7298, "step": 19900 }, { "epoch": 0.08147560467629233, "grad_norm": 2.121546983718872, "learning_rate": 0.005, "loss": 9.7705, "step": 20000 }, { "epoch": 0.08147560467629233, "eval_MaskedAccuracy": 0.4192546410315609, "eval_loss": 2.068751573562622, "eval_runtime": 537.5828, "eval_samples_per_second": 118.077, "eval_steps_per_second": 0.461, "step": 20000 }, { "epoch": 0.0818829826996738, "grad_norm": 3.6601402759552, "learning_rate": 0.004999999950679656, "loss": 9.7526, "step": 20100 }, { "epoch": 0.08229036072305525, "grad_norm": 4.677398204803467, "learning_rate": 0.004999999802718617, "loss": 9.7548, "step": 20200 }, { "epoch": 0.08269773874643671, "grad_norm": 1.8637826442718506, "learning_rate": 0.0049999995561169075, "loss": 9.7575, "step": 20300 }, { "epoch": 0.08310511676981817, "grad_norm": 2.215644598007202, "learning_rate": 0.004999999210874492, "loss": 9.7453, "step": 20400 }, { "epoch": 0.08351249479319964, "grad_norm": 3.032287359237671, "learning_rate": 0.004999998766991425, "loss": 9.7357, "step": 20500 }, { "epoch": 0.0839198728165811, "grad_norm": 4.440647125244141, "learning_rate": 0.004999998224467713, "loss": 9.7288, "step": 20600 }, { "epoch": 0.08432725083996256, "grad_norm": 2.7516491413116455, "learning_rate": 0.004999997583303374, "loss": 9.718, "step": 20700 }, { "epoch": 0.08473462886334403, "grad_norm": 3.037583112716675, "learning_rate": 0.004999996843498439, "loss": 9.6802, "step": 20800 }, { "epoch": 0.08514200688672549, "grad_norm": 2.4326248168945312, "learning_rate": 0.004999996005052938, "loss": 9.7056, "step": 20900 }, { "epoch": 0.08554938491010694, "grad_norm": 2.9642679691314697, "learning_rate": 0.0049999950679669, "loss": 9.6847, "step": 21000 }, { "epoch": 0.08554938491010694, "eval_MaskedAccuracy": 0.41856565694952075, "eval_loss": 2.056820869445801, "eval_runtime": 591.4303, "eval_samples_per_second": 107.326, "eval_steps_per_second": 0.419, "step": 21000 }, { "epoch": 0.0859567629334884, "grad_norm": 5.9169816970825195, "learning_rate": 0.004999994032240362, "loss": 9.7047, "step": 21100 }, { "epoch": 0.08636414095686987, "grad_norm": 3.5219688415527344, "learning_rate": 0.004999992897873365, "loss": 9.6912, "step": 21200 }, { "epoch": 0.08677151898025133, "grad_norm": 2.1443097591400146, "learning_rate": 0.004999991664865955, "loss": 9.6986, "step": 21300 }, { "epoch": 0.0871788970036328, "grad_norm": 2.3986363410949707, "learning_rate": 0.004999990333218182, "loss": 9.6317, "step": 21400 }, { "epoch": 0.08758627502701426, "grad_norm": 3.069018602371216, "learning_rate": 0.004999988902930097, "loss": 9.6689, "step": 21500 }, { "epoch": 0.08799365305039572, "grad_norm": 2.816075325012207, "learning_rate": 0.004999987374001755, "loss": 9.6185, "step": 21600 }, { "epoch": 0.08840103107377718, "grad_norm": 3.1795783042907715, "learning_rate": 0.004999985746433214, "loss": 9.6014, "step": 21700 }, { "epoch": 0.08880840909715865, "grad_norm": 2.872256278991699, "learning_rate": 0.004999984020224545, "loss": 9.648, "step": 21800 }, { "epoch": 0.0892157871205401, "grad_norm": 1.5614632368087769, "learning_rate": 0.004999982195375815, "loss": 9.6818, "step": 21900 }, { "epoch": 0.08962316514392156, "grad_norm": 10.637242317199707, "learning_rate": 0.004999980271887092, "loss": 9.6328, "step": 22000 }, { "epoch": 0.08962316514392156, "eval_MaskedAccuracy": 0.4240338106169538, "eval_loss": 2.0506956577301025, "eval_runtime": 454.9073, "eval_samples_per_second": 139.536, "eval_steps_per_second": 0.545, "step": 22000 }, { "epoch": 0.09003054316730302, "grad_norm": 3.350553512573242, "learning_rate": 0.004999978249758461, "loss": 9.6098, "step": 22100 }, { "epoch": 0.09043792119068449, "grad_norm": 7.316722869873047, "learning_rate": 0.004999976128989992, "loss": 9.6121, "step": 22200 }, { "epoch": 0.09084529921406595, "grad_norm": 2.7911298274993896, "learning_rate": 0.00499997390958177, "loss": 9.6148, "step": 22300 }, { "epoch": 0.09125267723744741, "grad_norm": 2.023620843887329, "learning_rate": 0.004999971591533885, "loss": 9.5944, "step": 22400 }, { "epoch": 0.09166005526082888, "grad_norm": 3.6603286266326904, "learning_rate": 0.004999969174846427, "loss": 9.597, "step": 22500 }, { "epoch": 0.09206743328421034, "grad_norm": 1.7879774570465088, "learning_rate": 0.004999966659519497, "loss": 9.6179, "step": 22600 }, { "epoch": 0.09247481130759179, "grad_norm": 2.120997428894043, "learning_rate": 0.004999964045553194, "loss": 9.5787, "step": 22700 }, { "epoch": 0.09288218933097325, "grad_norm": 3.384791851043701, "learning_rate": 0.00499996133294762, "loss": 9.5839, "step": 22800 }, { "epoch": 0.09328956735435472, "grad_norm": 3.9101996421813965, "learning_rate": 0.004999958521702876, "loss": 9.593, "step": 22900 }, { "epoch": 0.09369694537773618, "grad_norm": 3.653648614883423, "learning_rate": 0.00499995561181908, "loss": 9.5407, "step": 23000 }, { "epoch": 0.09369694537773618, "eval_MaskedAccuracy": 0.42734800606093365, "eval_loss": 2.0217700004577637, "eval_runtime": 553.2274, "eval_samples_per_second": 114.738, "eval_steps_per_second": 0.448, "step": 23000 }, { "epoch": 0.09410432340111764, "grad_norm": 1.577781319618225, "learning_rate": 0.004999952603296345, "loss": 9.5353, "step": 23100 }, { "epoch": 0.0945117014244991, "grad_norm": 4.443587303161621, "learning_rate": 0.004999949496134789, "loss": 9.5974, "step": 23200 }, { "epoch": 0.09491907944788057, "grad_norm": 3.1883552074432373, "learning_rate": 0.004999946290334532, "loss": 9.5262, "step": 23300 }, { "epoch": 0.09532645747126203, "grad_norm": 3.181093215942383, "learning_rate": 0.004999942985895707, "loss": 9.5158, "step": 23400 }, { "epoch": 0.09573383549464348, "grad_norm": 1.8355048894882202, "learning_rate": 0.004999939582818444, "loss": 9.5269, "step": 23500 }, { "epoch": 0.09614121351802495, "grad_norm": 2.4568188190460205, "learning_rate": 0.004999936081102877, "loss": 9.5686, "step": 23600 }, { "epoch": 0.09654859154140641, "grad_norm": 1.7274566888809204, "learning_rate": 0.004999932480749139, "loss": 9.5021, "step": 23700 }, { "epoch": 0.09695596956478787, "grad_norm": 1.916005253791809, "learning_rate": 0.004999928781757373, "loss": 9.4656, "step": 23800 }, { "epoch": 0.09736334758816934, "grad_norm": 2.241408109664917, "learning_rate": 0.004999924984127735, "loss": 9.5168, "step": 23900 }, { "epoch": 0.0977707256115508, "grad_norm": 1.5999352931976318, "learning_rate": 0.004999921087860366, "loss": 9.539, "step": 24000 }, { "epoch": 0.0977707256115508, "eval_MaskedAccuracy": 0.42926830999883303, "eval_loss": 2.0102176666259766, "eval_runtime": 525.098, "eval_samples_per_second": 120.884, "eval_steps_per_second": 0.472, "step": 24000 }, { "epoch": 0.09817810363493226, "grad_norm": 3.433403253555298, "learning_rate": 0.004999917092955428, "loss": 9.5043, "step": 24100 }, { "epoch": 0.09858548165831373, "grad_norm": 1.3292396068572998, "learning_rate": 0.004999912999413068, "loss": 9.4909, "step": 24200 }, { "epoch": 0.09899285968169518, "grad_norm": 1.9488959312438965, "learning_rate": 0.004999908807233453, "loss": 9.5018, "step": 24300 }, { "epoch": 0.09940023770507664, "grad_norm": 2.3496742248535156, "learning_rate": 0.004999904516416743, "loss": 9.5013, "step": 24400 }, { "epoch": 0.0998076157284581, "grad_norm": 3.0133869647979736, "learning_rate": 0.0049999001269631195, "loss": 9.483, "step": 24500 }, { "epoch": 0.10021499375183957, "grad_norm": 2.9716012477874756, "learning_rate": 0.004999895638872754, "loss": 9.4936, "step": 24600 }, { "epoch": 0.10062237177522103, "grad_norm": 1.4587233066558838, "learning_rate": 0.004999891052145823, "loss": 9.4348, "step": 24700 }, { "epoch": 0.10102974979860249, "grad_norm": 3.042893171310425, "learning_rate": 0.004999886366782502, "loss": 9.4609, "step": 24800 }, { "epoch": 0.10143712782198396, "grad_norm": 4.598676681518555, "learning_rate": 0.004999881582782976, "loss": 9.4539, "step": 24900 }, { "epoch": 0.10184450584536542, "grad_norm": 7.61506986618042, "learning_rate": 0.0049998767001474395, "loss": 9.4283, "step": 25000 }, { "epoch": 0.10184450584536542, "eval_MaskedAccuracy": 0.4277697153857394, "eval_loss": 2.019444465637207, "eval_runtime": 599.1003, "eval_samples_per_second": 105.952, "eval_steps_per_second": 0.414, "step": 25000 }, { "epoch": 0.10225188386874687, "grad_norm": 2.0222227573394775, "learning_rate": 0.00499987171887608, "loss": 9.4448, "step": 25100 }, { "epoch": 0.10265926189212833, "grad_norm": 4.220450401306152, "learning_rate": 0.004999866638969104, "loss": 9.4704, "step": 25200 }, { "epoch": 0.1030666399155098, "grad_norm": 3.21671199798584, "learning_rate": 0.004999861460426701, "loss": 9.4621, "step": 25300 }, { "epoch": 0.10347401793889126, "grad_norm": 3.3255972862243652, "learning_rate": 0.004999856183249082, "loss": 9.4166, "step": 25400 }, { "epoch": 0.10388139596227272, "grad_norm": 2.6986167430877686, "learning_rate": 0.004999850807436452, "loss": 9.4351, "step": 25500 }, { "epoch": 0.10428877398565418, "grad_norm": 2.148319959640503, "learning_rate": 0.004999845332989027, "loss": 9.4018, "step": 25600 }, { "epoch": 0.10469615200903565, "grad_norm": 3.1325294971466064, "learning_rate": 0.004999839759907026, "loss": 9.4271, "step": 25700 }, { "epoch": 0.10510353003241711, "grad_norm": 2.4521164894104004, "learning_rate": 0.0049998340881906665, "loss": 9.3909, "step": 25800 }, { "epoch": 0.10551090805579857, "grad_norm": 4.3904571533203125, "learning_rate": 0.004999828317840176, "loss": 9.431, "step": 25900 }, { "epoch": 0.10591828607918002, "grad_norm": 3.555058717727661, "learning_rate": 0.004999822448855773, "loss": 9.3742, "step": 26000 }, { "epoch": 0.10591828607918002, "eval_MaskedAccuracy": 0.43347176834705675, "eval_loss": 1.9825259447097778, "eval_runtime": 484.8715, "eval_samples_per_second": 130.913, "eval_steps_per_second": 0.511, "step": 26000 }, { "epoch": 0.10632566410256149, "grad_norm": 2.8984215259552, "learning_rate": 0.004999816481237698, "loss": 9.3632, "step": 26100 }, { "epoch": 0.10673304212594295, "grad_norm": 3.2958881855010986, "learning_rate": 0.0049998104149861855, "loss": 9.433, "step": 26200 }, { "epoch": 0.10714042014932441, "grad_norm": 3.2342183589935303, "learning_rate": 0.004999804250101472, "loss": 9.3908, "step": 26300 }, { "epoch": 0.10754779817270588, "grad_norm": 2.292680501937866, "learning_rate": 0.004999797986583799, "loss": 9.385, "step": 26400 }, { "epoch": 0.10795517619608734, "grad_norm": 1.9405272006988525, "learning_rate": 0.004999791624433422, "loss": 9.4612, "step": 26500 }, { "epoch": 0.1083625542194688, "grad_norm": 5.097912788391113, "learning_rate": 0.004999785163650587, "loss": 9.4082, "step": 26600 }, { "epoch": 0.10876993224285027, "grad_norm": 3.781742572784424, "learning_rate": 0.004999778604235553, "loss": 9.4121, "step": 26700 }, { "epoch": 0.10917731026623172, "grad_norm": 2.784597158432007, "learning_rate": 0.004999771946188575, "loss": 9.3863, "step": 26800 }, { "epoch": 0.10958468828961318, "grad_norm": 2.1151978969573975, "learning_rate": 0.004999765189509921, "loss": 9.341, "step": 26900 }, { "epoch": 0.10999206631299464, "grad_norm": 3.7644362449645996, "learning_rate": 0.004999758334199855, "loss": 9.37, "step": 27000 }, { "epoch": 0.10999206631299464, "eval_MaskedAccuracy": 0.4307816741389779, "eval_loss": 1.9972769021987915, "eval_runtime": 427.1236, "eval_samples_per_second": 148.613, "eval_steps_per_second": 0.581, "step": 27000 }, { "epoch": 0.1103994443363761, "grad_norm": 4.518658638000488, "learning_rate": 0.004999751380258652, "loss": 9.4088, "step": 27100 }, { "epoch": 0.11080682235975757, "grad_norm": 3.079120397567749, "learning_rate": 0.004999744327686584, "loss": 9.3893, "step": 27200 }, { "epoch": 0.11121420038313903, "grad_norm": 3.1319916248321533, "learning_rate": 0.0049997371764839245, "loss": 9.3885, "step": 27300 }, { "epoch": 0.1116215784065205, "grad_norm": 2.0859529972076416, "learning_rate": 0.004999729926650963, "loss": 9.3566, "step": 27400 }, { "epoch": 0.11202895642990196, "grad_norm": 4.668200492858887, "learning_rate": 0.004999722578187986, "loss": 9.3839, "step": 27500 }, { "epoch": 0.11243633445328341, "grad_norm": 3.367041826248169, "learning_rate": 0.00499971513109529, "loss": 9.3382, "step": 27600 }, { "epoch": 0.11284371247666487, "grad_norm": 3.040536880493164, "learning_rate": 0.004999707585373157, "loss": 9.3642, "step": 27700 }, { "epoch": 0.11325109050004634, "grad_norm": 2.1498825550079346, "learning_rate": 0.004999699941021894, "loss": 9.3099, "step": 27800 }, { "epoch": 0.1136584685234278, "grad_norm": 1.7926315069198608, "learning_rate": 0.004999692198041797, "loss": 9.3113, "step": 27900 }, { "epoch": 0.11406584654680926, "grad_norm": 1.7881391048431396, "learning_rate": 0.004999684356433179, "loss": 9.4188, "step": 28000 }, { "epoch": 0.11406584654680926, "eval_MaskedAccuracy": 0.4331004890207451, "eval_loss": 1.993469476699829, "eval_runtime": 520.9301, "eval_samples_per_second": 121.851, "eval_steps_per_second": 0.476, "step": 28000 }, { "epoch": 0.11447322457019073, "grad_norm": 4.676214218139648, "learning_rate": 0.0049996764161963385, "loss": 9.3526, "step": 28100 }, { "epoch": 0.11488060259357219, "grad_norm": 3.5479514598846436, "learning_rate": 0.004999668377331597, "loss": 9.3614, "step": 28200 }, { "epoch": 0.11528798061695365, "grad_norm": 3.582327365875244, "learning_rate": 0.004999660239839272, "loss": 9.3217, "step": 28300 }, { "epoch": 0.1156953586403351, "grad_norm": 1.5663789510726929, "learning_rate": 0.004999652003719687, "loss": 9.2903, "step": 28400 }, { "epoch": 0.11610273666371657, "grad_norm": 2.083988904953003, "learning_rate": 0.004999643668973165, "loss": 9.3266, "step": 28500 }, { "epoch": 0.11651011468709803, "grad_norm": 4.117608547210693, "learning_rate": 0.0049996352356000365, "loss": 9.307, "step": 28600 }, { "epoch": 0.11691749271047949, "grad_norm": 1.3780686855316162, "learning_rate": 0.0049996267036006336, "loss": 9.3073, "step": 28700 }, { "epoch": 0.11732487073386096, "grad_norm": 11.629694938659668, "learning_rate": 0.004999618072975303, "loss": 9.3441, "step": 28800 }, { "epoch": 0.11773224875724242, "grad_norm": 2.022599220275879, "learning_rate": 0.004999609343724368, "loss": 9.2835, "step": 28900 }, { "epoch": 0.11813962678062388, "grad_norm": 3.4412121772766113, "learning_rate": 0.004999600515848186, "loss": 9.2739, "step": 29000 }, { "epoch": 0.11813962678062388, "eval_MaskedAccuracy": 0.43849306787282605, "eval_loss": 1.9530013799667358, "eval_runtime": 507.2801, "eval_samples_per_second": 125.13, "eval_steps_per_second": 0.489, "step": 29000 }, { "epoch": 0.11854700480400535, "grad_norm": 3.3755948543548584, "learning_rate": 0.004999591589347103, "loss": 9.2399, "step": 29100 }, { "epoch": 0.11895438282738681, "grad_norm": 1.9202914237976074, "learning_rate": 0.004999582564221479, "loss": 9.2946, "step": 29200 }, { "epoch": 0.11936176085076826, "grad_norm": 3.1575686931610107, "learning_rate": 0.004999573440471659, "loss": 9.3542, "step": 29300 }, { "epoch": 0.11976913887414972, "grad_norm": 5.3306732177734375, "learning_rate": 0.0049995642180980125, "loss": 9.3686, "step": 29400 }, { "epoch": 0.12017651689753119, "grad_norm": 3.7781152725219727, "learning_rate": 0.0049995548971008925, "loss": 9.3227, "step": 29500 }, { "epoch": 0.12058389492091265, "grad_norm": 3.726844549179077, "learning_rate": 0.004999545477480681, "loss": 9.3034, "step": 29600 }, { "epoch": 0.12099127294429411, "grad_norm": 2.921781301498413, "learning_rate": 0.004999535959237747, "loss": 9.2338, "step": 29700 }, { "epoch": 0.12139865096767558, "grad_norm": 2.6174159049987793, "learning_rate": 0.004999526342372459, "loss": 9.2237, "step": 29800 }, { "epoch": 0.12180602899105704, "grad_norm": 3.727325201034546, "learning_rate": 0.004999516626885207, "loss": 9.2219, "step": 29900 }, { "epoch": 0.1222134070144385, "grad_norm": 3.3698318004608154, "learning_rate": 0.004999506812776362, "loss": 9.2405, "step": 30000 }, { "epoch": 0.1222134070144385, "eval_MaskedAccuracy": 0.4407026905200056, "eval_loss": 1.9574971199035645, "eval_runtime": 588.8466, "eval_samples_per_second": 107.797, "eval_steps_per_second": 0.421, "step": 30000 }, { "epoch": 0.12262078503781995, "grad_norm": 1.5510843992233276, "learning_rate": 0.004999496900046327, "loss": 9.2626, "step": 30100 }, { "epoch": 0.12302816306120141, "grad_norm": 1.7259398698806763, "learning_rate": 0.00499948688869549, "loss": 9.2664, "step": 30200 }, { "epoch": 0.12343554108458288, "grad_norm": 3.4563167095184326, "learning_rate": 0.004999476778724248, "loss": 9.2955, "step": 30300 }, { "epoch": 0.12384291910796434, "grad_norm": 1.9410121440887451, "learning_rate": 0.004999466570132995, "loss": 9.2675, "step": 30400 }, { "epoch": 0.1242502971313458, "grad_norm": 2.845703601837158, "learning_rate": 0.004999456262922131, "loss": 9.2495, "step": 30500 }, { "epoch": 0.12465767515472727, "grad_norm": 2.4379312992095947, "learning_rate": 0.004999445857092077, "loss": 9.278, "step": 30600 }, { "epoch": 0.12506505317810873, "grad_norm": 3.158965587615967, "learning_rate": 0.004999435352643232, "loss": 9.189, "step": 30700 }, { "epoch": 0.12547243120149018, "grad_norm": 3.224649667739868, "learning_rate": 0.004999424749576019, "loss": 9.2487, "step": 30800 }, { "epoch": 0.12587980922487166, "grad_norm": 3.579566240310669, "learning_rate": 0.004999414047890854, "loss": 9.2058, "step": 30900 }, { "epoch": 0.1262871872482531, "grad_norm": 1.4517889022827148, "learning_rate": 0.004999403247588162, "loss": 9.1853, "step": 31000 }, { "epoch": 0.1262871872482531, "eval_MaskedAccuracy": 0.44081633697535333, "eval_loss": 1.9580498933792114, "eval_runtime": 589.0477, "eval_samples_per_second": 107.76, "eval_steps_per_second": 0.421, "step": 31000 }, { "epoch": 0.12669456527163458, "grad_norm": 3.9054057598114014, "learning_rate": 0.004999392348668368, "loss": 9.2766, "step": 31100 }, { "epoch": 0.12710194329501603, "grad_norm": 2.204495906829834, "learning_rate": 0.004999381351131904, "loss": 9.2299, "step": 31200 }, { "epoch": 0.12750932131839748, "grad_norm": 2.9038403034210205, "learning_rate": 0.004999370254979204, "loss": 9.1957, "step": 31300 }, { "epoch": 0.12791669934177896, "grad_norm": 9.680912017822266, "learning_rate": 0.004999359060210702, "loss": 9.2389, "step": 31400 }, { "epoch": 0.1283240773651604, "grad_norm": 1.493072748184204, "learning_rate": 0.004999347766826851, "loss": 9.2887, "step": 31500 }, { "epoch": 0.1287314553885419, "grad_norm": 3.562403678894043, "learning_rate": 0.004999336374828095, "loss": 9.2845, "step": 31600 }, { "epoch": 0.12913883341192334, "grad_norm": 3.176454544067383, "learning_rate": 0.004999324884214878, "loss": 9.2753, "step": 31700 }, { "epoch": 0.12954621143530481, "grad_norm": 3.772649049758911, "learning_rate": 0.004999313294987656, "loss": 9.2608, "step": 31800 }, { "epoch": 0.12995358945868626, "grad_norm": 3.224039316177368, "learning_rate": 0.00499930160714689, "loss": 9.182, "step": 31900 }, { "epoch": 0.13036096748206774, "grad_norm": 3.68146014213562, "learning_rate": 0.004999289820693044, "loss": 9.1812, "step": 32000 }, { "epoch": 0.13036096748206774, "eval_MaskedAccuracy": 0.4414966716646933, "eval_loss": 1.9461902379989624, "eval_runtime": 575.5966, "eval_samples_per_second": 110.279, "eval_steps_per_second": 0.431, "step": 32000 }, { "epoch": 0.1307683455054492, "grad_norm": 3.2047019004821777, "learning_rate": 0.004999277935626578, "loss": 9.1784, "step": 32100 }, { "epoch": 0.13117572352883064, "grad_norm": 3.677278518676758, "learning_rate": 0.004999265951947974, "loss": 9.157, "step": 32200 }, { "epoch": 0.13158310155221212, "grad_norm": 3.956178665161133, "learning_rate": 0.004999253869657687, "loss": 9.1224, "step": 32300 }, { "epoch": 0.13199047957559357, "grad_norm": 1.5574619770050049, "learning_rate": 0.004999241688756209, "loss": 9.2461, "step": 32400 }, { "epoch": 0.13239785759897504, "grad_norm": 2.0814273357391357, "learning_rate": 0.004999229409244015, "loss": 9.2823, "step": 32500 }, { "epoch": 0.1328052356223565, "grad_norm": 3.6148457527160645, "learning_rate": 0.004999217031121593, "loss": 9.1491, "step": 32600 }, { "epoch": 0.13321261364573797, "grad_norm": 3.312826633453369, "learning_rate": 0.00499920455438943, "loss": 9.212, "step": 32700 }, { "epoch": 0.13361999166911942, "grad_norm": 1.8035812377929688, "learning_rate": 0.004999191979048022, "loss": 9.1641, "step": 32800 }, { "epoch": 0.1340273696925009, "grad_norm": 4.133431434631348, "learning_rate": 0.004999179305097862, "loss": 9.2661, "step": 32900 }, { "epoch": 0.13443474771588235, "grad_norm": 3.4171595573425293, "learning_rate": 0.004999166532539454, "loss": 9.154, "step": 33000 }, { "epoch": 0.13443474771588235, "eval_MaskedAccuracy": 0.444123885179477, "eval_loss": 1.9266562461853027, "eval_runtime": 444.8407, "eval_samples_per_second": 142.694, "eval_steps_per_second": 0.558, "step": 33000 }, { "epoch": 0.1348421257392638, "grad_norm": 4.883106231689453, "learning_rate": 0.004999153661373301, "loss": 9.1874, "step": 33100 }, { "epoch": 0.13524950376264527, "grad_norm": 3.6476919651031494, "learning_rate": 0.0049991406915999145, "loss": 9.163, "step": 33200 }, { "epoch": 0.13565688178602672, "grad_norm": 2.720723867416382, "learning_rate": 0.004999127623219813, "loss": 9.1493, "step": 33300 }, { "epoch": 0.1360642598094082, "grad_norm": 3.7319369316101074, "learning_rate": 0.004999114456233506, "loss": 9.1741, "step": 33400 }, { "epoch": 0.13647163783278965, "grad_norm": 3.6861164569854736, "learning_rate": 0.004999101190641518, "loss": 9.1107, "step": 33500 }, { "epoch": 0.13687901585617113, "grad_norm": 2.780924081802368, "learning_rate": 0.004999087826444369, "loss": 9.1452, "step": 33600 }, { "epoch": 0.13728639387955258, "grad_norm": 2.3631842136383057, "learning_rate": 0.004999074363642589, "loss": 9.1763, "step": 33700 }, { "epoch": 0.13769377190293403, "grad_norm": 3.5727689266204834, "learning_rate": 0.004999060802236713, "loss": 9.1375, "step": 33800 }, { "epoch": 0.1381011499263155, "grad_norm": 2.269345998764038, "learning_rate": 0.004999047142227271, "loss": 9.143, "step": 33900 }, { "epoch": 0.13850852794969695, "grad_norm": 7.34450101852417, "learning_rate": 0.004999033383614803, "loss": 9.162, "step": 34000 }, { "epoch": 0.13850852794969695, "eval_MaskedAccuracy": 0.43828248619245724, "eval_loss": 1.9571964740753174, "eval_runtime": 495.8873, "eval_samples_per_second": 128.005, "eval_steps_per_second": 0.5, "step": 34000 }, { "epoch": 0.13891590597307843, "grad_norm": 4.355815410614014, "learning_rate": 0.004999019526399854, "loss": 9.1878, "step": 34100 }, { "epoch": 0.13932328399645988, "grad_norm": 4.175578594207764, "learning_rate": 0.004999005570582981, "loss": 9.132, "step": 34200 }, { "epoch": 0.13973066201984136, "grad_norm": 3.2906596660614014, "learning_rate": 0.004998991516164727, "loss": 9.1537, "step": 34300 }, { "epoch": 0.1401380400432228, "grad_norm": 3.8736915588378906, "learning_rate": 0.0049989773631456515, "loss": 9.1409, "step": 34400 }, { "epoch": 0.14054541806660428, "grad_norm": 3.4141225814819336, "learning_rate": 0.004998963111526308, "loss": 9.1178, "step": 34500 }, { "epoch": 0.14095279608998573, "grad_norm": 3.814049482345581, "learning_rate": 0.004998948761307268, "loss": 9.0791, "step": 34600 }, { "epoch": 0.14136017411336718, "grad_norm": 3.3079888820648193, "learning_rate": 0.004998934312489091, "loss": 9.1364, "step": 34700 }, { "epoch": 0.14176755213674866, "grad_norm": 1.326229453086853, "learning_rate": 0.004998919765072354, "loss": 9.199, "step": 34800 }, { "epoch": 0.1421749301601301, "grad_norm": 2.8008034229278564, "learning_rate": 0.004998905119057631, "loss": 9.1457, "step": 34900 }, { "epoch": 0.14258230818351159, "grad_norm": 4.717260837554932, "learning_rate": 0.0049988903744454995, "loss": 9.0929, "step": 35000 }, { "epoch": 0.14258230818351159, "eval_MaskedAccuracy": 0.44587848230103844, "eval_loss": 1.9244799613952637, "eval_runtime": 654.16, "eval_samples_per_second": 97.034, "eval_steps_per_second": 0.379, "step": 35000 }, { "epoch": 0.14298968620689304, "grad_norm": 2.6231470108032227, "learning_rate": 0.004998875531236537, "loss": 9.1243, "step": 35100 }, { "epoch": 0.1433970642302745, "grad_norm": 3.066685199737549, "learning_rate": 0.004998860589431345, "loss": 9.0725, "step": 35200 }, { "epoch": 0.14380444225365596, "grad_norm": 3.0163044929504395, "learning_rate": 0.0049988455490305025, "loss": 9.0527, "step": 35300 }, { "epoch": 0.1442118202770374, "grad_norm": 3.432121515274048, "learning_rate": 0.00499883041003461, "loss": 9.0402, "step": 35400 }, { "epoch": 0.1446191983004189, "grad_norm": 1.467442274093628, "learning_rate": 0.0049988151724442605, "loss": 9.0789, "step": 35500 }, { "epoch": 0.14502657632380034, "grad_norm": 4.966704368591309, "learning_rate": 0.004998799836260063, "loss": 9.1025, "step": 35600 }, { "epoch": 0.14543395434718182, "grad_norm": 4.940292835235596, "learning_rate": 0.004998784401482619, "loss": 9.0479, "step": 35700 }, { "epoch": 0.14584133237056326, "grad_norm": 5.558600425720215, "learning_rate": 0.004998768868112538, "loss": 9.1459, "step": 35800 }, { "epoch": 0.14624871039394474, "grad_norm": 1.7292475700378418, "learning_rate": 0.00499875323615044, "loss": 9.1045, "step": 35900 }, { "epoch": 0.1466560884173262, "grad_norm": 1.3726286888122559, "learning_rate": 0.00499873750559693, "loss": 9.1893, "step": 36000 }, { "epoch": 0.1466560884173262, "eval_MaskedAccuracy": 0.4421415294091426, "eval_loss": 1.929854154586792, "eval_runtime": 512.033, "eval_samples_per_second": 123.969, "eval_steps_per_second": 0.484, "step": 36000 }, { "epoch": 0.14706346644070767, "grad_norm": 6.443113803863525, "learning_rate": 0.004998721676452642, "loss": 9.1884, "step": 36100 }, { "epoch": 0.14747084446408912, "grad_norm": 5.186279773712158, "learning_rate": 0.0049987057487181985, "loss": 9.0583, "step": 36200 }, { "epoch": 0.14787822248747057, "grad_norm": 2.67094349861145, "learning_rate": 0.0049986897223942344, "loss": 9.0881, "step": 36300 }, { "epoch": 0.14828560051085204, "grad_norm": 3.5158534049987793, "learning_rate": 0.004998673597481375, "loss": 9.063, "step": 36400 }, { "epoch": 0.1486929785342335, "grad_norm": 4.55438232421875, "learning_rate": 0.004998657373980268, "loss": 9.1245, "step": 36500 }, { "epoch": 0.14910035655761497, "grad_norm": 3.9908504486083984, "learning_rate": 0.004998641051891542, "loss": 9.0492, "step": 36600 }, { "epoch": 0.14950773458099642, "grad_norm": 2.482670307159424, "learning_rate": 0.004998624631215852, "loss": 9.1073, "step": 36700 }, { "epoch": 0.1499151126043779, "grad_norm": 3.83728289604187, "learning_rate": 0.004998608111953842, "loss": 9.1892, "step": 36800 }, { "epoch": 0.15032249062775935, "grad_norm": 4.212090969085693, "learning_rate": 0.004998591494106167, "loss": 9.1035, "step": 36900 }, { "epoch": 0.15072986865114082, "grad_norm": 3.91371488571167, "learning_rate": 0.004998574777673482, "loss": 9.0584, "step": 37000 }, { "epoch": 0.15072986865114082, "eval_MaskedAccuracy": 0.4479879046411305, "eval_loss": 1.9146876335144043, "eval_runtime": 589.2359, "eval_samples_per_second": 107.726, "eval_steps_per_second": 0.421, "step": 37000 }, { "epoch": 0.15113724667452227, "grad_norm": 3.9847943782806396, "learning_rate": 0.004998557962656451, "loss": 9.1373, "step": 37100 }, { "epoch": 0.15154462469790372, "grad_norm": 2.304649591445923, "learning_rate": 0.004998541049055731, "loss": 9.1021, "step": 37200 }, { "epoch": 0.1519520027212852, "grad_norm": 2.305466890335083, "learning_rate": 0.004998524036872005, "loss": 9.0259, "step": 37300 }, { "epoch": 0.15235938074466665, "grad_norm": 5.452589511871338, "learning_rate": 0.004998506926105931, "loss": 9.0077, "step": 37400 }, { "epoch": 0.15276675876804813, "grad_norm": 3.1669082641601562, "learning_rate": 0.004998489716758196, "loss": 9.0085, "step": 37500 }, { "epoch": 0.15317413679142958, "grad_norm": 2.483513355255127, "learning_rate": 0.004998472408829474, "loss": 9.0579, "step": 37600 }, { "epoch": 0.15358151481481105, "grad_norm": 4.417145729064941, "learning_rate": 0.004998455002320451, "loss": 9.0336, "step": 37700 }, { "epoch": 0.1539888928381925, "grad_norm": 3.7182934284210205, "learning_rate": 0.004998437497231818, "loss": 9.0433, "step": 37800 }, { "epoch": 0.15439627086157395, "grad_norm": 3.135267734527588, "learning_rate": 0.00499841989356426, "loss": 9.0677, "step": 37900 }, { "epoch": 0.15480364888495543, "grad_norm": 2.6014578342437744, "learning_rate": 0.0049984021913184835, "loss": 9.1458, "step": 38000 }, { "epoch": 0.15480364888495543, "eval_MaskedAccuracy": 0.44533889755137634, "eval_loss": 1.9252251386642456, "eval_runtime": 479.8735, "eval_samples_per_second": 132.277, "eval_steps_per_second": 0.517, "step": 38000 }, { "epoch": 0.15521102690833688, "grad_norm": 4.145866870880127, "learning_rate": 0.0049983843904951855, "loss": 9.0325, "step": 38100 }, { "epoch": 0.15561840493171836, "grad_norm": 2.6572349071502686, "learning_rate": 0.00499836649109506, "loss": 8.9937, "step": 38200 }, { "epoch": 0.1560257829550998, "grad_norm": 4.326454162597656, "learning_rate": 0.004998348493118833, "loss": 9.005, "step": 38300 }, { "epoch": 0.15643316097848128, "grad_norm": 2.5977251529693604, "learning_rate": 0.004998330396567195, "loss": 9.0657, "step": 38400 }, { "epoch": 0.15684053900186273, "grad_norm": 7.225991249084473, "learning_rate": 0.004998312201440868, "loss": 9.1711, "step": 38500 }, { "epoch": 0.1572479170252442, "grad_norm": 1.0980727672576904, "learning_rate": 0.0049982939077405815, "loss": 9.1604, "step": 38600 }, { "epoch": 0.15765529504862566, "grad_norm": 4.309670448303223, "learning_rate": 0.004998275515467049, "loss": 9.131, "step": 38700 }, { "epoch": 0.1580626730720071, "grad_norm": 4.189974308013916, "learning_rate": 0.004998257024621001, "loss": 9.0594, "step": 38800 }, { "epoch": 0.1584700510953886, "grad_norm": 3.872173309326172, "learning_rate": 0.004998238435203172, "loss": 8.999, "step": 38900 }, { "epoch": 0.15887742911877004, "grad_norm": 8.60802936553955, "learning_rate": 0.0049982197472142895, "loss": 9.0032, "step": 39000 }, { "epoch": 0.15887742911877004, "eval_MaskedAccuracy": 0.4432360188291969, "eval_loss": 1.9270706176757812, "eval_runtime": 453.8616, "eval_samples_per_second": 139.858, "eval_steps_per_second": 0.546, "step": 39000 }, { "epoch": 0.1592848071421515, "grad_norm": 4.140707969665527, "learning_rate": 0.004998200960655094, "loss": 9.1444, "step": 39100 }, { "epoch": 0.15969218516553296, "grad_norm": 3.9461541175842285, "learning_rate": 0.0049981820755263286, "loss": 9.0186, "step": 39200 }, { "epoch": 0.16009956318891444, "grad_norm": 5.20798397064209, "learning_rate": 0.004998163091828741, "loss": 8.9638, "step": 39300 }, { "epoch": 0.1605069412122959, "grad_norm": 1.8364992141723633, "learning_rate": 0.00499814400956308, "loss": 8.9612, "step": 39400 }, { "epoch": 0.16091431923567734, "grad_norm": 4.967864990234375, "learning_rate": 0.004998124828730113, "loss": 9.0387, "step": 39500 }, { "epoch": 0.16132169725905882, "grad_norm": 2.861976146697998, "learning_rate": 0.004998105549330576, "loss": 8.9825, "step": 39600 }, { "epoch": 0.16172907528244027, "grad_norm": 25.292007446289062, "learning_rate": 0.004998086171365244, "loss": 9.0026, "step": 39700 }, { "epoch": 0.16213645330582174, "grad_norm": 2.862931251525879, "learning_rate": 0.004998066694834885, "loss": 9.0044, "step": 39800 }, { "epoch": 0.1625438313292032, "grad_norm": 4.256959438323975, "learning_rate": 0.004998047119740263, "loss": 8.9882, "step": 39900 }, { "epoch": 0.16295120935258467, "grad_norm": 4.4340362548828125, "learning_rate": 0.004998027446082152, "loss": 9.0686, "step": 40000 }, { "epoch": 0.16295120935258467, "eval_MaskedAccuracy": 0.44911529133323386, "eval_loss": 1.9079149961471558, "eval_runtime": 533.8636, "eval_samples_per_second": 118.899, "eval_steps_per_second": 0.465, "step": 40000 }, { "epoch": 0.16335858737596612, "grad_norm": 4.338792324066162, "learning_rate": 0.004998007673861338, "loss": 8.9793, "step": 40100 }, { "epoch": 0.1637659653993476, "grad_norm": 7.50832462310791, "learning_rate": 0.004997987803078598, "loss": 8.9996, "step": 40200 }, { "epoch": 0.16417334342272905, "grad_norm": 5.61702823638916, "learning_rate": 0.004997967833734717, "loss": 9.0622, "step": 40300 }, { "epoch": 0.1645807214461105, "grad_norm": 1.6239995956420898, "learning_rate": 0.004997947765830481, "loss": 9.1193, "step": 40400 }, { "epoch": 0.16498809946949197, "grad_norm": 3.3413801193237305, "learning_rate": 0.0049979275993666935, "loss": 9.0051, "step": 40500 }, { "epoch": 0.16539547749287342, "grad_norm": 4.474870681762695, "learning_rate": 0.004997907334344139, "loss": 8.9585, "step": 40600 }, { "epoch": 0.1658028555162549, "grad_norm": 4.810618877410889, "learning_rate": 0.004997886970763626, "loss": 9.0324, "step": 40700 }, { "epoch": 0.16621023353963635, "grad_norm": 3.7743003368377686, "learning_rate": 0.0049978665086259574, "loss": 9.0539, "step": 40800 }, { "epoch": 0.16661761156301783, "grad_norm": 3.4350481033325195, "learning_rate": 0.004997845947931944, "loss": 8.9873, "step": 40900 }, { "epoch": 0.16702498958639928, "grad_norm": 4.730776786804199, "learning_rate": 0.004997825288682397, "loss": 8.9994, "step": 41000 }, { "epoch": 0.16702498958639928, "eval_MaskedAccuracy": 0.4477450902474816, "eval_loss": 1.90780770778656, "eval_runtime": 582.4452, "eval_samples_per_second": 108.982, "eval_steps_per_second": 0.426, "step": 41000 }, { "epoch": 0.16743236760978075, "grad_norm": 5.645813941955566, "learning_rate": 0.004997804530878132, "loss": 8.9852, "step": 41100 }, { "epoch": 0.1678397456331622, "grad_norm": 7.603764533996582, "learning_rate": 0.004997783674519973, "loss": 8.9917, "step": 41200 }, { "epoch": 0.16824712365654365, "grad_norm": 3.657362461090088, "learning_rate": 0.004997762719608744, "loss": 9.0294, "step": 41300 }, { "epoch": 0.16865450167992513, "grad_norm": 4.409435749053955, "learning_rate": 0.00499774166614527, "loss": 8.9524, "step": 41400 }, { "epoch": 0.16906187970330658, "grad_norm": 4.743533611297607, "learning_rate": 0.00499772051413039, "loss": 8.9682, "step": 41500 }, { "epoch": 0.16946925772668806, "grad_norm": 5.022209644317627, "learning_rate": 0.004997699263564937, "loss": 8.9024, "step": 41600 }, { "epoch": 0.1698766357500695, "grad_norm": 4.124392032623291, "learning_rate": 0.004997677914449744, "loss": 8.9699, "step": 41700 }, { "epoch": 0.17028401377345098, "grad_norm": 2.79866361618042, "learning_rate": 0.004997656466785669, "loss": 8.9582, "step": 41800 }, { "epoch": 0.17069139179683243, "grad_norm": 4.4310221672058105, "learning_rate": 0.00499763492057355, "loss": 8.9343, "step": 41900 }, { "epoch": 0.17109876982021388, "grad_norm": 1.807766079902649, "learning_rate": 0.004997613275814238, "loss": 9.0676, "step": 42000 }, { "epoch": 0.17109876982021388, "eval_MaskedAccuracy": 0.44673300940251104, "eval_loss": 1.9179065227508545, "eval_runtime": 480.8601, "eval_samples_per_second": 132.005, "eval_steps_per_second": 0.516, "step": 42000 }, { "epoch": 0.17150614784359536, "grad_norm": 3.888913154602051, "learning_rate": 0.0049975915325085945, "loss": 9.0499, "step": 42100 }, { "epoch": 0.1719135258669768, "grad_norm": 5.673905849456787, "learning_rate": 0.0049975696906574725, "loss": 8.9339, "step": 42200 }, { "epoch": 0.17232090389035828, "grad_norm": 8.747493743896484, "learning_rate": 0.004997547750261739, "loss": 8.9281, "step": 42300 }, { "epoch": 0.17272828191373973, "grad_norm": 7.244274139404297, "learning_rate": 0.0049975257113222624, "loss": 9.0767, "step": 42400 }, { "epoch": 0.1731356599371212, "grad_norm": 3.129075765609741, "learning_rate": 0.004997503573839917, "loss": 9.1039, "step": 42500 }, { "epoch": 0.17354303796050266, "grad_norm": 3.9406702518463135, "learning_rate": 0.004997481337815575, "loss": 8.9543, "step": 42600 }, { "epoch": 0.17395041598388414, "grad_norm": 2.617393732070923, "learning_rate": 0.004997459003250109, "loss": 8.9232, "step": 42700 }, { "epoch": 0.1743577940072656, "grad_norm": 5.305755138397217, "learning_rate": 0.004997436570144415, "loss": 8.9391, "step": 42800 }, { "epoch": 0.17476517203064704, "grad_norm": 5.731649398803711, "learning_rate": 0.004997414038499373, "loss": 9.0504, "step": 42900 }, { "epoch": 0.17517255005402851, "grad_norm": 5.849124431610107, "learning_rate": 0.004997391408315867, "loss": 8.9588, "step": 43000 }, { "epoch": 0.17517255005402851, "eval_MaskedAccuracy": 0.4417444098255367, "eval_loss": 1.950426697731018, "eval_runtime": 468.5442, "eval_samples_per_second": 135.475, "eval_steps_per_second": 0.529, "step": 43000 }, { "epoch": 0.17557992807740996, "grad_norm": 4.830824851989746, "learning_rate": 0.0049973686795948, "loss": 9.0612, "step": 43100 }, { "epoch": 0.17598730610079144, "grad_norm": 4.50791597366333, "learning_rate": 0.004997345852337069, "loss": 8.9523, "step": 43200 }, { "epoch": 0.1763946841241729, "grad_norm": 4.371399402618408, "learning_rate": 0.004997322926543576, "loss": 8.8987, "step": 43300 }, { "epoch": 0.17680206214755437, "grad_norm": 3.6514639854431152, "learning_rate": 0.004997299902215233, "loss": 8.9056, "step": 43400 }, { "epoch": 0.17720944017093582, "grad_norm": 5.054758071899414, "learning_rate": 0.004997276779352933, "loss": 8.875, "step": 43500 }, { "epoch": 0.1776168181943173, "grad_norm": 6.961836814880371, "learning_rate": 0.004997253557957605, "loss": 8.9972, "step": 43600 }, { "epoch": 0.17802419621769874, "grad_norm": 1.7716807126998901, "learning_rate": 0.004997230238030173, "loss": 9.0609, "step": 43700 }, { "epoch": 0.1784315742410802, "grad_norm": 4.531676769256592, "learning_rate": 0.004997206819571546, "loss": 9.0159, "step": 43800 }, { "epoch": 0.17883895226446167, "grad_norm": 6.115238666534424, "learning_rate": 0.0049971833025826565, "loss": 8.9111, "step": 43900 }, { "epoch": 0.17924633028784312, "grad_norm": 3.474392890930176, "learning_rate": 0.004997159687064435, "loss": 8.9168, "step": 44000 }, { "epoch": 0.17924633028784312, "eval_MaskedAccuracy": 0.45380778193670457, "eval_loss": 1.8783373832702637, "eval_runtime": 510.3313, "eval_samples_per_second": 124.382, "eval_steps_per_second": 0.486, "step": 44000 }, { "epoch": 0.1796537083112246, "grad_norm": 4.088130474090576, "learning_rate": 0.004997135973017803, "loss": 8.8733, "step": 44100 }, { "epoch": 0.18006108633460605, "grad_norm": 5.331000328063965, "learning_rate": 0.0049971121604437145, "loss": 8.9507, "step": 44200 }, { "epoch": 0.18046846435798752, "grad_norm": 3.348855495452881, "learning_rate": 0.004997088249343102, "loss": 8.9441, "step": 44300 }, { "epoch": 0.18087584238136897, "grad_norm": 5.045528888702393, "learning_rate": 0.004997064239716911, "loss": 8.9874, "step": 44400 }, { "epoch": 0.18128322040475042, "grad_norm": 2.4286417961120605, "learning_rate": 0.004997040131566088, "loss": 8.9286, "step": 44500 }, { "epoch": 0.1816905984281319, "grad_norm": 4.78425407409668, "learning_rate": 0.004997015924891591, "loss": 8.8713, "step": 44600 }, { "epoch": 0.18209797645151335, "grad_norm": 1.8142859935760498, "learning_rate": 0.004996991619694378, "loss": 8.9267, "step": 44700 }, { "epoch": 0.18250535447489483, "grad_norm": 3.1893045902252197, "learning_rate": 0.004996967215975412, "loss": 8.9035, "step": 44800 }, { "epoch": 0.18291273249827628, "grad_norm": 6.472596645355225, "learning_rate": 0.00499694271373565, "loss": 8.9118, "step": 44900 }, { "epoch": 0.18332011052165775, "grad_norm": 4.773984909057617, "learning_rate": 0.004996918112976068, "loss": 9.0264, "step": 45000 }, { "epoch": 0.18332011052165775, "eval_MaskedAccuracy": 0.4526010996035226, "eval_loss": 1.8884249925613403, "eval_runtime": 1591.7003, "eval_samples_per_second": 39.879, "eval_steps_per_second": 0.156, "step": 45000 }, { "epoch": 0.1837274885450392, "grad_norm": 3.957942485809326, "learning_rate": 0.004996893413697636, "loss": 8.9141, "step": 45100 }, { "epoch": 0.18413486656842068, "grad_norm": 4.710155010223389, "learning_rate": 0.004996868615901332, "loss": 8.8604, "step": 45200 }, { "epoch": 0.18454224459180213, "grad_norm": 9.699567794799805, "learning_rate": 0.004996843719588128, "loss": 8.8876, "step": 45300 }, { "epoch": 0.18494962261518358, "grad_norm": 2.785186529159546, "learning_rate": 0.0049968187247590124, "loss": 8.9351, "step": 45400 }, { "epoch": 0.18535700063856506, "grad_norm": 5.596234321594238, "learning_rate": 0.004996793631414983, "loss": 8.9158, "step": 45500 }, { "epoch": 0.1857643786619465, "grad_norm": 4.283908367156982, "learning_rate": 0.00499676843955702, "loss": 8.9738, "step": 45600 }, { "epoch": 0.18617175668532798, "grad_norm": 6.712310791015625, "learning_rate": 0.004996743149186122, "loss": 8.8925, "step": 45700 }, { "epoch": 0.18657913470870943, "grad_norm": 4.3184895515441895, "learning_rate": 0.004996717760303289, "loss": 8.8861, "step": 45800 }, { "epoch": 0.1869865127320909, "grad_norm": 2.1695480346679688, "learning_rate": 0.004996692272909525, "loss": 8.8591, "step": 45900 }, { "epoch": 0.18739389075547236, "grad_norm": 5.650905609130859, "learning_rate": 0.004996666687005845, "loss": 9.07, "step": 46000 }, { "epoch": 0.18739389075547236, "eval_MaskedAccuracy": 0.4467761955244632, "eval_loss": 1.9052798748016357, "eval_runtime": 1276.7049, "eval_samples_per_second": 49.719, "eval_steps_per_second": 0.194, "step": 46000 }, { "epoch": 0.1878012687788538, "grad_norm": 4.653480052947998, "learning_rate": 0.004996641002593249, "loss": 8.9139, "step": 46100 }, { "epoch": 0.18820864680223529, "grad_norm": 3.2729389667510986, "learning_rate": 0.004996615219672769, "loss": 8.8715, "step": 46200 }, { "epoch": 0.18861602482561673, "grad_norm": 1.0644035339355469, "learning_rate": 0.004996589338245408, "loss": 8.8974, "step": 46300 }, { "epoch": 0.1890234028489982, "grad_norm": 4.8218770027160645, "learning_rate": 0.004996563358312191, "loss": 9.0405, "step": 46400 }, { "epoch": 0.18943078087237966, "grad_norm": 3.1921133995056152, "learning_rate": 0.0049965372798741504, "loss": 8.8825, "step": 46500 }, { "epoch": 0.18983815889576114, "grad_norm": 3.8239128589630127, "learning_rate": 0.004996511102932316, "loss": 8.8928, "step": 46600 }, { "epoch": 0.1902455369191426, "grad_norm": 1.8124943971633911, "learning_rate": 0.0049964848274877205, "loss": 8.8667, "step": 46700 }, { "epoch": 0.19065291494252407, "grad_norm": 6.511573314666748, "learning_rate": 0.004996458453541405, "loss": 8.9185, "step": 46800 }, { "epoch": 0.19106029296590551, "grad_norm": 5.872045516967773, "learning_rate": 0.004996431981094414, "loss": 8.8477, "step": 46900 }, { "epoch": 0.19146767098928696, "grad_norm": 4.800133228302002, "learning_rate": 0.004996405410147791, "loss": 8.849, "step": 47000 }, { "epoch": 0.19146767098928696, "eval_MaskedAccuracy": 0.45543538868908684, "eval_loss": 1.8698941469192505, "eval_runtime": 1367.0173, "eval_samples_per_second": 46.434, "eval_steps_per_second": 0.181, "step": 47000 }, { "epoch": 0.19187504901266844, "grad_norm": 3.088144302368164, "learning_rate": 0.0049963787407025844, "loss": 8.9498, "step": 47100 }, { "epoch": 0.1922824270360499, "grad_norm": 3.6458115577697754, "learning_rate": 0.004996351972759855, "loss": 9.0039, "step": 47200 }, { "epoch": 0.19268980505943137, "grad_norm": 6.0658369064331055, "learning_rate": 0.004996325106320655, "loss": 8.8997, "step": 47300 }, { "epoch": 0.19309718308281282, "grad_norm": 5.804576873779297, "learning_rate": 0.004996298141386048, "loss": 9.0567, "step": 47400 }, { "epoch": 0.1935045611061943, "grad_norm": 4.082156658172607, "learning_rate": 0.004996271077957102, "loss": 8.8726, "step": 47500 }, { "epoch": 0.19391193912957574, "grad_norm": 3.362668752670288, "learning_rate": 0.004996243916034889, "loss": 8.8642, "step": 47600 }, { "epoch": 0.19431931715295722, "grad_norm": 1.1706517934799194, "learning_rate": 0.0049962166556204776, "loss": 8.8542, "step": 47700 }, { "epoch": 0.19472669517633867, "grad_norm": 6.859764099121094, "learning_rate": 0.0049961892967149485, "loss": 9.0527, "step": 47800 }, { "epoch": 0.19513407319972012, "grad_norm": 4.326186656951904, "learning_rate": 0.004996161839319382, "loss": 8.9689, "step": 47900 }, { "epoch": 0.1955414512231016, "grad_norm": 6.308228969573975, "learning_rate": 0.004996134283434865, "loss": 8.8275, "step": 48000 }, { "epoch": 0.1955414512231016, "eval_MaskedAccuracy": 0.4556454833718163, "eval_loss": 1.8734564781188965, "eval_runtime": 1654.1594, "eval_samples_per_second": 38.374, "eval_steps_per_second": 0.15, "step": 48000 }, { "epoch": 0.19594882924648305, "grad_norm": 4.332017421722412, "learning_rate": 0.004996106629062483, "loss": 8.8454, "step": 48100 }, { "epoch": 0.19635620726986452, "grad_norm": 6.285112380981445, "learning_rate": 0.004996078876203332, "loss": 8.8178, "step": 48200 }, { "epoch": 0.19676358529324597, "grad_norm": 3.0181238651275635, "learning_rate": 0.00499605102485851, "loss": 8.9695, "step": 48300 }, { "epoch": 0.19717096331662745, "grad_norm": 5.3587446212768555, "learning_rate": 0.004996023075029118, "loss": 8.857, "step": 48400 }, { "epoch": 0.1975783413400089, "grad_norm": 4.083210468292236, "learning_rate": 0.004995995026716264, "loss": 8.8019, "step": 48500 }, { "epoch": 0.19798571936339035, "grad_norm": 1.806215763092041, "learning_rate": 0.004995966879921056, "loss": 8.9378, "step": 48600 }, { "epoch": 0.19839309738677183, "grad_norm": 4.553736686706543, "learning_rate": 0.004995938634644601, "loss": 8.8551, "step": 48700 }, { "epoch": 0.19880047541015328, "grad_norm": 6.222628593444824, "learning_rate": 0.004995910290888019, "loss": 8.828, "step": 48800 }, { "epoch": 0.19920785343353475, "grad_norm": 3.1822943687438965, "learning_rate": 0.004995881848652431, "loss": 8.8323, "step": 48900 }, { "epoch": 0.1996152314569162, "grad_norm": 3.611772298812866, "learning_rate": 0.004995853307938964, "loss": 8.8966, "step": 49000 }, { "epoch": 0.1996152314569162, "eval_MaskedAccuracy": 0.45529773088340925, "eval_loss": 1.8736827373504639, "eval_runtime": 1125.2104, "eval_samples_per_second": 56.413, "eval_steps_per_second": 0.22, "step": 49000 }, { "epoch": 0.20002260948029768, "grad_norm": 5.808901786804199, "learning_rate": 0.004995824668748749, "loss": 8.7982, "step": 49100 }, { "epoch": 0.20042998750367913, "grad_norm": 4.918834209442139, "learning_rate": 0.004995795931082909, "loss": 8.806, "step": 49200 }, { "epoch": 0.2008373655270606, "grad_norm": 23.904325485229492, "learning_rate": 0.004995767094942582, "loss": 8.9147, "step": 49300 }, { "epoch": 0.20124474355044206, "grad_norm": 4.423624038696289, "learning_rate": 0.004995738160328911, "loss": 9.0289, "step": 49400 }, { "epoch": 0.2016521215738235, "grad_norm": 3.2176241874694824, "learning_rate": 0.004995709127243038, "loss": 8.9621, "step": 49500 }, { "epoch": 0.20205949959720498, "grad_norm": 4.553608417510986, "learning_rate": 0.004995679995686104, "loss": 8.8743, "step": 49600 }, { "epoch": 0.20246687762058643, "grad_norm": 4.142465114593506, "learning_rate": 0.004995650765659282, "loss": 8.816, "step": 49700 }, { "epoch": 0.2028742556439679, "grad_norm": 5.3594794273376465, "learning_rate": 0.00499562143716371, "loss": 8.7982, "step": 49800 }, { "epoch": 0.20328163366734936, "grad_norm": 3.36631441116333, "learning_rate": 0.004995592010200553, "loss": 8.8225, "step": 49900 }, { "epoch": 0.20368901169073084, "grad_norm": 1.200137972831726, "learning_rate": 0.004995562484770975, "loss": 8.833, "step": 50000 }, { "epoch": 0.20368901169073084, "eval_MaskedAccuracy": 0.45342948136380307, "eval_loss": 1.8760101795196533, "eval_runtime": 1721.9536, "eval_samples_per_second": 36.863, "eval_steps_per_second": 0.144, "step": 50000 }, { "epoch": 0.2040963897141123, "grad_norm": 2.7836992740631104, "learning_rate": 0.0049955328608761364, "loss": 8.9665, "step": 50100 }, { "epoch": 0.20450376773749374, "grad_norm": 4.889101982116699, "learning_rate": 0.004995503138517217, "loss": 8.9262, "step": 50200 }, { "epoch": 0.2049111457608752, "grad_norm": 5.656112194061279, "learning_rate": 0.004995473317695388, "loss": 8.826, "step": 50300 }, { "epoch": 0.20531852378425666, "grad_norm": 9.625349044799805, "learning_rate": 0.00499544339841183, "loss": 8.7877, "step": 50400 }, { "epoch": 0.20572590180763814, "grad_norm": 5.409666061401367, "learning_rate": 0.004995413380667721, "loss": 8.939, "step": 50500 }, { "epoch": 0.2061332798310196, "grad_norm": 6.664834022521973, "learning_rate": 0.00499538326446425, "loss": 8.7978, "step": 50600 }, { "epoch": 0.20654065785440107, "grad_norm": 6.404662609100342, "learning_rate": 0.004995353049802609, "loss": 8.8049, "step": 50700 }, { "epoch": 0.20694803587778252, "grad_norm": 7.683322429656982, "learning_rate": 0.00499532273668399, "loss": 8.8133, "step": 50800 }, { "epoch": 0.207355413901164, "grad_norm": 3.8872005939483643, "learning_rate": 0.004995292325109595, "loss": 8.8927, "step": 50900 }, { "epoch": 0.20776279192454544, "grad_norm": 4.828040599822998, "learning_rate": 0.004995261815080626, "loss": 8.8109, "step": 51000 }, { "epoch": 0.20776279192454544, "eval_MaskedAccuracy": 0.4566938346297085, "eval_loss": 1.8743842840194702, "eval_runtime": 1101.6221, "eval_samples_per_second": 57.62, "eval_steps_per_second": 0.225, "step": 51000 }, { "epoch": 0.2081701699479269, "grad_norm": 5.939586162567139, "learning_rate": 0.00499523120659829, "loss": 8.8242, "step": 51100 }, { "epoch": 0.20857754797130837, "grad_norm": 27.979389190673828, "learning_rate": 0.004995200499663799, "loss": 8.7949, "step": 51200 }, { "epoch": 0.20898492599468982, "grad_norm": 8.024907112121582, "learning_rate": 0.004995169694278358, "loss": 9.0259, "step": 51300 }, { "epoch": 0.2093923040180713, "grad_norm": 5.7522807121276855, "learning_rate": 0.0049951387904431915, "loss": 8.8851, "step": 51400 }, { "epoch": 0.20979968204145275, "grad_norm": 4.299382209777832, "learning_rate": 0.00499510778815952, "loss": 8.8128, "step": 51500 }, { "epoch": 0.21020706006483422, "grad_norm": 4.014888763427734, "learning_rate": 0.004995076687428578, "loss": 8.8239, "step": 51600 }, { "epoch": 0.21061443808821567, "grad_norm": 1.072213053703308, "learning_rate": 0.004995045488251584, "loss": 8.8771, "step": 51700 }, { "epoch": 0.21102181611159715, "grad_norm": 4.117224216461182, "learning_rate": 0.0049950141906297775, "loss": 8.9155, "step": 51800 }, { "epoch": 0.2114291941349786, "grad_norm": 5.043557643890381, "learning_rate": 0.004994982794564388, "loss": 8.7899, "step": 51900 }, { "epoch": 0.21183657215836005, "grad_norm": 5.4120683670043945, "learning_rate": 0.004994951300056659, "loss": 8.782, "step": 52000 }, { "epoch": 0.21183657215836005, "eval_MaskedAccuracy": 0.45812245160372533, "eval_loss": 1.8528733253479004, "eval_runtime": 1573.3328, "eval_samples_per_second": 40.345, "eval_steps_per_second": 0.158, "step": 52000 }, { "epoch": 0.21224395018174153, "grad_norm": 6.679693698883057, "learning_rate": 0.004994919707107843, "loss": 8.7559, "step": 52100 }, { "epoch": 0.21265132820512297, "grad_norm": 6.865571022033691, "learning_rate": 0.004994888015719179, "loss": 8.883, "step": 52200 }, { "epoch": 0.21305870622850445, "grad_norm": 6.2120561599731445, "learning_rate": 0.004994856225891923, "loss": 8.9445, "step": 52300 }, { "epoch": 0.2134660842518859, "grad_norm": 6.802185535430908, "learning_rate": 0.004994824337627339, "loss": 8.8191, "step": 52400 }, { "epoch": 0.21387346227526738, "grad_norm": 4.90602970123291, "learning_rate": 0.004994792350926673, "loss": 8.8454, "step": 52500 }, { "epoch": 0.21428084029864883, "grad_norm": 5.3922953605651855, "learning_rate": 0.004994760265791208, "loss": 8.9565, "step": 52600 }, { "epoch": 0.21468821832203028, "grad_norm": 3.9712696075439453, "learning_rate": 0.0049947280822222025, "loss": 8.8104, "step": 52700 }, { "epoch": 0.21509559634541175, "grad_norm": 3.618863821029663, "learning_rate": 0.004994695800220931, "loss": 8.8186, "step": 52800 }, { "epoch": 0.2155029743687932, "grad_norm": 3.201653003692627, "learning_rate": 0.004994663419788666, "loss": 8.7529, "step": 52900 }, { "epoch": 0.21591035239217468, "grad_norm": 6.746595859527588, "learning_rate": 0.004994630940926693, "loss": 8.871, "step": 53000 }, { "epoch": 0.21591035239217468, "eval_MaskedAccuracy": 0.45650221757496223, "eval_loss": 1.8506487607955933, "eval_runtime": 1421.674, "eval_samples_per_second": 44.649, "eval_steps_per_second": 0.174, "step": 53000 }, { "epoch": 0.21631773041555613, "grad_norm": 5.726940631866455, "learning_rate": 0.0049945983636362915, "loss": 8.7594, "step": 53100 }, { "epoch": 0.2167251084389376, "grad_norm": 9.217852592468262, "learning_rate": 0.00499456568791875, "loss": 8.8618, "step": 53200 }, { "epoch": 0.21713248646231906, "grad_norm": 5.963167667388916, "learning_rate": 0.004994532913775363, "loss": 8.9854, "step": 53300 }, { "epoch": 0.21753986448570053, "grad_norm": 3.00286865234375, "learning_rate": 0.004994500041207422, "loss": 8.8253, "step": 53400 }, { "epoch": 0.21794724250908198, "grad_norm": 6.345184326171875, "learning_rate": 0.004994467070216233, "loss": 8.7499, "step": 53500 }, { "epoch": 0.21835462053246343, "grad_norm": 8.022476196289062, "learning_rate": 0.004994434000803093, "loss": 8.762, "step": 53600 }, { "epoch": 0.2187619985558449, "grad_norm": 6.483421802520752, "learning_rate": 0.004994400832969317, "loss": 8.7591, "step": 53700 }, { "epoch": 0.21916937657922636, "grad_norm": 2.510129451751709, "learning_rate": 0.004994367566716207, "loss": 8.9242, "step": 53800 }, { "epoch": 0.21957675460260784, "grad_norm": 1.064130425453186, "learning_rate": 0.0049943342020450825, "loss": 8.979, "step": 53900 }, { "epoch": 0.2199841326259893, "grad_norm": 6.173582077026367, "learning_rate": 0.004994300738957267, "loss": 8.9965, "step": 54000 }, { "epoch": 0.2199841326259893, "eval_MaskedAccuracy": 0.4541979935100313, "eval_loss": 1.8765358924865723, "eval_runtime": 1215.8881, "eval_samples_per_second": 52.205, "eval_steps_per_second": 0.204, "step": 54000 }, { "epoch": 0.22039151064937076, "grad_norm": 5.219405651092529, "learning_rate": 0.004994267177454082, "loss": 8.8305, "step": 54100 }, { "epoch": 0.2207988886727522, "grad_norm": 6.215470790863037, "learning_rate": 0.00499423351753685, "loss": 8.7512, "step": 54200 }, { "epoch": 0.22120626669613366, "grad_norm": 11.539061546325684, "learning_rate": 0.004994199759206904, "loss": 8.7085, "step": 54300 }, { "epoch": 0.22161364471951514, "grad_norm": 2.4697351455688477, "learning_rate": 0.004994165902465575, "loss": 8.8035, "step": 54400 }, { "epoch": 0.2220210227428966, "grad_norm": 4.639867305755615, "learning_rate": 0.004994131947314205, "loss": 8.8734, "step": 54500 }, { "epoch": 0.22242840076627807, "grad_norm": 7.649751663208008, "learning_rate": 0.0049940978937541404, "loss": 8.8174, "step": 54600 }, { "epoch": 0.22283577878965952, "grad_norm": 6.019172191619873, "learning_rate": 0.004994063741786718, "loss": 8.7393, "step": 54700 }, { "epoch": 0.223243156813041, "grad_norm": 4.9053425788879395, "learning_rate": 0.0049940294914133, "loss": 8.7257, "step": 54800 }, { "epoch": 0.22365053483642244, "grad_norm": 6.808137893676758, "learning_rate": 0.004993995142635232, "loss": 8.7289, "step": 54900 }, { "epoch": 0.22405791285980392, "grad_norm": 10.978095054626465, "learning_rate": 0.00499396069545387, "loss": 8.755, "step": 55000 }, { "epoch": 0.22405791285980392, "eval_MaskedAccuracy": 0.4493424909929303, "eval_loss": 1.8973885774612427, "eval_runtime": 1872.0602, "eval_samples_per_second": 33.907, "eval_steps_per_second": 0.132, "step": 55000 }, { "epoch": 0.22446529088318537, "grad_norm": 4.419609069824219, "learning_rate": 0.004993926149870585, "loss": 8.9114, "step": 55100 }, { "epoch": 0.22487266890656682, "grad_norm": 5.7764458656311035, "learning_rate": 0.004993891505886738, "loss": 8.8835, "step": 55200 }, { "epoch": 0.2252800469299483, "grad_norm": 7.038376331329346, "learning_rate": 0.004993856763503692, "loss": 8.7756, "step": 55300 }, { "epoch": 0.22568742495332975, "grad_norm": 2.246647357940674, "learning_rate": 0.004993821922722831, "loss": 8.7382, "step": 55400 }, { "epoch": 0.22609480297671122, "grad_norm": 2.824277400970459, "learning_rate": 0.004993786983545529, "loss": 8.94, "step": 55500 }, { "epoch": 0.22650218100009267, "grad_norm": 2.477095603942871, "learning_rate": 0.004993751945973163, "loss": 8.9129, "step": 55600 }, { "epoch": 0.22690955902347415, "grad_norm": 8.029402732849121, "learning_rate": 0.004993716810007125, "loss": 8.786, "step": 55700 }, { "epoch": 0.2273169370468556, "grad_norm": 6.447448253631592, "learning_rate": 0.004993681575648798, "loss": 8.7651, "step": 55800 }, { "epoch": 0.22772431507023708, "grad_norm": 4.86328125, "learning_rate": 0.004993646242899573, "loss": 8.8977, "step": 55900 }, { "epoch": 0.22813169309361853, "grad_norm": 5.140337944030762, "learning_rate": 0.004993610811760857, "loss": 8.756, "step": 56000 }, { "epoch": 0.22813169309361853, "eval_MaskedAccuracy": 0.45951580848885953, "eval_loss": 1.8519104719161987, "eval_runtime": 1576.2602, "eval_samples_per_second": 40.27, "eval_steps_per_second": 0.157, "step": 56000 }, { "epoch": 0.22853907111699998, "grad_norm": 8.257657051086426, "learning_rate": 0.004993575282234048, "loss": 8.7562, "step": 56100 }, { "epoch": 0.22894644914038145, "grad_norm": 3.961374521255493, "learning_rate": 0.004993539654320549, "loss": 8.9159, "step": 56200 }, { "epoch": 0.2293538271637629, "grad_norm": 4.514026165008545, "learning_rate": 0.004993503928021772, "loss": 8.9226, "step": 56300 }, { "epoch": 0.22976120518714438, "grad_norm": 4.246365547180176, "learning_rate": 0.004993468103339117, "loss": 8.9462, "step": 56400 }, { "epoch": 0.23016858321052583, "grad_norm": 7.508754253387451, "learning_rate": 0.004993432180274019, "loss": 8.9351, "step": 56500 }, { "epoch": 0.2305759612339073, "grad_norm": 2.7310702800750732, "learning_rate": 0.004993396158827879, "loss": 8.8479, "step": 56600 }, { "epoch": 0.23098333925728876, "grad_norm": 6.079079627990723, "learning_rate": 0.004993360039002134, "loss": 8.8298, "step": 56700 }, { "epoch": 0.2313907172806702, "grad_norm": 5.1133809089660645, "learning_rate": 0.004993323820798205, "loss": 8.81, "step": 56800 }, { "epoch": 0.23179809530405168, "grad_norm": 3.1408135890960693, "learning_rate": 0.004993287504217525, "loss": 8.7568, "step": 56900 }, { "epoch": 0.23220547332743313, "grad_norm": 5.984044551849365, "learning_rate": 0.004993251089261535, "loss": 8.7266, "step": 57000 }, { "epoch": 0.23220547332743313, "eval_MaskedAccuracy": 0.4607020510876605, "eval_loss": 1.8500642776489258, "eval_runtime": 1521.2445, "eval_samples_per_second": 41.726, "eval_steps_per_second": 0.163, "step": 57000 }, { "epoch": 0.2326128513508146, "grad_norm": 7.032481670379639, "learning_rate": 0.004993214575931667, "loss": 8.698, "step": 57100 }, { "epoch": 0.23302022937419606, "grad_norm": 3.9696929454803467, "learning_rate": 0.0049931779642293705, "loss": 8.6799, "step": 57200 }, { "epoch": 0.23342760739757754, "grad_norm": 5.725532531738281, "learning_rate": 0.004993141254156091, "loss": 8.7401, "step": 57300 }, { "epoch": 0.23383498542095899, "grad_norm": 3.45983624458313, "learning_rate": 0.004993104445713283, "loss": 8.9454, "step": 57400 }, { "epoch": 0.23424236344434046, "grad_norm": 3.7529096603393555, "learning_rate": 0.0049930675389024, "loss": 8.8428, "step": 57500 }, { "epoch": 0.2346497414677219, "grad_norm": 4.558876991271973, "learning_rate": 0.004993030533724895, "loss": 8.7754, "step": 57600 }, { "epoch": 0.23505711949110336, "grad_norm": 7.477844715118408, "learning_rate": 0.00499299343018224, "loss": 8.6963, "step": 57700 }, { "epoch": 0.23546449751448484, "grad_norm": 7.134383678436279, "learning_rate": 0.004992956228275898, "loss": 8.7056, "step": 57800 }, { "epoch": 0.2358718755378663, "grad_norm": 4.128894805908203, "learning_rate": 0.004992918928007337, "loss": 8.8289, "step": 57900 }, { "epoch": 0.23627925356124777, "grad_norm": 5.839402675628662, "learning_rate": 0.004992881529378037, "loss": 8.8334, "step": 58000 }, { "epoch": 0.23627925356124777, "eval_MaskedAccuracy": 0.4585491223611061, "eval_loss": 1.8525534868240356, "eval_runtime": 1401.9315, "eval_samples_per_second": 45.278, "eval_steps_per_second": 0.177, "step": 58000 }, { "epoch": 0.23668663158462921, "grad_norm": 3.721687078475952, "learning_rate": 0.00499284403238947, "loss": 8.7689, "step": 58100 }, { "epoch": 0.2370940096080107, "grad_norm": 4.26967716217041, "learning_rate": 0.004992806437043121, "loss": 8.8639, "step": 58200 }, { "epoch": 0.23750138763139214, "grad_norm": 3.958195924758911, "learning_rate": 0.004992768743340482, "loss": 8.9223, "step": 58300 }, { "epoch": 0.23790876565477362, "grad_norm": 2.781923532485962, "learning_rate": 0.00499273095128304, "loss": 8.7765, "step": 58400 }, { "epoch": 0.23831614367815507, "grad_norm": 8.125414848327637, "learning_rate": 0.004992693060872287, "loss": 8.76, "step": 58500 }, { "epoch": 0.23872352170153652, "grad_norm": 8.391327857971191, "learning_rate": 0.00499265507210972, "loss": 8.7101, "step": 58600 }, { "epoch": 0.239130899724918, "grad_norm": 6.260959148406982, "learning_rate": 0.0049926169849968515, "loss": 8.7667, "step": 58700 }, { "epoch": 0.23953827774829944, "grad_norm": 5.657060146331787, "learning_rate": 0.004992578799535172, "loss": 8.8167, "step": 58800 }, { "epoch": 0.23994565577168092, "grad_norm": 5.699919700622559, "learning_rate": 0.004992540515726205, "loss": 8.6987, "step": 58900 }, { "epoch": 0.24035303379506237, "grad_norm": 5.916398525238037, "learning_rate": 0.004992502133571444, "loss": 8.6812, "step": 59000 }, { "epoch": 0.24035303379506237, "eval_MaskedAccuracy": 0.46217061996832853, "eval_loss": 1.8404490947723389, "eval_runtime": 581.3056, "eval_samples_per_second": 109.196, "eval_steps_per_second": 0.427, "step": 59000 }, { "epoch": 0.24076041181844385, "grad_norm": 8.000349998474121, "learning_rate": 0.004992463653072431, "loss": 8.7485, "step": 59100 }, { "epoch": 0.2411677898418253, "grad_norm": 6.2110490798950195, "learning_rate": 0.004992425074230673, "loss": 8.6799, "step": 59200 }, { "epoch": 0.24157516786520675, "grad_norm": 1.3083547353744507, "learning_rate": 0.0049923863970476965, "loss": 8.792, "step": 59300 }, { "epoch": 0.24198254588858822, "grad_norm": 3.161991834640503, "learning_rate": 0.0049923476215250305, "loss": 8.9828, "step": 59400 }, { "epoch": 0.24238992391196967, "grad_norm": 4.440457820892334, "learning_rate": 0.004992308747664208, "loss": 8.983, "step": 59500 }, { "epoch": 0.24279730193535115, "grad_norm": 4.067751407623291, "learning_rate": 0.004992269775466766, "loss": 8.8012, "step": 59600 }, { "epoch": 0.2432046799587326, "grad_norm": 6.714059352874756, "learning_rate": 0.004992230704934249, "loss": 8.7334, "step": 59700 }, { "epoch": 0.24361205798211408, "grad_norm": 6.335543155670166, "learning_rate": 0.004992191536068201, "loss": 8.7069, "step": 59800 }, { "epoch": 0.24401943600549553, "grad_norm": 5.72700309753418, "learning_rate": 0.004992152268870167, "loss": 8.6779, "step": 59900 }, { "epoch": 0.244426814028877, "grad_norm": 18.419189453125, "learning_rate": 0.004992112903341708, "loss": 8.7899, "step": 60000 }, { "epoch": 0.244426814028877, "eval_MaskedAccuracy": 0.4408497008426111, "eval_loss": 1.9382939338684082, "eval_runtime": 373.1181, "eval_samples_per_second": 170.123, "eval_steps_per_second": 0.665, "step": 60000 }, { "epoch": 0.24483419205225845, "grad_norm": 5.117299556732178, "learning_rate": 0.004992073439484375, "loss": 8.8255, "step": 60100 }, { "epoch": 0.2452415700756399, "grad_norm": 3.9339141845703125, "learning_rate": 0.004992033877299723, "loss": 8.7172, "step": 60200 }, { "epoch": 0.24564894809902138, "grad_norm": 6.391584873199463, "learning_rate": 0.004991994216789326, "loss": 8.6556, "step": 60300 }, { "epoch": 0.24605632612240283, "grad_norm": 7.434480667114258, "learning_rate": 0.004991954457954737, "loss": 8.7071, "step": 60400 }, { "epoch": 0.2464637041457843, "grad_norm": 2.5101046562194824, "learning_rate": 0.004991914600797545, "loss": 8.6685, "step": 60500 }, { "epoch": 0.24687108216916576, "grad_norm": 6.6819939613342285, "learning_rate": 0.004991874645319314, "loss": 8.7207, "step": 60600 }, { "epoch": 0.24727846019254723, "grad_norm": 6.217319965362549, "learning_rate": 0.004991834591521631, "loss": 8.7524, "step": 60700 }, { "epoch": 0.24768583821592868, "grad_norm": 3.4094226360321045, "learning_rate": 0.004991794439406076, "loss": 8.9701, "step": 60800 }, { "epoch": 0.24809321623931013, "grad_norm": 6.1427903175354, "learning_rate": 0.004991754188974236, "loss": 8.8112, "step": 60900 }, { "epoch": 0.2485005942626916, "grad_norm": 6.913832187652588, "learning_rate": 0.004991713840227701, "loss": 8.7124, "step": 61000 }, { "epoch": 0.2485005942626916, "eval_MaskedAccuracy": 0.4621929598472666, "eval_loss": 1.8318604230880737, "eval_runtime": 527.4568, "eval_samples_per_second": 120.343, "eval_steps_per_second": 0.47, "step": 61000 }, { "epoch": 0.24890797228607306, "grad_norm": 6.938937187194824, "learning_rate": 0.004991673393168064, "loss": 8.7164, "step": 61100 }, { "epoch": 0.24931535030945454, "grad_norm": 4.936263084411621, "learning_rate": 0.004991632847796928, "loss": 8.6831, "step": 61200 }, { "epoch": 0.24972272833283599, "grad_norm": 3.5639712810516357, "learning_rate": 0.004991592204115897, "loss": 8.6581, "step": 61300 }, { "epoch": 0.25013010635621746, "grad_norm": 4.962512493133545, "learning_rate": 0.004991551462126576, "loss": 8.8954, "step": 61400 }, { "epoch": 0.25053748437959894, "grad_norm": 5.8719987869262695, "learning_rate": 0.004991510621830578, "loss": 8.7383, "step": 61500 }, { "epoch": 0.25094486240298036, "grad_norm": 5.943603515625, "learning_rate": 0.004991469683229517, "loss": 8.7059, "step": 61600 }, { "epoch": 0.25135224042636184, "grad_norm": 7.711507320404053, "learning_rate": 0.00499142864632501, "loss": 8.6766, "step": 61700 }, { "epoch": 0.2517596184497433, "grad_norm": 6.331603527069092, "learning_rate": 0.00499138751111868, "loss": 8.6879, "step": 61800 }, { "epoch": 0.25216699647312474, "grad_norm": 2.5456717014312744, "learning_rate": 0.004991346277612157, "loss": 8.6847, "step": 61900 }, { "epoch": 0.2525743744965062, "grad_norm": 1.222424030303955, "learning_rate": 0.004991304945807071, "loss": 8.6913, "step": 62000 }, { "epoch": 0.2525743744965062, "eval_MaskedAccuracy": 0.45581933930365287, "eval_loss": 1.8417388200759888, "eval_runtime": 538.808, "eval_samples_per_second": 117.808, "eval_steps_per_second": 0.46, "step": 62000 }, { "epoch": 0.2529817525198877, "grad_norm": 7.086363315582275, "learning_rate": 0.004991263515705049, "loss": 8.832, "step": 62100 }, { "epoch": 0.25338913054326917, "grad_norm": 6.7353949546813965, "learning_rate": 0.004991221987307737, "loss": 8.7419, "step": 62200 }, { "epoch": 0.2537965085666506, "grad_norm": 1.0387322902679443, "learning_rate": 0.004991180360616769, "loss": 8.7183, "step": 62300 }, { "epoch": 0.25420388659003207, "grad_norm": 6.057994842529297, "learning_rate": 0.00499113863563379, "loss": 8.8768, "step": 62400 }, { "epoch": 0.25461126461341355, "grad_norm": 6.997557163238525, "learning_rate": 0.00499109681236046, "loss": 8.737, "step": 62500 }, { "epoch": 0.25501864263679497, "grad_norm": 4.8636555671691895, "learning_rate": 0.004991054890798418, "loss": 8.7255, "step": 62600 }, { "epoch": 0.25542602066017644, "grad_norm": 6.203702926635742, "learning_rate": 0.004991012870949338, "loss": 8.6889, "step": 62700 }, { "epoch": 0.2558333986835579, "grad_norm": 4.030526638031006, "learning_rate": 0.004990970752814874, "loss": 8.6639, "step": 62800 }, { "epoch": 0.2562407767069394, "grad_norm": 11.469111442565918, "learning_rate": 0.004990928536396685, "loss": 8.91, "step": 62900 }, { "epoch": 0.2566481547303208, "grad_norm": 6.844756126403809, "learning_rate": 0.004990886221696451, "loss": 8.9484, "step": 63000 }, { "epoch": 0.2566481547303208, "eval_MaskedAccuracy": 0.45651751724645373, "eval_loss": 1.866458773612976, "eval_runtime": 543.3437, "eval_samples_per_second": 116.825, "eval_steps_per_second": 0.456, "step": 63000 }, { "epoch": 0.2570555327537023, "grad_norm": 8.265596389770508, "learning_rate": 0.004990843808715836, "loss": 8.7548, "step": 63100 }, { "epoch": 0.2574629107770838, "grad_norm": 8.50532054901123, "learning_rate": 0.0049908012974565225, "loss": 8.6947, "step": 63200 }, { "epoch": 0.25787028880046525, "grad_norm": 11.911187171936035, "learning_rate": 0.004990758687920184, "loss": 8.7065, "step": 63300 }, { "epoch": 0.2582776668238467, "grad_norm": 7.431310653686523, "learning_rate": 0.004990715980108504, "loss": 8.6872, "step": 63400 }, { "epoch": 0.25868504484722815, "grad_norm": 6.245077610015869, "learning_rate": 0.004990673174023182, "loss": 8.6888, "step": 63500 }, { "epoch": 0.25909242287060963, "grad_norm": 4.772067070007324, "learning_rate": 0.004990630269665909, "loss": 8.6529, "step": 63600 }, { "epoch": 0.25949980089399105, "grad_norm": 8.589908599853516, "learning_rate": 0.004990587267038376, "loss": 8.7132, "step": 63700 }, { "epoch": 0.25990717891737253, "grad_norm": 4.484492301940918, "learning_rate": 0.004990544166142284, "loss": 8.8358, "step": 63800 }, { "epoch": 0.260314556940754, "grad_norm": 4.792771339416504, "learning_rate": 0.004990500966979338, "loss": 8.8237, "step": 63900 }, { "epoch": 0.2607219349641355, "grad_norm": 9.162529945373535, "learning_rate": 0.004990457669551245, "loss": 8.7504, "step": 64000 }, { "epoch": 0.2607219349641355, "eval_MaskedAccuracy": 0.46101908946857945, "eval_loss": 1.8516390323638916, "eval_runtime": 524.9316, "eval_samples_per_second": 120.922, "eval_steps_per_second": 0.472, "step": 64000 }, { "epoch": 0.2611293129875169, "grad_norm": 7.857509613037109, "learning_rate": 0.004990414273859712, "loss": 8.687, "step": 64100 }, { "epoch": 0.2615366910108984, "grad_norm": 8.928559303283691, "learning_rate": 0.004990370779906467, "loss": 8.8045, "step": 64200 }, { "epoch": 0.26194406903427986, "grad_norm": 16.3757266998291, "learning_rate": 0.00499032718769322, "loss": 8.9208, "step": 64300 }, { "epoch": 0.2623514470576613, "grad_norm": 6.15322732925415, "learning_rate": 0.0049902834972216925, "loss": 8.908, "step": 64400 }, { "epoch": 0.26275882508104276, "grad_norm": 5.626424312591553, "learning_rate": 0.004990239708493618, "loss": 8.7201, "step": 64500 }, { "epoch": 0.26316620310442423, "grad_norm": 6.347898960113525, "learning_rate": 0.004990195821510728, "loss": 8.6724, "step": 64600 }, { "epoch": 0.2635735811278057, "grad_norm": 5.051441669464111, "learning_rate": 0.004990151836274756, "loss": 8.6605, "step": 64700 }, { "epoch": 0.26398095915118713, "grad_norm": 2.98770809173584, "learning_rate": 0.004990107752787442, "loss": 8.6744, "step": 64800 }, { "epoch": 0.2643883371745686, "grad_norm": 3.0073490142822266, "learning_rate": 0.004990063571050524, "loss": 8.8322, "step": 64900 }, { "epoch": 0.2647957151979501, "grad_norm": 6.210570335388184, "learning_rate": 0.004990019291065754, "loss": 8.7453, "step": 65000 }, { "epoch": 0.2647957151979501, "eval_MaskedAccuracy": 0.46210706583991695, "eval_loss": 1.822614073753357, "eval_runtime": 547.6703, "eval_samples_per_second": 115.902, "eval_steps_per_second": 0.453, "step": 65000 }, { "epoch": 0.2652030932213315, "grad_norm": 8.379491806030273, "learning_rate": 0.00498997491283488, "loss": 8.6684, "step": 65100 }, { "epoch": 0.265610471244713, "grad_norm": 9.686884880065918, "learning_rate": 0.00498993043635965, "loss": 8.681, "step": 65200 }, { "epoch": 0.26601784926809446, "grad_norm": 7.550168991088867, "learning_rate": 0.004989885861641834, "loss": 8.6959, "step": 65300 }, { "epoch": 0.26642522729147594, "grad_norm": 3.529690742492676, "learning_rate": 0.00498984118868319, "loss": 8.6661, "step": 65400 }, { "epoch": 0.26683260531485736, "grad_norm": 3.6193811893463135, "learning_rate": 0.004989796417485486, "loss": 8.8082, "step": 65500 }, { "epoch": 0.26723998333823884, "grad_norm": 7.745871543884277, "learning_rate": 0.004989751548050491, "loss": 8.8334, "step": 65600 }, { "epoch": 0.2676473613616203, "grad_norm": 10.003741264343262, "learning_rate": 0.004989706580379977, "loss": 8.7687, "step": 65700 }, { "epoch": 0.2680547393850018, "grad_norm": 9.515006065368652, "learning_rate": 0.0049896615144757244, "loss": 8.7927, "step": 65800 }, { "epoch": 0.2684621174083832, "grad_norm": 6.813485622406006, "learning_rate": 0.004989616350339516, "loss": 8.7092, "step": 65900 }, { "epoch": 0.2688694954317647, "grad_norm": 7.490106582641602, "learning_rate": 0.004989571087973132, "loss": 8.6178, "step": 66000 }, { "epoch": 0.2688694954317647, "eval_MaskedAccuracy": 0.46375460599596263, "eval_loss": 1.8165909051895142, "eval_runtime": 566.3043, "eval_samples_per_second": 112.088, "eval_steps_per_second": 0.438, "step": 66000 }, { "epoch": 0.26927687345514617, "grad_norm": 9.557563781738281, "learning_rate": 0.004989525727378363, "loss": 8.647, "step": 66100 }, { "epoch": 0.2696842514785276, "grad_norm": 4.538211345672607, "learning_rate": 0.004989480268557006, "loss": 8.663, "step": 66200 }, { "epoch": 0.27009162950190907, "grad_norm": 8.971290588378906, "learning_rate": 0.004989434711510853, "loss": 8.6516, "step": 66300 }, { "epoch": 0.27049900752529055, "grad_norm": 5.639621257781982, "learning_rate": 0.004989389056241712, "loss": 8.5922, "step": 66400 }, { "epoch": 0.270906385548672, "grad_norm": 7.516700267791748, "learning_rate": 0.004989343302751382, "loss": 8.6842, "step": 66500 }, { "epoch": 0.27131376357205345, "grad_norm": 18.569461822509766, "learning_rate": 0.00498929745104168, "loss": 8.8898, "step": 66600 }, { "epoch": 0.2717211415954349, "grad_norm": 7.216564655303955, "learning_rate": 0.004989251501114402, "loss": 8.852, "step": 66700 }, { "epoch": 0.2721285196188164, "grad_norm": 4.8928632736206055, "learning_rate": 0.004989205452971374, "loss": 8.6904, "step": 66800 }, { "epoch": 0.2725358976421978, "grad_norm": 9.051252365112305, "learning_rate": 0.004989159306614425, "loss": 8.6597, "step": 66900 }, { "epoch": 0.2729432756655793, "grad_norm": 8.281542778015137, "learning_rate": 0.004989113062045374, "loss": 8.7904, "step": 67000 }, { "epoch": 0.2729432756655793, "eval_MaskedAccuracy": 0.45793338315066245, "eval_loss": 1.8609939813613892, "eval_runtime": 1394.6305, "eval_samples_per_second": 45.515, "eval_steps_per_second": 0.178, "step": 67000 }, { "epoch": 0.2733506536889608, "grad_norm": 4.64543342590332, "learning_rate": 0.00498906671926605, "loss": 8.7046, "step": 67100 }, { "epoch": 0.27375803171234225, "grad_norm": 5.933874607086182, "learning_rate": 0.004989020278278275, "loss": 8.6247, "step": 67200 }, { "epoch": 0.2741654097357237, "grad_norm": 8.769102096557617, "learning_rate": 0.004988973739083899, "loss": 8.6526, "step": 67300 }, { "epoch": 0.27457278775910515, "grad_norm": 6.038580894470215, "learning_rate": 0.004988927101684754, "loss": 8.6191, "step": 67400 }, { "epoch": 0.27498016578248663, "grad_norm": 4.009898662567139, "learning_rate": 0.004988880366082679, "loss": 8.6574, "step": 67500 }, { "epoch": 0.27538754380586805, "grad_norm": 14.188034057617188, "learning_rate": 0.004988833532279532, "loss": 8.8855, "step": 67600 }, { "epoch": 0.27579492182924953, "grad_norm": 8.408965110778809, "learning_rate": 0.004988786600277165, "loss": 8.8519, "step": 67700 }, { "epoch": 0.276202299852631, "grad_norm": 8.768186569213867, "learning_rate": 0.004988739570077424, "loss": 8.6812, "step": 67800 }, { "epoch": 0.2766096778760125, "grad_norm": 11.160076141357422, "learning_rate": 0.004988692441682174, "loss": 8.6409, "step": 67900 }, { "epoch": 0.2770170558993939, "grad_norm": 5.295092582702637, "learning_rate": 0.004988645215093282, "loss": 8.6146, "step": 68000 }, { "epoch": 0.2770170558993939, "eval_MaskedAccuracy": 0.4649399672145674, "eval_loss": 1.8226548433303833, "eval_runtime": 1226.5993, "eval_samples_per_second": 51.75, "eval_steps_per_second": 0.202, "step": 68000 }, { "epoch": 0.2774244339227754, "grad_norm": 6.942083835601807, "learning_rate": 0.004988597890312612, "loss": 8.6312, "step": 68100 }, { "epoch": 0.27783181194615686, "grad_norm": 6.901147842407227, "learning_rate": 0.004988550467342032, "loss": 8.6041, "step": 68200 }, { "epoch": 0.2782391899695383, "grad_norm": 2.903517961502075, "learning_rate": 0.0049885029461834195, "loss": 8.5955, "step": 68300 }, { "epoch": 0.27864656799291976, "grad_norm": 5.638279438018799, "learning_rate": 0.004988455326838654, "loss": 8.7353, "step": 68400 }, { "epoch": 0.27905394601630124, "grad_norm": 6.898649215698242, "learning_rate": 0.004988407609309608, "loss": 8.7646, "step": 68500 }, { "epoch": 0.2794613240396827, "grad_norm": 4.677414894104004, "learning_rate": 0.004988359793598184, "loss": 8.6375, "step": 68600 }, { "epoch": 0.27986870206306413, "grad_norm": 4.530789852142334, "learning_rate": 0.004988311879706264, "loss": 8.5985, "step": 68700 }, { "epoch": 0.2802760800864456, "grad_norm": 7.5350661277771, "learning_rate": 0.004988263867635745, "loss": 8.6005, "step": 68800 }, { "epoch": 0.2806834581098271, "grad_norm": 8.78704833984375, "learning_rate": 0.004988215757388528, "loss": 8.6037, "step": 68900 }, { "epoch": 0.28109083613320857, "grad_norm": 4.651316165924072, "learning_rate": 0.004988167548966506, "loss": 8.683, "step": 69000 }, { "epoch": 0.28109083613320857, "eval_MaskedAccuracy": 0.4591831419850631, "eval_loss": 1.8432483673095703, "eval_runtime": 945.5524, "eval_samples_per_second": 67.131, "eval_steps_per_second": 0.262, "step": 69000 }, { "epoch": 0.28149821415659, "grad_norm": 4.739138603210449, "learning_rate": 0.004988119242371587, "loss": 8.6377, "step": 69100 }, { "epoch": 0.28190559217997146, "grad_norm": 5.9082818031311035, "learning_rate": 0.004988070837605688, "loss": 8.6987, "step": 69200 }, { "epoch": 0.28231297020335294, "grad_norm": 4.415525436401367, "learning_rate": 0.0049880223346707225, "loss": 8.8657, "step": 69300 }, { "epoch": 0.28272034822673436, "grad_norm": 3.6525769233703613, "learning_rate": 0.0049879737335686005, "loss": 8.7889, "step": 69400 }, { "epoch": 0.28312772625011584, "grad_norm": 1.016323447227478, "learning_rate": 0.004987925034301244, "loss": 8.7423, "step": 69500 }, { "epoch": 0.2835351042734973, "grad_norm": 4.312750339508057, "learning_rate": 0.004987876236870586, "loss": 8.8335, "step": 69600 }, { "epoch": 0.2839424822968788, "grad_norm": 8.026240348815918, "learning_rate": 0.00498782734127855, "loss": 8.6856, "step": 69700 }, { "epoch": 0.2843498603202602, "grad_norm": 1.7114349603652954, "learning_rate": 0.004987778347527068, "loss": 8.7786, "step": 69800 }, { "epoch": 0.2847572383436417, "grad_norm": 4.468791961669922, "learning_rate": 0.004987729255618083, "loss": 8.7087, "step": 69900 }, { "epoch": 0.28516461636702317, "grad_norm": 5.468358516693115, "learning_rate": 0.004987680065553528, "loss": 8.7154, "step": 70000 }, { "epoch": 0.28516461636702317, "eval_MaskedAccuracy": 0.46311365588665654, "eval_loss": 1.8235976696014404, "eval_runtime": 975.8884, "eval_samples_per_second": 65.044, "eval_steps_per_second": 0.254, "step": 70000 }, { "epoch": 0.2855719943904046, "grad_norm": 4.024952411651611, "learning_rate": 0.004987630777335353, "loss": 8.6081, "step": 70100 }, { "epoch": 0.28597937241378607, "grad_norm": 10.850265502929688, "learning_rate": 0.0049875813909655, "loss": 8.6298, "step": 70200 }, { "epoch": 0.28638675043716755, "grad_norm": 6.1410441398620605, "learning_rate": 0.004987531906445936, "loss": 8.7084, "step": 70300 }, { "epoch": 0.286794128460549, "grad_norm": 4.831986427307129, "learning_rate": 0.004987482323778607, "loss": 8.8194, "step": 70400 }, { "epoch": 0.28720150648393045, "grad_norm": 3.9731411933898926, "learning_rate": 0.004987432642965471, "loss": 8.7009, "step": 70500 }, { "epoch": 0.2876088845073119, "grad_norm": 5.253785610198975, "learning_rate": 0.00498738286400849, "loss": 8.7323, "step": 70600 }, { "epoch": 0.2880162625306934, "grad_norm": 5.914802551269531, "learning_rate": 0.004987332986909641, "loss": 8.682, "step": 70700 }, { "epoch": 0.2884236405540748, "grad_norm": 9.221420288085938, "learning_rate": 0.004987283011670893, "loss": 8.6853, "step": 70800 }, { "epoch": 0.2888310185774563, "grad_norm": 6.638362884521484, "learning_rate": 0.004987232938294225, "loss": 8.7717, "step": 70900 }, { "epoch": 0.2892383966008378, "grad_norm": 8.450394630432129, "learning_rate": 0.004987182766781605, "loss": 8.6537, "step": 71000 }, { "epoch": 0.2892383966008378, "eval_MaskedAccuracy": 0.46501880714079386, "eval_loss": 1.813057541847229, "eval_runtime": 1393.548, "eval_samples_per_second": 45.55, "eval_steps_per_second": 0.178, "step": 71000 }, { "epoch": 0.28964577462421925, "grad_norm": 6.9521942138671875, "learning_rate": 0.004987132497135028, "loss": 8.6107, "step": 71100 }, { "epoch": 0.2900531526476007, "grad_norm": 1.0710653066635132, "learning_rate": 0.004987082129356481, "loss": 8.6578, "step": 71200 }, { "epoch": 0.29046053067098215, "grad_norm": 8.927386283874512, "learning_rate": 0.004987031663447951, "loss": 8.7727, "step": 71300 }, { "epoch": 0.29086790869436363, "grad_norm": 35.31528854370117, "learning_rate": 0.00498698109941143, "loss": 8.6934, "step": 71400 }, { "epoch": 0.2912752867177451, "grad_norm": 1.607964038848877, "learning_rate": 0.004986930437248926, "loss": 8.7996, "step": 71500 }, { "epoch": 0.29168266474112653, "grad_norm": 6.970991611480713, "learning_rate": 0.0049868796769624415, "loss": 8.8118, "step": 71600 }, { "epoch": 0.292090042764508, "grad_norm": 9.207018852233887, "learning_rate": 0.004986828818553978, "loss": 8.8205, "step": 71700 }, { "epoch": 0.2924974207878895, "grad_norm": 4.5547332763671875, "learning_rate": 0.004986777862025549, "loss": 8.7335, "step": 71800 }, { "epoch": 0.2929047988112709, "grad_norm": 3.7256174087524414, "learning_rate": 0.004986726807379168, "loss": 8.6877, "step": 71900 }, { "epoch": 0.2933121768346524, "grad_norm": 5.260006904602051, "learning_rate": 0.0049866756546168515, "loss": 8.7415, "step": 72000 }, { "epoch": 0.2933121768346524, "eval_MaskedAccuracy": 0.46264094166115644, "eval_loss": 1.828948736190796, "eval_runtime": 598.5044, "eval_samples_per_second": 106.058, "eval_steps_per_second": 0.414, "step": 72000 }, { "epoch": 0.29371955485803386, "grad_norm": 7.642179012298584, "learning_rate": 0.004986624403740618, "loss": 8.6125, "step": 72100 }, { "epoch": 0.29412693288141534, "grad_norm": 8.284234046936035, "learning_rate": 0.004986573054752507, "loss": 8.6121, "step": 72200 }, { "epoch": 0.29453431090479676, "grad_norm": 7.844073295593262, "learning_rate": 0.0049865216076545385, "loss": 8.5973, "step": 72300 }, { "epoch": 0.29494168892817824, "grad_norm": 8.898551940917969, "learning_rate": 0.004986470062448749, "loss": 8.5608, "step": 72400 }, { "epoch": 0.2953490669515597, "grad_norm": 11.39210319519043, "learning_rate": 0.004986418419137175, "loss": 8.5659, "step": 72500 }, { "epoch": 0.29575644497494114, "grad_norm": 8.301942825317383, "learning_rate": 0.004986366677721854, "loss": 8.5845, "step": 72600 }, { "epoch": 0.2961638229983226, "grad_norm": 6.694902420043945, "learning_rate": 0.004986314838204841, "loss": 8.5574, "step": 72700 }, { "epoch": 0.2965712010217041, "grad_norm": 9.159902572631836, "learning_rate": 0.00498626290058818, "loss": 8.5525, "step": 72800 }, { "epoch": 0.29697857904508557, "grad_norm": 6.356729507446289, "learning_rate": 0.004986210864873929, "loss": 8.5978, "step": 72900 }, { "epoch": 0.297385957068467, "grad_norm": 8.628966331481934, "learning_rate": 0.004986158731064144, "loss": 8.559, "step": 73000 }, { "epoch": 0.297385957068467, "eval_MaskedAccuracy": 0.46742987719858947, "eval_loss": 1.810995101928711, "eval_runtime": 515.208, "eval_samples_per_second": 123.205, "eval_steps_per_second": 0.481, "step": 73000 }, { "epoch": 0.29779333509184847, "grad_norm": 9.128714561462402, "learning_rate": 0.004986106499160881, "loss": 8.62, "step": 73100 }, { "epoch": 0.29820071311522994, "grad_norm": 5.086336612701416, "learning_rate": 0.004986054169166205, "loss": 8.8034, "step": 73200 }, { "epoch": 0.29860809113861136, "grad_norm": 8.508166313171387, "learning_rate": 0.004986001741082194, "loss": 8.6297, "step": 73300 }, { "epoch": 0.29901546916199284, "grad_norm": 4.634451866149902, "learning_rate": 0.004985949214910917, "loss": 8.6497, "step": 73400 }, { "epoch": 0.2994228471853743, "grad_norm": 6.528726100921631, "learning_rate": 0.0049858965906544464, "loss": 8.5753, "step": 73500 }, { "epoch": 0.2998302252087558, "grad_norm": 5.7112555503845215, "learning_rate": 0.004985843868314864, "loss": 8.5338, "step": 73600 }, { "epoch": 0.3002376032321372, "grad_norm": 2.9792256355285645, "learning_rate": 0.00498579104789425, "loss": 8.5698, "step": 73700 }, { "epoch": 0.3006449812555187, "grad_norm": 3.0966081619262695, "learning_rate": 0.0049857381293947015, "loss": 8.87, "step": 73800 }, { "epoch": 0.3010523592789002, "grad_norm": 4.575057029724121, "learning_rate": 0.004985685112818309, "loss": 8.6763, "step": 73900 }, { "epoch": 0.30145973730228165, "grad_norm": 7.030023097991943, "learning_rate": 0.004985631998167159, "loss": 8.601, "step": 74000 }, { "epoch": 0.30145973730228165, "eval_MaskedAccuracy": 0.46606080791487003, "eval_loss": 1.8151373863220215, "eval_runtime": 682.5644, "eval_samples_per_second": 92.996, "eval_steps_per_second": 0.363, "step": 74000 }, { "epoch": 0.30186711532566307, "grad_norm": 9.47155475616455, "learning_rate": 0.004985578785443366, "loss": 8.582, "step": 74100 }, { "epoch": 0.30227449334904455, "grad_norm": 8.0533447265625, "learning_rate": 0.004985525474649029, "loss": 8.6827, "step": 74200 }, { "epoch": 0.302681871372426, "grad_norm": 5.068324565887451, "learning_rate": 0.004985472065786245, "loss": 8.5732, "step": 74300 }, { "epoch": 0.30308924939580745, "grad_norm": 6.542651176452637, "learning_rate": 0.004985418558857134, "loss": 8.5886, "step": 74400 }, { "epoch": 0.3034966274191889, "grad_norm": 5.114233493804932, "learning_rate": 0.00498536495386381, "loss": 8.5641, "step": 74500 }, { "epoch": 0.3039040054425704, "grad_norm": 1.2797762155532837, "learning_rate": 0.004985311250808395, "loss": 8.6368, "step": 74600 }, { "epoch": 0.3043113834659519, "grad_norm": 9.820923805236816, "learning_rate": 0.0049852574496930145, "loss": 8.903, "step": 74700 }, { "epoch": 0.3047187614893333, "grad_norm": 2.4579174518585205, "learning_rate": 0.004985203550519789, "loss": 8.8724, "step": 74800 }, { "epoch": 0.3051261395127148, "grad_norm": 6.749464511871338, "learning_rate": 0.004985149553290851, "loss": 8.7658, "step": 74900 }, { "epoch": 0.30553351753609626, "grad_norm": 2.1830623149871826, "learning_rate": 0.0049850954580083415, "loss": 8.6716, "step": 75000 }, { "epoch": 0.30553351753609626, "eval_MaskedAccuracy": 0.45829476062798746, "eval_loss": 1.8497532606124878, "eval_runtime": 556.5347, "eval_samples_per_second": 114.056, "eval_steps_per_second": 0.446, "step": 75000 }, { "epoch": 0.3059408955594777, "grad_norm": 8.475727081298828, "learning_rate": 0.004985041264674393, "loss": 8.8036, "step": 75100 }, { "epoch": 0.30634827358285915, "grad_norm": 7.727541923522949, "learning_rate": 0.0049849869732911485, "loss": 8.7284, "step": 75200 }, { "epoch": 0.30675565160624063, "grad_norm": 6.423786163330078, "learning_rate": 0.004984932583860751, "loss": 8.6141, "step": 75300 }, { "epoch": 0.3071630296296221, "grad_norm": 7.17932653427124, "learning_rate": 0.004984878096385355, "loss": 8.5934, "step": 75400 }, { "epoch": 0.30757040765300353, "grad_norm": 5.807397365570068, "learning_rate": 0.0049848235108671205, "loss": 8.5996, "step": 75500 }, { "epoch": 0.307977785676385, "grad_norm": 9.838018417358398, "learning_rate": 0.004984768827308196, "loss": 8.5559, "step": 75600 }, { "epoch": 0.3083851636997665, "grad_norm": 14.574153900146484, "learning_rate": 0.004984714045710747, "loss": 8.6299, "step": 75700 }, { "epoch": 0.3087925417231479, "grad_norm": 5.1707658767700195, "learning_rate": 0.004984659166076941, "loss": 8.7743, "step": 75800 }, { "epoch": 0.3091999197465294, "grad_norm": 5.780153751373291, "learning_rate": 0.004984604188408943, "loss": 8.7648, "step": 75900 }, { "epoch": 0.30960729776991086, "grad_norm": 7.285679817199707, "learning_rate": 0.004984549112708933, "loss": 8.6375, "step": 76000 }, { "epoch": 0.30960729776991086, "eval_MaskedAccuracy": 0.4661069620890094, "eval_loss": 1.8157120943069458, "eval_runtime": 640.5674, "eval_samples_per_second": 99.093, "eval_steps_per_second": 0.387, "step": 76000 }, { "epoch": 0.31001467579329234, "grad_norm": 6.567680835723877, "learning_rate": 0.0049844939389790816, "loss": 8.5938, "step": 76100 }, { "epoch": 0.31042205381667376, "grad_norm": 4.858399868011475, "learning_rate": 0.00498443866722158, "loss": 8.5711, "step": 76200 }, { "epoch": 0.31082943184005524, "grad_norm": 5.496742248535156, "learning_rate": 0.004984383297438603, "loss": 8.578, "step": 76300 }, { "epoch": 0.3112368098634367, "grad_norm": 5.734500885009766, "learning_rate": 0.004984327829632341, "loss": 8.5659, "step": 76400 }, { "epoch": 0.3116441878868182, "grad_norm": 4.251106262207031, "learning_rate": 0.004984272263804993, "loss": 8.5555, "step": 76500 }, { "epoch": 0.3120515659101996, "grad_norm": 4.126178741455078, "learning_rate": 0.0049842165999587525, "loss": 8.5562, "step": 76600 }, { "epoch": 0.3124589439335811, "grad_norm": 8.147299766540527, "learning_rate": 0.004984160838095821, "loss": 8.5382, "step": 76700 }, { "epoch": 0.31286632195696257, "grad_norm": 4.034787654876709, "learning_rate": 0.004984104978218403, "loss": 8.69, "step": 76800 }, { "epoch": 0.313273699980344, "grad_norm": 1.8307220935821533, "learning_rate": 0.004984049020328701, "loss": 8.7546, "step": 76900 }, { "epoch": 0.31368107800372547, "grad_norm": 6.058050632476807, "learning_rate": 0.004983992964428932, "loss": 8.6887, "step": 77000 }, { "epoch": 0.31368107800372547, "eval_MaskedAccuracy": 0.45887611000533407, "eval_loss": 1.8508230447769165, "eval_runtime": 513.6801, "eval_samples_per_second": 123.571, "eval_steps_per_second": 0.483, "step": 77000 }, { "epoch": 0.31408845602710694, "grad_norm": 8.168664932250977, "learning_rate": 0.0049839368105213154, "loss": 8.7375, "step": 77100 }, { "epoch": 0.3144958340504884, "grad_norm": 4.794949054718018, "learning_rate": 0.0049838805586080665, "loss": 8.6035, "step": 77200 }, { "epoch": 0.31490321207386984, "grad_norm": 3.7873361110687256, "learning_rate": 0.004983824208691406, "loss": 8.5769, "step": 77300 }, { "epoch": 0.3153105900972513, "grad_norm": 9.073785781860352, "learning_rate": 0.004983767760773574, "loss": 8.568, "step": 77400 }, { "epoch": 0.3157179681206328, "grad_norm": 8.142977714538574, "learning_rate": 0.004983711214856787, "loss": 8.5618, "step": 77500 }, { "epoch": 0.3161253461440142, "grad_norm": 6.451813220977783, "learning_rate": 0.0049836545709432975, "loss": 8.5512, "step": 77600 }, { "epoch": 0.3165327241673957, "grad_norm": 10.494230270385742, "learning_rate": 0.004983597829035338, "loss": 8.5219, "step": 77700 }, { "epoch": 0.3169401021907772, "grad_norm": 3.895531177520752, "learning_rate": 0.004983540989135144, "loss": 8.6625, "step": 77800 }, { "epoch": 0.31734748021415865, "grad_norm": 8.601829528808594, "learning_rate": 0.004983484051244962, "loss": 8.59, "step": 77900 }, { "epoch": 0.31775485823754007, "grad_norm": 6.697279453277588, "learning_rate": 0.004983427015367055, "loss": 8.5592, "step": 78000 }, { "epoch": 0.31775485823754007, "eval_MaskedAccuracy": 0.46543396075038707, "eval_loss": 1.8149820566177368, "eval_runtime": 577.5418, "eval_samples_per_second": 109.907, "eval_steps_per_second": 0.429, "step": 78000 }, { "epoch": 0.31816223626092155, "grad_norm": 8.476865768432617, "learning_rate": 0.004983369881503679, "loss": 8.7323, "step": 78100 }, { "epoch": 0.318569614284303, "grad_norm": 5.520491123199463, "learning_rate": 0.004983312649657079, "loss": 8.5741, "step": 78200 }, { "epoch": 0.31897699230768445, "grad_norm": 8.3047513961792, "learning_rate": 0.004983255319829533, "loss": 8.5857, "step": 78300 }, { "epoch": 0.3193843703310659, "grad_norm": 10.212699890136719, "learning_rate": 0.004983197892023291, "loss": 8.5407, "step": 78400 }, { "epoch": 0.3197917483544474, "grad_norm": 6.702394962310791, "learning_rate": 0.004983140366240634, "loss": 8.5472, "step": 78500 }, { "epoch": 0.3201991263778289, "grad_norm": 3.3476109504699707, "learning_rate": 0.0049830827424838416, "loss": 8.6871, "step": 78600 }, { "epoch": 0.3206065044012103, "grad_norm": 4.181885242462158, "learning_rate": 0.0049830250207551765, "loss": 8.8476, "step": 78700 }, { "epoch": 0.3210138824245918, "grad_norm": 6.616336345672607, "learning_rate": 0.004982967201056927, "loss": 8.8723, "step": 78800 }, { "epoch": 0.32142126044797326, "grad_norm": 5.890897750854492, "learning_rate": 0.0049829092833913865, "loss": 8.6229, "step": 78900 }, { "epoch": 0.3218286384713547, "grad_norm": 8.629895210266113, "learning_rate": 0.004982851267760835, "loss": 8.5627, "step": 79000 }, { "epoch": 0.3218286384713547, "eval_MaskedAccuracy": 0.4663628527517574, "eval_loss": 1.8107352256774902, "eval_runtime": 485.2463, "eval_samples_per_second": 130.812, "eval_steps_per_second": 0.511, "step": 79000 }, { "epoch": 0.32223601649473615, "grad_norm": 4.689969539642334, "learning_rate": 0.0049827931541675695, "loss": 8.5525, "step": 79100 }, { "epoch": 0.32264339451811763, "grad_norm": 7.801525592803955, "learning_rate": 0.004982734942613896, "loss": 8.5295, "step": 79200 }, { "epoch": 0.3230507725414991, "grad_norm": 5.901822566986084, "learning_rate": 0.00498267663310211, "loss": 8.5588, "step": 79300 }, { "epoch": 0.32345815056488053, "grad_norm": 4.35959529876709, "learning_rate": 0.004982618225634512, "loss": 8.6651, "step": 79400 }, { "epoch": 0.323865528588262, "grad_norm": 6.286835670471191, "learning_rate": 0.004982559720213415, "loss": 8.5631, "step": 79500 }, { "epoch": 0.3242729066116435, "grad_norm": 5.67689847946167, "learning_rate": 0.004982501116841123, "loss": 8.5648, "step": 79600 }, { "epoch": 0.32468028463502496, "grad_norm": 7.631079196929932, "learning_rate": 0.0049824424155199614, "loss": 8.5563, "step": 79700 }, { "epoch": 0.3250876626584064, "grad_norm": 9.412803649902344, "learning_rate": 0.00498238361625226, "loss": 8.5585, "step": 79800 }, { "epoch": 0.32549504068178786, "grad_norm": 8.735962867736816, "learning_rate": 0.004982324719040331, "loss": 8.5384, "step": 79900 }, { "epoch": 0.32590241870516934, "grad_norm": 14.457242965698242, "learning_rate": 0.004982265723886508, "loss": 8.5336, "step": 80000 }, { "epoch": 0.32590241870516934, "eval_MaskedAccuracy": 0.4618274738562086, "eval_loss": 1.8267515897750854, "eval_runtime": 545.2404, "eval_samples_per_second": 116.418, "eval_steps_per_second": 0.455, "step": 80000 }, { "epoch": 0.32630979672855076, "grad_norm": 5.96212100982666, "learning_rate": 0.004982206630793117, "loss": 8.8503, "step": 80100 }, { "epoch": 0.32671717475193224, "grad_norm": 18.13055992126465, "learning_rate": 0.004982147439762501, "loss": 8.842, "step": 80200 }, { "epoch": 0.3271245527753137, "grad_norm": 20.262971878051758, "learning_rate": 0.004982088150796999, "loss": 8.8396, "step": 80300 }, { "epoch": 0.3275319307986952, "grad_norm": 9.294783592224121, "learning_rate": 0.004982028763898953, "loss": 8.7368, "step": 80400 }, { "epoch": 0.3279393088220766, "grad_norm": 5.545958518981934, "learning_rate": 0.00498196927907071, "loss": 8.5956, "step": 80500 }, { "epoch": 0.3283466868454581, "grad_norm": 5.721169948577881, "learning_rate": 0.004981909696314625, "loss": 8.5734, "step": 80600 }, { "epoch": 0.32875406486883957, "grad_norm": 5.6723761558532715, "learning_rate": 0.004981850015633058, "loss": 8.5697, "step": 80700 }, { "epoch": 0.329161442892221, "grad_norm": 6.1140031814575195, "learning_rate": 0.0049817902370283594, "loss": 8.5231, "step": 80800 }, { "epoch": 0.32956882091560247, "grad_norm": 5.004663944244385, "learning_rate": 0.004981730360502891, "loss": 8.6437, "step": 80900 }, { "epoch": 0.32997619893898394, "grad_norm": 9.134260177612305, "learning_rate": 0.00498167038605903, "loss": 8.7893, "step": 81000 }, { "epoch": 0.32997619893898394, "eval_MaskedAccuracy": 0.4559564923524422, "eval_loss": 1.8662664890289307, "eval_runtime": 580.9179, "eval_samples_per_second": 109.268, "eval_steps_per_second": 0.427, "step": 81000 }, { "epoch": 0.3303835769623654, "grad_norm": 7.398022174835205, "learning_rate": 0.004981610313699138, "loss": 8.7318, "step": 81100 }, { "epoch": 0.33079095498574684, "grad_norm": 10.874109268188477, "learning_rate": 0.0049815501434255964, "loss": 8.6043, "step": 81200 }, { "epoch": 0.3311983330091283, "grad_norm": 7.2064690589904785, "learning_rate": 0.0049814898752407796, "loss": 8.5529, "step": 81300 }, { "epoch": 0.3316057110325098, "grad_norm": 13.496102333068848, "learning_rate": 0.004981429509147076, "loss": 8.5061, "step": 81400 }, { "epoch": 0.3320130890558912, "grad_norm": 7.787205696105957, "learning_rate": 0.004981369045146868, "loss": 8.5397, "step": 81500 }, { "epoch": 0.3324204670792727, "grad_norm": 5.602849006652832, "learning_rate": 0.004981308483242548, "loss": 8.4998, "step": 81600 }, { "epoch": 0.3328278451026542, "grad_norm": 8.219766616821289, "learning_rate": 0.0049812478234365085, "loss": 8.5309, "step": 81700 }, { "epoch": 0.33323522312603565, "grad_norm": 6.64454984664917, "learning_rate": 0.004981187065731154, "loss": 8.6075, "step": 81800 }, { "epoch": 0.3336426011494171, "grad_norm": 7.264410018920898, "learning_rate": 0.004981126210128876, "loss": 8.5623, "step": 81900 }, { "epoch": 0.33404997917279855, "grad_norm": 9.636812210083008, "learning_rate": 0.004981065256632084, "loss": 8.5134, "step": 82000 }, { "epoch": 0.33404997917279855, "eval_MaskedAccuracy": 0.4689584496984937, "eval_loss": 1.8067864179611206, "eval_runtime": 597.0935, "eval_samples_per_second": 106.308, "eval_steps_per_second": 0.415, "step": 82000 }, { "epoch": 0.33445735719618, "grad_norm": 8.728602409362793, "learning_rate": 0.0049810042052431955, "loss": 8.5174, "step": 82100 }, { "epoch": 0.3348647352195615, "grad_norm": 7.863314628601074, "learning_rate": 0.004980943055964616, "loss": 8.7261, "step": 82200 }, { "epoch": 0.3352721132429429, "grad_norm": 5.752391338348389, "learning_rate": 0.004980881808798764, "loss": 8.7496, "step": 82300 }, { "epoch": 0.3356794912663244, "grad_norm": 6.728067874908447, "learning_rate": 0.004980820463748064, "loss": 8.5905, "step": 82400 }, { "epoch": 0.3360868692897059, "grad_norm": 4.3324995040893555, "learning_rate": 0.004980759020814935, "loss": 8.5585, "step": 82500 }, { "epoch": 0.3364942473130873, "grad_norm": 3.047788619995117, "learning_rate": 0.004980697480001814, "loss": 8.5441, "step": 82600 }, { "epoch": 0.3369016253364688, "grad_norm": 7.090338706970215, "learning_rate": 0.004980635841311131, "loss": 8.6671, "step": 82700 }, { "epoch": 0.33730900335985026, "grad_norm": 5.892022132873535, "learning_rate": 0.004980574104745324, "loss": 8.5548, "step": 82800 }, { "epoch": 0.33771638138323173, "grad_norm": 8.809314727783203, "learning_rate": 0.004980512270306831, "loss": 8.5253, "step": 82900 }, { "epoch": 0.33812375940661316, "grad_norm": 8.679837226867676, "learning_rate": 0.004980450337998095, "loss": 8.5425, "step": 83000 }, { "epoch": 0.33812375940661316, "eval_MaskedAccuracy": 0.46861824620965875, "eval_loss": 1.7987737655639648, "eval_runtime": 643.4937, "eval_samples_per_second": 98.643, "eval_steps_per_second": 0.385, "step": 83000 }, { "epoch": 0.33853113742999463, "grad_norm": 24.381132125854492, "learning_rate": 0.0049803883078215734, "loss": 8.6264, "step": 83100 }, { "epoch": 0.3389385154533761, "grad_norm": 5.536754131317139, "learning_rate": 0.0049803261797797095, "loss": 8.7931, "step": 83200 }, { "epoch": 0.33934589347675753, "grad_norm": 8.775206565856934, "learning_rate": 0.004980263953874957, "loss": 8.7717, "step": 83300 }, { "epoch": 0.339753271500139, "grad_norm": 3.58613920211792, "learning_rate": 0.004980201630109785, "loss": 8.725, "step": 83400 }, { "epoch": 0.3401606495235205, "grad_norm": 5.676400184631348, "learning_rate": 0.004980139208486654, "loss": 8.6465, "step": 83500 }, { "epoch": 0.34056802754690196, "grad_norm": 8.117194175720215, "learning_rate": 0.004980076689008034, "loss": 8.5568, "step": 83600 }, { "epoch": 0.3409754055702834, "grad_norm": 8.87285327911377, "learning_rate": 0.004980014071676394, "loss": 8.5427, "step": 83700 }, { "epoch": 0.34138278359366486, "grad_norm": 6.456417560577393, "learning_rate": 0.004979951356494205, "loss": 8.5264, "step": 83800 }, { "epoch": 0.34179016161704634, "grad_norm": 5.127048969268799, "learning_rate": 0.004979888543463963, "loss": 8.6865, "step": 83900 }, { "epoch": 0.34219753964042776, "grad_norm": 6.285412788391113, "learning_rate": 0.004979825632588136, "loss": 8.6639, "step": 84000 }, { "epoch": 0.34219753964042776, "eval_MaskedAccuracy": 0.46654122265143805, "eval_loss": 1.8142414093017578, "eval_runtime": 591.5507, "eval_samples_per_second": 107.304, "eval_steps_per_second": 0.419, "step": 84000 }, { "epoch": 0.34260491766380924, "grad_norm": 6.347151279449463, "learning_rate": 0.004979762623869219, "loss": 8.567, "step": 84100 }, { "epoch": 0.3430122956871907, "grad_norm": 6.459830284118652, "learning_rate": 0.0049796995173096974, "loss": 8.5622, "step": 84200 }, { "epoch": 0.3434196737105722, "grad_norm": 7.853311061859131, "learning_rate": 0.004979636312912063, "loss": 8.5209, "step": 84300 }, { "epoch": 0.3438270517339536, "grad_norm": 7.669728755950928, "learning_rate": 0.0049795730106788234, "loss": 8.4813, "step": 84400 }, { "epoch": 0.3442344297573351, "grad_norm": 10.450735092163086, "learning_rate": 0.004979509610612482, "loss": 8.6282, "step": 84500 }, { "epoch": 0.34464180778071657, "grad_norm": 7.543275356292725, "learning_rate": 0.004979446112715538, "loss": 8.6334, "step": 84600 }, { "epoch": 0.34504918580409805, "grad_norm": 4.0467023849487305, "learning_rate": 0.004979382516990501, "loss": 8.7694, "step": 84700 }, { "epoch": 0.34545656382747947, "grad_norm": 8.46611213684082, "learning_rate": 0.0049793188234398865, "loss": 8.6023, "step": 84800 }, { "epoch": 0.34586394185086095, "grad_norm": 6.743054389953613, "learning_rate": 0.004979255032066221, "loss": 8.5541, "step": 84900 }, { "epoch": 0.3462713198742424, "grad_norm": 7.6256585121154785, "learning_rate": 0.004979191142872017, "loss": 8.5375, "step": 85000 }, { "epoch": 0.3462713198742424, "eval_MaskedAccuracy": 0.4691629318915052, "eval_loss": 1.7983323335647583, "eval_runtime": 544.0842, "eval_samples_per_second": 116.666, "eval_steps_per_second": 0.456, "step": 85000 }, { "epoch": 0.34667869789762384, "grad_norm": 1.0273075103759766, "learning_rate": 0.004979127155859808, "loss": 8.5629, "step": 85100 }, { "epoch": 0.3470860759210053, "grad_norm": 9.683170318603516, "learning_rate": 0.004979063071032122, "loss": 8.7399, "step": 85200 }, { "epoch": 0.3474934539443868, "grad_norm": 7.662843704223633, "learning_rate": 0.004978998888391494, "loss": 8.5619, "step": 85300 }, { "epoch": 0.3479008319677683, "grad_norm": 12.199478149414062, "learning_rate": 0.004978934607940453, "loss": 8.5566, "step": 85400 }, { "epoch": 0.3483082099911497, "grad_norm": 7.151951789855957, "learning_rate": 0.0049788702296815444, "loss": 8.5126, "step": 85500 }, { "epoch": 0.3487155880145312, "grad_norm": 8.834843635559082, "learning_rate": 0.004978805753617314, "loss": 8.5281, "step": 85600 }, { "epoch": 0.34912296603791265, "grad_norm": 7.378878593444824, "learning_rate": 0.004978741179750311, "loss": 8.5081, "step": 85700 }, { "epoch": 0.3495303440612941, "grad_norm": 5.579213619232178, "learning_rate": 0.004978676508083091, "loss": 8.48, "step": 85800 }, { "epoch": 0.34993772208467555, "grad_norm": 5.404116153717041, "learning_rate": 0.004978611738618207, "loss": 8.7612, "step": 85900 }, { "epoch": 0.35034510010805703, "grad_norm": 7.545348167419434, "learning_rate": 0.0049785468713582255, "loss": 8.5953, "step": 86000 }, { "epoch": 0.35034510010805703, "eval_MaskedAccuracy": 0.4679827090386452, "eval_loss": 1.8021892309188843, "eval_runtime": 607.8016, "eval_samples_per_second": 104.435, "eval_steps_per_second": 0.408, "step": 86000 }, { "epoch": 0.3507524781314385, "grad_norm": 7.222796440124512, "learning_rate": 0.004978481906305703, "loss": 8.5486, "step": 86100 }, { "epoch": 0.3511598561548199, "grad_norm": 4.052384853363037, "learning_rate": 0.00497841684346321, "loss": 8.511, "step": 86200 }, { "epoch": 0.3515672341782014, "grad_norm": 13.966856002807617, "learning_rate": 0.004978351682833326, "loss": 8.5318, "step": 86300 }, { "epoch": 0.3519746122015829, "grad_norm": 1.1165910959243774, "learning_rate": 0.004978286424418617, "loss": 8.7596, "step": 86400 }, { "epoch": 0.3523819902249643, "grad_norm": 1.760972023010254, "learning_rate": 0.004978221068221663, "loss": 8.7927, "step": 86500 }, { "epoch": 0.3527893682483458, "grad_norm": 4.525177955627441, "learning_rate": 0.004978155614245055, "loss": 8.8414, "step": 86600 }, { "epoch": 0.35319674627172726, "grad_norm": 5.550790786743164, "learning_rate": 0.00497809006249138, "loss": 8.5977, "step": 86700 }, { "epoch": 0.35360412429510873, "grad_norm": 7.4307451248168945, "learning_rate": 0.00497802441296323, "loss": 8.631, "step": 86800 }, { "epoch": 0.35401150231849016, "grad_norm": 1.908689260482788, "learning_rate": 0.004977958665663198, "loss": 8.5632, "step": 86900 }, { "epoch": 0.35441888034187163, "grad_norm": 7.362282752990723, "learning_rate": 0.004977892820593886, "loss": 8.6296, "step": 87000 }, { "epoch": 0.35441888034187163, "eval_MaskedAccuracy": 0.46437557347380914, "eval_loss": 1.8164002895355225, "eval_runtime": 503.8685, "eval_samples_per_second": 125.977, "eval_steps_per_second": 0.492, "step": 87000 }, { "epoch": 0.3548262583652531, "grad_norm": 3.9842371940612793, "learning_rate": 0.004977826877757894, "loss": 8.5993, "step": 87100 }, { "epoch": 0.3552336363886346, "grad_norm": 6.476138591766357, "learning_rate": 0.004977760837157826, "loss": 8.6622, "step": 87200 }, { "epoch": 0.355641014412016, "grad_norm": 9.106497764587402, "learning_rate": 0.0049776946987962955, "loss": 8.5143, "step": 87300 }, { "epoch": 0.3560483924353975, "grad_norm": 5.475508689880371, "learning_rate": 0.004977628462675919, "loss": 8.5169, "step": 87400 }, { "epoch": 0.35645577045877896, "grad_norm": 9.564685821533203, "learning_rate": 0.004977562128799318, "loss": 8.5448, "step": 87500 }, { "epoch": 0.3568631484821604, "grad_norm": 3.261197328567505, "learning_rate": 0.004977495697169109, "loss": 8.6543, "step": 87600 }, { "epoch": 0.35727052650554186, "grad_norm": 11.773866653442383, "learning_rate": 0.004977429167787918, "loss": 8.6109, "step": 87700 }, { "epoch": 0.35767790452892334, "grad_norm": 5.849395275115967, "learning_rate": 0.004977362540658388, "loss": 8.7551, "step": 87800 }, { "epoch": 0.3580852825523048, "grad_norm": 6.478888511657715, "learning_rate": 0.00497729581578314, "loss": 8.6222, "step": 87900 }, { "epoch": 0.35849266057568624, "grad_norm": 10.449518203735352, "learning_rate": 0.004977228993164813, "loss": 8.5137, "step": 88000 }, { "epoch": 0.35849266057568624, "eval_MaskedAccuracy": 0.46903022098579256, "eval_loss": 1.7992993593215942, "eval_runtime": 596.9731, "eval_samples_per_second": 106.33, "eval_steps_per_second": 0.415, "step": 88000 }, { "epoch": 0.3589000385990677, "grad_norm": 4.7867302894592285, "learning_rate": 0.004977162072806052, "loss": 8.5145, "step": 88100 }, { "epoch": 0.3593074166224492, "grad_norm": 9.004382133483887, "learning_rate": 0.004977095054709505, "loss": 8.4894, "step": 88200 }, { "epoch": 0.3597147946458306, "grad_norm": 9.401776313781738, "learning_rate": 0.0049770279388778186, "loss": 8.4734, "step": 88300 }, { "epoch": 0.3601221726692121, "grad_norm": 9.095968246459961, "learning_rate": 0.004976960725313647, "loss": 8.4613, "step": 88400 }, { "epoch": 0.36052955069259357, "grad_norm": 9.433191299438477, "learning_rate": 0.004976893414019646, "loss": 8.4729, "step": 88500 }, { "epoch": 0.36093692871597505, "grad_norm": 7.030093193054199, "learning_rate": 0.004976826004998478, "loss": 8.4404, "step": 88600 }, { "epoch": 0.36134430673935647, "grad_norm": 4.911047458648682, "learning_rate": 0.004976758498252802, "loss": 8.5508, "step": 88700 }, { "epoch": 0.36175168476273795, "grad_norm": 12.30935287475586, "learning_rate": 0.004976690893785297, "loss": 8.5833, "step": 88800 }, { "epoch": 0.3621590627861194, "grad_norm": 12.394389152526855, "learning_rate": 0.004976623191598629, "loss": 8.5081, "step": 88900 }, { "epoch": 0.36256644080950085, "grad_norm": 11.784280776977539, "learning_rate": 0.004976555391695481, "loss": 8.4756, "step": 89000 }, { "epoch": 0.36256644080950085, "eval_MaskedAccuracy": 0.47060521979681946, "eval_loss": 1.7932844161987305, "eval_runtime": 467.7312, "eval_samples_per_second": 135.71, "eval_steps_per_second": 0.53, "step": 89000 }, { "epoch": 0.3629738188328823, "grad_norm": 11.131914138793945, "learning_rate": 0.004976487494078528, "loss": 8.4545, "step": 89100 }, { "epoch": 0.3633811968562638, "grad_norm": 8.43490219116211, "learning_rate": 0.0049764194987504526, "loss": 8.4633, "step": 89200 }, { "epoch": 0.3637885748796453, "grad_norm": 3.71230149269104, "learning_rate": 0.004976351405713957, "loss": 8.4831, "step": 89300 }, { "epoch": 0.3641959529030267, "grad_norm": 11.018815994262695, "learning_rate": 0.004976283214971711, "loss": 8.6283, "step": 89400 }, { "epoch": 0.3646033309264082, "grad_norm": 7.818960666656494, "learning_rate": 0.00497621492652643, "loss": 8.5877, "step": 89500 }, { "epoch": 0.36501070894978965, "grad_norm": 7.818572044372559, "learning_rate": 0.004976146540380799, "loss": 8.7524, "step": 89600 }, { "epoch": 0.3654180869731711, "grad_norm": 5.199432373046875, "learning_rate": 0.004976078056537536, "loss": 8.7144, "step": 89700 }, { "epoch": 0.36582546499655255, "grad_norm": 8.510984420776367, "learning_rate": 0.004976009474999339, "loss": 8.5381, "step": 89800 }, { "epoch": 0.36623284301993403, "grad_norm": 8.88054370880127, "learning_rate": 0.004975940795768925, "loss": 8.5168, "step": 89900 }, { "epoch": 0.3666402210433155, "grad_norm": 8.716156005859375, "learning_rate": 0.004975872018849006, "loss": 8.522, "step": 90000 }, { "epoch": 0.3666402210433155, "eval_MaskedAccuracy": 0.4706358157887873, "eval_loss": 1.7848535776138306, "eval_runtime": 594.3953, "eval_samples_per_second": 106.791, "eval_steps_per_second": 0.417, "step": 90000 }, { "epoch": 0.36704759906669693, "grad_norm": 10.945036888122559, "learning_rate": 0.0049758031442422935, "loss": 8.4675, "step": 90100 }, { "epoch": 0.3674549770900784, "grad_norm": 4.423859119415283, "learning_rate": 0.004975734171951515, "loss": 8.583, "step": 90200 }, { "epoch": 0.3678623551134599, "grad_norm": 10.427994728088379, "learning_rate": 0.004975665101979403, "loss": 8.7836, "step": 90300 }, { "epoch": 0.36826973313684136, "grad_norm": 9.24545669555664, "learning_rate": 0.004975595934328689, "loss": 8.759, "step": 90400 }, { "epoch": 0.3686771111602228, "grad_norm": 8.96117877960205, "learning_rate": 0.004975526669002103, "loss": 8.7391, "step": 90500 }, { "epoch": 0.36908448918360426, "grad_norm": 3.695530891418457, "learning_rate": 0.004975457306002383, "loss": 8.5696, "step": 90600 }, { "epoch": 0.36949186720698574, "grad_norm": 8.851283073425293, "learning_rate": 0.004975387845332268, "loss": 8.6098, "step": 90700 }, { "epoch": 0.36989924523036716, "grad_norm": 8.128582000732422, "learning_rate": 0.004975318286994518, "loss": 8.5255, "step": 90800 }, { "epoch": 0.37030662325374863, "grad_norm": 7.524702072143555, "learning_rate": 0.004975248630991871, "loss": 8.4705, "step": 90900 }, { "epoch": 0.3707140012771301, "grad_norm": 10.401105880737305, "learning_rate": 0.004975178877327081, "loss": 8.4252, "step": 91000 }, { "epoch": 0.3707140012771301, "eval_MaskedAccuracy": 0.47044726839432044, "eval_loss": 1.7901692390441895, "eval_runtime": 552.0048, "eval_samples_per_second": 114.992, "eval_steps_per_second": 0.449, "step": 91000 }, { "epoch": 0.3711213793005116, "grad_norm": 7.144312381744385, "learning_rate": 0.004975109026002911, "loss": 8.46, "step": 91100 }, { "epoch": 0.371528757323893, "grad_norm": 7.993409156799316, "learning_rate": 0.00497503907702212, "loss": 8.5332, "step": 91200 }, { "epoch": 0.3719361353472745, "grad_norm": 3.0928328037261963, "learning_rate": 0.004974969030387474, "loss": 8.7197, "step": 91300 }, { "epoch": 0.37234351337065597, "grad_norm": 11.095446586608887, "learning_rate": 0.004974898886101745, "loss": 8.5891, "step": 91400 }, { "epoch": 0.3727508913940374, "grad_norm": 10.600836753845215, "learning_rate": 0.004974828644167699, "loss": 8.6775, "step": 91500 }, { "epoch": 0.37315826941741886, "grad_norm": 9.994548797607422, "learning_rate": 0.0049747583045881205, "loss": 8.5214, "step": 91600 }, { "epoch": 0.37356564744080034, "grad_norm": 9.244461059570312, "learning_rate": 0.004974687867365784, "loss": 8.4631, "step": 91700 }, { "epoch": 0.3739730254641818, "grad_norm": 8.088507652282715, "learning_rate": 0.004974617332503477, "loss": 8.4735, "step": 91800 }, { "epoch": 0.37438040348756324, "grad_norm": 7.182349681854248, "learning_rate": 0.0049745467000039895, "loss": 8.4874, "step": 91900 }, { "epoch": 0.3747877815109447, "grad_norm": 3.551943302154541, "learning_rate": 0.004974475969870113, "loss": 8.4846, "step": 92000 }, { "epoch": 0.3747877815109447, "eval_MaskedAccuracy": 0.469990103072361, "eval_loss": 1.7921059131622314, "eval_runtime": 584.8413, "eval_samples_per_second": 108.535, "eval_steps_per_second": 0.424, "step": 92000 }, { "epoch": 0.3751951595343262, "grad_norm": 4.345943927764893, "learning_rate": 0.004974405142104643, "loss": 8.7141, "step": 92100 }, { "epoch": 0.3756025375577076, "grad_norm": 2.48052978515625, "learning_rate": 0.004974334216710381, "loss": 8.6144, "step": 92200 }, { "epoch": 0.3760099155810891, "grad_norm": 9.305281639099121, "learning_rate": 0.00497426319369013, "loss": 8.703, "step": 92300 }, { "epoch": 0.37641729360447057, "grad_norm": 8.072779655456543, "learning_rate": 0.0049741920730466994, "loss": 8.815, "step": 92400 }, { "epoch": 0.37682467162785205, "grad_norm": 7.126327991485596, "learning_rate": 0.004974120854782898, "loss": 8.5336, "step": 92500 }, { "epoch": 0.37723204965123347, "grad_norm": 10.760848999023438, "learning_rate": 0.004974049538901545, "loss": 8.5313, "step": 92600 }, { "epoch": 0.37763942767461495, "grad_norm": 8.534592628479004, "learning_rate": 0.004973978125405459, "loss": 8.4804, "step": 92700 }, { "epoch": 0.3780468056979964, "grad_norm": 6.255151748657227, "learning_rate": 0.004973906614297455, "loss": 8.4687, "step": 92800 }, { "epoch": 0.3784541837213779, "grad_norm": 10.237187385559082, "learning_rate": 0.004973835005580372, "loss": 8.4838, "step": 92900 }, { "epoch": 0.3788615617447593, "grad_norm": 3.644022226333618, "learning_rate": 0.004973763299257041, "loss": 8.5314, "step": 93000 }, { "epoch": 0.3788615617447593, "eval_MaskedAccuracy": 0.4678807769183222, "eval_loss": 1.7888926267623901, "eval_runtime": 535.7103, "eval_samples_per_second": 118.489, "eval_steps_per_second": 0.463, "step": 93000 }, { "epoch": 0.3792689397681408, "grad_norm": 8.058805465698242, "learning_rate": 0.0049736914953302895, "loss": 8.814, "step": 93100 }, { "epoch": 0.3796763177915223, "grad_norm": 4.705103874206543, "learning_rate": 0.0049736195938029585, "loss": 8.5721, "step": 93200 }, { "epoch": 0.3800836958149037, "grad_norm": 10.882928848266602, "learning_rate": 0.004973547594677897, "loss": 8.6773, "step": 93300 }, { "epoch": 0.3804910738382852, "grad_norm": 5.981499195098877, "learning_rate": 0.0049734754979579455, "loss": 8.551, "step": 93400 }, { "epoch": 0.38089845186166665, "grad_norm": 3.3413658142089844, "learning_rate": 0.004973403303645947, "loss": 8.6139, "step": 93500 }, { "epoch": 0.38130582988504813, "grad_norm": 3.6258671283721924, "learning_rate": 0.004973331011744769, "loss": 8.6114, "step": 93600 }, { "epoch": 0.38171320790842955, "grad_norm": 7.946436882019043, "learning_rate": 0.004973258622257264, "loss": 8.6811, "step": 93700 }, { "epoch": 0.38212058593181103, "grad_norm": 6.5823974609375, "learning_rate": 0.004973186135186295, "loss": 8.5271, "step": 93800 }, { "epoch": 0.3825279639551925, "grad_norm": 10.669353485107422, "learning_rate": 0.004973113550534733, "loss": 8.5442, "step": 93900 }, { "epoch": 0.38293534197857393, "grad_norm": 10.489116668701172, "learning_rate": 0.00497304086830544, "loss": 8.7497, "step": 94000 }, { "epoch": 0.38293534197857393, "eval_MaskedAccuracy": 0.45569368019319123, "eval_loss": 1.8612487316131592, "eval_runtime": 590.2393, "eval_samples_per_second": 107.543, "eval_steps_per_second": 0.42, "step": 94000 }, { "epoch": 0.3833427200019554, "grad_norm": 3.391709566116333, "learning_rate": 0.004972968088501293, "loss": 8.7032, "step": 94100 }, { "epoch": 0.3837500980253369, "grad_norm": 7.481471061706543, "learning_rate": 0.004972895211125167, "loss": 8.5595, "step": 94200 }, { "epoch": 0.38415747604871836, "grad_norm": 9.863663673400879, "learning_rate": 0.004972822236179941, "loss": 8.5347, "step": 94300 }, { "epoch": 0.3845648540720998, "grad_norm": 8.590025901794434, "learning_rate": 0.004972749163668501, "loss": 8.6051, "step": 94400 }, { "epoch": 0.38497223209548126, "grad_norm": 5.421779632568359, "learning_rate": 0.00497267599359374, "loss": 8.6653, "step": 94500 }, { "epoch": 0.38537961011886274, "grad_norm": 5.929932594299316, "learning_rate": 0.004972602725958551, "loss": 8.514, "step": 94600 }, { "epoch": 0.38578698814224416, "grad_norm": 5.623597145080566, "learning_rate": 0.0049725293607658224, "loss": 8.5508, "step": 94700 }, { "epoch": 0.38619436616562564, "grad_norm": 6.509292125701904, "learning_rate": 0.004972455898018459, "loss": 8.5169, "step": 94800 }, { "epoch": 0.3866017441890071, "grad_norm": 7.418100357055664, "learning_rate": 0.004972382337719371, "loss": 8.4773, "step": 94900 }, { "epoch": 0.3870091222123886, "grad_norm": 9.678910255432129, "learning_rate": 0.004972308679871465, "loss": 8.4783, "step": 95000 }, { "epoch": 0.3870091222123886, "eval_MaskedAccuracy": 0.47081274182536137, "eval_loss": 1.7964165210723877, "eval_runtime": 606.0193, "eval_samples_per_second": 104.743, "eval_steps_per_second": 0.409, "step": 95000 }, { "epoch": 0.38741650023577, "grad_norm": 5.7913994789123535, "learning_rate": 0.0049722349244776505, "loss": 8.4547, "step": 95100 }, { "epoch": 0.3878238782591515, "grad_norm": 8.161794662475586, "learning_rate": 0.0049721610715408445, "loss": 8.4734, "step": 95200 }, { "epoch": 0.38823125628253297, "grad_norm": 6.818782806396484, "learning_rate": 0.004972087121063967, "loss": 8.471, "step": 95300 }, { "epoch": 0.38863863430591444, "grad_norm": 7.340078830718994, "learning_rate": 0.004972013073049942, "loss": 8.4317, "step": 95400 }, { "epoch": 0.38904601232929586, "grad_norm": 6.700837135314941, "learning_rate": 0.004971938927501688, "loss": 8.4477, "step": 95500 }, { "epoch": 0.38945339035267734, "grad_norm": 13.21121883392334, "learning_rate": 0.004971864684422152, "loss": 8.4427, "step": 95600 }, { "epoch": 0.3898607683760588, "grad_norm": 11.779099464416504, "learning_rate": 0.00497179034381426, "loss": 8.664, "step": 95700 }, { "epoch": 0.39026814639944024, "grad_norm": 3.3817567825317383, "learning_rate": 0.0049717159056809545, "loss": 8.7076, "step": 95800 }, { "epoch": 0.3906755244228217, "grad_norm": 8.147451400756836, "learning_rate": 0.004971641370025169, "loss": 8.5892, "step": 95900 }, { "epoch": 0.3910829024462032, "grad_norm": 12.229639053344727, "learning_rate": 0.00497156673684986, "loss": 8.4967, "step": 96000 }, { "epoch": 0.3910829024462032, "eval_MaskedAccuracy": 0.47064134862438395, "eval_loss": 1.8006526231765747, "eval_runtime": 587.9392, "eval_samples_per_second": 107.964, "eval_steps_per_second": 0.422, "step": 96000 }, { "epoch": 0.3914902804695847, "grad_norm": 8.995532989501953, "learning_rate": 0.004971492006157979, "loss": 8.463, "step": 96100 }, { "epoch": 0.3918976584929661, "grad_norm": 7.136062145233154, "learning_rate": 0.0049714171779524745, "loss": 8.4514, "step": 96200 }, { "epoch": 0.39230503651634757, "grad_norm": 5.498950481414795, "learning_rate": 0.004971342252236309, "loss": 8.4614, "step": 96300 }, { "epoch": 0.39271241453972905, "grad_norm": 7.159139633178711, "learning_rate": 0.004971267229012441, "loss": 8.4478, "step": 96400 }, { "epoch": 0.39311979256311047, "grad_norm": 10.60985279083252, "learning_rate": 0.0049711921082838395, "loss": 8.4169, "step": 96500 }, { "epoch": 0.39352717058649195, "grad_norm": 7.3362812995910645, "learning_rate": 0.004971116890053479, "loss": 8.4131, "step": 96600 }, { "epoch": 0.3939345486098734, "grad_norm": 7.946305274963379, "learning_rate": 0.0049710415743243214, "loss": 8.46, "step": 96700 }, { "epoch": 0.3943419266332549, "grad_norm": 2.3922622203826904, "learning_rate": 0.0049709661610993485, "loss": 8.659, "step": 96800 }, { "epoch": 0.3947493046566363, "grad_norm": 10.708338737487793, "learning_rate": 0.0049708906503815545, "loss": 8.8707, "step": 96900 }, { "epoch": 0.3951566826800178, "grad_norm": 1.006100058555603, "learning_rate": 0.004970815042173913, "loss": 8.6811, "step": 97000 }, { "epoch": 0.3951566826800178, "eval_MaskedAccuracy": 0.4597566056094459, "eval_loss": 1.8534563779830933, "eval_runtime": 628.0543, "eval_samples_per_second": 101.068, "eval_steps_per_second": 0.395, "step": 97000 }, { "epoch": 0.3955640607033993, "grad_norm": 5.042269706726074, "learning_rate": 0.004970739336479416, "loss": 8.6412, "step": 97100 }, { "epoch": 0.3959714387267807, "grad_norm": 6.670810699462891, "learning_rate": 0.004970663533301056, "loss": 8.5012, "step": 97200 }, { "epoch": 0.3963788167501622, "grad_norm": 7.702014923095703, "learning_rate": 0.004970587632641828, "loss": 8.4667, "step": 97300 }, { "epoch": 0.39678619477354365, "grad_norm": 6.220357418060303, "learning_rate": 0.004970511634504726, "loss": 8.4563, "step": 97400 }, { "epoch": 0.39719357279692513, "grad_norm": 2.774712085723877, "learning_rate": 0.004970435538892763, "loss": 8.5194, "step": 97500 }, { "epoch": 0.39760095082030655, "grad_norm": 2.297208786010742, "learning_rate": 0.004970359345808957, "loss": 8.8205, "step": 97600 }, { "epoch": 0.39800832884368803, "grad_norm": 7.010386943817139, "learning_rate": 0.004970283055256305, "loss": 8.6058, "step": 97700 }, { "epoch": 0.3984157068670695, "grad_norm": 5.929687023162842, "learning_rate": 0.004970206667237826, "loss": 8.5004, "step": 97800 }, { "epoch": 0.39882308489045093, "grad_norm": 9.975981712341309, "learning_rate": 0.004970130181756547, "loss": 8.4581, "step": 97900 }, { "epoch": 0.3992304629138324, "grad_norm": 11.034873008728027, "learning_rate": 0.0049700535988154845, "loss": 8.4762, "step": 98000 }, { "epoch": 0.3992304629138324, "eval_MaskedAccuracy": 0.4724678743496642, "eval_loss": 1.784359335899353, "eval_runtime": 562.0174, "eval_samples_per_second": 112.943, "eval_steps_per_second": 0.441, "step": 98000 }, { "epoch": 0.3996378409372139, "grad_norm": 9.151312828063965, "learning_rate": 0.004969976918417675, "loss": 8.455, "step": 98100 }, { "epoch": 0.40004521896059536, "grad_norm": 7.673705577850342, "learning_rate": 0.0049699001405661405, "loss": 8.4271, "step": 98200 }, { "epoch": 0.4004525969839768, "grad_norm": 7.930306911468506, "learning_rate": 0.004969823265263923, "loss": 8.4594, "step": 98300 }, { "epoch": 0.40085997500735826, "grad_norm": 5.796153545379639, "learning_rate": 0.00496974629251405, "loss": 8.4188, "step": 98400 }, { "epoch": 0.40126735303073974, "grad_norm": 6.536154270172119, "learning_rate": 0.004969669222319581, "loss": 8.4624, "step": 98500 }, { "epoch": 0.4016747310541212, "grad_norm": 3.913081645965576, "learning_rate": 0.004969592054683553, "loss": 8.4665, "step": 98600 }, { "epoch": 0.40208210907750264, "grad_norm": 7.269486427307129, "learning_rate": 0.004969514789609019, "loss": 8.4192, "step": 98700 }, { "epoch": 0.4024894871008841, "grad_norm": 10.048105239868164, "learning_rate": 0.004969437427099035, "loss": 8.3996, "step": 98800 }, { "epoch": 0.4028968651242656, "grad_norm": 11.666584968566895, "learning_rate": 0.004969359967156659, "loss": 8.4127, "step": 98900 }, { "epoch": 0.403304243147647, "grad_norm": 3.249028205871582, "learning_rate": 0.004969282409784957, "loss": 8.4762, "step": 99000 }, { "epoch": 0.403304243147647, "eval_MaskedAccuracy": 0.46012254295805927, "eval_loss": 1.8292229175567627, "eval_runtime": 602.0341, "eval_samples_per_second": 105.436, "eval_steps_per_second": 0.412, "step": 99000 }, { "epoch": 0.4037116211710285, "grad_norm": 1.0890861749649048, "learning_rate": 0.004969204754986983, "loss": 8.8252, "step": 99100 }, { "epoch": 0.40411899919440997, "grad_norm": 10.644725799560547, "learning_rate": 0.004969127002765818, "loss": 8.7852, "step": 99200 }, { "epoch": 0.40452637721779144, "grad_norm": 7.145195007324219, "learning_rate": 0.004969049153124536, "loss": 8.6053, "step": 99300 }, { "epoch": 0.40493375524117287, "grad_norm": 10.985939979553223, "learning_rate": 0.004968971206066211, "loss": 8.6446, "step": 99400 }, { "epoch": 0.40534113326455434, "grad_norm": 3.1606171131134033, "learning_rate": 0.004968893161593926, "loss": 8.5559, "step": 99500 }, { "epoch": 0.4057485112879358, "grad_norm": 4.031942844390869, "learning_rate": 0.004968815019710773, "loss": 8.4871, "step": 99600 }, { "epoch": 0.40615588931131724, "grad_norm": 6.07538366317749, "learning_rate": 0.004968736780419826, "loss": 8.5672, "step": 99700 }, { "epoch": 0.4065632673346987, "grad_norm": 12.900278091430664, "learning_rate": 0.004968658443724192, "loss": 8.5097, "step": 99800 }, { "epoch": 0.4069706453580802, "grad_norm": 6.876996994018555, "learning_rate": 0.004968580009626964, "loss": 8.4321, "step": 99900 }, { "epoch": 0.4073780233814617, "grad_norm": 8.868197441101074, "learning_rate": 0.004968501478131237, "loss": 8.4285, "step": 100000 }, { "epoch": 0.4073780233814617, "eval_MaskedAccuracy": 0.4721042950935722, "eval_loss": 1.777195692062378, "eval_runtime": 495.6412, "eval_samples_per_second": 128.068, "eval_steps_per_second": 0.5, "step": 100000 }, { "epoch": 0.4077854014048431, "grad_norm": 5.752632141113281, "learning_rate": 0.004968422849240128, "loss": 8.3971, "step": 100100 }, { "epoch": 0.4081927794282246, "grad_norm": 6.129387378692627, "learning_rate": 0.004968344122956736, "loss": 8.4651, "step": 100200 }, { "epoch": 0.40860015745160605, "grad_norm": 4.563933849334717, "learning_rate": 0.004968265299284173, "loss": 8.7484, "step": 100300 }, { "epoch": 0.40900753547498747, "grad_norm": 4.392648220062256, "learning_rate": 0.004968186378225553, "loss": 8.697, "step": 100400 }, { "epoch": 0.40941491349836895, "grad_norm": 4.389554977416992, "learning_rate": 0.004968107359784003, "loss": 8.4855, "step": 100500 }, { "epoch": 0.4098222915217504, "grad_norm": 14.288431167602539, "learning_rate": 0.004968028243962654, "loss": 8.5336, "step": 100600 }, { "epoch": 0.4102296695451319, "grad_norm": 0.7195647358894348, "learning_rate": 0.004967949030764624, "loss": 8.705, "step": 100700 }, { "epoch": 0.4106370475685133, "grad_norm": 9.668696403503418, "learning_rate": 0.0049678697201930464, "loss": 8.5831, "step": 100800 }, { "epoch": 0.4110444255918948, "grad_norm": 10.904997825622559, "learning_rate": 0.004967790312251054, "loss": 8.4528, "step": 100900 }, { "epoch": 0.4114518036152763, "grad_norm": 9.388179779052734, "learning_rate": 0.00496771080694179, "loss": 8.4365, "step": 101000 }, { "epoch": 0.4114518036152763, "eval_MaskedAccuracy": 0.47232041490235255, "eval_loss": 1.7801474332809448, "eval_runtime": 579.6663, "eval_samples_per_second": 109.504, "eval_steps_per_second": 0.428, "step": 101000 }, { "epoch": 0.41185918163865776, "grad_norm": 10.938289642333984, "learning_rate": 0.004967631204268392, "loss": 8.4674, "step": 101100 }, { "epoch": 0.4122665596620392, "grad_norm": 8.339547157287598, "learning_rate": 0.004967551504234018, "loss": 8.426, "step": 101200 }, { "epoch": 0.41267393768542066, "grad_norm": 9.121721267700195, "learning_rate": 0.00496747170684181, "loss": 8.4499, "step": 101300 }, { "epoch": 0.41308131570880213, "grad_norm": 11.89256477355957, "learning_rate": 0.004967391812094921, "loss": 8.4271, "step": 101400 }, { "epoch": 0.41348869373218355, "grad_norm": 15.700182914733887, "learning_rate": 0.004967311819996515, "loss": 8.426, "step": 101500 }, { "epoch": 0.41389607175556503, "grad_norm": 11.21250057220459, "learning_rate": 0.004967231730549757, "loss": 8.4552, "step": 101600 }, { "epoch": 0.4143034497789465, "grad_norm": 15.058488845825195, "learning_rate": 0.00496715154375781, "loss": 8.4029, "step": 101700 }, { "epoch": 0.414710827802328, "grad_norm": 9.321598052978516, "learning_rate": 0.0049670712596238415, "loss": 8.4129, "step": 101800 }, { "epoch": 0.4151182058257094, "grad_norm": 12.426863670349121, "learning_rate": 0.004966990878151028, "loss": 8.4271, "step": 101900 }, { "epoch": 0.4155255838490909, "grad_norm": 2.9854533672332764, "learning_rate": 0.004966910399342545, "loss": 8.5209, "step": 102000 }, { "epoch": 0.4155255838490909, "eval_MaskedAccuracy": 0.46132029272790565, "eval_loss": 1.8232041597366333, "eval_runtime": 639.7025, "eval_samples_per_second": 99.227, "eval_steps_per_second": 0.388, "step": 102000 }, { "epoch": 0.41593296187247236, "grad_norm": 6.4361395835876465, "learning_rate": 0.004966829823201583, "loss": 8.7334, "step": 102100 }, { "epoch": 0.4163403398958538, "grad_norm": 7.562118053436279, "learning_rate": 0.004966749149731316, "loss": 8.6135, "step": 102200 }, { "epoch": 0.41674771791923526, "grad_norm": 6.0594964027404785, "learning_rate": 0.004966668378934941, "loss": 8.6054, "step": 102300 }, { "epoch": 0.41715509594261674, "grad_norm": 3.716524600982666, "learning_rate": 0.004966587510815647, "loss": 8.779, "step": 102400 }, { "epoch": 0.4175624739659982, "grad_norm": 5.204922676086426, "learning_rate": 0.0049665065453766335, "loss": 8.7758, "step": 102500 }, { "epoch": 0.41796985198937964, "grad_norm": 6.669337749481201, "learning_rate": 0.004966425482621107, "loss": 8.541, "step": 102600 }, { "epoch": 0.4183772300127611, "grad_norm": 4.090687274932861, "learning_rate": 0.00496634432255226, "loss": 8.5421, "step": 102700 }, { "epoch": 0.4187846080361426, "grad_norm": 10.983367919921875, "learning_rate": 0.00496626306517331, "loss": 8.6436, "step": 102800 }, { "epoch": 0.419191986059524, "grad_norm": 9.434021949768066, "learning_rate": 0.004966181710487462, "loss": 8.6067, "step": 102900 }, { "epoch": 0.4195993640829055, "grad_norm": 8.15808391571045, "learning_rate": 0.004966100258497939, "loss": 8.512, "step": 103000 }, { "epoch": 0.4195993640829055, "eval_MaskedAccuracy": 0.47153795500267925, "eval_loss": 1.798658847808838, "eval_runtime": 577.6914, "eval_samples_per_second": 109.879, "eval_steps_per_second": 0.429, "step": 103000 }, { "epoch": 0.42000674210628697, "grad_norm": 9.656765937805176, "learning_rate": 0.004966018709207965, "loss": 8.4555, "step": 103100 }, { "epoch": 0.42041412012966844, "grad_norm": 11.301863670349121, "learning_rate": 0.0049659370626207585, "loss": 8.4537, "step": 103200 }, { "epoch": 0.42082149815304987, "grad_norm": 2.013126850128174, "learning_rate": 0.004965855318739549, "loss": 8.4458, "step": 103300 }, { "epoch": 0.42122887617643134, "grad_norm": 6.856706142425537, "learning_rate": 0.004965773477567567, "loss": 8.5382, "step": 103400 }, { "epoch": 0.4216362541998128, "grad_norm": 7.913936614990234, "learning_rate": 0.004965691539108045, "loss": 8.4526, "step": 103500 }, { "epoch": 0.4220436322231943, "grad_norm": 6.275994300842285, "learning_rate": 0.004965609503364228, "loss": 8.4138, "step": 103600 }, { "epoch": 0.4224510102465757, "grad_norm": 9.040349006652832, "learning_rate": 0.004965527370339356, "loss": 8.4321, "step": 103700 }, { "epoch": 0.4228583882699572, "grad_norm": 6.3520307540893555, "learning_rate": 0.0049654451400366825, "loss": 8.4265, "step": 103800 }, { "epoch": 0.4232657662933387, "grad_norm": 5.726953029632568, "learning_rate": 0.0049653628124594435, "loss": 8.5499, "step": 103900 }, { "epoch": 0.4236731443167201, "grad_norm": 3.7572128772735596, "learning_rate": 0.004965280387610907, "loss": 8.6229, "step": 104000 }, { "epoch": 0.4236731443167201, "eval_MaskedAccuracy": 0.46590958911815616, "eval_loss": 1.8005009889602661, "eval_runtime": 605.7562, "eval_samples_per_second": 104.788, "eval_steps_per_second": 0.409, "step": 104000 }, { "epoch": 0.4240805223401016, "grad_norm": 7.807491779327393, "learning_rate": 0.004965197865494326, "loss": 8.5348, "step": 104100 }, { "epoch": 0.42448790036348305, "grad_norm": 5.640869617462158, "learning_rate": 0.004965115246112967, "loss": 8.5162, "step": 104200 }, { "epoch": 0.42489527838686453, "grad_norm": 3.8039796352386475, "learning_rate": 0.0049650325294700965, "loss": 8.6223, "step": 104300 }, { "epoch": 0.42530265641024595, "grad_norm": 6.969115257263184, "learning_rate": 0.004964949715568982, "loss": 8.5224, "step": 104400 }, { "epoch": 0.4257100344336274, "grad_norm": 6.870429515838623, "learning_rate": 0.004964866804412895, "loss": 8.5626, "step": 104500 }, { "epoch": 0.4261174124570089, "grad_norm": 1.0034383535385132, "learning_rate": 0.004964783796005118, "loss": 8.7603, "step": 104600 }, { "epoch": 0.4265247904803903, "grad_norm": 4.922119140625, "learning_rate": 0.0049647006903489315, "loss": 8.6798, "step": 104700 }, { "epoch": 0.4269321685037718, "grad_norm": 5.869476795196533, "learning_rate": 0.004964617487447619, "loss": 8.4977, "step": 104800 }, { "epoch": 0.4273395465271533, "grad_norm": 6.778334140777588, "learning_rate": 0.004964534187304481, "loss": 8.4221, "step": 104900 }, { "epoch": 0.42774692455053476, "grad_norm": 5.279184341430664, "learning_rate": 0.004964450789922798, "loss": 8.4347, "step": 105000 }, { "epoch": 0.42774692455053476, "eval_MaskedAccuracy": 0.4724774508604719, "eval_loss": 1.787004828453064, "eval_runtime": 599.3399, "eval_samples_per_second": 105.91, "eval_steps_per_second": 0.414, "step": 105000 }, { "epoch": 0.4281543025739162, "grad_norm": 6.847875595092773, "learning_rate": 0.004964367295305874, "loss": 8.4475, "step": 105100 }, { "epoch": 0.42856168059729766, "grad_norm": 7.180721759796143, "learning_rate": 0.004964283703457004, "loss": 8.4224, "step": 105200 }, { "epoch": 0.42896905862067913, "grad_norm": 8.042956352233887, "learning_rate": 0.004964200014379491, "loss": 8.3657, "step": 105300 }, { "epoch": 0.42937643664406056, "grad_norm": 6.460809230804443, "learning_rate": 0.004964116228076652, "loss": 8.4149, "step": 105400 }, { "epoch": 0.42978381466744203, "grad_norm": 2.5339605808258057, "learning_rate": 0.004964032344551802, "loss": 8.3877, "step": 105500 }, { "epoch": 0.4301911926908235, "grad_norm": 8.963078498840332, "learning_rate": 0.004963948363808248, "loss": 8.4022, "step": 105600 }, { "epoch": 0.430598570714205, "grad_norm": 5.23846960067749, "learning_rate": 0.004963864285849314, "loss": 8.3939, "step": 105700 }, { "epoch": 0.4310059487375864, "grad_norm": 3.0979650020599365, "learning_rate": 0.00496378011067832, "loss": 8.3992, "step": 105800 }, { "epoch": 0.4314133267609679, "grad_norm": 4.946394443511963, "learning_rate": 0.004963695838298602, "loss": 8.4295, "step": 105900 }, { "epoch": 0.43182070478434936, "grad_norm": 5.411014080047607, "learning_rate": 0.004963611468713486, "loss": 8.372, "step": 106000 }, { "epoch": 0.43182070478434936, "eval_MaskedAccuracy": 0.4736338868371074, "eval_loss": 1.7651697397232056, "eval_runtime": 607.1531, "eval_samples_per_second": 104.547, "eval_steps_per_second": 0.408, "step": 106000 }, { "epoch": 0.43222808280773084, "grad_norm": 4.002007484436035, "learning_rate": 0.004963527001926312, "loss": 8.4388, "step": 106100 }, { "epoch": 0.43263546083111226, "grad_norm": 5.016679763793945, "learning_rate": 0.004963442437940414, "loss": 8.513, "step": 106200 }, { "epoch": 0.43304283885449374, "grad_norm": 4.849783420562744, "learning_rate": 0.004963357776759134, "loss": 8.4713, "step": 106300 }, { "epoch": 0.4334502168778752, "grad_norm": 5.830073356628418, "learning_rate": 0.004963273018385826, "loss": 8.6484, "step": 106400 }, { "epoch": 0.43385759490125664, "grad_norm": 9.937734603881836, "learning_rate": 0.004963188162823837, "loss": 8.4738, "step": 106500 }, { "epoch": 0.4342649729246381, "grad_norm": 6.670325756072998, "learning_rate": 0.004963103210076525, "loss": 8.4172, "step": 106600 }, { "epoch": 0.4346723509480196, "grad_norm": 10.503084182739258, "learning_rate": 0.004963018160147245, "loss": 8.4091, "step": 106700 }, { "epoch": 0.43507972897140107, "grad_norm": 9.465533256530762, "learning_rate": 0.004962933013039362, "loss": 8.4091, "step": 106800 }, { "epoch": 0.4354871069947825, "grad_norm": 10.29710578918457, "learning_rate": 0.004962847768756245, "loss": 8.4316, "step": 106900 }, { "epoch": 0.43589448501816397, "grad_norm": 12.24212646484375, "learning_rate": 0.004962762427301256, "loss": 8.405, "step": 107000 }, { "epoch": 0.43589448501816397, "eval_MaskedAccuracy": 0.4743431114096333, "eval_loss": 1.7652925252914429, "eval_runtime": 570.9129, "eval_samples_per_second": 111.183, "eval_steps_per_second": 0.434, "step": 107000 }, { "epoch": 0.43630186304154545, "grad_norm": 9.348734855651855, "learning_rate": 0.0049626769886777725, "loss": 8.6629, "step": 107100 }, { "epoch": 0.43670924106492687, "grad_norm": 4.134520053863525, "learning_rate": 0.0049625914528891755, "loss": 8.5898, "step": 107200 }, { "epoch": 0.43711661908830834, "grad_norm": 9.459831237792969, "learning_rate": 0.004962505819938845, "loss": 8.4488, "step": 107300 }, { "epoch": 0.4375239971116898, "grad_norm": 8.720632553100586, "learning_rate": 0.004962420089830158, "loss": 8.5787, "step": 107400 }, { "epoch": 0.4379313751350713, "grad_norm": 9.35346794128418, "learning_rate": 0.004962334262566517, "loss": 8.5486, "step": 107500 }, { "epoch": 0.4383387531584527, "grad_norm": 13.627753257751465, "learning_rate": 0.00496224833815131, "loss": 8.614, "step": 107600 }, { "epoch": 0.4387461311818342, "grad_norm": 5.3978166580200195, "learning_rate": 0.0049621623165879295, "loss": 8.506, "step": 107700 }, { "epoch": 0.4391535092052157, "grad_norm": 7.623312950134277, "learning_rate": 0.00496207619787979, "loss": 8.7496, "step": 107800 }, { "epoch": 0.4395608872285971, "grad_norm": 2.9135639667510986, "learning_rate": 0.00496198998203028, "loss": 8.7128, "step": 107900 }, { "epoch": 0.4399682652519786, "grad_norm": 6.541151523590088, "learning_rate": 0.004961903669042818, "loss": 8.5844, "step": 108000 }, { "epoch": 0.4399682652519786, "eval_MaskedAccuracy": 0.47024074789879794, "eval_loss": 1.7921466827392578, "eval_runtime": 654.1479, "eval_samples_per_second": 97.036, "eval_steps_per_second": 0.379, "step": 108000 }, { "epoch": 0.44037564327536005, "grad_norm": 7.330739498138428, "learning_rate": 0.004961817258920815, "loss": 8.4422, "step": 108100 }, { "epoch": 0.44078302129874153, "grad_norm": 9.475118637084961, "learning_rate": 0.0049617307516676866, "loss": 8.4373, "step": 108200 }, { "epoch": 0.44119039932212295, "grad_norm": 8.402446746826172, "learning_rate": 0.004961644147286852, "loss": 8.4133, "step": 108300 }, { "epoch": 0.4415977773455044, "grad_norm": 6.884156227111816, "learning_rate": 0.00496155744578173, "loss": 8.4033, "step": 108400 }, { "epoch": 0.4420051553688859, "grad_norm": 9.831668853759766, "learning_rate": 0.004961470647155754, "loss": 8.4261, "step": 108500 }, { "epoch": 0.4424125333922673, "grad_norm": 7.965442657470703, "learning_rate": 0.0049613837514123574, "loss": 8.3904, "step": 108600 }, { "epoch": 0.4428199114156488, "grad_norm": 1.3925095796585083, "learning_rate": 0.004961296758554973, "loss": 8.4134, "step": 108700 }, { "epoch": 0.4432272894390303, "grad_norm": 8.52515983581543, "learning_rate": 0.004961209668587038, "loss": 8.6539, "step": 108800 }, { "epoch": 0.44363466746241176, "grad_norm": 2.4925291538238525, "learning_rate": 0.004961122481512003, "loss": 8.4962, "step": 108900 }, { "epoch": 0.4440420454857932, "grad_norm": 1.4443053007125854, "learning_rate": 0.004961035197333306, "loss": 8.6677, "step": 109000 }, { "epoch": 0.4440420454857932, "eval_MaskedAccuracy": 0.4628872529788418, "eval_loss": 1.829740047454834, "eval_runtime": 602.3877, "eval_samples_per_second": 105.374, "eval_steps_per_second": 0.412, "step": 109000 }, { "epoch": 0.44444942350917466, "grad_norm": 7.29392671585083, "learning_rate": 0.004960947816054404, "loss": 8.5859, "step": 109100 }, { "epoch": 0.44485680153255613, "grad_norm": 7.652862071990967, "learning_rate": 0.004960860337678751, "loss": 8.458, "step": 109200 }, { "epoch": 0.4452641795559376, "grad_norm": 8.172975540161133, "learning_rate": 0.004960772762209805, "loss": 8.426, "step": 109300 }, { "epoch": 0.44567155757931903, "grad_norm": 5.396576404571533, "learning_rate": 0.004960685089651025, "loss": 8.4052, "step": 109400 }, { "epoch": 0.4460789356027005, "grad_norm": 6.155834674835205, "learning_rate": 0.0049605973200058774, "loss": 8.4008, "step": 109500 }, { "epoch": 0.446486313626082, "grad_norm": 6.791605472564697, "learning_rate": 0.004960509453277837, "loss": 8.4069, "step": 109600 }, { "epoch": 0.4468936916494634, "grad_norm": 8.964988708496094, "learning_rate": 0.00496042148947037, "loss": 8.366, "step": 109700 }, { "epoch": 0.4473010696728449, "grad_norm": 5.947779178619385, "learning_rate": 0.0049603334285869645, "loss": 8.4173, "step": 109800 }, { "epoch": 0.44770844769622636, "grad_norm": 5.3750176429748535, "learning_rate": 0.004960245270631098, "loss": 8.4632, "step": 109900 }, { "epoch": 0.44811582571960784, "grad_norm": 8.55760669708252, "learning_rate": 0.004960157015606248, "loss": 8.4325, "step": 110000 }, { "epoch": 0.44811582571960784, "eval_MaskedAccuracy": 0.47331954585294894, "eval_loss": 1.7828290462493896, "eval_runtime": 598.6767, "eval_samples_per_second": 106.027, "eval_steps_per_second": 0.414, "step": 110000 }, { "epoch": 0.44852320374298926, "grad_norm": 7.122706413269043, "learning_rate": 0.00496006866351591, "loss": 8.374, "step": 110100 }, { "epoch": 0.44893058176637074, "grad_norm": 8.731146812438965, "learning_rate": 0.004959980214363583, "loss": 8.4295, "step": 110200 }, { "epoch": 0.4493379597897522, "grad_norm": 7.568539619445801, "learning_rate": 0.004959891668152759, "loss": 8.3725, "step": 110300 }, { "epoch": 0.44974533781313364, "grad_norm": 7.53127384185791, "learning_rate": 0.004959803024886933, "loss": 8.4216, "step": 110400 }, { "epoch": 0.4501527158365151, "grad_norm": 2.6870458126068115, "learning_rate": 0.00495971428456961, "loss": 8.4128, "step": 110500 }, { "epoch": 0.4505600938598966, "grad_norm": 15.371285438537598, "learning_rate": 0.004959625447204312, "loss": 8.6718, "step": 110600 }, { "epoch": 0.45096747188327807, "grad_norm": 2.650428533554077, "learning_rate": 0.004959536512794538, "loss": 8.77, "step": 110700 }, { "epoch": 0.4513748499066595, "grad_norm": 1.186808466911316, "learning_rate": 0.004959447481343806, "loss": 8.7843, "step": 110800 }, { "epoch": 0.45178222793004097, "grad_norm": 2.998349905014038, "learning_rate": 0.004959358352855635, "loss": 8.7435, "step": 110900 }, { "epoch": 0.45218960595342245, "grad_norm": 4.527252674102783, "learning_rate": 0.004959269127333554, "loss": 8.6732, "step": 111000 }, { "epoch": 0.45218960595342245, "eval_MaskedAccuracy": 0.46516753932012583, "eval_loss": 1.809565782546997, "eval_runtime": 579.2909, "eval_samples_per_second": 109.575, "eval_steps_per_second": 0.428, "step": 111000 }, { "epoch": 0.45259698397680387, "grad_norm": 4.419383525848389, "learning_rate": 0.00495917980478109, "loss": 8.5257, "step": 111100 }, { "epoch": 0.45300436200018535, "grad_norm": 4.69866943359375, "learning_rate": 0.004959090385201771, "loss": 8.4745, "step": 111200 }, { "epoch": 0.4534117400235668, "grad_norm": 4.82834005355835, "learning_rate": 0.004959000868599137, "loss": 8.5087, "step": 111300 }, { "epoch": 0.4538191180469483, "grad_norm": 9.863975524902344, "learning_rate": 0.004958911254976722, "loss": 8.5017, "step": 111400 }, { "epoch": 0.4542264960703297, "grad_norm": 5.013560771942139, "learning_rate": 0.004958821544338068, "loss": 8.4531, "step": 111500 }, { "epoch": 0.4546338740937112, "grad_norm": 1.191077709197998, "learning_rate": 0.004958731736686727, "loss": 8.5021, "step": 111600 }, { "epoch": 0.4550412521170927, "grad_norm": 5.876702785491943, "learning_rate": 0.004958641832026253, "loss": 8.618, "step": 111700 }, { "epoch": 0.45544863014047415, "grad_norm": 5.5669779777526855, "learning_rate": 0.0049585518303601845, "loss": 8.5249, "step": 111800 }, { "epoch": 0.4558560081638556, "grad_norm": 6.915269374847412, "learning_rate": 0.004958461731692095, "loss": 8.4397, "step": 111900 }, { "epoch": 0.45626338618723705, "grad_norm": 8.481354713439941, "learning_rate": 0.004958371536025539, "loss": 8.6175, "step": 112000 }, { "epoch": 0.45626338618723705, "eval_MaskedAccuracy": 0.46605112787424247, "eval_loss": 1.8209140300750732, "eval_runtime": 605.9854, "eval_samples_per_second": 104.748, "eval_steps_per_second": 0.409, "step": 112000 }, { "epoch": 0.45667076421061853, "grad_norm": 7.431014537811279, "learning_rate": 0.004958281243364088, "loss": 8.5776, "step": 112100 }, { "epoch": 0.45707814223399995, "grad_norm": 6.935708522796631, "learning_rate": 0.004958190853711309, "loss": 8.4614, "step": 112200 }, { "epoch": 0.45748552025738143, "grad_norm": 4.441507816314697, "learning_rate": 0.004958100367070768, "loss": 8.4313, "step": 112300 }, { "epoch": 0.4578928982807629, "grad_norm": 1.8756330013275146, "learning_rate": 0.004958009783446055, "loss": 8.5578, "step": 112400 }, { "epoch": 0.4583002763041444, "grad_norm": 3.120640993118286, "learning_rate": 0.004957919102840741, "loss": 8.6338, "step": 112500 }, { "epoch": 0.4587076543275258, "grad_norm": 1.1729477643966675, "learning_rate": 0.004957828325258414, "loss": 8.5977, "step": 112600 }, { "epoch": 0.4591150323509073, "grad_norm": 0.5741099715232849, "learning_rate": 0.004957737450702669, "loss": 8.7102, "step": 112700 }, { "epoch": 0.45952241037428876, "grad_norm": 8.047289848327637, "learning_rate": 0.004957646479177092, "loss": 8.5728, "step": 112800 }, { "epoch": 0.4599297883976702, "grad_norm": 9.095126152038574, "learning_rate": 0.0049575554106852816, "loss": 8.4705, "step": 112900 }, { "epoch": 0.46033716642105166, "grad_norm": 9.453558921813965, "learning_rate": 0.004957464245230838, "loss": 8.4057, "step": 113000 }, { "epoch": 0.46033716642105166, "eval_MaskedAccuracy": 0.4738057493999801, "eval_loss": 1.781148076057434, "eval_runtime": 540.2794, "eval_samples_per_second": 117.487, "eval_steps_per_second": 0.459, "step": 113000 }, { "epoch": 0.46074454444443314, "grad_norm": 8.071161270141602, "learning_rate": 0.004957372982817367, "loss": 8.4066, "step": 113100 }, { "epoch": 0.4611519224678146, "grad_norm": 10.719406127929688, "learning_rate": 0.004957281623448478, "loss": 8.4156, "step": 113200 }, { "epoch": 0.46155930049119603, "grad_norm": 7.539982795715332, "learning_rate": 0.004957190167127779, "loss": 8.3838, "step": 113300 }, { "epoch": 0.4619666785145775, "grad_norm": 10.179490089416504, "learning_rate": 0.004957098613858882, "loss": 8.3715, "step": 113400 }, { "epoch": 0.462374056537959, "grad_norm": 9.369685173034668, "learning_rate": 0.0049570069636454194, "loss": 8.342, "step": 113500 }, { "epoch": 0.4627814345613404, "grad_norm": 9.911388397216797, "learning_rate": 0.004956915216491005, "loss": 8.5081, "step": 113600 }, { "epoch": 0.4631888125847219, "grad_norm": 11.996129035949707, "learning_rate": 0.004956823372399269, "loss": 8.4799, "step": 113700 }, { "epoch": 0.46359619060810336, "grad_norm": 4.705869197845459, "learning_rate": 0.004956731431373847, "loss": 8.3913, "step": 113800 }, { "epoch": 0.46400356863148484, "grad_norm": 2.45719575881958, "learning_rate": 0.004956639393418368, "loss": 8.5583, "step": 113900 }, { "epoch": 0.46441094665486626, "grad_norm": 1.0450843572616577, "learning_rate": 0.004956547258536471, "loss": 8.619, "step": 114000 }, { "epoch": 0.46441094665486626, "eval_MaskedAccuracy": 0.46584400777593554, "eval_loss": 1.818083643913269, "eval_runtime": 601.3887, "eval_samples_per_second": 105.549, "eval_steps_per_second": 0.412, "step": 114000 }, { "epoch": 0.46481832467824774, "grad_norm": 2.7218010425567627, "learning_rate": 0.004956455026731795, "loss": 8.6125, "step": 114100 }, { "epoch": 0.4652257027016292, "grad_norm": 3.7615129947662354, "learning_rate": 0.004956362698007991, "loss": 8.6228, "step": 114200 }, { "epoch": 0.4656330807250107, "grad_norm": 4.030871391296387, "learning_rate": 0.004956270272368709, "loss": 8.546, "step": 114300 }, { "epoch": 0.4660404587483921, "grad_norm": 5.961259365081787, "learning_rate": 0.004956177749817598, "loss": 8.4686, "step": 114400 }, { "epoch": 0.4664478367717736, "grad_norm": 9.751999855041504, "learning_rate": 0.004956085130358327, "loss": 8.4324, "step": 114500 }, { "epoch": 0.46685521479515507, "grad_norm": 9.5776948928833, "learning_rate": 0.004955992413994554, "loss": 8.5815, "step": 114600 }, { "epoch": 0.4672625928185365, "grad_norm": 5.737105369567871, "learning_rate": 0.0049558996007299395, "loss": 8.5721, "step": 114700 }, { "epoch": 0.46766997084191797, "grad_norm": 8.12684440612793, "learning_rate": 0.004955806690568156, "loss": 8.451, "step": 114800 }, { "epoch": 0.46807734886529945, "grad_norm": 3.4034082889556885, "learning_rate": 0.004955713683512877, "loss": 8.4247, "step": 114900 }, { "epoch": 0.4684847268886809, "grad_norm": 4.736820220947266, "learning_rate": 0.004955620579567771, "loss": 8.4251, "step": 115000 }, { "epoch": 0.4684847268886809, "eval_MaskedAccuracy": 0.4688468475901898, "eval_loss": 1.8014590740203857, "eval_runtime": 562.4603, "eval_samples_per_second": 112.854, "eval_steps_per_second": 0.441, "step": 115000 }, { "epoch": 0.46889210491206235, "grad_norm": 10.829277992248535, "learning_rate": 0.004955527378736533, "loss": 8.5794, "step": 115100 }, { "epoch": 0.4692994829354438, "grad_norm": 1.7777608633041382, "learning_rate": 0.004955434081022844, "loss": 8.635, "step": 115200 }, { "epoch": 0.4697068609588253, "grad_norm": 17.929372787475586, "learning_rate": 0.0049553406864303855, "loss": 8.5905, "step": 115300 }, { "epoch": 0.4701142389822067, "grad_norm": 3.8750357627868652, "learning_rate": 0.004955247194962858, "loss": 8.5732, "step": 115400 }, { "epoch": 0.4705216170055882, "grad_norm": 3.1751935482025146, "learning_rate": 0.004955153606623948, "loss": 8.4471, "step": 115500 }, { "epoch": 0.4709289950289697, "grad_norm": 11.84594440460205, "learning_rate": 0.0049550599214173595, "loss": 8.4343, "step": 115600 }, { "epoch": 0.47133637305235115, "grad_norm": 6.070246696472168, "learning_rate": 0.0049549661393468, "loss": 8.4336, "step": 115700 }, { "epoch": 0.4717437510757326, "grad_norm": 3.038576602935791, "learning_rate": 0.004954872260415978, "loss": 8.5264, "step": 115800 }, { "epoch": 0.47215112909911405, "grad_norm": 1.0032246112823486, "learning_rate": 0.0049547782846286045, "loss": 8.5696, "step": 115900 }, { "epoch": 0.47255850712249553, "grad_norm": 5.927453994750977, "learning_rate": 0.004954684211988393, "loss": 8.6381, "step": 116000 }, { "epoch": 0.47255850712249553, "eval_MaskedAccuracy": 0.4656466126507873, "eval_loss": 1.8105868101119995, "eval_runtime": 617.0844, "eval_samples_per_second": 102.864, "eval_steps_per_second": 0.402, "step": 116000 }, { "epoch": 0.47296588514587695, "grad_norm": 4.60734748840332, "learning_rate": 0.004954590042499063, "loss": 8.6123, "step": 116100 }, { "epoch": 0.47337326316925843, "grad_norm": 7.652487754821777, "learning_rate": 0.004954495776164337, "loss": 8.5055, "step": 116200 }, { "epoch": 0.4737806411926399, "grad_norm": 5.244068145751953, "learning_rate": 0.004954401412987941, "loss": 8.464, "step": 116300 }, { "epoch": 0.4741880192160214, "grad_norm": 12.130002975463867, "learning_rate": 0.004954306952973607, "loss": 8.4014, "step": 116400 }, { "epoch": 0.4745953972394028, "grad_norm": 7.4617919921875, "learning_rate": 0.004954212396125075, "loss": 8.5934, "step": 116500 }, { "epoch": 0.4750027752627843, "grad_norm": 3.1740314960479736, "learning_rate": 0.004954117742446068, "loss": 8.5771, "step": 116600 }, { "epoch": 0.47541015328616576, "grad_norm": 4.737682819366455, "learning_rate": 0.004954022991940346, "loss": 8.4457, "step": 116700 }, { "epoch": 0.47581753130954724, "grad_norm": 11.205411911010742, "learning_rate": 0.004953928144611644, "loss": 8.5085, "step": 116800 }, { "epoch": 0.47622490933292866, "grad_norm": 3.0664117336273193, "learning_rate": 0.004953833200463713, "loss": 8.5006, "step": 116900 }, { "epoch": 0.47663228735631014, "grad_norm": 11.607182502746582, "learning_rate": 0.004953738159500306, "loss": 8.4609, "step": 117000 }, { "epoch": 0.47663228735631014, "eval_MaskedAccuracy": 0.4652727326260346, "eval_loss": 1.8133021593093872, "eval_runtime": 639.7971, "eval_samples_per_second": 99.213, "eval_steps_per_second": 0.388, "step": 117000 }, { "epoch": 0.4770396653796916, "grad_norm": 4.19486665725708, "learning_rate": 0.00495364302172519, "loss": 8.5505, "step": 117100 }, { "epoch": 0.47744704340307303, "grad_norm": 1.922994613647461, "learning_rate": 0.004953547787142112, "loss": 8.5968, "step": 117200 }, { "epoch": 0.4778544214264545, "grad_norm": 5.716039657592773, "learning_rate": 0.004953452455754846, "loss": 8.525, "step": 117300 }, { "epoch": 0.478261799449836, "grad_norm": 8.691774368286133, "learning_rate": 0.004953357027567162, "loss": 8.6127, "step": 117400 }, { "epoch": 0.47866917747321747, "grad_norm": 5.32814884185791, "learning_rate": 0.004953261502582827, "loss": 8.5103, "step": 117500 }, { "epoch": 0.4790765554965989, "grad_norm": 7.465363025665283, "learning_rate": 0.00495316588080562, "loss": 8.4228, "step": 117600 }, { "epoch": 0.47948393351998037, "grad_norm": 10.480701446533203, "learning_rate": 0.00495307016223933, "loss": 8.392, "step": 117700 }, { "epoch": 0.47989131154336184, "grad_norm": 6.020634174346924, "learning_rate": 0.004952974346887732, "loss": 8.4383, "step": 117800 }, { "epoch": 0.48029868956674326, "grad_norm": 9.17965030670166, "learning_rate": 0.00495287843475461, "loss": 8.4746, "step": 117900 }, { "epoch": 0.48070606759012474, "grad_norm": 6.490881443023682, "learning_rate": 0.004952782425843763, "loss": 8.4617, "step": 118000 }, { "epoch": 0.48070606759012474, "eval_MaskedAccuracy": 0.46867127279022996, "eval_loss": 1.8012827634811401, "eval_runtime": 540.5029, "eval_samples_per_second": 117.439, "eval_steps_per_second": 0.459, "step": 118000 }, { "epoch": 0.4811134456135062, "grad_norm": 8.385427474975586, "learning_rate": 0.004952686320158984, "loss": 8.4601, "step": 118100 }, { "epoch": 0.4815208236368877, "grad_norm": 7.8449506759643555, "learning_rate": 0.0049525901177040685, "loss": 8.4005, "step": 118200 }, { "epoch": 0.4819282016602691, "grad_norm": 5.87675666809082, "learning_rate": 0.004952493818482826, "loss": 8.3504, "step": 118300 }, { "epoch": 0.4823355796836506, "grad_norm": 8.414234161376953, "learning_rate": 0.00495239742249907, "loss": 8.3437, "step": 118400 }, { "epoch": 0.48274295770703207, "grad_norm": 8.45103931427002, "learning_rate": 0.004952300929756606, "loss": 8.3787, "step": 118500 }, { "epoch": 0.4831503357304135, "grad_norm": 4.254857063293457, "learning_rate": 0.004952204340259244, "loss": 8.3363, "step": 118600 }, { "epoch": 0.48355771375379497, "grad_norm": 8.870880126953125, "learning_rate": 0.0049521076540108054, "loss": 8.3965, "step": 118700 }, { "epoch": 0.48396509177717645, "grad_norm": 7.823821067810059, "learning_rate": 0.004952010871015116, "loss": 8.3472, "step": 118800 }, { "epoch": 0.4843724698005579, "grad_norm": 8.193089485168457, "learning_rate": 0.004951913991275995, "loss": 8.3609, "step": 118900 }, { "epoch": 0.48477984782393935, "grad_norm": 8.305012702941895, "learning_rate": 0.004951817014797274, "loss": 8.3527, "step": 119000 }, { "epoch": 0.48477984782393935, "eval_MaskedAccuracy": 0.4767895611606481, "eval_loss": 1.7638946771621704, "eval_runtime": 648.6472, "eval_samples_per_second": 97.859, "eval_steps_per_second": 0.382, "step": 119000 }, { "epoch": 0.4851872258473208, "grad_norm": 8.314849853515625, "learning_rate": 0.0049517199415827955, "loss": 8.3497, "step": 119100 }, { "epoch": 0.4855946038707023, "grad_norm": 3.263856887817383, "learning_rate": 0.004951622771636393, "loss": 8.4411, "step": 119200 }, { "epoch": 0.4860019818940837, "grad_norm": 2.1813316345214844, "learning_rate": 0.004951525504961904, "loss": 8.5163, "step": 119300 }, { "epoch": 0.4864093599174652, "grad_norm": 9.317235946655273, "learning_rate": 0.004951428141563178, "loss": 8.5087, "step": 119400 }, { "epoch": 0.4868167379408467, "grad_norm": 4.322655200958252, "learning_rate": 0.004951330681444059, "loss": 8.604, "step": 119500 }, { "epoch": 0.48722411596422815, "grad_norm": 7.114287376403809, "learning_rate": 0.0049512331246084, "loss": 8.6363, "step": 119600 }, { "epoch": 0.4876314939876096, "grad_norm": 6.147729396820068, "learning_rate": 0.00495113547106007, "loss": 8.4465, "step": 119700 }, { "epoch": 0.48803887201099105, "grad_norm": 6.949401378631592, "learning_rate": 0.004951037720802924, "loss": 8.3767, "step": 119800 }, { "epoch": 0.48844625003437253, "grad_norm": 7.660586833953857, "learning_rate": 0.004950939873840821, "loss": 8.408, "step": 119900 }, { "epoch": 0.488853628057754, "grad_norm": 8.358684539794922, "learning_rate": 0.004950841930177634, "loss": 8.3833, "step": 120000 }, { "epoch": 0.488853628057754, "eval_MaskedAccuracy": 0.47606467814721937, "eval_loss": 1.7586658000946045, "eval_runtime": 593.8693, "eval_samples_per_second": 106.885, "eval_steps_per_second": 0.418, "step": 120000 }, { "epoch": 0.48926100608113543, "grad_norm": 9.096843719482422, "learning_rate": 0.004950743889817234, "loss": 8.3418, "step": 120100 }, { "epoch": 0.4896683841045169, "grad_norm": 8.729216575622559, "learning_rate": 0.004950645752763492, "loss": 8.381, "step": 120200 }, { "epoch": 0.4900757621278984, "grad_norm": 7.4013447761535645, "learning_rate": 0.004950547519020298, "loss": 8.3483, "step": 120300 }, { "epoch": 0.4904831401512798, "grad_norm": 7.02334451675415, "learning_rate": 0.004950449188591533, "loss": 8.3802, "step": 120400 }, { "epoch": 0.4908905181746613, "grad_norm": 3.4499237537384033, "learning_rate": 0.004950350761481079, "loss": 8.3577, "step": 120500 }, { "epoch": 0.49129789619804276, "grad_norm": 4.924149036407471, "learning_rate": 0.004950252237692828, "loss": 8.3735, "step": 120600 }, { "epoch": 0.49170527422142424, "grad_norm": 6.146267414093018, "learning_rate": 0.0049501536172306786, "loss": 8.363, "step": 120700 }, { "epoch": 0.49211265224480566, "grad_norm": 7.329911231994629, "learning_rate": 0.004950054900098534, "loss": 8.3391, "step": 120800 }, { "epoch": 0.49252003026818714, "grad_norm": 2.1442506313323975, "learning_rate": 0.004949956086300295, "loss": 8.5251, "step": 120900 }, { "epoch": 0.4929274082915686, "grad_norm": 9.27274227142334, "learning_rate": 0.004949857175839865, "loss": 8.5657, "step": 121000 }, { "epoch": 0.4929274082915686, "eval_MaskedAccuracy": 0.47242173123299347, "eval_loss": 1.7904921770095825, "eval_runtime": 611.4048, "eval_samples_per_second": 103.82, "eval_steps_per_second": 0.406, "step": 121000 }, { "epoch": 0.49333478631495004, "grad_norm": 7.481986045837402, "learning_rate": 0.0049497581687211484, "loss": 8.3722, "step": 121100 }, { "epoch": 0.4937421643383315, "grad_norm": 7.524771213531494, "learning_rate": 0.004949659064948069, "loss": 8.378, "step": 121200 }, { "epoch": 0.494149542361713, "grad_norm": 9.83425235748291, "learning_rate": 0.004949559864524543, "loss": 8.3516, "step": 121300 }, { "epoch": 0.49455692038509447, "grad_norm": 8.452327728271484, "learning_rate": 0.004949460567454488, "loss": 8.3802, "step": 121400 }, { "epoch": 0.4949642984084759, "grad_norm": 4.382816791534424, "learning_rate": 0.004949361173741836, "loss": 8.4591, "step": 121500 }, { "epoch": 0.49537167643185737, "grad_norm": 6.4604172706604, "learning_rate": 0.004949261683390515, "loss": 8.5378, "step": 121600 }, { "epoch": 0.49577905445523884, "grad_norm": 6.597474575042725, "learning_rate": 0.00494916209640446, "loss": 8.5965, "step": 121700 }, { "epoch": 0.49618643247862027, "grad_norm": 8.31852912902832, "learning_rate": 0.0049490624127876, "loss": 8.4832, "step": 121800 }, { "epoch": 0.49659381050200174, "grad_norm": 5.9900641441345215, "learning_rate": 0.004948962632543879, "loss": 8.6186, "step": 121900 }, { "epoch": 0.4970011885253832, "grad_norm": 6.915224552154541, "learning_rate": 0.004948862755677247, "loss": 8.454, "step": 122000 }, { "epoch": 0.4970011885253832, "eval_MaskedAccuracy": 0.472195087355659, "eval_loss": 1.7736320495605469, "eval_runtime": 404.842, "eval_samples_per_second": 156.792, "eval_steps_per_second": 0.613, "step": 122000 }, { "epoch": 0.4974085665487647, "grad_norm": 8.118802070617676, "learning_rate": 0.0049487627821916454, "loss": 8.3735, "step": 122100 }, { "epoch": 0.4978159445721461, "grad_norm": 9.109051704406738, "learning_rate": 0.004948662712091032, "loss": 8.358, "step": 122200 }, { "epoch": 0.4982233225955276, "grad_norm": 7.688298225402832, "learning_rate": 0.004948562545379363, "loss": 8.357, "step": 122300 }, { "epoch": 0.4986307006189091, "grad_norm": 5.265169620513916, "learning_rate": 0.004948462282060591, "loss": 8.3677, "step": 122400 }, { "epoch": 0.49903807864229055, "grad_norm": 5.895035743713379, "learning_rate": 0.004948361922138693, "loss": 8.3613, "step": 122500 }, { "epoch": 0.49944545666567197, "grad_norm": 3.273195743560791, "learning_rate": 0.004948261465617631, "loss": 8.3804, "step": 122600 }, { "epoch": 0.49985283468905345, "grad_norm": 5.652338981628418, "learning_rate": 0.004948160912501374, "loss": 8.5127, "step": 122700 }, { "epoch": 0.5002602127124349, "grad_norm": 10.528982162475586, "learning_rate": 0.0049480602627939, "loss": 8.3952, "step": 122800 }, { "epoch": 0.5006675907358163, "grad_norm": 7.678232192993164, "learning_rate": 0.00494795951649919, "loss": 8.3522, "step": 122900 }, { "epoch": 0.5010749687591979, "grad_norm": 6.645458698272705, "learning_rate": 0.004947858673621225, "loss": 8.3805, "step": 123000 }, { "epoch": 0.5010749687591979, "eval_MaskedAccuracy": 0.4759519419330508, "eval_loss": 1.7681282758712769, "eval_runtime": 571.6199, "eval_samples_per_second": 111.046, "eval_steps_per_second": 0.434, "step": 123000 }, { "epoch": 0.5014823467825793, "grad_norm": 6.954684734344482, "learning_rate": 0.004947757734163989, "loss": 8.3092, "step": 123100 }, { "epoch": 0.5018897248059607, "grad_norm": 8.839340209960938, "learning_rate": 0.004947656698131469, "loss": 8.3486, "step": 123200 }, { "epoch": 0.5022971028293423, "grad_norm": 6.953580856323242, "learning_rate": 0.004947555565527669, "loss": 8.3612, "step": 123300 }, { "epoch": 0.5027044808527237, "grad_norm": 9.655802726745605, "learning_rate": 0.004947454336356585, "loss": 8.3551, "step": 123400 }, { "epoch": 0.5031118588761051, "grad_norm": 0.8042750954627991, "learning_rate": 0.00494735301062222, "loss": 8.7023, "step": 123500 }, { "epoch": 0.5035192368994866, "grad_norm": 10.146775245666504, "learning_rate": 0.004947251588328577, "loss": 8.6243, "step": 123600 }, { "epoch": 0.503926614922868, "grad_norm": 6.121560096740723, "learning_rate": 0.004947150069479665, "loss": 8.4713, "step": 123700 }, { "epoch": 0.5043339929462495, "grad_norm": 6.796733379364014, "learning_rate": 0.004947048454079505, "loss": 8.4259, "step": 123800 }, { "epoch": 0.504741370969631, "grad_norm": 11.119961738586426, "learning_rate": 0.004946946742132101, "loss": 8.5861, "step": 123900 }, { "epoch": 0.5051487489930124, "grad_norm": 4.724853038787842, "learning_rate": 0.004946844933641488, "loss": 8.5123, "step": 124000 }, { "epoch": 0.5051487489930124, "eval_MaskedAccuracy": 0.4723671499722319, "eval_loss": 1.7843869924545288, "eval_runtime": 619.0846, "eval_samples_per_second": 102.532, "eval_steps_per_second": 0.401, "step": 124000 }, { "epoch": 0.505556127016394, "grad_norm": 4.339019298553467, "learning_rate": 0.004946743028611682, "loss": 8.5424, "step": 124100 }, { "epoch": 0.5059635050397754, "grad_norm": 0.9369896054267883, "learning_rate": 0.004946641027046716, "loss": 8.597, "step": 124200 }, { "epoch": 0.5063708830631568, "grad_norm": 9.629310607910156, "learning_rate": 0.004946538928950624, "loss": 8.515, "step": 124300 }, { "epoch": 0.5067782610865383, "grad_norm": 7.729372501373291, "learning_rate": 0.004946436734327439, "loss": 8.4293, "step": 124400 }, { "epoch": 0.5071856391099198, "grad_norm": 2.7494254112243652, "learning_rate": 0.004946334443181198, "loss": 8.3897, "step": 124500 }, { "epoch": 0.5075930171333012, "grad_norm": 1.270338773727417, "learning_rate": 0.004946232055515947, "loss": 8.6036, "step": 124600 }, { "epoch": 0.5080003951566827, "grad_norm": 10.06462574005127, "learning_rate": 0.004946129571335741, "loss": 8.5373, "step": 124700 }, { "epoch": 0.5084077731800641, "grad_norm": 2.283895969390869, "learning_rate": 0.004946026990644626, "loss": 8.4413, "step": 124800 }, { "epoch": 0.5088151512034456, "grad_norm": 6.6044111251831055, "learning_rate": 0.004945924313446655, "loss": 8.4213, "step": 124900 }, { "epoch": 0.5092225292268271, "grad_norm": 2.018730640411377, "learning_rate": 0.004945821539745899, "loss": 8.4662, "step": 125000 }, { "epoch": 0.5092225292268271, "eval_MaskedAccuracy": 0.46727423518335215, "eval_loss": 1.8027355670928955, "eval_runtime": 586.8485, "eval_samples_per_second": 108.164, "eval_steps_per_second": 0.423, "step": 125000 }, { "epoch": 0.5096299072502085, "grad_norm": 8.053539276123047, "learning_rate": 0.00494571866954641, "loss": 8.5339, "step": 125100 }, { "epoch": 0.5100372852735899, "grad_norm": 5.74146842956543, "learning_rate": 0.00494561570285225, "loss": 8.4135, "step": 125200 }, { "epoch": 0.5104446632969715, "grad_norm": 7.048740386962891, "learning_rate": 0.004945512639667504, "loss": 8.4548, "step": 125300 }, { "epoch": 0.5108520413203529, "grad_norm": 7.538445472717285, "learning_rate": 0.004945409479996239, "loss": 8.4078, "step": 125400 }, { "epoch": 0.5112594193437344, "grad_norm": 10.846882820129395, "learning_rate": 0.004945306223842535, "loss": 8.3321, "step": 125500 }, { "epoch": 0.5116667973671158, "grad_norm": 6.417740821838379, "learning_rate": 0.004945202871210477, "loss": 8.3654, "step": 125600 }, { "epoch": 0.5120741753904973, "grad_norm": 9.360198974609375, "learning_rate": 0.004945099422104144, "loss": 8.3416, "step": 125700 }, { "epoch": 0.5124815534138788, "grad_norm": 4.266257286071777, "learning_rate": 0.004944995876527626, "loss": 8.3595, "step": 125800 }, { "epoch": 0.5128889314372602, "grad_norm": 4.841434955596924, "learning_rate": 0.004944892234485027, "loss": 8.5323, "step": 125900 }, { "epoch": 0.5132963094606416, "grad_norm": 11.480585098266602, "learning_rate": 0.004944788495980438, "loss": 8.3728, "step": 126000 }, { "epoch": 0.5132963094606416, "eval_MaskedAccuracy": 0.4755163862779088, "eval_loss": 1.7594823837280273, "eval_runtime": 648.4406, "eval_samples_per_second": 97.89, "eval_steps_per_second": 0.382, "step": 126000 }, { "epoch": 0.5137036874840232, "grad_norm": 5.7990403175354, "learning_rate": 0.004944684661017963, "loss": 8.3259, "step": 126100 }, { "epoch": 0.5141110655074046, "grad_norm": 1.583735704421997, "learning_rate": 0.004944580729601707, "loss": 8.3592, "step": 126200 }, { "epoch": 0.514518443530786, "grad_norm": 12.780102729797363, "learning_rate": 0.0049444767017357725, "loss": 8.43, "step": 126300 }, { "epoch": 0.5149258215541676, "grad_norm": 5.658169269561768, "learning_rate": 0.0049443725774242775, "loss": 8.6245, "step": 126400 }, { "epoch": 0.515333199577549, "grad_norm": 7.621128082275391, "learning_rate": 0.004944268356671341, "loss": 8.4524, "step": 126500 }, { "epoch": 0.5157405776009305, "grad_norm": 9.25407600402832, "learning_rate": 0.004944164039481081, "loss": 8.4178, "step": 126600 }, { "epoch": 0.5161479556243119, "grad_norm": 5.097949028015137, "learning_rate": 0.004944059625857619, "loss": 8.6409, "step": 126700 }, { "epoch": 0.5165553336476933, "grad_norm": 9.274227142333984, "learning_rate": 0.004943955115805082, "loss": 8.6079, "step": 126800 }, { "epoch": 0.5169627116710749, "grad_norm": 5.212336540222168, "learning_rate": 0.004943850509327609, "loss": 8.4844, "step": 126900 }, { "epoch": 0.5173700896944563, "grad_norm": 10.940645217895508, "learning_rate": 0.004943745806429329, "loss": 8.4035, "step": 127000 }, { "epoch": 0.5173700896944563, "eval_MaskedAccuracy": 0.4748734666305473, "eval_loss": 1.7772326469421387, "eval_runtime": 616.2331, "eval_samples_per_second": 103.006, "eval_steps_per_second": 0.402, "step": 127000 }, { "epoch": 0.5177774677178377, "grad_norm": 5.6482696533203125, "learning_rate": 0.004943641007114383, "loss": 8.3536, "step": 127100 }, { "epoch": 0.5181848457412193, "grad_norm": 8.932602882385254, "learning_rate": 0.004943536111386913, "loss": 8.3414, "step": 127200 }, { "epoch": 0.5185922237646007, "grad_norm": 13.874133110046387, "learning_rate": 0.004943431119251073, "loss": 8.3514, "step": 127300 }, { "epoch": 0.5189996017879821, "grad_norm": 8.456324577331543, "learning_rate": 0.0049433260307110115, "loss": 8.3177, "step": 127400 }, { "epoch": 0.5194069798113636, "grad_norm": 8.561285972595215, "learning_rate": 0.004943220845770884, "loss": 8.3386, "step": 127500 }, { "epoch": 0.5198143578347451, "grad_norm": 8.178915023803711, "learning_rate": 0.004943115564434837, "loss": 8.3262, "step": 127600 }, { "epoch": 0.5202217358581265, "grad_norm": 4.443258285522461, "learning_rate": 0.004943010186707044, "loss": 8.3302, "step": 127700 }, { "epoch": 0.520629113881508, "grad_norm": 6.866542816162109, "learning_rate": 0.00494290471259167, "loss": 8.4339, "step": 127800 }, { "epoch": 0.5210364919048894, "grad_norm": 11.089356422424316, "learning_rate": 0.004942799142092884, "loss": 8.6393, "step": 127900 }, { "epoch": 0.521443869928271, "grad_norm": 1.7169142961502075, "learning_rate": 0.004942693475214859, "loss": 8.6402, "step": 128000 }, { "epoch": 0.521443869928271, "eval_MaskedAccuracy": 0.4648368387713934, "eval_loss": 1.8243728876113892, "eval_runtime": 559.7984, "eval_samples_per_second": 113.391, "eval_steps_per_second": 0.443, "step": 128000 }, { "epoch": 0.5218512479516524, "grad_norm": 7.574516773223877, "learning_rate": 0.0049425877119617725, "loss": 8.5531, "step": 128100 }, { "epoch": 0.5222586259750338, "grad_norm": 4.418700695037842, "learning_rate": 0.004942481852337807, "loss": 8.4875, "step": 128200 }, { "epoch": 0.5226660039984153, "grad_norm": 8.742793083190918, "learning_rate": 0.004942375896347151, "loss": 8.5511, "step": 128300 }, { "epoch": 0.5230733820217968, "grad_norm": 11.146100044250488, "learning_rate": 0.004942269843993984, "loss": 8.4107, "step": 128400 }, { "epoch": 0.5234807600451782, "grad_norm": 9.998716354370117, "learning_rate": 0.004942163695282504, "loss": 8.4421, "step": 128500 }, { "epoch": 0.5238881380685597, "grad_norm": 2.967787027359009, "learning_rate": 0.004942057450216913, "loss": 8.4074, "step": 128600 }, { "epoch": 0.5242955160919411, "grad_norm": 6.486566543579102, "learning_rate": 0.004941951108801405, "loss": 8.4151, "step": 128700 }, { "epoch": 0.5247028941153226, "grad_norm": 2.534282922744751, "learning_rate": 0.004941844671040186, "loss": 8.3743, "step": 128800 }, { "epoch": 0.5251102721387041, "grad_norm": 5.772778511047363, "learning_rate": 0.0049417381369374624, "loss": 8.5069, "step": 128900 }, { "epoch": 0.5255176501620855, "grad_norm": 7.692689895629883, "learning_rate": 0.00494163150649745, "loss": 8.3617, "step": 129000 }, { "epoch": 0.5255176501620855, "eval_MaskedAccuracy": 0.47622878724681494, "eval_loss": 1.7545971870422363, "eval_runtime": 686.8219, "eval_samples_per_second": 92.42, "eval_steps_per_second": 0.361, "step": 129000 }, { "epoch": 0.525925028185467, "grad_norm": 7.9084577560424805, "learning_rate": 0.004941524779724357, "loss": 8.2893, "step": 129100 }, { "epoch": 0.5263324062088485, "grad_norm": 5.903682708740234, "learning_rate": 0.004941417956622409, "loss": 8.3253, "step": 129200 }, { "epoch": 0.5267397842322299, "grad_norm": 10.002944946289062, "learning_rate": 0.004941311037195834, "loss": 8.3331, "step": 129300 }, { "epoch": 0.5271471622556114, "grad_norm": 5.873917102813721, "learning_rate": 0.004941204021448854, "loss": 8.3173, "step": 129400 }, { "epoch": 0.5275545402789928, "grad_norm": 7.145143508911133, "learning_rate": 0.0049410969093856985, "loss": 8.3968, "step": 129500 }, { "epoch": 0.5279619183023743, "grad_norm": 4.4543328285217285, "learning_rate": 0.004940989701010604, "loss": 8.449, "step": 129600 }, { "epoch": 0.5283692963257558, "grad_norm": 7.055647850036621, "learning_rate": 0.004940882396327806, "loss": 8.3819, "step": 129700 }, { "epoch": 0.5287766743491372, "grad_norm": 8.206925392150879, "learning_rate": 0.004940774995341549, "loss": 8.3501, "step": 129800 }, { "epoch": 0.5291840523725186, "grad_norm": 10.196109771728516, "learning_rate": 0.004940667498056074, "loss": 8.2884, "step": 129900 }, { "epoch": 0.5295914303959002, "grad_norm": 10.326079368591309, "learning_rate": 0.004940559904475633, "loss": 8.3304, "step": 130000 }, { "epoch": 0.5295914303959002, "eval_MaskedAccuracy": 0.4778809703901036, "eval_loss": 1.7554049491882324, "eval_runtime": 432.0501, "eval_samples_per_second": 146.918, "eval_steps_per_second": 0.574, "step": 130000 }, { "epoch": 0.5299988084192816, "grad_norm": 7.801156997680664, "learning_rate": 0.004940452214604491, "loss": 8.3286, "step": 130100 }, { "epoch": 0.530406186442663, "grad_norm": 6.964962959289551, "learning_rate": 0.004940344428446893, "loss": 8.4938, "step": 130200 }, { "epoch": 0.5308135644660446, "grad_norm": 3.655668258666992, "learning_rate": 0.004940236546007105, "loss": 8.5877, "step": 130300 }, { "epoch": 0.531220942489426, "grad_norm": 8.000005722045898, "learning_rate": 0.004940128567289394, "loss": 8.5344, "step": 130400 }, { "epoch": 0.5316283205128075, "grad_norm": 8.468135833740234, "learning_rate": 0.004940020492298025, "loss": 8.3759, "step": 130500 }, { "epoch": 0.5320356985361889, "grad_norm": 6.7972917556762695, "learning_rate": 0.004939912321037274, "loss": 8.3218, "step": 130600 }, { "epoch": 0.5324430765595703, "grad_norm": 10.092455863952637, "learning_rate": 0.004939804053511412, "loss": 8.342, "step": 130700 }, { "epoch": 0.5328504545829519, "grad_norm": 4.264617919921875, "learning_rate": 0.004939695689724729, "loss": 8.6042, "step": 130800 }, { "epoch": 0.5332578326063333, "grad_norm": 7.822512626647949, "learning_rate": 0.0049395872296815, "loss": 8.5632, "step": 130900 }, { "epoch": 0.5336652106297147, "grad_norm": 3.6305220127105713, "learning_rate": 0.004939478673386021, "loss": 8.5285, "step": 131000 }, { "epoch": 0.5336652106297147, "eval_MaskedAccuracy": 0.47348632156424786, "eval_loss": 1.7749598026275635, "eval_runtime": 551.4523, "eval_samples_per_second": 115.107, "eval_steps_per_second": 0.45, "step": 131000 }, { "epoch": 0.5340725886530963, "grad_norm": 12.928987503051758, "learning_rate": 0.004939370020842574, "loss": 8.4338, "step": 131100 }, { "epoch": 0.5344799666764777, "grad_norm": 4.38896369934082, "learning_rate": 0.004939261272055469, "loss": 8.5271, "step": 131200 }, { "epoch": 0.5348873446998591, "grad_norm": 4.418875694274902, "learning_rate": 0.004939152427028993, "loss": 8.3756, "step": 131300 }, { "epoch": 0.5352947227232406, "grad_norm": 4.9500651359558105, "learning_rate": 0.0049390434857674494, "loss": 8.3532, "step": 131400 }, { "epoch": 0.5357021007466221, "grad_norm": 7.733002185821533, "learning_rate": 0.004938934448275152, "loss": 8.3283, "step": 131500 }, { "epoch": 0.5361094787700036, "grad_norm": 6.218177318572998, "learning_rate": 0.0049388253145564096, "loss": 8.3114, "step": 131600 }, { "epoch": 0.536516856793385, "grad_norm": 5.060550212860107, "learning_rate": 0.0049387160846155334, "loss": 8.3091, "step": 131700 }, { "epoch": 0.5369242348167664, "grad_norm": 9.628071784973145, "learning_rate": 0.004938606758456842, "loss": 8.3289, "step": 131800 }, { "epoch": 0.537331612840148, "grad_norm": 4.559714317321777, "learning_rate": 0.004938497336084662, "loss": 8.3044, "step": 131900 }, { "epoch": 0.5377389908635294, "grad_norm": 6.963791847229004, "learning_rate": 0.004938387817503315, "loss": 8.2859, "step": 132000 }, { "epoch": 0.5377389908635294, "eval_MaskedAccuracy": 0.47790219219810026, "eval_loss": 1.7602800130844116, "eval_runtime": 606.5816, "eval_samples_per_second": 104.645, "eval_steps_per_second": 0.409, "step": 132000 }, { "epoch": 0.5381463688869108, "grad_norm": 8.68433952331543, "learning_rate": 0.004938278202717131, "loss": 8.316, "step": 132100 }, { "epoch": 0.5385537469102923, "grad_norm": 6.305760383605957, "learning_rate": 0.004938168491730447, "loss": 8.3034, "step": 132200 }, { "epoch": 0.5389611249336738, "grad_norm": 8.102996826171875, "learning_rate": 0.004938058684547598, "loss": 8.3054, "step": 132300 }, { "epoch": 0.5393685029570552, "grad_norm": 4.82603120803833, "learning_rate": 0.004937948781172932, "loss": 8.317, "step": 132400 }, { "epoch": 0.5397758809804367, "grad_norm": 9.478666305541992, "learning_rate": 0.004937838781610782, "loss": 8.3199, "step": 132500 }, { "epoch": 0.5401832590038181, "grad_norm": 6.930224418640137, "learning_rate": 0.004937728685865504, "loss": 8.3277, "step": 132600 }, { "epoch": 0.5405906370271996, "grad_norm": 6.685028076171875, "learning_rate": 0.00493761849394145, "loss": 8.3096, "step": 132700 }, { "epoch": 0.5409980150505811, "grad_norm": 11.711406707763672, "learning_rate": 0.004937508205842981, "loss": 8.3485, "step": 132800 }, { "epoch": 0.5414053930739625, "grad_norm": 4.872280120849609, "learning_rate": 0.004937397821574446, "loss": 8.3167, "step": 132900 }, { "epoch": 0.541812771097344, "grad_norm": 5.365285873413086, "learning_rate": 0.004937287341140217, "loss": 8.3758, "step": 133000 }, { "epoch": 0.541812771097344, "eval_MaskedAccuracy": 0.47756522020477266, "eval_loss": 1.7520103454589844, "eval_runtime": 573.9139, "eval_samples_per_second": 110.602, "eval_steps_per_second": 0.432, "step": 133000 }, { "epoch": 0.5422201491207255, "grad_norm": 5.993309020996094, "learning_rate": 0.004937176764544669, "loss": 8.325, "step": 133100 }, { "epoch": 0.5426275271441069, "grad_norm": 6.58394193649292, "learning_rate": 0.004937066091792161, "loss": 8.3942, "step": 133200 }, { "epoch": 0.5430349051674884, "grad_norm": 2.4660582542419434, "learning_rate": 0.004936955322887076, "loss": 8.607, "step": 133300 }, { "epoch": 0.5434422831908698, "grad_norm": 1.6601381301879883, "learning_rate": 0.004936844457833782, "loss": 8.6692, "step": 133400 }, { "epoch": 0.5438496612142513, "grad_norm": 8.589520454406738, "learning_rate": 0.004936733496636674, "loss": 8.6054, "step": 133500 }, { "epoch": 0.5442570392376328, "grad_norm": 10.06407642364502, "learning_rate": 0.004936622439300137, "loss": 8.4439, "step": 133600 }, { "epoch": 0.5446644172610142, "grad_norm": 2.2744853496551514, "learning_rate": 0.0049365112858285555, "loss": 8.559, "step": 133700 }, { "epoch": 0.5450717952843956, "grad_norm": 2.906050205230713, "learning_rate": 0.004936400036226332, "loss": 8.5895, "step": 133800 }, { "epoch": 0.5454791733077772, "grad_norm": 4.517184257507324, "learning_rate": 0.004936288690497856, "loss": 8.6234, "step": 133900 }, { "epoch": 0.5458865513311586, "grad_norm": 4.899981498718262, "learning_rate": 0.004936177248647536, "loss": 8.4567, "step": 134000 }, { "epoch": 0.5458865513311586, "eval_MaskedAccuracy": 0.4734875833116333, "eval_loss": 1.77745521068573, "eval_runtime": 588.8793, "eval_samples_per_second": 107.791, "eval_steps_per_second": 0.421, "step": 134000 }, { "epoch": 0.54629392935454, "grad_norm": 5.668209075927734, "learning_rate": 0.004936065710679779, "loss": 8.4023, "step": 134100 }, { "epoch": 0.5467013073779216, "grad_norm": 7.033673286437988, "learning_rate": 0.0049359540765989965, "loss": 8.449, "step": 134200 }, { "epoch": 0.547108685401303, "grad_norm": 4.8322601318359375, "learning_rate": 0.00493584234640959, "loss": 8.3554, "step": 134300 }, { "epoch": 0.5475160634246845, "grad_norm": 8.219858169555664, "learning_rate": 0.004935730520115993, "loss": 8.3085, "step": 134400 }, { "epoch": 0.5479234414480659, "grad_norm": 8.772672653198242, "learning_rate": 0.004935618597722611, "loss": 8.3246, "step": 134500 }, { "epoch": 0.5483308194714474, "grad_norm": 4.388508319854736, "learning_rate": 0.0049355065792338745, "loss": 8.3255, "step": 134600 }, { "epoch": 0.5487381974948289, "grad_norm": 7.620246410369873, "learning_rate": 0.004935394464654222, "loss": 8.3211, "step": 134700 }, { "epoch": 0.5491455755182103, "grad_norm": 8.451358795166016, "learning_rate": 0.004935282253988074, "loss": 8.2897, "step": 134800 }, { "epoch": 0.5495529535415917, "grad_norm": 8.059579849243164, "learning_rate": 0.004935169947239869, "loss": 8.2934, "step": 134900 }, { "epoch": 0.5499603315649733, "grad_norm": 5.406369209289551, "learning_rate": 0.004935057544414045, "loss": 8.3126, "step": 135000 }, { "epoch": 0.5499603315649733, "eval_MaskedAccuracy": 0.4790078726465579, "eval_loss": 1.7325547933578491, "eval_runtime": 561.955, "eval_samples_per_second": 112.956, "eval_steps_per_second": 0.441, "step": 135000 }, { "epoch": 0.5503677095883547, "grad_norm": 6.986901760101318, "learning_rate": 0.004934945045515049, "loss": 8.3266, "step": 135100 }, { "epoch": 0.5507750876117361, "grad_norm": 6.454778671264648, "learning_rate": 0.00493483245054733, "loss": 8.2776, "step": 135200 }, { "epoch": 0.5511824656351176, "grad_norm": 5.6888813972473145, "learning_rate": 0.004934719759515342, "loss": 8.2973, "step": 135300 }, { "epoch": 0.5515898436584991, "grad_norm": 10.446785926818848, "learning_rate": 0.004934606972423534, "loss": 8.3179, "step": 135400 }, { "epoch": 0.5519972216818806, "grad_norm": 4.9595112800598145, "learning_rate": 0.004934494089276374, "loss": 8.2849, "step": 135500 }, { "epoch": 0.552404599705262, "grad_norm": 7.039740562438965, "learning_rate": 0.004934381110078313, "loss": 8.2879, "step": 135600 }, { "epoch": 0.5528119777286434, "grad_norm": 5.197790622711182, "learning_rate": 0.004934268034833823, "loss": 8.3102, "step": 135700 }, { "epoch": 0.553219355752025, "grad_norm": 9.838872909545898, "learning_rate": 0.004934154863547372, "loss": 8.3484, "step": 135800 }, { "epoch": 0.5536267337754064, "grad_norm": 8.352523803710938, "learning_rate": 0.004934041596223441, "loss": 8.3332, "step": 135900 }, { "epoch": 0.5540341117987878, "grad_norm": 6.191901206970215, "learning_rate": 0.004933928232866506, "loss": 8.5556, "step": 136000 }, { "epoch": 0.5540341117987878, "eval_MaskedAccuracy": 0.45859378365217834, "eval_loss": 1.8376259803771973, "eval_runtime": 558.7141, "eval_samples_per_second": 113.611, "eval_steps_per_second": 0.444, "step": 136000 }, { "epoch": 0.5544414898221693, "grad_norm": 6.703093528747559, "learning_rate": 0.004933814773481047, "loss": 8.6737, "step": 136100 }, { "epoch": 0.5548488678455508, "grad_norm": 8.970337867736816, "learning_rate": 0.004933701218071543, "loss": 8.5675, "step": 136200 }, { "epoch": 0.5552562458689322, "grad_norm": 5.416504383087158, "learning_rate": 0.004933587566642498, "loss": 8.5321, "step": 136300 }, { "epoch": 0.5556636238923137, "grad_norm": 2.75545072555542, "learning_rate": 0.004933473819198396, "loss": 8.4297, "step": 136400 }, { "epoch": 0.5560710019156951, "grad_norm": 4.553280830383301, "learning_rate": 0.004933359975743742, "loss": 8.5563, "step": 136500 }, { "epoch": 0.5564783799390766, "grad_norm": 6.438226222991943, "learning_rate": 0.004933246036283021, "loss": 8.423, "step": 136600 }, { "epoch": 0.5568857579624581, "grad_norm": 9.083197593688965, "learning_rate": 0.004933132000820752, "loss": 8.389, "step": 136700 }, { "epoch": 0.5572931359858395, "grad_norm": 8.170239448547363, "learning_rate": 0.00493301786936144, "loss": 8.3369, "step": 136800 }, { "epoch": 0.557700514009221, "grad_norm": 10.418622970581055, "learning_rate": 0.0049329036419095895, "loss": 8.3401, "step": 136900 }, { "epoch": 0.5581078920326025, "grad_norm": 8.396859169006348, "learning_rate": 0.004932789318469727, "loss": 8.3074, "step": 137000 }, { "epoch": 0.5581078920326025, "eval_MaskedAccuracy": 0.478139934353604, "eval_loss": 1.757191777229309, "eval_runtime": 604.6911, "eval_samples_per_second": 104.973, "eval_steps_per_second": 0.41, "step": 137000 }, { "epoch": 0.5585152700559839, "grad_norm": 10.67445182800293, "learning_rate": 0.004932674899046365, "loss": 8.3036, "step": 137100 }, { "epoch": 0.5589226480793654, "grad_norm": 8.79953384399414, "learning_rate": 0.004932560383644033, "loss": 8.2774, "step": 137200 }, { "epoch": 0.5593300261027468, "grad_norm": 8.39030933380127, "learning_rate": 0.0049324457722672525, "loss": 8.274, "step": 137300 }, { "epoch": 0.5597374041261283, "grad_norm": 9.26461124420166, "learning_rate": 0.004932331064920559, "loss": 8.3156, "step": 137400 }, { "epoch": 0.5601447821495098, "grad_norm": 4.736127853393555, "learning_rate": 0.004932216261608489, "loss": 8.3264, "step": 137500 }, { "epoch": 0.5605521601728912, "grad_norm": 9.321123123168945, "learning_rate": 0.0049321013623355794, "loss": 8.285, "step": 137600 }, { "epoch": 0.5609595381962726, "grad_norm": 10.020621299743652, "learning_rate": 0.0049319863671063655, "loss": 8.2578, "step": 137700 }, { "epoch": 0.5613669162196542, "grad_norm": 6.300183296203613, "learning_rate": 0.0049318712759254014, "loss": 8.2911, "step": 137800 }, { "epoch": 0.5617742942430356, "grad_norm": 8.268228530883789, "learning_rate": 0.004931756088797239, "loss": 8.4312, "step": 137900 }, { "epoch": 0.5621816722664171, "grad_norm": 9.11463737487793, "learning_rate": 0.004931640805726424, "loss": 8.344, "step": 138000 }, { "epoch": 0.5621816722664171, "eval_MaskedAccuracy": 0.47664963225182544, "eval_loss": 1.7589812278747559, "eval_runtime": 603.7869, "eval_samples_per_second": 105.13, "eval_steps_per_second": 0.411, "step": 138000 }, { "epoch": 0.5625890502897986, "grad_norm": 8.544374465942383, "learning_rate": 0.004931525426717527, "loss": 8.3289, "step": 138100 }, { "epoch": 0.56299642831318, "grad_norm": 5.79107141494751, "learning_rate": 0.004931409951775097, "loss": 8.3046, "step": 138200 }, { "epoch": 0.5634038063365615, "grad_norm": 8.56974983215332, "learning_rate": 0.004931294380903704, "loss": 8.3242, "step": 138300 }, { "epoch": 0.5638111843599429, "grad_norm": 12.155808448791504, "learning_rate": 0.004931178714107916, "loss": 8.3155, "step": 138400 }, { "epoch": 0.5642185623833244, "grad_norm": 4.285138130187988, "learning_rate": 0.004931062951392311, "loss": 8.2667, "step": 138500 }, { "epoch": 0.5646259404067059, "grad_norm": 8.989679336547852, "learning_rate": 0.004930947092761459, "loss": 8.5222, "step": 138600 }, { "epoch": 0.5650333184300873, "grad_norm": 4.6177978515625, "learning_rate": 0.004930831138219945, "loss": 8.6581, "step": 138700 }, { "epoch": 0.5654406964534687, "grad_norm": 2.7358109951019287, "learning_rate": 0.0049307150877723515, "loss": 8.5795, "step": 138800 }, { "epoch": 0.5658480744768503, "grad_norm": 3.21806263923645, "learning_rate": 0.004930598941423266, "loss": 8.542, "step": 138900 }, { "epoch": 0.5662554525002317, "grad_norm": 5.059041500091553, "learning_rate": 0.004930482699177287, "loss": 8.3765, "step": 139000 }, { "epoch": 0.5662554525002317, "eval_MaskedAccuracy": 0.47533519953307585, "eval_loss": 1.7726454734802246, "eval_runtime": 609.0225, "eval_samples_per_second": 104.226, "eval_steps_per_second": 0.407, "step": 139000 }, { "epoch": 0.5666628305236131, "grad_norm": 5.088695526123047, "learning_rate": 0.004930366361038996, "loss": 8.3651, "step": 139100 }, { "epoch": 0.5670702085469946, "grad_norm": 11.858138084411621, "learning_rate": 0.004930249927013009, "loss": 8.404, "step": 139200 }, { "epoch": 0.5674775865703761, "grad_norm": 5.233626842498779, "learning_rate": 0.004930133397103916, "loss": 8.4429, "step": 139300 }, { "epoch": 0.5678849645937576, "grad_norm": 9.309755325317383, "learning_rate": 0.004930016771316329, "loss": 8.3795, "step": 139400 }, { "epoch": 0.568292342617139, "grad_norm": 3.2571256160736084, "learning_rate": 0.004929900049654861, "loss": 8.3915, "step": 139500 }, { "epoch": 0.5686997206405204, "grad_norm": 23.565610885620117, "learning_rate": 0.00492978323212412, "loss": 8.4156, "step": 139600 }, { "epoch": 0.569107098663902, "grad_norm": 3.5002174377441406, "learning_rate": 0.00492966631872873, "loss": 8.5306, "step": 139700 }, { "epoch": 0.5695144766872834, "grad_norm": 0.7858268022537231, "learning_rate": 0.004929549309473314, "loss": 8.3792, "step": 139800 }, { "epoch": 0.5699218547106648, "grad_norm": 15.595468521118164, "learning_rate": 0.0049294322043625044, "loss": 8.5528, "step": 139900 }, { "epoch": 0.5703292327340463, "grad_norm": 3.5991625785827637, "learning_rate": 0.00492931500340092, "loss": 8.4807, "step": 140000 }, { "epoch": 0.5703292327340463, "eval_MaskedAccuracy": 0.47439803391024454, "eval_loss": 1.7724093198776245, "eval_runtime": 499.4875, "eval_samples_per_second": 127.082, "eval_steps_per_second": 0.497, "step": 140000 }, { "epoch": 0.5707366107574278, "grad_norm": 4.950541019439697, "learning_rate": 0.004929197706593197, "loss": 8.3725, "step": 140100 }, { "epoch": 0.5711439887808092, "grad_norm": 1.7901657819747925, "learning_rate": 0.004929080313943971, "loss": 8.3586, "step": 140200 }, { "epoch": 0.5715513668041907, "grad_norm": 7.542468547821045, "learning_rate": 0.004928962825457888, "loss": 8.5297, "step": 140300 }, { "epoch": 0.5719587448275721, "grad_norm": 4.861608028411865, "learning_rate": 0.004928845241139586, "loss": 8.5119, "step": 140400 }, { "epoch": 0.5723661228509537, "grad_norm": 5.734555244445801, "learning_rate": 0.004928727560993721, "loss": 8.3784, "step": 140500 }, { "epoch": 0.5727735008743351, "grad_norm": 9.214383125305176, "learning_rate": 0.00492860978502494, "loss": 8.3441, "step": 140600 }, { "epoch": 0.5731808788977165, "grad_norm": 4.167412757873535, "learning_rate": 0.004928491913237904, "loss": 8.4657, "step": 140700 }, { "epoch": 0.573588256921098, "grad_norm": 6.125998497009277, "learning_rate": 0.004928373945637273, "loss": 8.4581, "step": 140800 }, { "epoch": 0.5739956349444795, "grad_norm": 3.892979621887207, "learning_rate": 0.004928255882227712, "loss": 8.3622, "step": 140900 }, { "epoch": 0.5744030129678609, "grad_norm": 6.536438941955566, "learning_rate": 0.004928137723013875, "loss": 8.4651, "step": 141000 }, { "epoch": 0.5744030129678609, "eval_MaskedAccuracy": 0.46521993284921864, "eval_loss": 1.8148672580718994, "eval_runtime": 612.1042, "eval_samples_per_second": 103.701, "eval_steps_per_second": 0.405, "step": 141000 }, { "epoch": 0.5748103909912424, "grad_norm": 7.157055854797363, "learning_rate": 0.004928019468000448, "loss": 8.4738, "step": 141100 }, { "epoch": 0.5752177690146238, "grad_norm": 5.56843900680542, "learning_rate": 0.004927901117192104, "loss": 8.3646, "step": 141200 }, { "epoch": 0.5756251470380053, "grad_norm": 0.8016031384468079, "learning_rate": 0.004927782670593521, "loss": 8.387, "step": 141300 }, { "epoch": 0.5760325250613868, "grad_norm": 4.794798374176025, "learning_rate": 0.004927664128209382, "loss": 8.4828, "step": 141400 }, { "epoch": 0.5764399030847682, "grad_norm": 5.881836891174316, "learning_rate": 0.004927545490044371, "loss": 8.3491, "step": 141500 }, { "epoch": 0.5768472811081496, "grad_norm": 9.797046661376953, "learning_rate": 0.004927426756103177, "loss": 8.385, "step": 141600 }, { "epoch": 0.5772546591315312, "grad_norm": 0.9898270964622498, "learning_rate": 0.004927307926390499, "loss": 8.3714, "step": 141700 }, { "epoch": 0.5776620371549126, "grad_norm": 6.364189147949219, "learning_rate": 0.004927189000911034, "loss": 8.5285, "step": 141800 }, { "epoch": 0.5780694151782941, "grad_norm": 2.7466721534729004, "learning_rate": 0.004927069979669483, "loss": 8.3986, "step": 141900 }, { "epoch": 0.5784767932016756, "grad_norm": 7.290796279907227, "learning_rate": 0.004926950862670552, "loss": 8.3574, "step": 142000 }, { "epoch": 0.5784767932016756, "eval_MaskedAccuracy": 0.4768711731034431, "eval_loss": 1.7562979459762573, "eval_runtime": 633.151, "eval_samples_per_second": 100.254, "eval_steps_per_second": 0.392, "step": 142000 }, { "epoch": 0.578884171225057, "grad_norm": 8.044575691223145, "learning_rate": 0.0049268316499189455, "loss": 8.3856, "step": 142100 }, { "epoch": 0.5792915492484385, "grad_norm": 9.492392539978027, "learning_rate": 0.004926712341419386, "loss": 8.4513, "step": 142200 }, { "epoch": 0.5796989272718199, "grad_norm": 3.2362284660339355, "learning_rate": 0.00492659293717658, "loss": 8.347, "step": 142300 }, { "epoch": 0.5801063052952014, "grad_norm": 7.138864040374756, "learning_rate": 0.004926473437195258, "loss": 8.3386, "step": 142400 }, { "epoch": 0.5805136833185829, "grad_norm": 7.8301100730896, "learning_rate": 0.00492635384148014, "loss": 8.4787, "step": 142500 }, { "epoch": 0.5809210613419643, "grad_norm": 7.749133586883545, "learning_rate": 0.004926234150035957, "loss": 8.5, "step": 142600 }, { "epoch": 0.5813284393653457, "grad_norm": 17.91340446472168, "learning_rate": 0.004926114362867435, "loss": 8.4313, "step": 142700 }, { "epoch": 0.5817358173887273, "grad_norm": 7.80926513671875, "learning_rate": 0.004925994479979315, "loss": 8.4978, "step": 142800 }, { "epoch": 0.5821431954121087, "grad_norm": 1.9094536304473877, "learning_rate": 0.004925874501376334, "loss": 8.4321, "step": 142900 }, { "epoch": 0.5825505734354902, "grad_norm": 7.612975120544434, "learning_rate": 0.0049257544270632355, "loss": 8.3529, "step": 143000 }, { "epoch": 0.5825505734354902, "eval_MaskedAccuracy": 0.4764777458492212, "eval_loss": 1.761231780052185, "eval_runtime": 633.3846, "eval_samples_per_second": 100.217, "eval_steps_per_second": 0.392, "step": 143000 }, { "epoch": 0.5829579514588716, "grad_norm": 7.09923791885376, "learning_rate": 0.004925634257044773, "loss": 8.3188, "step": 143100 }, { "epoch": 0.5833653294822531, "grad_norm": 7.374269485473633, "learning_rate": 0.004925513991325686, "loss": 8.351, "step": 143200 }, { "epoch": 0.5837727075056346, "grad_norm": 3.873291492462158, "learning_rate": 0.004925393629910736, "loss": 8.3384, "step": 143300 }, { "epoch": 0.584180085529016, "grad_norm": 5.784383296966553, "learning_rate": 0.004925273172804682, "loss": 8.4272, "step": 143400 }, { "epoch": 0.5845874635523974, "grad_norm": 2.9218976497650146, "learning_rate": 0.004925152620012286, "loss": 8.4527, "step": 143500 }, { "epoch": 0.584994841575779, "grad_norm": 3.2860617637634277, "learning_rate": 0.004925031971538311, "loss": 8.4885, "step": 143600 }, { "epoch": 0.5854022195991604, "grad_norm": 10.385235786437988, "learning_rate": 0.004924911227387532, "loss": 8.5233, "step": 143700 }, { "epoch": 0.5858095976225418, "grad_norm": 7.59553861618042, "learning_rate": 0.004924790387564717, "loss": 8.5009, "step": 143800 }, { "epoch": 0.5862169756459233, "grad_norm": 3.1202356815338135, "learning_rate": 0.0049246694520746455, "loss": 8.4833, "step": 143900 }, { "epoch": 0.5866243536693048, "grad_norm": 5.625021457672119, "learning_rate": 0.0049245484209221035, "loss": 8.4456, "step": 144000 }, { "epoch": 0.5866243536693048, "eval_MaskedAccuracy": 0.4735239587801145, "eval_loss": 1.7806174755096436, "eval_runtime": 633.5303, "eval_samples_per_second": 100.194, "eval_steps_per_second": 0.391, "step": 144000 }, { "epoch": 0.5870317316926862, "grad_norm": 8.998865127563477, "learning_rate": 0.004924427294111858, "loss": 8.3641, "step": 144100 }, { "epoch": 0.5874391097160677, "grad_norm": 7.449718952178955, "learning_rate": 0.00492430607164872, "loss": 8.3482, "step": 144200 }, { "epoch": 0.5878464877394491, "grad_norm": 6.765208721160889, "learning_rate": 0.004924184753537476, "loss": 8.315, "step": 144300 }, { "epoch": 0.5882538657628307, "grad_norm": 6.553773880004883, "learning_rate": 0.004924063339782917, "loss": 8.291, "step": 144400 }, { "epoch": 0.5886612437862121, "grad_norm": 5.706290245056152, "learning_rate": 0.0049239418303898465, "loss": 8.3187, "step": 144500 }, { "epoch": 0.5890686218095935, "grad_norm": 8.585490226745605, "learning_rate": 0.00492382022536307, "loss": 8.4939, "step": 144600 }, { "epoch": 0.589475999832975, "grad_norm": 6.214458465576172, "learning_rate": 0.00492369852470739, "loss": 8.481, "step": 144700 }, { "epoch": 0.5898833778563565, "grad_norm": 10.59347915649414, "learning_rate": 0.004923576728427617, "loss": 8.3291, "step": 144800 }, { "epoch": 0.5902907558797379, "grad_norm": 12.514204025268555, "learning_rate": 0.004923454836528572, "loss": 8.3762, "step": 144900 }, { "epoch": 0.5906981339031194, "grad_norm": 4.104231834411621, "learning_rate": 0.004923332849015071, "loss": 8.5356, "step": 145000 }, { "epoch": 0.5906981339031194, "eval_MaskedAccuracy": 0.4719728494655418, "eval_loss": 1.7796320915222168, "eval_runtime": 601.6769, "eval_samples_per_second": 105.498, "eval_steps_per_second": 0.412, "step": 145000 }, { "epoch": 0.5911055119265008, "grad_norm": 6.915755748748779, "learning_rate": 0.004923210765891941, "loss": 8.3611, "step": 145100 }, { "epoch": 0.5915128899498823, "grad_norm": 4.886312484741211, "learning_rate": 0.004923088587164003, "loss": 8.3103, "step": 145200 }, { "epoch": 0.5919202679732638, "grad_norm": 7.537599086761475, "learning_rate": 0.004922966312836096, "loss": 8.3021, "step": 145300 }, { "epoch": 0.5923276459966452, "grad_norm": 6.0286054611206055, "learning_rate": 0.0049228439429130485, "loss": 8.3138, "step": 145400 }, { "epoch": 0.5927350240200268, "grad_norm": 5.109126567840576, "learning_rate": 0.0049227214773997, "loss": 8.2938, "step": 145500 }, { "epoch": 0.5931424020434082, "grad_norm": 5.64884090423584, "learning_rate": 0.004922598916300887, "loss": 8.3078, "step": 145600 }, { "epoch": 0.5935497800667896, "grad_norm": 8.799834251403809, "learning_rate": 0.004922476259621459, "loss": 8.344, "step": 145700 }, { "epoch": 0.5939571580901711, "grad_norm": 5.0454792976379395, "learning_rate": 0.004922353507366266, "loss": 8.4802, "step": 145800 }, { "epoch": 0.5943645361135526, "grad_norm": 7.879946708679199, "learning_rate": 0.0049222306595401565, "loss": 8.3843, "step": 145900 }, { "epoch": 0.594771914136934, "grad_norm": 5.273271560668945, "learning_rate": 0.004922107716147994, "loss": 8.3082, "step": 146000 }, { "epoch": 0.594771914136934, "eval_MaskedAccuracy": 0.4789688832855816, "eval_loss": 1.7472257614135742, "eval_runtime": 532.2718, "eval_samples_per_second": 119.255, "eval_steps_per_second": 0.466, "step": 146000 }, { "epoch": 0.5951792921603155, "grad_norm": 5.88913106918335, "learning_rate": 0.004921984677194639, "loss": 8.2821, "step": 146100 }, { "epoch": 0.5955866701836969, "grad_norm": 6.070159912109375, "learning_rate": 0.00492186154268495, "loss": 8.279, "step": 146200 }, { "epoch": 0.5959940482070784, "grad_norm": 3.6523008346557617, "learning_rate": 0.004921738312623806, "loss": 8.2728, "step": 146300 }, { "epoch": 0.5964014262304599, "grad_norm": 8.161672592163086, "learning_rate": 0.004921614987016069, "loss": 8.2584, "step": 146400 }, { "epoch": 0.5968088042538413, "grad_norm": 4.909224987030029, "learning_rate": 0.004921491565866608, "loss": 8.3252, "step": 146500 }, { "epoch": 0.5972161822772227, "grad_norm": 8.278355598449707, "learning_rate": 0.0049213680491803115, "loss": 8.4315, "step": 146600 }, { "epoch": 0.5976235603006043, "grad_norm": 4.000131607055664, "learning_rate": 0.004921244436962065, "loss": 8.3199, "step": 146700 }, { "epoch": 0.5980309383239857, "grad_norm": 7.900701522827148, "learning_rate": 0.004921120729216753, "loss": 8.2887, "step": 146800 }, { "epoch": 0.5984383163473672, "grad_norm": 4.253056049346924, "learning_rate": 0.0049209969259492664, "loss": 8.2676, "step": 146900 }, { "epoch": 0.5988456943707486, "grad_norm": 3.7337450981140137, "learning_rate": 0.0049208730271644964, "loss": 8.4013, "step": 147000 }, { "epoch": 0.5988456943707486, "eval_MaskedAccuracy": 0.45803042398340266, "eval_loss": 1.8447405099868774, "eval_runtime": 566.7738, "eval_samples_per_second": 111.995, "eval_steps_per_second": 0.438, "step": 147000 }, { "epoch": 0.5992530723941301, "grad_norm": 8.883218765258789, "learning_rate": 0.0049207490328673446, "loss": 8.4825, "step": 147100 }, { "epoch": 0.5996604504175116, "grad_norm": 2.6650524139404297, "learning_rate": 0.0049206249430627175, "loss": 8.3655, "step": 147200 }, { "epoch": 0.600067828440893, "grad_norm": 3.044719696044922, "learning_rate": 0.004920500757755516, "loss": 8.5077, "step": 147300 }, { "epoch": 0.6004752064642744, "grad_norm": 3.7706477642059326, "learning_rate": 0.004920376476950655, "loss": 8.5518, "step": 147400 }, { "epoch": 0.600882584487656, "grad_norm": 2.005147933959961, "learning_rate": 0.004920252100653035, "loss": 8.5234, "step": 147500 }, { "epoch": 0.6012899625110374, "grad_norm": 5.612418174743652, "learning_rate": 0.004920127628867589, "loss": 8.5105, "step": 147600 }, { "epoch": 0.6016973405344188, "grad_norm": 6.48015832901001, "learning_rate": 0.004920003061599228, "loss": 8.4509, "step": 147700 }, { "epoch": 0.6021047185578003, "grad_norm": 6.271529197692871, "learning_rate": 0.00491987839885288, "loss": 8.3557, "step": 147800 }, { "epoch": 0.6025120965811818, "grad_norm": 9.45332145690918, "learning_rate": 0.004919753640633472, "loss": 8.3089, "step": 147900 }, { "epoch": 0.6029194746045633, "grad_norm": 6.876777648925781, "learning_rate": 0.004919628786945938, "loss": 8.3361, "step": 148000 }, { "epoch": 0.6029194746045633, "eval_MaskedAccuracy": 0.4786036508942568, "eval_loss": 1.7497118711471558, "eval_runtime": 528.3629, "eval_samples_per_second": 120.137, "eval_steps_per_second": 0.469, "step": 148000 }, { "epoch": 0.6033268526279447, "grad_norm": 7.976334571838379, "learning_rate": 0.004919503837795211, "loss": 8.2906, "step": 148100 }, { "epoch": 0.6037342306513261, "grad_norm": 7.192055702209473, "learning_rate": 0.004919378793186239, "loss": 8.2782, "step": 148200 }, { "epoch": 0.6041416086747077, "grad_norm": 9.158371925354004, "learning_rate": 0.004919253653123958, "loss": 8.2861, "step": 148300 }, { "epoch": 0.6045489866980891, "grad_norm": 6.050761699676514, "learning_rate": 0.004919128417613319, "loss": 8.2923, "step": 148400 }, { "epoch": 0.6049563647214705, "grad_norm": 6.686251640319824, "learning_rate": 0.0049190030866592665, "loss": 8.3018, "step": 148500 }, { "epoch": 0.605363742744852, "grad_norm": 8.751076698303223, "learning_rate": 0.0049188776602667635, "loss": 8.3218, "step": 148600 }, { "epoch": 0.6057711207682335, "grad_norm": 9.721341133117676, "learning_rate": 0.004918752138440773, "loss": 8.2988, "step": 148700 }, { "epoch": 0.6061784987916149, "grad_norm": 9.185450553894043, "learning_rate": 0.004918626521186247, "loss": 8.2807, "step": 148800 }, { "epoch": 0.6065858768149964, "grad_norm": 6.511389255523682, "learning_rate": 0.004918500808508156, "loss": 8.2823, "step": 148900 }, { "epoch": 0.6069932548383778, "grad_norm": 4.399498462677002, "learning_rate": 0.004918375000411468, "loss": 8.4705, "step": 149000 }, { "epoch": 0.6069932548383778, "eval_MaskedAccuracy": 0.46920769914216875, "eval_loss": 1.803868055343628, "eval_runtime": 559.0038, "eval_samples_per_second": 113.552, "eval_steps_per_second": 0.444, "step": 149000 }, { "epoch": 0.6074006328617593, "grad_norm": 5.837223529815674, "learning_rate": 0.004918249096901156, "loss": 8.5529, "step": 149100 }, { "epoch": 0.6078080108851408, "grad_norm": 6.941884994506836, "learning_rate": 0.004918123097982208, "loss": 8.4394, "step": 149200 }, { "epoch": 0.6082153889085222, "grad_norm": 5.623409271240234, "learning_rate": 0.004917997003659595, "loss": 8.4554, "step": 149300 }, { "epoch": 0.6086227669319038, "grad_norm": 7.008805751800537, "learning_rate": 0.004917870813938307, "loss": 8.3244, "step": 149400 }, { "epoch": 0.6090301449552852, "grad_norm": 6.360350131988525, "learning_rate": 0.004917744528823322, "loss": 8.2924, "step": 149500 }, { "epoch": 0.6094375229786666, "grad_norm": 6.548702239990234, "learning_rate": 0.0049176181483196425, "loss": 8.3391, "step": 149600 }, { "epoch": 0.6098449010020481, "grad_norm": 6.634683609008789, "learning_rate": 0.004917491672432266, "loss": 8.2663, "step": 149700 }, { "epoch": 0.6102522790254296, "grad_norm": 5.925806522369385, "learning_rate": 0.004917365101166191, "loss": 8.2479, "step": 149800 }, { "epoch": 0.610659657048811, "grad_norm": 7.2930521965026855, "learning_rate": 0.004917238434526421, "loss": 8.2717, "step": 149900 }, { "epoch": 0.6110670350721925, "grad_norm": 6.2033796310424805, "learning_rate": 0.004917111672517968, "loss": 8.2765, "step": 150000 }, { "epoch": 0.6110670350721925, "eval_MaskedAccuracy": 0.48004479177415543, "eval_loss": 1.741084098815918, "eval_runtime": 552.5727, "eval_samples_per_second": 114.874, "eval_steps_per_second": 0.449, "step": 150000 }, { "epoch": 0.6114744130955739, "grad_norm": 2.6907870769500732, "learning_rate": 0.004916984815145834, "loss": 8.4017, "step": 150100 }, { "epoch": 0.6118817911189554, "grad_norm": 0.7989844679832458, "learning_rate": 0.004916857862415046, "loss": 8.4558, "step": 150200 }, { "epoch": 0.6122891691423369, "grad_norm": 9.022034645080566, "learning_rate": 0.004916730814330616, "loss": 8.6093, "step": 150300 }, { "epoch": 0.6126965471657183, "grad_norm": 8.180450439453125, "learning_rate": 0.004916603670897565, "loss": 8.3932, "step": 150400 }, { "epoch": 0.6131039251890998, "grad_norm": 1.3381363153457642, "learning_rate": 0.004916476432120923, "loss": 8.4623, "step": 150500 }, { "epoch": 0.6135113032124813, "grad_norm": 6.8818511962890625, "learning_rate": 0.0049163490980057185, "loss": 8.5591, "step": 150600 }, { "epoch": 0.6139186812358627, "grad_norm": 6.092988967895508, "learning_rate": 0.004916221668556986, "loss": 8.4932, "step": 150700 }, { "epoch": 0.6143260592592442, "grad_norm": 7.2855448722839355, "learning_rate": 0.004916094143779766, "loss": 8.4204, "step": 150800 }, { "epoch": 0.6147334372826256, "grad_norm": 4.892360210418701, "learning_rate": 0.004915966523679098, "loss": 8.3325, "step": 150900 }, { "epoch": 0.6151408153060071, "grad_norm": 1.7383997440338135, "learning_rate": 0.0049158388082600375, "loss": 8.3469, "step": 151000 }, { "epoch": 0.6151408153060071, "eval_MaskedAccuracy": 0.4731508727785146, "eval_loss": 1.7681819200515747, "eval_runtime": 575.5654, "eval_samples_per_second": 110.285, "eval_steps_per_second": 0.431, "step": 151000 }, { "epoch": 0.6155481933293886, "grad_norm": 36.812713623046875, "learning_rate": 0.004915710997527619, "loss": 8.4677, "step": 151100 }, { "epoch": 0.61595557135277, "grad_norm": 2.0609169006347656, "learning_rate": 0.0049155830914869, "loss": 8.5544, "step": 151200 }, { "epoch": 0.6163629493761514, "grad_norm": 3.7911245822906494, "learning_rate": 0.004915455090142937, "loss": 8.4017, "step": 151300 }, { "epoch": 0.616770327399533, "grad_norm": 6.143694877624512, "learning_rate": 0.004915326993500799, "loss": 8.3425, "step": 151400 }, { "epoch": 0.6171777054229144, "grad_norm": 2.891474485397339, "learning_rate": 0.004915198801565535, "loss": 8.3782, "step": 151500 }, { "epoch": 0.6175850834462958, "grad_norm": 3.7956511974334717, "learning_rate": 0.0049150705143422325, "loss": 8.4126, "step": 151600 }, { "epoch": 0.6179924614696773, "grad_norm": 6.636448383331299, "learning_rate": 0.004914942131835944, "loss": 8.4108, "step": 151700 }, { "epoch": 0.6183998394930588, "grad_norm": 7.5709052085876465, "learning_rate": 0.004914813654051754, "loss": 8.3272, "step": 151800 }, { "epoch": 0.6188072175164403, "grad_norm": 6.166447639465332, "learning_rate": 0.004914685080994747, "loss": 8.2731, "step": 151900 }, { "epoch": 0.6192145955398217, "grad_norm": 8.206730842590332, "learning_rate": 0.00491455641267, "loss": 8.2621, "step": 152000 }, { "epoch": 0.6192145955398217, "eval_MaskedAccuracy": 0.48002182329563664, "eval_loss": 1.7566319704055786, "eval_runtime": 512.7412, "eval_samples_per_second": 123.797, "eval_steps_per_second": 0.484, "step": 152000 }, { "epoch": 0.6196219735632031, "grad_norm": 9.208566665649414, "learning_rate": 0.004914427649082595, "loss": 8.2899, "step": 152100 }, { "epoch": 0.6200293515865847, "grad_norm": 6.308719635009766, "learning_rate": 0.0049142987902376315, "loss": 8.2675, "step": 152200 }, { "epoch": 0.6204367296099661, "grad_norm": 3.0895836353302, "learning_rate": 0.004914169836140206, "loss": 8.3639, "step": 152300 }, { "epoch": 0.6208441076333475, "grad_norm": 3.028787136077881, "learning_rate": 0.004914040786795409, "loss": 8.5091, "step": 152400 }, { "epoch": 0.621251485656729, "grad_norm": 7.628536701202393, "learning_rate": 0.004913911642208341, "loss": 8.3584, "step": 152500 }, { "epoch": 0.6216588636801105, "grad_norm": 14.235428810119629, "learning_rate": 0.004913782402384117, "loss": 8.3257, "step": 152600 }, { "epoch": 0.6220662417034919, "grad_norm": 12.227068901062012, "learning_rate": 0.00491365306732784, "loss": 8.4322, "step": 152700 }, { "epoch": 0.6224736197268734, "grad_norm": 1.0032848119735718, "learning_rate": 0.004913523637044627, "loss": 8.4773, "step": 152800 }, { "epoch": 0.6228809977502549, "grad_norm": 0.7832779884338379, "learning_rate": 0.004913394111539591, "loss": 8.4854, "step": 152900 }, { "epoch": 0.6232883757736364, "grad_norm": 34.24857711791992, "learning_rate": 0.004913264490817857, "loss": 8.5275, "step": 153000 }, { "epoch": 0.6232883757736364, "eval_MaskedAccuracy": 0.46698241044431904, "eval_loss": 1.8068398237228394, "eval_runtime": 561.1496, "eval_samples_per_second": 113.118, "eval_steps_per_second": 0.442, "step": 153000 }, { "epoch": 0.6236957537970178, "grad_norm": 5.277442455291748, "learning_rate": 0.004913134774884545, "loss": 8.4271, "step": 153100 }, { "epoch": 0.6241031318203992, "grad_norm": 9.542320251464844, "learning_rate": 0.0049130049637447805, "loss": 8.3496, "step": 153200 }, { "epoch": 0.6245105098437808, "grad_norm": 2.137045383453369, "learning_rate": 0.004912875057403706, "loss": 8.3199, "step": 153300 }, { "epoch": 0.6249178878671622, "grad_norm": 4.267445087432861, "learning_rate": 0.0049127450558664534, "loss": 8.4279, "step": 153400 }, { "epoch": 0.6253252658905436, "grad_norm": 4.157717227935791, "learning_rate": 0.0049126149591381596, "loss": 8.453, "step": 153500 }, { "epoch": 0.6257326439139251, "grad_norm": 7.502955913543701, "learning_rate": 0.00491248476722397, "loss": 8.4648, "step": 153600 }, { "epoch": 0.6261400219373066, "grad_norm": 8.893965721130371, "learning_rate": 0.004912354480129033, "loss": 8.4351, "step": 153700 }, { "epoch": 0.626547399960688, "grad_norm": 4.194019317626953, "learning_rate": 0.004912224097858498, "loss": 8.3335, "step": 153800 }, { "epoch": 0.6269547779840695, "grad_norm": 5.781638145446777, "learning_rate": 0.0049120936204175136, "loss": 8.3219, "step": 153900 }, { "epoch": 0.6273621560074509, "grad_norm": 4.300761699676514, "learning_rate": 0.004911963047811252, "loss": 8.3777, "step": 154000 }, { "epoch": 0.6273621560074509, "eval_MaskedAccuracy": 0.4780379626562326, "eval_loss": 1.7531262636184692, "eval_runtime": 613.0712, "eval_samples_per_second": 103.538, "eval_steps_per_second": 0.405, "step": 154000 }, { "epoch": 0.6277695340308324, "grad_norm": 5.300182342529297, "learning_rate": 0.004911832380044862, "loss": 8.2957, "step": 154100 }, { "epoch": 0.6281769120542139, "grad_norm": 7.4471025466918945, "learning_rate": 0.004911701617123519, "loss": 8.2896, "step": 154200 }, { "epoch": 0.6285842900775953, "grad_norm": 5.349423408508301, "learning_rate": 0.0049115707590523874, "loss": 8.3444, "step": 154300 }, { "epoch": 0.6289916681009768, "grad_norm": 6.0373053550720215, "learning_rate": 0.004911439805836638, "loss": 8.2733, "step": 154400 }, { "epoch": 0.6293990461243583, "grad_norm": 8.793858528137207, "learning_rate": 0.004911308757481456, "loss": 8.2432, "step": 154500 }, { "epoch": 0.6298064241477397, "grad_norm": 1.461338996887207, "learning_rate": 0.004911177613992021, "loss": 8.3495, "step": 154600 }, { "epoch": 0.6302138021711212, "grad_norm": 4.147045135498047, "learning_rate": 0.004911046375373516, "loss": 8.5307, "step": 154700 }, { "epoch": 0.6306211801945026, "grad_norm": 6.397643089294434, "learning_rate": 0.004910915041631126, "loss": 8.4319, "step": 154800 }, { "epoch": 0.6310285582178841, "grad_norm": 6.000017166137695, "learning_rate": 0.004910783612770045, "loss": 8.3459, "step": 154900 }, { "epoch": 0.6314359362412656, "grad_norm": 4.597378730773926, "learning_rate": 0.0049106520887954724, "loss": 8.3301, "step": 155000 }, { "epoch": 0.6314359362412656, "eval_MaskedAccuracy": 0.4786940926800944, "eval_loss": 1.7530438899993896, "eval_runtime": 550.1194, "eval_samples_per_second": 115.386, "eval_steps_per_second": 0.451, "step": 155000 }, { "epoch": 0.631843314264647, "grad_norm": 5.755496978759766, "learning_rate": 0.004910520469712606, "loss": 8.3068, "step": 155100 }, { "epoch": 0.6322506922880284, "grad_norm": 8.617079734802246, "learning_rate": 0.004910388755526653, "loss": 8.2794, "step": 155200 }, { "epoch": 0.63265807031141, "grad_norm": 2.8633389472961426, "learning_rate": 0.004910256946242821, "loss": 8.2812, "step": 155300 }, { "epoch": 0.6330654483347914, "grad_norm": 4.23527193069458, "learning_rate": 0.004910125041866313, "loss": 8.4366, "step": 155400 }, { "epoch": 0.6334728263581728, "grad_norm": 9.153361320495605, "learning_rate": 0.00490999304240235, "loss": 8.4435, "step": 155500 }, { "epoch": 0.6338802043815543, "grad_norm": 1.2283235788345337, "learning_rate": 0.004909860947856153, "loss": 8.5084, "step": 155600 }, { "epoch": 0.6342875824049358, "grad_norm": 6.179910182952881, "learning_rate": 0.004909728758232938, "loss": 8.4251, "step": 155700 }, { "epoch": 0.6346949604283173, "grad_norm": 9.873910903930664, "learning_rate": 0.004909596473537926, "loss": 8.3496, "step": 155800 }, { "epoch": 0.6351023384516987, "grad_norm": 0.6780279874801636, "learning_rate": 0.004909464093776359, "loss": 8.4366, "step": 155900 }, { "epoch": 0.6355097164750801, "grad_norm": 6.819117546081543, "learning_rate": 0.004909331618953468, "loss": 8.4343, "step": 156000 }, { "epoch": 0.6355097164750801, "eval_MaskedAccuracy": 0.4746228182690614, "eval_loss": 1.7684835195541382, "eval_runtime": 609.0402, "eval_samples_per_second": 104.223, "eval_steps_per_second": 0.407, "step": 156000 }, { "epoch": 0.6359170944984617, "grad_norm": 7.058218955993652, "learning_rate": 0.0049091990490744905, "loss": 8.3529, "step": 156100 }, { "epoch": 0.6363244725218431, "grad_norm": 2.344837188720703, "learning_rate": 0.004909066384144668, "loss": 8.4224, "step": 156200 }, { "epoch": 0.6367318505452245, "grad_norm": 7.422060489654541, "learning_rate": 0.00490893362416924, "loss": 8.4797, "step": 156300 }, { "epoch": 0.637139228568606, "grad_norm": 6.679649829864502, "learning_rate": 0.004908800769153458, "loss": 8.4663, "step": 156400 }, { "epoch": 0.6375466065919875, "grad_norm": 1.7687937021255493, "learning_rate": 0.004908667819102575, "loss": 8.3526, "step": 156500 }, { "epoch": 0.6379539846153689, "grad_norm": 4.981142997741699, "learning_rate": 0.004908534774021848, "loss": 8.3066, "step": 156600 }, { "epoch": 0.6383613626387504, "grad_norm": 5.302996635437012, "learning_rate": 0.004908401633916531, "loss": 8.3902, "step": 156700 }, { "epoch": 0.6387687406621319, "grad_norm": 9.538987159729004, "learning_rate": 0.0049082683987919, "loss": 8.3133, "step": 156800 }, { "epoch": 0.6391761186855134, "grad_norm": 1.8474633693695068, "learning_rate": 0.004908135068653213, "loss": 8.3133, "step": 156900 }, { "epoch": 0.6395834967088948, "grad_norm": 8.414902687072754, "learning_rate": 0.004908001643505744, "loss": 8.4922, "step": 157000 }, { "epoch": 0.6395834967088948, "eval_MaskedAccuracy": 0.47553048548253046, "eval_loss": 1.7700469493865967, "eval_runtime": 577.0729, "eval_samples_per_second": 109.996, "eval_steps_per_second": 0.43, "step": 157000 }, { "epoch": 0.6399908747322762, "grad_norm": 3.761073350906372, "learning_rate": 0.004907868123354759, "loss": 8.3444, "step": 157100 }, { "epoch": 0.6403982527556578, "grad_norm": 1.2089842557907104, "learning_rate": 0.004907734508205559, "loss": 8.4, "step": 157200 }, { "epoch": 0.6408056307790392, "grad_norm": 8.433003425598145, "learning_rate": 0.004907600798063412, "loss": 8.4174, "step": 157300 }, { "epoch": 0.6412130088024206, "grad_norm": 3.711627721786499, "learning_rate": 0.004907466992933599, "loss": 8.3402, "step": 157400 }, { "epoch": 0.6416203868258021, "grad_norm": 8.343791007995605, "learning_rate": 0.004907333092821423, "loss": 8.2904, "step": 157500 }, { "epoch": 0.6420277648491836, "grad_norm": 4.790099620819092, "learning_rate": 0.004907199097732165, "loss": 8.2752, "step": 157600 }, { "epoch": 0.642435142872565, "grad_norm": 6.770668983459473, "learning_rate": 0.004907065007671135, "loss": 8.2258, "step": 157700 }, { "epoch": 0.6428425208959465, "grad_norm": 6.104489803314209, "learning_rate": 0.0049069308226436245, "loss": 8.2567, "step": 157800 }, { "epoch": 0.6432498989193279, "grad_norm": 4.77765417098999, "learning_rate": 0.004906796542654943, "loss": 8.2373, "step": 157900 }, { "epoch": 0.6436572769427094, "grad_norm": 30.578868865966797, "learning_rate": 0.004906662167710398, "loss": 8.2344, "step": 158000 }, { "epoch": 0.6436572769427094, "eval_MaskedAccuracy": 0.4684047134907011, "eval_loss": 1.8017596006393433, "eval_runtime": 636.8838, "eval_samples_per_second": 99.667, "eval_steps_per_second": 0.389, "step": 158000 }, { "epoch": 0.6440646549660909, "grad_norm": 4.308048248291016, "learning_rate": 0.004906527697815302, "loss": 8.5158, "step": 158100 }, { "epoch": 0.6444720329894723, "grad_norm": 9.857576370239258, "learning_rate": 0.004906393132974976, "loss": 8.498, "step": 158200 }, { "epoch": 0.6448794110128538, "grad_norm": 5.082419395446777, "learning_rate": 0.004906258473194737, "loss": 8.4119, "step": 158300 }, { "epoch": 0.6452867890362353, "grad_norm": 1.5055186748504639, "learning_rate": 0.004906123718479909, "loss": 8.4332, "step": 158400 }, { "epoch": 0.6456941670596167, "grad_norm": 5.3352580070495605, "learning_rate": 0.004905988868835811, "loss": 8.4906, "step": 158500 }, { "epoch": 0.6461015450829982, "grad_norm": 3.541630268096924, "learning_rate": 0.004905853924267782, "loss": 8.3953, "step": 158600 }, { "epoch": 0.6465089231063796, "grad_norm": 4.715964317321777, "learning_rate": 0.00490571888478116, "loss": 8.2978, "step": 158700 }, { "epoch": 0.6469163011297611, "grad_norm": 4.749425888061523, "learning_rate": 0.004905583750381281, "loss": 8.4208, "step": 158800 }, { "epoch": 0.6473236791531426, "grad_norm": 5.415860652923584, "learning_rate": 0.004905448521073488, "loss": 8.4136, "step": 158900 }, { "epoch": 0.647731057176524, "grad_norm": 5.361938953399658, "learning_rate": 0.004905313196863122, "loss": 8.2605, "step": 159000 }, { "epoch": 0.647731057176524, "eval_MaskedAccuracy": 0.47887928738637614, "eval_loss": 1.7427128553390503, "eval_runtime": 652.528, "eval_samples_per_second": 97.277, "eval_steps_per_second": 0.38, "step": 159000 }, { "epoch": 0.6481384351999054, "grad_norm": 4.954558372497559, "learning_rate": 0.004905177777755537, "loss": 8.2781, "step": 159100 }, { "epoch": 0.648545813223287, "grad_norm": 5.150397300720215, "learning_rate": 0.004905042263756091, "loss": 8.2828, "step": 159200 }, { "epoch": 0.6489531912466684, "grad_norm": 7.26869535446167, "learning_rate": 0.004904906654870136, "loss": 8.254, "step": 159300 }, { "epoch": 0.6493605692700499, "grad_norm": 2.4618828296661377, "learning_rate": 0.004904770951103038, "loss": 8.2973, "step": 159400 }, { "epoch": 0.6497679472934313, "grad_norm": 9.42536449432373, "learning_rate": 0.0049046351524601565, "loss": 8.3643, "step": 159500 }, { "epoch": 0.6501753253168128, "grad_norm": 8.196399688720703, "learning_rate": 0.004904499258946864, "loss": 8.4622, "step": 159600 }, { "epoch": 0.6505827033401943, "grad_norm": 5.312618255615234, "learning_rate": 0.004904363270568535, "loss": 8.2905, "step": 159700 }, { "epoch": 0.6509900813635757, "grad_norm": 4.90754508972168, "learning_rate": 0.004904227187330539, "loss": 8.2546, "step": 159800 }, { "epoch": 0.6513974593869571, "grad_norm": 6.547611236572266, "learning_rate": 0.004904091009238261, "loss": 8.2654, "step": 159900 }, { "epoch": 0.6518048374103387, "grad_norm": 4.732059478759766, "learning_rate": 0.004903954736297084, "loss": 8.2189, "step": 160000 }, { "epoch": 0.6518048374103387, "eval_MaskedAccuracy": 0.48032427367158237, "eval_loss": 1.7542961835861206, "eval_runtime": 711.8517, "eval_samples_per_second": 89.17, "eval_steps_per_second": 0.348, "step": 160000 }, { "epoch": 0.6522122154337201, "grad_norm": 4.2720184326171875, "learning_rate": 0.004903818368512396, "loss": 8.2616, "step": 160100 }, { "epoch": 0.6526195934571015, "grad_norm": 5.928277492523193, "learning_rate": 0.0049036819058895925, "loss": 8.2555, "step": 160200 }, { "epoch": 0.653026971480483, "grad_norm": 7.389522552490234, "learning_rate": 0.004903545348434064, "loss": 8.2081, "step": 160300 }, { "epoch": 0.6534343495038645, "grad_norm": 8.315999984741211, "learning_rate": 0.004903408696151203, "loss": 8.2393, "step": 160400 }, { "epoch": 0.6538417275272459, "grad_norm": 5.6302571296691895, "learning_rate": 0.0049032719490464255, "loss": 8.2477, "step": 160500 }, { "epoch": 0.6542491055506274, "grad_norm": 6.31380558013916, "learning_rate": 0.004903135107125126, "loss": 8.2348, "step": 160600 }, { "epoch": 0.6546564835740089, "grad_norm": 1.6448034048080444, "learning_rate": 0.0049029981703927125, "loss": 8.2952, "step": 160700 }, { "epoch": 0.6550638615973904, "grad_norm": 2.5281286239624023, "learning_rate": 0.004902861138854616, "loss": 8.5663, "step": 160800 }, { "epoch": 0.6554712396207718, "grad_norm": 5.702998161315918, "learning_rate": 0.00490272401251624, "loss": 8.5445, "step": 160900 }, { "epoch": 0.6558786176441532, "grad_norm": 7.67074728012085, "learning_rate": 0.004902586791383006, "loss": 8.5403, "step": 161000 }, { "epoch": 0.6558786176441532, "eval_MaskedAccuracy": 0.4705086464907593, "eval_loss": 1.8039331436157227, "eval_runtime": 632.0011, "eval_samples_per_second": 100.437, "eval_steps_per_second": 0.392, "step": 161000 }, { "epoch": 0.6562859956675348, "grad_norm": 4.913644790649414, "learning_rate": 0.004902449475460347, "loss": 8.397, "step": 161100 }, { "epoch": 0.6566933736909162, "grad_norm": 6.445557117462158, "learning_rate": 0.004902312064753687, "loss": 8.3314, "step": 161200 }, { "epoch": 0.6571007517142976, "grad_norm": 10.202012062072754, "learning_rate": 0.0049021745592684645, "loss": 8.2995, "step": 161300 }, { "epoch": 0.6575081297376791, "grad_norm": 7.1944403648376465, "learning_rate": 0.004902036959010109, "loss": 8.2713, "step": 161400 }, { "epoch": 0.6579155077610606, "grad_norm": 6.703820705413818, "learning_rate": 0.004901899263984063, "loss": 8.265, "step": 161500 }, { "epoch": 0.658322885784442, "grad_norm": 1.373208999633789, "learning_rate": 0.004901761474195769, "loss": 8.2205, "step": 161600 }, { "epoch": 0.6587302638078235, "grad_norm": 9.944040298461914, "learning_rate": 0.004901623589650673, "loss": 8.4279, "step": 161700 }, { "epoch": 0.6591376418312049, "grad_norm": 1.1036696434020996, "learning_rate": 0.004901485610354228, "loss": 8.4604, "step": 161800 }, { "epoch": 0.6595450198545865, "grad_norm": 1.1476789712905884, "learning_rate": 0.00490134753631189, "loss": 8.5233, "step": 161900 }, { "epoch": 0.6599523978779679, "grad_norm": 2.216991662979126, "learning_rate": 0.004901209367529117, "loss": 8.5094, "step": 162000 }, { "epoch": 0.6599523978779679, "eval_MaskedAccuracy": 0.4710504473440415, "eval_loss": 1.7816741466522217, "eval_runtime": 691.9791, "eval_samples_per_second": 91.731, "eval_steps_per_second": 0.358, "step": 162000 }, { "epoch": 0.6603597759013493, "grad_norm": 8.646736145019531, "learning_rate": 0.004901071104011381, "loss": 8.4393, "step": 162100 }, { "epoch": 0.6607671539247308, "grad_norm": 4.416395664215088, "learning_rate": 0.00490093274576413, "loss": 8.3638, "step": 162200 }, { "epoch": 0.6611745319481123, "grad_norm": 5.148403167724609, "learning_rate": 0.004900794292792844, "loss": 8.3285, "step": 162300 }, { "epoch": 0.6615819099714937, "grad_norm": 1.9926255941390991, "learning_rate": 0.004900655745102995, "loss": 8.3124, "step": 162400 }, { "epoch": 0.6619892879948752, "grad_norm": 6.878363132476807, "learning_rate": 0.004900517102700059, "loss": 8.3455, "step": 162500 }, { "epoch": 0.6623966660182566, "grad_norm": 8.724032402038574, "learning_rate": 0.004900378365589525, "loss": 8.331, "step": 162600 }, { "epoch": 0.6628040440416381, "grad_norm": 8.976378440856934, "learning_rate": 0.004900239533776872, "loss": 8.2772, "step": 162700 }, { "epoch": 0.6632114220650196, "grad_norm": 4.16154146194458, "learning_rate": 0.004900100607267592, "loss": 8.2905, "step": 162800 }, { "epoch": 0.663618800088401, "grad_norm": 10.567460060119629, "learning_rate": 0.004899961586067173, "loss": 8.4158, "step": 162900 }, { "epoch": 0.6640261781117824, "grad_norm": 3.53060245513916, "learning_rate": 0.004899822470181118, "loss": 8.3608, "step": 163000 }, { "epoch": 0.6640261781117824, "eval_MaskedAccuracy": 0.47414340365640284, "eval_loss": 1.773229956626892, "eval_runtime": 641.1106, "eval_samples_per_second": 99.009, "eval_steps_per_second": 0.387, "step": 163000 }, { "epoch": 0.664433556135164, "grad_norm": 4.364983558654785, "learning_rate": 0.0048996832596149235, "loss": 8.3465, "step": 163100 }, { "epoch": 0.6648409341585454, "grad_norm": 3.7606041431427, "learning_rate": 0.004899543954374083, "loss": 8.3469, "step": 163200 }, { "epoch": 0.6652483121819269, "grad_norm": 5.685455799102783, "learning_rate": 0.004899404554464118, "loss": 8.2964, "step": 163300 }, { "epoch": 0.6656556902053083, "grad_norm": 10.511970520019531, "learning_rate": 0.004899265059890539, "loss": 8.4242, "step": 163400 }, { "epoch": 0.6660630682286898, "grad_norm": 5.845661640167236, "learning_rate": 0.004899125470658849, "loss": 8.4663, "step": 163500 }, { "epoch": 0.6664704462520713, "grad_norm": 6.697454929351807, "learning_rate": 0.004898985786774578, "loss": 8.3532, "step": 163600 }, { "epoch": 0.6668778242754527, "grad_norm": 6.9508376121521, "learning_rate": 0.004898846008243234, "loss": 8.3082, "step": 163700 }, { "epoch": 0.6672852022988341, "grad_norm": 7.337489604949951, "learning_rate": 0.00489870613507036, "loss": 8.293, "step": 163800 }, { "epoch": 0.6676925803222157, "grad_norm": 6.600030422210693, "learning_rate": 0.00489856616726148, "loss": 8.2511, "step": 163900 }, { "epoch": 0.6680999583455971, "grad_norm": 8.571969032287598, "learning_rate": 0.0048984261048221275, "loss": 8.2392, "step": 164000 }, { "epoch": 0.6680999583455971, "eval_MaskedAccuracy": 0.48152138193449695, "eval_loss": 1.7331830263137817, "eval_runtime": 525.0777, "eval_samples_per_second": 120.889, "eval_steps_per_second": 0.472, "step": 164000 }, { "epoch": 0.6685073363689785, "grad_norm": 5.374701499938965, "learning_rate": 0.004898285947757848, "loss": 8.2564, "step": 164100 }, { "epoch": 0.66891471439236, "grad_norm": 0.8485134840011597, "learning_rate": 0.004898145696074172, "loss": 8.3965, "step": 164200 }, { "epoch": 0.6693220924157415, "grad_norm": 7.08267879486084, "learning_rate": 0.004898005349776646, "loss": 8.4201, "step": 164300 }, { "epoch": 0.669729470439123, "grad_norm": 9.659616470336914, "learning_rate": 0.004897864908870817, "loss": 8.2908, "step": 164400 }, { "epoch": 0.6701368484625044, "grad_norm": 2.042405366897583, "learning_rate": 0.004897724373362244, "loss": 8.3799, "step": 164500 }, { "epoch": 0.6705442264858859, "grad_norm": 3.2800698280334473, "learning_rate": 0.004897583743256474, "loss": 8.5298, "step": 164600 }, { "epoch": 0.6709516045092674, "grad_norm": 2.759813070297241, "learning_rate": 0.004897443018559074, "loss": 8.4652, "step": 164700 }, { "epoch": 0.6713589825326488, "grad_norm": 5.804178237915039, "learning_rate": 0.00489730219927561, "loss": 8.3006, "step": 164800 }, { "epoch": 0.6717663605560302, "grad_norm": 3.275555372238159, "learning_rate": 0.004897161285411646, "loss": 8.3407, "step": 164900 }, { "epoch": 0.6721737385794118, "grad_norm": 6.999525547027588, "learning_rate": 0.004897020276972746, "loss": 8.3238, "step": 165000 }, { "epoch": 0.6721737385794118, "eval_MaskedAccuracy": 0.47875500870127324, "eval_loss": 1.7594870328903198, "eval_runtime": 658.3203, "eval_samples_per_second": 96.421, "eval_steps_per_second": 0.377, "step": 165000 }, { "epoch": 0.6725811166027932, "grad_norm": 5.938575267791748, "learning_rate": 0.0048968791739644945, "loss": 8.2815, "step": 165100 }, { "epoch": 0.6729884946261746, "grad_norm": 9.04610824584961, "learning_rate": 0.004896737976392463, "loss": 8.2412, "step": 165200 }, { "epoch": 0.6733958726495561, "grad_norm": 5.345634460449219, "learning_rate": 0.0048965966842622445, "loss": 8.2667, "step": 165300 }, { "epoch": 0.6738032506729376, "grad_norm": 8.443628311157227, "learning_rate": 0.0048964552975794134, "loss": 8.2333, "step": 165400 }, { "epoch": 0.674210628696319, "grad_norm": 4.402365684509277, "learning_rate": 0.004896313816349567, "loss": 8.3441, "step": 165500 }, { "epoch": 0.6746180067197005, "grad_norm": 10.775688171386719, "learning_rate": 0.0048961722405782985, "loss": 8.3199, "step": 165600 }, { "epoch": 0.6750253847430819, "grad_norm": 3.270447254180908, "learning_rate": 0.004896030570271197, "loss": 8.528, "step": 165700 }, { "epoch": 0.6754327627664635, "grad_norm": 4.356602191925049, "learning_rate": 0.004895888805433864, "loss": 8.3563, "step": 165800 }, { "epoch": 0.6758401407898449, "grad_norm": 7.3610100746154785, "learning_rate": 0.0048957469460719095, "loss": 8.3676, "step": 165900 }, { "epoch": 0.6762475188132263, "grad_norm": 9.1506986618042, "learning_rate": 0.004895604992190948, "loss": 8.3984, "step": 166000 }, { "epoch": 0.6762475188132263, "eval_MaskedAccuracy": 0.4770375833347552, "eval_loss": 1.7629963159561157, "eval_runtime": 551.11, "eval_samples_per_second": 115.178, "eval_steps_per_second": 0.45, "step": 166000 }, { "epoch": 0.6766548968366078, "grad_norm": 10.568071365356445, "learning_rate": 0.004895462943796584, "loss": 8.2751, "step": 166100 }, { "epoch": 0.6770622748599893, "grad_norm": 4.640679836273193, "learning_rate": 0.004895320800894436, "loss": 8.2899, "step": 166200 }, { "epoch": 0.6774696528833707, "grad_norm": 7.352025032043457, "learning_rate": 0.004895178563490116, "loss": 8.363, "step": 166300 }, { "epoch": 0.6778770309067522, "grad_norm": 9.487617492675781, "learning_rate": 0.004895036231589264, "loss": 8.3213, "step": 166400 }, { "epoch": 0.6782844089301336, "grad_norm": 5.678429126739502, "learning_rate": 0.004894893805197495, "loss": 8.2882, "step": 166500 }, { "epoch": 0.6786917869535151, "grad_norm": 6.872136116027832, "learning_rate": 0.004894751284320436, "loss": 8.3267, "step": 166600 }, { "epoch": 0.6790991649768966, "grad_norm": 9.50109577178955, "learning_rate": 0.004894608668963724, "loss": 8.4098, "step": 166700 }, { "epoch": 0.679506543000278, "grad_norm": 17.34059715270996, "learning_rate": 0.004894465959133007, "loss": 8.4633, "step": 166800 }, { "epoch": 0.6799139210236596, "grad_norm": 1.1411833763122559, "learning_rate": 0.004894323154833926, "loss": 8.4391, "step": 166900 }, { "epoch": 0.680321299047041, "grad_norm": 5.388540744781494, "learning_rate": 0.004894180256072116, "loss": 8.3438, "step": 167000 }, { "epoch": 0.680321299047041, "eval_MaskedAccuracy": 0.47816457264364465, "eval_loss": 1.7510942220687866, "eval_runtime": 621.2353, "eval_samples_per_second": 102.177, "eval_steps_per_second": 0.399, "step": 167000 }, { "epoch": 0.6807286770704224, "grad_norm": 6.737392902374268, "learning_rate": 0.004894037262853236, "loss": 8.2886, "step": 167100 }, { "epoch": 0.6811360550938039, "grad_norm": 6.978421211242676, "learning_rate": 0.00489389417518293, "loss": 8.3475, "step": 167200 }, { "epoch": 0.6815434331171853, "grad_norm": 0.6066324710845947, "learning_rate": 0.0048937509930668584, "loss": 8.3985, "step": 167300 }, { "epoch": 0.6819508111405668, "grad_norm": 10.237112998962402, "learning_rate": 0.0048936077165106845, "loss": 8.4404, "step": 167400 }, { "epoch": 0.6823581891639483, "grad_norm": 6.3599748611450195, "learning_rate": 0.004893464345520071, "loss": 8.3485, "step": 167500 }, { "epoch": 0.6827655671873297, "grad_norm": 5.881896495819092, "learning_rate": 0.004893320880100694, "loss": 8.2703, "step": 167600 }, { "epoch": 0.6831729452107111, "grad_norm": 18.456289291381836, "learning_rate": 0.0048931773202582195, "loss": 8.2824, "step": 167700 }, { "epoch": 0.6835803232340927, "grad_norm": 6.398762226104736, "learning_rate": 0.00489303366599832, "loss": 8.4018, "step": 167800 }, { "epoch": 0.6839877012574741, "grad_norm": 3.5661845207214355, "learning_rate": 0.004892889917326678, "loss": 8.4498, "step": 167900 }, { "epoch": 0.6843950792808555, "grad_norm": 2.394167184829712, "learning_rate": 0.004892746074248975, "loss": 8.4847, "step": 168000 }, { "epoch": 0.6843950792808555, "eval_MaskedAccuracy": 0.47158817847142775, "eval_loss": 1.7774649858474731, "eval_runtime": 569.0822, "eval_samples_per_second": 111.541, "eval_steps_per_second": 0.436, "step": 168000 }, { "epoch": 0.6848024573042371, "grad_norm": 7.174911975860596, "learning_rate": 0.004892602136770898, "loss": 8.4165, "step": 168100 }, { "epoch": 0.6852098353276185, "grad_norm": 3.2920947074890137, "learning_rate": 0.0048924581048981446, "loss": 8.3214, "step": 168200 }, { "epoch": 0.685617213351, "grad_norm": 12.373799324035645, "learning_rate": 0.004892313978636404, "loss": 8.3641, "step": 168300 }, { "epoch": 0.6860245913743814, "grad_norm": 3.5428054332733154, "learning_rate": 0.004892169757991367, "loss": 8.3479, "step": 168400 }, { "epoch": 0.6864319693977629, "grad_norm": 5.213382244110107, "learning_rate": 0.004892025442968745, "loss": 8.2797, "step": 168500 }, { "epoch": 0.6868393474211444, "grad_norm": 9.658862113952637, "learning_rate": 0.004891881033574241, "loss": 8.2737, "step": 168600 }, { "epoch": 0.6872467254445258, "grad_norm": 4.990167140960693, "learning_rate": 0.0048917365298135635, "loss": 8.333, "step": 168700 }, { "epoch": 0.6876541034679072, "grad_norm": 7.842627048492432, "learning_rate": 0.0048915919316924236, "loss": 8.3933, "step": 168800 }, { "epoch": 0.6880614814912888, "grad_norm": 2.300215721130371, "learning_rate": 0.004891447239216542, "loss": 8.2848, "step": 168900 }, { "epoch": 0.6884688595146702, "grad_norm": 4.792848110198975, "learning_rate": 0.004891302452391639, "loss": 8.261, "step": 169000 }, { "epoch": 0.6884688595146702, "eval_MaskedAccuracy": 0.480171492947199, "eval_loss": 1.7475031614303589, "eval_runtime": 656.8193, "eval_samples_per_second": 96.642, "eval_steps_per_second": 0.378, "step": 169000 }, { "epoch": 0.6888762375380516, "grad_norm": 8.848061561584473, "learning_rate": 0.0048911575712234315, "loss": 8.2706, "step": 169100 }, { "epoch": 0.6892836155614331, "grad_norm": 3.295029640197754, "learning_rate": 0.004891012595717655, "loss": 8.3237, "step": 169200 }, { "epoch": 0.6896909935848146, "grad_norm": 5.978100776672363, "learning_rate": 0.004890867525880042, "loss": 8.4107, "step": 169300 }, { "epoch": 0.6900983716081961, "grad_norm": 0.6782967448234558, "learning_rate": 0.0048907223617163246, "loss": 8.4126, "step": 169400 }, { "epoch": 0.6905057496315775, "grad_norm": 6.610276222229004, "learning_rate": 0.004890577103232248, "loss": 8.4434, "step": 169500 }, { "epoch": 0.6909131276549589, "grad_norm": 0.7157424688339233, "learning_rate": 0.004890431750433545, "loss": 8.4295, "step": 169600 }, { "epoch": 0.6913205056783405, "grad_norm": 10.258817672729492, "learning_rate": 0.004890286303325968, "loss": 8.3865, "step": 169700 }, { "epoch": 0.6917278837017219, "grad_norm": 2.953791856765747, "learning_rate": 0.004890140761915269, "loss": 8.2532, "step": 169800 }, { "epoch": 0.6921352617251033, "grad_norm": 5.6295270919799805, "learning_rate": 0.004889995126207192, "loss": 8.289, "step": 169900 }, { "epoch": 0.6925426397484848, "grad_norm": 0.6412755250930786, "learning_rate": 0.0048898493962075, "loss": 8.2589, "step": 170000 }, { "epoch": 0.6925426397484848, "eval_MaskedAccuracy": 0.4773358240766947, "eval_loss": 1.7543995380401611, "eval_runtime": 609.8017, "eval_samples_per_second": 104.093, "eval_steps_per_second": 0.407, "step": 170000 }, { "epoch": 0.6929500177718663, "grad_norm": 0.6819936037063599, "learning_rate": 0.004889703571921961, "loss": 8.36, "step": 170100 }, { "epoch": 0.6933573957952477, "grad_norm": 4.7626214027404785, "learning_rate": 0.004889557653356331, "loss": 8.4279, "step": 170200 }, { "epoch": 0.6937647738186292, "grad_norm": 10.89247989654541, "learning_rate": 0.004889411640516388, "loss": 8.3306, "step": 170300 }, { "epoch": 0.6941721518420106, "grad_norm": 0.7468447089195251, "learning_rate": 0.004889265533407896, "loss": 8.3002, "step": 170400 }, { "epoch": 0.6945795298653921, "grad_norm": 3.9062106609344482, "learning_rate": 0.004889119332036641, "loss": 8.408, "step": 170500 }, { "epoch": 0.6949869078887736, "grad_norm": 8.741107940673828, "learning_rate": 0.004888973036408398, "loss": 8.3202, "step": 170600 }, { "epoch": 0.695394285912155, "grad_norm": 3.7794833183288574, "learning_rate": 0.004888826646528943, "loss": 8.3113, "step": 170700 }, { "epoch": 0.6958016639355366, "grad_norm": 1.2744481563568115, "learning_rate": 0.004888680162404073, "loss": 8.4118, "step": 170800 }, { "epoch": 0.696209041958918, "grad_norm": 9.467698097229004, "learning_rate": 0.00488853358403958, "loss": 8.3829, "step": 170900 }, { "epoch": 0.6966164199822994, "grad_norm": 1.5702000856399536, "learning_rate": 0.0048883869114412564, "loss": 8.4425, "step": 171000 }, { "epoch": 0.6966164199822994, "eval_MaskedAccuracy": 0.47248101723780406, "eval_loss": 1.7865709066390991, "eval_runtime": 538.5232, "eval_samples_per_second": 117.87, "eval_steps_per_second": 0.461, "step": 171000 }, { "epoch": 0.6970237980056809, "grad_norm": 9.437499046325684, "learning_rate": 0.004888240144614903, "loss": 8.4318, "step": 171100 }, { "epoch": 0.6974311760290623, "grad_norm": 3.6925880908966064, "learning_rate": 0.004888093283566316, "loss": 8.4064, "step": 171200 }, { "epoch": 0.6978385540524438, "grad_norm": 5.525794506072998, "learning_rate": 0.004887946328301304, "loss": 8.3119, "step": 171300 }, { "epoch": 0.6982459320758253, "grad_norm": 5.053781032562256, "learning_rate": 0.004887799278825681, "loss": 8.2764, "step": 171400 }, { "epoch": 0.6986533100992067, "grad_norm": 5.076804161071777, "learning_rate": 0.004887652135145267, "loss": 8.2487, "step": 171500 }, { "epoch": 0.6990606881225881, "grad_norm": 2.078188896179199, "learning_rate": 0.004887504897265856, "loss": 8.283, "step": 171600 }, { "epoch": 0.6994680661459697, "grad_norm": 3.048565149307251, "learning_rate": 0.0048873575651932925, "loss": 8.3919, "step": 171700 }, { "epoch": 0.6998754441693511, "grad_norm": 1.426169753074646, "learning_rate": 0.004887210138933391, "loss": 8.4645, "step": 171800 }, { "epoch": 0.7002828221927326, "grad_norm": 2.3991544246673584, "learning_rate": 0.004887062618491986, "loss": 8.4391, "step": 171900 }, { "epoch": 0.7006902002161141, "grad_norm": 2.98624587059021, "learning_rate": 0.004886915003874903, "loss": 8.4758, "step": 172000 }, { "epoch": 0.7006902002161141, "eval_MaskedAccuracy": 0.4691883130532846, "eval_loss": 1.790522813796997, "eval_runtime": 515.1359, "eval_samples_per_second": 123.222, "eval_steps_per_second": 0.481, "step": 172000 }, { "epoch": 0.7010975782394955, "grad_norm": 4.742863178253174, "learning_rate": 0.004886767295087984, "loss": 8.3807, "step": 172100 }, { "epoch": 0.701504956262877, "grad_norm": 3.5053226947784424, "learning_rate": 0.004886619492137062, "loss": 8.3139, "step": 172200 }, { "epoch": 0.7019123342862584, "grad_norm": 8.727147102355957, "learning_rate": 0.004886471595027978, "loss": 8.2916, "step": 172300 }, { "epoch": 0.7023197123096399, "grad_norm": 4.358945369720459, "learning_rate": 0.004886323603766593, "loss": 8.287, "step": 172400 }, { "epoch": 0.7027270903330214, "grad_norm": 4.907960891723633, "learning_rate": 0.00488617551835875, "loss": 8.2544, "step": 172500 }, { "epoch": 0.7031344683564028, "grad_norm": 2.0375194549560547, "learning_rate": 0.004886027338810299, "loss": 8.2582, "step": 172600 }, { "epoch": 0.7035418463797842, "grad_norm": 5.98710298538208, "learning_rate": 0.004885879065127108, "loss": 8.3614, "step": 172700 }, { "epoch": 0.7039492244031658, "grad_norm": 3.117462635040283, "learning_rate": 0.0048857306973150265, "loss": 8.2707, "step": 172800 }, { "epoch": 0.7043566024265472, "grad_norm": 2.485978603363037, "learning_rate": 0.004885582235379937, "loss": 8.3653, "step": 172900 }, { "epoch": 0.7047639804499286, "grad_norm": 4.734591960906982, "learning_rate": 0.004885433679327683, "loss": 8.407, "step": 173000 }, { "epoch": 0.7047639804499286, "eval_MaskedAccuracy": 0.47496040524555855, "eval_loss": 1.7643200159072876, "eval_runtime": 545.751, "eval_samples_per_second": 116.309, "eval_steps_per_second": 0.454, "step": 173000 }, { "epoch": 0.7051713584733101, "grad_norm": 2.7611308097839355, "learning_rate": 0.004885285029164163, "loss": 8.4275, "step": 173100 }, { "epoch": 0.7055787364966916, "grad_norm": 6.302596569061279, "learning_rate": 0.004885136284895244, "loss": 8.3923, "step": 173200 }, { "epoch": 0.7059861145200731, "grad_norm": 3.493154287338257, "learning_rate": 0.00488498744652681, "loss": 8.3187, "step": 173300 }, { "epoch": 0.7063934925434545, "grad_norm": 3.9514620304107666, "learning_rate": 0.004884838514064751, "loss": 8.2603, "step": 173400 }, { "epoch": 0.7068008705668359, "grad_norm": 7.091209411621094, "learning_rate": 0.004884689487514938, "loss": 8.3939, "step": 173500 }, { "epoch": 0.7072082485902175, "grad_norm": 8.952984809875488, "learning_rate": 0.004884540366883272, "loss": 8.3596, "step": 173600 }, { "epoch": 0.7076156266135989, "grad_norm": 4.8596649169921875, "learning_rate": 0.00488439115217565, "loss": 8.3088, "step": 173700 }, { "epoch": 0.7080230046369803, "grad_norm": 1.3575553894042969, "learning_rate": 0.004884241843397972, "loss": 8.2992, "step": 173800 }, { "epoch": 0.7084303826603618, "grad_norm": 4.548696517944336, "learning_rate": 0.0048840924405561354, "loss": 8.2768, "step": 173900 }, { "epoch": 0.7088377606837433, "grad_norm": 4.349992275238037, "learning_rate": 0.004883942943656046, "loss": 8.319, "step": 174000 }, { "epoch": 0.7088377606837433, "eval_MaskedAccuracy": 0.4732281269916888, "eval_loss": 1.7842973470687866, "eval_runtime": 651.7468, "eval_samples_per_second": 97.394, "eval_steps_per_second": 0.381, "step": 174000 }, { "epoch": 0.7092451387071247, "grad_norm": 6.522211074829102, "learning_rate": 0.004883793352703624, "loss": 8.3246, "step": 174100 }, { "epoch": 0.7096525167305062, "grad_norm": 6.698734283447266, "learning_rate": 0.00488364366770477, "loss": 8.2695, "step": 174200 }, { "epoch": 0.7100598947538876, "grad_norm": 3.7172343730926514, "learning_rate": 0.004883493888665418, "loss": 8.4204, "step": 174300 }, { "epoch": 0.7104672727772692, "grad_norm": 3.83941388130188, "learning_rate": 0.004883344015591483, "loss": 8.3345, "step": 174400 }, { "epoch": 0.7108746508006506, "grad_norm": 7.8376970291137695, "learning_rate": 0.004883194048488885, "loss": 8.3458, "step": 174500 }, { "epoch": 0.711282028824032, "grad_norm": 4.667509078979492, "learning_rate": 0.004883043987363555, "loss": 8.2559, "step": 174600 }, { "epoch": 0.7116894068474136, "grad_norm": 5.577969551086426, "learning_rate": 0.004882893832221431, "loss": 8.2355, "step": 174700 }, { "epoch": 0.712096784870795, "grad_norm": 7.191335201263428, "learning_rate": 0.004882743583068447, "loss": 8.3568, "step": 174800 }, { "epoch": 0.7125041628941764, "grad_norm": 2.6492528915405273, "learning_rate": 0.004882593239910542, "loss": 8.2721, "step": 174900 }, { "epoch": 0.7129115409175579, "grad_norm": 1.0866893529891968, "learning_rate": 0.004882442802753663, "loss": 8.3197, "step": 175000 }, { "epoch": 0.7129115409175579, "eval_MaskedAccuracy": 0.4768590181371025, "eval_loss": 1.7546759843826294, "eval_runtime": 575.3248, "eval_samples_per_second": 110.331, "eval_steps_per_second": 0.431, "step": 175000 }, { "epoch": 0.7133189189409394, "grad_norm": 7.7724456787109375, "learning_rate": 0.004882292271603746, "loss": 8.3555, "step": 175100 }, { "epoch": 0.7137262969643208, "grad_norm": 3.023310422897339, "learning_rate": 0.004882141646466756, "loss": 8.2732, "step": 175200 }, { "epoch": 0.7141336749877023, "grad_norm": 1.2857904434204102, "learning_rate": 0.004881990927348646, "loss": 8.2973, "step": 175300 }, { "epoch": 0.7145410530110837, "grad_norm": 9.354995727539062, "learning_rate": 0.004881840114255371, "loss": 8.437, "step": 175400 }, { "epoch": 0.7149484310344651, "grad_norm": 7.249983310699463, "learning_rate": 0.004881689207192886, "loss": 8.3225, "step": 175500 }, { "epoch": 0.7153558090578467, "grad_norm": 3.115715980529785, "learning_rate": 0.0048815382061671705, "loss": 8.2588, "step": 175600 }, { "epoch": 0.7157631870812281, "grad_norm": 6.985172271728516, "learning_rate": 0.00488138711118419, "loss": 8.2741, "step": 175700 }, { "epoch": 0.7161705651046096, "grad_norm": 3.081441879272461, "learning_rate": 0.004881235922249919, "loss": 8.326, "step": 175800 }, { "epoch": 0.7165779431279911, "grad_norm": 4.380251407623291, "learning_rate": 0.004881084639370334, "loss": 8.2507, "step": 175900 }, { "epoch": 0.7169853211513725, "grad_norm": 1.9868642091751099, "learning_rate": 0.004880933262551412, "loss": 8.3041, "step": 176000 }, { "epoch": 0.7169853211513725, "eval_MaskedAccuracy": 0.47505992785756423, "eval_loss": 1.7654045820236206, "eval_runtime": 650.8605, "eval_samples_per_second": 97.526, "eval_steps_per_second": 0.381, "step": 176000 }, { "epoch": 0.717392699174754, "grad_norm": 4.893762111663818, "learning_rate": 0.004880781791799138, "loss": 8.4209, "step": 176100 }, { "epoch": 0.7178000771981354, "grad_norm": 4.261211395263672, "learning_rate": 0.004880630227119507, "loss": 8.3148, "step": 176200 }, { "epoch": 0.7182074552215169, "grad_norm": 1.6230965852737427, "learning_rate": 0.004880478568518509, "loss": 8.4022, "step": 176300 }, { "epoch": 0.7186148332448984, "grad_norm": 1.696700930595398, "learning_rate": 0.004880326816002138, "loss": 8.3866, "step": 176400 }, { "epoch": 0.7190222112682798, "grad_norm": 5.27692985534668, "learning_rate": 0.0048801749695763915, "loss": 8.2963, "step": 176500 }, { "epoch": 0.7194295892916612, "grad_norm": 5.148913383483887, "learning_rate": 0.004880023029247276, "loss": 8.2342, "step": 176600 }, { "epoch": 0.7198369673150428, "grad_norm": 9.100824356079102, "learning_rate": 0.004879870995020799, "loss": 8.2097, "step": 176700 }, { "epoch": 0.7202443453384242, "grad_norm": 1.109858512878418, "learning_rate": 0.0048797188669029625, "loss": 8.3588, "step": 176800 }, { "epoch": 0.7206517233618056, "grad_norm": 6.3910322189331055, "learning_rate": 0.004879566644899795, "loss": 8.4023, "step": 176900 }, { "epoch": 0.7210591013851871, "grad_norm": 2.662142515182495, "learning_rate": 0.004879414329017307, "loss": 8.2879, "step": 177000 }, { "epoch": 0.7210591013851871, "eval_MaskedAccuracy": 0.4803536924996919, "eval_loss": 1.7457139492034912, "eval_runtime": 563.9514, "eval_samples_per_second": 112.556, "eval_steps_per_second": 0.44, "step": 177000 }, { "epoch": 0.7214664794085686, "grad_norm": 0.49664661288261414, "learning_rate": 0.004879261919261518, "loss": 8.2449, "step": 177100 }, { "epoch": 0.7218738574319501, "grad_norm": 2.5227560997009277, "learning_rate": 0.004879109415638463, "loss": 8.3815, "step": 177200 }, { "epoch": 0.7222812354553315, "grad_norm": 7.061991214752197, "learning_rate": 0.0048789568181541575, "loss": 8.3137, "step": 177300 }, { "epoch": 0.7226886134787129, "grad_norm": 6.182156562805176, "learning_rate": 0.004878804126814647, "loss": 8.2393, "step": 177400 }, { "epoch": 0.7230959915020945, "grad_norm": 6.707774639129639, "learning_rate": 0.004878651341625962, "loss": 8.2374, "step": 177500 }, { "epoch": 0.7235033695254759, "grad_norm": 8.791891098022461, "learning_rate": 0.0048784984625941455, "loss": 8.2205, "step": 177600 }, { "epoch": 0.7239107475488573, "grad_norm": 3.0393781661987305, "learning_rate": 0.004878345489725241, "loss": 8.1966, "step": 177700 }, { "epoch": 0.7243181255722388, "grad_norm": 7.30899715423584, "learning_rate": 0.004878192423025295, "loss": 8.356, "step": 177800 }, { "epoch": 0.7247255035956203, "grad_norm": 3.005924940109253, "learning_rate": 0.00487803926250036, "loss": 8.3474, "step": 177900 }, { "epoch": 0.7251328816190017, "grad_norm": 5.2232666015625, "learning_rate": 0.004877886008156489, "loss": 8.2996, "step": 178000 }, { "epoch": 0.7251328816190017, "eval_MaskedAccuracy": 0.48003651713678797, "eval_loss": 1.7420556545257568, "eval_runtime": 587.5573, "eval_samples_per_second": 108.034, "eval_steps_per_second": 0.422, "step": 178000 }, { "epoch": 0.7255402596423832, "grad_norm": 2.2677695751190186, "learning_rate": 0.004877732659999748, "loss": 8.357, "step": 178100 }, { "epoch": 0.7259476376657646, "grad_norm": 8.819666862487793, "learning_rate": 0.004877579218036192, "loss": 8.2959, "step": 178200 }, { "epoch": 0.7263550156891462, "grad_norm": 3.094377279281616, "learning_rate": 0.00487742568227189, "loss": 8.2471, "step": 178300 }, { "epoch": 0.7267623937125276, "grad_norm": 4.4123125076293945, "learning_rate": 0.004877272052712911, "loss": 8.2269, "step": 178400 }, { "epoch": 0.727169771735909, "grad_norm": 3.1336193084716797, "learning_rate": 0.00487711832936533, "loss": 8.3825, "step": 178500 }, { "epoch": 0.7275771497592906, "grad_norm": 2.860325336456299, "learning_rate": 0.004876964512235228, "loss": 8.4471, "step": 178600 }, { "epoch": 0.727984527782672, "grad_norm": 4.890446186065674, "learning_rate": 0.004876810601328681, "loss": 8.4431, "step": 178700 }, { "epoch": 0.7283919058060534, "grad_norm": 4.390522003173828, "learning_rate": 0.004876656596651773, "loss": 8.305, "step": 178800 }, { "epoch": 0.7287992838294349, "grad_norm": 6.204839706420898, "learning_rate": 0.004876502498210598, "loss": 8.2462, "step": 178900 }, { "epoch": 0.7292066618528164, "grad_norm": 4.126978874206543, "learning_rate": 0.004876348306011245, "loss": 8.2346, "step": 179000 }, { "epoch": 0.7292066618528164, "eval_MaskedAccuracy": 0.4809214081013844, "eval_loss": 1.7444720268249512, "eval_runtime": 597.017, "eval_samples_per_second": 106.322, "eval_steps_per_second": 0.415, "step": 179000 }, { "epoch": 0.7296140398761978, "grad_norm": 1.363865852355957, "learning_rate": 0.004876194020059811, "loss": 8.3255, "step": 179100 }, { "epoch": 0.7300214178995793, "grad_norm": 4.949935436248779, "learning_rate": 0.004876039640362395, "loss": 8.2939, "step": 179200 }, { "epoch": 0.7304287959229607, "grad_norm": 6.188343048095703, "learning_rate": 0.004875885166925098, "loss": 8.3008, "step": 179300 }, { "epoch": 0.7308361739463421, "grad_norm": 6.2942070960998535, "learning_rate": 0.004875730599754032, "loss": 8.3243, "step": 179400 }, { "epoch": 0.7312435519697237, "grad_norm": 2.720975160598755, "learning_rate": 0.00487557593885531, "loss": 8.2844, "step": 179500 }, { "epoch": 0.7316509299931051, "grad_norm": 3.3526389598846436, "learning_rate": 0.004875421184235038, "loss": 8.2559, "step": 179600 }, { "epoch": 0.7320583080164866, "grad_norm": 8.793871879577637, "learning_rate": 0.0048752663358993375, "loss": 8.2198, "step": 179700 }, { "epoch": 0.7324656860398681, "grad_norm": 6.481076717376709, "learning_rate": 0.004875111393854334, "loss": 8.186, "step": 179800 }, { "epoch": 0.7328730640632495, "grad_norm": 6.413854122161865, "learning_rate": 0.004874956358106152, "loss": 8.208, "step": 179900 }, { "epoch": 0.733280442086631, "grad_norm": 6.735567092895508, "learning_rate": 0.004874801228660921, "loss": 8.1682, "step": 180000 }, { "epoch": 0.733280442086631, "eval_MaskedAccuracy": 0.482909605258875, "eval_loss": 1.729762077331543, "eval_runtime": 590.1032, "eval_samples_per_second": 107.568, "eval_steps_per_second": 0.42, "step": 180000 }, { "epoch": 0.7336878201100124, "grad_norm": 2.869108200073242, "learning_rate": 0.004874646005524773, "loss": 8.3157, "step": 180100 }, { "epoch": 0.7340951981333939, "grad_norm": 5.248936176300049, "learning_rate": 0.0048744906887038374, "loss": 8.3361, "step": 180200 }, { "epoch": 0.7345025761567754, "grad_norm": 2.505849838256836, "learning_rate": 0.0048743352782042665, "loss": 8.4022, "step": 180300 }, { "epoch": 0.7349099541801568, "grad_norm": 6.214993000030518, "learning_rate": 0.0048741797740322005, "loss": 8.3097, "step": 180400 }, { "epoch": 0.7353173322035382, "grad_norm": 5.4167890548706055, "learning_rate": 0.004874024176193786, "loss": 8.3306, "step": 180500 }, { "epoch": 0.7357247102269198, "grad_norm": 1.3875280618667603, "learning_rate": 0.004873868484695169, "loss": 8.3948, "step": 180600 }, { "epoch": 0.7361320882503012, "grad_norm": 6.5398125648498535, "learning_rate": 0.004873712699542514, "loss": 8.3518, "step": 180700 }, { "epoch": 0.7365394662736827, "grad_norm": 6.465213298797607, "learning_rate": 0.004873556820741981, "loss": 8.2596, "step": 180800 }, { "epoch": 0.7369468442970641, "grad_norm": 7.302810192108154, "learning_rate": 0.004873400848299721, "loss": 8.2491, "step": 180900 }, { "epoch": 0.7373542223204456, "grad_norm": 7.1983466148376465, "learning_rate": 0.00487324478222191, "loss": 8.2249, "step": 181000 }, { "epoch": 0.7373542223204456, "eval_MaskedAccuracy": 0.4827771830212063, "eval_loss": 1.7254915237426758, "eval_runtime": 599.5115, "eval_samples_per_second": 105.88, "eval_steps_per_second": 0.414, "step": 181000 }, { "epoch": 0.7377616003438271, "grad_norm": 7.062850475311279, "learning_rate": 0.004873088622514718, "loss": 8.195, "step": 181100 }, { "epoch": 0.7381689783672085, "grad_norm": 7.937536716461182, "learning_rate": 0.004872932369184326, "loss": 8.1692, "step": 181200 }, { "epoch": 0.7385763563905899, "grad_norm": 5.309225559234619, "learning_rate": 0.004872776022236897, "loss": 8.1978, "step": 181300 }, { "epoch": 0.7389837344139715, "grad_norm": 3.140054225921631, "learning_rate": 0.004872619581678617, "loss": 8.235, "step": 181400 }, { "epoch": 0.7393911124373529, "grad_norm": 9.557893753051758, "learning_rate": 0.004872463047515668, "loss": 8.3554, "step": 181500 }, { "epoch": 0.7397984904607343, "grad_norm": 3.6274309158325195, "learning_rate": 0.004872306419754254, "loss": 8.2993, "step": 181600 }, { "epoch": 0.7402058684841158, "grad_norm": 4.35516357421875, "learning_rate": 0.0048721496984005545, "loss": 8.2472, "step": 181700 }, { "epoch": 0.7406132465074973, "grad_norm": 0.8326847553253174, "learning_rate": 0.004871992883460763, "loss": 8.3899, "step": 181800 }, { "epoch": 0.7410206245308787, "grad_norm": 8.322855949401855, "learning_rate": 0.0048718359749410824, "loss": 8.3415, "step": 181900 }, { "epoch": 0.7414280025542602, "grad_norm": 2.791518449783325, "learning_rate": 0.004871678972847715, "loss": 8.2363, "step": 182000 }, { "epoch": 0.7414280025542602, "eval_MaskedAccuracy": 0.481054271312736, "eval_loss": 1.738249659538269, "eval_runtime": 728.8516, "eval_samples_per_second": 87.09, "eval_steps_per_second": 0.34, "step": 182000 }, { "epoch": 0.7418353805776416, "grad_norm": 2.209365129470825, "learning_rate": 0.004871521877186866, "loss": 8.2858, "step": 182100 }, { "epoch": 0.7422427586010232, "grad_norm": 3.326539993286133, "learning_rate": 0.004871364687964754, "loss": 8.317, "step": 182200 }, { "epoch": 0.7426501366244046, "grad_norm": 6.098221302032471, "learning_rate": 0.004871207405187595, "loss": 8.2708, "step": 182300 }, { "epoch": 0.743057514647786, "grad_norm": 3.575340509414673, "learning_rate": 0.004871050028861597, "loss": 8.2523, "step": 182400 }, { "epoch": 0.7434648926711676, "grad_norm": 4.331416606903076, "learning_rate": 0.004870892558992985, "loss": 8.4662, "step": 182500 }, { "epoch": 0.743872270694549, "grad_norm": 1.635528564453125, "learning_rate": 0.004870734995587994, "loss": 8.3721, "step": 182600 }, { "epoch": 0.7442796487179304, "grad_norm": 5.0897393226623535, "learning_rate": 0.004870577338652834, "loss": 8.4446, "step": 182700 }, { "epoch": 0.7446870267413119, "grad_norm": 0.853996217250824, "learning_rate": 0.004870419588193759, "loss": 8.4412, "step": 182800 }, { "epoch": 0.7450944047646934, "grad_norm": 2.650754451751709, "learning_rate": 0.004870261744216997, "loss": 8.4779, "step": 182900 }, { "epoch": 0.7455017827880748, "grad_norm": 3.4979281425476074, "learning_rate": 0.004870103806728794, "loss": 8.4302, "step": 183000 }, { "epoch": 0.7455017827880748, "eval_MaskedAccuracy": 0.47285557428123637, "eval_loss": 1.7717115879058838, "eval_runtime": 338.4866, "eval_samples_per_second": 187.529, "eval_steps_per_second": 0.733, "step": 183000 }, { "epoch": 0.7459091608114563, "grad_norm": 3.0517659187316895, "learning_rate": 0.004869945775735384, "loss": 8.4316, "step": 183100 }, { "epoch": 0.7463165388348377, "grad_norm": 5.8500518798828125, "learning_rate": 0.004869787651243016, "loss": 8.3039, "step": 183200 }, { "epoch": 0.7467239168582193, "grad_norm": 8.321171760559082, "learning_rate": 0.004869629433257944, "loss": 8.2616, "step": 183300 }, { "epoch": 0.7471312948816007, "grad_norm": 3.770230293273926, "learning_rate": 0.004869471121786423, "loss": 8.223, "step": 183400 }, { "epoch": 0.7475386729049821, "grad_norm": 5.35031795501709, "learning_rate": 0.00486931271683471, "loss": 8.2396, "step": 183500 }, { "epoch": 0.7479460509283636, "grad_norm": 1.7099127769470215, "learning_rate": 0.004869154218409075, "loss": 8.2524, "step": 183600 }, { "epoch": 0.7483534289517451, "grad_norm": 6.3081560134887695, "learning_rate": 0.004868995626515777, "loss": 8.3537, "step": 183700 }, { "epoch": 0.7487608069751265, "grad_norm": 4.459376811981201, "learning_rate": 0.004868836941161092, "loss": 8.2728, "step": 183800 }, { "epoch": 0.749168184998508, "grad_norm": 0.8340696096420288, "learning_rate": 0.0048686781623512795, "loss": 8.2427, "step": 183900 }, { "epoch": 0.7495755630218894, "grad_norm": 5.280702590942383, "learning_rate": 0.004868519290092641, "loss": 8.3643, "step": 184000 }, { "epoch": 0.7495755630218894, "eval_MaskedAccuracy": 0.4768380283348209, "eval_loss": 1.7494298219680786, "eval_runtime": 579.1395, "eval_samples_per_second": 109.604, "eval_steps_per_second": 0.428, "step": 184000 }, { "epoch": 0.7499829410452709, "grad_norm": 4.741583824157715, "learning_rate": 0.004868360324391437, "loss": 8.2572, "step": 184100 }, { "epoch": 0.7503903190686524, "grad_norm": 7.211680889129639, "learning_rate": 0.004868201265253965, "loss": 8.2716, "step": 184200 }, { "epoch": 0.7507976970920338, "grad_norm": 6.9085588455200195, "learning_rate": 0.004868042112686504, "loss": 8.2017, "step": 184300 }, { "epoch": 0.7512050751154152, "grad_norm": 6.018723964691162, "learning_rate": 0.004867882866695344, "loss": 8.3096, "step": 184400 }, { "epoch": 0.7516124531387968, "grad_norm": 3.685755491256714, "learning_rate": 0.004867723527286793, "loss": 8.3765, "step": 184500 }, { "epoch": 0.7520198311621782, "grad_norm": 8.629942893981934, "learning_rate": 0.004867564094467144, "loss": 8.3778, "step": 184600 }, { "epoch": 0.7524272091855597, "grad_norm": 6.810964584350586, "learning_rate": 0.0048674045682427, "loss": 8.348, "step": 184700 }, { "epoch": 0.7528345872089411, "grad_norm": 3.482246160507202, "learning_rate": 0.004867244948619771, "loss": 8.2458, "step": 184800 }, { "epoch": 0.7532419652323226, "grad_norm": 6.576715469360352, "learning_rate": 0.0048670852356046635, "loss": 8.2585, "step": 184900 }, { "epoch": 0.7536493432557041, "grad_norm": 5.824215888977051, "learning_rate": 0.004866925429203695, "loss": 8.2768, "step": 185000 }, { "epoch": 0.7536493432557041, "eval_MaskedAccuracy": 0.4760389425166015, "eval_loss": 1.7571865320205688, "eval_runtime": 711.5395, "eval_samples_per_second": 89.209, "eval_steps_per_second": 0.349, "step": 185000 }, { "epoch": 0.7540567212790855, "grad_norm": 6.5599164962768555, "learning_rate": 0.004866765529423172, "loss": 8.434, "step": 185100 }, { "epoch": 0.7544640993024669, "grad_norm": 4.8812479972839355, "learning_rate": 0.004866605536269431, "loss": 8.3051, "step": 185200 }, { "epoch": 0.7548714773258485, "grad_norm": 2.002317428588867, "learning_rate": 0.00486644544974879, "loss": 8.3022, "step": 185300 }, { "epoch": 0.7552788553492299, "grad_norm": 12.033347129821777, "learning_rate": 0.004866285269867578, "loss": 8.3938, "step": 185400 }, { "epoch": 0.7556862333726113, "grad_norm": 5.843840599060059, "learning_rate": 0.004866124996632135, "loss": 8.3901, "step": 185500 }, { "epoch": 0.7560936113959928, "grad_norm": 11.444961547851562, "learning_rate": 0.004865964630048786, "loss": 8.3889, "step": 185600 }, { "epoch": 0.7565009894193743, "grad_norm": 2.2533037662506104, "learning_rate": 0.004865804170123884, "loss": 8.3197, "step": 185700 }, { "epoch": 0.7569083674427558, "grad_norm": 5.55755615234375, "learning_rate": 0.0048656436168637756, "loss": 8.2175, "step": 185800 }, { "epoch": 0.7573157454661372, "grad_norm": 7.763909339904785, "learning_rate": 0.0048654829702747876, "loss": 8.2303, "step": 185900 }, { "epoch": 0.7577231234895186, "grad_norm": 6.464959144592285, "learning_rate": 0.0048653222303632815, "loss": 8.2303, "step": 186000 }, { "epoch": 0.7577231234895186, "eval_MaskedAccuracy": 0.47948798329779957, "eval_loss": 1.7422912120819092, "eval_runtime": 562.167, "eval_samples_per_second": 112.913, "eval_steps_per_second": 0.441, "step": 186000 }, { "epoch": 0.7581305015129002, "grad_norm": 4.902656078338623, "learning_rate": 0.004865161397135624, "loss": 8.3223, "step": 186100 }, { "epoch": 0.7585378795362816, "grad_norm": 8.939846992492676, "learning_rate": 0.004865000470598161, "loss": 8.2164, "step": 186200 }, { "epoch": 0.758945257559663, "grad_norm": 7.8905792236328125, "learning_rate": 0.004864839450757255, "loss": 8.2015, "step": 186300 }, { "epoch": 0.7593526355830446, "grad_norm": 3.3256075382232666, "learning_rate": 0.004864678337619275, "loss": 8.3425, "step": 186400 }, { "epoch": 0.759760013606426, "grad_norm": 1.2854762077331543, "learning_rate": 0.004864517131190597, "loss": 8.3916, "step": 186500 }, { "epoch": 0.7601673916298074, "grad_norm": 7.77571439743042, "learning_rate": 0.004864355831477576, "loss": 8.3372, "step": 186600 }, { "epoch": 0.7605747696531889, "grad_norm": 2.4621174335479736, "learning_rate": 0.004864194438486606, "loss": 8.2787, "step": 186700 }, { "epoch": 0.7609821476765704, "grad_norm": 9.451438903808594, "learning_rate": 0.004864032952224062, "loss": 8.2355, "step": 186800 }, { "epoch": 0.7613895256999518, "grad_norm": 5.953160285949707, "learning_rate": 0.004863871372696324, "loss": 8.2821, "step": 186900 }, { "epoch": 0.7617969037233333, "grad_norm": 2.5793473720550537, "learning_rate": 0.004863709699909789, "loss": 8.2502, "step": 187000 }, { "epoch": 0.7617969037233333, "eval_MaskedAccuracy": 0.48226549821371534, "eval_loss": 1.7357350587844849, "eval_runtime": 549.9313, "eval_samples_per_second": 115.425, "eval_steps_per_second": 0.451, "step": 187000 }, { "epoch": 0.7622042817467147, "grad_norm": 8.321205139160156, "learning_rate": 0.0048635479338708446, "loss": 8.2542, "step": 187100 }, { "epoch": 0.7626116597700963, "grad_norm": 10.006879806518555, "learning_rate": 0.004863386074585885, "loss": 8.4086, "step": 187200 }, { "epoch": 0.7630190377934777, "grad_norm": 4.29319953918457, "learning_rate": 0.0048632241220613135, "loss": 8.4075, "step": 187300 }, { "epoch": 0.7634264158168591, "grad_norm": 3.3111531734466553, "learning_rate": 0.004863062076303527, "loss": 8.3403, "step": 187400 }, { "epoch": 0.7638337938402406, "grad_norm": 4.626220703125, "learning_rate": 0.004862899937318938, "loss": 8.3157, "step": 187500 }, { "epoch": 0.7642411718636221, "grad_norm": 10.10457992553711, "learning_rate": 0.004862737705113952, "loss": 8.3852, "step": 187600 }, { "epoch": 0.7646485498870035, "grad_norm": 6.324151992797852, "learning_rate": 0.004862575379694986, "loss": 8.2689, "step": 187700 }, { "epoch": 0.765055927910385, "grad_norm": 2.697753667831421, "learning_rate": 0.0048624129610684505, "loss": 8.2501, "step": 187800 }, { "epoch": 0.7654633059337664, "grad_norm": 3.552042245864868, "learning_rate": 0.00486225044924077, "loss": 8.3004, "step": 187900 }, { "epoch": 0.7658706839571479, "grad_norm": 6.898065567016602, "learning_rate": 0.004862087844218373, "loss": 8.3036, "step": 188000 }, { "epoch": 0.7658706839571479, "eval_MaskedAccuracy": 0.48030488243074565, "eval_loss": 1.7357308864593506, "eval_runtime": 686.6961, "eval_samples_per_second": 92.437, "eval_steps_per_second": 0.361, "step": 188000 }, { "epoch": 0.7662780619805294, "grad_norm": 2.1473889350891113, "learning_rate": 0.004861925146007684, "loss": 8.2513, "step": 188100 }, { "epoch": 0.7666854400039108, "grad_norm": 3.4288330078125, "learning_rate": 0.004861762354615138, "loss": 8.403, "step": 188200 }, { "epoch": 0.7670928180272923, "grad_norm": 2.9138879776000977, "learning_rate": 0.004861599470047177, "loss": 8.3305, "step": 188300 }, { "epoch": 0.7675001960506738, "grad_norm": 6.231320381164551, "learning_rate": 0.0048614364923102345, "loss": 8.318, "step": 188400 }, { "epoch": 0.7679075740740552, "grad_norm": 6.695070266723633, "learning_rate": 0.004861273421410755, "loss": 8.2621, "step": 188500 }, { "epoch": 0.7683149520974367, "grad_norm": 0.6956781148910522, "learning_rate": 0.004861110257355179, "loss": 8.2153, "step": 188600 }, { "epoch": 0.7687223301208181, "grad_norm": 1.3013149499893188, "learning_rate": 0.004860947000149968, "loss": 8.343, "step": 188700 }, { "epoch": 0.7691297081441996, "grad_norm": 1.5729635953903198, "learning_rate": 0.004860783649801571, "loss": 8.3601, "step": 188800 }, { "epoch": 0.7695370861675811, "grad_norm": 5.179935932159424, "learning_rate": 0.004860620206316447, "loss": 8.2648, "step": 188900 }, { "epoch": 0.7699444641909625, "grad_norm": 4.961496829986572, "learning_rate": 0.004860456669701059, "loss": 8.2418, "step": 189000 }, { "epoch": 0.7699444641909625, "eval_MaskedAccuracy": 0.4811944379313617, "eval_loss": 1.7441548109054565, "eval_runtime": 494.085, "eval_samples_per_second": 128.472, "eval_steps_per_second": 0.502, "step": 189000 }, { "epoch": 0.7703518422143439, "grad_norm": 6.041499137878418, "learning_rate": 0.00486029303996187, "loss": 8.2266, "step": 189100 }, { "epoch": 0.7707592202377255, "grad_norm": 7.916848182678223, "learning_rate": 0.004860129317105353, "loss": 8.3198, "step": 189200 }, { "epoch": 0.7711665982611069, "grad_norm": 3.322267770767212, "learning_rate": 0.00485996550113798, "loss": 8.2506, "step": 189300 }, { "epoch": 0.7715739762844883, "grad_norm": 4.494008541107178, "learning_rate": 0.004859801592066219, "loss": 8.2102, "step": 189400 }, { "epoch": 0.7719813543078698, "grad_norm": 8.828492164611816, "learning_rate": 0.004859637589896558, "loss": 8.2277, "step": 189500 }, { "epoch": 0.7723887323312513, "grad_norm": 6.599806308746338, "learning_rate": 0.00485947349463548, "loss": 8.353, "step": 189600 }, { "epoch": 0.7727961103546328, "grad_norm": 0.9402397274971008, "learning_rate": 0.004859309306289469, "loss": 8.3772, "step": 189700 }, { "epoch": 0.7732034883780142, "grad_norm": 21.609149932861328, "learning_rate": 0.0048591450248650185, "loss": 8.373, "step": 189800 }, { "epoch": 0.7736108664013956, "grad_norm": 5.11705207824707, "learning_rate": 0.004858980650368628, "loss": 8.3223, "step": 189900 }, { "epoch": 0.7740182444247772, "grad_norm": 3.844451427459717, "learning_rate": 0.004858816182806792, "loss": 8.2618, "step": 190000 }, { "epoch": 0.7740182444247772, "eval_MaskedAccuracy": 0.4816098762850331, "eval_loss": 1.7375717163085938, "eval_runtime": 406.9675, "eval_samples_per_second": 155.973, "eval_steps_per_second": 0.609, "step": 190000 }, { "epoch": 0.7744256224481586, "grad_norm": 4.231984615325928, "learning_rate": 0.004858651622186017, "loss": 8.2077, "step": 190100 }, { "epoch": 0.77483300047154, "grad_norm": 4.590526103973389, "learning_rate": 0.0048584869685128025, "loss": 8.1978, "step": 190200 }, { "epoch": 0.7752403784949216, "grad_norm": 4.9509429931640625, "learning_rate": 0.004858322221793658, "loss": 8.1997, "step": 190300 }, { "epoch": 0.775647756518303, "grad_norm": 5.255951404571533, "learning_rate": 0.004858157382035101, "loss": 8.1573, "step": 190400 }, { "epoch": 0.7760551345416844, "grad_norm": 6.821035385131836, "learning_rate": 0.004857992449243652, "loss": 8.1405, "step": 190500 }, { "epoch": 0.7764625125650659, "grad_norm": 4.925536155700684, "learning_rate": 0.004857827423425829, "loss": 8.1715, "step": 190600 }, { "epoch": 0.7768698905884474, "grad_norm": 7.465526580810547, "learning_rate": 0.004857662304588147, "loss": 8.1682, "step": 190700 }, { "epoch": 0.7772772686118289, "grad_norm": 5.358131408691406, "learning_rate": 0.004857497092737141, "loss": 8.3492, "step": 190800 }, { "epoch": 0.7776846466352103, "grad_norm": 7.660170078277588, "learning_rate": 0.0048573317878793446, "loss": 8.2337, "step": 190900 }, { "epoch": 0.7780920246585917, "grad_norm": 1.671234369277954, "learning_rate": 0.004857166390021285, "loss": 8.3512, "step": 191000 }, { "epoch": 0.7780920246585917, "eval_MaskedAccuracy": 0.47423356197852706, "eval_loss": 1.7600901126861572, "eval_runtime": 713.4928, "eval_samples_per_second": 88.965, "eval_steps_per_second": 0.348, "step": 191000 }, { "epoch": 0.7784994026819733, "grad_norm": 6.394827842712402, "learning_rate": 0.004857000899169509, "loss": 8.4037, "step": 191100 }, { "epoch": 0.7789067807053547, "grad_norm": 25.751123428344727, "learning_rate": 0.004856835315330559, "loss": 8.4075, "step": 191200 }, { "epoch": 0.7793141587287361, "grad_norm": 2.641561269760132, "learning_rate": 0.004856669638510987, "loss": 8.335, "step": 191300 }, { "epoch": 0.7797215367521176, "grad_norm": 3.0915346145629883, "learning_rate": 0.004856503868717332, "loss": 8.2223, "step": 191400 }, { "epoch": 0.7801289147754991, "grad_norm": 8.381845474243164, "learning_rate": 0.00485633800595615, "loss": 8.2886, "step": 191500 }, { "epoch": 0.7805362927988805, "grad_norm": 3.6453874111175537, "learning_rate": 0.004856172050233998, "loss": 8.3798, "step": 191600 }, { "epoch": 0.780943670822262, "grad_norm": 7.310152530670166, "learning_rate": 0.004856006001557444, "loss": 8.3099, "step": 191700 }, { "epoch": 0.7813510488456434, "grad_norm": 2.473210096359253, "learning_rate": 0.004855839859933053, "loss": 8.2403, "step": 191800 }, { "epoch": 0.7817584268690249, "grad_norm": 3.5466408729553223, "learning_rate": 0.004855673625367388, "loss": 8.1866, "step": 191900 }, { "epoch": 0.7821658048924064, "grad_norm": 4.585660457611084, "learning_rate": 0.0048555072978670195, "loss": 8.3252, "step": 192000 }, { "epoch": 0.7821658048924064, "eval_MaskedAccuracy": 0.4768169207854122, "eval_loss": 1.760154366493225, "eval_runtime": 611.2653, "eval_samples_per_second": 103.844, "eval_steps_per_second": 0.406, "step": 192000 }, { "epoch": 0.7825731829157878, "grad_norm": 4.345408916473389, "learning_rate": 0.004855340877438528, "loss": 8.3354, "step": 192100 }, { "epoch": 0.7829805609391693, "grad_norm": 6.655874729156494, "learning_rate": 0.004855174364088492, "loss": 8.2959, "step": 192200 }, { "epoch": 0.7833879389625508, "grad_norm": 3.4941017627716064, "learning_rate": 0.004855007757823497, "loss": 8.1996, "step": 192300 }, { "epoch": 0.7837953169859322, "grad_norm": 4.970099925994873, "learning_rate": 0.004854841058650124, "loss": 8.3653, "step": 192400 }, { "epoch": 0.7842026950093137, "grad_norm": 0.5349174737930298, "learning_rate": 0.00485467426657497, "loss": 8.3485, "step": 192500 }, { "epoch": 0.7846100730326951, "grad_norm": 0.6330676078796387, "learning_rate": 0.004854507381604621, "loss": 8.3705, "step": 192600 }, { "epoch": 0.7850174510560766, "grad_norm": 3.8508827686309814, "learning_rate": 0.004854340403745676, "loss": 8.3403, "step": 192700 }, { "epoch": 0.7854248290794581, "grad_norm": 6.208312511444092, "learning_rate": 0.004854173333004748, "loss": 8.3188, "step": 192800 }, { "epoch": 0.7858322071028395, "grad_norm": 6.375416278839111, "learning_rate": 0.004854006169388426, "loss": 8.2416, "step": 192900 }, { "epoch": 0.7862395851262209, "grad_norm": 5.320347309112549, "learning_rate": 0.004853838912903331, "loss": 8.2092, "step": 193000 }, { "epoch": 0.7862395851262209, "eval_MaskedAccuracy": 0.4831812609564819, "eval_loss": 1.7338577508926392, "eval_runtime": 646.1593, "eval_samples_per_second": 98.236, "eval_steps_per_second": 0.384, "step": 193000 }, { "epoch": 0.7866469631496025, "grad_norm": 6.850183486938477, "learning_rate": 0.0048536715635560754, "loss": 8.2364, "step": 193100 }, { "epoch": 0.7870543411729839, "grad_norm": 8.502579689025879, "learning_rate": 0.0048535041213532614, "loss": 8.2027, "step": 193200 }, { "epoch": 0.7874617191963654, "grad_norm": 4.212939739227295, "learning_rate": 0.0048533365863015245, "loss": 8.3491, "step": 193300 }, { "epoch": 0.7878690972197469, "grad_norm": 5.045016765594482, "learning_rate": 0.004853168958407482, "loss": 8.3094, "step": 193400 }, { "epoch": 0.7882764752431283, "grad_norm": 2.5898854732513428, "learning_rate": 0.0048530012376777594, "loss": 8.3289, "step": 193500 }, { "epoch": 0.7886838532665098, "grad_norm": 5.046809673309326, "learning_rate": 0.0048528334241189925, "loss": 8.2256, "step": 193600 }, { "epoch": 0.7890912312898912, "grad_norm": 8.55445384979248, "learning_rate": 0.0048526655177378135, "loss": 8.2187, "step": 193700 }, { "epoch": 0.7894986093132726, "grad_norm": 2.4749691486358643, "learning_rate": 0.004852497518540859, "loss": 8.1947, "step": 193800 }, { "epoch": 0.7899059873366542, "grad_norm": 3.637096643447876, "learning_rate": 0.004852329426534777, "loss": 8.2319, "step": 193900 }, { "epoch": 0.7903133653600356, "grad_norm": 4.979690074920654, "learning_rate": 0.004852161241726202, "loss": 8.1991, "step": 194000 }, { "epoch": 0.7903133653600356, "eval_MaskedAccuracy": 0.4823404586882282, "eval_loss": 1.726420283317566, "eval_runtime": 590.1867, "eval_samples_per_second": 107.552, "eval_steps_per_second": 0.42, "step": 194000 }, { "epoch": 0.790720743383417, "grad_norm": 4.382778167724609, "learning_rate": 0.004851992964121791, "loss": 8.2844, "step": 194100 }, { "epoch": 0.7911281214067986, "grad_norm": 10.199458122253418, "learning_rate": 0.004851824593728198, "loss": 8.3121, "step": 194200 }, { "epoch": 0.79153549943018, "grad_norm": 2.9020090103149414, "learning_rate": 0.004851656130552078, "loss": 8.3118, "step": 194300 }, { "epoch": 0.7919428774535614, "grad_norm": 0.5850937366485596, "learning_rate": 0.0048514875746000895, "loss": 8.2712, "step": 194400 }, { "epoch": 0.7923502554769429, "grad_norm": 2.5323173999786377, "learning_rate": 0.004851318925878893, "loss": 8.3253, "step": 194500 }, { "epoch": 0.7927576335003244, "grad_norm": 1.724382996559143, "learning_rate": 0.004851150184395162, "loss": 8.3354, "step": 194600 }, { "epoch": 0.7931650115237059, "grad_norm": 6.714611530303955, "learning_rate": 0.004850981350155566, "loss": 8.3365, "step": 194700 }, { "epoch": 0.7935723895470873, "grad_norm": 2.9206438064575195, "learning_rate": 0.0048508124231667745, "loss": 8.2588, "step": 194800 }, { "epoch": 0.7939797675704687, "grad_norm": 6.360642433166504, "learning_rate": 0.004850643403435478, "loss": 8.243, "step": 194900 }, { "epoch": 0.7943871455938503, "grad_norm": 1.3880046606063843, "learning_rate": 0.004850474290968347, "loss": 8.304, "step": 195000 }, { "epoch": 0.7943871455938503, "eval_MaskedAccuracy": 0.4745305035933312, "eval_loss": 1.7650314569473267, "eval_runtime": 612.4558, "eval_samples_per_second": 103.642, "eval_steps_per_second": 0.405, "step": 195000 }, { "epoch": 0.7947945236172317, "grad_norm": 7.486367225646973, "learning_rate": 0.004850305085772075, "loss": 8.3045, "step": 195100 }, { "epoch": 0.7952019016406131, "grad_norm": 6.169907569885254, "learning_rate": 0.004850135787853349, "loss": 8.1965, "step": 195200 }, { "epoch": 0.7956092796639946, "grad_norm": 0.9016996026039124, "learning_rate": 0.004849966397218864, "loss": 8.3548, "step": 195300 }, { "epoch": 0.7960166576873761, "grad_norm": 1.0470725297927856, "learning_rate": 0.004849796913875317, "loss": 8.353, "step": 195400 }, { "epoch": 0.7964240357107575, "grad_norm": 6.29168176651001, "learning_rate": 0.004849627337829407, "loss": 8.289, "step": 195500 }, { "epoch": 0.796831413734139, "grad_norm": 8.154321670532227, "learning_rate": 0.004849457669087843, "loss": 8.2741, "step": 195600 }, { "epoch": 0.7972387917575204, "grad_norm": 6.071690559387207, "learning_rate": 0.004849287907657329, "loss": 8.2308, "step": 195700 }, { "epoch": 0.7976461697809019, "grad_norm": 6.007944107055664, "learning_rate": 0.004849118053544568, "loss": 8.2006, "step": 195800 }, { "epoch": 0.7980535478042834, "grad_norm": 3.953540563583374, "learning_rate": 0.004848948106756286, "loss": 8.2979, "step": 195900 }, { "epoch": 0.7984609258276648, "grad_norm": 7.509738922119141, "learning_rate": 0.004848778067299203, "loss": 8.2262, "step": 196000 }, { "epoch": 0.7984609258276648, "eval_MaskedAccuracy": 0.479848862356713, "eval_loss": 1.7400267124176025, "eval_runtime": 513.6673, "eval_samples_per_second": 123.574, "eval_steps_per_second": 0.483, "step": 196000 }, { "epoch": 0.7988683038510463, "grad_norm": 6.1329545974731445, "learning_rate": 0.004848607935180032, "loss": 8.2389, "step": 196100 }, { "epoch": 0.7992756818744278, "grad_norm": 11.7472505569458, "learning_rate": 0.004848437710405505, "loss": 8.2541, "step": 196200 }, { "epoch": 0.7996830598978092, "grad_norm": 0.4944990575313568, "learning_rate": 0.00484826739298235, "loss": 8.3522, "step": 196300 }, { "epoch": 0.8000904379211907, "grad_norm": 5.864392280578613, "learning_rate": 0.004848096982917298, "loss": 8.2947, "step": 196400 }, { "epoch": 0.8004978159445721, "grad_norm": 5.442176342010498, "learning_rate": 0.004847926480217097, "loss": 8.277, "step": 196500 }, { "epoch": 0.8009051939679536, "grad_norm": 2.9440054893493652, "learning_rate": 0.004847755884888478, "loss": 8.4247, "step": 196600 }, { "epoch": 0.8013125719913351, "grad_norm": 17.107017517089844, "learning_rate": 0.004847585196938187, "loss": 8.3849, "step": 196700 }, { "epoch": 0.8017199500147165, "grad_norm": 0.6312718391418457, "learning_rate": 0.004847414416372972, "loss": 8.3402, "step": 196800 }, { "epoch": 0.8021273280380979, "grad_norm": 4.328444480895996, "learning_rate": 0.004847243543199599, "loss": 8.3553, "step": 196900 }, { "epoch": 0.8025347060614795, "grad_norm": 4.721122741699219, "learning_rate": 0.004847072577424805, "loss": 8.2438, "step": 197000 }, { "epoch": 0.8025347060614795, "eval_MaskedAccuracy": 0.4820260232919574, "eval_loss": 1.7297272682189941, "eval_runtime": 687.611, "eval_samples_per_second": 92.314, "eval_steps_per_second": 0.361, "step": 197000 }, { "epoch": 0.8029420840848609, "grad_norm": 6.645321369171143, "learning_rate": 0.004846901519055362, "loss": 8.2717, "step": 197100 }, { "epoch": 0.8033494621082424, "grad_norm": 6.457027912139893, "learning_rate": 0.00484673036809802, "loss": 8.2698, "step": 197200 }, { "epoch": 0.8037568401316239, "grad_norm": 2.528052806854248, "learning_rate": 0.004846559124559557, "loss": 8.213, "step": 197300 }, { "epoch": 0.8041642181550053, "grad_norm": 2.305532217025757, "learning_rate": 0.004846387788446739, "loss": 8.2166, "step": 197400 }, { "epoch": 0.8045715961783868, "grad_norm": 10.579450607299805, "learning_rate": 0.004846216359766331, "loss": 8.3314, "step": 197500 }, { "epoch": 0.8049789742017682, "grad_norm": 1.0553219318389893, "learning_rate": 0.004846044838525123, "loss": 8.3549, "step": 197600 }, { "epoch": 0.8053863522251496, "grad_norm": 6.5963335037231445, "learning_rate": 0.004845873224729893, "loss": 8.3154, "step": 197700 }, { "epoch": 0.8057937302485312, "grad_norm": 3.267850160598755, "learning_rate": 0.004845701518387424, "loss": 8.3146, "step": 197800 }, { "epoch": 0.8062011082719126, "grad_norm": 3.7062668800354004, "learning_rate": 0.004845529719504509, "loss": 8.3205, "step": 197900 }, { "epoch": 0.806608486295294, "grad_norm": 7.010178565979004, "learning_rate": 0.004845357828087938, "loss": 8.3182, "step": 198000 }, { "epoch": 0.806608486295294, "eval_MaskedAccuracy": 0.47622369932805414, "eval_loss": 1.7665005922317505, "eval_runtime": 476.6637, "eval_samples_per_second": 133.167, "eval_steps_per_second": 0.52, "step": 198000 }, { "epoch": 0.8070158643186756, "grad_norm": 5.485898017883301, "learning_rate": 0.004845185844144501, "loss": 8.2616, "step": 198100 }, { "epoch": 0.807423242342057, "grad_norm": 4.966923713684082, "learning_rate": 0.0048450137676810025, "loss": 8.3116, "step": 198200 }, { "epoch": 0.8078306203654384, "grad_norm": 6.153297424316406, "learning_rate": 0.00484484159870424, "loss": 8.2513, "step": 198300 }, { "epoch": 0.8082379983888199, "grad_norm": 2.917039155960083, "learning_rate": 0.004844669337221025, "loss": 8.2135, "step": 198400 }, { "epoch": 0.8086453764122014, "grad_norm": 3.5864193439483643, "learning_rate": 0.004844496983238169, "loss": 8.2675, "step": 198500 }, { "epoch": 0.8090527544355829, "grad_norm": 5.927716255187988, "learning_rate": 0.004844324536762485, "loss": 8.2741, "step": 198600 }, { "epoch": 0.8094601324589643, "grad_norm": 5.406866073608398, "learning_rate": 0.0048441519978007904, "loss": 8.1962, "step": 198700 }, { "epoch": 0.8098675104823457, "grad_norm": 4.743386745452881, "learning_rate": 0.004843979366359911, "loss": 8.1696, "step": 198800 }, { "epoch": 0.8102748885057273, "grad_norm": 3.488309621810913, "learning_rate": 0.004843806642446664, "loss": 8.1757, "step": 198900 }, { "epoch": 0.8106822665291087, "grad_norm": 3.7075552940368652, "learning_rate": 0.004843633826067882, "loss": 8.1915, "step": 199000 }, { "epoch": 0.8106822665291087, "eval_MaskedAccuracy": 0.479225615367175, "eval_loss": 1.742332935333252, "eval_runtime": 730.2844, "eval_samples_per_second": 86.92, "eval_steps_per_second": 0.34, "step": 199000 }, { "epoch": 0.8110896445524901, "grad_norm": 1.1896377801895142, "learning_rate": 0.004843460917230393, "loss": 8.2984, "step": 199100 }, { "epoch": 0.8114970225758716, "grad_norm": 0.9556285738945007, "learning_rate": 0.004843287915941042, "loss": 8.3639, "step": 199200 }, { "epoch": 0.8119044005992531, "grad_norm": 7.886438846588135, "learning_rate": 0.0048431148222066595, "loss": 8.3558, "step": 199300 }, { "epoch": 0.8123117786226345, "grad_norm": 6.792365074157715, "learning_rate": 0.004842941636034093, "loss": 8.3307, "step": 199400 }, { "epoch": 0.812719156646016, "grad_norm": 3.0975301265716553, "learning_rate": 0.0048427683574301895, "loss": 8.2943, "step": 199500 }, { "epoch": 0.8131265346693974, "grad_norm": 4.0706281661987305, "learning_rate": 0.0048425949864017975, "loss": 8.2511, "step": 199600 }, { "epoch": 0.813533912692779, "grad_norm": 1.3213456869125366, "learning_rate": 0.0048424215229557754, "loss": 8.2188, "step": 199700 }, { "epoch": 0.8139412907161604, "grad_norm": 3.0209662914276123, "learning_rate": 0.004842247967098978, "loss": 8.2851, "step": 199800 }, { "epoch": 0.8143486687395418, "grad_norm": 4.856011390686035, "learning_rate": 0.00484207431883827, "loss": 8.203, "step": 199900 }, { "epoch": 0.8147560467629233, "grad_norm": 4.1549296379089355, "learning_rate": 0.004841900578180518, "loss": 8.316, "step": 200000 }, { "epoch": 0.8147560467629233, "eval_MaskedAccuracy": 0.47899325286543376, "eval_loss": 1.7504117488861084, "eval_runtime": 626.9726, "eval_samples_per_second": 101.242, "eval_steps_per_second": 0.396, "step": 200000 }, { "epoch": 0.8151634247863048, "grad_norm": 6.633772373199463, "learning_rate": 0.0048417267451325895, "loss": 8.2596, "step": 200100 }, { "epoch": 0.8155708028096862, "grad_norm": 2.556081533432007, "learning_rate": 0.004841552819701353, "loss": 8.2201, "step": 200200 }, { "epoch": 0.8159781808330677, "grad_norm": 5.128891468048096, "learning_rate": 0.004841378801893684, "loss": 8.2111, "step": 200300 }, { "epoch": 0.8163855588564491, "grad_norm": 7.334069728851318, "learning_rate": 0.004841204691716469, "loss": 8.201, "step": 200400 }, { "epoch": 0.8167929368798306, "grad_norm": 2.5291171073913574, "learning_rate": 0.004841030489176587, "loss": 8.2851, "step": 200500 }, { "epoch": 0.8172003149032121, "grad_norm": 3.6551196575164795, "learning_rate": 0.004840856194280921, "loss": 8.2998, "step": 200600 }, { "epoch": 0.8176076929265935, "grad_norm": 5.844996452331543, "learning_rate": 0.004840681807036361, "loss": 8.2128, "step": 200700 }, { "epoch": 0.8180150709499749, "grad_norm": 0.4380483627319336, "learning_rate": 0.004840507327449816, "loss": 8.2821, "step": 200800 }, { "epoch": 0.8184224489733565, "grad_norm": 5.027729511260986, "learning_rate": 0.004840332755528171, "loss": 8.2823, "step": 200900 }, { "epoch": 0.8188298269967379, "grad_norm": 4.696913719177246, "learning_rate": 0.004840158091278329, "loss": 8.2708, "step": 201000 }, { "epoch": 0.8188298269967379, "eval_MaskedAccuracy": 0.4771777011503206, "eval_loss": 1.7562700510025024, "eval_runtime": 481.2755, "eval_samples_per_second": 131.891, "eval_steps_per_second": 0.515, "step": 201000 }, { "epoch": 0.8192372050201194, "grad_norm": 4.170097827911377, "learning_rate": 0.004839983334707201, "loss": 8.2597, "step": 201100 }, { "epoch": 0.8196445830435009, "grad_norm": 3.8776440620422363, "learning_rate": 0.004839808485821691, "loss": 8.2401, "step": 201200 }, { "epoch": 0.8200519610668823, "grad_norm": 5.236380577087402, "learning_rate": 0.004839633544628719, "loss": 8.1898, "step": 201300 }, { "epoch": 0.8204593390902638, "grad_norm": 0.6355003118515015, "learning_rate": 0.004839458511135197, "loss": 8.309, "step": 201400 }, { "epoch": 0.8208667171136452, "grad_norm": 4.979259967803955, "learning_rate": 0.004839283385348045, "loss": 8.2883, "step": 201500 }, { "epoch": 0.8212740951370266, "grad_norm": 6.542636871337891, "learning_rate": 0.00483910816727419, "loss": 8.1954, "step": 201600 }, { "epoch": 0.8216814731604082, "grad_norm": 2.4118804931640625, "learning_rate": 0.004838932856920554, "loss": 8.2065, "step": 201700 }, { "epoch": 0.8220888511837896, "grad_norm": 6.951291561126709, "learning_rate": 0.004838757454294068, "loss": 8.2174, "step": 201800 }, { "epoch": 0.822496229207171, "grad_norm": 6.4014177322387695, "learning_rate": 0.004838581959401661, "loss": 8.2311, "step": 201900 }, { "epoch": 0.8229036072305526, "grad_norm": 7.916312217712402, "learning_rate": 0.0048384063722502835, "loss": 8.1887, "step": 202000 }, { "epoch": 0.8229036072305526, "eval_MaskedAccuracy": 0.47698157197199315, "eval_loss": 1.75382399559021, "eval_runtime": 508.5186, "eval_samples_per_second": 124.825, "eval_steps_per_second": 0.488, "step": 202000 }, { "epoch": 0.823310985253934, "grad_norm": 4.69990348815918, "learning_rate": 0.004838230692846875, "loss": 8.2324, "step": 202100 }, { "epoch": 0.8237183632773155, "grad_norm": 1.1547861099243164, "learning_rate": 0.004838054921198377, "loss": 8.3198, "step": 202200 }, { "epoch": 0.8241257413006969, "grad_norm": 2.918012857437134, "learning_rate": 0.004837879057311743, "loss": 8.3413, "step": 202300 }, { "epoch": 0.8245331193240784, "grad_norm": 7.248411655426025, "learning_rate": 0.004837703101193917, "loss": 8.3124, "step": 202400 }, { "epoch": 0.8249404973474599, "grad_norm": 2.161184549331665, "learning_rate": 0.004837527052851858, "loss": 8.2408, "step": 202500 }, { "epoch": 0.8253478753708413, "grad_norm": 5.13443660736084, "learning_rate": 0.004837350912292535, "loss": 8.2681, "step": 202600 }, { "epoch": 0.8257552533942227, "grad_norm": 5.112154960632324, "learning_rate": 0.0048371746795229, "loss": 8.2744, "step": 202700 }, { "epoch": 0.8261626314176043, "grad_norm": 5.996163368225098, "learning_rate": 0.0048369983545499306, "loss": 8.2252, "step": 202800 }, { "epoch": 0.8265700094409857, "grad_norm": 4.185304164886475, "learning_rate": 0.0048368219373806, "loss": 8.1794, "step": 202900 }, { "epoch": 0.8269773874643671, "grad_norm": 5.415948867797852, "learning_rate": 0.004836645428021872, "loss": 8.2494, "step": 203000 }, { "epoch": 0.8269773874643671, "eval_MaskedAccuracy": 0.4802600949969667, "eval_loss": 1.7474703788757324, "eval_runtime": 557.2325, "eval_samples_per_second": 113.913, "eval_steps_per_second": 0.445, "step": 203000 }, { "epoch": 0.8273847654877486, "grad_norm": 2.4445393085479736, "learning_rate": 0.004836468826480733, "loss": 8.1957, "step": 203100 }, { "epoch": 0.8277921435111301, "grad_norm": 3.8953542709350586, "learning_rate": 0.004836292132764158, "loss": 8.2104, "step": 203200 }, { "epoch": 0.8281995215345115, "grad_norm": 3.1333765983581543, "learning_rate": 0.004836115346879137, "loss": 8.2912, "step": 203300 }, { "epoch": 0.828606899557893, "grad_norm": 3.902015209197998, "learning_rate": 0.004835938468832664, "loss": 8.2786, "step": 203400 }, { "epoch": 0.8290142775812744, "grad_norm": 6.234856605529785, "learning_rate": 0.004835761498631721, "loss": 8.2223, "step": 203500 }, { "epoch": 0.829421655604656, "grad_norm": 1.9330283403396606, "learning_rate": 0.004835584436283312, "loss": 8.1993, "step": 203600 }, { "epoch": 0.8298290336280374, "grad_norm": 1.8898357152938843, "learning_rate": 0.004835407281794435, "loss": 8.2911, "step": 203700 }, { "epoch": 0.8302364116514188, "grad_norm": 7.752363681793213, "learning_rate": 0.00483523003517209, "loss": 8.305, "step": 203800 }, { "epoch": 0.8306437896748003, "grad_norm": 1.9813470840454102, "learning_rate": 0.004835052696423295, "loss": 8.3072, "step": 203900 }, { "epoch": 0.8310511676981818, "grad_norm": 4.670867443084717, "learning_rate": 0.004834875265555053, "loss": 8.2232, "step": 204000 }, { "epoch": 0.8310511676981818, "eval_MaskedAccuracy": 0.4832815956989023, "eval_loss": 1.7318273782730103, "eval_runtime": 478.7258, "eval_samples_per_second": 132.594, "eval_steps_per_second": 0.518, "step": 204000 }, { "epoch": 0.8314585457215632, "grad_norm": 6.116964340209961, "learning_rate": 0.004834697742574377, "loss": 8.2046, "step": 204100 }, { "epoch": 0.8318659237449447, "grad_norm": 2.5044636726379395, "learning_rate": 0.004834520127488294, "loss": 8.2273, "step": 204200 }, { "epoch": 0.8322733017683261, "grad_norm": 4.650371551513672, "learning_rate": 0.0048343424203038205, "loss": 8.3231, "step": 204300 }, { "epoch": 0.8326806797917076, "grad_norm": 2.9488110542297363, "learning_rate": 0.00483416462102798, "loss": 8.3034, "step": 204400 }, { "epoch": 0.8330880578150891, "grad_norm": 4.800295352935791, "learning_rate": 0.004833986729667807, "loss": 8.2262, "step": 204500 }, { "epoch": 0.8334954358384705, "grad_norm": 5.492660045623779, "learning_rate": 0.004833808746230335, "loss": 8.2045, "step": 204600 }, { "epoch": 0.833902813861852, "grad_norm": 3.42221999168396, "learning_rate": 0.004833630670722601, "loss": 8.3011, "step": 204700 }, { "epoch": 0.8343101918852335, "grad_norm": 5.113192558288574, "learning_rate": 0.004833452503151636, "loss": 8.3247, "step": 204800 }, { "epoch": 0.8347175699086149, "grad_norm": 3.8817861080169678, "learning_rate": 0.004833274243524488, "loss": 8.3233, "step": 204900 }, { "epoch": 0.8351249479319964, "grad_norm": 4.52166748046875, "learning_rate": 0.004833095891848208, "loss": 8.2378, "step": 205000 }, { "epoch": 0.8351249479319964, "eval_MaskedAccuracy": 0.48239745624371705, "eval_loss": 1.7319921255111694, "eval_runtime": 463.2795, "eval_samples_per_second": 137.014, "eval_steps_per_second": 0.535, "step": 205000 }, { "epoch": 0.8355323259553779, "grad_norm": 1.309309720993042, "learning_rate": 0.004832917448129844, "loss": 8.2326, "step": 205100 }, { "epoch": 0.8359397039787593, "grad_norm": 5.92283296585083, "learning_rate": 0.004832738912376453, "loss": 8.32, "step": 205200 }, { "epoch": 0.8363470820021408, "grad_norm": 3.4327287673950195, "learning_rate": 0.004832560284595092, "loss": 8.2146, "step": 205300 }, { "epoch": 0.8367544600255222, "grad_norm": 3.296018600463867, "learning_rate": 0.004832381564792831, "loss": 8.2043, "step": 205400 }, { "epoch": 0.8371618380489037, "grad_norm": 12.764690399169922, "learning_rate": 0.004832202752976721, "loss": 8.2826, "step": 205500 }, { "epoch": 0.8375692160722852, "grad_norm": 10.4335355758667, "learning_rate": 0.004832023849153844, "loss": 8.3267, "step": 205600 }, { "epoch": 0.8379765940956666, "grad_norm": 2.16225528717041, "learning_rate": 0.00483184485333127, "loss": 8.2872, "step": 205700 }, { "epoch": 0.838383972119048, "grad_norm": 5.040310859680176, "learning_rate": 0.004831665765516073, "loss": 8.1863, "step": 205800 }, { "epoch": 0.8387913501424296, "grad_norm": 6.956847190856934, "learning_rate": 0.00483148658571533, "loss": 8.1587, "step": 205900 }, { "epoch": 0.839198728165811, "grad_norm": 3.0041253566741943, "learning_rate": 0.004831307313936133, "loss": 8.1822, "step": 206000 }, { "epoch": 0.839198728165811, "eval_MaskedAccuracy": 0.48427526915216307, "eval_loss": 1.7122535705566406, "eval_runtime": 597.3019, "eval_samples_per_second": 106.271, "eval_steps_per_second": 0.415, "step": 206000 }, { "epoch": 0.8396061061891925, "grad_norm": 12.86263370513916, "learning_rate": 0.004831127950185568, "loss": 8.1797, "step": 206100 }, { "epoch": 0.8400134842125739, "grad_norm": 6.6114702224731445, "learning_rate": 0.004830948494470716, "loss": 8.2865, "step": 206200 }, { "epoch": 0.8404208622359554, "grad_norm": 1.7653467655181885, "learning_rate": 0.0048307689467986855, "loss": 8.3733, "step": 206300 }, { "epoch": 0.8408282402593369, "grad_norm": 4.993772983551025, "learning_rate": 0.004830589307176563, "loss": 8.3702, "step": 206400 }, { "epoch": 0.8412356182827183, "grad_norm": 4.925052165985107, "learning_rate": 0.004830409575611455, "loss": 8.3024, "step": 206500 }, { "epoch": 0.8416429963060997, "grad_norm": 4.0374064445495605, "learning_rate": 0.004830229752110483, "loss": 8.266, "step": 206600 }, { "epoch": 0.8420503743294813, "grad_norm": 5.8857502937316895, "learning_rate": 0.004830049836680735, "loss": 8.2176, "step": 206700 }, { "epoch": 0.8424577523528627, "grad_norm": 2.368356943130493, "learning_rate": 0.004829869829329334, "loss": 8.1805, "step": 206800 }, { "epoch": 0.8428651303762441, "grad_norm": 1.3813912868499756, "learning_rate": 0.0048296897300634, "loss": 8.2906, "step": 206900 }, { "epoch": 0.8432725083996256, "grad_norm": 2.6662254333496094, "learning_rate": 0.004829509538890036, "loss": 8.3135, "step": 207000 }, { "epoch": 0.8432725083996256, "eval_MaskedAccuracy": 0.4793077358420627, "eval_loss": 1.7458363771438599, "eval_runtime": 594.7579, "eval_samples_per_second": 106.726, "eval_steps_per_second": 0.417, "step": 207000 }, { "epoch": 0.8436798864230071, "grad_norm": 4.135429382324219, "learning_rate": 0.004829329255816385, "loss": 8.3306, "step": 207100 }, { "epoch": 0.8440872644463886, "grad_norm": 4.559167861938477, "learning_rate": 0.004829148880849567, "loss": 8.2143, "step": 207200 }, { "epoch": 0.84449464246977, "grad_norm": 5.941912651062012, "learning_rate": 0.004828968413996712, "loss": 8.2289, "step": 207300 }, { "epoch": 0.8449020204931514, "grad_norm": 5.322763919830322, "learning_rate": 0.00482878785526496, "loss": 8.2137, "step": 207400 }, { "epoch": 0.845309398516533, "grad_norm": 3.8385138511657715, "learning_rate": 0.004828607204661446, "loss": 8.2313, "step": 207500 }, { "epoch": 0.8457167765399144, "grad_norm": 5.177248001098633, "learning_rate": 0.00482842646219331, "loss": 8.2714, "step": 207600 }, { "epoch": 0.8461241545632958, "grad_norm": 5.964561939239502, "learning_rate": 0.004828245627867698, "loss": 8.2937, "step": 207700 }, { "epoch": 0.8465315325866773, "grad_norm": 2.427088499069214, "learning_rate": 0.004828064701691758, "loss": 8.1818, "step": 207800 }, { "epoch": 0.8469389106100588, "grad_norm": 8.923009872436523, "learning_rate": 0.0048278836836726435, "loss": 8.2604, "step": 207900 }, { "epoch": 0.8473462886334402, "grad_norm": 2.955294370651245, "learning_rate": 0.004827702573817521, "loss": 8.2247, "step": 208000 }, { "epoch": 0.8473462886334402, "eval_MaskedAccuracy": 0.4828405931102095, "eval_loss": 1.7338587045669556, "eval_runtime": 401.7982, "eval_samples_per_second": 157.98, "eval_steps_per_second": 0.617, "step": 208000 }, { "epoch": 0.8477536666568217, "grad_norm": 4.922137260437012, "learning_rate": 0.0048275213721335405, "loss": 8.1972, "step": 208100 }, { "epoch": 0.8481610446802031, "grad_norm": 7.44797945022583, "learning_rate": 0.004827340078627866, "loss": 8.1997, "step": 208200 }, { "epoch": 0.8485684227035846, "grad_norm": 3.5857973098754883, "learning_rate": 0.004827158693307668, "loss": 8.1282, "step": 208300 }, { "epoch": 0.8489758007269661, "grad_norm": 5.662656784057617, "learning_rate": 0.004826977216180114, "loss": 8.1625, "step": 208400 }, { "epoch": 0.8493831787503475, "grad_norm": 5.123605728149414, "learning_rate": 0.004826795647252382, "loss": 8.1483, "step": 208500 }, { "epoch": 0.8497905567737291, "grad_norm": 6.954997539520264, "learning_rate": 0.0048266139865316545, "loss": 8.1298, "step": 208600 }, { "epoch": 0.8501979347971105, "grad_norm": 5.438270092010498, "learning_rate": 0.004826432234025106, "loss": 8.1321, "step": 208700 }, { "epoch": 0.8506053128204919, "grad_norm": 6.686572551727295, "learning_rate": 0.004826250389739933, "loss": 8.0882, "step": 208800 }, { "epoch": 0.8510126908438734, "grad_norm": 5.251838207244873, "learning_rate": 0.004826068453683314, "loss": 8.1225, "step": 208900 }, { "epoch": 0.8514200688672549, "grad_norm": 4.220588684082031, "learning_rate": 0.004825886425862444, "loss": 8.1675, "step": 209000 }, { "epoch": 0.8514200688672549, "eval_MaskedAccuracy": 0.48592952135121487, "eval_loss": 1.7134249210357666, "eval_runtime": 416.3753, "eval_samples_per_second": 152.449, "eval_steps_per_second": 0.596, "step": 209000 }, { "epoch": 0.8518274468906363, "grad_norm": 8.761860847473145, "learning_rate": 0.004825704306284519, "loss": 8.0808, "step": 209100 }, { "epoch": 0.8522348249140178, "grad_norm": 4.385628700256348, "learning_rate": 0.004825522094956739, "loss": 8.1519, "step": 209200 }, { "epoch": 0.8526422029373992, "grad_norm": 6.701074600219727, "learning_rate": 0.004825339791886317, "loss": 8.1493, "step": 209300 }, { "epoch": 0.8530495809607807, "grad_norm": 4.271905422210693, "learning_rate": 0.004825157397080448, "loss": 8.131, "step": 209400 }, { "epoch": 0.8534569589841622, "grad_norm": 5.665177345275879, "learning_rate": 0.004824974910546352, "loss": 8.1649, "step": 209500 }, { "epoch": 0.8538643370075436, "grad_norm": 1.0476863384246826, "learning_rate": 0.004824792332291235, "loss": 8.2765, "step": 209600 }, { "epoch": 0.8542717150309251, "grad_norm": 5.7169928550720215, "learning_rate": 0.004824609662322323, "loss": 8.3905, "step": 209700 }, { "epoch": 0.8546790930543066, "grad_norm": 3.344644784927368, "learning_rate": 0.004824426900646833, "loss": 8.327, "step": 209800 }, { "epoch": 0.855086471077688, "grad_norm": 3.115905523300171, "learning_rate": 0.004824244047271991, "loss": 8.2777, "step": 209900 }, { "epoch": 0.8554938491010695, "grad_norm": 2.913144111633301, "learning_rate": 0.004824061102205035, "loss": 8.2966, "step": 210000 }, { "epoch": 0.8554938491010695, "eval_MaskedAccuracy": 0.4802323235623799, "eval_loss": 1.7398486137390137, "eval_runtime": 612.4014, "eval_samples_per_second": 103.651, "eval_steps_per_second": 0.405, "step": 210000 }, { "epoch": 0.8559012271244509, "grad_norm": 4.763663291931152, "learning_rate": 0.0048238780654531865, "loss": 8.1959, "step": 210100 }, { "epoch": 0.8563086051478324, "grad_norm": 7.354310512542725, "learning_rate": 0.004823694937023684, "loss": 8.198, "step": 210200 }, { "epoch": 0.8567159831712139, "grad_norm": 1.976967692375183, "learning_rate": 0.0048235117169237686, "loss": 8.3064, "step": 210300 }, { "epoch": 0.8571233611945953, "grad_norm": 5.958095073699951, "learning_rate": 0.004823328405160683, "loss": 8.3467, "step": 210400 }, { "epoch": 0.8575307392179767, "grad_norm": 5.468306064605713, "learning_rate": 0.004823145001741677, "loss": 8.2654, "step": 210500 }, { "epoch": 0.8579381172413583, "grad_norm": 3.822810173034668, "learning_rate": 0.004822961506674, "loss": 8.2077, "step": 210600 }, { "epoch": 0.8583454952647397, "grad_norm": 7.963845252990723, "learning_rate": 0.004822777919964915, "loss": 8.1987, "step": 210700 }, { "epoch": 0.8587528732881211, "grad_norm": 8.342864990234375, "learning_rate": 0.004822594241621665, "loss": 8.2783, "step": 210800 }, { "epoch": 0.8591602513115026, "grad_norm": 5.5617852210998535, "learning_rate": 0.004822410471651524, "loss": 8.3149, "step": 210900 }, { "epoch": 0.8595676293348841, "grad_norm": 3.1351521015167236, "learning_rate": 0.004822226610061748, "loss": 8.1942, "step": 211000 }, { "epoch": 0.8595676293348841, "eval_MaskedAccuracy": 0.48314665494330045, "eval_loss": 1.7261412143707275, "eval_runtime": 672.4997, "eval_samples_per_second": 94.388, "eval_steps_per_second": 0.369, "step": 211000 }, { "epoch": 0.8599750073582656, "grad_norm": 4.378059387207031, "learning_rate": 0.004822042656859616, "loss": 8.205, "step": 211100 }, { "epoch": 0.860382385381647, "grad_norm": 3.8245508670806885, "learning_rate": 0.0048218586120523915, "loss": 8.2617, "step": 211200 }, { "epoch": 0.8607897634050284, "grad_norm": 3.023498773574829, "learning_rate": 0.004821674475647355, "loss": 8.2805, "step": 211300 }, { "epoch": 0.86119714142841, "grad_norm": 5.668664455413818, "learning_rate": 0.004821490247651791, "loss": 8.1826, "step": 211400 }, { "epoch": 0.8616045194517914, "grad_norm": 5.902626991271973, "learning_rate": 0.004821305928072968, "loss": 8.1585, "step": 211500 }, { "epoch": 0.8620118974751728, "grad_norm": 4.480663299560547, "learning_rate": 0.004821121516918187, "loss": 8.1259, "step": 211600 }, { "epoch": 0.8624192754985543, "grad_norm": 7.29260778427124, "learning_rate": 0.004820937014194738, "loss": 8.1503, "step": 211700 }, { "epoch": 0.8628266535219358, "grad_norm": 3.6903669834136963, "learning_rate": 0.004820752419909912, "loss": 8.1551, "step": 211800 }, { "epoch": 0.8632340315453172, "grad_norm": 4.0630388259887695, "learning_rate": 0.00482056773407101, "loss": 8.1472, "step": 211900 }, { "epoch": 0.8636414095686987, "grad_norm": 4.894790172576904, "learning_rate": 0.004820382956685329, "loss": 8.1251, "step": 212000 }, { "epoch": 0.8636414095686987, "eval_MaskedAccuracy": 0.48551628369224803, "eval_loss": 1.7234163284301758, "eval_runtime": 639.0434, "eval_samples_per_second": 99.33, "eval_steps_per_second": 0.388, "step": 212000 }, { "epoch": 0.8640487875920801, "grad_norm": 5.390108585357666, "learning_rate": 0.00482019808776018, "loss": 8.1315, "step": 212100 }, { "epoch": 0.8644561656154617, "grad_norm": 5.917113780975342, "learning_rate": 0.00482001312730286, "loss": 8.1449, "step": 212200 }, { "epoch": 0.8648635436388431, "grad_norm": 6.876038551330566, "learning_rate": 0.004819828075320687, "loss": 8.108, "step": 212300 }, { "epoch": 0.8652709216622245, "grad_norm": 4.333359241485596, "learning_rate": 0.004819642931820984, "loss": 8.1293, "step": 212400 }, { "epoch": 0.8656782996856061, "grad_norm": 4.042921543121338, "learning_rate": 0.0048194576968110725, "loss": 8.2434, "step": 212500 }, { "epoch": 0.8660856777089875, "grad_norm": 20.584726333618164, "learning_rate": 0.004819272370298266, "loss": 8.3424, "step": 212600 }, { "epoch": 0.8664930557323689, "grad_norm": 0.9013800621032715, "learning_rate": 0.004819086952289896, "loss": 8.4333, "step": 212700 }, { "epoch": 0.8669004337557504, "grad_norm": 2.704648971557617, "learning_rate": 0.004818901442793292, "loss": 8.3012, "step": 212800 }, { "epoch": 0.8673078117791319, "grad_norm": 5.314194679260254, "learning_rate": 0.00481871584181579, "loss": 8.2101, "step": 212900 }, { "epoch": 0.8677151898025133, "grad_norm": 5.427734851837158, "learning_rate": 0.004818530149364725, "loss": 8.1636, "step": 213000 }, { "epoch": 0.8677151898025133, "eval_MaskedAccuracy": 0.4840721691740113, "eval_loss": 1.7264434099197388, "eval_runtime": 526.1976, "eval_samples_per_second": 120.631, "eval_steps_per_second": 0.471, "step": 213000 }, { "epoch": 0.8681225678258948, "grad_norm": 3.8905630111694336, "learning_rate": 0.004818344365447432, "loss": 8.1378, "step": 213100 }, { "epoch": 0.8685299458492762, "grad_norm": 3.7320637702941895, "learning_rate": 0.004818158490071262, "loss": 8.1892, "step": 213200 }, { "epoch": 0.8689373238726577, "grad_norm": 6.944694995880127, "learning_rate": 0.004817972523243571, "loss": 8.1385, "step": 213300 }, { "epoch": 0.8693447018960392, "grad_norm": 3.708341598510742, "learning_rate": 0.004817786464971709, "loss": 8.1633, "step": 213400 }, { "epoch": 0.8697520799194206, "grad_norm": 4.6970720291137695, "learning_rate": 0.004817600315263026, "loss": 8.1282, "step": 213500 }, { "epoch": 0.8701594579428021, "grad_norm": 4.433615207672119, "learning_rate": 0.0048174140741248815, "loss": 8.1219, "step": 213600 }, { "epoch": 0.8705668359661836, "grad_norm": 3.3684661388397217, "learning_rate": 0.0048172277415646425, "loss": 8.1104, "step": 213700 }, { "epoch": 0.870974213989565, "grad_norm": 5.546548366546631, "learning_rate": 0.004817041317589675, "loss": 8.1474, "step": 213800 }, { "epoch": 0.8713815920129465, "grad_norm": 4.353720188140869, "learning_rate": 0.004816854802207345, "loss": 8.1071, "step": 213900 }, { "epoch": 0.8717889700363279, "grad_norm": 3.027634859085083, "learning_rate": 0.0048166681954250265, "loss": 8.097, "step": 214000 }, { "epoch": 0.8717889700363279, "eval_MaskedAccuracy": 0.4856966584998987, "eval_loss": 1.7125508785247803, "eval_runtime": 593.1009, "eval_samples_per_second": 107.024, "eval_steps_per_second": 0.418, "step": 214000 }, { "epoch": 0.8721963480597094, "grad_norm": 6.464120388031006, "learning_rate": 0.00481648149725011, "loss": 8.2545, "step": 214100 }, { "epoch": 0.8726037260830909, "grad_norm": 1.8093879222869873, "learning_rate": 0.004816294707689969, "loss": 8.3416, "step": 214200 }, { "epoch": 0.8730111041064723, "grad_norm": 9.328423500061035, "learning_rate": 0.004816107826751984, "loss": 8.3573, "step": 214300 }, { "epoch": 0.8734184821298537, "grad_norm": 6.261687755584717, "learning_rate": 0.004815920854443545, "loss": 8.3704, "step": 214400 }, { "epoch": 0.8738258601532353, "grad_norm": 5.908254623413086, "learning_rate": 0.0048157337907720425, "loss": 8.2167, "step": 214500 }, { "epoch": 0.8742332381766167, "grad_norm": 4.408290386199951, "learning_rate": 0.004815546635744878, "loss": 8.2114, "step": 214600 }, { "epoch": 0.8746406161999982, "grad_norm": 6.703604221343994, "learning_rate": 0.004815359389369443, "loss": 8.2206, "step": 214700 }, { "epoch": 0.8750479942233796, "grad_norm": 4.957728385925293, "learning_rate": 0.004815172051653151, "loss": 8.2255, "step": 214800 }, { "epoch": 0.8754553722467611, "grad_norm": 8.815536499023438, "learning_rate": 0.004814984622603397, "loss": 8.229, "step": 214900 }, { "epoch": 0.8758627502701426, "grad_norm": 7.434412956237793, "learning_rate": 0.0048147971022275944, "loss": 8.3109, "step": 215000 }, { "epoch": 0.8758627502701426, "eval_MaskedAccuracy": 0.4821472093482054, "eval_loss": 1.7343100309371948, "eval_runtime": 575.8219, "eval_samples_per_second": 110.235, "eval_steps_per_second": 0.431, "step": 215000 }, { "epoch": 0.876270128293524, "grad_norm": 3.0055723190307617, "learning_rate": 0.004814609490533161, "loss": 8.239, "step": 215100 }, { "epoch": 0.8766775063169054, "grad_norm": 2.9075305461883545, "learning_rate": 0.004814421787527511, "loss": 8.2266, "step": 215200 }, { "epoch": 0.877084884340287, "grad_norm": 2.666818141937256, "learning_rate": 0.004814233993218074, "loss": 8.271, "step": 215300 }, { "epoch": 0.8774922623636684, "grad_norm": 5.189484119415283, "learning_rate": 0.0048140461076122635, "loss": 8.2923, "step": 215400 }, { "epoch": 0.8778996403870498, "grad_norm": 2.3578219413757324, "learning_rate": 0.00481385813071751, "loss": 8.2224, "step": 215500 }, { "epoch": 0.8783070184104314, "grad_norm": 5.166990280151367, "learning_rate": 0.004813670062541242, "loss": 8.1742, "step": 215600 }, { "epoch": 0.8787143964338128, "grad_norm": 3.812676429748535, "learning_rate": 0.004813481903090902, "loss": 8.2637, "step": 215700 }, { "epoch": 0.8791217744571942, "grad_norm": 4.2266645431518555, "learning_rate": 0.004813293652373921, "loss": 8.3079, "step": 215800 }, { "epoch": 0.8795291524805757, "grad_norm": 7.559784889221191, "learning_rate": 0.004813105310397747, "loss": 8.2331, "step": 215900 }, { "epoch": 0.8799365305039571, "grad_norm": 2.5249407291412354, "learning_rate": 0.004812916877169828, "loss": 8.2012, "step": 216000 }, { "epoch": 0.8799365305039571, "eval_MaskedAccuracy": 0.4843116807781347, "eval_loss": 1.7242289781570435, "eval_runtime": 619.7865, "eval_samples_per_second": 102.416, "eval_steps_per_second": 0.4, "step": 216000 }, { "epoch": 0.8803439085273387, "grad_norm": 7.770275592803955, "learning_rate": 0.004812728352697608, "loss": 8.2158, "step": 216100 }, { "epoch": 0.8807512865507201, "grad_norm": 5.232234477996826, "learning_rate": 0.004812539736988547, "loss": 8.3067, "step": 216200 }, { "epoch": 0.8811586645741015, "grad_norm": 6.027997016906738, "learning_rate": 0.004812351030050096, "loss": 8.1876, "step": 216300 }, { "epoch": 0.8815660425974831, "grad_norm": 3.0700552463531494, "learning_rate": 0.004812162231889718, "loss": 8.1517, "step": 216400 }, { "epoch": 0.8819734206208645, "grad_norm": 2.3771629333496094, "learning_rate": 0.004811973342514877, "loss": 8.1428, "step": 216500 }, { "epoch": 0.8823807986442459, "grad_norm": 2.1454079151153564, "learning_rate": 0.0048117843619330455, "loss": 8.2716, "step": 216600 }, { "epoch": 0.8827881766676274, "grad_norm": 6.59639835357666, "learning_rate": 0.004811595290151687, "loss": 8.2629, "step": 216700 }, { "epoch": 0.8831955546910089, "grad_norm": 3.637233257293701, "learning_rate": 0.004811406127178285, "loss": 8.1797, "step": 216800 }, { "epoch": 0.8836029327143903, "grad_norm": 5.26485013961792, "learning_rate": 0.004811216873020314, "loss": 8.144, "step": 216900 }, { "epoch": 0.8840103107377718, "grad_norm": 3.676942825317383, "learning_rate": 0.004811027527685249, "loss": 8.1487, "step": 217000 }, { "epoch": 0.8840103107377718, "eval_MaskedAccuracy": 0.4837425891402228, "eval_loss": 1.7248014211654663, "eval_runtime": 715.549, "eval_samples_per_second": 88.71, "eval_steps_per_second": 0.347, "step": 217000 }, { "epoch": 0.8844176887611532, "grad_norm": 5.9648356437683105, "learning_rate": 0.00481083809118058, "loss": 8.2781, "step": 217100 }, { "epoch": 0.8848250667845347, "grad_norm": 2.26716947555542, "learning_rate": 0.004810648563513804, "loss": 8.3153, "step": 217200 }, { "epoch": 0.8852324448079162, "grad_norm": 4.865094184875488, "learning_rate": 0.004810458944692406, "loss": 8.3395, "step": 217300 }, { "epoch": 0.8856398228312976, "grad_norm": 3.7103941440582275, "learning_rate": 0.004810269234723887, "loss": 8.3044, "step": 217400 }, { "epoch": 0.8860472008546791, "grad_norm": 6.04404878616333, "learning_rate": 0.004810079433615747, "loss": 8.2884, "step": 217500 }, { "epoch": 0.8864545788780606, "grad_norm": 8.202356338500977, "learning_rate": 0.004809889541375489, "loss": 8.2241, "step": 217600 }, { "epoch": 0.886861956901442, "grad_norm": 4.739055156707764, "learning_rate": 0.004809699558010626, "loss": 8.2208, "step": 217700 }, { "epoch": 0.8872693349248235, "grad_norm": 4.190792560577393, "learning_rate": 0.0048095094835286555, "loss": 8.2104, "step": 217800 }, { "epoch": 0.8876767129482049, "grad_norm": 4.8854241371154785, "learning_rate": 0.0048093193179371, "loss": 8.1439, "step": 217900 }, { "epoch": 0.8880840909715864, "grad_norm": 5.460559368133545, "learning_rate": 0.004809129061243479, "loss": 8.1067, "step": 218000 }, { "epoch": 0.8880840909715864, "eval_MaskedAccuracy": 0.4865924815837562, "eval_loss": 1.7140520811080933, "eval_runtime": 552.9316, "eval_samples_per_second": 114.799, "eval_steps_per_second": 0.449, "step": 218000 }, { "epoch": 0.8884914689949679, "grad_norm": 12.777691841125488, "learning_rate": 0.0048089387134553135, "loss": 8.1755, "step": 218100 }, { "epoch": 0.8888988470183493, "grad_norm": 5.406383991241455, "learning_rate": 0.004808748274580128, "loss": 8.2656, "step": 218200 }, { "epoch": 0.8893062250417307, "grad_norm": 6.635910511016846, "learning_rate": 0.004808557744625446, "loss": 8.2507, "step": 218300 }, { "epoch": 0.8897136030651123, "grad_norm": 5.01666259765625, "learning_rate": 0.004808367123598811, "loss": 8.1709, "step": 218400 }, { "epoch": 0.8901209810884937, "grad_norm": 1.3777137994766235, "learning_rate": 0.004808176411507757, "loss": 8.1806, "step": 218500 }, { "epoch": 0.8905283591118752, "grad_norm": 0.8259178996086121, "learning_rate": 0.004807985608359816, "loss": 8.2648, "step": 218600 }, { "epoch": 0.8909357371352566, "grad_norm": 5.552270889282227, "learning_rate": 0.004807794714162537, "loss": 8.3019, "step": 218700 }, { "epoch": 0.8913431151586381, "grad_norm": 5.504025459289551, "learning_rate": 0.004807603728923469, "loss": 8.2998, "step": 218800 }, { "epoch": 0.8917504931820196, "grad_norm": 3.2907345294952393, "learning_rate": 0.004807412652650158, "loss": 8.2237, "step": 218900 }, { "epoch": 0.892157871205401, "grad_norm": 2.9608821868896484, "learning_rate": 0.004807221485350164, "loss": 8.1758, "step": 219000 }, { "epoch": 0.892157871205401, "eval_MaskedAccuracy": 0.48468191219256895, "eval_loss": 1.7179230451583862, "eval_runtime": 607.2542, "eval_samples_per_second": 104.53, "eval_steps_per_second": 0.408, "step": 219000 }, { "epoch": 0.8925652492287824, "grad_norm": 4.358753204345703, "learning_rate": 0.004807030227031042, "loss": 8.1467, "step": 219100 }, { "epoch": 0.892972627252164, "grad_norm": 3.311702013015747, "learning_rate": 0.004806838877700354, "loss": 8.1996, "step": 219200 }, { "epoch": 0.8933800052755454, "grad_norm": 4.425884246826172, "learning_rate": 0.00480664743736566, "loss": 8.1468, "step": 219300 }, { "epoch": 0.8937873832989268, "grad_norm": 4.711964130401611, "learning_rate": 0.004806455906034538, "loss": 8.1468, "step": 219400 }, { "epoch": 0.8941947613223084, "grad_norm": 18.98705291748047, "learning_rate": 0.004806264283714551, "loss": 8.1923, "step": 219500 }, { "epoch": 0.8946021393456898, "grad_norm": 4.508944511413574, "learning_rate": 0.004806072570413277, "loss": 8.2644, "step": 219600 }, { "epoch": 0.8950095173690712, "grad_norm": 6.128897190093994, "learning_rate": 0.004805880766138295, "loss": 8.1459, "step": 219700 }, { "epoch": 0.8954168953924527, "grad_norm": 3.9684112071990967, "learning_rate": 0.004805688870897193, "loss": 8.1247, "step": 219800 }, { "epoch": 0.8958242734158341, "grad_norm": 4.559117794036865, "learning_rate": 0.004805496884697555, "loss": 8.1418, "step": 219900 }, { "epoch": 0.8962316514392157, "grad_norm": 4.458334922790527, "learning_rate": 0.004805304807546975, "loss": 8.1243, "step": 220000 }, { "epoch": 0.8962316514392157, "eval_MaskedAccuracy": 0.4864829520402149, "eval_loss": 1.7186992168426514, "eval_runtime": 605.6496, "eval_samples_per_second": 104.806, "eval_steps_per_second": 0.409, "step": 220000 }, { "epoch": 0.8966390294625971, "grad_norm": 6.1991286277771, "learning_rate": 0.004805112639453043, "loss": 8.1469, "step": 220100 }, { "epoch": 0.8970464074859785, "grad_norm": 4.865583896636963, "learning_rate": 0.004804920380423352, "loss": 8.0631, "step": 220200 }, { "epoch": 0.8974537855093601, "grad_norm": 4.732316493988037, "learning_rate": 0.004804728030465512, "loss": 8.0916, "step": 220300 }, { "epoch": 0.8978611635327415, "grad_norm": 13.524624824523926, "learning_rate": 0.004804535589587124, "loss": 8.1176, "step": 220400 }, { "epoch": 0.8982685415561229, "grad_norm": 5.715292930603027, "learning_rate": 0.004804343057795791, "loss": 8.2386, "step": 220500 }, { "epoch": 0.8986759195795044, "grad_norm": 5.178189277648926, "learning_rate": 0.004804150435099133, "loss": 8.2304, "step": 220600 }, { "epoch": 0.8990832976028859, "grad_norm": 4.106097221374512, "learning_rate": 0.004803957721504758, "loss": 8.203, "step": 220700 }, { "epoch": 0.8994906756262673, "grad_norm": 6.310755729675293, "learning_rate": 0.004803764917020284, "loss": 8.1797, "step": 220800 }, { "epoch": 0.8998980536496488, "grad_norm": 6.1565752029418945, "learning_rate": 0.004803572021653345, "loss": 8.0992, "step": 220900 }, { "epoch": 0.9003054316730302, "grad_norm": 6.599998474121094, "learning_rate": 0.004803379035411562, "loss": 8.1698, "step": 221000 }, { "epoch": 0.9003054316730302, "eval_MaskedAccuracy": 0.4855747451078135, "eval_loss": 1.7180355787277222, "eval_runtime": 632.8244, "eval_samples_per_second": 100.306, "eval_steps_per_second": 0.392, "step": 221000 }, { "epoch": 0.9007128096964118, "grad_norm": 1.949260950088501, "learning_rate": 0.004803185958302558, "loss": 8.2509, "step": 221100 }, { "epoch": 0.9011201877197932, "grad_norm": 4.0300211906433105, "learning_rate": 0.004802992790333973, "loss": 8.2532, "step": 221200 }, { "epoch": 0.9015275657431746, "grad_norm": 4.199332237243652, "learning_rate": 0.0048027995315134435, "loss": 8.2503, "step": 221300 }, { "epoch": 0.9019349437665561, "grad_norm": 5.702756881713867, "learning_rate": 0.004802606181848606, "loss": 8.1876, "step": 221400 }, { "epoch": 0.9023423217899376, "grad_norm": 5.0414838790893555, "learning_rate": 0.004802412741347112, "loss": 8.1659, "step": 221500 }, { "epoch": 0.902749699813319, "grad_norm": 3.2576799392700195, "learning_rate": 0.004802219210016601, "loss": 8.1218, "step": 221600 }, { "epoch": 0.9031570778367005, "grad_norm": 3.9214446544647217, "learning_rate": 0.004802025587864734, "loss": 8.1341, "step": 221700 }, { "epoch": 0.9035644558600819, "grad_norm": 6.032358646392822, "learning_rate": 0.004801831874899155, "loss": 8.1105, "step": 221800 }, { "epoch": 0.9039718338834634, "grad_norm": 6.424914836883545, "learning_rate": 0.004801638071127528, "loss": 8.1357, "step": 221900 }, { "epoch": 0.9043792119068449, "grad_norm": 3.933969020843506, "learning_rate": 0.004801444176557512, "loss": 8.1293, "step": 222000 }, { "epoch": 0.9043792119068449, "eval_MaskedAccuracy": 0.486812708382253, "eval_loss": 1.7057329416275024, "eval_runtime": 636.6308, "eval_samples_per_second": 99.706, "eval_steps_per_second": 0.39, "step": 222000 }, { "epoch": 0.9047865899302263, "grad_norm": 4.703785419464111, "learning_rate": 0.004801250191196776, "loss": 8.1116, "step": 222100 }, { "epoch": 0.9051939679536077, "grad_norm": 4.067076206207275, "learning_rate": 0.0048010561150529955, "loss": 8.1207, "step": 222200 }, { "epoch": 0.9056013459769893, "grad_norm": 4.6195502281188965, "learning_rate": 0.004800861948133838, "loss": 8.114, "step": 222300 }, { "epoch": 0.9060087240003707, "grad_norm": 3.2566609382629395, "learning_rate": 0.0048006676904469745, "loss": 8.2789, "step": 222400 }, { "epoch": 0.9064161020237522, "grad_norm": 5.413635730743408, "learning_rate": 0.00480047334200009, "loss": 8.3084, "step": 222500 }, { "epoch": 0.9068234800471336, "grad_norm": 5.0853095054626465, "learning_rate": 0.004800278902800868, "loss": 8.2031, "step": 222600 }, { "epoch": 0.9072308580705151, "grad_norm": 3.3110313415527344, "learning_rate": 0.004800084372856993, "loss": 8.3235, "step": 222700 }, { "epoch": 0.9076382360938966, "grad_norm": 2.2277956008911133, "learning_rate": 0.004799889752176161, "loss": 8.2674, "step": 222800 }, { "epoch": 0.908045614117278, "grad_norm": 10.274942398071289, "learning_rate": 0.004799695040766057, "loss": 8.3539, "step": 222900 }, { "epoch": 0.9084529921406594, "grad_norm": 4.456951141357422, "learning_rate": 0.004799500238634387, "loss": 8.3261, "step": 223000 }, { "epoch": 0.9084529921406594, "eval_MaskedAccuracy": 0.47838385192669225, "eval_loss": 1.750592827796936, "eval_runtime": 567.4718, "eval_samples_per_second": 111.858, "eval_steps_per_second": 0.437, "step": 223000 }, { "epoch": 0.908860370164041, "grad_norm": 6.92998743057251, "learning_rate": 0.004799305345788852, "loss": 8.1969, "step": 223100 }, { "epoch": 0.9092677481874224, "grad_norm": 2.9873671531677246, "learning_rate": 0.0047991103622371575, "loss": 8.1887, "step": 223200 }, { "epoch": 0.9096751262108038, "grad_norm": 4.813361167907715, "learning_rate": 0.0047989152879870075, "loss": 8.178, "step": 223300 }, { "epoch": 0.9100825042341854, "grad_norm": 2.244819402694702, "learning_rate": 0.0047987201230461135, "loss": 8.2732, "step": 223400 }, { "epoch": 0.9104898822575668, "grad_norm": 4.322752475738525, "learning_rate": 0.004798524867422201, "loss": 8.2718, "step": 223500 }, { "epoch": 0.9108972602809483, "grad_norm": 3.421313762664795, "learning_rate": 0.004798329521122984, "loss": 8.2029, "step": 223600 }, { "epoch": 0.9113046383043297, "grad_norm": 4.621647834777832, "learning_rate": 0.004798134084156185, "loss": 8.27, "step": 223700 }, { "epoch": 0.9117120163277111, "grad_norm": 4.301358699798584, "learning_rate": 0.004797938556529529, "loss": 8.1699, "step": 223800 }, { "epoch": 0.9121193943510927, "grad_norm": 2.086094617843628, "learning_rate": 0.004797742938250747, "loss": 8.217, "step": 223900 }, { "epoch": 0.9125267723744741, "grad_norm": 4.431074142456055, "learning_rate": 0.0047975472293275805, "loss": 8.2045, "step": 224000 }, { "epoch": 0.9125267723744741, "eval_MaskedAccuracy": 0.48309843252640383, "eval_loss": 1.7208740711212158, "eval_runtime": 585.9545, "eval_samples_per_second": 108.329, "eval_steps_per_second": 0.423, "step": 224000 }, { "epoch": 0.9129341503978555, "grad_norm": 5.076047897338867, "learning_rate": 0.004797351429767757, "loss": 8.2371, "step": 224100 }, { "epoch": 0.9133415284212371, "grad_norm": 2.933804512023926, "learning_rate": 0.004797155539579018, "loss": 8.2578, "step": 224200 }, { "epoch": 0.9137489064446185, "grad_norm": 2.729539632797241, "learning_rate": 0.004796959558769112, "loss": 8.2741, "step": 224300 }, { "epoch": 0.9141562844679999, "grad_norm": 4.2180962562561035, "learning_rate": 0.004796763487345783, "loss": 8.1683, "step": 224400 }, { "epoch": 0.9145636624913814, "grad_norm": 5.814899444580078, "learning_rate": 0.0047965673253167924, "loss": 8.2012, "step": 224500 }, { "epoch": 0.9149710405147629, "grad_norm": 1.9807255268096924, "learning_rate": 0.004796371072689886, "loss": 8.176, "step": 224600 }, { "epoch": 0.9153784185381443, "grad_norm": 6.077608585357666, "learning_rate": 0.004796174729472829, "loss": 8.1689, "step": 224700 }, { "epoch": 0.9157857965615258, "grad_norm": 2.7711193561553955, "learning_rate": 0.004795978295673379, "loss": 8.2387, "step": 224800 }, { "epoch": 0.9161931745849072, "grad_norm": 3.4545180797576904, "learning_rate": 0.004795781771299311, "loss": 8.152, "step": 224900 }, { "epoch": 0.9166005526082888, "grad_norm": 5.071669578552246, "learning_rate": 0.004795585156358379, "loss": 8.2743, "step": 225000 }, { "epoch": 0.9166005526082888, "eval_MaskedAccuracy": 0.48105270695267294, "eval_loss": 1.7473244667053223, "eval_runtime": 547.578, "eval_samples_per_second": 115.921, "eval_steps_per_second": 0.453, "step": 225000 }, { "epoch": 0.9170079306316702, "grad_norm": 6.067659378051758, "learning_rate": 0.0047953884508583665, "loss": 8.2189, "step": 225100 }, { "epoch": 0.9174153086550516, "grad_norm": 3.740722179412842, "learning_rate": 0.004795191654807049, "loss": 8.2815, "step": 225200 }, { "epoch": 0.9178226866784331, "grad_norm": 1.9998103380203247, "learning_rate": 0.004794994768212203, "loss": 8.2135, "step": 225300 }, { "epoch": 0.9182300647018146, "grad_norm": 4.962710380554199, "learning_rate": 0.004794797791081626, "loss": 8.166, "step": 225400 }, { "epoch": 0.918637442725196, "grad_norm": 0.5499230623245239, "learning_rate": 0.0047946007234230936, "loss": 8.1433, "step": 225500 }, { "epoch": 0.9190448207485775, "grad_norm": 3.616839647293091, "learning_rate": 0.004794403565244396, "loss": 8.267, "step": 225600 }, { "epoch": 0.9194521987719589, "grad_norm": 2.500506639480591, "learning_rate": 0.004794206316553328, "loss": 8.1619, "step": 225700 }, { "epoch": 0.9198595767953404, "grad_norm": 4.259340286254883, "learning_rate": 0.0047940089773576946, "loss": 8.2626, "step": 225800 }, { "epoch": 0.9202669548187219, "grad_norm": 3.6628053188323975, "learning_rate": 0.004793811547665293, "loss": 8.2379, "step": 225900 }, { "epoch": 0.9206743328421033, "grad_norm": 3.1447908878326416, "learning_rate": 0.0047936140274839325, "loss": 8.2492, "step": 226000 }, { "epoch": 0.9206743328421033, "eval_MaskedAccuracy": 0.47904936406652626, "eval_loss": 1.7529270648956299, "eval_runtime": 577.6785, "eval_samples_per_second": 109.881, "eval_steps_per_second": 0.429, "step": 226000 }, { "epoch": 0.9210817108654848, "grad_norm": 8.835447311401367, "learning_rate": 0.00479341641682142, "loss": 8.2834, "step": 226100 }, { "epoch": 0.9214890888888663, "grad_norm": 6.550544738769531, "learning_rate": 0.004793218715685563, "loss": 8.2341, "step": 226200 }, { "epoch": 0.9218964669122477, "grad_norm": 6.690319061279297, "learning_rate": 0.0047930209240841844, "loss": 8.182, "step": 226300 }, { "epoch": 0.9223038449356292, "grad_norm": 5.749084949493408, "learning_rate": 0.004792823042025101, "loss": 8.2981, "step": 226400 }, { "epoch": 0.9227112229590106, "grad_norm": 4.789872169494629, "learning_rate": 0.004792625069516135, "loss": 8.1843, "step": 226500 }, { "epoch": 0.9231186009823921, "grad_norm": 2.5787711143493652, "learning_rate": 0.004792427006565109, "loss": 8.1366, "step": 226600 }, { "epoch": 0.9235259790057736, "grad_norm": 3.2012808322906494, "learning_rate": 0.004792228853179861, "loss": 8.198, "step": 226700 }, { "epoch": 0.923933357029155, "grad_norm": 3.811048984527588, "learning_rate": 0.004792030609368229, "loss": 8.1901, "step": 226800 }, { "epoch": 0.9243407350525364, "grad_norm": 4.617664813995361, "learning_rate": 0.004791832275138043, "loss": 8.1577, "step": 226900 }, { "epoch": 0.924748113075918, "grad_norm": 6.046353340148926, "learning_rate": 0.004791633850497142, "loss": 8.1584, "step": 227000 }, { "epoch": 0.924748113075918, "eval_MaskedAccuracy": 0.4844946360455185, "eval_loss": 1.7199093103408813, "eval_runtime": 634.3689, "eval_samples_per_second": 100.062, "eval_steps_per_second": 0.391, "step": 227000 }, { "epoch": 0.9251554910992994, "grad_norm": 3.4626755714416504, "learning_rate": 0.004791435335453377, "loss": 8.2272, "step": 227100 }, { "epoch": 0.9255628691226808, "grad_norm": 1.8042727708816528, "learning_rate": 0.004791236730014602, "loss": 8.1952, "step": 227200 }, { "epoch": 0.9259702471460624, "grad_norm": 2.550905466079712, "learning_rate": 0.004791038034188654, "loss": 8.294, "step": 227300 }, { "epoch": 0.9263776251694438, "grad_norm": 0.6780903339385986, "learning_rate": 0.0047908392479833984, "loss": 8.2901, "step": 227400 }, { "epoch": 0.9267850031928253, "grad_norm": 2.728419542312622, "learning_rate": 0.004790640371406698, "loss": 8.3103, "step": 227500 }, { "epoch": 0.9271923812162067, "grad_norm": 2.4830403327941895, "learning_rate": 0.004790441404466405, "loss": 8.26, "step": 227600 }, { "epoch": 0.9275997592395882, "grad_norm": 5.732196807861328, "learning_rate": 0.004790242347170389, "loss": 8.2094, "step": 227700 }, { "epoch": 0.9280071372629697, "grad_norm": 5.323036193847656, "learning_rate": 0.004790043199526522, "loss": 8.174, "step": 227800 }, { "epoch": 0.9284145152863511, "grad_norm": 5.812360763549805, "learning_rate": 0.004789843961542677, "loss": 8.116, "step": 227900 }, { "epoch": 0.9288218933097325, "grad_norm": 5.612771034240723, "learning_rate": 0.0047896446332267315, "loss": 8.0961, "step": 228000 }, { "epoch": 0.9288218933097325, "eval_MaskedAccuracy": 0.48676481560811663, "eval_loss": 1.708063006401062, "eval_runtime": 536.141, "eval_samples_per_second": 118.394, "eval_steps_per_second": 0.463, "step": 228000 }, { "epoch": 0.9292292713331141, "grad_norm": 5.939042091369629, "learning_rate": 0.004789445214586563, "loss": 8.1447, "step": 228100 }, { "epoch": 0.9296366493564955, "grad_norm": 5.644360065460205, "learning_rate": 0.004789245705630057, "loss": 8.0991, "step": 228200 }, { "epoch": 0.9300440273798769, "grad_norm": 6.242342948913574, "learning_rate": 0.004789046106365095, "loss": 8.0959, "step": 228300 }, { "epoch": 0.9304514054032584, "grad_norm": 4.7664475440979, "learning_rate": 0.004788846416799579, "loss": 8.1116, "step": 228400 }, { "epoch": 0.9308587834266399, "grad_norm": 4.1409807205200195, "learning_rate": 0.0047886466369413995, "loss": 8.1154, "step": 228500 }, { "epoch": 0.9312661614500214, "grad_norm": 5.9070658683776855, "learning_rate": 0.004788446766798459, "loss": 8.1011, "step": 228600 }, { "epoch": 0.9316735394734028, "grad_norm": 4.916849136352539, "learning_rate": 0.004788246806378654, "loss": 8.0977, "step": 228700 }, { "epoch": 0.9320809174967842, "grad_norm": 4.7911553382873535, "learning_rate": 0.00478804675568989, "loss": 8.0778, "step": 228800 }, { "epoch": 0.9324882955201658, "grad_norm": 4.861231803894043, "learning_rate": 0.004787846614740072, "loss": 8.0898, "step": 228900 }, { "epoch": 0.9328956735435472, "grad_norm": 2.1762640476226807, "learning_rate": 0.004787646383537121, "loss": 8.142, "step": 229000 }, { "epoch": 0.9328956735435472, "eval_MaskedAccuracy": 0.4788053354069694, "eval_loss": 1.7484434843063354, "eval_runtime": 558.994, "eval_samples_per_second": 113.554, "eval_steps_per_second": 0.444, "step": 229000 }, { "epoch": 0.9333030515669286, "grad_norm": 4.250251770019531, "learning_rate": 0.004787446062088946, "loss": 8.2858, "step": 229100 }, { "epoch": 0.9337104295903101, "grad_norm": 3.8813138008117676, "learning_rate": 0.004787245650403472, "loss": 8.2115, "step": 229200 }, { "epoch": 0.9341178076136916, "grad_norm": 5.123370170593262, "learning_rate": 0.00478704514848862, "loss": 8.1835, "step": 229300 }, { "epoch": 0.934525185637073, "grad_norm": 5.150139331817627, "learning_rate": 0.004786844556352322, "loss": 8.2508, "step": 229400 }, { "epoch": 0.9349325636604545, "grad_norm": 6.384917736053467, "learning_rate": 0.004786643874002503, "loss": 8.1474, "step": 229500 }, { "epoch": 0.9353399416838359, "grad_norm": 5.372497081756592, "learning_rate": 0.0047864431014470995, "loss": 8.1572, "step": 229600 }, { "epoch": 0.9357473197072174, "grad_norm": 6.097373008728027, "learning_rate": 0.004786242238694046, "loss": 8.1109, "step": 229700 }, { "epoch": 0.9361546977305989, "grad_norm": 4.463663578033447, "learning_rate": 0.004786041285751292, "loss": 8.1152, "step": 229800 }, { "epoch": 0.9365620757539803, "grad_norm": 5.550436973571777, "learning_rate": 0.004785840242626772, "loss": 8.115, "step": 229900 }, { "epoch": 0.9369694537773618, "grad_norm": 6.260983467102051, "learning_rate": 0.0047856391093284425, "loss": 8.1214, "step": 230000 }, { "epoch": 0.9369694537773618, "eval_MaskedAccuracy": 0.48657717794435645, "eval_loss": 1.7167582511901855, "eval_runtime": 611.5136, "eval_samples_per_second": 103.801, "eval_steps_per_second": 0.406, "step": 230000 }, { "epoch": 0.9373768318007433, "grad_norm": 5.187522888183594, "learning_rate": 0.004785437885864245, "loss": 8.1297, "step": 230100 }, { "epoch": 0.9377842098241247, "grad_norm": 5.606808185577393, "learning_rate": 0.00478523657224214, "loss": 8.1035, "step": 230200 }, { "epoch": 0.9381915878475062, "grad_norm": 3.369248867034912, "learning_rate": 0.0047850351684700886, "loss": 8.2793, "step": 230300 }, { "epoch": 0.9385989658708876, "grad_norm": 6.86972188949585, "learning_rate": 0.004784833674556057, "loss": 8.2764, "step": 230400 }, { "epoch": 0.9390063438942691, "grad_norm": 6.138077259063721, "learning_rate": 0.004784632090508006, "loss": 8.1714, "step": 230500 }, { "epoch": 0.9394137219176506, "grad_norm": 6.523768901824951, "learning_rate": 0.004784430416333906, "loss": 8.1312, "step": 230600 }, { "epoch": 0.939821099941032, "grad_norm": 4.867851257324219, "learning_rate": 0.004784228652041728, "loss": 8.1492, "step": 230700 }, { "epoch": 0.9402284779644134, "grad_norm": 8.429380416870117, "learning_rate": 0.004784026797639454, "loss": 8.0925, "step": 230800 }, { "epoch": 0.940635855987795, "grad_norm": 5.946533203125, "learning_rate": 0.0047838248531350625, "loss": 8.0991, "step": 230900 }, { "epoch": 0.9410432340111764, "grad_norm": 5.149491786956787, "learning_rate": 0.004783622818536535, "loss": 8.0938, "step": 231000 }, { "epoch": 0.9410432340111764, "eval_MaskedAccuracy": 0.4877264425603884, "eval_loss": 1.7004443407058716, "eval_runtime": 635.2588, "eval_samples_per_second": 99.921, "eval_steps_per_second": 0.39, "step": 231000 }, { "epoch": 0.9414506120345579, "grad_norm": 3.676353693008423, "learning_rate": 0.004783420693851859, "loss": 8.0954, "step": 231100 }, { "epoch": 0.9418579900579394, "grad_norm": 8.989383697509766, "learning_rate": 0.004783218479089029, "loss": 8.1397, "step": 231200 }, { "epoch": 0.9422653680813208, "grad_norm": 8.321223258972168, "learning_rate": 0.004783016174256035, "loss": 8.3312, "step": 231300 }, { "epoch": 0.9426727461047023, "grad_norm": 4.853785037994385, "learning_rate": 0.004782813779360881, "loss": 8.2826, "step": 231400 }, { "epoch": 0.9430801241280837, "grad_norm": 2.8699069023132324, "learning_rate": 0.004782611294411558, "loss": 8.2423, "step": 231500 }, { "epoch": 0.9434875021514652, "grad_norm": 2.4450199604034424, "learning_rate": 0.004782408719416081, "loss": 8.2719, "step": 231600 }, { "epoch": 0.9438948801748467, "grad_norm": 6.831642150878906, "learning_rate": 0.004782206054382461, "loss": 8.1932, "step": 231700 }, { "epoch": 0.9443022581982281, "grad_norm": 1.0070613622665405, "learning_rate": 0.004782003299318706, "loss": 8.2675, "step": 231800 }, { "epoch": 0.9447096362216095, "grad_norm": 4.175207138061523, "learning_rate": 0.004781800454232826, "loss": 8.2211, "step": 231900 }, { "epoch": 0.9451170142449911, "grad_norm": 4.457779407501221, "learning_rate": 0.004781597519132854, "loss": 8.1839, "step": 232000 }, { "epoch": 0.9451170142449911, "eval_MaskedAccuracy": 0.4806816294542539, "eval_loss": 1.7424594163894653, "eval_runtime": 585.7934, "eval_samples_per_second": 108.359, "eval_steps_per_second": 0.423, "step": 232000 }, { "epoch": 0.9455243922683725, "grad_norm": 1.6671425104141235, "learning_rate": 0.0047813944940268025, "loss": 8.2148, "step": 232100 }, { "epoch": 0.9459317702917539, "grad_norm": 2.40228271484375, "learning_rate": 0.004781191378922703, "loss": 8.219, "step": 232200 }, { "epoch": 0.9463391483151354, "grad_norm": 2.7009570598602295, "learning_rate": 0.004780988173828587, "loss": 8.205, "step": 232300 }, { "epoch": 0.9467465263385169, "grad_norm": 2.485184907913208, "learning_rate": 0.004780784878752484, "loss": 8.1793, "step": 232400 }, { "epoch": 0.9471539043618984, "grad_norm": 10.31472110748291, "learning_rate": 0.00478058149370243, "loss": 8.247, "step": 232500 }, { "epoch": 0.9475612823852798, "grad_norm": 5.390857696533203, "learning_rate": 0.004780378018686473, "loss": 8.2979, "step": 232600 }, { "epoch": 0.9479686604086612, "grad_norm": 1.4720979928970337, "learning_rate": 0.004780174453712653, "loss": 8.1861, "step": 232700 }, { "epoch": 0.9483760384320428, "grad_norm": 0.8540545701980591, "learning_rate": 0.004779970798789013, "loss": 8.213, "step": 232800 }, { "epoch": 0.9487834164554242, "grad_norm": 2.123645305633545, "learning_rate": 0.004779767053923611, "loss": 8.2196, "step": 232900 }, { "epoch": 0.9491907944788056, "grad_norm": 0.45868319272994995, "learning_rate": 0.0047795632191245056, "loss": 8.2264, "step": 233000 }, { "epoch": 0.9491907944788056, "eval_MaskedAccuracy": 0.4805641898194909, "eval_loss": 1.7387572526931763, "eval_runtime": 633.9238, "eval_samples_per_second": 100.132, "eval_steps_per_second": 0.391, "step": 233000 }, { "epoch": 0.9495981725021871, "grad_norm": 6.627081394195557, "learning_rate": 0.004779359294399752, "loss": 8.1783, "step": 233100 }, { "epoch": 0.9500055505255686, "grad_norm": 6.244868278503418, "learning_rate": 0.004779155279757411, "loss": 8.154, "step": 233200 }, { "epoch": 0.95041292854895, "grad_norm": 6.573578834533691, "learning_rate": 0.00477895117520555, "loss": 8.1343, "step": 233300 }, { "epoch": 0.9508203065723315, "grad_norm": 5.183780193328857, "learning_rate": 0.004778746980752234, "loss": 8.1178, "step": 233400 }, { "epoch": 0.9512276845957129, "grad_norm": 7.468198776245117, "learning_rate": 0.004778542696405532, "loss": 8.1136, "step": 233500 }, { "epoch": 0.9516350626190945, "grad_norm": 4.690491676330566, "learning_rate": 0.004778338322173534, "loss": 8.1155, "step": 233600 }, { "epoch": 0.9520424406424759, "grad_norm": 5.657881736755371, "learning_rate": 0.004778133858064316, "loss": 8.1164, "step": 233700 }, { "epoch": 0.9524498186658573, "grad_norm": 5.6973772048950195, "learning_rate": 0.004777929304085957, "loss": 8.0804, "step": 233800 }, { "epoch": 0.9528571966892389, "grad_norm": 3.773672103881836, "learning_rate": 0.004777724660246545, "loss": 8.0917, "step": 233900 }, { "epoch": 0.9532645747126203, "grad_norm": 5.486586570739746, "learning_rate": 0.004777519926554175, "loss": 8.0745, "step": 234000 }, { "epoch": 0.9532645747126203, "eval_MaskedAccuracy": 0.4871141334534211, "eval_loss": 1.711511492729187, "eval_runtime": 565.6537, "eval_samples_per_second": 112.217, "eval_steps_per_second": 0.438, "step": 234000 }, { "epoch": 0.9536719527360017, "grad_norm": 6.356557369232178, "learning_rate": 0.004777315103016933, "loss": 8.0888, "step": 234100 }, { "epoch": 0.9540793307593832, "grad_norm": 4.9865546226501465, "learning_rate": 0.004777110189642922, "loss": 8.0503, "step": 234200 }, { "epoch": 0.9544867087827646, "grad_norm": 3.628206968307495, "learning_rate": 0.0047769051864402475, "loss": 8.1838, "step": 234300 }, { "epoch": 0.9548940868061461, "grad_norm": 4.247488498687744, "learning_rate": 0.004776700093417009, "loss": 8.2788, "step": 234400 }, { "epoch": 0.9553014648295276, "grad_norm": 2.128532886505127, "learning_rate": 0.004776494910581318, "loss": 8.2221, "step": 234500 }, { "epoch": 0.955708842852909, "grad_norm": 4.686913013458252, "learning_rate": 0.004776289637941285, "loss": 8.189, "step": 234600 }, { "epoch": 0.9561162208762904, "grad_norm": 3.012515068054199, "learning_rate": 0.00477608427550502, "loss": 8.2732, "step": 234700 }, { "epoch": 0.956523598899672, "grad_norm": 2.0626823902130127, "learning_rate": 0.004775878823280654, "loss": 8.2672, "step": 234800 }, { "epoch": 0.9569309769230534, "grad_norm": 2.8990352153778076, "learning_rate": 0.004775673281276299, "loss": 8.2235, "step": 234900 }, { "epoch": 0.9573383549464349, "grad_norm": 3.8956902027130127, "learning_rate": 0.004775467649500082, "loss": 8.1327, "step": 235000 }, { "epoch": 0.9573383549464349, "eval_MaskedAccuracy": 0.4866171493477707, "eval_loss": 1.7065390348434448, "eval_runtime": 585.4599, "eval_samples_per_second": 108.421, "eval_steps_per_second": 0.424, "step": 235000 }, { "epoch": 0.9577457329698164, "grad_norm": 3.7147789001464844, "learning_rate": 0.004775261927960138, "loss": 8.1519, "step": 235100 }, { "epoch": 0.9581531109931978, "grad_norm": 2.170576333999634, "learning_rate": 0.004775056116664596, "loss": 8.2474, "step": 235200 }, { "epoch": 0.9585604890165793, "grad_norm": 2.1491498947143555, "learning_rate": 0.0047748502156215915, "loss": 8.2071, "step": 235300 }, { "epoch": 0.9589678670399607, "grad_norm": 6.1416401863098145, "learning_rate": 0.004774644224839271, "loss": 8.1715, "step": 235400 }, { "epoch": 0.9593752450633422, "grad_norm": 1.067305326461792, "learning_rate": 0.004774438144325776, "loss": 8.1493, "step": 235500 }, { "epoch": 0.9597826230867237, "grad_norm": 3.929832696914673, "learning_rate": 0.004774231974089251, "loss": 8.2178, "step": 235600 }, { "epoch": 0.9601900011101051, "grad_norm": 4.910709857940674, "learning_rate": 0.004774025714137852, "loss": 8.1504, "step": 235700 }, { "epoch": 0.9605973791334865, "grad_norm": 4.173421382904053, "learning_rate": 0.004773819364479737, "loss": 8.2535, "step": 235800 }, { "epoch": 0.9610047571568681, "grad_norm": 0.8288520574569702, "learning_rate": 0.004773612925123056, "loss": 8.2277, "step": 235900 }, { "epoch": 0.9614121351802495, "grad_norm": 20.392513275146484, "learning_rate": 0.0047734063960759646, "loss": 8.234, "step": 236000 }, { "epoch": 0.9614121351802495, "eval_MaskedAccuracy": 0.4783882551329281, "eval_loss": 1.7532862424850464, "eval_runtime": 514.8995, "eval_samples_per_second": 123.278, "eval_steps_per_second": 0.482, "step": 236000 }, { "epoch": 0.961819513203631, "grad_norm": 2.5947885513305664, "learning_rate": 0.004773199777346642, "loss": 8.256, "step": 236100 }, { "epoch": 0.9622268912270124, "grad_norm": 8.043744087219238, "learning_rate": 0.004772993068943253, "loss": 8.1542, "step": 236200 }, { "epoch": 0.9626342692503939, "grad_norm": 5.169923305511475, "learning_rate": 0.004772786270873959, "loss": 8.129, "step": 236300 }, { "epoch": 0.9630416472737754, "grad_norm": 4.159392356872559, "learning_rate": 0.004772579383146947, "loss": 8.2045, "step": 236400 }, { "epoch": 0.9634490252971568, "grad_norm": 3.3252909183502197, "learning_rate": 0.004772372405770392, "loss": 8.1375, "step": 236500 }, { "epoch": 0.9638564033205382, "grad_norm": 4.924101829528809, "learning_rate": 0.004772165338752485, "loss": 8.1663, "step": 236600 }, { "epoch": 0.9642637813439198, "grad_norm": 3.6718101501464844, "learning_rate": 0.0047719581821014075, "loss": 8.1902, "step": 236700 }, { "epoch": 0.9646711593673012, "grad_norm": 3.7462430000305176, "learning_rate": 0.004771750935825343, "loss": 8.1595, "step": 236800 }, { "epoch": 0.9650785373906826, "grad_norm": 5.799289703369141, "learning_rate": 0.004771543599932501, "loss": 8.1417, "step": 236900 }, { "epoch": 0.9654859154140641, "grad_norm": 5.882828712463379, "learning_rate": 0.0047713361744310566, "loss": 8.0669, "step": 237000 }, { "epoch": 0.9654859154140641, "eval_MaskedAccuracy": 0.48794035925860585, "eval_loss": 1.7013667821884155, "eval_runtime": 567.2874, "eval_samples_per_second": 111.894, "eval_steps_per_second": 0.437, "step": 237000 }, { "epoch": 0.9658932934374456, "grad_norm": 4.099752426147461, "learning_rate": 0.0047711286593292235, "loss": 8.1008, "step": 237100 }, { "epoch": 0.966300671460827, "grad_norm": 13.680000305175781, "learning_rate": 0.004770921054635203, "loss": 8.08, "step": 237200 }, { "epoch": 0.9667080494842085, "grad_norm": 4.279512882232666, "learning_rate": 0.004770713360357208, "loss": 8.2708, "step": 237300 }, { "epoch": 0.9671154275075899, "grad_norm": 4.536948204040527, "learning_rate": 0.004770505576503441, "loss": 8.1607, "step": 237400 }, { "epoch": 0.9675228055309715, "grad_norm": 4.030037879943848, "learning_rate": 0.004770297703082127, "loss": 8.075, "step": 237500 }, { "epoch": 0.9679301835543529, "grad_norm": 6.2266459465026855, "learning_rate": 0.004770089740101478, "loss": 8.0967, "step": 237600 }, { "epoch": 0.9683375615777343, "grad_norm": 4.494446754455566, "learning_rate": 0.004769881687569713, "loss": 8.0975, "step": 237700 }, { "epoch": 0.9687449396011159, "grad_norm": 4.861915111541748, "learning_rate": 0.004769673545495061, "loss": 8.125, "step": 237800 }, { "epoch": 0.9691523176244973, "grad_norm": 6.571840286254883, "learning_rate": 0.004769465313885759, "loss": 8.1002, "step": 237900 }, { "epoch": 0.9695596956478787, "grad_norm": 1.3625410795211792, "learning_rate": 0.004769256992750033, "loss": 8.2364, "step": 238000 }, { "epoch": 0.9695596956478787, "eval_MaskedAccuracy": 0.48214247495957324, "eval_loss": 1.7290294170379639, "eval_runtime": 535.4762, "eval_samples_per_second": 118.541, "eval_steps_per_second": 0.463, "step": 238000 }, { "epoch": 0.9699670736712602, "grad_norm": 4.830008029937744, "learning_rate": 0.004769048582096117, "loss": 8.1727, "step": 238100 }, { "epoch": 0.9703744516946416, "grad_norm": 1.5990383625030518, "learning_rate": 0.0047688400819322525, "loss": 8.1817, "step": 238200 }, { "epoch": 0.9707818297180231, "grad_norm": 3.1707334518432617, "learning_rate": 0.004768631492266677, "loss": 8.238, "step": 238300 }, { "epoch": 0.9711892077414046, "grad_norm": 8.788178443908691, "learning_rate": 0.00476842281310764, "loss": 8.2217, "step": 238400 }, { "epoch": 0.971596585764786, "grad_norm": 2.474496364593506, "learning_rate": 0.004768214044463394, "loss": 8.2857, "step": 238500 }, { "epoch": 0.9720039637881674, "grad_norm": 5.40533971786499, "learning_rate": 0.004768005186342199, "loss": 8.2393, "step": 238600 }, { "epoch": 0.972411341811549, "grad_norm": 4.223481178283691, "learning_rate": 0.0047677962387523065, "loss": 8.1331, "step": 238700 }, { "epoch": 0.9728187198349304, "grad_norm": 5.080321788787842, "learning_rate": 0.004767587201701966, "loss": 8.1194, "step": 238800 }, { "epoch": 0.9732260978583119, "grad_norm": 4.823538303375244, "learning_rate": 0.004767378075199454, "loss": 8.1349, "step": 238900 }, { "epoch": 0.9736334758816934, "grad_norm": 5.152235507965088, "learning_rate": 0.00476716885925304, "loss": 8.0819, "step": 239000 }, { "epoch": 0.9736334758816934, "eval_MaskedAccuracy": 0.48745703216586467, "eval_loss": 1.6963986158370972, "eval_runtime": 620.819, "eval_samples_per_second": 102.246, "eval_steps_per_second": 0.399, "step": 239000 }, { "epoch": 0.9740408539050748, "grad_norm": 9.305587768554688, "learning_rate": 0.004766959553870994, "loss": 8.0992, "step": 239100 }, { "epoch": 0.9744482319284563, "grad_norm": 3.530278205871582, "learning_rate": 0.004766750159061589, "loss": 8.0774, "step": 239200 }, { "epoch": 0.9748556099518377, "grad_norm": 1.316430926322937, "learning_rate": 0.004766540674833102, "loss": 8.1489, "step": 239300 }, { "epoch": 0.9752629879752192, "grad_norm": 3.6197664737701416, "learning_rate": 0.004766331101193819, "loss": 8.2662, "step": 239400 }, { "epoch": 0.9756703659986007, "grad_norm": 2.255551815032959, "learning_rate": 0.004766121438152025, "loss": 8.2711, "step": 239500 }, { "epoch": 0.9760777440219821, "grad_norm": 3.01822566986084, "learning_rate": 0.004765911685716005, "loss": 8.2768, "step": 239600 }, { "epoch": 0.9764851220453635, "grad_norm": 3.766021490097046, "learning_rate": 0.004765701843894052, "loss": 8.1865, "step": 239700 }, { "epoch": 0.9768925000687451, "grad_norm": 1.137575387954712, "learning_rate": 0.004765491912694465, "loss": 8.218, "step": 239800 }, { "epoch": 0.9772998780921265, "grad_norm": 1.7848414182662964, "learning_rate": 0.00476528189212554, "loss": 8.2495, "step": 239900 }, { "epoch": 0.977707256115508, "grad_norm": 4.410600185394287, "learning_rate": 0.004765071782195583, "loss": 8.2364, "step": 240000 }, { "epoch": 0.977707256115508, "eval_MaskedAccuracy": 0.4809854694355085, "eval_loss": 1.7356899976730347, "eval_runtime": 675.4649, "eval_samples_per_second": 93.974, "eval_steps_per_second": 0.367, "step": 240000 }, { "epoch": 0.9781146341388894, "grad_norm": 1.4677624702453613, "learning_rate": 0.0047648615829129016, "loss": 8.2211, "step": 240100 }, { "epoch": 0.9785220121622709, "grad_norm": 3.9822754859924316, "learning_rate": 0.004764651294285809, "loss": 8.1547, "step": 240200 }, { "epoch": 0.9789293901856524, "grad_norm": 6.896245002746582, "learning_rate": 0.00476444091632261, "loss": 8.1219, "step": 240300 }, { "epoch": 0.9793367682090338, "grad_norm": 3.443446397781372, "learning_rate": 0.0047642304490316326, "loss": 8.1556, "step": 240400 }, { "epoch": 0.9797441462324152, "grad_norm": 3.5489230155944824, "learning_rate": 0.004764019892421191, "loss": 8.185, "step": 240500 }, { "epoch": 0.9801515242557968, "grad_norm": 3.103484630584717, "learning_rate": 0.004763809246499613, "loss": 8.1697, "step": 240600 }, { "epoch": 0.9805589022791782, "grad_norm": 3.4157602787017822, "learning_rate": 0.004763598511275223, "loss": 8.295, "step": 240700 }, { "epoch": 0.9809662803025596, "grad_norm": 1.5067360401153564, "learning_rate": 0.004763387686756353, "loss": 8.2628, "step": 240800 }, { "epoch": 0.9813736583259411, "grad_norm": 3.985395669937134, "learning_rate": 0.004763176772951338, "loss": 8.2128, "step": 240900 }, { "epoch": 0.9817810363493226, "grad_norm": 2.806593418121338, "learning_rate": 0.00476296576986852, "loss": 8.145, "step": 241000 }, { "epoch": 0.9817810363493226, "eval_MaskedAccuracy": 0.48619838289017225, "eval_loss": 1.7169946432113647, "eval_runtime": 610.259, "eval_samples_per_second": 104.015, "eval_steps_per_second": 0.406, "step": 241000 }, { "epoch": 0.982188414372704, "grad_norm": 3.1887853145599365, "learning_rate": 0.004762754677516234, "loss": 8.2021, "step": 241100 }, { "epoch": 0.9825957923960855, "grad_norm": 5.074123382568359, "learning_rate": 0.004762543495902832, "loss": 8.182, "step": 241200 }, { "epoch": 0.9830031704194669, "grad_norm": 5.17244815826416, "learning_rate": 0.00476233222503666, "loss": 8.1297, "step": 241300 }, { "epoch": 0.9834105484428485, "grad_norm": 7.497429370880127, "learning_rate": 0.004762120864926077, "loss": 8.1496, "step": 241400 }, { "epoch": 0.9838179264662299, "grad_norm": 1.6269021034240723, "learning_rate": 0.004761909415579431, "loss": 8.1612, "step": 241500 }, { "epoch": 0.9842253044896113, "grad_norm": 3.574080228805542, "learning_rate": 0.004761697877005083, "loss": 8.1715, "step": 241600 }, { "epoch": 0.9846326825129929, "grad_norm": 4.916479587554932, "learning_rate": 0.004761486249211402, "loss": 8.1714, "step": 241700 }, { "epoch": 0.9850400605363743, "grad_norm": 2.030855178833008, "learning_rate": 0.0047612745322067485, "loss": 8.1239, "step": 241800 }, { "epoch": 0.9854474385597557, "grad_norm": 9.08359432220459, "learning_rate": 0.004761062725999501, "loss": 8.2372, "step": 241900 }, { "epoch": 0.9858548165831372, "grad_norm": 6.1944193840026855, "learning_rate": 0.00476085083059802, "loss": 8.1997, "step": 242000 }, { "epoch": 0.9858548165831372, "eval_MaskedAccuracy": 0.4853934323516833, "eval_loss": 1.713708758354187, "eval_runtime": 617.3324, "eval_samples_per_second": 102.823, "eval_steps_per_second": 0.402, "step": 242000 }, { "epoch": 0.9862621946065186, "grad_norm": 2.0614213943481445, "learning_rate": 0.004760638846010693, "loss": 8.2301, "step": 242100 }, { "epoch": 0.9866695726299001, "grad_norm": 2.5254554748535156, "learning_rate": 0.004760426772245896, "loss": 8.2202, "step": 242200 }, { "epoch": 0.9870769506532816, "grad_norm": 3.4275104999542236, "learning_rate": 0.004760214609312019, "loss": 8.1426, "step": 242300 }, { "epoch": 0.987484328676663, "grad_norm": 5.422964572906494, "learning_rate": 0.004760002357217439, "loss": 8.1568, "step": 242400 }, { "epoch": 0.9878917067000446, "grad_norm": 1.7899110317230225, "learning_rate": 0.004759790015970564, "loss": 8.1831, "step": 242500 }, { "epoch": 0.988299084723426, "grad_norm": 3.7543323040008545, "learning_rate": 0.004759577585579777, "loss": 8.2366, "step": 242600 }, { "epoch": 0.9887064627468074, "grad_norm": 5.622766017913818, "learning_rate": 0.0047593650660534784, "loss": 8.2532, "step": 242700 }, { "epoch": 0.9891138407701889, "grad_norm": 4.1286821365356445, "learning_rate": 0.004759152457400075, "loss": 8.2909, "step": 242800 }, { "epoch": 0.9895212187935704, "grad_norm": 5.105898380279541, "learning_rate": 0.0047589397596279675, "loss": 8.1774, "step": 242900 }, { "epoch": 0.9899285968169518, "grad_norm": 4.220027923583984, "learning_rate": 0.004758726972745564, "loss": 8.1656, "step": 243000 }, { "epoch": 0.9899285968169518, "eval_MaskedAccuracy": 0.48216152134175716, "eval_loss": 1.731658935546875, "eval_runtime": 620.3479, "eval_samples_per_second": 102.323, "eval_steps_per_second": 0.4, "step": 243000 }, { "epoch": 0.9903359748403333, "grad_norm": 3.5671005249023438, "learning_rate": 0.004758514096761283, "loss": 8.2275, "step": 243100 }, { "epoch": 0.9907433528637147, "grad_norm": 2.6987688541412354, "learning_rate": 0.004758301131683541, "loss": 8.2028, "step": 243200 }, { "epoch": 0.9911507308870962, "grad_norm": 5.813295841217041, "learning_rate": 0.004758088077520751, "loss": 8.2244, "step": 243300 }, { "epoch": 0.9915581089104777, "grad_norm": 2.056358575820923, "learning_rate": 0.004757874934281339, "loss": 8.1549, "step": 243400 }, { "epoch": 0.9919654869338591, "grad_norm": 2.728684663772583, "learning_rate": 0.004757661701973738, "loss": 8.1494, "step": 243500 }, { "epoch": 0.9923728649572405, "grad_norm": 4.71746826171875, "learning_rate": 0.004757448380606363, "loss": 8.11, "step": 243600 }, { "epoch": 0.9927802429806221, "grad_norm": 4.784518241882324, "learning_rate": 0.004757234970187655, "loss": 8.1885, "step": 243700 }, { "epoch": 0.9931876210040035, "grad_norm": 2.37805438041687, "learning_rate": 0.004757021470726056, "loss": 8.1919, "step": 243800 }, { "epoch": 0.993594999027385, "grad_norm": 3.5506582260131836, "learning_rate": 0.004756807882230003, "loss": 8.1366, "step": 243900 }, { "epoch": 0.9940023770507664, "grad_norm": 0.5849319100379944, "learning_rate": 0.004756594204707939, "loss": 8.2411, "step": 244000 }, { "epoch": 0.9940023770507664, "eval_MaskedAccuracy": 0.48107103285078806, "eval_loss": 1.7513779401779175, "eval_runtime": 556.9936, "eval_samples_per_second": 113.962, "eval_steps_per_second": 0.445, "step": 244000 }, { "epoch": 0.9944097550741479, "grad_norm": 1.4658851623535156, "learning_rate": 0.004756380438168312, "loss": 8.2649, "step": 244100 }, { "epoch": 0.9948171330975294, "grad_norm": 6.136865615844727, "learning_rate": 0.004756166582619584, "loss": 8.2526, "step": 244200 }, { "epoch": 0.9952245111209108, "grad_norm": 6.565762996673584, "learning_rate": 0.004755952638070195, "loss": 8.2155, "step": 244300 }, { "epoch": 0.9956318891442922, "grad_norm": 5.260137557983398, "learning_rate": 0.004755738604528605, "loss": 8.2215, "step": 244400 }, { "epoch": 0.9960392671676738, "grad_norm": 2.4687938690185547, "learning_rate": 0.004755524482003282, "loss": 8.228, "step": 244500 }, { "epoch": 0.9964466451910552, "grad_norm": 0.6773878335952759, "learning_rate": 0.0047553102705026875, "loss": 8.2016, "step": 244600 }, { "epoch": 0.9968540232144366, "grad_norm": 5.026801109313965, "learning_rate": 0.0047550959700352995, "loss": 8.2244, "step": 244700 }, { "epoch": 0.9972614012378181, "grad_norm": 5.554771423339844, "learning_rate": 0.004754881580609581, "loss": 8.1483, "step": 244800 }, { "epoch": 0.9976687792611996, "grad_norm": 4.744398593902588, "learning_rate": 0.004754667102234011, "loss": 8.1485, "step": 244900 }, { "epoch": 0.9980761572845811, "grad_norm": 9.949603080749512, "learning_rate": 0.00475445253491707, "loss": 8.2084, "step": 245000 }, { "epoch": 0.9980761572845811, "eval_MaskedAccuracy": 0.47898123664693637, "eval_loss": 1.7470967769622803, "eval_runtime": 537.0918, "eval_samples_per_second": 118.185, "eval_steps_per_second": 0.462, "step": 245000 }, { "epoch": 0.9984835353079625, "grad_norm": 4.404254913330078, "learning_rate": 0.004754237878667238, "loss": 8.2251, "step": 245100 }, { "epoch": 0.9988909133313439, "grad_norm": 4.610202312469482, "learning_rate": 0.004754023133493004, "loss": 8.1291, "step": 245200 }, { "epoch": 0.9992982913547255, "grad_norm": 4.8737335205078125, "learning_rate": 0.004753808299402859, "loss": 8.1327, "step": 245300 }, { "epoch": 0.9997056693781069, "grad_norm": 1.3142606019973755, "learning_rate": 0.004753593376405291, "loss": 8.1187, "step": 245400 }, { "epoch": 1.0001130474014883, "grad_norm": 6.105260848999023, "learning_rate": 0.004753378364508799, "loss": 8.1851, "step": 245500 }, { "epoch": 1.0005204254248699, "grad_norm": 1.9481598138809204, "learning_rate": 0.00475316326372189, "loss": 8.1759, "step": 245600 }, { "epoch": 1.0009278034482514, "grad_norm": 3.2028346061706543, "learning_rate": 0.00475294807405306, "loss": 8.1454, "step": 245700 }, { "epoch": 1.0013351814716327, "grad_norm": 2.652489185333252, "learning_rate": 0.004752732795510824, "loss": 8.1798, "step": 245800 }, { "epoch": 1.0017425594950142, "grad_norm": 6.72688627243042, "learning_rate": 0.00475251742810368, "loss": 8.2579, "step": 245900 }, { "epoch": 1.0021499375183958, "grad_norm": 4.054202556610107, "learning_rate": 0.004752301971840156, "loss": 8.211, "step": 246000 }, { "epoch": 1.0021499375183958, "eval_MaskedAccuracy": 0.4853460253722327, "eval_loss": 1.714086651802063, "eval_runtime": 148.5626, "eval_samples_per_second": 427.268, "eval_steps_per_second": 1.669, "step": 246000 }, { "epoch": 1.002557315541777, "grad_norm": 1.8084056377410889, "learning_rate": 0.0047520864267287695, "loss": 8.2046, "step": 246100 }, { "epoch": 1.0029646935651586, "grad_norm": 0.5682147145271301, "learning_rate": 0.004751870792778029, "loss": 8.23, "step": 246200 }, { "epoch": 1.0033720715885401, "grad_norm": 3.379467010498047, "learning_rate": 0.004751655069996472, "loss": 8.2539, "step": 246300 }, { "epoch": 1.0037794496119214, "grad_norm": 2.14129638671875, "learning_rate": 0.004751439258392628, "loss": 8.1796, "step": 246400 }, { "epoch": 1.004186827635303, "grad_norm": 6.0373921394348145, "learning_rate": 0.004751223357975017, "loss": 8.1889, "step": 246500 }, { "epoch": 1.0045942056586845, "grad_norm": 5.169281959533691, "learning_rate": 0.004751007368752187, "loss": 8.2046, "step": 246600 }, { "epoch": 1.0050015836820658, "grad_norm": 6.113409996032715, "learning_rate": 0.004750791290732675, "loss": 8.108, "step": 246700 }, { "epoch": 1.0054089617054474, "grad_norm": 2.174652338027954, "learning_rate": 0.004750575123925025, "loss": 8.1574, "step": 246800 }, { "epoch": 1.005816339728829, "grad_norm": 4.004303455352783, "learning_rate": 0.0047503588683377755, "loss": 8.1607, "step": 246900 }, { "epoch": 1.0062237177522102, "grad_norm": 2.085434913635254, "learning_rate": 0.004750142523979482, "loss": 8.1984, "step": 247000 }, { "epoch": 1.0062237177522102, "eval_MaskedAccuracy": 0.48152066804454813, "eval_loss": 1.7368111610412598, "eval_runtime": 148.9571, "eval_samples_per_second": 426.136, "eval_steps_per_second": 1.665, "step": 247000 }, { "epoch": 1.0066310957755917, "grad_norm": 4.9853434562683105, "learning_rate": 0.004749926090858699, "loss": 8.1975, "step": 247100 }, { "epoch": 1.0070384737989733, "grad_norm": 1.6406573057174683, "learning_rate": 0.004749709568983974, "loss": 8.1232, "step": 247200 }, { "epoch": 1.0074458518223546, "grad_norm": 5.71746826171875, "learning_rate": 0.004749492958363876, "loss": 8.1713, "step": 247300 }, { "epoch": 1.007853229845736, "grad_norm": 2.008352279663086, "learning_rate": 0.004749276259006973, "loss": 8.102, "step": 247400 }, { "epoch": 1.0082606078691176, "grad_norm": 1.7535492181777954, "learning_rate": 0.004749059470921824, "loss": 8.2237, "step": 247500 }, { "epoch": 1.008667985892499, "grad_norm": 6.515242576599121, "learning_rate": 0.004748842594117008, "loss": 8.2053, "step": 247600 }, { "epoch": 1.0090753639158805, "grad_norm": 3.829084873199463, "learning_rate": 0.004748625628601089, "loss": 8.2227, "step": 247700 }, { "epoch": 1.009482741939262, "grad_norm": 3.9138801097869873, "learning_rate": 0.004748408574382645, "loss": 8.1249, "step": 247800 }, { "epoch": 1.0098901199626433, "grad_norm": 6.763348579406738, "learning_rate": 0.004748191431470262, "loss": 8.1166, "step": 247900 }, { "epoch": 1.0102974979860249, "grad_norm": 0.8834948539733887, "learning_rate": 0.004747974199872523, "loss": 8.1769, "step": 248000 }, { "epoch": 1.0102974979860249, "eval_MaskedAccuracy": 0.48299971296835226, "eval_loss": 1.7294466495513916, "eval_runtime": 149.6861, "eval_samples_per_second": 424.061, "eval_steps_per_second": 1.657, "step": 248000 }, { "epoch": 1.0107048760094064, "grad_norm": 3.2210018634796143, "learning_rate": 0.004747756879598021, "loss": 8.2426, "step": 248100 }, { "epoch": 1.011112254032788, "grad_norm": 4.237231254577637, "learning_rate": 0.00474753947065534, "loss": 8.169, "step": 248200 }, { "epoch": 1.0115196320561692, "grad_norm": 5.079657077789307, "learning_rate": 0.004747321973053085, "loss": 8.1268, "step": 248300 }, { "epoch": 1.0119270100795508, "grad_norm": 3.452561378479004, "learning_rate": 0.004747104386799851, "loss": 8.0861, "step": 248400 }, { "epoch": 1.0123343881029323, "grad_norm": 4.034000396728516, "learning_rate": 0.004746886711904237, "loss": 8.1469, "step": 248500 }, { "epoch": 1.0127417661263136, "grad_norm": 1.7607423067092896, "learning_rate": 0.004746668948374848, "loss": 8.2048, "step": 248600 }, { "epoch": 1.0131491441496951, "grad_norm": 3.3386831283569336, "learning_rate": 0.004746451096220302, "loss": 8.25, "step": 248700 }, { "epoch": 1.0135565221730767, "grad_norm": 5.06843900680542, "learning_rate": 0.004746233155449206, "loss": 8.135, "step": 248800 }, { "epoch": 1.013963900196458, "grad_norm": 5.210528373718262, "learning_rate": 0.004746015126070175, "loss": 8.1363, "step": 248900 }, { "epoch": 1.0143712782198395, "grad_norm": 3.991243839263916, "learning_rate": 0.004745797008091829, "loss": 8.1689, "step": 249000 }, { "epoch": 1.0143712782198395, "eval_MaskedAccuracy": 0.48222734201463563, "eval_loss": 1.7312606573104858, "eval_runtime": 149.2811, "eval_samples_per_second": 425.211, "eval_steps_per_second": 1.661, "step": 249000 }, { "epoch": 1.014778656243221, "grad_norm": 5.869924545288086, "learning_rate": 0.004745578801522792, "loss": 8.1905, "step": 249100 }, { "epoch": 1.0151860342666024, "grad_norm": 3.2548134326934814, "learning_rate": 0.004745360506371691, "loss": 8.2221, "step": 249200 }, { "epoch": 1.015593412289984, "grad_norm": 6.3848137855529785, "learning_rate": 0.00474514212264715, "loss": 8.2396, "step": 249300 }, { "epoch": 1.0160007903133654, "grad_norm": 5.2388105392456055, "learning_rate": 0.004744923650357814, "loss": 8.2026, "step": 249400 }, { "epoch": 1.0164081683367467, "grad_norm": 3.3096816539764404, "learning_rate": 0.004744705089512321, "loss": 8.1698, "step": 249500 }, { "epoch": 1.0168155463601283, "grad_norm": 2.923670530319214, "learning_rate": 0.004744486440119309, "loss": 8.2163, "step": 249600 }, { "epoch": 1.0172229243835098, "grad_norm": 4.202099323272705, "learning_rate": 0.00474426770218742, "loss": 8.1819, "step": 249700 }, { "epoch": 1.0176303024068911, "grad_norm": 6.3655829429626465, "learning_rate": 0.004744048875725303, "loss": 8.2079, "step": 249800 }, { "epoch": 1.0180376804302727, "grad_norm": 2.7881789207458496, "learning_rate": 0.004743829960741609, "loss": 8.1448, "step": 249900 }, { "epoch": 1.0184450584536542, "grad_norm": 1.5355042219161987, "learning_rate": 0.004743610957244993, "loss": 8.1285, "step": 250000 }, { "epoch": 1.0184450584536542, "eval_MaskedAccuracy": 0.48120418762355, "eval_loss": 1.7278765439987183, "eval_runtime": 149.5109, "eval_samples_per_second": 424.558, "eval_steps_per_second": 1.659, "step": 250000 }, { "epoch": 1.0188524364770355, "grad_norm": 6.138425350189209, "learning_rate": 0.004743391865244105, "loss": 8.1409, "step": 250100 }, { "epoch": 1.019259814500417, "grad_norm": 1.641326665878296, "learning_rate": 0.004743172684747619, "loss": 8.1702, "step": 250200 }, { "epoch": 1.0196671925237986, "grad_norm": 5.6269989013671875, "learning_rate": 0.004742953415764191, "loss": 8.1567, "step": 250300 }, { "epoch": 1.0200745705471799, "grad_norm": 4.5842766761779785, "learning_rate": 0.004742734058302499, "loss": 8.1058, "step": 250400 }, { "epoch": 1.0204819485705614, "grad_norm": 4.902566909790039, "learning_rate": 0.004742514612371211, "loss": 8.0938, "step": 250500 }, { "epoch": 1.020889326593943, "grad_norm": 2.6178858280181885, "learning_rate": 0.004742295077979008, "loss": 8.1237, "step": 250600 }, { "epoch": 1.0212967046173245, "grad_norm": 3.580260992050171, "learning_rate": 0.004742075455134564, "loss": 8.1047, "step": 250700 }, { "epoch": 1.0217040826407058, "grad_norm": 6.789980888366699, "learning_rate": 0.004741855743846565, "loss": 8.1315, "step": 250800 }, { "epoch": 1.0221114606640873, "grad_norm": 6.0821638107299805, "learning_rate": 0.00474163594412369, "loss": 8.1029, "step": 250900 }, { "epoch": 1.0225188386874688, "grad_norm": 4.657712459564209, "learning_rate": 0.004741416055974637, "loss": 8.081, "step": 251000 }, { "epoch": 1.0225188386874688, "eval_MaskedAccuracy": 0.4886493288375345, "eval_loss": 1.7067351341247559, "eval_runtime": 158.5147, "eval_samples_per_second": 400.442, "eval_steps_per_second": 1.565, "step": 251000 }, { "epoch": 1.0229262167108502, "grad_norm": 3.2733216285705566, "learning_rate": 0.004741196079408091, "loss": 8.1162, "step": 251100 }, { "epoch": 1.0233335947342317, "grad_norm": 4.076113700866699, "learning_rate": 0.004740976014432755, "loss": 8.2341, "step": 251200 }, { "epoch": 1.0237409727576132, "grad_norm": 6.664559364318848, "learning_rate": 0.0047407558610573315, "loss": 8.2386, "step": 251300 }, { "epoch": 1.0241483507809945, "grad_norm": 5.388659477233887, "learning_rate": 0.004740535619290521, "loss": 8.1699, "step": 251400 }, { "epoch": 1.024555728804376, "grad_norm": 4.1781840324401855, "learning_rate": 0.004740315289141033, "loss": 8.1318, "step": 251500 }, { "epoch": 1.0249631068277576, "grad_norm": 5.653961658477783, "learning_rate": 0.004740094870617571, "loss": 8.2242, "step": 251600 }, { "epoch": 1.025370484851139, "grad_norm": 6.724490165710449, "learning_rate": 0.004739874363728855, "loss": 8.218, "step": 251700 }, { "epoch": 1.0257778628745204, "grad_norm": 6.656342029571533, "learning_rate": 0.004739653768483603, "loss": 8.1545, "step": 251800 }, { "epoch": 1.026185240897902, "grad_norm": 3.1695098876953125, "learning_rate": 0.004739433084890538, "loss": 8.256, "step": 251900 }, { "epoch": 1.0265926189212833, "grad_norm": 6.061586856842041, "learning_rate": 0.00473921231295838, "loss": 8.1495, "step": 252000 }, { "epoch": 1.0265926189212833, "eval_MaskedAccuracy": 0.48661222703316337, "eval_loss": 1.71462881565094, "eval_runtime": 152.3222, "eval_samples_per_second": 416.722, "eval_steps_per_second": 1.628, "step": 252000 }, { "epoch": 1.0269999969446648, "grad_norm": 2.984637498855591, "learning_rate": 0.004738991452695858, "loss": 8.174, "step": 252100 }, { "epoch": 1.0274073749680463, "grad_norm": 4.735746383666992, "learning_rate": 0.0047387705041117105, "loss": 8.2237, "step": 252200 }, { "epoch": 1.0278147529914277, "grad_norm": 2.597001552581787, "learning_rate": 0.004738549467214665, "loss": 8.1202, "step": 252300 }, { "epoch": 1.0282221310148092, "grad_norm": 3.530951738357544, "learning_rate": 0.004738328342013465, "loss": 8.1646, "step": 252400 }, { "epoch": 1.0286295090381907, "grad_norm": 5.350266933441162, "learning_rate": 0.004738107128516851, "loss": 8.1563, "step": 252500 }, { "epoch": 1.029036887061572, "grad_norm": 2.237780809402466, "learning_rate": 0.004737885826733565, "loss": 8.1153, "step": 252600 }, { "epoch": 1.0294442650849536, "grad_norm": 6.5042829513549805, "learning_rate": 0.00473766443667236, "loss": 8.1714, "step": 252700 }, { "epoch": 1.029851643108335, "grad_norm": 2.9459121227264404, "learning_rate": 0.004737442958341991, "loss": 8.1588, "step": 252800 }, { "epoch": 1.0302590211317164, "grad_norm": 4.08525276184082, "learning_rate": 0.004737221391751213, "loss": 8.0959, "step": 252900 }, { "epoch": 1.030666399155098, "grad_norm": 4.466893196105957, "learning_rate": 0.004736999736908782, "loss": 8.1995, "step": 253000 }, { "epoch": 1.030666399155098, "eval_MaskedAccuracy": 0.4787248450113544, "eval_loss": 1.7428325414657593, "eval_runtime": 151.8794, "eval_samples_per_second": 417.937, "eval_steps_per_second": 1.633, "step": 253000 }, { "epoch": 1.0310737771784795, "grad_norm": 8.401349067687988, "learning_rate": 0.0047367779938234655, "loss": 8.2618, "step": 253100 }, { "epoch": 1.031481155201861, "grad_norm": 4.149782180786133, "learning_rate": 0.004736556162504031, "loss": 8.2328, "step": 253200 }, { "epoch": 1.0318885332252423, "grad_norm": 7.0294084548950195, "learning_rate": 0.004736334242959242, "loss": 8.1726, "step": 253300 }, { "epoch": 1.0322959112486239, "grad_norm": 2.5511868000030518, "learning_rate": 0.004736112235197877, "loss": 8.1455, "step": 253400 }, { "epoch": 1.0327032892720054, "grad_norm": 4.843980312347412, "learning_rate": 0.0047358901392287114, "loss": 8.177, "step": 253500 }, { "epoch": 1.0331106672953867, "grad_norm": 3.1217896938323975, "learning_rate": 0.004735667955060528, "loss": 8.1667, "step": 253600 }, { "epoch": 1.0335180453187682, "grad_norm": 13.546120643615723, "learning_rate": 0.0047354456827021114, "loss": 8.173, "step": 253700 }, { "epoch": 1.0339254233421498, "grad_norm": 2.89715576171875, "learning_rate": 0.004735223322162252, "loss": 8.2104, "step": 253800 }, { "epoch": 1.034332801365531, "grad_norm": 5.824563980102539, "learning_rate": 0.0047350008734497314, "loss": 8.2252, "step": 253900 }, { "epoch": 1.0347401793889126, "grad_norm": 5.2591729164123535, "learning_rate": 0.004734778336573352, "loss": 8.2093, "step": 254000 }, { "epoch": 1.0347401793889126, "eval_MaskedAccuracy": 0.4854302678060531, "eval_loss": 1.71346914768219, "eval_runtime": 150.9444, "eval_samples_per_second": 420.526, "eval_steps_per_second": 1.643, "step": 254000 }, { "epoch": 1.0351475574122941, "grad_norm": 3.1785802841186523, "learning_rate": 0.004734555711541909, "loss": 8.1264, "step": 254100 }, { "epoch": 1.0355549354356754, "grad_norm": 1.5771757364273071, "learning_rate": 0.004734332998364197, "loss": 8.1249, "step": 254200 }, { "epoch": 1.035962313459057, "grad_norm": 3.380925178527832, "learning_rate": 0.004734110197049031, "loss": 8.1276, "step": 254300 }, { "epoch": 1.0363696914824385, "grad_norm": 5.520992755889893, "learning_rate": 0.004733887307605224, "loss": 8.1856, "step": 254400 }, { "epoch": 1.0367770695058198, "grad_norm": 2.0409340858459473, "learning_rate": 0.004733664330041577, "loss": 8.177, "step": 254500 }, { "epoch": 1.0371844475292014, "grad_norm": 5.373855113983154, "learning_rate": 0.004733441264366908, "loss": 8.2082, "step": 254600 }, { "epoch": 1.037591825552583, "grad_norm": 2.790085792541504, "learning_rate": 0.004733218110590038, "loss": 8.1732, "step": 254700 }, { "epoch": 1.0379992035759642, "grad_norm": 1.4873521327972412, "learning_rate": 0.004732994868719789, "loss": 8.1835, "step": 254800 }, { "epoch": 1.0384065815993457, "grad_norm": 1.530429720878601, "learning_rate": 0.004732771538764989, "loss": 8.2247, "step": 254900 }, { "epoch": 1.0388139596227273, "grad_norm": 3.61285400390625, "learning_rate": 0.004732548120734462, "loss": 8.1749, "step": 255000 }, { "epoch": 1.0388139596227273, "eval_MaskedAccuracy": 0.4864262992232778, "eval_loss": 1.7230299711227417, "eval_runtime": 149.6605, "eval_samples_per_second": 424.133, "eval_steps_per_second": 1.657, "step": 255000 }, { "epoch": 1.0392213376461086, "grad_norm": 2.3742315769195557, "learning_rate": 0.0047323246146370465, "loss": 8.1234, "step": 255100 }, { "epoch": 1.0396287156694901, "grad_norm": 5.732882499694824, "learning_rate": 0.004732101020481578, "loss": 8.176, "step": 255200 }, { "epoch": 1.0400360936928716, "grad_norm": 5.98242712020874, "learning_rate": 0.004731877338276891, "loss": 8.2219, "step": 255300 }, { "epoch": 1.040443471716253, "grad_norm": 5.500507831573486, "learning_rate": 0.0047316535680318305, "loss": 8.2961, "step": 255400 }, { "epoch": 1.0408508497396345, "grad_norm": 3.187746524810791, "learning_rate": 0.004731429709755249, "loss": 8.1657, "step": 255500 }, { "epoch": 1.041258227763016, "grad_norm": 6.253506660461426, "learning_rate": 0.0047312057634559986, "loss": 8.1167, "step": 255600 }, { "epoch": 1.0416656057863976, "grad_norm": 2.829343795776367, "learning_rate": 0.004730981729142925, "loss": 8.1404, "step": 255700 }, { "epoch": 1.0420729838097789, "grad_norm": 2.9654922485351562, "learning_rate": 0.00473075760682488, "loss": 8.1371, "step": 255800 }, { "epoch": 1.0424803618331604, "grad_norm": 1.2807074785232544, "learning_rate": 0.004730533396510732, "loss": 8.1983, "step": 255900 }, { "epoch": 1.042887739856542, "grad_norm": 4.129685401916504, "learning_rate": 0.004730309098209351, "loss": 8.174, "step": 256000 }, { "epoch": 1.042887739856542, "eval_MaskedAccuracy": 0.4800277906718458, "eval_loss": 1.7342748641967773, "eval_runtime": 192.1508, "eval_samples_per_second": 330.345, "eval_steps_per_second": 1.291, "step": 256000 }, { "epoch": 1.0432951178799232, "grad_norm": 3.0830535888671875, "learning_rate": 0.004730084711929603, "loss": 8.1937, "step": 256100 }, { "epoch": 1.0437024959033048, "grad_norm": 5.282797813415527, "learning_rate": 0.00472986023768035, "loss": 8.1215, "step": 256200 }, { "epoch": 1.0441098739266863, "grad_norm": 0.7001706957817078, "learning_rate": 0.004729635675470472, "loss": 8.1979, "step": 256300 }, { "epoch": 1.0445172519500676, "grad_norm": 3.220350980758667, "learning_rate": 0.004729411025308854, "loss": 8.1868, "step": 256400 }, { "epoch": 1.0449246299734491, "grad_norm": 5.503873348236084, "learning_rate": 0.004729186287204369, "loss": 8.1578, "step": 256500 }, { "epoch": 1.0453320079968307, "grad_norm": 3.1256799697875977, "learning_rate": 0.0047289614611659035, "loss": 8.1322, "step": 256600 }, { "epoch": 1.045739386020212, "grad_norm": 3.134495496749878, "learning_rate": 0.004728736547202346, "loss": 8.1981, "step": 256700 }, { "epoch": 1.0461467640435935, "grad_norm": 6.195826530456543, "learning_rate": 0.004728511545322587, "loss": 8.1569, "step": 256800 }, { "epoch": 1.046554142066975, "grad_norm": 5.16770076751709, "learning_rate": 0.0047282864555355224, "loss": 8.1899, "step": 256900 }, { "epoch": 1.0469615200903564, "grad_norm": 5.35223388671875, "learning_rate": 0.004728061277850063, "loss": 8.1473, "step": 257000 }, { "epoch": 1.0469615200903564, "eval_MaskedAccuracy": 0.48726242103151224, "eval_loss": 1.7100180387496948, "eval_runtime": 149.1608, "eval_samples_per_second": 425.554, "eval_steps_per_second": 1.663, "step": 257000 }, { "epoch": 1.047368898113738, "grad_norm": 1.6657848358154297, "learning_rate": 0.004727836012275092, "loss": 8.1177, "step": 257100 }, { "epoch": 1.0477762761371194, "grad_norm": 7.9431233406066895, "learning_rate": 0.004727610658819527, "loss": 8.168, "step": 257200 }, { "epoch": 1.0481836541605007, "grad_norm": 2.4482877254486084, "learning_rate": 0.00472738521749228, "loss": 8.2099, "step": 257300 }, { "epoch": 1.0485910321838823, "grad_norm": 3.727255344390869, "learning_rate": 0.004727159688302251, "loss": 8.2143, "step": 257400 }, { "epoch": 1.0489984102072638, "grad_norm": 1.0347565412521362, "learning_rate": 0.004726934071258369, "loss": 8.129, "step": 257500 }, { "epoch": 1.0494057882306451, "grad_norm": 3.988304615020752, "learning_rate": 0.00472670836636955, "loss": 8.1695, "step": 257600 }, { "epoch": 1.0498131662540267, "grad_norm": 3.0877161026000977, "learning_rate": 0.0047264825736447165, "loss": 8.1442, "step": 257700 }, { "epoch": 1.0502205442774082, "grad_norm": 5.7359232902526855, "learning_rate": 0.004726256693092799, "loss": 8.1259, "step": 257800 }, { "epoch": 1.0506279223007895, "grad_norm": 4.0298662185668945, "learning_rate": 0.004726030724722721, "loss": 8.0781, "step": 257900 }, { "epoch": 1.051035300324171, "grad_norm": 4.852095127105713, "learning_rate": 0.004725804668543424, "loss": 8.0993, "step": 258000 }, { "epoch": 1.051035300324171, "eval_MaskedAccuracy": 0.48923874445373744, "eval_loss": 1.7012666463851929, "eval_runtime": 151.5858, "eval_samples_per_second": 418.746, "eval_steps_per_second": 1.636, "step": 258000 }, { "epoch": 1.0514426783475526, "grad_norm": 4.054843902587891, "learning_rate": 0.004725578524563837, "loss": 8.0835, "step": 258100 }, { "epoch": 1.051850056370934, "grad_norm": 5.810976982116699, "learning_rate": 0.0047253522927929075, "loss": 8.0914, "step": 258200 }, { "epoch": 1.0522574343943154, "grad_norm": 3.9607856273651123, "learning_rate": 0.004725125973239575, "loss": 8.1012, "step": 258300 }, { "epoch": 1.052664812417697, "grad_norm": 3.03961181640625, "learning_rate": 0.004724899565912787, "loss": 8.144, "step": 258400 }, { "epoch": 1.0530721904410785, "grad_norm": 6.912770748138428, "learning_rate": 0.004724673070821498, "loss": 8.2314, "step": 258500 }, { "epoch": 1.0534795684644598, "grad_norm": 1.1770914793014526, "learning_rate": 0.004724446487974659, "loss": 8.1517, "step": 258600 }, { "epoch": 1.0538869464878413, "grad_norm": 10.230463981628418, "learning_rate": 0.004724219817381233, "loss": 8.2025, "step": 258700 }, { "epoch": 1.0542943245112228, "grad_norm": 0.9877458214759827, "learning_rate": 0.004723993059050179, "loss": 8.156, "step": 258800 }, { "epoch": 1.0547017025346042, "grad_norm": 1.8062012195587158, "learning_rate": 0.004723766212990459, "loss": 8.2504, "step": 258900 }, { "epoch": 1.0551090805579857, "grad_norm": 4.536640644073486, "learning_rate": 0.004723539279211045, "loss": 8.249, "step": 259000 }, { "epoch": 1.0551090805579857, "eval_MaskedAccuracy": 0.48295881254614076, "eval_loss": 1.7288438081741333, "eval_runtime": 151.1894, "eval_samples_per_second": 419.844, "eval_steps_per_second": 1.64, "step": 259000 }, { "epoch": 1.0555164585813672, "grad_norm": 3.7817294597625732, "learning_rate": 0.0047233122577209075, "loss": 8.2358, "step": 259100 }, { "epoch": 1.0559238366047485, "grad_norm": 4.458771705627441, "learning_rate": 0.004723085148529026, "loss": 8.1856, "step": 259200 }, { "epoch": 1.05633121462813, "grad_norm": 4.099343299865723, "learning_rate": 0.00472285795164437, "loss": 8.1814, "step": 259300 }, { "epoch": 1.0567385926515116, "grad_norm": 2.2912771701812744, "learning_rate": 0.004722630667075925, "loss": 8.1541, "step": 259400 }, { "epoch": 1.057145970674893, "grad_norm": 5.370869159698486, "learning_rate": 0.004722403294832682, "loss": 8.2239, "step": 259500 }, { "epoch": 1.0575533486982744, "grad_norm": 5.158105850219727, "learning_rate": 0.004722175834923627, "loss": 8.2159, "step": 259600 }, { "epoch": 1.057960726721656, "grad_norm": 3.3874893188476562, "learning_rate": 0.004721948287357746, "loss": 8.1775, "step": 259700 }, { "epoch": 1.0583681047450373, "grad_norm": 6.313094139099121, "learning_rate": 0.004721720652144048, "loss": 8.1786, "step": 259800 }, { "epoch": 1.0587754827684188, "grad_norm": 7.082878112792969, "learning_rate": 0.004721492929291526, "loss": 8.1443, "step": 259900 }, { "epoch": 1.0591828607918004, "grad_norm": 7.82448673248291, "learning_rate": 0.0047212651188091835, "loss": 8.1311, "step": 260000 }, { "epoch": 1.0591828607918004, "eval_MaskedAccuracy": 0.4816183055664994, "eval_loss": 1.7276784181594849, "eval_runtime": 155.418, "eval_samples_per_second": 408.421, "eval_steps_per_second": 1.596, "step": 260000 }, { "epoch": 1.0595902388151817, "grad_norm": 1.3936896324157715, "learning_rate": 0.004721037220706029, "loss": 8.1921, "step": 260100 }, { "epoch": 1.0599976168385632, "grad_norm": 0.652860701084137, "learning_rate": 0.004720809234991075, "loss": 8.1643, "step": 260200 }, { "epoch": 1.0604049948619447, "grad_norm": 6.536575794219971, "learning_rate": 0.004720581161673324, "loss": 8.2325, "step": 260300 }, { "epoch": 1.060812372885326, "grad_norm": 4.015500068664551, "learning_rate": 0.004720353000761806, "loss": 8.225, "step": 260400 }, { "epoch": 1.0612197509087076, "grad_norm": 4.177878379821777, "learning_rate": 0.004720124752265536, "loss": 8.1605, "step": 260500 }, { "epoch": 1.061627128932089, "grad_norm": 0.9724656343460083, "learning_rate": 0.004719896416193533, "loss": 8.1223, "step": 260600 }, { "epoch": 1.0620345069554706, "grad_norm": 5.476675033569336, "learning_rate": 0.004719667992554826, "loss": 8.2556, "step": 260700 }, { "epoch": 1.062441884978852, "grad_norm": 3.0141918659210205, "learning_rate": 0.0047194394813584555, "loss": 8.1575, "step": 260800 }, { "epoch": 1.0628492630022335, "grad_norm": 2.6944692134857178, "learning_rate": 0.00471921088261344, "loss": 8.1083, "step": 260900 }, { "epoch": 1.063256641025615, "grad_norm": 2.877093553543091, "learning_rate": 0.004718982196328832, "loss": 8.1973, "step": 261000 }, { "epoch": 1.063256641025615, "eval_MaskedAccuracy": 0.48363264676187867, "eval_loss": 1.72201669216156, "eval_runtime": 148.4137, "eval_samples_per_second": 427.696, "eval_steps_per_second": 1.671, "step": 261000 }, { "epoch": 1.0636640190489963, "grad_norm": 2.6658310890197754, "learning_rate": 0.004718753422513663, "loss": 8.1741, "step": 261100 }, { "epoch": 1.0640713970723779, "grad_norm": 3.4677579402923584, "learning_rate": 0.004718524561176982, "loss": 8.1563, "step": 261200 }, { "epoch": 1.0644787750957594, "grad_norm": 1.8401967287063599, "learning_rate": 0.004718295612327835, "loss": 8.2139, "step": 261300 }, { "epoch": 1.0648861531191407, "grad_norm": 3.109457492828369, "learning_rate": 0.00471806657597528, "loss": 8.207, "step": 261400 }, { "epoch": 1.0652935311425222, "grad_norm": 4.015644550323486, "learning_rate": 0.0047178374521283616, "loss": 8.2151, "step": 261500 }, { "epoch": 1.0657009091659038, "grad_norm": 4.507015228271484, "learning_rate": 0.004717608240796146, "loss": 8.2224, "step": 261600 }, { "epoch": 1.066108287189285, "grad_norm": 5.073245048522949, "learning_rate": 0.004717378941987694, "loss": 8.1965, "step": 261700 }, { "epoch": 1.0665156652126666, "grad_norm": 2.549006700515747, "learning_rate": 0.004717149555712068, "loss": 8.2085, "step": 261800 }, { "epoch": 1.0669230432360481, "grad_norm": 4.281580448150635, "learning_rate": 0.004716920081978343, "loss": 8.1909, "step": 261900 }, { "epoch": 1.0673304212594295, "grad_norm": 4.725358963012695, "learning_rate": 0.004716690520795584, "loss": 8.1232, "step": 262000 }, { "epoch": 1.0673304212594295, "eval_MaskedAccuracy": 0.4883496074575056, "eval_loss": 1.6964771747589111, "eval_runtime": 164.2241, "eval_samples_per_second": 386.521, "eval_steps_per_second": 1.51, "step": 262000 }, { "epoch": 1.067737799282811, "grad_norm": 4.8197760581970215, "learning_rate": 0.004716460872172865, "loss": 8.1073, "step": 262100 }, { "epoch": 1.0681451773061925, "grad_norm": 1.8663020133972168, "learning_rate": 0.004716231136119275, "loss": 8.0647, "step": 262200 }, { "epoch": 1.0685525553295738, "grad_norm": 2.7896194458007812, "learning_rate": 0.004716001312643891, "loss": 8.1042, "step": 262300 }, { "epoch": 1.0689599333529554, "grad_norm": 3.972243309020996, "learning_rate": 0.004715771401755794, "loss": 8.1265, "step": 262400 }, { "epoch": 1.069367311376337, "grad_norm": 3.569995403289795, "learning_rate": 0.004715541403464084, "loss": 8.0601, "step": 262500 }, { "epoch": 1.0697746893997182, "grad_norm": 2.492865562438965, "learning_rate": 0.004715311317777855, "loss": 8.1619, "step": 262600 }, { "epoch": 1.0701820674230997, "grad_norm": 2.858626365661621, "learning_rate": 0.004715081144706195, "loss": 8.238, "step": 262700 }, { "epoch": 1.0705894454464813, "grad_norm": 3.3485522270202637, "learning_rate": 0.004714850884258203, "loss": 8.1726, "step": 262800 }, { "epoch": 1.0709968234698626, "grad_norm": 2.575921058654785, "learning_rate": 0.004714620536442993, "loss": 8.1583, "step": 262900 }, { "epoch": 1.0714042014932441, "grad_norm": 2.6548173427581787, "learning_rate": 0.00471439010126966, "loss": 8.1958, "step": 263000 }, { "epoch": 1.0714042014932441, "eval_MaskedAccuracy": 0.48232082736317583, "eval_loss": 1.734666109085083, "eval_runtime": 162.083, "eval_samples_per_second": 391.627, "eval_steps_per_second": 1.53, "step": 263000 }, { "epoch": 1.0718115795166256, "grad_norm": 6.802894592285156, "learning_rate": 0.004714159578747321, "loss": 8.2134, "step": 263100 }, { "epoch": 1.0722189575400072, "grad_norm": 4.141062259674072, "learning_rate": 0.004713928968885087, "loss": 8.2439, "step": 263200 }, { "epoch": 1.0726263355633885, "grad_norm": 5.311740398406982, "learning_rate": 0.004713698271692076, "loss": 8.1981, "step": 263300 }, { "epoch": 1.07303371358677, "grad_norm": 11.01203441619873, "learning_rate": 0.004713467487177409, "loss": 8.1446, "step": 263400 }, { "epoch": 1.0734410916101516, "grad_norm": 6.425957202911377, "learning_rate": 0.004713236615350214, "loss": 8.2184, "step": 263500 }, { "epoch": 1.0738484696335329, "grad_norm": 4.091139793395996, "learning_rate": 0.004713005656219612, "loss": 8.1619, "step": 263600 }, { "epoch": 1.0742558476569144, "grad_norm": 4.363187313079834, "learning_rate": 0.004712774609794744, "loss": 8.1349, "step": 263700 }, { "epoch": 1.074663225680296, "grad_norm": 3.226597309112549, "learning_rate": 0.004712543476084728, "loss": 8.1829, "step": 263800 }, { "epoch": 1.0750706037036772, "grad_norm": 5.673817157745361, "learning_rate": 0.004712312255098713, "loss": 8.1851, "step": 263900 }, { "epoch": 1.0754779817270588, "grad_norm": 5.958888053894043, "learning_rate": 0.004712080946845837, "loss": 8.1547, "step": 264000 }, { "epoch": 1.0754779817270588, "eval_MaskedAccuracy": 0.4843080681283223, "eval_loss": 1.7236666679382324, "eval_runtime": 189.0288, "eval_samples_per_second": 335.801, "eval_steps_per_second": 1.312, "step": 264000 }, { "epoch": 1.0758853597504403, "grad_norm": 3.0145950317382812, "learning_rate": 0.004711849551335251, "loss": 8.2086, "step": 264100 }, { "epoch": 1.0762927377738216, "grad_norm": 1.837198257446289, "learning_rate": 0.004711618068576095, "loss": 8.2113, "step": 264200 }, { "epoch": 1.0767001157972031, "grad_norm": 3.117332935333252, "learning_rate": 0.004711386498577533, "loss": 8.1258, "step": 264300 }, { "epoch": 1.0771074938205847, "grad_norm": 3.062966823577881, "learning_rate": 0.004711154841348704, "loss": 8.1532, "step": 264400 }, { "epoch": 1.077514871843966, "grad_norm": 4.211122989654541, "learning_rate": 0.004710923096898773, "loss": 8.1588, "step": 264500 }, { "epoch": 1.0779222498673475, "grad_norm": 2.284803867340088, "learning_rate": 0.004710691265236909, "loss": 8.1051, "step": 264600 }, { "epoch": 1.078329627890729, "grad_norm": 4.682508945465088, "learning_rate": 0.004710459346372274, "loss": 8.0787, "step": 264700 }, { "epoch": 1.0787370059141104, "grad_norm": 5.156423568725586, "learning_rate": 0.004710227340314039, "loss": 8.0659, "step": 264800 }, { "epoch": 1.079144383937492, "grad_norm": 3.078960657119751, "learning_rate": 0.004709995247071368, "loss": 8.1013, "step": 264900 }, { "epoch": 1.0795517619608734, "grad_norm": 3.2186193466186523, "learning_rate": 0.00470976306665344, "loss": 8.087, "step": 265000 }, { "epoch": 1.0795517619608734, "eval_MaskedAccuracy": 0.488403056545127, "eval_loss": 1.7033782005310059, "eval_runtime": 163.3397, "eval_samples_per_second": 388.613, "eval_steps_per_second": 1.518, "step": 265000 }, { "epoch": 1.0799591399842547, "grad_norm": 5.391504764556885, "learning_rate": 0.0047095307990694365, "loss": 8.0565, "step": 265100 }, { "epoch": 1.0803665180076363, "grad_norm": 2.081413507461548, "learning_rate": 0.0047092984443285425, "loss": 8.1123, "step": 265200 }, { "epoch": 1.0807738960310178, "grad_norm": 3.115123748779297, "learning_rate": 0.004709066002439943, "loss": 8.1011, "step": 265300 }, { "epoch": 1.0811812740543991, "grad_norm": 4.943385124206543, "learning_rate": 0.00470883347341282, "loss": 8.0437, "step": 265400 }, { "epoch": 1.0815886520777807, "grad_norm": 4.331999778747559, "learning_rate": 0.004708600857256373, "loss": 8.0744, "step": 265500 }, { "epoch": 1.0819960301011622, "grad_norm": 4.476188659667969, "learning_rate": 0.004708368153979797, "loss": 8.0748, "step": 265600 }, { "epoch": 1.0824034081245437, "grad_norm": 5.666794300079346, "learning_rate": 0.0047081353635923025, "loss": 8.0852, "step": 265700 }, { "epoch": 1.082810786147925, "grad_norm": 5.742965221405029, "learning_rate": 0.004707902486103087, "loss": 8.077, "step": 265800 }, { "epoch": 1.0832181641713066, "grad_norm": 2.3758745193481445, "learning_rate": 0.004707669521521353, "loss": 8.1068, "step": 265900 }, { "epoch": 1.083625542194688, "grad_norm": 3.634063720703125, "learning_rate": 0.004707436469856306, "loss": 8.0879, "step": 266000 }, { "epoch": 1.083625542194688, "eval_MaskedAccuracy": 0.4891889805537006, "eval_loss": 1.7100374698638916, "eval_runtime": 174.0548, "eval_samples_per_second": 364.69, "eval_steps_per_second": 1.425, "step": 266000 }, { "epoch": 1.0840329202180694, "grad_norm": 3.887019157409668, "learning_rate": 0.004707203331117164, "loss": 8.0937, "step": 266100 }, { "epoch": 1.084440298241451, "grad_norm": 5.891707897186279, "learning_rate": 0.0047069701053131475, "loss": 8.081, "step": 266200 }, { "epoch": 1.0848476762648325, "grad_norm": 4.093088626861572, "learning_rate": 0.0047067367924534765, "loss": 8.0458, "step": 266300 }, { "epoch": 1.0852550542882138, "grad_norm": 16.209623336791992, "learning_rate": 0.00470650339254738, "loss": 8.0872, "step": 266400 }, { "epoch": 1.0856624323115953, "grad_norm": 3.369231939315796, "learning_rate": 0.004706269905604073, "loss": 8.25, "step": 266500 }, { "epoch": 1.0860698103349768, "grad_norm": 1.8260586261749268, "learning_rate": 0.004706036331632798, "loss": 8.2503, "step": 266600 }, { "epoch": 1.0864771883583582, "grad_norm": 4.970149993896484, "learning_rate": 0.00470580267064278, "loss": 8.23, "step": 266700 }, { "epoch": 1.0868845663817397, "grad_norm": 1.7472909688949585, "learning_rate": 0.004705568922643267, "loss": 8.1158, "step": 266800 }, { "epoch": 1.0872919444051212, "grad_norm": 1.5036593675613403, "learning_rate": 0.004705335087643487, "loss": 8.2032, "step": 266900 }, { "epoch": 1.0876993224285025, "grad_norm": 4.026489734649658, "learning_rate": 0.004705101165652698, "loss": 8.2286, "step": 267000 }, { "epoch": 1.0876993224285025, "eval_MaskedAccuracy": 0.48110589174391627, "eval_loss": 1.7291169166564941, "eval_runtime": 155.1183, "eval_samples_per_second": 409.21, "eval_steps_per_second": 1.599, "step": 267000 }, { "epoch": 1.088106700451884, "grad_norm": 2.847480535507202, "learning_rate": 0.004704867156680141, "loss": 8.1889, "step": 267100 }, { "epoch": 1.0885140784752656, "grad_norm": 5.766560077667236, "learning_rate": 0.004704633060735069, "loss": 8.1671, "step": 267200 }, { "epoch": 1.088921456498647, "grad_norm": 1.6122405529022217, "learning_rate": 0.004704398877826742, "loss": 8.1129, "step": 267300 }, { "epoch": 1.0893288345220284, "grad_norm": 1.7545127868652344, "learning_rate": 0.00470416460796441, "loss": 8.1343, "step": 267400 }, { "epoch": 1.08973621254541, "grad_norm": 2.776700019836426, "learning_rate": 0.004703930251157339, "loss": 8.1142, "step": 267500 }, { "epoch": 1.0901435905687913, "grad_norm": 0.6854613423347473, "learning_rate": 0.0047036958074147955, "loss": 8.154, "step": 267600 }, { "epoch": 1.0905509685921728, "grad_norm": 2.7014808654785156, "learning_rate": 0.00470346127674605, "loss": 8.1321, "step": 267700 }, { "epoch": 1.0909583466155544, "grad_norm": 10.678851127624512, "learning_rate": 0.004703226659160372, "loss": 8.1209, "step": 267800 }, { "epoch": 1.0913657246389357, "grad_norm": 1.0287868976593018, "learning_rate": 0.004702991954667038, "loss": 8.1732, "step": 267900 }, { "epoch": 1.0917731026623172, "grad_norm": 2.418203592300415, "learning_rate": 0.004702757163275323, "loss": 8.2017, "step": 268000 }, { "epoch": 1.0917731026623172, "eval_MaskedAccuracy": 0.48407416086684335, "eval_loss": 1.7229300737380981, "eval_runtime": 160.1907, "eval_samples_per_second": 396.253, "eval_steps_per_second": 1.548, "step": 268000 }, { "epoch": 1.0921804806856987, "grad_norm": 4.111920356750488, "learning_rate": 0.004702522284994514, "loss": 8.1566, "step": 268100 }, { "epoch": 1.0925878587090803, "grad_norm": 1.994573950767517, "learning_rate": 0.004702287319833896, "loss": 8.1353, "step": 268200 }, { "epoch": 1.0929952367324616, "grad_norm": 1.087450623512268, "learning_rate": 0.00470205226780276, "loss": 8.1521, "step": 268300 }, { "epoch": 1.093402614755843, "grad_norm": 2.8973805904388428, "learning_rate": 0.004701817128910397, "loss": 8.1979, "step": 268400 }, { "epoch": 1.0938099927792246, "grad_norm": 3.912325382232666, "learning_rate": 0.004701581903166106, "loss": 8.1635, "step": 268500 }, { "epoch": 1.094217370802606, "grad_norm": 3.248401641845703, "learning_rate": 0.00470134659057918, "loss": 8.1411, "step": 268600 }, { "epoch": 1.0946247488259875, "grad_norm": 5.1933135986328125, "learning_rate": 0.004701111191158926, "loss": 8.173, "step": 268700 }, { "epoch": 1.095032126849369, "grad_norm": 1.2353286743164062, "learning_rate": 0.004700875704914657, "loss": 8.1581, "step": 268800 }, { "epoch": 1.0954395048727503, "grad_norm": 4.2090253829956055, "learning_rate": 0.004700640131855669, "loss": 8.1331, "step": 268900 }, { "epoch": 1.0958468828961319, "grad_norm": 3.521942377090454, "learning_rate": 0.0047004044719912855, "loss": 8.1954, "step": 269000 }, { "epoch": 1.0958468828961319, "eval_MaskedAccuracy": 0.48426087111627897, "eval_loss": 1.7280783653259277, "eval_runtime": 156.8039, "eval_samples_per_second": 404.811, "eval_steps_per_second": 1.582, "step": 269000 }, { "epoch": 1.0962542609195134, "grad_norm": 1.2542732954025269, "learning_rate": 0.004700168725330823, "loss": 8.139, "step": 269100 }, { "epoch": 1.0966616389428947, "grad_norm": 3.3350350856781006, "learning_rate": 0.004699932891883601, "loss": 8.1973, "step": 269200 }, { "epoch": 1.0970690169662762, "grad_norm": 2.256042003631592, "learning_rate": 0.004699696971658945, "loss": 8.1227, "step": 269300 }, { "epoch": 1.0974763949896578, "grad_norm": 1.9328958988189697, "learning_rate": 0.004699460964666176, "loss": 8.0896, "step": 269400 }, { "epoch": 1.097883773013039, "grad_norm": 2.058600425720215, "learning_rate": 0.004699224870914627, "loss": 8.168, "step": 269500 }, { "epoch": 1.0982911510364206, "grad_norm": 4.162485599517822, "learning_rate": 0.004698988690413631, "loss": 8.0951, "step": 269600 }, { "epoch": 1.0986985290598021, "grad_norm": 2.3824028968811035, "learning_rate": 0.004698752423172528, "loss": 8.0478, "step": 269700 }, { "epoch": 1.0991059070831835, "grad_norm": 3.044100284576416, "learning_rate": 0.004698516069200664, "loss": 8.0492, "step": 269800 }, { "epoch": 1.099513285106565, "grad_norm": 4.95687198638916, "learning_rate": 0.004698279628507376, "loss": 8.0768, "step": 269900 }, { "epoch": 1.0999206631299465, "grad_norm": 1.5366508960723877, "learning_rate": 0.004698043101102013, "loss": 8.0825, "step": 270000 }, { "epoch": 1.0999206631299465, "eval_MaskedAccuracy": 0.48612449373834676, "eval_loss": 1.71591055393219, "eval_runtime": 166.5615, "eval_samples_per_second": 381.096, "eval_steps_per_second": 1.489, "step": 270000 }, { "epoch": 1.1003280411533278, "grad_norm": 2.692131996154785, "learning_rate": 0.0046978064869939344, "loss": 8.1676, "step": 270100 }, { "epoch": 1.1007354191767094, "grad_norm": 4.03659200668335, "learning_rate": 0.004697569786192483, "loss": 8.1174, "step": 270200 }, { "epoch": 1.101142797200091, "grad_norm": 0.7524415850639343, "learning_rate": 0.004697332998707021, "loss": 8.1263, "step": 270300 }, { "epoch": 1.1015501752234722, "grad_norm": 6.00818395614624, "learning_rate": 0.004697096124546906, "loss": 8.1965, "step": 270400 }, { "epoch": 1.1019575532468537, "grad_norm": 3.065471887588501, "learning_rate": 0.004696859163721507, "loss": 8.2128, "step": 270500 }, { "epoch": 1.1023649312702353, "grad_norm": 3.4066545963287354, "learning_rate": 0.004696622116240192, "loss": 8.1677, "step": 270600 }, { "epoch": 1.1027723092936168, "grad_norm": 5.791946887969971, "learning_rate": 0.0046963849821123355, "loss": 8.115, "step": 270700 }, { "epoch": 1.1031796873169981, "grad_norm": 6.0821533203125, "learning_rate": 0.004696147761347314, "loss": 8.1141, "step": 270800 }, { "epoch": 1.1035870653403796, "grad_norm": 4.188292980194092, "learning_rate": 0.004695910453954499, "loss": 8.1271, "step": 270900 }, { "epoch": 1.1039944433637612, "grad_norm": 3.8692679405212402, "learning_rate": 0.004695673059943279, "loss": 8.1337, "step": 271000 }, { "epoch": 1.1039944433637612, "eval_MaskedAccuracy": 0.4849790188195527, "eval_loss": 1.7239805459976196, "eval_runtime": 171.5484, "eval_samples_per_second": 370.018, "eval_steps_per_second": 1.446, "step": 271000 }, { "epoch": 1.1044018213871425, "grad_norm": 2.2323920726776123, "learning_rate": 0.004695435579323033, "loss": 8.2123, "step": 271100 }, { "epoch": 1.104809199410524, "grad_norm": 0.9265953302383423, "learning_rate": 0.004695198012103152, "loss": 8.2061, "step": 271200 }, { "epoch": 1.1052165774339056, "grad_norm": 5.013404369354248, "learning_rate": 0.004694960358293025, "loss": 8.1286, "step": 271300 }, { "epoch": 1.1056239554572869, "grad_norm": 0.45778951048851013, "learning_rate": 0.004694722617902061, "loss": 8.1433, "step": 271400 }, { "epoch": 1.1060313334806684, "grad_norm": 5.009927749633789, "learning_rate": 0.004694484790939649, "loss": 8.1416, "step": 271500 }, { "epoch": 1.10643871150405, "grad_norm": 2.8697478771209717, "learning_rate": 0.004694246877415187, "loss": 8.0945, "step": 271600 }, { "epoch": 1.1068460895274312, "grad_norm": 3.8795487880706787, "learning_rate": 0.0046940088773380914, "loss": 8.1403, "step": 271700 }, { "epoch": 1.1072534675508128, "grad_norm": 6.586146354675293, "learning_rate": 0.004693770790717769, "loss": 8.1388, "step": 271800 }, { "epoch": 1.1076608455741943, "grad_norm": 4.505802154541016, "learning_rate": 0.004693532617563631, "loss": 8.1552, "step": 271900 }, { "epoch": 1.1080682235975756, "grad_norm": 2.4296302795410156, "learning_rate": 0.004693294357885089, "loss": 8.0803, "step": 272000 }, { "epoch": 1.1080682235975756, "eval_MaskedAccuracy": 0.4881155818758434, "eval_loss": 1.6957036256790161, "eval_runtime": 209.1919, "eval_samples_per_second": 303.434, "eval_steps_per_second": 1.186, "step": 272000 }, { "epoch": 1.1084756016209572, "grad_norm": 7.506979465484619, "learning_rate": 0.00469305601169157, "loss": 8.1333, "step": 272100 }, { "epoch": 1.1088829796443387, "grad_norm": 2.638509511947632, "learning_rate": 0.004692817578992502, "loss": 8.1584, "step": 272200 }, { "epoch": 1.10929035766772, "grad_norm": 3.3887877464294434, "learning_rate": 0.004692579059797297, "loss": 8.1036, "step": 272300 }, { "epoch": 1.1096977356911015, "grad_norm": 4.049765110015869, "learning_rate": 0.004692340454115392, "loss": 8.0382, "step": 272400 }, { "epoch": 1.110105113714483, "grad_norm": 4.908907413482666, "learning_rate": 0.0046921017619562235, "loss": 8.0715, "step": 272500 }, { "epoch": 1.1105124917378644, "grad_norm": 4.042283535003662, "learning_rate": 0.004691862983329218, "loss": 8.0545, "step": 272600 }, { "epoch": 1.110919869761246, "grad_norm": 4.053991794586182, "learning_rate": 0.0046916241182438255, "loss": 8.0384, "step": 272700 }, { "epoch": 1.1113272477846274, "grad_norm": 3.4201111793518066, "learning_rate": 0.004691385166709484, "loss": 8.0466, "step": 272800 }, { "epoch": 1.1117346258080087, "grad_norm": 4.905217170715332, "learning_rate": 0.0046911461287356465, "loss": 8.0787, "step": 272900 }, { "epoch": 1.1121420038313903, "grad_norm": 0.5103607773780823, "learning_rate": 0.004690907004331768, "loss": 8.1088, "step": 273000 }, { "epoch": 1.1121420038313903, "eval_MaskedAccuracy": 0.48666966660284766, "eval_loss": 1.707664132118225, "eval_runtime": 155.8076, "eval_samples_per_second": 407.4, "eval_steps_per_second": 1.592, "step": 273000 }, { "epoch": 1.1125493818547718, "grad_norm": 3.7668569087982178, "learning_rate": 0.004690667793507286, "loss": 8.1373, "step": 273100 }, { "epoch": 1.1129567598781533, "grad_norm": 1.944350004196167, "learning_rate": 0.00469042849627167, "loss": 8.1208, "step": 273200 }, { "epoch": 1.1133641379015347, "grad_norm": 5.168220043182373, "learning_rate": 0.0046901891126343714, "loss": 8.1387, "step": 273300 }, { "epoch": 1.1137715159249162, "grad_norm": 2.502412796020508, "learning_rate": 0.0046899496426048605, "loss": 8.1277, "step": 273400 }, { "epoch": 1.1141788939482977, "grad_norm": 0.8807030320167542, "learning_rate": 0.00468971008619261, "loss": 8.1554, "step": 273500 }, { "epoch": 1.114586271971679, "grad_norm": 3.1593713760375977, "learning_rate": 0.004689470443407077, "loss": 8.1502, "step": 273600 }, { "epoch": 1.1149936499950606, "grad_norm": 4.367379665374756, "learning_rate": 0.0046892307142577545, "loss": 8.2173, "step": 273700 }, { "epoch": 1.115401028018442, "grad_norm": 3.056534767150879, "learning_rate": 0.0046889908987541055, "loss": 8.1163, "step": 273800 }, { "epoch": 1.1158084060418234, "grad_norm": 1.1638544797897339, "learning_rate": 0.004688750996905612, "loss": 8.1309, "step": 273900 }, { "epoch": 1.116215784065205, "grad_norm": 2.2497355937957764, "learning_rate": 0.004688511008721758, "loss": 8.2247, "step": 274000 }, { "epoch": 1.116215784065205, "eval_MaskedAccuracy": 0.4831780076449618, "eval_loss": 1.7273262739181519, "eval_runtime": 161.3412, "eval_samples_per_second": 393.427, "eval_steps_per_second": 1.537, "step": 274000 }, { "epoch": 1.1166231620885865, "grad_norm": 6.01616096496582, "learning_rate": 0.004688270934212037, "loss": 8.1929, "step": 274100 }, { "epoch": 1.1170305401119678, "grad_norm": 2.2356910705566406, "learning_rate": 0.004688030773385945, "loss": 8.0901, "step": 274200 }, { "epoch": 1.1174379181353493, "grad_norm": 4.811047077178955, "learning_rate": 0.004687790526252971, "loss": 8.1381, "step": 274300 }, { "epoch": 1.1178452961587309, "grad_norm": 3.640188217163086, "learning_rate": 0.004687550192822613, "loss": 8.1452, "step": 274400 }, { "epoch": 1.1182526741821122, "grad_norm": 2.9838287830352783, "learning_rate": 0.004687309773104376, "loss": 8.0714, "step": 274500 }, { "epoch": 1.1186600522054937, "grad_norm": 5.863322734832764, "learning_rate": 0.004687069267107753, "loss": 8.0812, "step": 274600 }, { "epoch": 1.1190674302288752, "grad_norm": 4.292328834533691, "learning_rate": 0.004686828674842264, "loss": 8.0808, "step": 274700 }, { "epoch": 1.1194748082522565, "grad_norm": 5.0022053718566895, "learning_rate": 0.004686587996317418, "loss": 8.0865, "step": 274800 }, { "epoch": 1.119882186275638, "grad_norm": 3.4901864528656006, "learning_rate": 0.004686347231542733, "loss": 8.08, "step": 274900 }, { "epoch": 1.1202895642990196, "grad_norm": 1.724212408065796, "learning_rate": 0.004686106380527726, "loss": 8.1934, "step": 275000 }, { "epoch": 1.1202895642990196, "eval_MaskedAccuracy": 0.4846983685409245, "eval_loss": 1.7235397100448608, "eval_runtime": 256.0296, "eval_samples_per_second": 247.924, "eval_steps_per_second": 0.969, "step": 275000 }, { "epoch": 1.120696942322401, "grad_norm": 0.8583388924598694, "learning_rate": 0.004685865443281921, "loss": 8.1504, "step": 275100 }, { "epoch": 1.1211043203457824, "grad_norm": 2.7991883754730225, "learning_rate": 0.004685624419814842, "loss": 8.1777, "step": 275200 }, { "epoch": 1.121511698369164, "grad_norm": 7.151505947113037, "learning_rate": 0.004685383310136006, "loss": 8.1491, "step": 275300 }, { "epoch": 1.1219190763925453, "grad_norm": 7.722084045410156, "learning_rate": 0.004685142114254963, "loss": 8.149, "step": 275400 }, { "epoch": 1.1223264544159268, "grad_norm": 6.095405101776123, "learning_rate": 0.004684900832181243, "loss": 8.1325, "step": 275500 }, { "epoch": 1.1227338324393084, "grad_norm": 1.316977858543396, "learning_rate": 0.004684659463924384, "loss": 8.1143, "step": 275600 }, { "epoch": 1.1231412104626899, "grad_norm": 8.855969429016113, "learning_rate": 0.004684418009493925, "loss": 8.1615, "step": 275700 }, { "epoch": 1.1235485884860712, "grad_norm": 3.414064645767212, "learning_rate": 0.004684176468899419, "loss": 8.1684, "step": 275800 }, { "epoch": 1.1239559665094527, "grad_norm": 3.0379483699798584, "learning_rate": 0.004683934842150418, "loss": 8.058, "step": 275900 }, { "epoch": 1.1243633445328343, "grad_norm": 3.9102447032928467, "learning_rate": 0.004683693129256457, "loss": 8.1565, "step": 276000 }, { "epoch": 1.1243633445328343, "eval_MaskedAccuracy": 0.48240115204096146, "eval_loss": 1.7249886989593506, "eval_runtime": 173.6473, "eval_samples_per_second": 365.546, "eval_steps_per_second": 1.428, "step": 276000 }, { "epoch": 1.1247707225562156, "grad_norm": 4.398492813110352, "learning_rate": 0.004683451330227109, "loss": 8.1829, "step": 276100 }, { "epoch": 1.125178100579597, "grad_norm": 3.6306369304656982, "learning_rate": 0.004683209445071927, "loss": 8.0959, "step": 276200 }, { "epoch": 1.1255854786029786, "grad_norm": 4.454225063323975, "learning_rate": 0.004682967473800478, "loss": 8.0981, "step": 276300 }, { "epoch": 1.12599285662636, "grad_norm": 3.1625213623046875, "learning_rate": 0.004682725416422324, "loss": 8.0729, "step": 276400 }, { "epoch": 1.1264002346497415, "grad_norm": 4.1124701499938965, "learning_rate": 0.0046824832729470385, "loss": 8.0742, "step": 276500 }, { "epoch": 1.126807612673123, "grad_norm": 4.031746864318848, "learning_rate": 0.004682241043384192, "loss": 8.0598, "step": 276600 }, { "epoch": 1.1272149906965043, "grad_norm": 3.8960773944854736, "learning_rate": 0.0046819987277433636, "loss": 8.0928, "step": 276700 }, { "epoch": 1.1276223687198859, "grad_norm": 4.482669830322266, "learning_rate": 0.004681756326034138, "loss": 8.0778, "step": 276800 }, { "epoch": 1.1280297467432674, "grad_norm": 2.868711471557617, "learning_rate": 0.004681513838266088, "loss": 8.0551, "step": 276900 }, { "epoch": 1.1284371247666487, "grad_norm": 4.186704635620117, "learning_rate": 0.00468127126444881, "loss": 8.1249, "step": 277000 }, { "epoch": 1.1284371247666487, "eval_MaskedAccuracy": 0.4834509396531186, "eval_loss": 1.724225640296936, "eval_runtime": 171.7546, "eval_samples_per_second": 369.574, "eval_steps_per_second": 1.444, "step": 277000 }, { "epoch": 1.1288445027900302, "grad_norm": 3.7420201301574707, "learning_rate": 0.004681028604591884, "loss": 8.1495, "step": 277100 }, { "epoch": 1.1292518808134118, "grad_norm": 4.511409759521484, "learning_rate": 0.00468078585870491, "loss": 8.0943, "step": 277200 }, { "epoch": 1.129659258836793, "grad_norm": 5.29677152633667, "learning_rate": 0.00468054302679748, "loss": 8.0926, "step": 277300 }, { "epoch": 1.1300666368601746, "grad_norm": 4.133951663970947, "learning_rate": 0.0046803001088792035, "loss": 8.0612, "step": 277400 }, { "epoch": 1.1304740148835561, "grad_norm": 4.754375457763672, "learning_rate": 0.004680057104959683, "loss": 8.0477, "step": 277500 }, { "epoch": 1.1308813929069375, "grad_norm": 5.114566326141357, "learning_rate": 0.00467981401504852, "loss": 8.0638, "step": 277600 }, { "epoch": 1.131288770930319, "grad_norm": 4.64804744720459, "learning_rate": 0.004679570839155321, "loss": 8.0741, "step": 277700 }, { "epoch": 1.1316961489537005, "grad_norm": 2.923494815826416, "learning_rate": 0.004679327577289709, "loss": 8.1535, "step": 277800 }, { "epoch": 1.1321035269770818, "grad_norm": 3.0124878883361816, "learning_rate": 0.004679084229461299, "loss": 8.1356, "step": 277900 }, { "epoch": 1.1325109050004634, "grad_norm": 4.786769866943359, "learning_rate": 0.004678840795679713, "loss": 8.119, "step": 278000 }, { "epoch": 1.1325109050004634, "eval_MaskedAccuracy": 0.4894147655588561, "eval_loss": 1.7013311386108398, "eval_runtime": 233.4884, "eval_samples_per_second": 271.859, "eval_steps_per_second": 1.062, "step": 278000 }, { "epoch": 1.132918283023845, "grad_norm": 5.455703258514404, "learning_rate": 0.004678597275954576, "loss": 8.0748, "step": 278100 }, { "epoch": 1.1333256610472264, "grad_norm": 5.843703746795654, "learning_rate": 0.004678353670295513, "loss": 8.1148, "step": 278200 }, { "epoch": 1.1337330390706077, "grad_norm": 2.1924307346343994, "learning_rate": 0.004678109978712153, "loss": 8.1491, "step": 278300 }, { "epoch": 1.1341404170939893, "grad_norm": 5.719590663909912, "learning_rate": 0.0046778662012141314, "loss": 8.1309, "step": 278400 }, { "epoch": 1.1345477951173706, "grad_norm": 5.64662504196167, "learning_rate": 0.0046776223378110875, "loss": 8.1755, "step": 278500 }, { "epoch": 1.1349551731407521, "grad_norm": 4.763120651245117, "learning_rate": 0.00467737838851267, "loss": 8.2121, "step": 278600 }, { "epoch": 1.1353625511641336, "grad_norm": 4.780452251434326, "learning_rate": 0.004677134353328508, "loss": 8.1014, "step": 278700 }, { "epoch": 1.1357699291875152, "grad_norm": 4.963366985321045, "learning_rate": 0.004676890232268258, "loss": 8.0473, "step": 278800 }, { "epoch": 1.1361773072108965, "grad_norm": 3.391356945037842, "learning_rate": 0.004676646025341571, "loss": 8.0508, "step": 278900 }, { "epoch": 1.136584685234278, "grad_norm": 5.09703254699707, "learning_rate": 0.004676401732558105, "loss": 8.0855, "step": 279000 }, { "epoch": 1.136584685234278, "eval_MaskedAccuracy": 0.4898350904715383, "eval_loss": 1.6970337629318237, "eval_runtime": 182.909, "eval_samples_per_second": 347.036, "eval_steps_per_second": 1.356, "step": 279000 }, { "epoch": 1.1369920632576596, "grad_norm": 4.194087982177734, "learning_rate": 0.0046761573539275065, "loss": 8.0722, "step": 279100 }, { "epoch": 1.1373994412810409, "grad_norm": 3.768462657928467, "learning_rate": 0.00467591288945945, "loss": 8.1768, "step": 279200 }, { "epoch": 1.1378068193044224, "grad_norm": 2.8276965618133545, "learning_rate": 0.004675668339163595, "loss": 8.1721, "step": 279300 }, { "epoch": 1.138214197327804, "grad_norm": 3.0509397983551025, "learning_rate": 0.004675423703049611, "loss": 8.1377, "step": 279400 }, { "epoch": 1.1386215753511852, "grad_norm": 1.7160394191741943, "learning_rate": 0.004675178981127179, "loss": 8.1548, "step": 279500 }, { "epoch": 1.1390289533745668, "grad_norm": 2.2624850273132324, "learning_rate": 0.004674934173405958, "loss": 8.1776, "step": 279600 }, { "epoch": 1.1394363313979483, "grad_norm": 7.251711368560791, "learning_rate": 0.004674689279895635, "loss": 8.1354, "step": 279700 }, { "epoch": 1.1398437094213296, "grad_norm": 5.023645401000977, "learning_rate": 0.00467444430060589, "loss": 8.2043, "step": 279800 }, { "epoch": 1.1402510874447112, "grad_norm": 3.3399031162261963, "learning_rate": 0.004674199235546409, "loss": 8.0932, "step": 279900 }, { "epoch": 1.1406584654680927, "grad_norm": 3.627291202545166, "learning_rate": 0.0046739540847268776, "loss": 8.065, "step": 280000 }, { "epoch": 1.1406584654680927, "eval_MaskedAccuracy": 0.4889675875917004, "eval_loss": 1.7060195207595825, "eval_runtime": 209.0401, "eval_samples_per_second": 303.655, "eval_steps_per_second": 1.186, "step": 280000 }, { "epoch": 1.141065843491474, "grad_norm": 5.373180866241455, "learning_rate": 0.00467370884815699, "loss": 8.091, "step": 280100 }, { "epoch": 1.1414732215148555, "grad_norm": 6.052729606628418, "learning_rate": 0.004673463525846452, "loss": 8.0751, "step": 280200 }, { "epoch": 1.141880599538237, "grad_norm": 3.9188849925994873, "learning_rate": 0.004673218117804952, "loss": 8.1804, "step": 280300 }, { "epoch": 1.1422879775616184, "grad_norm": 4.439460754394531, "learning_rate": 0.004672972624042189, "loss": 8.1365, "step": 280400 }, { "epoch": 1.142695355585, "grad_norm": 7.058625221252441, "learning_rate": 0.004672727044567876, "loss": 8.1494, "step": 280500 }, { "epoch": 1.1431027336083814, "grad_norm": 3.8022348880767822, "learning_rate": 0.004672481379391715, "loss": 8.1378, "step": 280600 }, { "epoch": 1.143510111631763, "grad_norm": 3.5311572551727295, "learning_rate": 0.004672235628523426, "loss": 8.1137, "step": 280700 }, { "epoch": 1.1439174896551443, "grad_norm": 4.319153785705566, "learning_rate": 0.004671989791972717, "loss": 8.0777, "step": 280800 }, { "epoch": 1.1443248676785258, "grad_norm": 4.833874225616455, "learning_rate": 0.004671743869749315, "loss": 8.1024, "step": 280900 }, { "epoch": 1.1447322457019071, "grad_norm": 3.737802743911743, "learning_rate": 0.004671497861862941, "loss": 8.078, "step": 281000 }, { "epoch": 1.1447322457019071, "eval_MaskedAccuracy": 0.4893886676306348, "eval_loss": 1.7081918716430664, "eval_runtime": 181.4112, "eval_samples_per_second": 349.901, "eval_steps_per_second": 1.367, "step": 281000 }, { "epoch": 1.1451396237252887, "grad_norm": 2.8301610946655273, "learning_rate": 0.004671251768323314, "loss": 8.0738, "step": 281100 }, { "epoch": 1.1455470017486702, "grad_norm": 4.914058208465576, "learning_rate": 0.004671005589140168, "loss": 8.0886, "step": 281200 }, { "epoch": 1.1459543797720517, "grad_norm": 3.273712158203125, "learning_rate": 0.004670759324323236, "loss": 8.0935, "step": 281300 }, { "epoch": 1.146361757795433, "grad_norm": 4.916646480560303, "learning_rate": 0.004670512973882251, "loss": 8.0889, "step": 281400 }, { "epoch": 1.1467691358188146, "grad_norm": 7.305778980255127, "learning_rate": 0.00467026653782696, "loss": 8.1171, "step": 281500 }, { "epoch": 1.147176513842196, "grad_norm": 2.4786717891693115, "learning_rate": 0.004670020016167105, "loss": 8.2327, "step": 281600 }, { "epoch": 1.1475838918655774, "grad_norm": 3.4263861179351807, "learning_rate": 0.004669773408912425, "loss": 8.2002, "step": 281700 }, { "epoch": 1.147991269888959, "grad_norm": 4.081550121307373, "learning_rate": 0.0046695267160726745, "loss": 8.1621, "step": 281800 }, { "epoch": 1.1483986479123405, "grad_norm": 2.9435956478118896, "learning_rate": 0.004669279937657605, "loss": 8.0995, "step": 281900 }, { "epoch": 1.1488060259357218, "grad_norm": 3.1086764335632324, "learning_rate": 0.0046690330736769755, "loss": 8.0852, "step": 282000 }, { "epoch": 1.1488060259357218, "eval_MaskedAccuracy": 0.48949397088198715, "eval_loss": 1.698317527770996, "eval_runtime": 171.0137, "eval_samples_per_second": 371.175, "eval_steps_per_second": 1.45, "step": 282000 }, { "epoch": 1.1492134039591033, "grad_norm": 4.455804347991943, "learning_rate": 0.004668786124140548, "loss": 8.0668, "step": 282100 }, { "epoch": 1.1496207819824849, "grad_norm": 8.845037460327148, "learning_rate": 0.0046685390890580806, "loss": 8.0871, "step": 282200 }, { "epoch": 1.1500281600058662, "grad_norm": 2.697097063064575, "learning_rate": 0.004668291968439348, "loss": 8.1008, "step": 282300 }, { "epoch": 1.1504355380292477, "grad_norm": 1.5756800174713135, "learning_rate": 0.004668044762294112, "loss": 8.0934, "step": 282400 }, { "epoch": 1.1508429160526292, "grad_norm": 3.879124641418457, "learning_rate": 0.004667797470632144, "loss": 8.1046, "step": 282500 }, { "epoch": 1.1512502940760105, "grad_norm": 3.2476515769958496, "learning_rate": 0.004667550093463231, "loss": 8.0288, "step": 282600 }, { "epoch": 1.151657672099392, "grad_norm": 3.167145252227783, "learning_rate": 0.004667302630797136, "loss": 8.0793, "step": 282700 }, { "epoch": 1.1520650501227736, "grad_norm": 6.005479335784912, "learning_rate": 0.004667055082643657, "loss": 8.0578, "step": 282800 }, { "epoch": 1.152472428146155, "grad_norm": 3.0901143550872803, "learning_rate": 0.00466680744901258, "loss": 8.1376, "step": 282900 }, { "epoch": 1.1528798061695364, "grad_norm": 0.9978356957435608, "learning_rate": 0.004666559729913692, "loss": 8.2153, "step": 283000 }, { "epoch": 1.1528798061695364, "eval_MaskedAccuracy": 0.4848038331343619, "eval_loss": 1.721642255783081, "eval_runtime": 235.129, "eval_samples_per_second": 269.962, "eval_steps_per_second": 1.055, "step": 283000 }, { "epoch": 1.153287184192918, "grad_norm": 3.1643245220184326, "learning_rate": 0.004666311925356788, "loss": 8.1531, "step": 283100 }, { "epoch": 1.1536945622162995, "grad_norm": 0.6623817682266235, "learning_rate": 0.004666064035351659, "loss": 8.1229, "step": 283200 }, { "epoch": 1.1541019402396808, "grad_norm": 4.834157943725586, "learning_rate": 0.0046658160599081155, "loss": 8.1745, "step": 283300 }, { "epoch": 1.1545093182630624, "grad_norm": 2.770371437072754, "learning_rate": 0.004665567999035959, "loss": 8.1868, "step": 283400 }, { "epoch": 1.1549166962864437, "grad_norm": 2.7883384227752686, "learning_rate": 0.004665319852744992, "loss": 8.0956, "step": 283500 }, { "epoch": 1.1553240743098252, "grad_norm": 3.587266445159912, "learning_rate": 0.004665071621045019, "loss": 8.1426, "step": 283600 }, { "epoch": 1.1557314523332067, "grad_norm": 2.944342851638794, "learning_rate": 0.004664823303945865, "loss": 8.1905, "step": 283700 }, { "epoch": 1.1561388303565883, "grad_norm": 4.455312252044678, "learning_rate": 0.004664574901457347, "loss": 8.0936, "step": 283800 }, { "epoch": 1.1565462083799696, "grad_norm": 2.765490770339966, "learning_rate": 0.004664326413589277, "loss": 8.0715, "step": 283900 }, { "epoch": 1.156953586403351, "grad_norm": 5.531642436981201, "learning_rate": 0.004664077840351492, "loss": 8.0301, "step": 284000 }, { "epoch": 1.156953586403351, "eval_MaskedAccuracy": 0.49008495701906885, "eval_loss": 1.6991403102874756, "eval_runtime": 226.8438, "eval_samples_per_second": 279.822, "eval_steps_per_second": 1.093, "step": 284000 }, { "epoch": 1.1573609644267326, "grad_norm": 3.5490705966949463, "learning_rate": 0.004663829181753806, "loss": 8.0236, "step": 284100 }, { "epoch": 1.157768342450114, "grad_norm": 3.0921506881713867, "learning_rate": 0.004663580437806063, "loss": 8.0636, "step": 284200 }, { "epoch": 1.1581757204734955, "grad_norm": 1.8780794143676758, "learning_rate": 0.0046633316085180915, "loss": 8.0648, "step": 284300 }, { "epoch": 1.158583098496877, "grad_norm": 3.8255105018615723, "learning_rate": 0.004663082693899725, "loss": 8.0325, "step": 284400 }, { "epoch": 1.1589904765202583, "grad_norm": 5.340209007263184, "learning_rate": 0.0046628336939608115, "loss": 8.057, "step": 284500 }, { "epoch": 1.1593978545436399, "grad_norm": 3.661881446838379, "learning_rate": 0.004662584608711187, "loss": 8.0247, "step": 284600 }, { "epoch": 1.1598052325670214, "grad_norm": 4.300080299377441, "learning_rate": 0.0046623354381607005, "loss": 8.0536, "step": 284700 }, { "epoch": 1.1602126105904027, "grad_norm": 5.313004493713379, "learning_rate": 0.004662086182319206, "loss": 8.0564, "step": 284800 }, { "epoch": 1.1606199886137842, "grad_norm": 3.9499144554138184, "learning_rate": 0.004661836841196555, "loss": 8.0147, "step": 284900 }, { "epoch": 1.1610273666371658, "grad_norm": 3.613180637359619, "learning_rate": 0.004661587414802613, "loss": 8.0632, "step": 285000 }, { "epoch": 1.1610273666371658, "eval_MaskedAccuracy": 0.4901985433871838, "eval_loss": 1.706134557723999, "eval_runtime": 176.7895, "eval_samples_per_second": 359.048, "eval_steps_per_second": 1.403, "step": 285000 }, { "epoch": 1.161434744660547, "grad_norm": 3.91060471534729, "learning_rate": 0.004661337903147235, "loss": 8.0707, "step": 285100 }, { "epoch": 1.1618421226839286, "grad_norm": 1.3036532402038574, "learning_rate": 0.0046610883062402925, "loss": 8.1888, "step": 285200 }, { "epoch": 1.1622495007073101, "grad_norm": 0.8332093954086304, "learning_rate": 0.004660838624091639, "loss": 8.1748, "step": 285300 }, { "epoch": 1.1626568787306915, "grad_norm": 2.4648735523223877, "learning_rate": 0.004660588856711164, "loss": 8.193, "step": 285400 }, { "epoch": 1.163064256754073, "grad_norm": 2.6603598594665527, "learning_rate": 0.004660339004108723, "loss": 8.2083, "step": 285500 }, { "epoch": 1.1634716347774545, "grad_norm": 2.0850071907043457, "learning_rate": 0.00466008906629421, "loss": 8.1916, "step": 285600 }, { "epoch": 1.163879012800836, "grad_norm": 5.8866424560546875, "learning_rate": 0.004659839043277496, "loss": 8.1063, "step": 285700 }, { "epoch": 1.1642863908242174, "grad_norm": 4.506791591644287, "learning_rate": 0.004659588935068473, "loss": 8.0866, "step": 285800 }, { "epoch": 1.164693768847599, "grad_norm": 4.221245765686035, "learning_rate": 0.004659338741677024, "loss": 8.0499, "step": 285900 }, { "epoch": 1.1651011468709802, "grad_norm": 6.313796520233154, "learning_rate": 0.004659088463113041, "loss": 8.0061, "step": 286000 }, { "epoch": 1.1651011468709802, "eval_MaskedAccuracy": 0.4906126527733975, "eval_loss": 1.6945469379425049, "eval_runtime": 158.8981, "eval_samples_per_second": 399.476, "eval_steps_per_second": 1.561, "step": 286000 }, { "epoch": 1.1655085248943617, "grad_norm": 3.875453472137451, "learning_rate": 0.004658838099386422, "loss": 8.0584, "step": 286100 }, { "epoch": 1.1659159029177433, "grad_norm": 3.0014939308166504, "learning_rate": 0.004658587650507059, "loss": 8.0632, "step": 286200 }, { "epoch": 1.1663232809411248, "grad_norm": 2.730990171432495, "learning_rate": 0.004658337116484863, "loss": 7.9925, "step": 286300 }, { "epoch": 1.1667306589645061, "grad_norm": 4.797238349914551, "learning_rate": 0.00465808649732973, "loss": 8.036, "step": 286400 }, { "epoch": 1.1671380369878877, "grad_norm": 4.819352626800537, "learning_rate": 0.004657835793051574, "loss": 8.0317, "step": 286500 }, { "epoch": 1.1675454150112692, "grad_norm": 1.2718093395233154, "learning_rate": 0.00465758500366031, "loss": 8.0646, "step": 286600 }, { "epoch": 1.1679527930346505, "grad_norm": 2.9197137355804443, "learning_rate": 0.004657334129165845, "loss": 8.1509, "step": 286700 }, { "epoch": 1.168360171058032, "grad_norm": 11.091897964477539, "learning_rate": 0.0046570831695780975, "loss": 8.1928, "step": 286800 }, { "epoch": 1.1687675490814136, "grad_norm": 4.137915134429932, "learning_rate": 0.004656832124906996, "loss": 8.1989, "step": 286900 }, { "epoch": 1.1691749271047949, "grad_norm": 4.835000514984131, "learning_rate": 0.0046565809951624616, "loss": 8.0976, "step": 287000 }, { "epoch": 1.1691749271047949, "eval_MaskedAccuracy": 0.48856667125114417, "eval_loss": 1.700318694114685, "eval_runtime": 179.8052, "eval_samples_per_second": 353.026, "eval_steps_per_second": 1.379, "step": 287000 }, { "epoch": 1.1695823051281764, "grad_norm": 3.653043508529663, "learning_rate": 0.004656329780354424, "loss": 8.0464, "step": 287100 }, { "epoch": 1.169989683151558, "grad_norm": 4.344825267791748, "learning_rate": 0.004656078480492809, "loss": 8.0559, "step": 287200 }, { "epoch": 1.1703970611749392, "grad_norm": 4.034629821777344, "learning_rate": 0.004655827095587565, "loss": 8.0518, "step": 287300 }, { "epoch": 1.1708044391983208, "grad_norm": 1.9681981801986694, "learning_rate": 0.004655575625648619, "loss": 8.0562, "step": 287400 }, { "epoch": 1.1712118172217023, "grad_norm": 5.370085716247559, "learning_rate": 0.004655324070685913, "loss": 8.0785, "step": 287500 }, { "epoch": 1.1716191952450836, "grad_norm": 13.195560455322266, "learning_rate": 0.004655072430709397, "loss": 8.0701, "step": 287600 }, { "epoch": 1.1720265732684652, "grad_norm": 4.355969429016113, "learning_rate": 0.004654820705729013, "loss": 8.1519, "step": 287700 }, { "epoch": 1.1724339512918467, "grad_norm": 4.580450534820557, "learning_rate": 0.004654568895754724, "loss": 8.0764, "step": 287800 }, { "epoch": 1.172841329315228, "grad_norm": 4.757116794586182, "learning_rate": 0.004654317000796484, "loss": 8.0572, "step": 287900 }, { "epoch": 1.1732487073386095, "grad_norm": 2.296961545944214, "learning_rate": 0.0046540650208642355, "loss": 8.1, "step": 288000 }, { "epoch": 1.1732487073386095, "eval_MaskedAccuracy": 0.48794103054963767, "eval_loss": 1.70279061794281, "eval_runtime": 227.6202, "eval_samples_per_second": 278.868, "eval_steps_per_second": 1.09, "step": 288000 }, { "epoch": 1.173656085361991, "grad_norm": 2.192230224609375, "learning_rate": 0.004653812955967959, "loss": 8.1132, "step": 288100 }, { "epoch": 1.1740634633853726, "grad_norm": 3.585665464401245, "learning_rate": 0.004653560806117613, "loss": 8.1349, "step": 288200 }, { "epoch": 1.174470841408754, "grad_norm": 4.043099880218506, "learning_rate": 0.004653308571323171, "loss": 8.1289, "step": 288300 }, { "epoch": 1.1748782194321354, "grad_norm": 3.1525979042053223, "learning_rate": 0.004653056251594601, "loss": 8.2137, "step": 288400 }, { "epoch": 1.1752855974555168, "grad_norm": 3.7244136333465576, "learning_rate": 0.00465280384694188, "loss": 8.1575, "step": 288500 }, { "epoch": 1.1756929754788983, "grad_norm": 4.780149459838867, "learning_rate": 0.004652551357374978, "loss": 8.0906, "step": 288600 }, { "epoch": 1.1761003535022798, "grad_norm": 3.728311061859131, "learning_rate": 0.0046522987829038884, "loss": 8.0462, "step": 288700 }, { "epoch": 1.1765077315256613, "grad_norm": 4.469501972198486, "learning_rate": 0.004652046123538593, "loss": 8.0955, "step": 288800 }, { "epoch": 1.1769151095490427, "grad_norm": 3.7230305671691895, "learning_rate": 0.004651793379289087, "loss": 8.0194, "step": 288900 }, { "epoch": 1.1773224875724242, "grad_norm": 4.39764404296875, "learning_rate": 0.004651540550165351, "loss": 8.051, "step": 289000 }, { "epoch": 1.1773224875724242, "eval_MaskedAccuracy": 0.49087479124984285, "eval_loss": 1.6909769773483276, "eval_runtime": 558.9363, "eval_samples_per_second": 113.566, "eval_steps_per_second": 0.444, "step": 289000 }, { "epoch": 1.1777298655958057, "grad_norm": 6.782959461212158, "learning_rate": 0.0046512876361773856, "loss": 8.032, "step": 289100 }, { "epoch": 1.178137243619187, "grad_norm": 1.3018677234649658, "learning_rate": 0.004651034637335197, "loss": 8.174, "step": 289200 }, { "epoch": 1.1785446216425686, "grad_norm": 1.7693558931350708, "learning_rate": 0.00465078155364878, "loss": 8.1965, "step": 289300 }, { "epoch": 1.17895199966595, "grad_norm": 4.082480430603027, "learning_rate": 0.004650528385128143, "loss": 8.1515, "step": 289400 }, { "epoch": 1.1793593776893314, "grad_norm": 4.21183967590332, "learning_rate": 0.0046502751317832915, "loss": 8.0705, "step": 289500 }, { "epoch": 1.179766755712713, "grad_norm": 3.432459831237793, "learning_rate": 0.004650021793624242, "loss": 8.0865, "step": 289600 }, { "epoch": 1.1801741337360945, "grad_norm": 4.862122535705566, "learning_rate": 0.004649768370661001, "loss": 8.0546, "step": 289700 }, { "epoch": 1.1805815117594758, "grad_norm": 4.301804542541504, "learning_rate": 0.004649514862903603, "loss": 8.0604, "step": 289800 }, { "epoch": 1.1809888897828573, "grad_norm": 3.298346757888794, "learning_rate": 0.004649261270362061, "loss": 8.0628, "step": 289900 }, { "epoch": 1.1813962678062389, "grad_norm": 6.476545333862305, "learning_rate": 0.004649007593046405, "loss": 8.1626, "step": 290000 }, { "epoch": 1.1813962678062389, "eval_MaskedAccuracy": 0.4817633937202652, "eval_loss": 1.730674386024475, "eval_runtime": 160.0866, "eval_samples_per_second": 396.51, "eval_steps_per_second": 1.549, "step": 290000 }, { "epoch": 1.1818036458296202, "grad_norm": 4.453090667724609, "learning_rate": 0.00464875383096666, "loss": 8.1244, "step": 290100 }, { "epoch": 1.1822110238530017, "grad_norm": 5.203738212585449, "learning_rate": 0.00464849998413286, "loss": 8.1195, "step": 290200 }, { "epoch": 1.1826184018763832, "grad_norm": 5.900394916534424, "learning_rate": 0.004648246052555043, "loss": 8.1849, "step": 290300 }, { "epoch": 1.1830257798997645, "grad_norm": 1.8650020360946655, "learning_rate": 0.004647992036243241, "loss": 8.184, "step": 290400 }, { "epoch": 1.183433157923146, "grad_norm": 2.952779769897461, "learning_rate": 0.004647737935207507, "loss": 8.1314, "step": 290500 }, { "epoch": 1.1838405359465276, "grad_norm": 1.1895891427993774, "learning_rate": 0.004647483749457883, "loss": 8.0934, "step": 290600 }, { "epoch": 1.1842479139699091, "grad_norm": 2.0459725856781006, "learning_rate": 0.004647229479004416, "loss": 8.1721, "step": 290700 }, { "epoch": 1.1846552919932904, "grad_norm": 3.5183563232421875, "learning_rate": 0.004646975123857159, "loss": 8.1091, "step": 290800 }, { "epoch": 1.185062670016672, "grad_norm": 1.6890475749969482, "learning_rate": 0.004646720684026171, "loss": 8.1511, "step": 290900 }, { "epoch": 1.1854700480400533, "grad_norm": 3.2172505855560303, "learning_rate": 0.004646466159521512, "loss": 8.0632, "step": 291000 }, { "epoch": 1.1854700480400533, "eval_MaskedAccuracy": 0.4890016386016322, "eval_loss": 1.703794002532959, "eval_runtime": 264.6484, "eval_samples_per_second": 239.85, "eval_steps_per_second": 0.937, "step": 291000 }, { "epoch": 1.1858774260634348, "grad_norm": 1.6402572393417358, "learning_rate": 0.00464621155035324, "loss": 8.1184, "step": 291100 }, { "epoch": 1.1862848040868164, "grad_norm": 3.0767862796783447, "learning_rate": 0.004645956856531417, "loss": 8.1399, "step": 291200 }, { "epoch": 1.186692182110198, "grad_norm": 3.245995283126831, "learning_rate": 0.004645702078066124, "loss": 8.0841, "step": 291300 }, { "epoch": 1.1870995601335792, "grad_norm": 4.78672981262207, "learning_rate": 0.004645447214967429, "loss": 8.0896, "step": 291400 }, { "epoch": 1.1875069381569607, "grad_norm": 6.897592544555664, "learning_rate": 0.0046451922672454075, "loss": 8.0885, "step": 291500 }, { "epoch": 1.1879143161803423, "grad_norm": 2.3824174404144287, "learning_rate": 0.004644937234910144, "loss": 8.1196, "step": 291600 }, { "epoch": 1.1883216942037236, "grad_norm": 5.0155863761901855, "learning_rate": 0.00464468211797171, "loss": 8.0814, "step": 291700 }, { "epoch": 1.188729072227105, "grad_norm": 1.3886222839355469, "learning_rate": 0.0046444269164402, "loss": 8.1123, "step": 291800 }, { "epoch": 1.1891364502504866, "grad_norm": 4.824828624725342, "learning_rate": 0.004644171630325706, "loss": 8.1269, "step": 291900 }, { "epoch": 1.189543828273868, "grad_norm": 1.4126006364822388, "learning_rate": 0.004643916259638313, "loss": 8.1427, "step": 292000 }, { "epoch": 1.189543828273868, "eval_MaskedAccuracy": 0.48855525663268745, "eval_loss": 1.6854230165481567, "eval_runtime": 178.3745, "eval_samples_per_second": 355.858, "eval_steps_per_second": 1.39, "step": 292000 }, { "epoch": 1.1899512062972495, "grad_norm": 2.1228461265563965, "learning_rate": 0.004643660804388119, "loss": 8.1371, "step": 292100 }, { "epoch": 1.190358584320631, "grad_norm": 0.6336086988449097, "learning_rate": 0.004643405264585224, "loss": 8.1392, "step": 292200 }, { "epoch": 1.1907659623440123, "grad_norm": 0.878674328327179, "learning_rate": 0.004643149640239736, "loss": 8.1752, "step": 292300 }, { "epoch": 1.1911733403673939, "grad_norm": 5.786048889160156, "learning_rate": 0.004642893931361753, "loss": 8.1784, "step": 292400 }, { "epoch": 1.1915807183907754, "grad_norm": 6.48675012588501, "learning_rate": 0.004642638137961386, "loss": 8.1513, "step": 292500 }, { "epoch": 1.1919880964141567, "grad_norm": 4.116359233856201, "learning_rate": 0.004642382260048754, "loss": 8.1237, "step": 292600 }, { "epoch": 1.1923954744375382, "grad_norm": 5.0022406578063965, "learning_rate": 0.004642126297633973, "loss": 8.0614, "step": 292700 }, { "epoch": 1.1928028524609198, "grad_norm": 4.177118301391602, "learning_rate": 0.004641870250727155, "loss": 8.0497, "step": 292800 }, { "epoch": 1.193210230484301, "grad_norm": 3.184532642364502, "learning_rate": 0.004641614119338434, "loss": 8.0297, "step": 292900 }, { "epoch": 1.1936176085076826, "grad_norm": 3.5414209365844727, "learning_rate": 0.004641357903477921, "loss": 8.0677, "step": 293000 }, { "epoch": 1.1936176085076826, "eval_MaskedAccuracy": 0.4917024225587292, "eval_loss": 1.6840522289276123, "eval_runtime": 525.4173, "eval_samples_per_second": 120.811, "eval_steps_per_second": 0.472, "step": 293000 }, { "epoch": 1.1940249865310641, "grad_norm": 4.260696887969971, "learning_rate": 0.004641101603155761, "loss": 8.0335, "step": 293100 }, { "epoch": 1.1944323645544457, "grad_norm": 6.748197078704834, "learning_rate": 0.004640845218382074, "loss": 8.0371, "step": 293200 }, { "epoch": 1.194839742577827, "grad_norm": 8.22083568572998, "learning_rate": 0.004640588749167006, "loss": 8.0992, "step": 293300 }, { "epoch": 1.1952471206012085, "grad_norm": 2.970147132873535, "learning_rate": 0.004640332195520685, "loss": 8.1274, "step": 293400 }, { "epoch": 1.1956544986245898, "grad_norm": 1.9968116283416748, "learning_rate": 0.0046400755574532646, "loss": 8.083, "step": 293500 }, { "epoch": 1.1960618766479714, "grad_norm": 3.9696598052978516, "learning_rate": 0.004639818834974893, "loss": 8.043, "step": 293600 }, { "epoch": 1.196469254671353, "grad_norm": 3.1897449493408203, "learning_rate": 0.004639562028095717, "loss": 8.0187, "step": 293700 }, { "epoch": 1.1968766326947344, "grad_norm": 2.5749704837799072, "learning_rate": 0.004639305136825884, "loss": 8.0338, "step": 293800 }, { "epoch": 1.1972840107181157, "grad_norm": 5.313751697540283, "learning_rate": 0.004639048161175557, "loss": 8.0858, "step": 293900 }, { "epoch": 1.1976913887414973, "grad_norm": 4.109214782714844, "learning_rate": 0.004638791101154886, "loss": 8.1647, "step": 294000 }, { "epoch": 1.1976913887414973, "eval_MaskedAccuracy": 0.4850518393627376, "eval_loss": 1.7173192501068115, "eval_runtime": 186.1655, "eval_samples_per_second": 340.965, "eval_steps_per_second": 1.332, "step": 294000 }, { "epoch": 1.1980987667648788, "grad_norm": 2.07083797454834, "learning_rate": 0.004638533956774044, "loss": 8.1601, "step": 294100 }, { "epoch": 1.1985061447882601, "grad_norm": 0.9491326808929443, "learning_rate": 0.004638276728043195, "loss": 8.1223, "step": 294200 }, { "epoch": 1.1989135228116417, "grad_norm": 1.581750750541687, "learning_rate": 0.0046380194149725044, "loss": 8.1301, "step": 294300 }, { "epoch": 1.1993209008350232, "grad_norm": 3.208538293838501, "learning_rate": 0.004637762017572147, "loss": 8.1662, "step": 294400 }, { "epoch": 1.1997282788584045, "grad_norm": 2.7052595615386963, "learning_rate": 0.004637504535852303, "loss": 8.0273, "step": 294500 }, { "epoch": 1.200135656881786, "grad_norm": 8.370946884155273, "learning_rate": 0.004637246969823157, "loss": 8.1228, "step": 294600 }, { "epoch": 1.2005430349051676, "grad_norm": 2.6555542945861816, "learning_rate": 0.004636989319494877, "loss": 8.1628, "step": 294700 }, { "epoch": 1.2009504129285489, "grad_norm": 1.5750937461853027, "learning_rate": 0.0046367315848776566, "loss": 8.1815, "step": 294800 }, { "epoch": 1.2013577909519304, "grad_norm": 4.36472749710083, "learning_rate": 0.004636473765981692, "loss": 8.1676, "step": 294900 }, { "epoch": 1.201765168975312, "grad_norm": 1.2535489797592163, "learning_rate": 0.00463621586281716, "loss": 8.1392, "step": 295000 }, { "epoch": 1.201765168975312, "eval_MaskedAccuracy": 0.48718782684736733, "eval_loss": 1.709524154663086, "eval_runtime": 187.0772, "eval_samples_per_second": 339.304, "eval_steps_per_second": 1.326, "step": 295000 }, { "epoch": 1.2021725469986932, "grad_norm": 2.1562795639038086, "learning_rate": 0.00463595787539427, "loss": 8.1355, "step": 295100 }, { "epoch": 1.2025799250220748, "grad_norm": 2.3468427658081055, "learning_rate": 0.004635699803723219, "loss": 8.107, "step": 295200 }, { "epoch": 1.2029873030454563, "grad_norm": 1.1948546171188354, "learning_rate": 0.004635441647814208, "loss": 8.1078, "step": 295300 }, { "epoch": 1.2033946810688376, "grad_norm": 5.46122407913208, "learning_rate": 0.00463518340767744, "loss": 8.1465, "step": 295400 }, { "epoch": 1.2038020590922192, "grad_norm": 3.8204262256622314, "learning_rate": 0.004634925083323132, "loss": 8.0802, "step": 295500 }, { "epoch": 1.2042094371156007, "grad_norm": 3.1001439094543457, "learning_rate": 0.004634666674761497, "loss": 8.0605, "step": 295600 }, { "epoch": 1.2046168151389822, "grad_norm": 4.485762596130371, "learning_rate": 0.004634408182002743, "loss": 8.0497, "step": 295700 }, { "epoch": 1.2050241931623635, "grad_norm": 0.7458304762840271, "learning_rate": 0.004634149605057095, "loss": 8.0634, "step": 295800 }, { "epoch": 1.205431571185745, "grad_norm": 3.6596946716308594, "learning_rate": 0.004633890943934777, "loss": 8.125, "step": 295900 }, { "epoch": 1.2058389492091264, "grad_norm": 1.1765960454940796, "learning_rate": 0.004633632198646016, "loss": 8.1403, "step": 296000 }, { "epoch": 1.2058389492091264, "eval_MaskedAccuracy": 0.4867809973571029, "eval_loss": 1.7117823362350464, "eval_runtime": 165.8442, "eval_samples_per_second": 382.745, "eval_steps_per_second": 1.495, "step": 296000 }, { "epoch": 1.206246327232508, "grad_norm": 1.9596821069717407, "learning_rate": 0.004633373369201036, "loss": 8.1842, "step": 296100 }, { "epoch": 1.2066537052558894, "grad_norm": 2.8808438777923584, "learning_rate": 0.004633114455610075, "loss": 8.2099, "step": 296200 }, { "epoch": 1.207061083279271, "grad_norm": 5.307520389556885, "learning_rate": 0.004632855457883367, "loss": 8.0813, "step": 296300 }, { "epoch": 1.2074684613026523, "grad_norm": 0.6345957517623901, "learning_rate": 0.004632596376031153, "loss": 8.136, "step": 296400 }, { "epoch": 1.2078758393260338, "grad_norm": 1.8907524347305298, "learning_rate": 0.004632337210063677, "loss": 8.1015, "step": 296500 }, { "epoch": 1.2082832173494154, "grad_norm": 4.043900966644287, "learning_rate": 0.00463207795999118, "loss": 8.1081, "step": 296600 }, { "epoch": 1.2086905953727967, "grad_norm": 5.490545749664307, "learning_rate": 0.004631818625823919, "loss": 8.1222, "step": 296700 }, { "epoch": 1.2090979733961782, "grad_norm": 4.196577548980713, "learning_rate": 0.004631559207572141, "loss": 8.0542, "step": 296800 }, { "epoch": 1.2095053514195597, "grad_norm": 3.866391658782959, "learning_rate": 0.004631299705246102, "loss": 8.0695, "step": 296900 }, { "epoch": 1.209912729442941, "grad_norm": 0.752380907535553, "learning_rate": 0.0046310401188560685, "loss": 8.0092, "step": 297000 }, { "epoch": 1.209912729442941, "eval_MaskedAccuracy": 0.4891799310806632, "eval_loss": 1.6924179792404175, "eval_runtime": 171.3721, "eval_samples_per_second": 370.399, "eval_steps_per_second": 1.447, "step": 297000 }, { "epoch": 1.2103201074663226, "grad_norm": 3.039321184158325, "learning_rate": 0.004630780448412297, "loss": 8.1151, "step": 297100 }, { "epoch": 1.210727485489704, "grad_norm": 1.9718246459960938, "learning_rate": 0.004630520693925053, "loss": 8.1389, "step": 297200 }, { "epoch": 1.2111348635130854, "grad_norm": 0.9533228278160095, "learning_rate": 0.004630260855404601, "loss": 8.139, "step": 297300 }, { "epoch": 1.211542241536467, "grad_norm": 3.9541590213775635, "learning_rate": 0.004630000932861223, "loss": 8.16, "step": 297400 }, { "epoch": 1.2119496195598485, "grad_norm": 2.471520185470581, "learning_rate": 0.004629740926305191, "loss": 8.179, "step": 297500 }, { "epoch": 1.2123569975832298, "grad_norm": 4.447343826293945, "learning_rate": 0.00462948083574679, "loss": 8.0903, "step": 297600 }, { "epoch": 1.2127643756066113, "grad_norm": 4.4200758934021, "learning_rate": 0.004629220661196297, "loss": 8.0701, "step": 297700 }, { "epoch": 1.2131717536299929, "grad_norm": 3.8681282997131348, "learning_rate": 0.004628960402664008, "loss": 8.0591, "step": 297800 }, { "epoch": 1.2135791316533742, "grad_norm": 4.3446197509765625, "learning_rate": 0.004628700060160198, "loss": 8.0233, "step": 297900 }, { "epoch": 1.2139865096767557, "grad_norm": 8.378327369689941, "learning_rate": 0.004628439633695168, "loss": 8.0572, "step": 298000 }, { "epoch": 1.2139865096767557, "eval_MaskedAccuracy": 0.4878226944878192, "eval_loss": 1.7110786437988281, "eval_runtime": 226.902, "eval_samples_per_second": 279.751, "eval_steps_per_second": 1.093, "step": 298000 }, { "epoch": 1.2143938877001372, "grad_norm": 3.936922311782837, "learning_rate": 0.004628179123279213, "loss": 8.0832, "step": 298100 }, { "epoch": 1.2148012657235188, "grad_norm": 3.7519657611846924, "learning_rate": 0.004627918528922626, "loss": 8.1479, "step": 298200 }, { "epoch": 1.2152086437469, "grad_norm": 2.771986246109009, "learning_rate": 0.004627657850635724, "loss": 8.18, "step": 298300 }, { "epoch": 1.2156160217702816, "grad_norm": 1.1955420970916748, "learning_rate": 0.0046273970884288034, "loss": 8.1782, "step": 298400 }, { "epoch": 1.216023399793663, "grad_norm": 2.453226089477539, "learning_rate": 0.004627136242312175, "loss": 8.1279, "step": 298500 }, { "epoch": 1.2164307778170445, "grad_norm": 2.992370367050171, "learning_rate": 0.0046268753122961515, "loss": 8.0821, "step": 298600 }, { "epoch": 1.216838155840426, "grad_norm": 5.242353439331055, "learning_rate": 0.004626614298391044, "loss": 8.1247, "step": 298700 }, { "epoch": 1.2172455338638075, "grad_norm": 1.5426554679870605, "learning_rate": 0.00462635320060717, "loss": 8.0914, "step": 298800 }, { "epoch": 1.2176529118871888, "grad_norm": 5.316330909729004, "learning_rate": 0.004626092018954864, "loss": 8.051, "step": 298900 }, { "epoch": 1.2180602899105704, "grad_norm": 3.0452542304992676, "learning_rate": 0.004625830753444443, "loss": 8.0352, "step": 299000 }, { "epoch": 1.2180602899105704, "eval_MaskedAccuracy": 0.49069970591165174, "eval_loss": 1.6971735954284668, "eval_runtime": 168.9447, "eval_samples_per_second": 375.721, "eval_steps_per_second": 1.468, "step": 299000 }, { "epoch": 1.218467667933952, "grad_norm": 3.606816053390503, "learning_rate": 0.004625569404086243, "loss": 8.0061, "step": 299100 }, { "epoch": 1.2188750459573332, "grad_norm": 4.931297302246094, "learning_rate": 0.00462530797089059, "loss": 8.0126, "step": 299200 }, { "epoch": 1.2192824239807147, "grad_norm": 5.3326873779296875, "learning_rate": 0.0046250464538678225, "loss": 7.9827, "step": 299300 }, { "epoch": 1.2196898020040963, "grad_norm": 14.773650169372559, "learning_rate": 0.0046247848530282815, "loss": 8.1116, "step": 299400 }, { "epoch": 1.2200971800274776, "grad_norm": 4.419100761413574, "learning_rate": 0.004624523168382312, "loss": 8.1343, "step": 299500 }, { "epoch": 1.2205045580508591, "grad_norm": 2.470949649810791, "learning_rate": 0.0046242613999402556, "loss": 8.1014, "step": 299600 }, { "epoch": 1.2209119360742406, "grad_norm": 1.0886467695236206, "learning_rate": 0.004623999547712456, "loss": 8.0791, "step": 299700 }, { "epoch": 1.221319314097622, "grad_norm": 1.0229309797286987, "learning_rate": 0.0046237376117092745, "loss": 8.1222, "step": 299800 }, { "epoch": 1.2217266921210035, "grad_norm": 3.866816759109497, "learning_rate": 0.0046234755919410545, "loss": 8.1587, "step": 299900 }, { "epoch": 1.222134070144385, "grad_norm": 5.334160327911377, "learning_rate": 0.00462321348841817, "loss": 8.0816, "step": 300000 }, { "epoch": 1.222134070144385, "eval_MaskedAccuracy": 0.4896748310121843, "eval_loss": 1.706262230873108, "eval_runtime": 195.5812, "eval_samples_per_second": 324.551, "eval_steps_per_second": 1.268, "step": 300000 }, { "epoch": 1.2225414481677663, "grad_norm": 3.508280038833618, "learning_rate": 0.004622951301150977, "loss": 8.0479, "step": 300100 }, { "epoch": 1.2229488261911479, "grad_norm": 3.168687343597412, "learning_rate": 0.004622689030149839, "loss": 8.0469, "step": 300200 }, { "epoch": 1.2233562042145294, "grad_norm": 2.397977352142334, "learning_rate": 0.004622426675425136, "loss": 8.1201, "step": 300300 }, { "epoch": 1.2237635822379107, "grad_norm": 2.019606113433838, "learning_rate": 0.004622164236987225, "loss": 8.1686, "step": 300400 }, { "epoch": 1.2241709602612922, "grad_norm": 2.7145845890045166, "learning_rate": 0.004621901714846492, "loss": 8.0884, "step": 300500 }, { "epoch": 1.2245783382846738, "grad_norm": 2.8610126972198486, "learning_rate": 0.004621639109013311, "loss": 8.0361, "step": 300600 }, { "epoch": 1.2249857163080553, "grad_norm": 4.2266435623168945, "learning_rate": 0.004621376419498069, "loss": 8.0563, "step": 300700 }, { "epoch": 1.2253930943314366, "grad_norm": 4.127247333526611, "learning_rate": 0.004621113646311145, "loss": 8.0616, "step": 300800 }, { "epoch": 1.2258004723548181, "grad_norm": 2.9968063831329346, "learning_rate": 0.0046208507894629275, "loss": 8.0561, "step": 300900 }, { "epoch": 1.2262078503781995, "grad_norm": 5.844603538513184, "learning_rate": 0.004620587848963815, "loss": 8.0466, "step": 301000 }, { "epoch": 1.2262078503781995, "eval_MaskedAccuracy": 0.4911941245461271, "eval_loss": 1.6918003559112549, "eval_runtime": 154.9656, "eval_samples_per_second": 409.613, "eval_steps_per_second": 1.6, "step": 301000 }, { "epoch": 1.226615228401581, "grad_norm": 4.7555060386657715, "learning_rate": 0.004620324824824195, "loss": 8.0266, "step": 301100 }, { "epoch": 1.2270226064249625, "grad_norm": 3.523132801055908, "learning_rate": 0.00462006171705448, "loss": 8.0484, "step": 301200 }, { "epoch": 1.227429984448344, "grad_norm": 1.8597171306610107, "learning_rate": 0.004619798525665065, "loss": 8.0113, "step": 301300 }, { "epoch": 1.2278373624717254, "grad_norm": 8.603140830993652, "learning_rate": 0.004619535250666346, "loss": 8.1079, "step": 301400 }, { "epoch": 1.228244740495107, "grad_norm": 6.364404201507568, "learning_rate": 0.004619271892068739, "loss": 8.1175, "step": 301500 }, { "epoch": 1.2286521185184884, "grad_norm": 2.263643503189087, "learning_rate": 0.0046190084498826545, "loss": 8.1233, "step": 301600 }, { "epoch": 1.2290594965418697, "grad_norm": 0.9943809509277344, "learning_rate": 0.004618744924118511, "loss": 8.11, "step": 301700 }, { "epoch": 1.2294668745652513, "grad_norm": 1.856001853942871, "learning_rate": 0.004618481314786725, "loss": 8.131, "step": 301800 }, { "epoch": 1.2298742525886328, "grad_norm": 3.258477210998535, "learning_rate": 0.004618217621897724, "loss": 8.1214, "step": 301900 }, { "epoch": 1.2302816306120141, "grad_norm": 1.1988614797592163, "learning_rate": 0.0046179538454619255, "loss": 8.0814, "step": 302000 }, { "epoch": 1.2302816306120141, "eval_MaskedAccuracy": 0.48597322669550147, "eval_loss": 1.714728593826294, "eval_runtime": 179.9806, "eval_samples_per_second": 352.682, "eval_steps_per_second": 1.378, "step": 302000 }, { "epoch": 1.2306890086353957, "grad_norm": 2.050732374191284, "learning_rate": 0.00461768998548976, "loss": 8.108, "step": 302100 }, { "epoch": 1.2310963866587772, "grad_norm": 3.5282022953033447, "learning_rate": 0.00461742604199166, "loss": 8.0959, "step": 302200 }, { "epoch": 1.2315037646821585, "grad_norm": 3.6474194526672363, "learning_rate": 0.004617162014978059, "loss": 8.0581, "step": 302300 }, { "epoch": 1.23191114270554, "grad_norm": 5.273232460021973, "learning_rate": 0.004616897904459396, "loss": 8.0409, "step": 302400 }, { "epoch": 1.2323185207289216, "grad_norm": 5.792709827423096, "learning_rate": 0.0046166337104461205, "loss": 8.0111, "step": 302500 }, { "epoch": 1.2327258987523029, "grad_norm": 4.636826038360596, "learning_rate": 0.004616369432948664, "loss": 8.083, "step": 302600 }, { "epoch": 1.2331332767756844, "grad_norm": 2.228703022003174, "learning_rate": 0.004616105071977488, "loss": 8.0733, "step": 302700 }, { "epoch": 1.233540654799066, "grad_norm": 2.250886917114258, "learning_rate": 0.004615840627543034, "loss": 8.1061, "step": 302800 }, { "epoch": 1.2339480328224472, "grad_norm": 2.698251962661743, "learning_rate": 0.004615576099655761, "loss": 8.1026, "step": 302900 }, { "epoch": 1.2343554108458288, "grad_norm": 3.854100227355957, "learning_rate": 0.004615311488326126, "loss": 8.0714, "step": 303000 }, { "epoch": 1.2343554108458288, "eval_MaskedAccuracy": 0.4881060796935515, "eval_loss": 1.6926175355911255, "eval_runtime": 155.3429, "eval_samples_per_second": 408.619, "eval_steps_per_second": 1.596, "step": 303000 }, { "epoch": 1.2347627888692103, "grad_norm": 2.0090067386627197, "learning_rate": 0.004615046793564592, "loss": 8.0752, "step": 303100 }, { "epoch": 1.2351701668925918, "grad_norm": 2.734537363052368, "learning_rate": 0.004614782015381621, "loss": 8.0923, "step": 303200 }, { "epoch": 1.2355775449159732, "grad_norm": 15.769610404968262, "learning_rate": 0.0046145171537876825, "loss": 8.119, "step": 303300 }, { "epoch": 1.2359849229393547, "grad_norm": 3.1276133060455322, "learning_rate": 0.00461425220879325, "loss": 8.14, "step": 303400 }, { "epoch": 1.236392300962736, "grad_norm": 5.444284915924072, "learning_rate": 0.004613987180408806, "loss": 8.1267, "step": 303500 }, { "epoch": 1.2367996789861175, "grad_norm": 5.557637691497803, "learning_rate": 0.0046137220686448075, "loss": 8.1384, "step": 303600 }, { "epoch": 1.237207057009499, "grad_norm": 1.9772955179214478, "learning_rate": 0.004613456873511747, "loss": 8.0422, "step": 303700 }, { "epoch": 1.2376144350328806, "grad_norm": 3.7055411338806152, "learning_rate": 0.004613191595020113, "loss": 8.1321, "step": 303800 }, { "epoch": 1.238021813056262, "grad_norm": 3.4126334190368652, "learning_rate": 0.004612926233180385, "loss": 8.1559, "step": 303900 }, { "epoch": 1.2384291910796434, "grad_norm": 4.730792045593262, "learning_rate": 0.004612660788003063, "loss": 8.0798, "step": 304000 }, { "epoch": 1.2384291910796434, "eval_MaskedAccuracy": 0.48978617182119905, "eval_loss": 1.7045177221298218, "eval_runtime": 193.263, "eval_samples_per_second": 328.444, "eval_steps_per_second": 1.283, "step": 304000 }, { "epoch": 1.238836569103025, "grad_norm": 1.8504356145858765, "learning_rate": 0.004612395259498635, "loss": 8.0589, "step": 304100 }, { "epoch": 1.2392439471264063, "grad_norm": 3.34515118598938, "learning_rate": 0.004612129647677609, "loss": 8.1066, "step": 304200 }, { "epoch": 1.2396513251497878, "grad_norm": 1.174383282661438, "learning_rate": 0.004611863952550479, "loss": 8.0642, "step": 304300 }, { "epoch": 1.2400587031731694, "grad_norm": 5.638028621673584, "learning_rate": 0.004611598174127743, "loss": 8.1238, "step": 304400 }, { "epoch": 1.2404660811965507, "grad_norm": 2.1906986236572266, "learning_rate": 0.004611332312419916, "loss": 8.1202, "step": 304500 }, { "epoch": 1.2408734592199322, "grad_norm": 3.808988332748413, "learning_rate": 0.004611066367437503, "loss": 8.1227, "step": 304600 }, { "epoch": 1.2412808372433137, "grad_norm": 1.7970826625823975, "learning_rate": 0.004610800339191022, "loss": 8.0841, "step": 304700 }, { "epoch": 1.241688215266695, "grad_norm": 1.9682625532150269, "learning_rate": 0.004610534227690988, "loss": 8.1265, "step": 304800 }, { "epoch": 1.2420955932900766, "grad_norm": 5.152673244476318, "learning_rate": 0.004610268032947929, "loss": 8.0917, "step": 304900 }, { "epoch": 1.242502971313458, "grad_norm": 0.8323341012001038, "learning_rate": 0.004610001754972367, "loss": 8.133, "step": 305000 }, { "epoch": 1.242502971313458, "eval_MaskedAccuracy": 0.48686285156172704, "eval_loss": 1.7085505723953247, "eval_runtime": 161.2778, "eval_samples_per_second": 393.582, "eval_steps_per_second": 1.538, "step": 305000 }, { "epoch": 1.2429103493368394, "grad_norm": 2.9382216930389404, "learning_rate": 0.004609735393774827, "loss": 8.1191, "step": 305100 }, { "epoch": 1.243317727360221, "grad_norm": 2.5187878608703613, "learning_rate": 0.004609468949365838, "loss": 8.1362, "step": 305200 }, { "epoch": 1.2437251053836025, "grad_norm": 4.52233362197876, "learning_rate": 0.00460920242175594, "loss": 8.1266, "step": 305300 }, { "epoch": 1.2441324834069838, "grad_norm": 5.111374855041504, "learning_rate": 0.004608935810955663, "loss": 8.1116, "step": 305400 }, { "epoch": 1.2445398614303653, "grad_norm": 1.2241135835647583, "learning_rate": 0.004608669116975555, "loss": 8.0831, "step": 305500 }, { "epoch": 1.2449472394537469, "grad_norm": 2.461332321166992, "learning_rate": 0.004608402339826152, "loss": 8.0891, "step": 305600 }, { "epoch": 1.2453546174771284, "grad_norm": 6.2857208251953125, "learning_rate": 0.0046081354795180045, "loss": 8.0607, "step": 305700 }, { "epoch": 1.2457619955005097, "grad_norm": 1.3130388259887695, "learning_rate": 0.004607868536061661, "loss": 8.1015, "step": 305800 }, { "epoch": 1.2461693735238912, "grad_norm": 1.7209711074829102, "learning_rate": 0.00460760150946768, "loss": 8.0705, "step": 305900 }, { "epoch": 1.2465767515472725, "grad_norm": 3.74379301071167, "learning_rate": 0.00460733439974661, "loss": 8.1352, "step": 306000 }, { "epoch": 1.2465767515472725, "eval_MaskedAccuracy": 0.4850375812619579, "eval_loss": 1.7195402383804321, "eval_runtime": 164.8649, "eval_samples_per_second": 385.018, "eval_steps_per_second": 1.504, "step": 306000 }, { "epoch": 1.246984129570654, "grad_norm": 3.218581199645996, "learning_rate": 0.004607067206909027, "loss": 8.1156, "step": 306100 }, { "epoch": 1.2473915075940356, "grad_norm": 2.4101579189300537, "learning_rate": 0.004606799930965481, "loss": 8.093, "step": 306200 }, { "epoch": 1.2477988856174171, "grad_norm": 3.4861109256744385, "learning_rate": 0.004606532571926543, "loss": 8.1189, "step": 306300 }, { "epoch": 1.2482062636407985, "grad_norm": 4.77168083190918, "learning_rate": 0.004606265129802781, "loss": 8.1026, "step": 306400 }, { "epoch": 1.24861364166418, "grad_norm": 0.6300526261329651, "learning_rate": 0.004605997604604765, "loss": 8.1177, "step": 306500 }, { "epoch": 1.2490210196875615, "grad_norm": 1.2264113426208496, "learning_rate": 0.0046057299963430855, "loss": 8.1051, "step": 306600 }, { "epoch": 1.2494283977109428, "grad_norm": 2.0417797565460205, "learning_rate": 0.0046054623050283084, "loss": 8.0656, "step": 306700 }, { "epoch": 1.2498357757343244, "grad_norm": 2.3916094303131104, "learning_rate": 0.0046051945306710305, "loss": 8.1169, "step": 306800 }, { "epoch": 1.250243153757706, "grad_norm": 6.253011226654053, "learning_rate": 0.004604926673281828, "loss": 8.0892, "step": 306900 }, { "epoch": 1.2506505317810872, "grad_norm": 2.1163041591644287, "learning_rate": 0.004604658732871291, "loss": 8.1115, "step": 307000 }, { "epoch": 1.2506505317810872, "eval_MaskedAccuracy": 0.48697341643848935, "eval_loss": 1.7105181217193604, "eval_runtime": 196.2697, "eval_samples_per_second": 323.412, "eval_steps_per_second": 1.264, "step": 307000 }, { "epoch": 1.2510579098044687, "grad_norm": 2.6035571098327637, "learning_rate": 0.004604390709450008, "loss": 8.1378, "step": 307100 }, { "epoch": 1.2514652878278503, "grad_norm": 5.0959248542785645, "learning_rate": 0.004604122603028583, "loss": 8.134, "step": 307200 }, { "epoch": 1.2518726658512316, "grad_norm": 2.674523115158081, "learning_rate": 0.004603854413617617, "loss": 8.0982, "step": 307300 }, { "epoch": 1.2522800438746131, "grad_norm": 1.7357358932495117, "learning_rate": 0.00460358614122771, "loss": 8.0884, "step": 307400 }, { "epoch": 1.2526874218979946, "grad_norm": 4.28224515914917, "learning_rate": 0.00460331778586947, "loss": 8.1212, "step": 307500 }, { "epoch": 1.2530947999213762, "grad_norm": 3.870980978012085, "learning_rate": 0.004603049347553501, "loss": 8.1202, "step": 307600 }, { "epoch": 1.2535021779447575, "grad_norm": 2.9729363918304443, "learning_rate": 0.0046027808262904275, "loss": 8.109, "step": 307700 }, { "epoch": 1.253909555968139, "grad_norm": 3.985447645187378, "learning_rate": 0.004602512222090859, "loss": 8.0672, "step": 307800 }, { "epoch": 1.2543169339915203, "grad_norm": 7.9126505851745605, "learning_rate": 0.0046022435349654086, "loss": 8.0686, "step": 307900 }, { "epoch": 1.2547243120149019, "grad_norm": 3.8987011909484863, "learning_rate": 0.00460197476492471, "loss": 8.1029, "step": 308000 }, { "epoch": 1.2547243120149019, "eval_MaskedAccuracy": 0.4900233862981613, "eval_loss": 1.7020424604415894, "eval_runtime": 187.7355, "eval_samples_per_second": 338.114, "eval_steps_per_second": 1.321, "step": 308000 }, { "epoch": 1.2551316900382834, "grad_norm": 3.613825559616089, "learning_rate": 0.004601705911979381, "loss": 8.0801, "step": 308100 }, { "epoch": 1.255539068061665, "grad_norm": 1.4558838605880737, "learning_rate": 0.004601436976140047, "loss": 8.1215, "step": 308200 }, { "epoch": 1.2559464460850462, "grad_norm": 7.969110488891602, "learning_rate": 0.004601167957417355, "loss": 8.1459, "step": 308300 }, { "epoch": 1.2563538241084278, "grad_norm": 5.945659160614014, "learning_rate": 0.004600898855821931, "loss": 8.108, "step": 308400 }, { "epoch": 1.256761202131809, "grad_norm": 2.5267322063446045, "learning_rate": 0.004600629671364405, "loss": 8.0766, "step": 308500 }, { "epoch": 1.2571685801551906, "grad_norm": 2.464092493057251, "learning_rate": 0.0046003604040554405, "loss": 8.1208, "step": 308600 }, { "epoch": 1.2575759581785722, "grad_norm": 3.6171298027038574, "learning_rate": 0.004600091053905665, "loss": 8.1159, "step": 308700 }, { "epoch": 1.2579833362019537, "grad_norm": 3.425246477127075, "learning_rate": 0.004599821620925738, "loss": 8.1051, "step": 308800 }, { "epoch": 1.258390714225335, "grad_norm": 2.6326539516448975, "learning_rate": 0.0045995521051263154, "loss": 8.0713, "step": 308900 }, { "epoch": 1.2587980922487165, "grad_norm": 3.3350260257720947, "learning_rate": 0.004599282506518043, "loss": 8.0321, "step": 309000 }, { "epoch": 1.2587980922487165, "eval_MaskedAccuracy": 0.4913877225593287, "eval_loss": 1.684166431427002, "eval_runtime": 175.0026, "eval_samples_per_second": 362.715, "eval_steps_per_second": 1.417, "step": 309000 }, { "epoch": 1.2592054702720978, "grad_norm": 4.280453681945801, "learning_rate": 0.004599012825111581, "loss": 8.055, "step": 309100 }, { "epoch": 1.2596128482954794, "grad_norm": 2.7680320739746094, "learning_rate": 0.004598743060917593, "loss": 8.1195, "step": 309200 }, { "epoch": 1.260020226318861, "grad_norm": 1.9460463523864746, "learning_rate": 0.004598473213946741, "loss": 8.0987, "step": 309300 }, { "epoch": 1.2604276043422424, "grad_norm": 1.9638036489486694, "learning_rate": 0.0045982032842097, "loss": 8.0552, "step": 309400 }, { "epoch": 1.2608349823656237, "grad_norm": 3.210782289505005, "learning_rate": 0.004597933271717138, "loss": 8.0821, "step": 309500 }, { "epoch": 1.2612423603890053, "grad_norm": 4.542831897735596, "learning_rate": 0.004597663176479727, "loss": 8.0303, "step": 309600 }, { "epoch": 1.2616497384123868, "grad_norm": 4.670233726501465, "learning_rate": 0.004597392998508145, "loss": 8.0116, "step": 309700 }, { "epoch": 1.2620571164357681, "grad_norm": 2.5082669258117676, "learning_rate": 0.00459712273781308, "loss": 8.0405, "step": 309800 }, { "epoch": 1.2624644944591497, "grad_norm": 2.2756545543670654, "learning_rate": 0.004596852394405212, "loss": 7.9797, "step": 309900 }, { "epoch": 1.2628718724825312, "grad_norm": 2.471325159072876, "learning_rate": 0.004596581968295235, "loss": 8.0666, "step": 310000 }, { "epoch": 1.2628718724825312, "eval_MaskedAccuracy": 0.4880205762552702, "eval_loss": 1.6943151950836182, "eval_runtime": 160.5475, "eval_samples_per_second": 395.372, "eval_steps_per_second": 1.545, "step": 310000 }, { "epoch": 1.2632792505059127, "grad_norm": 2.194175958633423, "learning_rate": 0.004596311459493836, "loss": 8.0875, "step": 310100 }, { "epoch": 1.263686628529294, "grad_norm": 2.4820425510406494, "learning_rate": 0.00459604086801171, "loss": 8.1294, "step": 310200 }, { "epoch": 1.2640940065526756, "grad_norm": 0.9932268857955933, "learning_rate": 0.004595770193859551, "loss": 8.0846, "step": 310300 }, { "epoch": 1.2645013845760569, "grad_norm": 5.9561381340026855, "learning_rate": 0.004595499437048069, "loss": 8.145, "step": 310400 }, { "epoch": 1.2649087625994384, "grad_norm": 4.604516983032227, "learning_rate": 0.0045952285975879555, "loss": 8.098, "step": 310500 }, { "epoch": 1.26531614062282, "grad_norm": 4.713510513305664, "learning_rate": 0.004594957675489932, "loss": 8.1127, "step": 310600 }, { "epoch": 1.2657235186462015, "grad_norm": 0.9178376793861389, "learning_rate": 0.004594686670764699, "loss": 8.1282, "step": 310700 }, { "epoch": 1.2661308966695828, "grad_norm": 1.6705297231674194, "learning_rate": 0.004594415583422981, "loss": 8.1677, "step": 310800 }, { "epoch": 1.2665382746929643, "grad_norm": 5.615497589111328, "learning_rate": 0.004594144413475491, "loss": 8.1144, "step": 310900 }, { "epoch": 1.2669456527163456, "grad_norm": 3.187138319015503, "learning_rate": 0.0045938731609329415, "loss": 8.0699, "step": 311000 }, { "epoch": 1.2669456527163456, "eval_MaskedAccuracy": 0.4898853434538884, "eval_loss": 1.6914721727371216, "eval_runtime": 168.9928, "eval_samples_per_second": 375.614, "eval_steps_per_second": 1.468, "step": 311000 }, { "epoch": 1.2673530307397272, "grad_norm": 3.255194902420044, "learning_rate": 0.004593601825806063, "loss": 8.0868, "step": 311100 }, { "epoch": 1.2677604087631087, "grad_norm": 4.374510288238525, "learning_rate": 0.004593330408105596, "loss": 8.1538, "step": 311200 }, { "epoch": 1.2681677867864902, "grad_norm": 3.63806414604187, "learning_rate": 0.004593058907842255, "loss": 8.1031, "step": 311300 }, { "epoch": 1.2685751648098715, "grad_norm": 2.7120907306671143, "learning_rate": 0.0045927873250267755, "loss": 8.0774, "step": 311400 }, { "epoch": 1.268982542833253, "grad_norm": 4.321403503417969, "learning_rate": 0.004592515659669892, "loss": 8.0922, "step": 311500 }, { "epoch": 1.2693899208566344, "grad_norm": 2.5028560161590576, "learning_rate": 0.004592243911782355, "loss": 8.1523, "step": 311600 }, { "epoch": 1.269797298880016, "grad_norm": 3.913832426071167, "learning_rate": 0.004591972081374904, "loss": 8.0677, "step": 311700 }, { "epoch": 1.2702046769033974, "grad_norm": 2.202146530151367, "learning_rate": 0.004591700168458283, "loss": 8.0916, "step": 311800 }, { "epoch": 1.270612054926779, "grad_norm": 1.159952163696289, "learning_rate": 0.004591428173043244, "loss": 8.0811, "step": 311900 }, { "epoch": 1.2710194329501603, "grad_norm": 2.1494171619415283, "learning_rate": 0.0045911560951405386, "loss": 8.1072, "step": 312000 }, { "epoch": 1.2710194329501603, "eval_MaskedAccuracy": 0.48879268615180604, "eval_loss": 1.6904964447021484, "eval_runtime": 164.6769, "eval_samples_per_second": 385.458, "eval_steps_per_second": 1.506, "step": 312000 }, { "epoch": 1.2714268109735418, "grad_norm": 3.1282689571380615, "learning_rate": 0.004590883934760925, "loss": 8.0268, "step": 312100 }, { "epoch": 1.2718341889969234, "grad_norm": 2.633990526199341, "learning_rate": 0.004590611691915167, "loss": 8.0739, "step": 312200 }, { "epoch": 1.2722415670203047, "grad_norm": 2.560723066329956, "learning_rate": 0.004590339366614029, "loss": 8.0921, "step": 312300 }, { "epoch": 1.2726489450436862, "grad_norm": 5.123629093170166, "learning_rate": 0.004590066958868263, "loss": 8.0859, "step": 312400 }, { "epoch": 1.2730563230670677, "grad_norm": 5.334412097930908, "learning_rate": 0.004589794468688652, "loss": 8.0871, "step": 312500 }, { "epoch": 1.2734637010904493, "grad_norm": 4.66272497177124, "learning_rate": 0.004589521896085961, "loss": 8.0924, "step": 312600 }, { "epoch": 1.2738710791138306, "grad_norm": 2.968400716781616, "learning_rate": 0.004589249241070973, "loss": 8.0869, "step": 312700 }, { "epoch": 1.274278457137212, "grad_norm": 4.876562118530273, "learning_rate": 0.004588976503654471, "loss": 8.03, "step": 312800 }, { "epoch": 1.2746858351605934, "grad_norm": 3.0390806198120117, "learning_rate": 0.004588703683847226, "loss": 8.0704, "step": 312900 }, { "epoch": 1.275093213183975, "grad_norm": 0.9052464962005615, "learning_rate": 0.004588430781660034, "loss": 8.0532, "step": 313000 }, { "epoch": 1.275093213183975, "eval_MaskedAccuracy": 0.4882918460567083, "eval_loss": 1.7067179679870605, "eval_runtime": 183.803, "eval_samples_per_second": 345.348, "eval_steps_per_second": 1.349, "step": 313000 }, { "epoch": 1.2755005912073565, "grad_norm": 1.3546417951583862, "learning_rate": 0.004588157797103682, "loss": 8.0895, "step": 313100 }, { "epoch": 1.275907969230738, "grad_norm": 1.6751666069030762, "learning_rate": 0.004587884730188957, "loss": 8.0547, "step": 313200 }, { "epoch": 1.2763153472541193, "grad_norm": 3.803537130355835, "learning_rate": 0.004587611580926658, "loss": 8.1219, "step": 313300 }, { "epoch": 1.2767227252775009, "grad_norm": 1.501133680343628, "learning_rate": 0.0045873383493275815, "loss": 8.0677, "step": 313400 }, { "epoch": 1.2771301033008822, "grad_norm": 2.1493144035339355, "learning_rate": 0.004587065035402535, "loss": 8.0835, "step": 313500 }, { "epoch": 1.2775374813242637, "grad_norm": 4.274430751800537, "learning_rate": 0.004586791639162321, "loss": 8.0656, "step": 313600 }, { "epoch": 1.2779448593476452, "grad_norm": 4.489757061004639, "learning_rate": 0.0045865181606177495, "loss": 8.0331, "step": 313700 }, { "epoch": 1.2783522373710268, "grad_norm": 4.5486016273498535, "learning_rate": 0.004586244599779637, "loss": 7.9875, "step": 313800 }, { "epoch": 1.278759615394408, "grad_norm": 3.658928394317627, "learning_rate": 0.004585970956658794, "loss": 8.0497, "step": 313900 }, { "epoch": 1.2791669934177896, "grad_norm": 1.7318885326385498, "learning_rate": 0.004585697231266038, "loss": 8.0245, "step": 314000 }, { "epoch": 1.2791669934177896, "eval_MaskedAccuracy": 0.4921710905561849, "eval_loss": 1.6829789876937866, "eval_runtime": 178.2729, "eval_samples_per_second": 356.061, "eval_steps_per_second": 1.391, "step": 314000 }, { "epoch": 1.279574371441171, "grad_norm": 4.326584815979004, "learning_rate": 0.00458542342361219, "loss": 8.0065, "step": 314100 }, { "epoch": 1.2799817494645525, "grad_norm": 5.520540714263916, "learning_rate": 0.0045851495337080805, "loss": 8.033, "step": 314200 }, { "epoch": 1.280389127487934, "grad_norm": 3.209029197692871, "learning_rate": 0.004584875561564537, "loss": 8.0821, "step": 314300 }, { "epoch": 1.2807965055113155, "grad_norm": 5.361316680908203, "learning_rate": 0.004584601507192383, "loss": 8.0513, "step": 314400 }, { "epoch": 1.2812038835346968, "grad_norm": 0.8805705904960632, "learning_rate": 0.004584327370602461, "loss": 8.091, "step": 314500 }, { "epoch": 1.2816112615580784, "grad_norm": 3.406642436981201, "learning_rate": 0.004584053151805604, "loss": 8.1275, "step": 314600 }, { "epoch": 1.28201863958146, "grad_norm": 3.8101279735565186, "learning_rate": 0.00458377885081265, "loss": 8.0898, "step": 314700 }, { "epoch": 1.2824260176048412, "grad_norm": 2.9716594219207764, "learning_rate": 0.0045835044676344585, "loss": 8.0317, "step": 314800 }, { "epoch": 1.2828333956282227, "grad_norm": 7.393836498260498, "learning_rate": 0.004583230002281866, "loss": 8.0202, "step": 314900 }, { "epoch": 1.2832407736516043, "grad_norm": 2.1396756172180176, "learning_rate": 0.0045829554547657265, "loss": 8.1281, "step": 315000 }, { "epoch": 1.2832407736516043, "eval_MaskedAccuracy": 0.4878806756258978, "eval_loss": 1.7071367502212524, "eval_runtime": 238.8459, "eval_samples_per_second": 265.761, "eval_steps_per_second": 1.038, "step": 315000 }, { "epoch": 1.2836481516749856, "grad_norm": 4.4804816246032715, "learning_rate": 0.004582680825096893, "loss": 8.1365, "step": 315100 }, { "epoch": 1.2840555296983671, "grad_norm": 2.1836588382720947, "learning_rate": 0.004582406113286226, "loss": 8.0849, "step": 315200 }, { "epoch": 1.2844629077217486, "grad_norm": 3.1351490020751953, "learning_rate": 0.004582131319344582, "loss": 8.0476, "step": 315300 }, { "epoch": 1.28487028574513, "grad_norm": 1.471785545349121, "learning_rate": 0.0045818564432828345, "loss": 8.0731, "step": 315400 }, { "epoch": 1.2852776637685115, "grad_norm": 3.349703073501587, "learning_rate": 0.004581581485111841, "loss": 8.084, "step": 315500 }, { "epoch": 1.285685041791893, "grad_norm": 2.4420437812805176, "learning_rate": 0.004581306444842476, "loss": 8.037, "step": 315600 }, { "epoch": 1.2860924198152746, "grad_norm": 3.901373863220215, "learning_rate": 0.00458103132248561, "loss": 8.0409, "step": 315700 }, { "epoch": 1.2864997978386559, "grad_norm": 3.417895793914795, "learning_rate": 0.00458075611805213, "loss": 8.0222, "step": 315800 }, { "epoch": 1.2869071758620374, "grad_norm": 4.650550365447998, "learning_rate": 0.004580480831552905, "loss": 8.0042, "step": 315900 }, { "epoch": 1.2873145538854187, "grad_norm": 3.382446050643921, "learning_rate": 0.00458020546299882, "loss": 7.981, "step": 316000 }, { "epoch": 1.2873145538854187, "eval_MaskedAccuracy": 0.49159966625540336, "eval_loss": 1.6857359409332275, "eval_runtime": 236.8735, "eval_samples_per_second": 267.974, "eval_steps_per_second": 1.047, "step": 316000 }, { "epoch": 1.2877219319088002, "grad_norm": 4.8859477043151855, "learning_rate": 0.004579930012400767, "loss": 8.0121, "step": 316100 }, { "epoch": 1.2881293099321818, "grad_norm": 0.9068177342414856, "learning_rate": 0.004579654479769629, "loss": 8.0609, "step": 316200 }, { "epoch": 1.2885366879555633, "grad_norm": 6.070545196533203, "learning_rate": 0.0045793788651163, "loss": 8.089, "step": 316300 }, { "epoch": 1.2889440659789446, "grad_norm": 1.946527361869812, "learning_rate": 0.004579103168451684, "loss": 8.0639, "step": 316400 }, { "epoch": 1.2893514440023262, "grad_norm": 2.040311336517334, "learning_rate": 0.0045788273897866836, "loss": 8.0402, "step": 316500 }, { "epoch": 1.2897588220257075, "grad_norm": 2.201308012008667, "learning_rate": 0.004578551529132186, "loss": 8.0823, "step": 316600 }, { "epoch": 1.290166200049089, "grad_norm": 5.859253406524658, "learning_rate": 0.004578275586499108, "loss": 8.1054, "step": 316700 }, { "epoch": 1.2905735780724705, "grad_norm": 4.747668266296387, "learning_rate": 0.004577999561898358, "loss": 8.0821, "step": 316800 }, { "epoch": 1.290980956095852, "grad_norm": 2.4468894004821777, "learning_rate": 0.004577723455340848, "loss": 8.1113, "step": 316900 }, { "epoch": 1.2913883341192334, "grad_norm": 8.025307655334473, "learning_rate": 0.004577447266837487, "loss": 8.0874, "step": 317000 }, { "epoch": 1.2913883341192334, "eval_MaskedAccuracy": 0.48697882185196195, "eval_loss": 1.7081642150878906, "eval_runtime": 192.8795, "eval_samples_per_second": 329.097, "eval_steps_per_second": 1.286, "step": 317000 }, { "epoch": 1.291795712142615, "grad_norm": 6.981740474700928, "learning_rate": 0.004577170996399209, "loss": 8.1121, "step": 317100 }, { "epoch": 1.2922030901659964, "grad_norm": 2.590914487838745, "learning_rate": 0.004576894644036933, "loss": 8.0975, "step": 317200 }, { "epoch": 1.2926104681893777, "grad_norm": 1.8462945222854614, "learning_rate": 0.004576618209761579, "loss": 8.0449, "step": 317300 }, { "epoch": 1.2930178462127593, "grad_norm": 3.038158416748047, "learning_rate": 0.004576341693584074, "loss": 8.0803, "step": 317400 }, { "epoch": 1.2934252242361408, "grad_norm": 5.122670650482178, "learning_rate": 0.004576065095515352, "loss": 8.041, "step": 317500 }, { "epoch": 1.2938326022595221, "grad_norm": 5.177152633666992, "learning_rate": 0.0045757884155663555, "loss": 8.1004, "step": 317600 }, { "epoch": 1.2942399802829037, "grad_norm": 1.4997923374176025, "learning_rate": 0.004575511653748008, "loss": 8.0695, "step": 317700 }, { "epoch": 1.2946473583062852, "grad_norm": 4.639394283294678, "learning_rate": 0.0045752348100712645, "loss": 8.0935, "step": 317800 }, { "epoch": 1.2950547363296665, "grad_norm": 1.499557614326477, "learning_rate": 0.004574957884547069, "loss": 8.018, "step": 317900 }, { "epoch": 1.295462114353048, "grad_norm": 3.189948558807373, "learning_rate": 0.004574680877186372, "loss": 8.1136, "step": 318000 }, { "epoch": 1.295462114353048, "eval_MaskedAccuracy": 0.4893474880067361, "eval_loss": 1.6934936046600342, "eval_runtime": 197.65, "eval_samples_per_second": 321.153, "eval_steps_per_second": 1.255, "step": 318000 }, { "epoch": 1.2958694923764296, "grad_norm": 1.9850223064422607, "learning_rate": 0.004574403788000111, "loss": 8.0685, "step": 318100 }, { "epoch": 1.296276870399811, "grad_norm": 4.261771202087402, "learning_rate": 0.00457412661699925, "loss": 8.0733, "step": 318200 }, { "epoch": 1.2966842484231924, "grad_norm": 2.2075061798095703, "learning_rate": 0.004573849364194751, "loss": 8.0329, "step": 318300 }, { "epoch": 1.297091626446574, "grad_norm": 1.7161146402359009, "learning_rate": 0.004573572029597568, "loss": 8.0544, "step": 318400 }, { "epoch": 1.2974990044699553, "grad_norm": 3.8380091190338135, "learning_rate": 0.004573294613218673, "loss": 8.0884, "step": 318500 }, { "epoch": 1.2979063824933368, "grad_norm": 2.094470262527466, "learning_rate": 0.004573017115069026, "loss": 8.0765, "step": 318600 }, { "epoch": 1.2983137605167183, "grad_norm": 2.915336847305298, "learning_rate": 0.004572739535159604, "loss": 8.0381, "step": 318700 }, { "epoch": 1.2987211385400999, "grad_norm": 3.584977626800537, "learning_rate": 0.004572461873501381, "loss": 8.0563, "step": 318800 }, { "epoch": 1.2991285165634812, "grad_norm": 3.8318121433258057, "learning_rate": 0.004572184130105332, "loss": 8.0355, "step": 318900 }, { "epoch": 1.2995358945868627, "grad_norm": 4.111202716827393, "learning_rate": 0.004571906304982439, "loss": 8.1071, "step": 319000 }, { "epoch": 1.2995358945868627, "eval_MaskedAccuracy": 0.4893559775239622, "eval_loss": 1.700352668762207, "eval_runtime": 205.5605, "eval_samples_per_second": 308.795, "eval_steps_per_second": 1.206, "step": 319000 }, { "epoch": 1.299943272610244, "grad_norm": 0.9857933521270752, "learning_rate": 0.004571628398143688, "loss": 8.0685, "step": 319100 }, { "epoch": 1.3003506506336255, "grad_norm": 4.459076881408691, "learning_rate": 0.004571350409600061, "loss": 8.121, "step": 319200 }, { "epoch": 1.300758028657007, "grad_norm": 4.374945640563965, "learning_rate": 0.004571072339362558, "loss": 8.0528, "step": 319300 }, { "epoch": 1.3011654066803886, "grad_norm": 1.8823482990264893, "learning_rate": 0.004570794187442155, "loss": 8.1035, "step": 319400 }, { "epoch": 1.30157278470377, "grad_norm": 2.472083568572998, "learning_rate": 0.004570515953849863, "loss": 8.0685, "step": 319500 }, { "epoch": 1.3019801627271514, "grad_norm": 5.041461944580078, "learning_rate": 0.004570237638596677, "loss": 8.0999, "step": 319600 }, { "epoch": 1.302387540750533, "grad_norm": 4.218006610870361, "learning_rate": 0.004569959241693603, "loss": 8.1125, "step": 319700 }, { "epoch": 1.3027949187739143, "grad_norm": 3.5334811210632324, "learning_rate": 0.004569680763151646, "loss": 8.0523, "step": 319800 }, { "epoch": 1.3032022967972958, "grad_norm": 4.296719074249268, "learning_rate": 0.004569402202981819, "loss": 8.0481, "step": 319900 }, { "epoch": 1.3036096748206774, "grad_norm": 2.7426321506500244, "learning_rate": 0.004569123561195129, "loss": 7.9919, "step": 320000 }, { "epoch": 1.3036096748206774, "eval_MaskedAccuracy": 0.4920605854338912, "eval_loss": 1.6915849447250366, "eval_runtime": 177.7127, "eval_samples_per_second": 357.183, "eval_steps_per_second": 1.396, "step": 320000 }, { "epoch": 1.3040170528440587, "grad_norm": 1.2438089847564697, "learning_rate": 0.0045688448378025945, "loss": 8.0546, "step": 320100 }, { "epoch": 1.3044244308674402, "grad_norm": 4.601132869720459, "learning_rate": 0.004568566032815233, "loss": 8.1145, "step": 320200 }, { "epoch": 1.3048318088908217, "grad_norm": 3.7675604820251465, "learning_rate": 0.004568287146244068, "loss": 8.0729, "step": 320300 }, { "epoch": 1.305239186914203, "grad_norm": 2.828606605529785, "learning_rate": 0.004568008178100126, "loss": 8.0348, "step": 320400 }, { "epoch": 1.3056465649375846, "grad_norm": 5.11243200302124, "learning_rate": 0.004567729128394441, "loss": 8.0683, "step": 320500 }, { "epoch": 1.306053942960966, "grad_norm": 4.557095050811768, "learning_rate": 0.004567449997138039, "loss": 8.0726, "step": 320600 }, { "epoch": 1.3064613209843476, "grad_norm": 2.090935707092285, "learning_rate": 0.0045671707843419635, "loss": 8.0992, "step": 320700 }, { "epoch": 1.306868699007729, "grad_norm": 2.7916462421417236, "learning_rate": 0.004566891490017243, "loss": 8.0534, "step": 320800 }, { "epoch": 1.3072760770311105, "grad_norm": 3.4025943279266357, "learning_rate": 0.004566612114174925, "loss": 8.0205, "step": 320900 }, { "epoch": 1.3076834550544918, "grad_norm": 3.246011972427368, "learning_rate": 0.0045663326568260556, "loss": 8.0384, "step": 321000 }, { "epoch": 1.3076834550544918, "eval_MaskedAccuracy": 0.4923900148265971, "eval_loss": 1.6762045621871948, "eval_runtime": 246.8153, "eval_samples_per_second": 257.18, "eval_steps_per_second": 1.005, "step": 321000 }, { "epoch": 1.3080908330778733, "grad_norm": 2.6321635246276855, "learning_rate": 0.0045660531179816805, "loss": 8.0152, "step": 321100 }, { "epoch": 1.3084982111012549, "grad_norm": 2.8856520652770996, "learning_rate": 0.0045657734976528525, "loss": 8.0505, "step": 321200 }, { "epoch": 1.3089055891246364, "grad_norm": 4.007819175720215, "learning_rate": 0.004565493795850623, "loss": 8.0754, "step": 321300 }, { "epoch": 1.3093129671480177, "grad_norm": 4.678389072418213, "learning_rate": 0.004565214012586056, "loss": 8.1385, "step": 321400 }, { "epoch": 1.3097203451713992, "grad_norm": 4.059128284454346, "learning_rate": 0.0045649341478702105, "loss": 8.1046, "step": 321500 }, { "epoch": 1.3101277231947805, "grad_norm": 2.2171413898468018, "learning_rate": 0.004564654201714153, "loss": 8.0373, "step": 321600 }, { "epoch": 1.310535101218162, "grad_norm": 5.1353678703308105, "learning_rate": 0.004564374174128946, "loss": 8.0667, "step": 321700 }, { "epoch": 1.3109424792415436, "grad_norm": 1.2548649311065674, "learning_rate": 0.004564094065125663, "loss": 8.0762, "step": 321800 }, { "epoch": 1.3113498572649251, "grad_norm": 9.941655158996582, "learning_rate": 0.004563813874715376, "loss": 8.0995, "step": 321900 }, { "epoch": 1.3117572352883065, "grad_norm": 3.532829761505127, "learning_rate": 0.004563533602909171, "loss": 8.1438, "step": 322000 }, { "epoch": 1.3117572352883065, "eval_MaskedAccuracy": 0.48833868593461993, "eval_loss": 1.7106949090957642, "eval_runtime": 174.0025, "eval_samples_per_second": 364.799, "eval_steps_per_second": 1.425, "step": 322000 }, { "epoch": 1.312164613311688, "grad_norm": 4.559520721435547, "learning_rate": 0.004563253249718122, "loss": 8.0733, "step": 322100 }, { "epoch": 1.3125719913350695, "grad_norm": 2.4093596935272217, "learning_rate": 0.004562972815153312, "loss": 8.1038, "step": 322200 }, { "epoch": 1.3129793693584508, "grad_norm": 2.1672794818878174, "learning_rate": 0.0045626922992258335, "loss": 8.018, "step": 322300 }, { "epoch": 1.3133867473818324, "grad_norm": 4.177785396575928, "learning_rate": 0.004562411701946769, "loss": 8.0942, "step": 322400 }, { "epoch": 1.313794125405214, "grad_norm": 1.0137476921081543, "learning_rate": 0.00456213102332722, "loss": 8.0877, "step": 322500 }, { "epoch": 1.3142015034285952, "grad_norm": 4.39864444732666, "learning_rate": 0.004561850263378278, "loss": 8.0427, "step": 322600 }, { "epoch": 1.3146088814519767, "grad_norm": 4.880063056945801, "learning_rate": 0.004561569422111041, "loss": 8.1473, "step": 322700 }, { "epoch": 1.3150162594753583, "grad_norm": 0.6708384156227112, "learning_rate": 0.00456128849953662, "loss": 8.0599, "step": 322800 }, { "epoch": 1.3154236374987396, "grad_norm": 3.5477101802825928, "learning_rate": 0.004561007495666116, "loss": 8.0718, "step": 322900 }, { "epoch": 1.3158310155221211, "grad_norm": 1.742142915725708, "learning_rate": 0.004560726410510638, "loss": 8.0656, "step": 323000 }, { "epoch": 1.3158310155221211, "eval_MaskedAccuracy": 0.48881355493981477, "eval_loss": 1.7019761800765991, "eval_runtime": 165.7085, "eval_samples_per_second": 383.058, "eval_steps_per_second": 1.497, "step": 323000 }, { "epoch": 1.3162383935455026, "grad_norm": 3.2039802074432373, "learning_rate": 0.0045604452440813, "loss": 8.0663, "step": 323100 }, { "epoch": 1.3166457715688842, "grad_norm": 1.3553608655929565, "learning_rate": 0.0045601639963892176, "loss": 8.0598, "step": 323200 }, { "epoch": 1.3170531495922655, "grad_norm": 3.884082794189453, "learning_rate": 0.004559882667445508, "loss": 8.079, "step": 323300 }, { "epoch": 1.317460527615647, "grad_norm": 0.9716830253601074, "learning_rate": 0.004559601257261301, "loss": 8.0483, "step": 323400 }, { "epoch": 1.3178679056390283, "grad_norm": 4.185678482055664, "learning_rate": 0.004559319765847716, "loss": 8.0821, "step": 323500 }, { "epoch": 1.3182752836624099, "grad_norm": 3.945801258087158, "learning_rate": 0.0045590381932158865, "loss": 8.0995, "step": 323600 }, { "epoch": 1.3186826616857914, "grad_norm": 3.5799834728240967, "learning_rate": 0.004558756539376936, "loss": 8.0337, "step": 323700 }, { "epoch": 1.319090039709173, "grad_norm": 3.5952444076538086, "learning_rate": 0.004558474804342002, "loss": 8.0947, "step": 323800 }, { "epoch": 1.3194974177325542, "grad_norm": 3.2049031257629395, "learning_rate": 0.004558192988122229, "loss": 8.1092, "step": 323900 }, { "epoch": 1.3199047957559358, "grad_norm": 2.319859504699707, "learning_rate": 0.004557911090728755, "loss": 8.0848, "step": 324000 }, { "epoch": 1.3199047957559358, "eval_MaskedAccuracy": 0.4901750505678959, "eval_loss": 1.6923388242721558, "eval_runtime": 169.8525, "eval_samples_per_second": 373.712, "eval_steps_per_second": 1.46, "step": 324000 }, { "epoch": 1.320312173779317, "grad_norm": 1.0164000988006592, "learning_rate": 0.004557629112172727, "loss": 8.0731, "step": 324100 }, { "epoch": 1.3207195518026986, "grad_norm": 1.9091594219207764, "learning_rate": 0.004557347052465298, "loss": 8.13, "step": 324200 }, { "epoch": 1.3211269298260802, "grad_norm": 1.4779466390609741, "learning_rate": 0.0045570649116176126, "loss": 8.1098, "step": 324300 }, { "epoch": 1.3215343078494617, "grad_norm": 13.105330467224121, "learning_rate": 0.004556782689640824, "loss": 8.0902, "step": 324400 }, { "epoch": 1.321941685872843, "grad_norm": 2.3293752670288086, "learning_rate": 0.004556500386546093, "loss": 8.0626, "step": 324500 }, { "epoch": 1.3223490638962245, "grad_norm": 1.8382813930511475, "learning_rate": 0.004556218002344574, "loss": 8.0754, "step": 324600 }, { "epoch": 1.322756441919606, "grad_norm": 3.2023203372955322, "learning_rate": 0.0045559355370474374, "loss": 8.0802, "step": 324700 }, { "epoch": 1.3231638199429874, "grad_norm": 2.298903703689575, "learning_rate": 0.004555652990665853, "loss": 8.0578, "step": 324800 }, { "epoch": 1.323571197966369, "grad_norm": 1.0970746278762817, "learning_rate": 0.004555370363210988, "loss": 8.0446, "step": 324900 }, { "epoch": 1.3239785759897504, "grad_norm": 1.624550223350525, "learning_rate": 0.004555087654694023, "loss": 8.0969, "step": 325000 }, { "epoch": 1.3239785759897504, "eval_MaskedAccuracy": 0.49033513649122273, "eval_loss": 1.6904529333114624, "eval_runtime": 245.9911, "eval_samples_per_second": 258.042, "eval_steps_per_second": 1.008, "step": 325000 }, { "epoch": 1.3243859540131317, "grad_norm": 2.7660932540893555, "learning_rate": 0.004554804865126121, "loss": 8.0789, "step": 325100 }, { "epoch": 1.3247933320365133, "grad_norm": 1.6017922163009644, "learning_rate": 0.0045545219945184725, "loss": 8.0805, "step": 325200 }, { "epoch": 1.3252007100598948, "grad_norm": 1.1955177783966064, "learning_rate": 0.004554239042882257, "loss": 8.0557, "step": 325300 }, { "epoch": 1.3256080880832761, "grad_norm": 1.2732222080230713, "learning_rate": 0.004553956010228655, "loss": 8.0826, "step": 325400 }, { "epoch": 1.3260154661066577, "grad_norm": 1.009123682975769, "learning_rate": 0.004553672896568865, "loss": 8.0644, "step": 325500 }, { "epoch": 1.3264228441300392, "grad_norm": 3.8442931175231934, "learning_rate": 0.00455338970191408, "loss": 8.052, "step": 325600 }, { "epoch": 1.3268302221534207, "grad_norm": 3.865133285522461, "learning_rate": 0.004553106426275495, "loss": 8.016, "step": 325700 }, { "epoch": 1.327237600176802, "grad_norm": 1.6926275491714478, "learning_rate": 0.004552823069664311, "loss": 8.0729, "step": 325800 }, { "epoch": 1.3276449782001836, "grad_norm": 3.8086345195770264, "learning_rate": 0.004552539632091731, "loss": 8.0186, "step": 325900 }, { "epoch": 1.3280523562235649, "grad_norm": 2.3455417156219482, "learning_rate": 0.004552256113568953, "loss": 8.0357, "step": 326000 }, { "epoch": 1.3280523562235649, "eval_MaskedAccuracy": 0.4922446641348998, "eval_loss": 1.688795566558838, "eval_runtime": 223.6127, "eval_samples_per_second": 283.866, "eval_steps_per_second": 1.109, "step": 326000 }, { "epoch": 1.3284597342469464, "grad_norm": 4.211648464202881, "learning_rate": 0.004551972514107189, "loss": 7.9905, "step": 326100 }, { "epoch": 1.328867112270328, "grad_norm": 3.3933537006378174, "learning_rate": 0.004551688833717657, "loss": 7.9922, "step": 326200 }, { "epoch": 1.3292744902937095, "grad_norm": 3.274343252182007, "learning_rate": 0.004551405072411573, "loss": 8.0469, "step": 326300 }, { "epoch": 1.3296818683170908, "grad_norm": 2.8096282482147217, "learning_rate": 0.004551121230200147, "loss": 8.0395, "step": 326400 }, { "epoch": 1.3300892463404723, "grad_norm": 2.965846538543701, "learning_rate": 0.0045508373070946035, "loss": 8.0873, "step": 326500 }, { "epoch": 1.3304966243638536, "grad_norm": 7.503636837005615, "learning_rate": 0.004550553303106173, "loss": 8.0563, "step": 326600 }, { "epoch": 1.3309040023872352, "grad_norm": 1.167646050453186, "learning_rate": 0.004550269218246077, "loss": 8.0396, "step": 326700 }, { "epoch": 1.3313113804106167, "grad_norm": 2.967207431793213, "learning_rate": 0.004549985052525551, "loss": 8.1156, "step": 326800 }, { "epoch": 1.3317187584339982, "grad_norm": 1.8906344175338745, "learning_rate": 0.004549700805955837, "loss": 8.0438, "step": 326900 }, { "epoch": 1.3321261364573795, "grad_norm": 2.934070348739624, "learning_rate": 0.004549416478548158, "loss": 8.0693, "step": 327000 }, { "epoch": 1.3321261364573795, "eval_MaskedAccuracy": 0.4904791841240935, "eval_loss": 1.6997677087783813, "eval_runtime": 181.7826, "eval_samples_per_second": 349.186, "eval_steps_per_second": 1.364, "step": 327000 }, { "epoch": 1.332533514480761, "grad_norm": 3.414621353149414, "learning_rate": 0.004549132070313766, "loss": 7.9985, "step": 327100 }, { "epoch": 1.3329408925041426, "grad_norm": 4.870385646820068, "learning_rate": 0.004548847581263898, "loss": 8.0352, "step": 327200 }, { "epoch": 1.333348270527524, "grad_norm": 4.610667705535889, "learning_rate": 0.004548563011409796, "loss": 8.059, "step": 327300 }, { "epoch": 1.3337556485509054, "grad_norm": 5.722850322723389, "learning_rate": 0.004548278360762722, "loss": 8.0358, "step": 327400 }, { "epoch": 1.334163026574287, "grad_norm": 3.6654059886932373, "learning_rate": 0.00454799362933393, "loss": 8.0514, "step": 327500 }, { "epoch": 1.3345704045976683, "grad_norm": 0.7920379638671875, "learning_rate": 0.00454770881713467, "loss": 8.0352, "step": 327600 }, { "epoch": 1.3349777826210498, "grad_norm": 3.1758406162261963, "learning_rate": 0.004547423924176206, "loss": 8.0964, "step": 327700 }, { "epoch": 1.3353851606444314, "grad_norm": 15.178168296813965, "learning_rate": 0.004547138950469805, "loss": 8.0582, "step": 327800 }, { "epoch": 1.3357925386678127, "grad_norm": 4.05223274230957, "learning_rate": 0.00454685389602673, "loss": 8.0708, "step": 327900 }, { "epoch": 1.3361999166911942, "grad_norm": 1.9266265630722046, "learning_rate": 0.004546568760858243, "loss": 8.0914, "step": 328000 }, { "epoch": 1.3361999166911942, "eval_MaskedAccuracy": 0.4894137960178113, "eval_loss": 1.7010035514831543, "eval_runtime": 176.7678, "eval_samples_per_second": 359.092, "eval_steps_per_second": 1.403, "step": 328000 }, { "epoch": 1.3366072947145757, "grad_norm": 4.658905982971191, "learning_rate": 0.004546283544975624, "loss": 8.0473, "step": 328100 }, { "epoch": 1.3370146727379573, "grad_norm": 4.361426830291748, "learning_rate": 0.004545998248390153, "loss": 8.0613, "step": 328200 }, { "epoch": 1.3374220507613386, "grad_norm": 3.9232702255249023, "learning_rate": 0.004545712871113103, "loss": 8.0478, "step": 328300 }, { "epoch": 1.33782942878472, "grad_norm": 1.5778297185897827, "learning_rate": 0.004545427413155761, "loss": 8.0545, "step": 328400 }, { "epoch": 1.3382368068081014, "grad_norm": 1.4725537300109863, "learning_rate": 0.004545141874529407, "loss": 8.0925, "step": 328500 }, { "epoch": 1.338644184831483, "grad_norm": 1.4098880290985107, "learning_rate": 0.004544856255245334, "loss": 8.0293, "step": 328600 }, { "epoch": 1.3390515628548645, "grad_norm": 3.308492660522461, "learning_rate": 0.004544570555314832, "loss": 8.0651, "step": 328700 }, { "epoch": 1.339458940878246, "grad_norm": 3.185256242752075, "learning_rate": 0.004544284774749198, "loss": 8.0394, "step": 328800 }, { "epoch": 1.3398663189016273, "grad_norm": 2.046502113342285, "learning_rate": 0.004543998913559733, "loss": 8.012, "step": 328900 }, { "epoch": 1.3402736969250089, "grad_norm": 4.595850944519043, "learning_rate": 0.00454371297175773, "loss": 8.0634, "step": 329000 }, { "epoch": 1.3402736969250089, "eval_MaskedAccuracy": 0.48753905746642684, "eval_loss": 1.6977170705795288, "eval_runtime": 173.5205, "eval_samples_per_second": 365.813, "eval_steps_per_second": 1.429, "step": 329000 }, { "epoch": 1.3406810749483902, "grad_norm": 1.6070823669433594, "learning_rate": 0.004543426949354504, "loss": 8.0694, "step": 329100 }, { "epoch": 1.3410884529717717, "grad_norm": 0.9883131384849548, "learning_rate": 0.004543140846361356, "loss": 8.0704, "step": 329200 }, { "epoch": 1.3414958309951532, "grad_norm": 2.9271223545074463, "learning_rate": 0.004542854662789608, "loss": 8.0833, "step": 329300 }, { "epoch": 1.3419032090185348, "grad_norm": 2.7808384895324707, "learning_rate": 0.0045425683986505625, "loss": 8.0299, "step": 329400 }, { "epoch": 1.342310587041916, "grad_norm": 3.7802367210388184, "learning_rate": 0.00454228205395553, "loss": 8.038, "step": 329500 }, { "epoch": 1.3427179650652976, "grad_norm": 2.54129958152771, "learning_rate": 0.004541995628715848, "loss": 7.9923, "step": 329600 }, { "epoch": 1.3431253430886791, "grad_norm": 3.8371310234069824, "learning_rate": 0.00454170912294283, "loss": 8.0065, "step": 329700 }, { "epoch": 1.3435327211120605, "grad_norm": 4.309117317199707, "learning_rate": 0.004541422536647813, "loss": 8.0154, "step": 329800 }, { "epoch": 1.343940099135442, "grad_norm": 6.226172924041748, "learning_rate": 0.0045411358698421184, "loss": 8.0002, "step": 329900 }, { "epoch": 1.3443474771588235, "grad_norm": 5.053940296173096, "learning_rate": 0.0045408491225370825, "loss": 8.0782, "step": 330000 }, { "epoch": 1.3443474771588235, "eval_MaskedAccuracy": 0.48996696355953634, "eval_loss": 1.6935937404632568, "eval_runtime": 171.1582, "eval_samples_per_second": 370.862, "eval_steps_per_second": 1.449, "step": 330000 }, { "epoch": 1.3447548551822048, "grad_norm": 3.9390504360198975, "learning_rate": 0.004540562294744039, "loss": 8.0451, "step": 330100 }, { "epoch": 1.3451622332055864, "grad_norm": 3.181065082550049, "learning_rate": 0.004540275386474344, "loss": 8.0572, "step": 330200 }, { "epoch": 1.345569611228968, "grad_norm": 3.486276865005493, "learning_rate": 0.004539988397739318, "loss": 8.0695, "step": 330300 }, { "epoch": 1.3459769892523492, "grad_norm": 2.4831082820892334, "learning_rate": 0.004539701328550317, "loss": 8.0066, "step": 330400 }, { "epoch": 1.3463843672757307, "grad_norm": 1.9745352268218994, "learning_rate": 0.004539414178918695, "loss": 8.0093, "step": 330500 }, { "epoch": 1.3467917452991123, "grad_norm": 2.8708128929138184, "learning_rate": 0.004539126948855803, "loss": 8.0634, "step": 330600 }, { "epoch": 1.3471991233224938, "grad_norm": 2.1291728019714355, "learning_rate": 0.004538839638372989, "loss": 8.0565, "step": 330700 }, { "epoch": 1.3476065013458751, "grad_norm": 0.8737789988517761, "learning_rate": 0.004538552247481611, "loss": 8.0754, "step": 330800 }, { "epoch": 1.3480138793692567, "grad_norm": 2.749574899673462, "learning_rate": 0.0045382647761930345, "loss": 8.0911, "step": 330900 }, { "epoch": 1.348421257392638, "grad_norm": 1.407920479774475, "learning_rate": 0.004537977224518631, "loss": 8.0574, "step": 331000 }, { "epoch": 1.348421257392638, "eval_MaskedAccuracy": 0.487671073066648, "eval_loss": 1.7070835828781128, "eval_runtime": 179.2232, "eval_samples_per_second": 354.173, "eval_steps_per_second": 1.384, "step": 331000 }, { "epoch": 1.3488286354160195, "grad_norm": 4.601722240447998, "learning_rate": 0.004537689592469766, "loss": 8.0862, "step": 331100 }, { "epoch": 1.349236013439401, "grad_norm": 3.048001527786255, "learning_rate": 0.00453740188005781, "loss": 8.0407, "step": 331200 }, { "epoch": 1.3496433914627826, "grad_norm": 3.656520366668701, "learning_rate": 0.00453711408729414, "loss": 8.0899, "step": 331300 }, { "epoch": 1.3500507694861639, "grad_norm": 2.7314696311950684, "learning_rate": 0.004536826214190134, "loss": 8.0933, "step": 331400 }, { "epoch": 1.3504581475095454, "grad_norm": 4.379732131958008, "learning_rate": 0.0045365382607571655, "loss": 8.0515, "step": 331500 }, { "epoch": 1.3508655255329267, "grad_norm": 1.268511414527893, "learning_rate": 0.004536250227006631, "loss": 8.0318, "step": 331600 }, { "epoch": 1.3512729035563082, "grad_norm": 3.986025094985962, "learning_rate": 0.0045359621129499065, "loss": 8.0469, "step": 331700 }, { "epoch": 1.3516802815796898, "grad_norm": 5.96984338760376, "learning_rate": 0.004535673918598391, "loss": 8.0228, "step": 331800 }, { "epoch": 1.3520876596030713, "grad_norm": 3.952479839324951, "learning_rate": 0.004535385643963468, "loss": 8.0324, "step": 331900 }, { "epoch": 1.3524950376264526, "grad_norm": 2.14145827293396, "learning_rate": 0.00453509728905654, "loss": 8.0112, "step": 332000 }, { "epoch": 1.3524950376264526, "eval_MaskedAccuracy": 0.488893655978673, "eval_loss": 1.7075307369232178, "eval_runtime": 205.2728, "eval_samples_per_second": 309.228, "eval_steps_per_second": 1.208, "step": 332000 }, { "epoch": 1.3529024156498342, "grad_norm": 3.316530227661133, "learning_rate": 0.004534808853889009, "loss": 8.032, "step": 332100 }, { "epoch": 1.3533097936732157, "grad_norm": 4.175098896026611, "learning_rate": 0.004534520338472278, "loss": 8.0142, "step": 332200 }, { "epoch": 1.353717171696597, "grad_norm": 1.8745940923690796, "learning_rate": 0.004534231742817755, "loss": 8.0436, "step": 332300 }, { "epoch": 1.3541245497199785, "grad_norm": 4.861321449279785, "learning_rate": 0.00453394306693685, "loss": 8.0902, "step": 332400 }, { "epoch": 1.35453192774336, "grad_norm": 4.840919494628906, "learning_rate": 0.004533654310840977, "loss": 8.0495, "step": 332500 }, { "epoch": 1.3549393057667414, "grad_norm": 3.8090786933898926, "learning_rate": 0.004533365474541542, "loss": 8.0285, "step": 332600 }, { "epoch": 1.355346683790123, "grad_norm": 1.8056049346923828, "learning_rate": 0.004533076558049974, "loss": 7.9913, "step": 332700 }, { "epoch": 1.3557540618135044, "grad_norm": 2.420238733291626, "learning_rate": 0.004532787561377689, "loss": 8.0275, "step": 332800 }, { "epoch": 1.3561614398368858, "grad_norm": 3.4509689807891846, "learning_rate": 0.004532498484536121, "loss": 8.0616, "step": 332900 }, { "epoch": 1.3565688178602673, "grad_norm": 1.4320125579833984, "learning_rate": 0.004532209327536695, "loss": 8.0347, "step": 333000 }, { "epoch": 1.3565688178602673, "eval_MaskedAccuracy": 0.4893542575052335, "eval_loss": 1.6820272207260132, "eval_runtime": 191.0192, "eval_samples_per_second": 332.302, "eval_steps_per_second": 1.298, "step": 333000 }, { "epoch": 1.3569761958836488, "grad_norm": 1.30508553981781, "learning_rate": 0.004531920090390839, "loss": 8.0336, "step": 333100 }, { "epoch": 1.3573835739070303, "grad_norm": 0.9989843964576721, "learning_rate": 0.004531630773109995, "loss": 8.0265, "step": 333200 }, { "epoch": 1.3577909519304117, "grad_norm": 1.1286146640777588, "learning_rate": 0.004531341375705594, "loss": 8.0547, "step": 333300 }, { "epoch": 1.3581983299537932, "grad_norm": 3.9923794269561768, "learning_rate": 0.004531051898189085, "loss": 8.0602, "step": 333400 }, { "epoch": 1.3586057079771745, "grad_norm": 2.3580663204193115, "learning_rate": 0.004530762340571909, "loss": 8.0617, "step": 333500 }, { "epoch": 1.359013086000556, "grad_norm": 3.204130172729492, "learning_rate": 0.004530472702865515, "loss": 8.0422, "step": 333600 }, { "epoch": 1.3594204640239376, "grad_norm": 3.7657577991485596, "learning_rate": 0.00453018298508135, "loss": 8.0668, "step": 333700 }, { "epoch": 1.359827842047319, "grad_norm": 2.150792360305786, "learning_rate": 0.0045298931872308715, "loss": 8.0803, "step": 333800 }, { "epoch": 1.3602352200707004, "grad_norm": 2.5532517433166504, "learning_rate": 0.004529603309325526, "loss": 8.0441, "step": 333900 }, { "epoch": 1.360642598094082, "grad_norm": 3.6992883682250977, "learning_rate": 0.004529313351376784, "loss": 8.0215, "step": 334000 }, { "epoch": 1.360642598094082, "eval_MaskedAccuracy": 0.49045791943351175, "eval_loss": 1.703776478767395, "eval_runtime": 197.5733, "eval_samples_per_second": 321.278, "eval_steps_per_second": 1.255, "step": 334000 }, { "epoch": 1.3610499761174633, "grad_norm": 4.939713478088379, "learning_rate": 0.004529023313396113, "loss": 8.0213, "step": 334100 }, { "epoch": 1.3614573541408448, "grad_norm": 4.058901309967041, "learning_rate": 0.004528733195394972, "loss": 8.0415, "step": 334200 }, { "epoch": 1.3618647321642263, "grad_norm": 3.4482951164245605, "learning_rate": 0.0045284429973848385, "loss": 8.043, "step": 334300 }, { "epoch": 1.3622721101876079, "grad_norm": 4.013343811035156, "learning_rate": 0.004528152719377181, "loss": 8.049, "step": 334400 }, { "epoch": 1.3626794882109892, "grad_norm": 5.67153263092041, "learning_rate": 0.004527862361383474, "loss": 8.0046, "step": 334500 }, { "epoch": 1.3630868662343707, "grad_norm": 1.1400394439697266, "learning_rate": 0.004527571923415196, "loss": 8.0832, "step": 334600 }, { "epoch": 1.3634942442577522, "grad_norm": 1.0227196216583252, "learning_rate": 0.004527281405483833, "loss": 8.1128, "step": 334700 }, { "epoch": 1.3639016222811335, "grad_norm": 4.858122825622559, "learning_rate": 0.004526990807600877, "loss": 8.0738, "step": 334800 }, { "epoch": 1.364309000304515, "grad_norm": 4.523569107055664, "learning_rate": 0.004526700129777813, "loss": 8.027, "step": 334900 }, { "epoch": 1.3647163783278966, "grad_norm": 12.582639694213867, "learning_rate": 0.004526409372026125, "loss": 7.9837, "step": 335000 }, { "epoch": 1.3647163783278966, "eval_MaskedAccuracy": 0.48998230648758867, "eval_loss": 1.6911903619766235, "eval_runtime": 420.5773, "eval_samples_per_second": 150.926, "eval_steps_per_second": 0.59, "step": 335000 }, { "epoch": 1.365123756351278, "grad_norm": 2.2136664390563965, "learning_rate": 0.00452611853435731, "loss": 8.0276, "step": 335100 }, { "epoch": 1.3655311343746594, "grad_norm": 3.2456421852111816, "learning_rate": 0.004525827616782868, "loss": 8.0373, "step": 335200 }, { "epoch": 1.365938512398041, "grad_norm": 4.780536651611328, "learning_rate": 0.004525536619314304, "loss": 8.004, "step": 335300 }, { "epoch": 1.3663458904214223, "grad_norm": 3.3634612560272217, "learning_rate": 0.004525245541963124, "loss": 8.0008, "step": 335400 }, { "epoch": 1.3667532684448038, "grad_norm": 5.064257621765137, "learning_rate": 0.0045249543847408315, "loss": 7.9679, "step": 335500 }, { "epoch": 1.3671606464681854, "grad_norm": 4.003411293029785, "learning_rate": 0.004524663147658934, "loss": 8.0352, "step": 335600 }, { "epoch": 1.367568024491567, "grad_norm": 2.0102012157440186, "learning_rate": 0.004524371830728952, "loss": 8.041, "step": 335700 }, { "epoch": 1.3679754025149482, "grad_norm": 2.133049726486206, "learning_rate": 0.004524080433962406, "loss": 8.0463, "step": 335800 }, { "epoch": 1.3683827805383297, "grad_norm": 1.0200728178024292, "learning_rate": 0.004523788957370809, "loss": 8.0384, "step": 335900 }, { "epoch": 1.368790158561711, "grad_norm": 5.349991798400879, "learning_rate": 0.004523497400965693, "loss": 8.0657, "step": 336000 }, { "epoch": 1.368790158561711, "eval_MaskedAccuracy": 0.48971874947821503, "eval_loss": 1.6930047273635864, "eval_runtime": 210.3323, "eval_samples_per_second": 301.789, "eval_steps_per_second": 1.179, "step": 336000 }, { "epoch": 1.3691975365850926, "grad_norm": 4.166293621063232, "learning_rate": 0.004523205764758574, "loss": 8.0336, "step": 336100 }, { "epoch": 1.3696049146084741, "grad_norm": 5.182121276855469, "learning_rate": 0.004522914048760986, "loss": 8.0082, "step": 336200 }, { "epoch": 1.3700122926318556, "grad_norm": 2.6781444549560547, "learning_rate": 0.004522622252984457, "loss": 7.967, "step": 336300 }, { "epoch": 1.370419670655237, "grad_norm": 1.211844801902771, "learning_rate": 0.004522330377440533, "loss": 8.0258, "step": 336400 }, { "epoch": 1.3708270486786185, "grad_norm": 0.9275456070899963, "learning_rate": 0.004522038422140751, "loss": 8.0486, "step": 336500 }, { "epoch": 1.3712344267019998, "grad_norm": 4.825027942657471, "learning_rate": 0.004521746387096655, "loss": 8.0614, "step": 336600 }, { "epoch": 1.3716418047253813, "grad_norm": 4.791982173919678, "learning_rate": 0.00452145427231979, "loss": 8.0858, "step": 336700 }, { "epoch": 1.3720491827487629, "grad_norm": 4.076387882232666, "learning_rate": 0.0045211620778217, "loss": 8.0703, "step": 336800 }, { "epoch": 1.3724565607721444, "grad_norm": 5.908453941345215, "learning_rate": 0.004520869803613938, "loss": 8.0356, "step": 336900 }, { "epoch": 1.3728639387955257, "grad_norm": 3.02319598197937, "learning_rate": 0.0045205774497080594, "loss": 8.0423, "step": 337000 }, { "epoch": 1.3728639387955257, "eval_MaskedAccuracy": 0.4881793270387368, "eval_loss": 1.7019963264465332, "eval_runtime": 203.1019, "eval_samples_per_second": 312.533, "eval_steps_per_second": 1.221, "step": 337000 }, { "epoch": 1.3732713168189072, "grad_norm": 2.5091326236724854, "learning_rate": 0.00452028501611563, "loss": 8.0723, "step": 337100 }, { "epoch": 1.3736786948422888, "grad_norm": 1.8626298904418945, "learning_rate": 0.004519992502848197, "loss": 8.095, "step": 337200 }, { "epoch": 1.37408607286567, "grad_norm": 2.8279223442077637, "learning_rate": 0.004519699909917331, "loss": 8.0275, "step": 337300 }, { "epoch": 1.3744934508890516, "grad_norm": 1.8525398969650269, "learning_rate": 0.00451940723733461, "loss": 8.003, "step": 337400 }, { "epoch": 1.3749008289124331, "grad_norm": 2.949223041534424, "learning_rate": 0.0045191144851115935, "loss": 8.0568, "step": 337500 }, { "epoch": 1.3753082069358145, "grad_norm": 4.134367942810059, "learning_rate": 0.004518821653259865, "loss": 8.0503, "step": 337600 }, { "epoch": 1.375715584959196, "grad_norm": 2.8016107082366943, "learning_rate": 0.0045185287417909906, "loss": 8.0944, "step": 337700 }, { "epoch": 1.3761229629825775, "grad_norm": 2.1527552604675293, "learning_rate": 0.004518235750716557, "loss": 8.0467, "step": 337800 }, { "epoch": 1.3765303410059588, "grad_norm": 4.885125637054443, "learning_rate": 0.004517942680048152, "loss": 8.0484, "step": 337900 }, { "epoch": 1.3769377190293404, "grad_norm": 4.211520195007324, "learning_rate": 0.004517649529797351, "loss": 8.0575, "step": 338000 }, { "epoch": 1.3769377190293404, "eval_MaskedAccuracy": 0.4918551946198786, "eval_loss": 1.6849377155303955, "eval_runtime": 247.7347, "eval_samples_per_second": 256.226, "eval_steps_per_second": 1.001, "step": 338000 }, { "epoch": 1.377345097052722, "grad_norm": 3.0525062084198, "learning_rate": 0.004517356299975756, "loss": 8.0429, "step": 338100 }, { "epoch": 1.3777524750761034, "grad_norm": 3.92828369140625, "learning_rate": 0.00451706299059495, "loss": 7.9908, "step": 338200 }, { "epoch": 1.3781598530994847, "grad_norm": 3.10221529006958, "learning_rate": 0.004516769601666532, "loss": 8.037, "step": 338300 }, { "epoch": 1.3785672311228663, "grad_norm": 2.6689865589141846, "learning_rate": 0.004516476133202104, "loss": 8.0976, "step": 338400 }, { "epoch": 1.3789746091462476, "grad_norm": 1.3879767656326294, "learning_rate": 0.004516182585213266, "loss": 8.0572, "step": 338500 }, { "epoch": 1.3793819871696291, "grad_norm": 1.174448013305664, "learning_rate": 0.004515888957711625, "loss": 8.0771, "step": 338600 }, { "epoch": 1.3797893651930107, "grad_norm": 5.28508186340332, "learning_rate": 0.004515595250708782, "loss": 8.0546, "step": 338700 }, { "epoch": 1.3801967432163922, "grad_norm": 4.878987789154053, "learning_rate": 0.004515301464216359, "loss": 8.0572, "step": 338800 }, { "epoch": 1.3806041212397735, "grad_norm": 6.071109771728516, "learning_rate": 0.004515007598245968, "loss": 8.0044, "step": 338900 }, { "epoch": 1.381011499263155, "grad_norm": 2.3168129920959473, "learning_rate": 0.004514713652809226, "loss": 8.0468, "step": 339000 }, { "epoch": 1.381011499263155, "eval_MaskedAccuracy": 0.4877877929554792, "eval_loss": 1.7033212184906006, "eval_runtime": 185.87, "eval_samples_per_second": 341.507, "eval_steps_per_second": 1.334, "step": 339000 }, { "epoch": 1.3814188772865363, "grad_norm": 1.4501903057098389, "learning_rate": 0.004514419627917752, "loss": 8.0691, "step": 339100 }, { "epoch": 1.3818262553099179, "grad_norm": 12.737586975097656, "learning_rate": 0.004514125523583172, "loss": 8.0547, "step": 339200 }, { "epoch": 1.3822336333332994, "grad_norm": 2.92000412940979, "learning_rate": 0.004513831339817122, "loss": 8.051, "step": 339300 }, { "epoch": 1.382641011356681, "grad_norm": 5.565245628356934, "learning_rate": 0.004513537076631216, "loss": 8.0789, "step": 339400 }, { "epoch": 1.3830483893800622, "grad_norm": 5.634011268615723, "learning_rate": 0.0045132427340370965, "loss": 8.068, "step": 339500 }, { "epoch": 1.3834557674034438, "grad_norm": 1.9182945489883423, "learning_rate": 0.004512948312046409, "loss": 8.0151, "step": 339600 }, { "epoch": 1.3838631454268253, "grad_norm": 4.284234523773193, "learning_rate": 0.004512653810670781, "loss": 7.9576, "step": 339700 }, { "epoch": 1.3842705234502066, "grad_norm": 3.367342710494995, "learning_rate": 0.004512359229921861, "loss": 8.0757, "step": 339800 }, { "epoch": 1.3846779014735882, "grad_norm": 4.452456474304199, "learning_rate": 0.004512064569811299, "loss": 8.0776, "step": 339900 }, { "epoch": 1.3850852794969697, "grad_norm": 4.0282883644104, "learning_rate": 0.004511769830350738, "loss": 8.0499, "step": 340000 }, { "epoch": 1.3850852794969697, "eval_MaskedAccuracy": 0.49261641937426026, "eval_loss": 1.6836434602737427, "eval_runtime": 183.9162, "eval_samples_per_second": 345.135, "eval_steps_per_second": 1.348, "step": 340000 }, { "epoch": 1.385492657520351, "grad_norm": 1.8705769777297974, "learning_rate": 0.0045114750115518324, "loss": 7.9889, "step": 340100 }, { "epoch": 1.3859000355437325, "grad_norm": 3.2162158489227295, "learning_rate": 0.004511180113426241, "loss": 8.0753, "step": 340200 }, { "epoch": 1.386307413567114, "grad_norm": 3.431786298751831, "learning_rate": 0.004510885135985616, "loss": 8.0001, "step": 340300 }, { "epoch": 1.3867147915904954, "grad_norm": 2.138399362564087, "learning_rate": 0.004510590079241629, "loss": 8.0156, "step": 340400 }, { "epoch": 1.387122169613877, "grad_norm": 3.040599822998047, "learning_rate": 0.004510294943205934, "loss": 8.0638, "step": 340500 }, { "epoch": 1.3875295476372584, "grad_norm": 12.077610969543457, "learning_rate": 0.004509999727890208, "loss": 8.0502, "step": 340600 }, { "epoch": 1.38793692566064, "grad_norm": 5.232182025909424, "learning_rate": 0.004509704433306117, "loss": 8.0552, "step": 340700 }, { "epoch": 1.3883443036840213, "grad_norm": 6.518558502197266, "learning_rate": 0.004509409059465335, "loss": 8.0078, "step": 340800 }, { "epoch": 1.3887516817074028, "grad_norm": 1.897269368171692, "learning_rate": 0.004509113606379544, "loss": 8.0856, "step": 340900 }, { "epoch": 1.3891590597307841, "grad_norm": 3.018871784210205, "learning_rate": 0.004508818074060427, "loss": 8.021, "step": 341000 }, { "epoch": 1.3891590597307841, "eval_MaskedAccuracy": 0.49240115959551306, "eval_loss": 1.686929702758789, "eval_runtime": 171.6369, "eval_samples_per_second": 369.827, "eval_steps_per_second": 1.445, "step": 341000 }, { "epoch": 1.3895664377541657, "grad_norm": 4.302035331726074, "learning_rate": 0.004508522462519665, "loss": 7.9861, "step": 341100 }, { "epoch": 1.3899738157775472, "grad_norm": 1.3537317514419556, "learning_rate": 0.004508226771768944, "loss": 8.0481, "step": 341200 }, { "epoch": 1.3903811938009287, "grad_norm": 2.8749494552612305, "learning_rate": 0.004507931001819948, "loss": 8.0595, "step": 341300 }, { "epoch": 1.39078857182431, "grad_norm": 5.485761642456055, "learning_rate": 0.004507635152684382, "loss": 8.0556, "step": 341400 }, { "epoch": 1.3911959498476916, "grad_norm": 4.461055278778076, "learning_rate": 0.004507339224373937, "loss": 8.0743, "step": 341500 }, { "epoch": 1.3916033278710729, "grad_norm": 5.363972187042236, "learning_rate": 0.004507043216900309, "loss": 8.0532, "step": 341600 }, { "epoch": 1.3920107058944544, "grad_norm": 5.1722588539123535, "learning_rate": 0.004506747130275201, "loss": 7.9847, "step": 341700 }, { "epoch": 1.392418083917836, "grad_norm": 3.0832786560058594, "learning_rate": 0.004506450964510331, "loss": 8.0637, "step": 341800 }, { "epoch": 1.3928254619412175, "grad_norm": 1.4068841934204102, "learning_rate": 0.004506154719617401, "loss": 8.0129, "step": 341900 }, { "epoch": 1.3932328399645988, "grad_norm": 4.5246052742004395, "learning_rate": 0.004505858395608122, "loss": 8.0463, "step": 342000 }, { "epoch": 1.3932328399645988, "eval_MaskedAccuracy": 0.488171587673146, "eval_loss": 1.7060898542404175, "eval_runtime": 173.3484, "eval_samples_per_second": 366.176, "eval_steps_per_second": 1.431, "step": 342000 }, { "epoch": 1.3936402179879803, "grad_norm": 2.04990291595459, "learning_rate": 0.004505561992494208, "loss": 8.0104, "step": 342100 }, { "epoch": 1.3940475960113619, "grad_norm": 7.8837199211120605, "learning_rate": 0.0045052655102873745, "loss": 8.0434, "step": 342200 }, { "epoch": 1.3944549740347432, "grad_norm": 3.021195888519287, "learning_rate": 0.004504968948999357, "loss": 8.0437, "step": 342300 }, { "epoch": 1.3948623520581247, "grad_norm": 2.564603090286255, "learning_rate": 0.004504672308641867, "loss": 8.0236, "step": 342400 }, { "epoch": 1.3952697300815062, "grad_norm": 0.9281584024429321, "learning_rate": 0.0045043755892266325, "loss": 8.058, "step": 342500 }, { "epoch": 1.3956771081048875, "grad_norm": 2.436694383621216, "learning_rate": 0.00450407879076539, "loss": 8.0409, "step": 342600 }, { "epoch": 1.396084486128269, "grad_norm": 2.731529712677002, "learning_rate": 0.004503781913269866, "loss": 8.0601, "step": 342700 }, { "epoch": 1.3964918641516506, "grad_norm": 2.8823978900909424, "learning_rate": 0.004503484956751801, "loss": 8.0132, "step": 342800 }, { "epoch": 1.396899242175032, "grad_norm": 8.806553840637207, "learning_rate": 0.004503187921222941, "loss": 7.992, "step": 342900 }, { "epoch": 1.3973066201984135, "grad_norm": 0.9807514548301697, "learning_rate": 0.0045028908066950275, "loss": 8.0146, "step": 343000 }, { "epoch": 1.3973066201984135, "eval_MaskedAccuracy": 0.4901069446463299, "eval_loss": 1.6997807025909424, "eval_runtime": 264.8787, "eval_samples_per_second": 239.642, "eval_steps_per_second": 0.936, "step": 343000 }, { "epoch": 1.397713998221795, "grad_norm": 4.19398832321167, "learning_rate": 0.004502593613179801, "loss": 8.0688, "step": 343100 }, { "epoch": 1.3981213762451765, "grad_norm": 3.5427639484405518, "learning_rate": 0.004502296340689015, "loss": 8.0185, "step": 343200 }, { "epoch": 1.3985287542685578, "grad_norm": 4.444757461547852, "learning_rate": 0.004501998989234425, "loss": 8.0288, "step": 343300 }, { "epoch": 1.3989361322919394, "grad_norm": 3.28558349609375, "learning_rate": 0.00450170155882778, "loss": 8.0218, "step": 343400 }, { "epoch": 1.3993435103153207, "grad_norm": 3.0191750526428223, "learning_rate": 0.004501404049480848, "loss": 7.9866, "step": 343500 }, { "epoch": 1.3997508883387022, "grad_norm": 2.12580943107605, "learning_rate": 0.004501106461205386, "loss": 8.0421, "step": 343600 }, { "epoch": 1.4001582663620837, "grad_norm": 2.5206921100616455, "learning_rate": 0.004500808794013157, "loss": 8.0331, "step": 343700 }, { "epoch": 1.4005656443854653, "grad_norm": 4.923537731170654, "learning_rate": 0.0045005110479159356, "loss": 8.0711, "step": 343800 }, { "epoch": 1.4009730224088466, "grad_norm": 4.230342388153076, "learning_rate": 0.004500213222925488, "loss": 8.0021, "step": 343900 }, { "epoch": 1.4013804004322281, "grad_norm": 2.248675584793091, "learning_rate": 0.004499915319053593, "loss": 8.0087, "step": 344000 }, { "epoch": 1.4013804004322281, "eval_MaskedAccuracy": 0.49225142805141237, "eval_loss": 1.6809008121490479, "eval_runtime": 172.1493, "eval_samples_per_second": 368.726, "eval_steps_per_second": 1.441, "step": 344000 }, { "epoch": 1.4017877784556094, "grad_norm": 6.397619247436523, "learning_rate": 0.004499617336312023, "loss": 8.0279, "step": 344100 }, { "epoch": 1.402195156478991, "grad_norm": 6.686524868011475, "learning_rate": 0.004499319274712562, "loss": 8.0732, "step": 344200 }, { "epoch": 1.4026025345023725, "grad_norm": 2.8127522468566895, "learning_rate": 0.004499021134266992, "loss": 8.0659, "step": 344300 }, { "epoch": 1.403009912525754, "grad_norm": 1.8352597951889038, "learning_rate": 0.004498722914987101, "loss": 8.0295, "step": 344400 }, { "epoch": 1.4034172905491353, "grad_norm": 3.4253170490264893, "learning_rate": 0.004498424616884684, "loss": 8.0607, "step": 344500 }, { "epoch": 1.4038246685725169, "grad_norm": 2.7556371688842773, "learning_rate": 0.00449812623997153, "loss": 8.0144, "step": 344600 }, { "epoch": 1.4042320465958982, "grad_norm": 3.0799689292907715, "learning_rate": 0.004497827784259437, "loss": 8.0261, "step": 344700 }, { "epoch": 1.4046394246192797, "grad_norm": 3.8346197605133057, "learning_rate": 0.004497529249760206, "loss": 7.9727, "step": 344800 }, { "epoch": 1.4050468026426612, "grad_norm": 5.817394733428955, "learning_rate": 0.00449723063648563, "loss": 8.0071, "step": 344900 }, { "epoch": 1.4054541806660428, "grad_norm": 3.7115495204925537, "learning_rate": 0.004496931944447526, "loss": 7.9977, "step": 345000 }, { "epoch": 1.4054541806660428, "eval_MaskedAccuracy": 0.4927991312882558, "eval_loss": 1.6924138069152832, "eval_runtime": 164.8575, "eval_samples_per_second": 385.036, "eval_steps_per_second": 1.504, "step": 345000 }, { "epoch": 1.405861558689424, "grad_norm": 2.293860912322998, "learning_rate": 0.004496633173657698, "loss": 7.9953, "step": 345100 }, { "epoch": 1.4062689367128056, "grad_norm": 2.226228713989258, "learning_rate": 0.004496334324127954, "loss": 8.0028, "step": 345200 }, { "epoch": 1.4066763147361871, "grad_norm": 1.7654036283493042, "learning_rate": 0.004496035395870117, "loss": 8.0259, "step": 345300 }, { "epoch": 1.4070836927595685, "grad_norm": 3.241719961166382, "learning_rate": 0.004495736388896001, "loss": 8.0116, "step": 345400 }, { "epoch": 1.40749107078295, "grad_norm": 6.492445468902588, "learning_rate": 0.00449543730321742, "loss": 8.0326, "step": 345500 }, { "epoch": 1.4078984488063315, "grad_norm": 2.9684269428253174, "learning_rate": 0.004495138138846212, "loss": 8.0502, "step": 345600 }, { "epoch": 1.408305826829713, "grad_norm": 1.2709413766860962, "learning_rate": 0.0044948388957942054, "loss": 8.0654, "step": 345700 }, { "epoch": 1.4087132048530944, "grad_norm": 5.379870891571045, "learning_rate": 0.004494539574073218, "loss": 8.0678, "step": 345800 }, { "epoch": 1.409120582876476, "grad_norm": 3.2984161376953125, "learning_rate": 0.0044942401736951, "loss": 8.0417, "step": 345900 }, { "epoch": 1.4095279608998572, "grad_norm": 1.9095920324325562, "learning_rate": 0.0044939406946716744, "loss": 8.0622, "step": 346000 }, { "epoch": 1.4095279608998572, "eval_MaskedAccuracy": 0.48976976075929307, "eval_loss": 1.6921502351760864, "eval_runtime": 224.8006, "eval_samples_per_second": 282.366, "eval_steps_per_second": 1.103, "step": 346000 }, { "epoch": 1.4099353389232387, "grad_norm": 2.1624269485473633, "learning_rate": 0.004493641137014782, "loss": 8.0521, "step": 346100 }, { "epoch": 1.4103427169466203, "grad_norm": 2.690730571746826, "learning_rate": 0.004493341500736279, "loss": 8.0555, "step": 346200 }, { "epoch": 1.4107500949700018, "grad_norm": 6.256383895874023, "learning_rate": 0.004493041785847994, "loss": 8.0565, "step": 346300 }, { "epoch": 1.4111574729933831, "grad_norm": 2.366687536239624, "learning_rate": 0.004492741992361785, "loss": 8.0496, "step": 346400 }, { "epoch": 1.4115648510167647, "grad_norm": 7.474885940551758, "learning_rate": 0.004492442120289507, "loss": 8.0452, "step": 346500 }, { "epoch": 1.411972229040146, "grad_norm": 1.8844860792160034, "learning_rate": 0.004492142169643017, "loss": 8.056, "step": 346600 }, { "epoch": 1.4123796070635275, "grad_norm": 3.0277657508850098, "learning_rate": 0.004491842140434164, "loss": 8.0266, "step": 346700 }, { "epoch": 1.412786985086909, "grad_norm": 4.008308410644531, "learning_rate": 0.004491542032674812, "loss": 8.0023, "step": 346800 }, { "epoch": 1.4131943631102906, "grad_norm": 1.7533228397369385, "learning_rate": 0.004491241846376829, "loss": 8.0353, "step": 346900 }, { "epoch": 1.4136017411336719, "grad_norm": 3.0283710956573486, "learning_rate": 0.004490941581552086, "loss": 8.0078, "step": 347000 }, { "epoch": 1.4136017411336719, "eval_MaskedAccuracy": 0.48967784953301846, "eval_loss": 1.7042174339294434, "eval_runtime": 185.5618, "eval_samples_per_second": 342.075, "eval_steps_per_second": 1.336, "step": 347000 }, { "epoch": 1.4140091191570534, "grad_norm": 3.1316170692443848, "learning_rate": 0.004490641238212449, "loss": 8.0283, "step": 347100 }, { "epoch": 1.4144164971804347, "grad_norm": 2.7969183921813965, "learning_rate": 0.004490340816369788, "loss": 8.0212, "step": 347200 }, { "epoch": 1.4148238752038163, "grad_norm": 3.6290698051452637, "learning_rate": 0.004490040316035995, "loss": 8.0161, "step": 347300 }, { "epoch": 1.4152312532271978, "grad_norm": 1.1475740671157837, "learning_rate": 0.004489739737222939, "loss": 8.0271, "step": 347400 }, { "epoch": 1.4156386312505793, "grad_norm": 5.602312088012695, "learning_rate": 0.004489439079942507, "loss": 8.0837, "step": 347500 }, { "epoch": 1.4160460092739606, "grad_norm": 0.7871115803718567, "learning_rate": 0.00448913834420659, "loss": 8.0143, "step": 347600 }, { "epoch": 1.4164533872973422, "grad_norm": 5.471859455108643, "learning_rate": 0.004488837530027066, "loss": 8.066, "step": 347700 }, { "epoch": 1.4168607653207237, "grad_norm": 1.3712109327316284, "learning_rate": 0.004488536637415832, "loss": 8.0948, "step": 347800 }, { "epoch": 1.417268143344105, "grad_norm": 1.636563777923584, "learning_rate": 0.004488235666384787, "loss": 8.0749, "step": 347900 }, { "epoch": 1.4176755213674865, "grad_norm": 4.104404449462891, "learning_rate": 0.004487934616945828, "loss": 8.049, "step": 348000 }, { "epoch": 1.4176755213674865, "eval_MaskedAccuracy": 0.49094555395382244, "eval_loss": 1.6923339366912842, "eval_runtime": 168.3756, "eval_samples_per_second": 376.99, "eval_steps_per_second": 1.473, "step": 348000 }, { "epoch": 1.418082899390868, "grad_norm": 1.3405181169509888, "learning_rate": 0.00448763348911086, "loss": 8.0409, "step": 348100 }, { "epoch": 1.4184902774142496, "grad_norm": 0.7778652310371399, "learning_rate": 0.004487332282891787, "loss": 8.0562, "step": 348200 }, { "epoch": 1.418897655437631, "grad_norm": 1.7618839740753174, "learning_rate": 0.004487030998300513, "loss": 8.0204, "step": 348300 }, { "epoch": 1.4193050334610124, "grad_norm": 4.535425186157227, "learning_rate": 0.004486729635348955, "loss": 8.0057, "step": 348400 }, { "epoch": 1.4197124114843938, "grad_norm": 5.359290599822998, "learning_rate": 0.004486428194049023, "loss": 8.0211, "step": 348500 }, { "epoch": 1.4201197895077753, "grad_norm": 2.8018269538879395, "learning_rate": 0.0044861266744126395, "loss": 8.044, "step": 348600 }, { "epoch": 1.4205271675311568, "grad_norm": 6.559733867645264, "learning_rate": 0.004485825076451726, "loss": 8.0027, "step": 348700 }, { "epoch": 1.4209345455545384, "grad_norm": 2.3456294536590576, "learning_rate": 0.004485523400178198, "loss": 8.0445, "step": 348800 }, { "epoch": 1.4213419235779197, "grad_norm": 2.961416482925415, "learning_rate": 0.0044852216456039914, "loss": 8.0284, "step": 348900 }, { "epoch": 1.4217493016013012, "grad_norm": 4.189509391784668, "learning_rate": 0.0044849198127410285, "loss": 7.9987, "step": 349000 }, { "epoch": 1.4217493016013012, "eval_MaskedAccuracy": 0.4919331122905063, "eval_loss": 1.686869740486145, "eval_runtime": 245.5479, "eval_samples_per_second": 258.508, "eval_steps_per_second": 1.01, "step": 349000 }, { "epoch": 1.4221566796246825, "grad_norm": 4.080605506896973, "learning_rate": 0.004484617901601247, "loss": 8.0049, "step": 349100 }, { "epoch": 1.422564057648064, "grad_norm": 1.869256615638733, "learning_rate": 0.004484315912196584, "loss": 8.0349, "step": 349200 }, { "epoch": 1.4229714356714456, "grad_norm": 1.7196160554885864, "learning_rate": 0.004484013844538975, "loss": 8.0591, "step": 349300 }, { "epoch": 1.423378813694827, "grad_norm": 6.398401260375977, "learning_rate": 0.004483711698640363, "loss": 8.0571, "step": 349400 }, { "epoch": 1.4237861917182084, "grad_norm": 6.66635799407959, "learning_rate": 0.0044834094745126955, "loss": 8.0254, "step": 349500 }, { "epoch": 1.42419356974159, "grad_norm": 1.418727993965149, "learning_rate": 0.0044831071721679215, "loss": 8.0496, "step": 349600 }, { "epoch": 1.4246009477649713, "grad_norm": 2.5015978813171387, "learning_rate": 0.0044828047916179985, "loss": 8.0475, "step": 349700 }, { "epoch": 1.4250083257883528, "grad_norm": 1.1977105140686035, "learning_rate": 0.004482502332874867, "loss": 8.0083, "step": 349800 }, { "epoch": 1.4254157038117343, "grad_norm": 4.282104015350342, "learning_rate": 0.004482199795950487, "loss": 7.9986, "step": 349900 }, { "epoch": 1.4258230818351159, "grad_norm": 1.0990618467330933, "learning_rate": 0.004481897180856828, "loss": 8.0481, "step": 350000 }, { "epoch": 1.4258230818351159, "eval_MaskedAccuracy": 0.4905801972207007, "eval_loss": 1.6971557140350342, "eval_runtime": 232.8802, "eval_samples_per_second": 272.569, "eval_steps_per_second": 1.065, "step": 350000 }, { "epoch": 1.4262304598584972, "grad_norm": 2.1344125270843506, "learning_rate": 0.004481594487605856, "loss": 8.0476, "step": 350100 }, { "epoch": 1.4266378378818787, "grad_norm": 2.339507579803467, "learning_rate": 0.004481291716209531, "loss": 8.0124, "step": 350200 }, { "epoch": 1.4270452159052602, "grad_norm": 1.9598414897918701, "learning_rate": 0.004480988866679821, "loss": 7.977, "step": 350300 }, { "epoch": 1.4274525939286415, "grad_norm": 2.153442859649658, "learning_rate": 0.004480685939028701, "loss": 8.0344, "step": 350400 }, { "epoch": 1.427859971952023, "grad_norm": 3.163754463195801, "learning_rate": 0.004480382933268155, "loss": 8.0101, "step": 350500 }, { "epoch": 1.4282673499754046, "grad_norm": 0.8315991163253784, "learning_rate": 0.004480079849410152, "loss": 8.0767, "step": 350600 }, { "epoch": 1.4286747279987861, "grad_norm": 0.9263890385627747, "learning_rate": 0.00447977668746668, "loss": 8.0548, "step": 350700 }, { "epoch": 1.4290821060221675, "grad_norm": 2.093311071395874, "learning_rate": 0.0044794734474497225, "loss": 8.0203, "step": 350800 }, { "epoch": 1.429489484045549, "grad_norm": 5.483651638031006, "learning_rate": 0.004479170129371276, "loss": 8.048, "step": 350900 }, { "epoch": 1.4298968620689303, "grad_norm": 1.5579036474227905, "learning_rate": 0.004478866733243319, "loss": 8.0299, "step": 351000 }, { "epoch": 1.4298968620689303, "eval_MaskedAccuracy": 0.49019804182762994, "eval_loss": 1.6904391050338745, "eval_runtime": 483.3898, "eval_samples_per_second": 131.314, "eval_steps_per_second": 0.513, "step": 351000 }, { "epoch": 1.4303042400923118, "grad_norm": 2.5887644290924072, "learning_rate": 0.0044785632590778526, "loss": 8.0062, "step": 351100 }, { "epoch": 1.4307116181156934, "grad_norm": 4.814824104309082, "learning_rate": 0.0044782597068868755, "loss": 8.0206, "step": 351200 }, { "epoch": 1.431118996139075, "grad_norm": 0.8506344556808472, "learning_rate": 0.004477956076682388, "loss": 8.0234, "step": 351300 }, { "epoch": 1.4315263741624562, "grad_norm": 2.3943898677825928, "learning_rate": 0.004477652368476392, "loss": 8.0364, "step": 351400 }, { "epoch": 1.4319337521858377, "grad_norm": 1.785700798034668, "learning_rate": 0.004477348582280897, "loss": 8.0473, "step": 351500 }, { "epoch": 1.432341130209219, "grad_norm": 3.400876045227051, "learning_rate": 0.00447704471810792, "loss": 8.1064, "step": 351600 }, { "epoch": 1.4327485082326006, "grad_norm": 2.6870365142822266, "learning_rate": 0.004476740775969462, "loss": 8.0594, "step": 351700 }, { "epoch": 1.4331558862559821, "grad_norm": 1.729146122932434, "learning_rate": 0.004476436755877546, "loss": 8.0338, "step": 351800 }, { "epoch": 1.4335632642793636, "grad_norm": 1.1923850774765015, "learning_rate": 0.00447613265784419, "loss": 8.0226, "step": 351900 }, { "epoch": 1.433970642302745, "grad_norm": 2.12165904045105, "learning_rate": 0.004475828481881418, "loss": 8.02, "step": 352000 }, { "epoch": 1.433970642302745, "eval_MaskedAccuracy": 0.4926593809327485, "eval_loss": 1.6839932203292847, "eval_runtime": 168.0703, "eval_samples_per_second": 377.675, "eval_steps_per_second": 1.476, "step": 352000 }, { "epoch": 1.4343780203261265, "grad_norm": 3.7795934677124023, "learning_rate": 0.004475524228001263, "loss": 7.9862, "step": 352100 }, { "epoch": 1.4347853983495078, "grad_norm": 4.6571574211120605, "learning_rate": 0.004475219896215741, "loss": 7.9855, "step": 352200 }, { "epoch": 1.4351927763728893, "grad_norm": 4.111793518066406, "learning_rate": 0.004474915486536891, "loss": 8.0121, "step": 352300 }, { "epoch": 1.4356001543962709, "grad_norm": 1.6061862707138062, "learning_rate": 0.004474610998976745, "loss": 7.983, "step": 352400 }, { "epoch": 1.4360075324196524, "grad_norm": 1.8522026538848877, "learning_rate": 0.004474306433547344, "loss": 8.01, "step": 352500 }, { "epoch": 1.4364149104430337, "grad_norm": 2.6164515018463135, "learning_rate": 0.00447400179026072, "loss": 8.0455, "step": 352600 }, { "epoch": 1.4368222884664152, "grad_norm": 3.0808162689208984, "learning_rate": 0.00447369706912893, "loss": 8.0419, "step": 352700 }, { "epoch": 1.4372296664897968, "grad_norm": 5.4039812088012695, "learning_rate": 0.00447339227016402, "loss": 8.0127, "step": 352800 }, { "epoch": 1.437637044513178, "grad_norm": 1.473721981048584, "learning_rate": 0.004473087393378034, "loss": 8.0568, "step": 352900 }, { "epoch": 1.4380444225365596, "grad_norm": 2.6627840995788574, "learning_rate": 0.0044727824387830175, "loss": 8.037, "step": 353000 }, { "epoch": 1.4380444225365596, "eval_MaskedAccuracy": 0.49035301162574, "eval_loss": 1.6992812156677246, "eval_runtime": 171.5562, "eval_samples_per_second": 370.001, "eval_steps_per_second": 1.446, "step": 353000 }, { "epoch": 1.4384518005599412, "grad_norm": 1.1248985528945923, "learning_rate": 0.004472477406391048, "loss": 8.0544, "step": 353100 }, { "epoch": 1.4388591785833227, "grad_norm": 3.9796719551086426, "learning_rate": 0.004472172296214165, "loss": 8.027, "step": 353200 }, { "epoch": 1.439266556606704, "grad_norm": 2.6582648754119873, "learning_rate": 0.004471867108264448, "loss": 8.0448, "step": 353300 }, { "epoch": 1.4396739346300855, "grad_norm": 1.5737559795379639, "learning_rate": 0.004471561842553949, "loss": 7.9911, "step": 353400 }, { "epoch": 1.4400813126534668, "grad_norm": 1.3840856552124023, "learning_rate": 0.004471256499094746, "loss": 8.0141, "step": 353500 }, { "epoch": 1.4404886906768484, "grad_norm": 1.7901232242584229, "learning_rate": 0.004470951077898908, "loss": 8.0552, "step": 353600 }, { "epoch": 1.44089606870023, "grad_norm": 2.631977081298828, "learning_rate": 0.004470645578978506, "loss": 8.0119, "step": 353700 }, { "epoch": 1.4413034467236114, "grad_norm": 3.0884792804718018, "learning_rate": 0.004470340002345623, "loss": 8.0113, "step": 353800 }, { "epoch": 1.4417108247469927, "grad_norm": 2.1293485164642334, "learning_rate": 0.0044700343480123415, "loss": 7.9717, "step": 353900 }, { "epoch": 1.4421182027703743, "grad_norm": 2.5049309730529785, "learning_rate": 0.0044697286159907385, "loss": 8.0098, "step": 354000 }, { "epoch": 1.4421182027703743, "eval_MaskedAccuracy": 0.49255391146316696, "eval_loss": 1.674404501914978, "eval_runtime": 162.3384, "eval_samples_per_second": 391.01, "eval_steps_per_second": 1.528, "step": 354000 }, { "epoch": 1.4425255807937556, "grad_norm": 4.3438191413879395, "learning_rate": 0.004469422806292906, "loss": 8.0276, "step": 354100 }, { "epoch": 1.4429329588171371, "grad_norm": 2.7198264598846436, "learning_rate": 0.004469116918930937, "loss": 8.0444, "step": 354200 }, { "epoch": 1.4433403368405187, "grad_norm": 2.4105312824249268, "learning_rate": 0.0044688109539169196, "loss": 8.0014, "step": 354300 }, { "epoch": 1.4437477148639002, "grad_norm": 3.454275131225586, "learning_rate": 0.0044685049112629495, "loss": 8.0121, "step": 354400 }, { "epoch": 1.4441550928872815, "grad_norm": 5.2834153175354, "learning_rate": 0.004468198790981126, "loss": 8.0129, "step": 354500 }, { "epoch": 1.444562470910663, "grad_norm": 4.581850528717041, "learning_rate": 0.004467892593083559, "loss": 8.0206, "step": 354600 }, { "epoch": 1.4449698489340443, "grad_norm": 3.5770163536071777, "learning_rate": 0.004467586317582348, "loss": 8.0247, "step": 354700 }, { "epoch": 1.4453772269574259, "grad_norm": 3.2041547298431396, "learning_rate": 0.004467279964489606, "loss": 8.0195, "step": 354800 }, { "epoch": 1.4457846049808074, "grad_norm": 3.5487098693847656, "learning_rate": 0.004466973533817442, "loss": 8.0313, "step": 354900 }, { "epoch": 1.446191983004189, "grad_norm": 3.3173043727874756, "learning_rate": 0.004466667025577968, "loss": 8.0444, "step": 355000 }, { "epoch": 1.446191983004189, "eval_MaskedAccuracy": 0.4920171257509205, "eval_loss": 1.688253402709961, "eval_runtime": 207.1816, "eval_samples_per_second": 306.379, "eval_steps_per_second": 1.197, "step": 355000 }, { "epoch": 1.4465993610275703, "grad_norm": 7.595371246337891, "learning_rate": 0.004466360439783299, "loss": 8.0146, "step": 355100 }, { "epoch": 1.4470067390509518, "grad_norm": 2.420732259750366, "learning_rate": 0.004466053776445564, "loss": 8.0344, "step": 355200 }, { "epoch": 1.4474141170743333, "grad_norm": 1.8066754341125488, "learning_rate": 0.004465747035576886, "loss": 7.9927, "step": 355300 }, { "epoch": 1.4478214950977146, "grad_norm": 2.813026189804077, "learning_rate": 0.004465440217189393, "loss": 8.0021, "step": 355400 }, { "epoch": 1.4482288731210962, "grad_norm": 1.8070228099822998, "learning_rate": 0.0044651333212952105, "loss": 7.9979, "step": 355500 }, { "epoch": 1.4486362511444777, "grad_norm": 3.373173713684082, "learning_rate": 0.004464826347906479, "loss": 8.0092, "step": 355600 }, { "epoch": 1.4490436291678592, "grad_norm": 1.6938971281051636, "learning_rate": 0.004464519297035334, "loss": 8.0332, "step": 355700 }, { "epoch": 1.4494510071912405, "grad_norm": 2.879782199859619, "learning_rate": 0.004464212168693899, "loss": 8.0537, "step": 355800 }, { "epoch": 1.449858385214622, "grad_norm": 3.2696335315704346, "learning_rate": 0.004463904962894334, "loss": 8.0623, "step": 355900 }, { "epoch": 1.4502657632380034, "grad_norm": 3.788130521774292, "learning_rate": 0.004463597679648781, "loss": 8.0341, "step": 356000 }, { "epoch": 1.4502657632380034, "eval_MaskedAccuracy": 0.4898133608425122, "eval_loss": 1.6999173164367676, "eval_runtime": 182.9109, "eval_samples_per_second": 347.032, "eval_steps_per_second": 1.356, "step": 356000 }, { "epoch": 1.450673141261385, "grad_norm": 4.155025005340576, "learning_rate": 0.004463290318969386, "loss": 8.0388, "step": 356100 }, { "epoch": 1.4510805192847664, "grad_norm": 5.0551676750183105, "learning_rate": 0.004462982880868299, "loss": 7.9944, "step": 356200 }, { "epoch": 1.451487897308148, "grad_norm": 3.779100179672241, "learning_rate": 0.004462675365357674, "loss": 8.0413, "step": 356300 }, { "epoch": 1.4518952753315293, "grad_norm": 3.9221200942993164, "learning_rate": 0.00446236777244968, "loss": 7.9957, "step": 356400 }, { "epoch": 1.4523026533549108, "grad_norm": 4.456353187561035, "learning_rate": 0.004462060102156466, "loss": 7.9959, "step": 356500 }, { "epoch": 1.4527100313782921, "grad_norm": 4.030895709991455, "learning_rate": 0.004461752354490196, "loss": 7.9711, "step": 356600 }, { "epoch": 1.4531174094016737, "grad_norm": 4.752027988433838, "learning_rate": 0.004461444529463043, "loss": 7.9674, "step": 356700 }, { "epoch": 1.4535247874250552, "grad_norm": 4.187869548797607, "learning_rate": 0.004461136627087173, "loss": 7.9886, "step": 356800 }, { "epoch": 1.4539321654484367, "grad_norm": 2.539093017578125, "learning_rate": 0.004460828647374755, "loss": 7.9884, "step": 356900 }, { "epoch": 1.454339543471818, "grad_norm": 6.668517589569092, "learning_rate": 0.004460520590337975, "loss": 8.0211, "step": 357000 }, { "epoch": 1.454339543471818, "eval_MaskedAccuracy": 0.48822290326397216, "eval_loss": 1.6956151723861694, "eval_runtime": 230.3796, "eval_samples_per_second": 275.528, "eval_steps_per_second": 1.076, "step": 357000 }, { "epoch": 1.4547469214951996, "grad_norm": 2.4861207008361816, "learning_rate": 0.004460212455989005, "loss": 8.0312, "step": 357100 }, { "epoch": 1.4551542995185809, "grad_norm": 3.670663833618164, "learning_rate": 0.0044599042443400315, "loss": 8.0086, "step": 357200 }, { "epoch": 1.4555616775419624, "grad_norm": 2.3406200408935547, "learning_rate": 0.004459595955403238, "loss": 8.003, "step": 357300 }, { "epoch": 1.455969055565344, "grad_norm": 1.8610780239105225, "learning_rate": 0.004459287589190809, "loss": 7.9989, "step": 357400 }, { "epoch": 1.4563764335887255, "grad_norm": 1.2680808305740356, "learning_rate": 0.004458979145714939, "loss": 8.0225, "step": 357500 }, { "epoch": 1.4567838116121068, "grad_norm": 4.370670318603516, "learning_rate": 0.004458670624987823, "loss": 7.9976, "step": 357600 }, { "epoch": 1.4571911896354883, "grad_norm": 4.670384883880615, "learning_rate": 0.004458362027021667, "loss": 8.0201, "step": 357700 }, { "epoch": 1.4575985676588699, "grad_norm": 2.720135450363159, "learning_rate": 0.004458053351828655, "loss": 8.0347, "step": 357800 }, { "epoch": 1.4580059456822512, "grad_norm": 2.43333101272583, "learning_rate": 0.0044577445994209985, "loss": 7.977, "step": 357900 }, { "epoch": 1.4584133237056327, "grad_norm": 1.0447818040847778, "learning_rate": 0.0044574357698109034, "loss": 8.0378, "step": 358000 }, { "epoch": 1.4584133237056327, "eval_MaskedAccuracy": 0.4895248275655935, "eval_loss": 1.6998313665390015, "eval_runtime": 178.2771, "eval_samples_per_second": 356.052, "eval_steps_per_second": 1.391, "step": 358000 }, { "epoch": 1.4588207017290142, "grad_norm": 3.34891414642334, "learning_rate": 0.00445712686301058, "loss": 8.0238, "step": 358100 }, { "epoch": 1.4592280797523958, "grad_norm": 2.1483287811279297, "learning_rate": 0.004456817879032248, "loss": 8.0378, "step": 358200 }, { "epoch": 1.459635457775777, "grad_norm": 7.006333351135254, "learning_rate": 0.004456508817888111, "loss": 7.9972, "step": 358300 }, { "epoch": 1.4600428357991586, "grad_norm": 0.8216266632080078, "learning_rate": 0.004456199679590388, "loss": 8.042, "step": 358400 }, { "epoch": 1.46045021382254, "grad_norm": 4.701449394226074, "learning_rate": 0.004455890464151309, "loss": 8.0216, "step": 358500 }, { "epoch": 1.4608575918459215, "grad_norm": 3.473137855529785, "learning_rate": 0.004455581171583096, "loss": 8.0026, "step": 358600 }, { "epoch": 1.461264969869303, "grad_norm": 6.207772254943848, "learning_rate": 0.0044552718018979805, "loss": 8.0614, "step": 358700 }, { "epoch": 1.4616723478926845, "grad_norm": 2.560777425765991, "learning_rate": 0.004454962355108189, "loss": 8.0169, "step": 358800 }, { "epoch": 1.4620797259160658, "grad_norm": 1.8854252099990845, "learning_rate": 0.004454652831225954, "loss": 8.0301, "step": 358900 }, { "epoch": 1.4624871039394474, "grad_norm": 4.9735188484191895, "learning_rate": 0.004454343230263511, "loss": 8.003, "step": 359000 }, { "epoch": 1.4624871039394474, "eval_MaskedAccuracy": 0.48814781740974716, "eval_loss": 1.6938985586166382, "eval_runtime": 279.0939, "eval_samples_per_second": 227.436, "eval_steps_per_second": 0.889, "step": 359000 }, { "epoch": 1.4628944819628287, "grad_norm": 2.5394465923309326, "learning_rate": 0.004454033552233106, "loss": 8.0566, "step": 359100 }, { "epoch": 1.4633018599862102, "grad_norm": 4.700394153594971, "learning_rate": 0.004453723797146984, "loss": 8.0367, "step": 359200 }, { "epoch": 1.4637092380095917, "grad_norm": 3.184756278991699, "learning_rate": 0.004453413965017392, "loss": 8.0044, "step": 359300 }, { "epoch": 1.4641166160329733, "grad_norm": 3.3647334575653076, "learning_rate": 0.0044531040558565705, "loss": 7.9719, "step": 359400 }, { "epoch": 1.4645239940563546, "grad_norm": 3.093984603881836, "learning_rate": 0.004452794069676775, "loss": 8.021, "step": 359500 }, { "epoch": 1.4649313720797361, "grad_norm": 1.4502638578414917, "learning_rate": 0.0044524840064902645, "loss": 7.9956, "step": 359600 }, { "epoch": 1.4653387501031174, "grad_norm": 4.267673015594482, "learning_rate": 0.004452173866309295, "loss": 8.0186, "step": 359700 }, { "epoch": 1.465746128126499, "grad_norm": 8.67224407196045, "learning_rate": 0.004451863649146126, "loss": 8.0116, "step": 359800 }, { "epoch": 1.4661535061498805, "grad_norm": 3.730142116546631, "learning_rate": 0.004451553355013028, "loss": 8.0154, "step": 359900 }, { "epoch": 1.466560884173262, "grad_norm": 2.6251790523529053, "learning_rate": 0.00445124298392226, "loss": 7.9937, "step": 360000 }, { "epoch": 1.466560884173262, "eval_MaskedAccuracy": 0.4928633534091767, "eval_loss": 1.688250184059143, "eval_runtime": 179.4596, "eval_samples_per_second": 353.706, "eval_steps_per_second": 1.382, "step": 360000 }, { "epoch": 1.4669682621966433, "grad_norm": 4.280770301818848, "learning_rate": 0.004450932535886099, "loss": 8.0097, "step": 360100 }, { "epoch": 1.4673756402200249, "grad_norm": 10.832054138183594, "learning_rate": 0.004450622010916824, "loss": 7.9814, "step": 360200 }, { "epoch": 1.4677830182434064, "grad_norm": 4.457239151000977, "learning_rate": 0.004450311409026702, "loss": 8.0166, "step": 360300 }, { "epoch": 1.4681903962667877, "grad_norm": 5.342009544372559, "learning_rate": 0.004450000730228014, "loss": 8.0293, "step": 360400 }, { "epoch": 1.4685977742901692, "grad_norm": 5.352954387664795, "learning_rate": 0.004449689974533045, "loss": 8.0095, "step": 360500 }, { "epoch": 1.4690051523135508, "grad_norm": 6.877140998840332, "learning_rate": 0.004449379141954084, "loss": 7.9923, "step": 360600 }, { "epoch": 1.4694125303369323, "grad_norm": 2.1410417556762695, "learning_rate": 0.004449068232503411, "loss": 8.0304, "step": 360700 }, { "epoch": 1.4698199083603136, "grad_norm": 1.1315438747406006, "learning_rate": 0.004448757246193324, "loss": 8.0495, "step": 360800 }, { "epoch": 1.4702272863836952, "grad_norm": 2.2842488288879395, "learning_rate": 0.004448446183036121, "loss": 8.021, "step": 360900 }, { "epoch": 1.4706346644070765, "grad_norm": 2.7832038402557373, "learning_rate": 0.0044481350430440915, "loss": 7.9993, "step": 361000 }, { "epoch": 1.4706346644070765, "eval_MaskedAccuracy": 0.49182212582889695, "eval_loss": 1.680207371711731, "eval_runtime": 224.7039, "eval_samples_per_second": 282.487, "eval_steps_per_second": 1.104, "step": 361000 }, { "epoch": 1.471042042430458, "grad_norm": 2.329878807067871, "learning_rate": 0.0044478238262295475, "loss": 8.0567, "step": 361100 }, { "epoch": 1.4714494204538395, "grad_norm": 1.3917580842971802, "learning_rate": 0.004447512532604777, "loss": 8.0241, "step": 361200 }, { "epoch": 1.471856798477221, "grad_norm": 3.8327815532684326, "learning_rate": 0.004447201162182104, "loss": 8.0266, "step": 361300 }, { "epoch": 1.4722641765006024, "grad_norm": 6.694658279418945, "learning_rate": 0.004446889714973832, "loss": 8.0424, "step": 361400 }, { "epoch": 1.472671554523984, "grad_norm": 3.1367924213409424, "learning_rate": 0.00444657819099227, "loss": 8.003, "step": 361500 }, { "epoch": 1.4730789325473652, "grad_norm": 3.245832920074463, "learning_rate": 0.0044462665902497405, "loss": 8.0443, "step": 361600 }, { "epoch": 1.4734863105707467, "grad_norm": 4.566452503204346, "learning_rate": 0.004445954912758557, "loss": 8.0364, "step": 361700 }, { "epoch": 1.4738936885941283, "grad_norm": 1.9279391765594482, "learning_rate": 0.004445643158531048, "loss": 8.0695, "step": 361800 }, { "epoch": 1.4743010666175098, "grad_norm": 3.4402527809143066, "learning_rate": 0.004445331327579535, "loss": 8.0358, "step": 361900 }, { "epoch": 1.4747084446408911, "grad_norm": 4.117164611816406, "learning_rate": 0.004445019419916347, "loss": 8.0099, "step": 362000 }, { "epoch": 1.4747084446408911, "eval_MaskedAccuracy": 0.49113621811736785, "eval_loss": 1.6913385391235352, "eval_runtime": 199.2783, "eval_samples_per_second": 318.529, "eval_steps_per_second": 1.244, "step": 362000 }, { "epoch": 1.4751158226642727, "grad_norm": 2.392179012298584, "learning_rate": 0.004444707435553814, "loss": 8.0438, "step": 362100 }, { "epoch": 1.475523200687654, "grad_norm": 2.4375250339508057, "learning_rate": 0.0044443953745042656, "loss": 8.0186, "step": 362200 }, { "epoch": 1.4759305787110355, "grad_norm": 1.4311915636062622, "learning_rate": 0.004444083236780047, "loss": 8.0084, "step": 362300 }, { "epoch": 1.476337956734417, "grad_norm": 4.306060791015625, "learning_rate": 0.004443771022393504, "loss": 8.0104, "step": 362400 }, { "epoch": 1.4767453347577986, "grad_norm": 3.543065071105957, "learning_rate": 0.004443458731356971, "loss": 7.9741, "step": 362500 }, { "epoch": 1.4771527127811799, "grad_norm": 4.2835798263549805, "learning_rate": 0.004443146363682799, "loss": 7.9911, "step": 362600 }, { "epoch": 1.4775600908045614, "grad_norm": 1.9651813507080078, "learning_rate": 0.004442833919383335, "loss": 8.0053, "step": 362700 }, { "epoch": 1.477967468827943, "grad_norm": 3.219658851623535, "learning_rate": 0.00444252139847093, "loss": 8.0154, "step": 362800 }, { "epoch": 1.4783748468513243, "grad_norm": 4.614295959472656, "learning_rate": 0.0044422088009579445, "loss": 7.9951, "step": 362900 }, { "epoch": 1.4787822248747058, "grad_norm": 5.121086597442627, "learning_rate": 0.004441896126856728, "loss": 8.0194, "step": 363000 }, { "epoch": 1.4787822248747058, "eval_MaskedAccuracy": 0.49050609975653975, "eval_loss": 1.6859321594238281, "eval_runtime": 204.7747, "eval_samples_per_second": 309.98, "eval_steps_per_second": 1.211, "step": 363000 }, { "epoch": 1.4791896028980873, "grad_norm": 5.913979530334473, "learning_rate": 0.004441583376179654, "loss": 8.0509, "step": 363100 }, { "epoch": 1.4795969809214689, "grad_norm": 4.092564582824707, "learning_rate": 0.004441270548939076, "loss": 8.0319, "step": 363200 }, { "epoch": 1.4800043589448502, "grad_norm": 1.9727094173431396, "learning_rate": 0.004440957645147368, "loss": 7.981, "step": 363300 }, { "epoch": 1.4804117369682317, "grad_norm": 2.3226356506347656, "learning_rate": 0.004440644664816902, "loss": 7.9914, "step": 363400 }, { "epoch": 1.480819114991613, "grad_norm": 5.634875297546387, "learning_rate": 0.004440331607960047, "loss": 7.978, "step": 363500 }, { "epoch": 1.4812264930149945, "grad_norm": 1.1887385845184326, "learning_rate": 0.004440018474589184, "loss": 8.0349, "step": 363600 }, { "epoch": 1.481633871038376, "grad_norm": 3.598142147064209, "learning_rate": 0.004439705264716694, "loss": 8.033, "step": 363700 }, { "epoch": 1.4820412490617576, "grad_norm": 1.807037115097046, "learning_rate": 0.004439391978354955, "loss": 8.0413, "step": 363800 }, { "epoch": 1.482448627085139, "grad_norm": 3.9683003425598145, "learning_rate": 0.004439078615516356, "loss": 8.0459, "step": 363900 }, { "epoch": 1.4828560051085204, "grad_norm": 5.336919784545898, "learning_rate": 0.004438765176213286, "loss": 8.0241, "step": 364000 }, { "epoch": 1.4828560051085204, "eval_MaskedAccuracy": 0.4891500509316086, "eval_loss": 1.691572904586792, "eval_runtime": 172.8148, "eval_samples_per_second": 367.306, "eval_steps_per_second": 1.435, "step": 364000 }, { "epoch": 1.4832633831319018, "grad_norm": 2.170383930206299, "learning_rate": 0.004438451660458131, "loss": 7.9796, "step": 364100 }, { "epoch": 1.4836707611552833, "grad_norm": 1.0361725091934204, "learning_rate": 0.0044381380682632955, "loss": 8.0232, "step": 364200 }, { "epoch": 1.4840781391786648, "grad_norm": 2.4752933979034424, "learning_rate": 0.004437824399641173, "loss": 8.0235, "step": 364300 }, { "epoch": 1.4844855172020464, "grad_norm": 2.0901341438293457, "learning_rate": 0.004437510654604171, "loss": 8.0367, "step": 364400 }, { "epoch": 1.4848928952254277, "grad_norm": 3.661705255508423, "learning_rate": 0.004437196833164682, "loss": 8.0061, "step": 364500 }, { "epoch": 1.4853002732488092, "grad_norm": 1.7246965169906616, "learning_rate": 0.004436882935335121, "loss": 8.0055, "step": 364600 }, { "epoch": 1.4857076512721905, "grad_norm": 2.910794258117676, "learning_rate": 0.004436568961127888, "loss": 8.0041, "step": 364700 }, { "epoch": 1.486115029295572, "grad_norm": 2.564167022705078, "learning_rate": 0.004436254910555408, "loss": 7.9882, "step": 364800 }, { "epoch": 1.4865224073189536, "grad_norm": 1.9633032083511353, "learning_rate": 0.00443594078363009, "loss": 8.0237, "step": 364900 }, { "epoch": 1.486929785342335, "grad_norm": 1.481297492980957, "learning_rate": 0.004435626580364358, "loss": 8.0262, "step": 365000 }, { "epoch": 1.486929785342335, "eval_MaskedAccuracy": 0.4919499401216122, "eval_loss": 1.6813246011734009, "eval_runtime": 202.8804, "eval_samples_per_second": 312.874, "eval_steps_per_second": 1.222, "step": 365000 }, { "epoch": 1.4873371633657164, "grad_norm": 2.3210670948028564, "learning_rate": 0.004435312300770635, "loss": 8.0464, "step": 365100 }, { "epoch": 1.487744541389098, "grad_norm": 4.390432834625244, "learning_rate": 0.004434997944861334, "loss": 8.0521, "step": 365200 }, { "epoch": 1.4881519194124795, "grad_norm": 1.854626178741455, "learning_rate": 0.004434683512648892, "loss": 8.0288, "step": 365300 }, { "epoch": 1.4885592974358608, "grad_norm": 1.3176974058151245, "learning_rate": 0.00443436900414574, "loss": 8.0144, "step": 365400 }, { "epoch": 1.4889666754592423, "grad_norm": 1.8628607988357544, "learning_rate": 0.00443405441936432, "loss": 7.9832, "step": 365500 }, { "epoch": 1.4893740534826239, "grad_norm": 2.8778035640716553, "learning_rate": 0.004433739758317057, "loss": 8.0054, "step": 365600 }, { "epoch": 1.4897814315060054, "grad_norm": 3.7729175090789795, "learning_rate": 0.004433425021016393, "loss": 8.0231, "step": 365700 }, { "epoch": 1.4901888095293867, "grad_norm": 2.486950397491455, "learning_rate": 0.004433110207474778, "loss": 7.9779, "step": 365800 }, { "epoch": 1.4905961875527682, "grad_norm": 1.9985967874526978, "learning_rate": 0.004432795317704664, "loss": 8.0286, "step": 365900 }, { "epoch": 1.4910035655761495, "grad_norm": 1.8332232236862183, "learning_rate": 0.004432480351718478, "loss": 8.0113, "step": 366000 }, { "epoch": 1.4910035655761495, "eval_MaskedAccuracy": 0.49197204558073937, "eval_loss": 1.6859568357467651, "eval_runtime": 165.0848, "eval_samples_per_second": 384.506, "eval_steps_per_second": 1.502, "step": 366000 }, { "epoch": 1.491410943599531, "grad_norm": 3.6302599906921387, "learning_rate": 0.004432165309528689, "loss": 8.0024, "step": 366100 }, { "epoch": 1.4918183216229126, "grad_norm": 4.383216857910156, "learning_rate": 0.004431850191147746, "loss": 7.9696, "step": 366200 }, { "epoch": 1.4922256996462941, "grad_norm": 3.673788070678711, "learning_rate": 0.004431534996588106, "loss": 8.0354, "step": 366300 }, { "epoch": 1.4926330776696755, "grad_norm": 2.768078565597534, "learning_rate": 0.004431219725862236, "loss": 8.0335, "step": 366400 }, { "epoch": 1.493040455693057, "grad_norm": 4.210900783538818, "learning_rate": 0.004430904378982597, "loss": 7.9617, "step": 366500 }, { "epoch": 1.4934478337164383, "grad_norm": 5.263278484344482, "learning_rate": 0.004430588955961656, "loss": 7.9852, "step": 366600 }, { "epoch": 1.4938552117398198, "grad_norm": 1.806351900100708, "learning_rate": 0.004430273456811887, "loss": 8.016, "step": 366700 }, { "epoch": 1.4942625897632014, "grad_norm": 7.461493968963623, "learning_rate": 0.004429957881545764, "loss": 8.0104, "step": 366800 }, { "epoch": 1.494669967786583, "grad_norm": 1.170372724533081, "learning_rate": 0.00442964223017576, "loss": 8.0552, "step": 366900 }, { "epoch": 1.4950773458099642, "grad_norm": 2.6418955326080322, "learning_rate": 0.004429326502714353, "loss": 8.0102, "step": 367000 }, { "epoch": 1.4950773458099642, "eval_MaskedAccuracy": 0.4934207540611734, "eval_loss": 1.673243761062622, "eval_runtime": 262.6899, "eval_samples_per_second": 241.639, "eval_steps_per_second": 0.944, "step": 367000 }, { "epoch": 1.4954847238333457, "grad_norm": 1.5339170694351196, "learning_rate": 0.004429010699174032, "loss": 8.0161, "step": 367100 }, { "epoch": 1.495892101856727, "grad_norm": 3.244283676147461, "learning_rate": 0.004428694819567276, "loss": 7.9964, "step": 367200 }, { "epoch": 1.4962994798801086, "grad_norm": 3.9228017330169678, "learning_rate": 0.004428378863906572, "loss": 8.0588, "step": 367300 }, { "epoch": 1.4967068579034901, "grad_norm": 4.916393756866455, "learning_rate": 0.004428062832204412, "loss": 8.0278, "step": 367400 }, { "epoch": 1.4971142359268717, "grad_norm": 6.785499095916748, "learning_rate": 0.004427746724473295, "loss": 8.0251, "step": 367500 }, { "epoch": 1.497521613950253, "grad_norm": 1.6612313985824585, "learning_rate": 0.004427430540725714, "loss": 8.0086, "step": 367600 }, { "epoch": 1.4979289919736345, "grad_norm": 3.4228081703186035, "learning_rate": 0.0044271142809741764, "loss": 8.0555, "step": 367700 }, { "epoch": 1.498336369997016, "grad_norm": 5.840153217315674, "learning_rate": 0.004426797945231178, "loss": 7.9652, "step": 367800 }, { "epoch": 1.4987437480203973, "grad_norm": 5.135745048522949, "learning_rate": 0.004426481533509228, "loss": 8.0224, "step": 367900 }, { "epoch": 1.4991511260437789, "grad_norm": 6.537227153778076, "learning_rate": 0.004426165045820837, "loss": 8.0252, "step": 368000 }, { "epoch": 1.4991511260437789, "eval_MaskedAccuracy": 0.488887823402608, "eval_loss": 1.701386570930481, "eval_runtime": 204.1211, "eval_samples_per_second": 310.972, "eval_steps_per_second": 1.215, "step": 368000 }, { "epoch": 1.4995585040671604, "grad_norm": 4.002716541290283, "learning_rate": 0.004425848482178517, "loss": 8.0073, "step": 368100 }, { "epoch": 1.499965882090542, "grad_norm": 3.9338126182556152, "learning_rate": 0.0044255318425947855, "loss": 8.0021, "step": 368200 }, { "epoch": 1.5003732601139232, "grad_norm": 1.7457914352416992, "learning_rate": 0.004425215127082153, "loss": 8.0395, "step": 368300 }, { "epoch": 1.5007806381373048, "grad_norm": 2.6222951412200928, "learning_rate": 0.004424898335653149, "loss": 8.0063, "step": 368400 }, { "epoch": 1.501188016160686, "grad_norm": 2.925283193588257, "learning_rate": 0.004424581468320298, "loss": 8.0165, "step": 368500 }, { "epoch": 1.5015953941840676, "grad_norm": 1.6769073009490967, "learning_rate": 0.004424264525096125, "loss": 8.0312, "step": 368600 }, { "epoch": 1.5020027722074492, "grad_norm": 3.1565895080566406, "learning_rate": 0.004423947505993157, "loss": 8.0026, "step": 368700 }, { "epoch": 1.5024101502308307, "grad_norm": 2.3908684253692627, "learning_rate": 0.004423630411023935, "loss": 8.0234, "step": 368800 }, { "epoch": 1.502817528254212, "grad_norm": 3.7312817573547363, "learning_rate": 0.00442331324020099, "loss": 8.0064, "step": 368900 }, { "epoch": 1.5032249062775935, "grad_norm": 3.0165467262268066, "learning_rate": 0.004422995993536863, "loss": 8.0322, "step": 369000 }, { "epoch": 1.5032249062775935, "eval_MaskedAccuracy": 0.4903636892160923, "eval_loss": 1.6973485946655273, "eval_runtime": 170.173, "eval_samples_per_second": 373.009, "eval_steps_per_second": 1.457, "step": 369000 }, { "epoch": 1.5036322843009748, "grad_norm": 1.907222867012024, "learning_rate": 0.004422678671044098, "loss": 7.9781, "step": 369100 }, { "epoch": 1.5040396623243564, "grad_norm": 5.6811065673828125, "learning_rate": 0.004422361272735235, "loss": 8.0059, "step": 369200 }, { "epoch": 1.504447040347738, "grad_norm": 3.2971274852752686, "learning_rate": 0.004422043798622829, "loss": 8.0166, "step": 369300 }, { "epoch": 1.5048544183711194, "grad_norm": 1.2491105794906616, "learning_rate": 0.004421726248719429, "loss": 7.9775, "step": 369400 }, { "epoch": 1.5052617963945008, "grad_norm": 3.353694200515747, "learning_rate": 0.0044214086230375885, "loss": 8.0086, "step": 369500 }, { "epoch": 1.5056691744178823, "grad_norm": 1.343021273612976, "learning_rate": 0.0044210909215898694, "loss": 8.0073, "step": 369600 }, { "epoch": 1.5060765524412636, "grad_norm": 4.270023345947266, "learning_rate": 0.004420773144388821, "loss": 8.0072, "step": 369700 }, { "epoch": 1.5064839304646451, "grad_norm": 3.398383855819702, "learning_rate": 0.00442045529144702, "loss": 7.9974, "step": 369800 }, { "epoch": 1.5068913084880267, "grad_norm": 5.190047264099121, "learning_rate": 0.004420137362777025, "loss": 8.0058, "step": 369900 }, { "epoch": 1.5072986865114082, "grad_norm": 6.810204982757568, "learning_rate": 0.004419819358391406, "loss": 8.008, "step": 370000 }, { "epoch": 1.5072986865114082, "eval_MaskedAccuracy": 0.4904813325585378, "eval_loss": 1.6939561367034912, "eval_runtime": 181.4744, "eval_samples_per_second": 349.779, "eval_steps_per_second": 1.367, "step": 370000 }, { "epoch": 1.5077060645347897, "grad_norm": 1.2358757257461548, "learning_rate": 0.004419501278302739, "loss": 7.9975, "step": 370100 }, { "epoch": 1.508113442558171, "grad_norm": 1.7502859830856323, "learning_rate": 0.004419183122523591, "loss": 7.9901, "step": 370200 }, { "epoch": 1.5085208205815523, "grad_norm": 3.5640110969543457, "learning_rate": 0.004418864891066549, "loss": 8.0361, "step": 370300 }, { "epoch": 1.5089281986049339, "grad_norm": 7.9837493896484375, "learning_rate": 0.00441854658394419, "loss": 8.0115, "step": 370400 }, { "epoch": 1.5093355766283154, "grad_norm": 4.1491780281066895, "learning_rate": 0.004418228201169104, "loss": 8.0016, "step": 370500 }, { "epoch": 1.509742954651697, "grad_norm": 5.766002655029297, "learning_rate": 0.004417909742753878, "loss": 7.989, "step": 370600 }, { "epoch": 1.5101503326750785, "grad_norm": 3.899489164352417, "learning_rate": 0.004417591208711096, "loss": 8.0149, "step": 370700 }, { "epoch": 1.5105577106984598, "grad_norm": 8.60437297821045, "learning_rate": 0.004417272599053358, "loss": 7.9866, "step": 370800 }, { "epoch": 1.5109650887218413, "grad_norm": 1.5715206861495972, "learning_rate": 0.00441695391379325, "loss": 7.9908, "step": 370900 }, { "epoch": 1.5113724667452226, "grad_norm": 1.131524920463562, "learning_rate": 0.004416635152943376, "loss": 8.0339, "step": 371000 }, { "epoch": 1.5113724667452226, "eval_MaskedAccuracy": 0.49164760725542445, "eval_loss": 1.6915628910064697, "eval_runtime": 292.0873, "eval_samples_per_second": 217.319, "eval_steps_per_second": 0.849, "step": 371000 }, { "epoch": 1.5117798447686042, "grad_norm": 1.6420693397521973, "learning_rate": 0.004416316316516341, "loss": 8.0173, "step": 371100 }, { "epoch": 1.5121872227919857, "grad_norm": 4.51385498046875, "learning_rate": 0.004415997404524749, "loss": 7.9963, "step": 371200 }, { "epoch": 1.5125946008153672, "grad_norm": 1.2989581823349, "learning_rate": 0.004415678416981209, "loss": 7.9935, "step": 371300 }, { "epoch": 1.5130019788387485, "grad_norm": 1.5589622259140015, "learning_rate": 0.004415359353898335, "loss": 7.9899, "step": 371400 }, { "epoch": 1.51340935686213, "grad_norm": 2.448767900466919, "learning_rate": 0.004415040215288737, "loss": 8.0232, "step": 371500 }, { "epoch": 1.5138167348855114, "grad_norm": 4.266506671905518, "learning_rate": 0.00441472100116503, "loss": 7.9833, "step": 371600 }, { "epoch": 1.514224112908893, "grad_norm": 2.878375291824341, "learning_rate": 0.0044144017115398445, "loss": 8.0197, "step": 371700 }, { "epoch": 1.5146314909322744, "grad_norm": 3.721202850341797, "learning_rate": 0.004414082346425793, "loss": 7.9689, "step": 371800 }, { "epoch": 1.515038868955656, "grad_norm": 2.0374796390533447, "learning_rate": 0.004413762905835502, "loss": 7.9809, "step": 371900 }, { "epoch": 1.5154462469790373, "grad_norm": 4.74337911605835, "learning_rate": 0.004413443389781608, "loss": 8.0431, "step": 372000 }, { "epoch": 1.5154462469790373, "eval_MaskedAccuracy": 0.4893654390312909, "eval_loss": 1.6991828680038452, "eval_runtime": 175.6731, "eval_samples_per_second": 361.33, "eval_steps_per_second": 1.412, "step": 372000 }, { "epoch": 1.5158536250024188, "grad_norm": 2.3870112895965576, "learning_rate": 0.004413123798276741, "loss": 7.9965, "step": 372100 }, { "epoch": 1.5162610030258001, "grad_norm": 5.8625712394714355, "learning_rate": 0.004412804131333538, "loss": 8.0064, "step": 372200 }, { "epoch": 1.5166683810491817, "grad_norm": 5.180693626403809, "learning_rate": 0.0044124843889646265, "loss": 8.0432, "step": 372300 }, { "epoch": 1.5170757590725632, "grad_norm": 3.3647851943969727, "learning_rate": 0.0044121645711826625, "loss": 7.989, "step": 372400 }, { "epoch": 1.5174831370959447, "grad_norm": 2.1196417808532715, "learning_rate": 0.004411844678000285, "loss": 8.0252, "step": 372500 }, { "epoch": 1.5178905151193263, "grad_norm": 3.921891689300537, "learning_rate": 0.004411524709430136, "loss": 7.9924, "step": 372600 }, { "epoch": 1.5182978931427076, "grad_norm": 3.548800230026245, "learning_rate": 0.0044112046654848636, "loss": 7.967, "step": 372700 }, { "epoch": 1.5187052711660889, "grad_norm": 3.645210027694702, "learning_rate": 0.004410884546177128, "loss": 7.9845, "step": 372800 }, { "epoch": 1.5191126491894704, "grad_norm": 5.26247501373291, "learning_rate": 0.004410564351519583, "loss": 7.9699, "step": 372900 }, { "epoch": 1.519520027212852, "grad_norm": 2.6062309741973877, "learning_rate": 0.0044102440815248925, "loss": 7.9899, "step": 373000 }, { "epoch": 1.519520027212852, "eval_MaskedAccuracy": 0.49075348218722203, "eval_loss": 1.6940529346466064, "eval_runtime": 235.0784, "eval_samples_per_second": 270.021, "eval_steps_per_second": 1.055, "step": 373000 }, { "epoch": 1.5199274052362335, "grad_norm": 2.387111186981201, "learning_rate": 0.00440992373620571, "loss": 8.0044, "step": 373100 }, { "epoch": 1.520334783259615, "grad_norm": 0.986452043056488, "learning_rate": 0.004409603315574701, "loss": 7.9815, "step": 373200 }, { "epoch": 1.5207421612829963, "grad_norm": 5.4955010414123535, "learning_rate": 0.004409282819644539, "loss": 8.0191, "step": 373300 }, { "epoch": 1.5211495393063779, "grad_norm": 2.1645195484161377, "learning_rate": 0.004408962248427899, "loss": 8.0379, "step": 373400 }, { "epoch": 1.5215569173297592, "grad_norm": 4.374330043792725, "learning_rate": 0.004408641601937445, "loss": 7.9843, "step": 373500 }, { "epoch": 1.5219642953531407, "grad_norm": 5.132680892944336, "learning_rate": 0.004408320880185853, "loss": 7.9998, "step": 373600 }, { "epoch": 1.5223716733765222, "grad_norm": 7.35249137878418, "learning_rate": 0.004408000083185806, "loss": 8.0068, "step": 373700 }, { "epoch": 1.5227790513999038, "grad_norm": 1.830296516418457, "learning_rate": 0.004407679210949991, "loss": 8.0002, "step": 373800 }, { "epoch": 1.523186429423285, "grad_norm": 3.0734915733337402, "learning_rate": 0.0044073582634910876, "loss": 8.0164, "step": 373900 }, { "epoch": 1.5235938074466666, "grad_norm": 2.2600815296173096, "learning_rate": 0.0044070372408217835, "loss": 7.9963, "step": 374000 }, { "epoch": 1.5235938074466666, "eval_MaskedAccuracy": 0.4904077083014307, "eval_loss": 1.7035925388336182, "eval_runtime": 178.9778, "eval_samples_per_second": 354.659, "eval_steps_per_second": 1.386, "step": 374000 }, { "epoch": 1.524001185470048, "grad_norm": 3.5932393074035645, "learning_rate": 0.004406716142954784, "loss": 7.9975, "step": 374100 }, { "epoch": 1.5244085634934295, "grad_norm": 1.704676866531372, "learning_rate": 0.004406394969902773, "loss": 8.0119, "step": 374200 }, { "epoch": 1.524815941516811, "grad_norm": 1.0707470178604126, "learning_rate": 0.004406073721678446, "loss": 8.0245, "step": 374300 }, { "epoch": 1.5252233195401925, "grad_norm": 3.7876572608947754, "learning_rate": 0.004405752398294502, "loss": 7.9867, "step": 374400 }, { "epoch": 1.5256306975635738, "grad_norm": 1.4847164154052734, "learning_rate": 0.004405430999763656, "loss": 8.0224, "step": 374500 }, { "epoch": 1.5260380755869554, "grad_norm": 1.2850916385650635, "learning_rate": 0.004405109526098606, "loss": 8.0137, "step": 374600 }, { "epoch": 1.5264454536103367, "grad_norm": 1.354112148284912, "learning_rate": 0.0044047879773120675, "loss": 8.021, "step": 374700 }, { "epoch": 1.5268528316337182, "grad_norm": 2.893594741821289, "learning_rate": 0.004404466353416749, "loss": 7.997, "step": 374800 }, { "epoch": 1.5272602096570997, "grad_norm": 2.0816569328308105, "learning_rate": 0.00440414465442537, "loss": 7.9995, "step": 374900 }, { "epoch": 1.5276675876804813, "grad_norm": 1.5227657556533813, "learning_rate": 0.00440382288035064, "loss": 8.0081, "step": 375000 }, { "epoch": 1.5276675876804813, "eval_MaskedAccuracy": 0.4910730598327578, "eval_loss": 1.6873279809951782, "eval_runtime": 172.7911, "eval_samples_per_second": 367.357, "eval_steps_per_second": 1.435, "step": 375000 }, { "epoch": 1.5280749657038628, "grad_norm": 2.5384185314178467, "learning_rate": 0.004403501031205286, "loss": 8.0438, "step": 375100 }, { "epoch": 1.5284823437272441, "grad_norm": 3.016969680786133, "learning_rate": 0.004403179107002036, "loss": 7.999, "step": 375200 }, { "epoch": 1.5288897217506254, "grad_norm": 3.4094536304473877, "learning_rate": 0.0044028571077536095, "loss": 7.9872, "step": 375300 }, { "epoch": 1.529297099774007, "grad_norm": 3.460625410079956, "learning_rate": 0.004402535033472744, "loss": 8.0281, "step": 375400 }, { "epoch": 1.5297044777973885, "grad_norm": 2.0465474128723145, "learning_rate": 0.004402212884172171, "loss": 7.9687, "step": 375500 }, { "epoch": 1.53011185582077, "grad_norm": 3.2641730308532715, "learning_rate": 0.004401890659864625, "loss": 7.9884, "step": 375600 }, { "epoch": 1.5305192338441516, "grad_norm": 0.8561846613883972, "learning_rate": 0.004401568360562846, "loss": 7.9817, "step": 375700 }, { "epoch": 1.5309266118675329, "grad_norm": 7.27733850479126, "learning_rate": 0.004401245986279574, "loss": 8.0236, "step": 375800 }, { "epoch": 1.5313339898909144, "grad_norm": 1.1665247678756714, "learning_rate": 0.004400923537027569, "loss": 8.0193, "step": 375900 }, { "epoch": 1.5317413679142957, "grad_norm": 1.3204345703125, "learning_rate": 0.0044006010128195515, "loss": 8.0183, "step": 376000 }, { "epoch": 1.5317413679142957, "eval_MaskedAccuracy": 0.49200622618186474, "eval_loss": 1.6839189529418945, "eval_runtime": 257.4429, "eval_samples_per_second": 246.563, "eval_steps_per_second": 0.963, "step": 376000 }, { "epoch": 1.5321487459376772, "grad_norm": 3.4822580814361572, "learning_rate": 0.004400278413668291, "loss": 8.0098, "step": 376100 }, { "epoch": 1.5325561239610588, "grad_norm": 1.6631346940994263, "learning_rate": 0.004399955739586542, "loss": 7.9757, "step": 376200 }, { "epoch": 1.5329635019844403, "grad_norm": 7.522161483764648, "learning_rate": 0.004399632990587055, "loss": 8.0237, "step": 376300 }, { "epoch": 1.5333708800078216, "grad_norm": 1.9211970567703247, "learning_rate": 0.004399310166682584, "loss": 8.0303, "step": 376400 }, { "epoch": 1.5337782580312032, "grad_norm": 4.876394271850586, "learning_rate": 0.004398987267885908, "loss": 8.044, "step": 376500 }, { "epoch": 1.5341856360545845, "grad_norm": 1.8055191040039062, "learning_rate": 0.004398664294209784, "loss": 7.9903, "step": 376600 }, { "epoch": 1.534593014077966, "grad_norm": 3.4431655406951904, "learning_rate": 0.0043983412456669825, "loss": 7.9903, "step": 376700 }, { "epoch": 1.5350003921013475, "grad_norm": 3.2290079593658447, "learning_rate": 0.004398018122270274, "loss": 7.9868, "step": 376800 }, { "epoch": 1.535407770124729, "grad_norm": 0.8165335655212402, "learning_rate": 0.004397694924032439, "loss": 8.0685, "step": 376900 }, { "epoch": 1.5358151481481104, "grad_norm": 1.4743595123291016, "learning_rate": 0.004397371650966245, "loss": 8.0165, "step": 377000 }, { "epoch": 1.5358151481481104, "eval_MaskedAccuracy": 0.4923337604931521, "eval_loss": 1.687532901763916, "eval_runtime": 177.2597, "eval_samples_per_second": 358.096, "eval_steps_per_second": 1.399, "step": 377000 }, { "epoch": 1.536222526171492, "grad_norm": 4.091436862945557, "learning_rate": 0.0043970483030844835, "loss": 7.9861, "step": 377100 }, { "epoch": 1.5366299041948732, "grad_norm": 2.774745225906372, "learning_rate": 0.00439672488039993, "loss": 7.9904, "step": 377200 }, { "epoch": 1.5370372822182548, "grad_norm": 4.360829830169678, "learning_rate": 0.004396401382925373, "loss": 8.0045, "step": 377300 }, { "epoch": 1.5374446602416363, "grad_norm": 2.3972575664520264, "learning_rate": 0.004396077810673604, "loss": 8.0261, "step": 377400 }, { "epoch": 1.5378520382650178, "grad_norm": 3.517026424407959, "learning_rate": 0.004395754163657418, "loss": 7.9857, "step": 377500 }, { "epoch": 1.5382594162883994, "grad_norm": 6.994585037231445, "learning_rate": 0.004395430441889607, "loss": 7.9608, "step": 377600 }, { "epoch": 1.5386667943117807, "grad_norm": 2.0977888107299805, "learning_rate": 0.004395106645382972, "loss": 7.9971, "step": 377700 }, { "epoch": 1.539074172335162, "grad_norm": 4.672280311584473, "learning_rate": 0.004394782774150305, "loss": 7.9997, "step": 377800 }, { "epoch": 1.5394815503585435, "grad_norm": 3.41923189163208, "learning_rate": 0.004394458828204427, "loss": 7.9821, "step": 377900 }, { "epoch": 1.539888928381925, "grad_norm": 1.2892951965332031, "learning_rate": 0.004394134807558128, "loss": 7.9444, "step": 378000 }, { "epoch": 1.539888928381925, "eval_MaskedAccuracy": 0.49300857328657016, "eval_loss": 1.6862092018127441, "eval_runtime": 186.9825, "eval_samples_per_second": 339.476, "eval_steps_per_second": 1.326, "step": 378000 }, { "epoch": 1.5402963064053066, "grad_norm": 2.5519824028015137, "learning_rate": 0.004393810712224229, "loss": 7.9852, "step": 378100 }, { "epoch": 1.540703684428688, "grad_norm": 5.804073810577393, "learning_rate": 0.004393486542215543, "loss": 7.9899, "step": 378200 }, { "epoch": 1.5411110624520694, "grad_norm": 3.2780373096466064, "learning_rate": 0.004393162297544885, "loss": 7.9598, "step": 378300 }, { "epoch": 1.541518440475451, "grad_norm": 3.6317949295043945, "learning_rate": 0.0043928379782250645, "loss": 8.0093, "step": 378400 }, { "epoch": 1.5419258184988323, "grad_norm": 1.9392168521881104, "learning_rate": 0.004392513584268915, "loss": 7.9733, "step": 378500 }, { "epoch": 1.5423331965222138, "grad_norm": 3.0337905883789062, "learning_rate": 0.004392189115689258, "loss": 8.016, "step": 378600 }, { "epoch": 1.5427405745455953, "grad_norm": 2.9371018409729004, "learning_rate": 0.004391864572498925, "loss": 7.993, "step": 378700 }, { "epoch": 1.5431479525689769, "grad_norm": 4.97229528427124, "learning_rate": 0.004391539954710744, "loss": 8.0001, "step": 378800 }, { "epoch": 1.5435553305923582, "grad_norm": 1.364279866218567, "learning_rate": 0.0043912152623375505, "loss": 8.0293, "step": 378900 }, { "epoch": 1.5439627086157397, "grad_norm": 2.0546860694885254, "learning_rate": 0.0043908904953921775, "loss": 7.9817, "step": 379000 }, { "epoch": 1.5439627086157397, "eval_MaskedAccuracy": 0.4925978571629836, "eval_loss": 1.6845088005065918, "eval_runtime": 193.6522, "eval_samples_per_second": 327.784, "eval_steps_per_second": 1.281, "step": 379000 }, { "epoch": 1.544370086639121, "grad_norm": 3.2605416774749756, "learning_rate": 0.004390565653887459, "loss": 7.9926, "step": 379100 }, { "epoch": 1.5447774646625025, "grad_norm": 1.0459586381912231, "learning_rate": 0.0043902407378362505, "loss": 7.9852, "step": 379200 }, { "epoch": 1.545184842685884, "grad_norm": 3.6447129249572754, "learning_rate": 0.004389915747251395, "loss": 7.9771, "step": 379300 }, { "epoch": 1.5455922207092656, "grad_norm": 4.3206706047058105, "learning_rate": 0.004389590682145738, "loss": 7.9652, "step": 379400 }, { "epoch": 1.545999598732647, "grad_norm": 1.4588992595672607, "learning_rate": 0.00438926554253213, "loss": 7.9142, "step": 379500 }, { "epoch": 1.5464069767560285, "grad_norm": 5.171288967132568, "learning_rate": 0.004388940328423422, "loss": 8.0281, "step": 379600 }, { "epoch": 1.5468143547794098, "grad_norm": 1.0709530115127563, "learning_rate": 0.004388615039832481, "loss": 7.9797, "step": 379700 }, { "epoch": 1.5472217328027913, "grad_norm": 1.3747026920318604, "learning_rate": 0.004388289676772165, "loss": 7.988, "step": 379800 }, { "epoch": 1.5476291108261728, "grad_norm": 4.58999490737915, "learning_rate": 0.0043879642392553415, "loss": 8.0068, "step": 379900 }, { "epoch": 1.5480364888495544, "grad_norm": 4.6510009765625, "learning_rate": 0.004387638727294866, "loss": 8.0183, "step": 380000 }, { "epoch": 1.5480364888495544, "eval_MaskedAccuracy": 0.492262019536217, "eval_loss": 1.6800833940505981, "eval_runtime": 167.5096, "eval_samples_per_second": 378.94, "eval_steps_per_second": 1.481, "step": 380000 }, { "epoch": 1.548443866872936, "grad_norm": 4.6116156578063965, "learning_rate": 0.004387313140903613, "loss": 7.9816, "step": 380100 }, { "epoch": 1.5488512448963172, "grad_norm": 1.628583312034607, "learning_rate": 0.004386987480094455, "loss": 7.9637, "step": 380200 }, { "epoch": 1.5492586229196985, "grad_norm": 3.180694818496704, "learning_rate": 0.004386661744880261, "loss": 8.0141, "step": 380300 }, { "epoch": 1.54966600094308, "grad_norm": 3.0045862197875977, "learning_rate": 0.004386335935273921, "loss": 7.9867, "step": 380400 }, { "epoch": 1.5500733789664616, "grad_norm": 4.71543025970459, "learning_rate": 0.004386010051288303, "loss": 7.9974, "step": 380500 }, { "epoch": 1.5504807569898431, "grad_norm": 4.103104591369629, "learning_rate": 0.004385684092936303, "loss": 8.0233, "step": 380600 }, { "epoch": 1.5508881350132246, "grad_norm": 5.594005107879639, "learning_rate": 0.004385358060230802, "loss": 7.9874, "step": 380700 }, { "epoch": 1.551295513036606, "grad_norm": 2.153127670288086, "learning_rate": 0.004385031953184691, "loss": 8.0051, "step": 380800 }, { "epoch": 1.5517028910599873, "grad_norm": 6.683218002319336, "learning_rate": 0.004384705771810855, "loss": 7.9949, "step": 380900 }, { "epoch": 1.5521102690833688, "grad_norm": 7.274106502532959, "learning_rate": 0.004384379516122195, "loss": 7.976, "step": 381000 }, { "epoch": 1.5521102690833688, "eval_MaskedAccuracy": 0.49259958121987407, "eval_loss": 1.6832220554351807, "eval_runtime": 247.5638, "eval_samples_per_second": 256.403, "eval_steps_per_second": 1.002, "step": 381000 }, { "epoch": 1.5525176471067503, "grad_norm": 2.6610333919525146, "learning_rate": 0.004384053186131619, "loss": 7.9938, "step": 381100 }, { "epoch": 1.5529250251301319, "grad_norm": 2.1829004287719727, "learning_rate": 0.004383726781852016, "loss": 7.997, "step": 381200 }, { "epoch": 1.5533324031535134, "grad_norm": 4.432714939117432, "learning_rate": 0.004383400303296297, "loss": 7.9341, "step": 381300 }, { "epoch": 1.5537397811768947, "grad_norm": 5.071836948394775, "learning_rate": 0.004383073750477368, "loss": 7.9738, "step": 381400 }, { "epoch": 1.5541471592002762, "grad_norm": 2.290721893310547, "learning_rate": 0.004382747123408135, "loss": 8.0268, "step": 381500 }, { "epoch": 1.5545545372236576, "grad_norm": 4.035449981689453, "learning_rate": 0.004382420422101519, "loss": 8.0091, "step": 381600 }, { "epoch": 1.554961915247039, "grad_norm": 5.090133190155029, "learning_rate": 0.004382093646570438, "loss": 8.0049, "step": 381700 }, { "epoch": 1.5553692932704206, "grad_norm": 1.8367220163345337, "learning_rate": 0.004381766796827804, "loss": 7.9489, "step": 381800 }, { "epoch": 1.5557766712938021, "grad_norm": 1.850050926208496, "learning_rate": 0.004381439872886538, "loss": 7.9628, "step": 381900 }, { "epoch": 1.5561840493171835, "grad_norm": 3.262934446334839, "learning_rate": 0.0043811128747595655, "loss": 7.9987, "step": 382000 }, { "epoch": 1.5561840493171835, "eval_MaskedAccuracy": 0.49243849720880073, "eval_loss": 1.679644227027893, "eval_runtime": 217.7531, "eval_samples_per_second": 291.504, "eval_steps_per_second": 1.139, "step": 382000 }, { "epoch": 1.556591427340565, "grad_norm": 4.958277225494385, "learning_rate": 0.004380785802459817, "loss": 7.9732, "step": 382100 }, { "epoch": 1.5569988053639463, "grad_norm": 8.351605415344238, "learning_rate": 0.004380458656000227, "loss": 7.9613, "step": 382200 }, { "epoch": 1.5574061833873278, "grad_norm": 1.5247186422348022, "learning_rate": 0.00438013143539372, "loss": 7.9877, "step": 382300 }, { "epoch": 1.5578135614107094, "grad_norm": 3.52302885055542, "learning_rate": 0.0043798041406532436, "loss": 7.9944, "step": 382400 }, { "epoch": 1.558220939434091, "grad_norm": 5.48790168762207, "learning_rate": 0.004379476771791734, "loss": 7.9529, "step": 382500 }, { "epoch": 1.5586283174574724, "grad_norm": 1.6266847848892212, "learning_rate": 0.004379149328822129, "loss": 8.005, "step": 382600 }, { "epoch": 1.5590356954808537, "grad_norm": 3.445540189743042, "learning_rate": 0.00437882181175738, "loss": 7.9733, "step": 382700 }, { "epoch": 1.559443073504235, "grad_norm": 6.589896202087402, "learning_rate": 0.004378494220610433, "loss": 7.9937, "step": 382800 }, { "epoch": 1.5598504515276166, "grad_norm": 4.113238334655762, "learning_rate": 0.0043781665553942395, "loss": 7.9968, "step": 382900 }, { "epoch": 1.5602578295509981, "grad_norm": 4.015130996704102, "learning_rate": 0.004377838816121752, "loss": 7.9891, "step": 383000 }, { "epoch": 1.5602578295509981, "eval_MaskedAccuracy": 0.49189015147263415, "eval_loss": 1.689454436302185, "eval_runtime": 259.1091, "eval_samples_per_second": 244.978, "eval_steps_per_second": 0.957, "step": 383000 }, { "epoch": 1.5606652075743797, "grad_norm": 2.397777557373047, "learning_rate": 0.004377511002805935, "loss": 8.006, "step": 383100 }, { "epoch": 1.5610725855977612, "grad_norm": 6.622825622558594, "learning_rate": 0.004377183115459738, "loss": 8.0003, "step": 383200 }, { "epoch": 1.5614799636211425, "grad_norm": 3.1762149333953857, "learning_rate": 0.004376855154096129, "loss": 7.984, "step": 383300 }, { "epoch": 1.5618873416445238, "grad_norm": 1.678264856338501, "learning_rate": 0.004376527118728077, "loss": 7.9757, "step": 383400 }, { "epoch": 1.5622947196679053, "grad_norm": 3.344902992248535, "learning_rate": 0.004376199009368541, "loss": 8.0187, "step": 383500 }, { "epoch": 1.5627020976912869, "grad_norm": 4.5006022453308105, "learning_rate": 0.004375870826030505, "loss": 7.961, "step": 383600 }, { "epoch": 1.5631094757146684, "grad_norm": 1.239322304725647, "learning_rate": 0.0043755425687269376, "loss": 7.9776, "step": 383700 }, { "epoch": 1.56351685373805, "grad_norm": 4.102320194244385, "learning_rate": 0.004375214237470823, "loss": 7.9709, "step": 383800 }, { "epoch": 1.5639242317614312, "grad_norm": 3.3950605392456055, "learning_rate": 0.0043748858322751345, "loss": 8.0008, "step": 383900 }, { "epoch": 1.5643316097848128, "grad_norm": 1.621116280555725, "learning_rate": 0.0043745573531528545, "loss": 7.9943, "step": 384000 }, { "epoch": 1.5643316097848128, "eval_MaskedAccuracy": 0.4919575874356232, "eval_loss": 1.6874390840530396, "eval_runtime": 217.0269, "eval_samples_per_second": 292.48, "eval_steps_per_second": 1.143, "step": 384000 }, { "epoch": 1.564738987808194, "grad_norm": 4.499731063842773, "learning_rate": 0.0043742288001169805, "loss": 7.954, "step": 384100 }, { "epoch": 1.5651463658315756, "grad_norm": 2.5246453285217285, "learning_rate": 0.004373900173180485, "loss": 7.9966, "step": 384200 }, { "epoch": 1.5655537438549572, "grad_norm": 2.9872934818267822, "learning_rate": 0.0043735714723563725, "loss": 7.9967, "step": 384300 }, { "epoch": 1.5659611218783387, "grad_norm": 1.435693383216858, "learning_rate": 0.004373242697657639, "loss": 7.971, "step": 384400 }, { "epoch": 1.56636849990172, "grad_norm": 7.620283126831055, "learning_rate": 0.0043729138490972855, "loss": 7.9908, "step": 384500 }, { "epoch": 1.5667758779251015, "grad_norm": 3.052788496017456, "learning_rate": 0.004372584926688302, "loss": 7.9748, "step": 384600 }, { "epoch": 1.5671832559484828, "grad_norm": 3.5217275619506836, "learning_rate": 0.004372255930443699, "loss": 7.9848, "step": 384700 }, { "epoch": 1.5675906339718644, "grad_norm": 5.30711030960083, "learning_rate": 0.0043719268603764775, "loss": 8.0149, "step": 384800 }, { "epoch": 1.567998011995246, "grad_norm": 2.9961659908294678, "learning_rate": 0.004371597716499653, "loss": 7.9639, "step": 384900 }, { "epoch": 1.5684053900186274, "grad_norm": 3.3438985347747803, "learning_rate": 0.004371268498826239, "loss": 7.9791, "step": 385000 }, { "epoch": 1.5684053900186274, "eval_MaskedAccuracy": 0.4931344547286357, "eval_loss": 1.679723858833313, "eval_runtime": 248.0602, "eval_samples_per_second": 255.889, "eval_steps_per_second": 1.0, "step": 385000 }, { "epoch": 1.568812768042009, "grad_norm": 2.019742250442505, "learning_rate": 0.004370939207369248, "loss": 7.9797, "step": 385100 }, { "epoch": 1.5692201460653903, "grad_norm": 3.772763967514038, "learning_rate": 0.004370609842141702, "loss": 8.0025, "step": 385200 }, { "epoch": 1.5696275240887716, "grad_norm": 3.6090774536132812, "learning_rate": 0.004370280403156618, "loss": 7.9975, "step": 385300 }, { "epoch": 1.5700349021121531, "grad_norm": 1.7758287191390991, "learning_rate": 0.0043699508904270256, "loss": 8.0111, "step": 385400 }, { "epoch": 1.5704422801355347, "grad_norm": 5.4468913078308105, "learning_rate": 0.004369621303965948, "loss": 7.9821, "step": 385500 }, { "epoch": 1.5708496581589162, "grad_norm": 1.8788725137710571, "learning_rate": 0.004369291643786418, "loss": 7.9887, "step": 385600 }, { "epoch": 1.5712570361822977, "grad_norm": 3.531351089477539, "learning_rate": 0.004368961909901478, "loss": 7.9929, "step": 385700 }, { "epoch": 1.571664414205679, "grad_norm": 1.8444750308990479, "learning_rate": 0.004368632102324148, "loss": 7.9473, "step": 385800 }, { "epoch": 1.5720717922290603, "grad_norm": 3.00870418548584, "learning_rate": 0.004368302221067471, "loss": 7.9732, "step": 385900 }, { "epoch": 1.5724791702524419, "grad_norm": 2.3317511081695557, "learning_rate": 0.004367972266144491, "loss": 7.9927, "step": 386000 }, { "epoch": 1.5724791702524419, "eval_MaskedAccuracy": 0.49240926965755605, "eval_loss": 1.6796014308929443, "eval_runtime": 175.2543, "eval_samples_per_second": 362.194, "eval_steps_per_second": 1.415, "step": 386000 }, { "epoch": 1.5728865482758234, "grad_norm": 4.185787677764893, "learning_rate": 0.004367642237568251, "loss": 8.0055, "step": 386100 }, { "epoch": 1.573293926299205, "grad_norm": 2.6901581287384033, "learning_rate": 0.004367312135351802, "loss": 8.0233, "step": 386200 }, { "epoch": 1.5737013043225865, "grad_norm": 2.5258898735046387, "learning_rate": 0.004366981959508197, "loss": 7.9875, "step": 386300 }, { "epoch": 1.5741086823459678, "grad_norm": 1.9751198291778564, "learning_rate": 0.004366651710050486, "loss": 8.0142, "step": 386400 }, { "epoch": 1.5745160603693493, "grad_norm": 3.2172818183898926, "learning_rate": 0.0043663213869917315, "loss": 7.978, "step": 386500 }, { "epoch": 1.5749234383927306, "grad_norm": 2.6670279502868652, "learning_rate": 0.00436599099034499, "loss": 7.992, "step": 386600 }, { "epoch": 1.5753308164161122, "grad_norm": 1.5565224885940552, "learning_rate": 0.004365660520123314, "loss": 7.9866, "step": 386700 }, { "epoch": 1.5757381944394937, "grad_norm": 1.5170857906341553, "learning_rate": 0.004365329976339782, "loss": 7.9695, "step": 386800 }, { "epoch": 1.5761455724628752, "grad_norm": 5.241313457489014, "learning_rate": 0.004364999359007448, "loss": 7.9856, "step": 386900 }, { "epoch": 1.5765529504862565, "grad_norm": 2.3493220806121826, "learning_rate": 0.004364668668139396, "loss": 8.0067, "step": 387000 }, { "epoch": 1.5765529504862565, "eval_MaskedAccuracy": 0.4925615120398001, "eval_loss": 1.68255615234375, "eval_runtime": 184.6857, "eval_samples_per_second": 343.697, "eval_steps_per_second": 1.343, "step": 387000 }, { "epoch": 1.576960328509638, "grad_norm": 1.3695019483566284, "learning_rate": 0.0043643379037486985, "loss": 8.0047, "step": 387100 }, { "epoch": 1.5773677065330194, "grad_norm": 3.1875879764556885, "learning_rate": 0.004364007065848422, "loss": 7.9819, "step": 387200 }, { "epoch": 1.577775084556401, "grad_norm": 2.9773612022399902, "learning_rate": 0.0043636761544516576, "loss": 7.9542, "step": 387300 }, { "epoch": 1.5781824625797825, "grad_norm": 3.6195931434631348, "learning_rate": 0.004363345169571481, "loss": 7.9477, "step": 387400 }, { "epoch": 1.578589840603164, "grad_norm": 0.8657932281494141, "learning_rate": 0.004363014111220982, "loss": 7.9695, "step": 387500 }, { "epoch": 1.5789972186265455, "grad_norm": 2.1034436225891113, "learning_rate": 0.004362682979413249, "loss": 7.9899, "step": 387600 }, { "epoch": 1.5794045966499268, "grad_norm": 2.825350761413574, "learning_rate": 0.004362351774161368, "loss": 7.9802, "step": 387700 }, { "epoch": 1.5798119746733081, "grad_norm": 1.7635798454284668, "learning_rate": 0.004362020495478441, "loss": 7.9105, "step": 387800 }, { "epoch": 1.5802193526966897, "grad_norm": 1.4848153591156006, "learning_rate": 0.0043616891433775635, "loss": 7.9555, "step": 387900 }, { "epoch": 1.5806267307200712, "grad_norm": 2.003793954849243, "learning_rate": 0.004361357717871833, "loss": 8.0073, "step": 388000 }, { "epoch": 1.5806267307200712, "eval_MaskedAccuracy": 0.49248137653424673, "eval_loss": 1.6808550357818604, "eval_runtime": 252.9763, "eval_samples_per_second": 250.917, "eval_steps_per_second": 0.98, "step": 388000 }, { "epoch": 1.5810341087434527, "grad_norm": 4.4342145919799805, "learning_rate": 0.004361026218974351, "loss": 7.9984, "step": 388100 }, { "epoch": 1.5814414867668343, "grad_norm": 5.26987361907959, "learning_rate": 0.004360694646698225, "loss": 7.9808, "step": 388200 }, { "epoch": 1.5818488647902156, "grad_norm": 3.508080244064331, "learning_rate": 0.004360363001056569, "loss": 7.9658, "step": 388300 }, { "epoch": 1.582256242813597, "grad_norm": 1.4259755611419678, "learning_rate": 0.0043600312820624805, "loss": 7.992, "step": 388400 }, { "epoch": 1.5826636208369784, "grad_norm": 2.7010579109191895, "learning_rate": 0.004359699489729094, "loss": 7.9874, "step": 388500 }, { "epoch": 1.58307099886036, "grad_norm": 4.352523326873779, "learning_rate": 0.004359367624069517, "loss": 7.997, "step": 388600 }, { "epoch": 1.5834783768837415, "grad_norm": 0.9584260582923889, "learning_rate": 0.004359035685096869, "loss": 8.0089, "step": 388700 }, { "epoch": 1.583885754907123, "grad_norm": 1.1232610940933228, "learning_rate": 0.004358703672824273, "loss": 7.985, "step": 388800 }, { "epoch": 1.5842931329305043, "grad_norm": 2.125617742538452, "learning_rate": 0.0043583715872648554, "loss": 8.0019, "step": 388900 }, { "epoch": 1.5847005109538859, "grad_norm": 1.381211280822754, "learning_rate": 0.004358039428431752, "loss": 7.9937, "step": 389000 }, { "epoch": 1.5847005109538859, "eval_MaskedAccuracy": 0.4935783608289307, "eval_loss": 1.6715673208236694, "eval_runtime": 197.0436, "eval_samples_per_second": 322.142, "eval_steps_per_second": 1.259, "step": 389000 }, { "epoch": 1.5851078889772672, "grad_norm": 0.8518221378326416, "learning_rate": 0.004357707196338086, "loss": 7.9801, "step": 389100 }, { "epoch": 1.5855152670006487, "grad_norm": 1.8795791864395142, "learning_rate": 0.004357374890996995, "loss": 7.9729, "step": 389200 }, { "epoch": 1.5859226450240302, "grad_norm": 2.8003125190734863, "learning_rate": 0.0043570425124216175, "loss": 8.003, "step": 389300 }, { "epoch": 1.5863300230474118, "grad_norm": 2.7544593811035156, "learning_rate": 0.004356710060625093, "loss": 7.9742, "step": 389400 }, { "epoch": 1.586737401070793, "grad_norm": 1.9319356679916382, "learning_rate": 0.00435637753562057, "loss": 7.9684, "step": 389500 }, { "epoch": 1.5871447790941746, "grad_norm": 2.1396312713623047, "learning_rate": 0.004356044937421187, "loss": 7.9813, "step": 389600 }, { "epoch": 1.587552157117556, "grad_norm": 2.89149808883667, "learning_rate": 0.004355712266040101, "loss": 8.0062, "step": 389700 }, { "epoch": 1.5879595351409375, "grad_norm": 4.6878132820129395, "learning_rate": 0.004355379521490456, "loss": 8.047, "step": 389800 }, { "epoch": 1.588366913164319, "grad_norm": 1.7978068590164185, "learning_rate": 0.0043550467037854115, "loss": 7.974, "step": 389900 }, { "epoch": 1.5887742911877005, "grad_norm": 2.9096691608428955, "learning_rate": 0.004354713812938128, "loss": 7.9832, "step": 390000 }, { "epoch": 1.5887742911877005, "eval_MaskedAccuracy": 0.49209820537622284, "eval_loss": 1.678334355354309, "eval_runtime": 185.1042, "eval_samples_per_second": 342.92, "eval_steps_per_second": 1.34, "step": 390000 }, { "epoch": 1.589181669211082, "grad_norm": 2.5478997230529785, "learning_rate": 0.0043543808489617634, "loss": 7.9452, "step": 390100 }, { "epoch": 1.5895890472344634, "grad_norm": 1.4829764366149902, "learning_rate": 0.004354047811869491, "loss": 7.9529, "step": 390200 }, { "epoch": 1.5899964252578447, "grad_norm": 4.051734447479248, "learning_rate": 0.004353714701674461, "loss": 7.9611, "step": 390300 }, { "epoch": 1.5904038032812262, "grad_norm": 2.479869842529297, "learning_rate": 0.004353381518389854, "loss": 7.9544, "step": 390400 }, { "epoch": 1.5908111813046077, "grad_norm": 1.2474099397659302, "learning_rate": 0.004353048262028836, "loss": 7.9865, "step": 390500 }, { "epoch": 1.5912185593279893, "grad_norm": 3.6086440086364746, "learning_rate": 0.00435271493260459, "loss": 7.9947, "step": 390600 }, { "epoch": 1.5916259373513708, "grad_norm": 5.260233402252197, "learning_rate": 0.0043523815301302875, "loss": 7.9664, "step": 390700 }, { "epoch": 1.5920333153747521, "grad_norm": 3.744760036468506, "learning_rate": 0.004352048054619116, "loss": 7.9917, "step": 390800 }, { "epoch": 1.5924406933981334, "grad_norm": 3.6280267238616943, "learning_rate": 0.004351714506084258, "loss": 7.9795, "step": 390900 }, { "epoch": 1.592848071421515, "grad_norm": 1.0758583545684814, "learning_rate": 0.0043513808845389, "loss": 7.9759, "step": 391000 }, { "epoch": 1.592848071421515, "eval_MaskedAccuracy": 0.49325010018277465, "eval_loss": 1.6883565187454224, "eval_runtime": 164.2987, "eval_samples_per_second": 386.345, "eval_steps_per_second": 1.509, "step": 391000 }, { "epoch": 1.5932554494448965, "grad_norm": 4.809948921203613, "learning_rate": 0.004351047189996223, "loss": 8.0092, "step": 391100 }, { "epoch": 1.593662827468278, "grad_norm": 1.1911133527755737, "learning_rate": 0.004350713422469431, "loss": 8.0217, "step": 391200 }, { "epoch": 1.5940702054916596, "grad_norm": 2.9028546810150146, "learning_rate": 0.004350379581971718, "loss": 7.9676, "step": 391300 }, { "epoch": 1.5944775835150409, "grad_norm": 4.913288593292236, "learning_rate": 0.004350045668516274, "loss": 7.9644, "step": 391400 }, { "epoch": 1.5948849615384224, "grad_norm": 2.528618574142456, "learning_rate": 0.004349711682116308, "loss": 7.9554, "step": 391500 }, { "epoch": 1.5952923395618037, "grad_norm": 4.065859794616699, "learning_rate": 0.004349377622785024, "loss": 7.9782, "step": 391600 }, { "epoch": 1.5956997175851853, "grad_norm": 1.9169384241104126, "learning_rate": 0.004349043490535629, "loss": 7.982, "step": 391700 }, { "epoch": 1.5961070956085668, "grad_norm": 3.5128958225250244, "learning_rate": 0.00434870928538133, "loss": 7.9852, "step": 391800 }, { "epoch": 1.5965144736319483, "grad_norm": 1.8897231817245483, "learning_rate": 0.004348375007335346, "loss": 7.9819, "step": 391900 }, { "epoch": 1.5969218516553296, "grad_norm": 2.6008095741271973, "learning_rate": 0.004348040656410888, "loss": 7.987, "step": 392000 }, { "epoch": 1.5969218516553296, "eval_MaskedAccuracy": 0.4921028195568469, "eval_loss": 1.6823772192001343, "eval_runtime": 167.8952, "eval_samples_per_second": 378.069, "eval_steps_per_second": 1.477, "step": 392000 }, { "epoch": 1.5973292296787112, "grad_norm": 3.952047109603882, "learning_rate": 0.004347706232621176, "loss": 7.9643, "step": 392100 }, { "epoch": 1.5977366077020925, "grad_norm": 1.9097968339920044, "learning_rate": 0.004347371735979424, "loss": 7.9793, "step": 392200 }, { "epoch": 1.598143985725474, "grad_norm": 1.9411801099777222, "learning_rate": 0.004347037166498867, "loss": 7.9616, "step": 392300 }, { "epoch": 1.5985513637488555, "grad_norm": 1.4001778364181519, "learning_rate": 0.00434670252419273, "loss": 7.9738, "step": 392400 }, { "epoch": 1.598958741772237, "grad_norm": 1.7431035041809082, "learning_rate": 0.004346367809074245, "loss": 7.9926, "step": 392500 }, { "epoch": 1.5993661197956186, "grad_norm": 2.230560541152954, "learning_rate": 0.004346033021156636, "loss": 7.9617, "step": 392600 }, { "epoch": 1.599773497819, "grad_norm": 1.0867366790771484, "learning_rate": 0.00434569816045315, "loss": 7.9819, "step": 392700 }, { "epoch": 1.6001808758423812, "grad_norm": 3.5999398231506348, "learning_rate": 0.0043453632269770184, "loss": 8.0082, "step": 392800 }, { "epoch": 1.6005882538657628, "grad_norm": 2.177280902862549, "learning_rate": 0.0043450282207414875, "loss": 7.9843, "step": 392900 }, { "epoch": 1.6009956318891443, "grad_norm": 1.610917568206787, "learning_rate": 0.004344693141759801, "loss": 7.9563, "step": 393000 }, { "epoch": 1.6009956318891443, "eval_MaskedAccuracy": 0.49194733052811196, "eval_loss": 1.6850906610488892, "eval_runtime": 159.2594, "eval_samples_per_second": 398.57, "eval_steps_per_second": 1.557, "step": 393000 }, { "epoch": 1.6014030099125258, "grad_norm": 4.063870429992676, "learning_rate": 0.004344357990045204, "loss": 7.9919, "step": 393100 }, { "epoch": 1.6018103879359074, "grad_norm": 1.9149118661880493, "learning_rate": 0.004344022765610947, "loss": 7.9467, "step": 393200 }, { "epoch": 1.6022177659592887, "grad_norm": 5.278997421264648, "learning_rate": 0.0043436874684702875, "loss": 7.9724, "step": 393300 }, { "epoch": 1.60262514398267, "grad_norm": 2.6950948238372803, "learning_rate": 0.004343352098636476, "loss": 7.9997, "step": 393400 }, { "epoch": 1.6030325220060515, "grad_norm": 3.0038623809814453, "learning_rate": 0.004343016656122776, "loss": 7.9972, "step": 393500 }, { "epoch": 1.603439900029433, "grad_norm": 1.5131049156188965, "learning_rate": 0.004342681140942442, "loss": 8.0109, "step": 393600 }, { "epoch": 1.6038472780528146, "grad_norm": 2.710996150970459, "learning_rate": 0.004342345553108749, "loss": 7.946, "step": 393700 }, { "epoch": 1.604254656076196, "grad_norm": 3.3355636596679688, "learning_rate": 0.004342009892634961, "loss": 7.9358, "step": 393800 }, { "epoch": 1.6046620340995774, "grad_norm": 4.113181114196777, "learning_rate": 0.004341674159534346, "loss": 7.9612, "step": 393900 }, { "epoch": 1.605069412122959, "grad_norm": 4.498475551605225, "learning_rate": 0.004341338353820176, "loss": 7.9791, "step": 394000 }, { "epoch": 1.605069412122959, "eval_MaskedAccuracy": 0.4912583135960985, "eval_loss": 1.6878175735473633, "eval_runtime": 162.9635, "eval_samples_per_second": 389.511, "eval_steps_per_second": 1.522, "step": 394000 }, { "epoch": 1.6054767901463403, "grad_norm": 6.954246520996094, "learning_rate": 0.004341002475505732, "loss": 7.9823, "step": 394100 }, { "epoch": 1.6058841681697218, "grad_norm": 4.334616184234619, "learning_rate": 0.0043406665246042905, "loss": 7.9726, "step": 394200 }, { "epoch": 1.6062915461931033, "grad_norm": 4.763188362121582, "learning_rate": 0.004340330501129129, "loss": 7.9971, "step": 394300 }, { "epoch": 1.6066989242164849, "grad_norm": 1.9894366264343262, "learning_rate": 0.004339994405093543, "loss": 7.9599, "step": 394400 }, { "epoch": 1.6071063022398662, "grad_norm": 5.279543876647949, "learning_rate": 0.004339658236510814, "loss": 7.9878, "step": 394500 }, { "epoch": 1.6075136802632477, "grad_norm": 1.0106977224349976, "learning_rate": 0.004339321995394225, "loss": 7.9433, "step": 394600 }, { "epoch": 1.607921058286629, "grad_norm": 2.0923564434051514, "learning_rate": 0.004338985681757076, "loss": 7.9819, "step": 394700 }, { "epoch": 1.6083284363100105, "grad_norm": 3.0404632091522217, "learning_rate": 0.004338649295612668, "loss": 7.99, "step": 394800 }, { "epoch": 1.608735814333392, "grad_norm": 3.663414716720581, "learning_rate": 0.004338312836974292, "loss": 7.9619, "step": 394900 }, { "epoch": 1.6091431923567736, "grad_norm": 11.241720199584961, "learning_rate": 0.00433797630585526, "loss": 7.9876, "step": 395000 }, { "epoch": 1.6091431923567736, "eval_MaskedAccuracy": 0.4917487531948813, "eval_loss": 1.6852360963821411, "eval_runtime": 185.0984, "eval_samples_per_second": 342.931, "eval_steps_per_second": 1.34, "step": 395000 }, { "epoch": 1.6095505703801551, "grad_norm": 4.754985809326172, "learning_rate": 0.004337639702268867, "loss": 7.9595, "step": 395100 }, { "epoch": 1.6099579484035365, "grad_norm": 5.527943134307861, "learning_rate": 0.004337303026228427, "loss": 7.9756, "step": 395200 }, { "epoch": 1.6103653264269178, "grad_norm": 1.6219241619110107, "learning_rate": 0.004336966277747251, "loss": 8.0081, "step": 395300 }, { "epoch": 1.6107727044502993, "grad_norm": 4.942535400390625, "learning_rate": 0.004336629456838651, "loss": 7.9625, "step": 395400 }, { "epoch": 1.6111800824736808, "grad_norm": 1.6272914409637451, "learning_rate": 0.004336292563515942, "loss": 7.9781, "step": 395500 }, { "epoch": 1.6115874604970624, "grad_norm": 3.4605345726013184, "learning_rate": 0.004335955597792442, "loss": 7.9501, "step": 395600 }, { "epoch": 1.611994838520444, "grad_norm": 1.3869355916976929, "learning_rate": 0.004335618559681468, "loss": 7.95, "step": 395700 }, { "epoch": 1.6124022165438252, "grad_norm": 5.522362232208252, "learning_rate": 0.004335281449196358, "loss": 7.9842, "step": 395800 }, { "epoch": 1.6128095945672065, "grad_norm": 2.6254281997680664, "learning_rate": 0.004334944266350435, "loss": 7.937, "step": 395900 }, { "epoch": 1.613216972590588, "grad_norm": 2.4742610454559326, "learning_rate": 0.004334607011157026, "loss": 7.9573, "step": 396000 }, { "epoch": 1.613216972590588, "eval_MaskedAccuracy": 0.4950616120567586, "eval_loss": 1.6649715900421143, "eval_runtime": 164.3113, "eval_samples_per_second": 386.316, "eval_steps_per_second": 1.509, "step": 396000 }, { "epoch": 1.6136243506139696, "grad_norm": 3.2119338512420654, "learning_rate": 0.004334269683629464, "loss": 7.9016, "step": 396100 }, { "epoch": 1.6140317286373511, "grad_norm": 1.6203646659851074, "learning_rate": 0.004333932283781086, "loss": 7.9583, "step": 396200 }, { "epoch": 1.6144391066607326, "grad_norm": 7.14988374710083, "learning_rate": 0.004333594811625235, "loss": 7.9748, "step": 396300 }, { "epoch": 1.614846484684114, "grad_norm": 2.0653839111328125, "learning_rate": 0.004333257267175256, "loss": 7.966, "step": 396400 }, { "epoch": 1.6152538627074955, "grad_norm": 3.278308153152466, "learning_rate": 0.00433291965044449, "loss": 7.9845, "step": 396500 }, { "epoch": 1.6156612407308768, "grad_norm": 3.3761467933654785, "learning_rate": 0.004332581961446283, "loss": 7.9792, "step": 396600 }, { "epoch": 1.6160686187542583, "grad_norm": 4.255733013153076, "learning_rate": 0.004332244200193985, "loss": 7.9835, "step": 396700 }, { "epoch": 1.6164759967776399, "grad_norm": 2.899632692337036, "learning_rate": 0.004331906366700951, "loss": 7.9676, "step": 396800 }, { "epoch": 1.6168833748010214, "grad_norm": 1.675042986869812, "learning_rate": 0.0043315684609805324, "loss": 7.9631, "step": 396900 }, { "epoch": 1.6172907528244027, "grad_norm": 1.578526496887207, "learning_rate": 0.0043312304830461, "loss": 7.9974, "step": 397000 }, { "epoch": 1.6172907528244027, "eval_MaskedAccuracy": 0.49271030592332854, "eval_loss": 1.6789170503616333, "eval_runtime": 193.6599, "eval_samples_per_second": 327.771, "eval_steps_per_second": 1.281, "step": 397000 }, { "epoch": 1.6176981308477842, "grad_norm": 1.3485881090164185, "learning_rate": 0.004330892432911004, "loss": 7.9684, "step": 397100 }, { "epoch": 1.6181055088711656, "grad_norm": 3.4496870040893555, "learning_rate": 0.004330554310588616, "loss": 7.9862, "step": 397200 }, { "epoch": 1.618512886894547, "grad_norm": 4.903573513031006, "learning_rate": 0.004330216116092305, "loss": 7.9935, "step": 397300 }, { "epoch": 1.6189202649179286, "grad_norm": 2.4433298110961914, "learning_rate": 0.004329877849435437, "loss": 7.979, "step": 397400 }, { "epoch": 1.6193276429413102, "grad_norm": 5.1005120277404785, "learning_rate": 0.004329539510631386, "loss": 7.9972, "step": 397500 }, { "epoch": 1.6197350209646917, "grad_norm": 3.9642107486724854, "learning_rate": 0.004329201099693534, "loss": 7.9366, "step": 397600 }, { "epoch": 1.620142398988073, "grad_norm": 2.4458343982696533, "learning_rate": 0.004328862616635256, "loss": 7.9604, "step": 397700 }, { "epoch": 1.6205497770114543, "grad_norm": 2.9090120792388916, "learning_rate": 0.004328524061469932, "loss": 8.0138, "step": 397800 }, { "epoch": 1.6209571550348358, "grad_norm": 2.2161173820495605, "learning_rate": 0.0043281854342109485, "loss": 7.9771, "step": 397900 }, { "epoch": 1.6213645330582174, "grad_norm": 7.317041397094727, "learning_rate": 0.004327846734871696, "loss": 7.9441, "step": 398000 }, { "epoch": 1.6213645330582174, "eval_MaskedAccuracy": 0.49359894427607975, "eval_loss": 1.6811603307724, "eval_runtime": 275.6596, "eval_samples_per_second": 230.269, "eval_steps_per_second": 0.9, "step": 398000 }, { "epoch": 1.621771911081599, "grad_norm": 4.609094142913818, "learning_rate": 0.004327507963465556, "loss": 7.9808, "step": 398100 }, { "epoch": 1.6221792891049804, "grad_norm": 1.264139175415039, "learning_rate": 0.00432716912000593, "loss": 7.9551, "step": 398200 }, { "epoch": 1.6225866671283617, "grad_norm": 3.328643321990967, "learning_rate": 0.004326830204506212, "loss": 7.9815, "step": 398300 }, { "epoch": 1.622994045151743, "grad_norm": 2.0682921409606934, "learning_rate": 0.004326491216979807, "loss": 7.9274, "step": 398400 }, { "epoch": 1.6234014231751246, "grad_norm": 2.3350448608398438, "learning_rate": 0.004326152157440109, "loss": 7.9593, "step": 398500 }, { "epoch": 1.6238088011985061, "grad_norm": 2.5045876502990723, "learning_rate": 0.004325813025900526, "loss": 7.9423, "step": 398600 }, { "epoch": 1.6242161792218877, "grad_norm": 3.6239988803863525, "learning_rate": 0.004325473822374469, "loss": 7.9834, "step": 398700 }, { "epoch": 1.6246235572452692, "grad_norm": 5.0391130447387695, "learning_rate": 0.004325134546875345, "loss": 7.9711, "step": 398800 }, { "epoch": 1.6250309352686505, "grad_norm": 4.182869911193848, "learning_rate": 0.004324795199416563, "loss": 7.9867, "step": 398900 }, { "epoch": 1.625438313292032, "grad_norm": 1.795639157295227, "learning_rate": 0.0043244557800115446, "loss": 7.9402, "step": 399000 }, { "epoch": 1.625438313292032, "eval_MaskedAccuracy": 0.4933457146622044, "eval_loss": 1.6800826787948608, "eval_runtime": 198.8368, "eval_samples_per_second": 319.237, "eval_steps_per_second": 1.247, "step": 399000 }, { "epoch": 1.6258456913154133, "grad_norm": 1.998484492301941, "learning_rate": 0.004324116288673711, "loss": 7.9877, "step": 399100 }, { "epoch": 1.6262530693387949, "grad_norm": 5.772905349731445, "learning_rate": 0.0043237767254164835, "loss": 7.9615, "step": 399200 }, { "epoch": 1.6266604473621764, "grad_norm": 2.0624938011169434, "learning_rate": 0.004323437090253278, "loss": 7.971, "step": 399300 }, { "epoch": 1.627067825385558, "grad_norm": 3.218437671661377, "learning_rate": 0.004323097383197533, "loss": 7.9829, "step": 399400 }, { "epoch": 1.6274752034089393, "grad_norm": 4.9599103927612305, "learning_rate": 0.004322757604262674, "loss": 7.9524, "step": 399500 }, { "epoch": 1.6278825814323208, "grad_norm": 2.601705312728882, "learning_rate": 0.004322417753462135, "loss": 7.9643, "step": 399600 }, { "epoch": 1.628289959455702, "grad_norm": 3.322831869125366, "learning_rate": 0.004322077830809351, "loss": 7.9489, "step": 399700 }, { "epoch": 1.6286973374790836, "grad_norm": 8.045740127563477, "learning_rate": 0.004321737836317761, "loss": 7.9575, "step": 399800 }, { "epoch": 1.6291047155024652, "grad_norm": 2.5539023876190186, "learning_rate": 0.004321397770000811, "loss": 7.9826, "step": 399900 }, { "epoch": 1.6295120935258467, "grad_norm": 2.9093823432922363, "learning_rate": 0.00432105763187194, "loss": 7.9627, "step": 400000 }, { "epoch": 1.6295120935258467, "eval_MaskedAccuracy": 0.4935417284178504, "eval_loss": 1.6767427921295166, "eval_runtime": 177.7659, "eval_samples_per_second": 357.076, "eval_steps_per_second": 1.395, "step": 400000 }, { "epoch": 1.6299194715492282, "grad_norm": 5.491323471069336, "learning_rate": 0.004320717421944603, "loss": 7.9637, "step": 400100 }, { "epoch": 1.6303268495726095, "grad_norm": 4.1802239418029785, "learning_rate": 0.004320377140232242, "loss": 7.9297, "step": 400200 }, { "epoch": 1.6307342275959908, "grad_norm": 2.1961169242858887, "learning_rate": 0.004320036786748309, "loss": 7.9707, "step": 400300 }, { "epoch": 1.6311416056193724, "grad_norm": 4.688284873962402, "learning_rate": 0.004319696361506266, "loss": 7.953, "step": 400400 }, { "epoch": 1.631548983642754, "grad_norm": 3.7482964992523193, "learning_rate": 0.00431935586451957, "loss": 7.9678, "step": 400500 }, { "epoch": 1.6319563616661354, "grad_norm": 3.6271839141845703, "learning_rate": 0.004319015295801683, "loss": 7.9599, "step": 400600 }, { "epoch": 1.632363739689517, "grad_norm": 1.394652247428894, "learning_rate": 0.004318674655366075, "loss": 7.9605, "step": 400700 }, { "epoch": 1.6327711177128983, "grad_norm": 6.607354164123535, "learning_rate": 0.004318333943226209, "loss": 7.9698, "step": 400800 }, { "epoch": 1.6331784957362796, "grad_norm": 4.040886402130127, "learning_rate": 0.004317993159395542, "loss": 7.9767, "step": 400900 }, { "epoch": 1.6335858737596611, "grad_norm": 1.9131618738174438, "learning_rate": 0.004317652303887569, "loss": 7.9528, "step": 401000 }, { "epoch": 1.6335858737596611, "eval_MaskedAccuracy": 0.4931281655770838, "eval_loss": 1.6782045364379883, "eval_runtime": 223.652, "eval_samples_per_second": 283.816, "eval_steps_per_second": 1.109, "step": 401000 }, { "epoch": 1.6339932517830427, "grad_norm": 2.304086208343506, "learning_rate": 0.004317311376715756, "loss": 7.9797, "step": 401100 }, { "epoch": 1.6344006298064242, "grad_norm": 2.0475404262542725, "learning_rate": 0.00431697037789358, "loss": 7.9691, "step": 401200 }, { "epoch": 1.6348080078298057, "grad_norm": 1.0804282426834106, "learning_rate": 0.004316629307434522, "loss": 7.9735, "step": 401300 }, { "epoch": 1.635215385853187, "grad_norm": 1.3630620241165161, "learning_rate": 0.004316288165352066, "loss": 7.9923, "step": 401400 }, { "epoch": 1.6356227638765686, "grad_norm": 1.6740797758102417, "learning_rate": 0.0043159469516597035, "loss": 7.9888, "step": 401500 }, { "epoch": 1.6360301418999499, "grad_norm": 2.8491854667663574, "learning_rate": 0.004315605666370924, "loss": 7.9797, "step": 401600 }, { "epoch": 1.6364375199233314, "grad_norm": 1.6639002561569214, "learning_rate": 0.004315264309499222, "loss": 7.9679, "step": 401700 }, { "epoch": 1.636844897946713, "grad_norm": 4.907125949859619, "learning_rate": 0.004314922881058082, "loss": 7.9838, "step": 401800 }, { "epoch": 1.6372522759700945, "grad_norm": 1.5400975942611694, "learning_rate": 0.004314581381061013, "loss": 7.9144, "step": 401900 }, { "epoch": 1.6376596539934758, "grad_norm": 1.6475751399993896, "learning_rate": 0.004314239809521518, "loss": 7.9528, "step": 402000 }, { "epoch": 1.6376596539934758, "eval_MaskedAccuracy": 0.49255158440846286, "eval_loss": 1.6747674942016602, "eval_runtime": 234.5816, "eval_samples_per_second": 270.592, "eval_steps_per_second": 1.057, "step": 402000 }, { "epoch": 1.6380670320168573, "grad_norm": 2.1660892963409424, "learning_rate": 0.00431389816645309, "loss": 7.9743, "step": 402100 }, { "epoch": 1.6384744100402386, "grad_norm": 3.7078475952148438, "learning_rate": 0.004313556451869249, "loss": 7.9838, "step": 402200 }, { "epoch": 1.6388817880636202, "grad_norm": 5.297366142272949, "learning_rate": 0.004313214665783495, "loss": 7.9534, "step": 402300 }, { "epoch": 1.6392891660870017, "grad_norm": 6.336480617523193, "learning_rate": 0.00431287280820935, "loss": 7.9774, "step": 402400 }, { "epoch": 1.6396965441103832, "grad_norm": 3.608903169631958, "learning_rate": 0.004312530879160318, "loss": 7.956, "step": 402500 }, { "epoch": 1.6401039221337648, "grad_norm": 8.459395408630371, "learning_rate": 0.0043121888786499234, "loss": 7.9813, "step": 402600 }, { "epoch": 1.640511300157146, "grad_norm": 2.347891330718994, "learning_rate": 0.004311846806691691, "loss": 7.9465, "step": 402700 }, { "epoch": 1.6409186781805274, "grad_norm": 4.714664459228516, "learning_rate": 0.004311504663299135, "loss": 7.965, "step": 402800 }, { "epoch": 1.641326056203909, "grad_norm": 1.5120912790298462, "learning_rate": 0.004311162448485785, "loss": 7.9713, "step": 402900 }, { "epoch": 1.6417334342272905, "grad_norm": 4.471959590911865, "learning_rate": 0.004310820162265182, "loss": 7.9685, "step": 403000 }, { "epoch": 1.6417334342272905, "eval_MaskedAccuracy": 0.4934505209408353, "eval_loss": 1.6755223274230957, "eval_runtime": 211.8507, "eval_samples_per_second": 299.626, "eval_steps_per_second": 1.171, "step": 403000 }, { "epoch": 1.642140812250672, "grad_norm": 4.829403400421143, "learning_rate": 0.0043104778046508495, "loss": 7.9463, "step": 403100 }, { "epoch": 1.6425481902740535, "grad_norm": 2.266538381576538, "learning_rate": 0.004310135375656314, "loss": 7.9841, "step": 403200 }, { "epoch": 1.6429555682974348, "grad_norm": 2.3340885639190674, "learning_rate": 0.004309792875295126, "loss": 7.9527, "step": 403300 }, { "epoch": 1.6433629463208161, "grad_norm": 7.065990924835205, "learning_rate": 0.004309450303580828, "loss": 7.9651, "step": 403400 }, { "epoch": 1.6437703243441977, "grad_norm": 2.794461965560913, "learning_rate": 0.004309107660526959, "loss": 7.958, "step": 403500 }, { "epoch": 1.6441777023675792, "grad_norm": 4.081377029418945, "learning_rate": 0.00430876494614706, "loss": 7.9748, "step": 403600 }, { "epoch": 1.6445850803909607, "grad_norm": 3.606553316116333, "learning_rate": 0.004308422160454688, "loss": 7.9587, "step": 403700 }, { "epoch": 1.6449924584143423, "grad_norm": 5.445745468139648, "learning_rate": 0.004308079303463388, "loss": 7.9171, "step": 403800 }, { "epoch": 1.6453998364377236, "grad_norm": 4.294501781463623, "learning_rate": 0.004307736375186721, "loss": 7.9594, "step": 403900 }, { "epoch": 1.6458072144611051, "grad_norm": 1.0839147567749023, "learning_rate": 0.004307393375638248, "loss": 7.9425, "step": 404000 }, { "epoch": 1.6458072144611051, "eval_MaskedAccuracy": 0.4937758301783506, "eval_loss": 1.675153136253357, "eval_runtime": 185.3521, "eval_samples_per_second": 342.462, "eval_steps_per_second": 1.338, "step": 404000 }, { "epoch": 1.6462145924844864, "grad_norm": 5.004493236541748, "learning_rate": 0.004307050304831517, "loss": 7.9765, "step": 404100 }, { "epoch": 1.646621970507868, "grad_norm": 3.2074897289276123, "learning_rate": 0.004306707162780103, "loss": 7.9565, "step": 404200 }, { "epoch": 1.6470293485312495, "grad_norm": 1.7683274745941162, "learning_rate": 0.004306363949497569, "loss": 7.985, "step": 404300 }, { "epoch": 1.647436726554631, "grad_norm": 2.3448328971862793, "learning_rate": 0.0043060206649974805, "loss": 7.955, "step": 404400 }, { "epoch": 1.6478441045780123, "grad_norm": 2.96193528175354, "learning_rate": 0.0043056773092934155, "loss": 7.9635, "step": 404500 }, { "epoch": 1.6482514826013939, "grad_norm": 1.4944769144058228, "learning_rate": 0.004305333882398952, "loss": 7.958, "step": 404600 }, { "epoch": 1.6486588606247752, "grad_norm": 2.9811477661132812, "learning_rate": 0.004304990384327652, "loss": 7.9884, "step": 404700 }, { "epoch": 1.6490662386481567, "grad_norm": 13.158381462097168, "learning_rate": 0.004304646815093105, "loss": 7.9626, "step": 404800 }, { "epoch": 1.6494736166715382, "grad_norm": 4.048529148101807, "learning_rate": 0.0043043031747089, "loss": 7.958, "step": 404900 }, { "epoch": 1.6498809946949198, "grad_norm": 1.8619483709335327, "learning_rate": 0.004303959463188612, "loss": 7.9605, "step": 405000 }, { "epoch": 1.6498809946949198, "eval_MaskedAccuracy": 0.49467404513256763, "eval_loss": 1.6699923276901245, "eval_runtime": 266.4221, "eval_samples_per_second": 238.254, "eval_steps_per_second": 0.931, "step": 405000 }, { "epoch": 1.6502883727183013, "grad_norm": 4.210719585418701, "learning_rate": 0.004303615680545833, "loss": 7.9587, "step": 405100 }, { "epoch": 1.6506957507416826, "grad_norm": 6.177710056304932, "learning_rate": 0.004303271826794163, "loss": 7.9308, "step": 405200 }, { "epoch": 1.651103128765064, "grad_norm": 8.642091751098633, "learning_rate": 0.00430292790194719, "loss": 7.9682, "step": 405300 }, { "epoch": 1.6515105067884455, "grad_norm": 4.50748348236084, "learning_rate": 0.004302583906018505, "loss": 7.9616, "step": 405400 }, { "epoch": 1.651917884811827, "grad_norm": 4.075202465057373, "learning_rate": 0.004302239839021721, "loss": 7.9338, "step": 405500 }, { "epoch": 1.6523252628352085, "grad_norm": 1.6843070983886719, "learning_rate": 0.004301895700970431, "loss": 7.978, "step": 405600 }, { "epoch": 1.65273264085859, "grad_norm": 2.934196710586548, "learning_rate": 0.0043015514918782445, "loss": 7.9676, "step": 405700 }, { "epoch": 1.6531400188819714, "grad_norm": 2.3354573249816895, "learning_rate": 0.004301207211758769, "loss": 7.9966, "step": 405800 }, { "epoch": 1.6535473969053527, "grad_norm": 3.1194190979003906, "learning_rate": 0.00430086286062561, "loss": 7.985, "step": 405900 }, { "epoch": 1.6539547749287342, "grad_norm": 1.0679525136947632, "learning_rate": 0.004300518438492392, "loss": 7.9865, "step": 406000 }, { "epoch": 1.6539547749287342, "eval_MaskedAccuracy": 0.49296178046008887, "eval_loss": 1.6834198236465454, "eval_runtime": 170.1924, "eval_samples_per_second": 372.966, "eval_steps_per_second": 1.457, "step": 406000 }, { "epoch": 1.6543621529521157, "grad_norm": 3.0432701110839844, "learning_rate": 0.004300173945372724, "loss": 7.9259, "step": 406100 }, { "epoch": 1.6547695309754973, "grad_norm": 2.8257131576538086, "learning_rate": 0.004299829381280237, "loss": 7.9596, "step": 406200 }, { "epoch": 1.6551769089988788, "grad_norm": 1.2833287715911865, "learning_rate": 0.004299484746228541, "loss": 7.9542, "step": 406300 }, { "epoch": 1.6555842870222601, "grad_norm": 1.386589527130127, "learning_rate": 0.004299140040231264, "loss": 7.9351, "step": 406400 }, { "epoch": 1.6559916650456417, "grad_norm": 1.9801644086837769, "learning_rate": 0.004298795263302037, "loss": 7.9559, "step": 406500 }, { "epoch": 1.656399043069023, "grad_norm": 2.5966262817382812, "learning_rate": 0.0042984504154544865, "loss": 7.9901, "step": 406600 }, { "epoch": 1.6568064210924045, "grad_norm": 1.6363015174865723, "learning_rate": 0.004298105496702247, "loss": 7.9713, "step": 406700 }, { "epoch": 1.657213799115786, "grad_norm": 4.371832847595215, "learning_rate": 0.004297760507058954, "loss": 7.9641, "step": 406800 }, { "epoch": 1.6576211771391676, "grad_norm": 1.9847220182418823, "learning_rate": 0.004297415446538257, "loss": 7.9447, "step": 406900 }, { "epoch": 1.6580285551625489, "grad_norm": 1.4259852170944214, "learning_rate": 0.004297070315153793, "loss": 7.9332, "step": 407000 }, { "epoch": 1.6580285551625489, "eval_MaskedAccuracy": 0.49389236265893344, "eval_loss": 1.6760696172714233, "eval_runtime": 283.6492, "eval_samples_per_second": 223.783, "eval_steps_per_second": 0.874, "step": 407000 }, { "epoch": 1.6584359331859304, "grad_norm": 3.0803794860839844, "learning_rate": 0.004296725112919201, "loss": 7.9529, "step": 407100 }, { "epoch": 1.6588433112093117, "grad_norm": 1.4253218173980713, "learning_rate": 0.004296379839848138, "loss": 7.9313, "step": 407200 }, { "epoch": 1.6592506892326933, "grad_norm": 1.1467676162719727, "learning_rate": 0.004296034495954242, "loss": 7.9871, "step": 407300 }, { "epoch": 1.6596580672560748, "grad_norm": 1.4785820245742798, "learning_rate": 0.004295689081251178, "loss": 8.0032, "step": 407400 }, { "epoch": 1.6600654452794563, "grad_norm": 4.422873020172119, "learning_rate": 0.004295343595752593, "loss": 7.9378, "step": 407500 }, { "epoch": 1.6604728233028379, "grad_norm": 5.220428943634033, "learning_rate": 0.004294998039472155, "loss": 7.9202, "step": 407600 }, { "epoch": 1.6608802013262192, "grad_norm": 8.391969680786133, "learning_rate": 0.0042946524124235184, "loss": 7.9858, "step": 407700 }, { "epoch": 1.6612875793496005, "grad_norm": 3.050560474395752, "learning_rate": 0.004294306714620354, "loss": 7.9808, "step": 407800 }, { "epoch": 1.661694957372982, "grad_norm": 3.4746363162994385, "learning_rate": 0.004293960946076323, "loss": 7.9493, "step": 407900 }, { "epoch": 1.6621023353963635, "grad_norm": 1.5520859956741333, "learning_rate": 0.004293615106805094, "loss": 7.9309, "step": 408000 }, { "epoch": 1.6621023353963635, "eval_MaskedAccuracy": 0.4951752125082972, "eval_loss": 1.66789972782135, "eval_runtime": 230.7829, "eval_samples_per_second": 275.046, "eval_steps_per_second": 1.075, "step": 408000 }, { "epoch": 1.662509713419745, "grad_norm": 7.703237056732178, "learning_rate": 0.004293269196820345, "loss": 7.9612, "step": 408100 }, { "epoch": 1.6629170914431266, "grad_norm": 2.1739680767059326, "learning_rate": 0.004292923216135756, "loss": 7.9304, "step": 408200 }, { "epoch": 1.663324469466508, "grad_norm": 3.6314687728881836, "learning_rate": 0.004292577164764995, "loss": 7.95, "step": 408300 }, { "epoch": 1.6637318474898892, "grad_norm": 1.4672982692718506, "learning_rate": 0.004292231042721748, "loss": 7.9747, "step": 408400 }, { "epoch": 1.6641392255132708, "grad_norm": 2.5027122497558594, "learning_rate": 0.004291884850019704, "loss": 7.96, "step": 408500 }, { "epoch": 1.6645466035366523, "grad_norm": 1.6916136741638184, "learning_rate": 0.004291538586672537, "loss": 7.9663, "step": 408600 }, { "epoch": 1.6649539815600338, "grad_norm": 1.5934362411499023, "learning_rate": 0.004291192252693948, "loss": 7.9469, "step": 408700 }, { "epoch": 1.6653613595834154, "grad_norm": 2.4465174674987793, "learning_rate": 0.0042908458480976274, "loss": 7.9432, "step": 408800 }, { "epoch": 1.6657687376067967, "grad_norm": 4.356081008911133, "learning_rate": 0.004290499372897264, "loss": 7.984, "step": 408900 }, { "epoch": 1.6661761156301782, "grad_norm": 3.9348583221435547, "learning_rate": 0.004290152827106566, "loss": 7.9373, "step": 409000 }, { "epoch": 1.6661761156301782, "eval_MaskedAccuracy": 0.4956848225531449, "eval_loss": 1.672873854637146, "eval_runtime": 177.0871, "eval_samples_per_second": 358.445, "eval_steps_per_second": 1.4, "step": 409000 }, { "epoch": 1.6665834936535595, "grad_norm": 2.170700788497925, "learning_rate": 0.004289806210739226, "loss": 7.9574, "step": 409100 }, { "epoch": 1.666990871676941, "grad_norm": 6.48043155670166, "learning_rate": 0.0042894595238089465, "loss": 7.9671, "step": 409200 }, { "epoch": 1.6673982497003226, "grad_norm": 3.2680306434631348, "learning_rate": 0.0042891127663294455, "loss": 7.9648, "step": 409300 }, { "epoch": 1.667805627723704, "grad_norm": 7.81300687789917, "learning_rate": 0.004288765938314423, "loss": 7.9988, "step": 409400 }, { "epoch": 1.6682130057470854, "grad_norm": 2.928682327270508, "learning_rate": 0.0042884190397775935, "loss": 7.94, "step": 409500 }, { "epoch": 1.668620383770467, "grad_norm": 5.415159225463867, "learning_rate": 0.004288072070732674, "loss": 7.9688, "step": 409600 }, { "epoch": 1.6690277617938483, "grad_norm": 2.4109930992126465, "learning_rate": 0.004287725031193375, "loss": 7.9196, "step": 409700 }, { "epoch": 1.6694351398172298, "grad_norm": 3.1917884349823, "learning_rate": 0.004287377921173422, "loss": 7.9317, "step": 409800 }, { "epoch": 1.6698425178406113, "grad_norm": 2.7681193351745605, "learning_rate": 0.004287030740686535, "loss": 7.956, "step": 409900 }, { "epoch": 1.6702498958639929, "grad_norm": 4.684969425201416, "learning_rate": 0.004286683489746447, "loss": 7.9379, "step": 410000 }, { "epoch": 1.6702498958639929, "eval_MaskedAccuracy": 0.49399434037778567, "eval_loss": 1.6739771366119385, "eval_runtime": 249.0096, "eval_samples_per_second": 254.914, "eval_steps_per_second": 0.996, "step": 410000 }, { "epoch": 1.6706572738873744, "grad_norm": 3.6070775985717773, "learning_rate": 0.004286336168366876, "loss": 7.9518, "step": 410100 }, { "epoch": 1.6710646519107557, "grad_norm": 6.102087497711182, "learning_rate": 0.004285988776561558, "loss": 7.9334, "step": 410200 }, { "epoch": 1.671472029934137, "grad_norm": 7.099484920501709, "learning_rate": 0.00428564131434423, "loss": 7.9412, "step": 410300 }, { "epoch": 1.6718794079575185, "grad_norm": 2.260991096496582, "learning_rate": 0.004285293781728627, "loss": 7.943, "step": 410400 }, { "epoch": 1.6722867859809, "grad_norm": 5.115174293518066, "learning_rate": 0.00428494617872849, "loss": 7.9473, "step": 410500 }, { "epoch": 1.6726941640042816, "grad_norm": 2.993117094039917, "learning_rate": 0.004284598505357562, "loss": 7.9453, "step": 410600 }, { "epoch": 1.6731015420276631, "grad_norm": 3.088998556137085, "learning_rate": 0.004284250761629587, "loss": 7.9341, "step": 410700 }, { "epoch": 1.6735089200510445, "grad_norm": 3.6426403522491455, "learning_rate": 0.004283902947558312, "loss": 7.9126, "step": 410800 }, { "epoch": 1.6739162980744258, "grad_norm": 2.7926712036132812, "learning_rate": 0.004283555063157483, "loss": 7.9481, "step": 410900 }, { "epoch": 1.6743236760978073, "grad_norm": 4.062406539916992, "learning_rate": 0.00428320710844086, "loss": 7.989, "step": 411000 }, { "epoch": 1.6743236760978073, "eval_MaskedAccuracy": 0.49204426507389587, "eval_loss": 1.6853642463684082, "eval_runtime": 162.4687, "eval_samples_per_second": 390.697, "eval_steps_per_second": 1.526, "step": 411000 }, { "epoch": 1.6747310541211888, "grad_norm": 3.0080082416534424, "learning_rate": 0.004282859083422203, "loss": 8.0003, "step": 411100 }, { "epoch": 1.6751384321445704, "grad_norm": 3.0320534706115723, "learning_rate": 0.0042825109881152726, "loss": 7.9293, "step": 411200 }, { "epoch": 1.675545810167952, "grad_norm": 6.334164142608643, "learning_rate": 0.004282162822533823, "loss": 7.9709, "step": 411300 }, { "epoch": 1.6759531881913332, "grad_norm": 1.5587489604949951, "learning_rate": 0.004281814586691617, "loss": 7.9611, "step": 411400 }, { "epoch": 1.6763605662147147, "grad_norm": 3.1051578521728516, "learning_rate": 0.004281466280602431, "loss": 7.9527, "step": 411500 }, { "epoch": 1.676767944238096, "grad_norm": 5.084553241729736, "learning_rate": 0.0042811179042800355, "loss": 7.9475, "step": 411600 }, { "epoch": 1.6771753222614776, "grad_norm": 4.1257243156433105, "learning_rate": 0.004280769457738201, "loss": 7.9343, "step": 411700 }, { "epoch": 1.6775827002848591, "grad_norm": 1.3054580688476562, "learning_rate": 0.004280420940990705, "loss": 7.976, "step": 411800 }, { "epoch": 1.6779900783082407, "grad_norm": 4.228930473327637, "learning_rate": 0.004280072354051323, "loss": 7.9528, "step": 411900 }, { "epoch": 1.678397456331622, "grad_norm": 3.6710877418518066, "learning_rate": 0.004279723696933832, "loss": 7.9711, "step": 412000 }, { "epoch": 1.678397456331622, "eval_MaskedAccuracy": 0.4943088891483813, "eval_loss": 1.6768760681152344, "eval_runtime": 214.2433, "eval_samples_per_second": 296.28, "eval_steps_per_second": 1.158, "step": 412000 }, { "epoch": 1.6788048343550035, "grad_norm": 6.524370193481445, "learning_rate": 0.0042793749696520165, "loss": 7.9817, "step": 412100 }, { "epoch": 1.6792122123783848, "grad_norm": 2.379122495651245, "learning_rate": 0.004279026172219673, "loss": 7.9257, "step": 412200 }, { "epoch": 1.6796195904017663, "grad_norm": 2.0603673458099365, "learning_rate": 0.004278677304650586, "loss": 7.9422, "step": 412300 }, { "epoch": 1.6800269684251479, "grad_norm": 1.7358134984970093, "learning_rate": 0.00427832836695855, "loss": 7.9817, "step": 412400 }, { "epoch": 1.6804343464485294, "grad_norm": 3.6427125930786133, "learning_rate": 0.004277979359157362, "loss": 7.9542, "step": 412500 }, { "epoch": 1.680841724471911, "grad_norm": 1.3322529792785645, "learning_rate": 0.004277630281260814, "loss": 7.998, "step": 412600 }, { "epoch": 1.6812491024952922, "grad_norm": 3.6892545223236084, "learning_rate": 0.004277281133282713, "loss": 7.9742, "step": 412700 }, { "epoch": 1.6816564805186736, "grad_norm": 1.3000271320343018, "learning_rate": 0.004276931915236861, "loss": 7.9364, "step": 412800 }, { "epoch": 1.682063858542055, "grad_norm": 4.525582790374756, "learning_rate": 0.004276582627137063, "loss": 7.9527, "step": 412900 }, { "epoch": 1.6824712365654366, "grad_norm": 2.7989420890808105, "learning_rate": 0.0042762332689971275, "loss": 7.9654, "step": 413000 }, { "epoch": 1.6824712365654366, "eval_MaskedAccuracy": 0.49499718782733126, "eval_loss": 1.671339988708496, "eval_runtime": 231.7647, "eval_samples_per_second": 273.881, "eval_steps_per_second": 1.07, "step": 413000 }, { "epoch": 1.6828786145888182, "grad_norm": 5.067391395568848, "learning_rate": 0.004275883840830866, "loss": 7.9547, "step": 413100 }, { "epoch": 1.6832859926121997, "grad_norm": 1.8825346231460571, "learning_rate": 0.0042755343426520995, "loss": 7.9453, "step": 413200 }, { "epoch": 1.683693370635581, "grad_norm": 3.201878786087036, "learning_rate": 0.004275184774474639, "loss": 7.9486, "step": 413300 }, { "epoch": 1.6841007486589623, "grad_norm": 1.6017849445343018, "learning_rate": 0.004274835136312307, "loss": 7.9322, "step": 413400 }, { "epoch": 1.6845081266823438, "grad_norm": 2.9095780849456787, "learning_rate": 0.004274485428178923, "loss": 7.9611, "step": 413500 }, { "epoch": 1.6849155047057254, "grad_norm": 1.9746508598327637, "learning_rate": 0.00427413565008831, "loss": 7.9572, "step": 413600 }, { "epoch": 1.685322882729107, "grad_norm": 4.102407932281494, "learning_rate": 0.004273785802054307, "loss": 7.9506, "step": 413700 }, { "epoch": 1.6857302607524884, "grad_norm": 1.4797166585922241, "learning_rate": 0.004273435884090749, "loss": 7.9547, "step": 413800 }, { "epoch": 1.6861376387758698, "grad_norm": 1.2575322389602661, "learning_rate": 0.004273085896211464, "loss": 7.943, "step": 413900 }, { "epoch": 1.6865450167992513, "grad_norm": 2.634089231491089, "learning_rate": 0.004272735838430278, "loss": 7.9474, "step": 414000 }, { "epoch": 1.6865450167992513, "eval_MaskedAccuracy": 0.4948957822579699, "eval_loss": 1.6674184799194336, "eval_runtime": 180.886, "eval_samples_per_second": 350.917, "eval_steps_per_second": 1.371, "step": 414000 }, { "epoch": 1.6869523948226326, "grad_norm": 2.0500218868255615, "learning_rate": 0.004272385710761043, "loss": 7.9274, "step": 414100 }, { "epoch": 1.6873597728460141, "grad_norm": 6.2046356201171875, "learning_rate": 0.0042720355132175964, "loss": 7.9405, "step": 414200 }, { "epoch": 1.6877671508693957, "grad_norm": 4.2807936668396, "learning_rate": 0.00427168524581379, "loss": 7.9614, "step": 414300 }, { "epoch": 1.6881745288927772, "grad_norm": 2.766878604888916, "learning_rate": 0.004271334908563461, "loss": 7.9708, "step": 414400 }, { "epoch": 1.6885819069161585, "grad_norm": 1.8761850595474243, "learning_rate": 0.004270984501480462, "loss": 7.9459, "step": 414500 }, { "epoch": 1.68898928493954, "grad_norm": 2.0470542907714844, "learning_rate": 0.00427063402457866, "loss": 7.9384, "step": 414600 }, { "epoch": 1.6893966629629213, "grad_norm": 2.438100576400757, "learning_rate": 0.004270283477871898, "loss": 7.9537, "step": 414700 }, { "epoch": 1.6898040409863029, "grad_norm": 4.45005464553833, "learning_rate": 0.00426993286137404, "loss": 7.928, "step": 414800 }, { "epoch": 1.6902114190096844, "grad_norm": 2.664546489715576, "learning_rate": 0.00426958217509895, "loss": 7.9209, "step": 414900 }, { "epoch": 1.690618797033066, "grad_norm": 1.286209225654602, "learning_rate": 0.004269231419060486, "loss": 7.955, "step": 415000 }, { "epoch": 1.690618797033066, "eval_MaskedAccuracy": 0.49416102800073225, "eval_loss": 1.6761102676391602, "eval_runtime": 181.4399, "eval_samples_per_second": 349.846, "eval_steps_per_second": 1.367, "step": 415000 }, { "epoch": 1.6910261750564475, "grad_norm": 3.3695225715637207, "learning_rate": 0.004268880593272521, "loss": 7.9491, "step": 415100 }, { "epoch": 1.6914335530798288, "grad_norm": 1.6755496263504028, "learning_rate": 0.004268529697748922, "loss": 7.9383, "step": 415200 }, { "epoch": 1.69184093110321, "grad_norm": 2.594820022583008, "learning_rate": 0.004268178732503559, "loss": 7.9261, "step": 415300 }, { "epoch": 1.6922483091265916, "grad_norm": 3.90124773979187, "learning_rate": 0.004267827697550309, "loss": 7.9241, "step": 415400 }, { "epoch": 1.6926556871499732, "grad_norm": 5.081125259399414, "learning_rate": 0.004267476592903052, "loss": 7.9661, "step": 415500 }, { "epoch": 1.6930630651733547, "grad_norm": 2.1485257148742676, "learning_rate": 0.004267125418575669, "loss": 7.9798, "step": 415600 }, { "epoch": 1.6934704431967362, "grad_norm": 1.2827850580215454, "learning_rate": 0.004266774174582044, "loss": 7.9659, "step": 415700 }, { "epoch": 1.6938778212201175, "grad_norm": 6.2516984939575195, "learning_rate": 0.004266422860936063, "loss": 7.9508, "step": 415800 }, { "epoch": 1.6942851992434989, "grad_norm": 4.668462753295898, "learning_rate": 0.004266071477651613, "loss": 7.9435, "step": 415900 }, { "epoch": 1.6946925772668804, "grad_norm": 2.5708775520324707, "learning_rate": 0.004265720024742591, "loss": 7.9611, "step": 416000 }, { "epoch": 1.6946925772668804, "eval_MaskedAccuracy": 0.49421570324190384, "eval_loss": 1.6862056255340576, "eval_runtime": 275.3313, "eval_samples_per_second": 230.544, "eval_steps_per_second": 0.901, "step": 416000 }, { "epoch": 1.695099955290262, "grad_norm": 5.651008129119873, "learning_rate": 0.004265368502222889, "loss": 7.9128, "step": 416100 }, { "epoch": 1.6955073333136434, "grad_norm": 4.246887683868408, "learning_rate": 0.004265016910106409, "loss": 7.9104, "step": 416200 }, { "epoch": 1.695914711337025, "grad_norm": 3.257011890411377, "learning_rate": 0.004264665248407044, "loss": 7.9553, "step": 416300 }, { "epoch": 1.6963220893604063, "grad_norm": 4.464527130126953, "learning_rate": 0.004264313517138699, "loss": 7.9419, "step": 416400 }, { "epoch": 1.6967294673837878, "grad_norm": 1.7570393085479736, "learning_rate": 0.0042639617163152755, "loss": 7.9531, "step": 416500 }, { "epoch": 1.6971368454071691, "grad_norm": 2.615110397338867, "learning_rate": 0.004263609845950698, "loss": 7.9562, "step": 416600 }, { "epoch": 1.6975442234305507, "grad_norm": 1.7686477899551392, "learning_rate": 0.004263257906058863, "loss": 7.965, "step": 416700 }, { "epoch": 1.6979516014539322, "grad_norm": 1.298539638519287, "learning_rate": 0.004262905896653691, "loss": 7.9437, "step": 416800 }, { "epoch": 1.6983589794773137, "grad_norm": 4.391310214996338, "learning_rate": 0.004262553817749098, "loss": 7.9452, "step": 416900 }, { "epoch": 1.698766357500695, "grad_norm": 5.422664165496826, "learning_rate": 0.0042622016693590024, "loss": 7.9323, "step": 417000 }, { "epoch": 1.698766357500695, "eval_MaskedAccuracy": 0.49353902698434704, "eval_loss": 1.6833051443099976, "eval_runtime": 200.6313, "eval_samples_per_second": 316.381, "eval_steps_per_second": 1.236, "step": 417000 }, { "epoch": 1.6991737355240766, "grad_norm": 5.574663162231445, "learning_rate": 0.004261849451497324, "loss": 7.9334, "step": 417100 }, { "epoch": 1.699581113547458, "grad_norm": 1.7768208980560303, "learning_rate": 0.004261497164177991, "loss": 7.9233, "step": 417200 }, { "epoch": 1.6999884915708394, "grad_norm": 3.46205472946167, "learning_rate": 0.004261144807414934, "loss": 7.9454, "step": 417300 }, { "epoch": 1.700395869594221, "grad_norm": 8.98598575592041, "learning_rate": 0.00426079238122208, "loss": 7.9507, "step": 417400 }, { "epoch": 1.7008032476176025, "grad_norm": 2.936594009399414, "learning_rate": 0.004260439885613358, "loss": 7.982, "step": 417500 }, { "epoch": 1.701210625640984, "grad_norm": 3.1874189376831055, "learning_rate": 0.004260087320602715, "loss": 7.9595, "step": 417600 }, { "epoch": 1.7016180036643653, "grad_norm": 5.177585601806641, "learning_rate": 0.004259734686204081, "loss": 7.9514, "step": 417700 }, { "epoch": 1.7020253816877466, "grad_norm": 5.808584690093994, "learning_rate": 0.004259381982431398, "loss": 7.9618, "step": 417800 }, { "epoch": 1.7024327597111282, "grad_norm": 2.695436954498291, "learning_rate": 0.004259029209298612, "loss": 7.9535, "step": 417900 }, { "epoch": 1.7028401377345097, "grad_norm": 5.604297637939453, "learning_rate": 0.004258676366819673, "loss": 7.9603, "step": 418000 }, { "epoch": 1.7028401377345097, "eval_MaskedAccuracy": 0.49453569524485747, "eval_loss": 1.6734776496887207, "eval_runtime": 289.2535, "eval_samples_per_second": 219.448, "eval_steps_per_second": 0.857, "step": 418000 }, { "epoch": 1.7032475157578912, "grad_norm": 2.5026907920837402, "learning_rate": 0.004258323455008524, "loss": 7.975, "step": 418100 }, { "epoch": 1.7036548937812728, "grad_norm": 2.7050795555114746, "learning_rate": 0.004257970473879123, "loss": 7.9621, "step": 418200 }, { "epoch": 1.704062271804654, "grad_norm": 2.320899248123169, "learning_rate": 0.0042576174234454185, "loss": 7.9488, "step": 418300 }, { "epoch": 1.7044696498280354, "grad_norm": 2.0656144618988037, "learning_rate": 0.004257264303721376, "loss": 7.8987, "step": 418400 }, { "epoch": 1.704877027851417, "grad_norm": 4.858016490936279, "learning_rate": 0.004256911114720962, "loss": 7.9145, "step": 418500 }, { "epoch": 1.7052844058747985, "grad_norm": 3.3390238285064697, "learning_rate": 0.004256557856458122, "loss": 7.9105, "step": 418600 }, { "epoch": 1.70569178389818, "grad_norm": 1.1924008131027222, "learning_rate": 0.004256204528946833, "loss": 7.9466, "step": 418700 }, { "epoch": 1.7060991619215615, "grad_norm": 2.3600430488586426, "learning_rate": 0.004255851132201069, "loss": 7.9802, "step": 418800 }, { "epoch": 1.7065065399449428, "grad_norm": 3.2106237411499023, "learning_rate": 0.004255497666234791, "loss": 7.9314, "step": 418900 }, { "epoch": 1.7069139179683244, "grad_norm": 2.626401662826538, "learning_rate": 0.004255144131061982, "loss": 7.953, "step": 419000 }, { "epoch": 1.7069139179683244, "eval_MaskedAccuracy": 0.4944812553546451, "eval_loss": 1.6595406532287598, "eval_runtime": 214.6741, "eval_samples_per_second": 295.685, "eval_steps_per_second": 1.155, "step": 419000 }, { "epoch": 1.7073212959917057, "grad_norm": 3.147618055343628, "learning_rate": 0.004254790526696614, "loss": 7.9468, "step": 419100 }, { "epoch": 1.7077286740150872, "grad_norm": 4.491914749145508, "learning_rate": 0.004254436853152674, "loss": 7.9405, "step": 419200 }, { "epoch": 1.7081360520384687, "grad_norm": 7.588096618652344, "learning_rate": 0.004254083110444123, "loss": 7.9451, "step": 419300 }, { "epoch": 1.7085434300618503, "grad_norm": 2.622401714324951, "learning_rate": 0.004253729298584968, "loss": 7.9366, "step": 419400 }, { "epoch": 1.7089508080852316, "grad_norm": 3.4954352378845215, "learning_rate": 0.004253375417589195, "loss": 7.9665, "step": 419500 }, { "epoch": 1.7093581861086131, "grad_norm": 5.299036026000977, "learning_rate": 0.00425302146747079, "loss": 7.8988, "step": 419600 }, { "epoch": 1.7097655641319944, "grad_norm": 1.441435694694519, "learning_rate": 0.00425266744824375, "loss": 7.9282, "step": 419700 }, { "epoch": 1.710172942155376, "grad_norm": 6.788598537445068, "learning_rate": 0.004252313359922064, "loss": 7.9406, "step": 419800 }, { "epoch": 1.7105803201787575, "grad_norm": 2.6914849281311035, "learning_rate": 0.00425195920251974, "loss": 7.9587, "step": 419900 }, { "epoch": 1.710987698202139, "grad_norm": 1.9702292680740356, "learning_rate": 0.0042516049760507775, "loss": 7.9498, "step": 420000 }, { "epoch": 1.710987698202139, "eval_MaskedAccuracy": 0.4946215008836926, "eval_loss": 1.671088695526123, "eval_runtime": 258.7533, "eval_samples_per_second": 245.315, "eval_steps_per_second": 0.958, "step": 420000 }, { "epoch": 1.7113950762255206, "grad_norm": 7.365056037902832, "learning_rate": 0.00425125068052917, "loss": 7.9604, "step": 420100 }, { "epoch": 1.7118024542489019, "grad_norm": 8.19728946685791, "learning_rate": 0.004250896315968938, "loss": 7.9543, "step": 420200 }, { "epoch": 1.7122098322722832, "grad_norm": 2.184723138809204, "learning_rate": 0.004250541882384093, "loss": 7.9384, "step": 420300 }, { "epoch": 1.7126172102956647, "grad_norm": 2.058471202850342, "learning_rate": 0.004250187379788638, "loss": 7.941, "step": 420400 }, { "epoch": 1.7130245883190462, "grad_norm": 1.7724852561950684, "learning_rate": 0.004249832808196584, "loss": 7.9573, "step": 420500 }, { "epoch": 1.7134319663424278, "grad_norm": 10.145926475524902, "learning_rate": 0.004249478167621958, "loss": 7.9009, "step": 420600 }, { "epoch": 1.7138393443658093, "grad_norm": 6.171712398529053, "learning_rate": 0.004249123458078784, "loss": 7.9543, "step": 420700 }, { "epoch": 1.7142467223891906, "grad_norm": 4.679589748382568, "learning_rate": 0.004248768679581081, "loss": 7.937, "step": 420800 }, { "epoch": 1.714654100412572, "grad_norm": 7.17892599105835, "learning_rate": 0.004248413832142874, "loss": 7.9342, "step": 420900 }, { "epoch": 1.7150614784359535, "grad_norm": 4.674642562866211, "learning_rate": 0.004248058915778193, "loss": 7.9379, "step": 421000 }, { "epoch": 1.7150614784359535, "eval_MaskedAccuracy": 0.49462195343410076, "eval_loss": 1.6705025434494019, "eval_runtime": 249.3654, "eval_samples_per_second": 254.55, "eval_steps_per_second": 0.995, "step": 421000 }, { "epoch": 1.715468856459335, "grad_norm": 1.7300959825515747, "learning_rate": 0.004247703930501073, "loss": 7.9063, "step": 421100 }, { "epoch": 1.7158762344827165, "grad_norm": 9.26630687713623, "learning_rate": 0.004247348876325544, "loss": 7.9698, "step": 421200 }, { "epoch": 1.716283612506098, "grad_norm": 2.407864809036255, "learning_rate": 0.004246993753265649, "loss": 7.9343, "step": 421300 }, { "epoch": 1.7166909905294794, "grad_norm": 3.963239908218384, "learning_rate": 0.004246638561335424, "loss": 7.9579, "step": 421400 }, { "epoch": 1.717098368552861, "grad_norm": 3.381040096282959, "learning_rate": 0.004246283300548916, "loss": 7.9266, "step": 421500 }, { "epoch": 1.7175057465762422, "grad_norm": 4.004113674163818, "learning_rate": 0.0042459279709201525, "loss": 7.9318, "step": 421600 }, { "epoch": 1.7179131245996238, "grad_norm": 4.8017168045043945, "learning_rate": 0.004245572572463203, "loss": 7.9577, "step": 421700 }, { "epoch": 1.7183205026230053, "grad_norm": 3.4024367332458496, "learning_rate": 0.00424521710519211, "loss": 7.9593, "step": 421800 }, { "epoch": 1.7187278806463868, "grad_norm": 4.002869129180908, "learning_rate": 0.0042448615691209265, "loss": 7.9441, "step": 421900 }, { "epoch": 1.7191352586697681, "grad_norm": 2.9499707221984863, "learning_rate": 0.004244505964263711, "loss": 7.939, "step": 422000 }, { "epoch": 1.7191352586697681, "eval_MaskedAccuracy": 0.4945934567815738, "eval_loss": 1.6787841320037842, "eval_runtime": 342.0885, "eval_samples_per_second": 185.554, "eval_steps_per_second": 0.725, "step": 422000 }, { "epoch": 1.7195426366931497, "grad_norm": 2.5295894145965576, "learning_rate": 0.004244150290634521, "loss": 7.9285, "step": 422100 }, { "epoch": 1.719950014716531, "grad_norm": 2.591674327850342, "learning_rate": 0.004243794548247419, "loss": 7.9027, "step": 422200 }, { "epoch": 1.7203573927399125, "grad_norm": 3.139409303665161, "learning_rate": 0.004243438737116469, "loss": 7.9523, "step": 422300 }, { "epoch": 1.720764770763294, "grad_norm": 6.544683933258057, "learning_rate": 0.0042430828572557365, "loss": 7.9359, "step": 422400 }, { "epoch": 1.7211721487866756, "grad_norm": 2.374436140060425, "learning_rate": 0.004242726908679294, "loss": 7.9489, "step": 422500 }, { "epoch": 1.721579526810057, "grad_norm": 2.56363582611084, "learning_rate": 0.004242370891401212, "loss": 7.9068, "step": 422600 }, { "epoch": 1.7219869048334384, "grad_norm": 5.707026481628418, "learning_rate": 0.004242014805435568, "loss": 7.9544, "step": 422700 }, { "epoch": 1.7223942828568197, "grad_norm": 1.828076958656311, "learning_rate": 0.004241658650796436, "loss": 7.9206, "step": 422800 }, { "epoch": 1.7228016608802013, "grad_norm": 1.7943181991577148, "learning_rate": 0.0042413024274979, "loss": 7.9149, "step": 422900 }, { "epoch": 1.7232090389035828, "grad_norm": 5.432728290557861, "learning_rate": 0.004240946135554039, "loss": 7.9043, "step": 423000 }, { "epoch": 1.7232090389035828, "eval_MaskedAccuracy": 0.49358708868547657, "eval_loss": 1.6763889789581299, "eval_runtime": 164.9674, "eval_samples_per_second": 384.779, "eval_steps_per_second": 1.503, "step": 423000 }, { "epoch": 1.7236164169269643, "grad_norm": 2.6649765968322754, "learning_rate": 0.004240589774978941, "loss": 7.9519, "step": 423100 }, { "epoch": 1.7240237949503459, "grad_norm": 4.455107688903809, "learning_rate": 0.0042402333457867035, "loss": 7.9516, "step": 423200 }, { "epoch": 1.7244311729737272, "grad_norm": 5.428768634796143, "learning_rate": 0.004239876847991411, "loss": 7.9421, "step": 423300 }, { "epoch": 1.7248385509971085, "grad_norm": 5.04478120803833, "learning_rate": 0.00423952028160716, "loss": 7.9131, "step": 423400 }, { "epoch": 1.72524592902049, "grad_norm": 4.8387932777404785, "learning_rate": 0.004239163646648039, "loss": 7.9496, "step": 423500 }, { "epoch": 1.7256533070438715, "grad_norm": 4.750764846801758, "learning_rate": 0.004238806943128157, "loss": 7.9309, "step": 423600 }, { "epoch": 1.726060685067253, "grad_norm": 3.087653160095215, "learning_rate": 0.004238450171061615, "loss": 7.9352, "step": 423700 }, { "epoch": 1.7264680630906346, "grad_norm": 6.936778545379639, "learning_rate": 0.004238093330462516, "loss": 7.9261, "step": 423800 }, { "epoch": 1.726875441114016, "grad_norm": 4.899052619934082, "learning_rate": 0.004237736421344969, "loss": 7.9258, "step": 423900 }, { "epoch": 1.7272828191373975, "grad_norm": 7.004554748535156, "learning_rate": 0.004237379443723085, "loss": 7.9139, "step": 424000 }, { "epoch": 1.7272828191373975, "eval_MaskedAccuracy": 0.49389334649659283, "eval_loss": 1.671739101409912, "eval_runtime": 160.3116, "eval_samples_per_second": 395.954, "eval_steps_per_second": 1.547, "step": 424000 }, { "epoch": 1.7276901971607788, "grad_norm": 1.0757733583450317, "learning_rate": 0.004237022397610971, "loss": 7.9485, "step": 424100 }, { "epoch": 1.7280975751841603, "grad_norm": 3.9845657348632812, "learning_rate": 0.004236665283022757, "loss": 7.9496, "step": 424200 }, { "epoch": 1.7285049532075418, "grad_norm": 2.754138708114624, "learning_rate": 0.00423630809997255, "loss": 7.932, "step": 424300 }, { "epoch": 1.7289123312309234, "grad_norm": 3.6710002422332764, "learning_rate": 0.00423595084847447, "loss": 7.9567, "step": 424400 }, { "epoch": 1.7293197092543047, "grad_norm": 3.097033739089966, "learning_rate": 0.004235593528542646, "loss": 7.9502, "step": 424500 }, { "epoch": 1.7297270872776862, "grad_norm": 2.262523651123047, "learning_rate": 0.004235236140191199, "loss": 7.9608, "step": 424600 }, { "epoch": 1.7301344653010675, "grad_norm": 3.1382665634155273, "learning_rate": 0.004234878683434268, "loss": 7.922, "step": 424700 }, { "epoch": 1.730541843324449, "grad_norm": 1.1934833526611328, "learning_rate": 0.004234521158285986, "loss": 7.9379, "step": 424800 }, { "epoch": 1.7309492213478306, "grad_norm": 4.094176292419434, "learning_rate": 0.00423416356476048, "loss": 7.9662, "step": 424900 }, { "epoch": 1.7313565993712121, "grad_norm": 4.058340072631836, "learning_rate": 0.004233805902871889, "loss": 7.9328, "step": 425000 }, { "epoch": 1.7313565993712121, "eval_MaskedAccuracy": 0.493221795250412, "eval_loss": 1.6672805547714233, "eval_runtime": 167.1428, "eval_samples_per_second": 379.771, "eval_steps_per_second": 1.484, "step": 425000 }, { "epoch": 1.7317639773945936, "grad_norm": 4.574451446533203, "learning_rate": 0.004233448172634353, "loss": 7.9546, "step": 425100 }, { "epoch": 1.732171355417975, "grad_norm": 6.09646463394165, "learning_rate": 0.004233090374062018, "loss": 7.9528, "step": 425200 }, { "epoch": 1.7325787334413563, "grad_norm": 1.7354048490524292, "learning_rate": 0.004232732507169023, "loss": 7.9722, "step": 425300 }, { "epoch": 1.7329861114647378, "grad_norm": 3.8242075443267822, "learning_rate": 0.004232374571969524, "loss": 7.925, "step": 425400 }, { "epoch": 1.7333934894881193, "grad_norm": 2.1813385486602783, "learning_rate": 0.004232016568477671, "loss": 7.9431, "step": 425500 }, { "epoch": 1.7338008675115009, "grad_norm": 2.1258385181427, "learning_rate": 0.004231658496707613, "loss": 7.9297, "step": 425600 }, { "epoch": 1.7342082455348824, "grad_norm": 1.3778568506240845, "learning_rate": 0.004231300356673509, "loss": 7.9432, "step": 425700 }, { "epoch": 1.7346156235582637, "grad_norm": 2.5177571773529053, "learning_rate": 0.004230942148389524, "loss": 7.9426, "step": 425800 }, { "epoch": 1.735023001581645, "grad_norm": 1.6045327186584473, "learning_rate": 0.004230583871869813, "loss": 7.9286, "step": 425900 }, { "epoch": 1.7354303796050266, "grad_norm": 4.004244327545166, "learning_rate": 0.004230225527128542, "loss": 7.9145, "step": 426000 }, { "epoch": 1.7354303796050266, "eval_MaskedAccuracy": 0.49541676722671146, "eval_loss": 1.6673568487167358, "eval_runtime": 279.7464, "eval_samples_per_second": 226.905, "eval_steps_per_second": 0.887, "step": 426000 }, { "epoch": 1.735837757628408, "grad_norm": 1.5896828174591064, "learning_rate": 0.004229867114179879, "loss": 7.9383, "step": 426100 }, { "epoch": 1.7362451356517896, "grad_norm": 4.541566848754883, "learning_rate": 0.004229508633037998, "loss": 7.9762, "step": 426200 }, { "epoch": 1.7366525136751711, "grad_norm": 3.481902837753296, "learning_rate": 0.0042291500837170655, "loss": 7.9363, "step": 426300 }, { "epoch": 1.7370598916985525, "grad_norm": 1.3038591146469116, "learning_rate": 0.00422879146623126, "loss": 7.9506, "step": 426400 }, { "epoch": 1.737467269721934, "grad_norm": 2.6721527576446533, "learning_rate": 0.004228432780594757, "loss": 7.8815, "step": 426500 }, { "epoch": 1.7378746477453153, "grad_norm": 2.730501413345337, "learning_rate": 0.004228074026821736, "loss": 7.9677, "step": 426600 }, { "epoch": 1.7382820257686968, "grad_norm": 4.208138942718506, "learning_rate": 0.004227715204926386, "loss": 7.9414, "step": 426700 }, { "epoch": 1.7386894037920784, "grad_norm": 1.6396872997283936, "learning_rate": 0.004227356314922886, "loss": 7.9424, "step": 426800 }, { "epoch": 1.73909678181546, "grad_norm": 1.0482051372528076, "learning_rate": 0.004226997356825428, "loss": 7.9356, "step": 426900 }, { "epoch": 1.7395041598388412, "grad_norm": 3.294713258743286, "learning_rate": 0.004226638330648208, "loss": 7.9513, "step": 427000 }, { "epoch": 1.7395041598388412, "eval_MaskedAccuracy": 0.49443078227574316, "eval_loss": 1.6736928224563599, "eval_runtime": 150.1969, "eval_samples_per_second": 422.619, "eval_steps_per_second": 1.651, "step": 427000 }, { "epoch": 1.7399115378622227, "grad_norm": 2.7487411499023438, "learning_rate": 0.004226279236405414, "loss": 7.9685, "step": 427100 }, { "epoch": 1.740318915885604, "grad_norm": 4.569572448730469, "learning_rate": 0.004225920074111246, "loss": 7.9265, "step": 427200 }, { "epoch": 1.7407262939089856, "grad_norm": 2.7802586555480957, "learning_rate": 0.004225560843779905, "loss": 7.9348, "step": 427300 }, { "epoch": 1.7411336719323671, "grad_norm": 2.1910150051116943, "learning_rate": 0.0042252015454255915, "loss": 7.9242, "step": 427400 }, { "epoch": 1.7415410499557487, "grad_norm": 4.726769924163818, "learning_rate": 0.004224842179062509, "loss": 7.9273, "step": 427500 }, { "epoch": 1.7419484279791302, "grad_norm": 2.721534013748169, "learning_rate": 0.004224482744704869, "loss": 7.9336, "step": 427600 }, { "epoch": 1.7423558060025115, "grad_norm": 1.8914376497268677, "learning_rate": 0.004224123242366877, "loss": 7.9219, "step": 427700 }, { "epoch": 1.7427631840258928, "grad_norm": 1.1209359169006348, "learning_rate": 0.00422376367206275, "loss": 7.9069, "step": 427800 }, { "epoch": 1.7431705620492743, "grad_norm": 1.9058761596679688, "learning_rate": 0.004223404033806703, "loss": 7.9621, "step": 427900 }, { "epoch": 1.7435779400726559, "grad_norm": 1.6395440101623535, "learning_rate": 0.0042230443276129615, "loss": 7.9118, "step": 428000 }, { "epoch": 1.7435779400726559, "eval_MaskedAccuracy": 0.4951851500268164, "eval_loss": 1.6693843603134155, "eval_runtime": 235.7121, "eval_samples_per_second": 269.295, "eval_steps_per_second": 1.052, "step": 428000 }, { "epoch": 1.7439853180960374, "grad_norm": 5.839138984680176, "learning_rate": 0.004222684553495732, "loss": 7.8909, "step": 428100 }, { "epoch": 1.744392696119419, "grad_norm": 2.5658891201019287, "learning_rate": 0.004222324711469246, "loss": 7.9404, "step": 428200 }, { "epoch": 1.7448000741428003, "grad_norm": 1.5509271621704102, "learning_rate": 0.004221964801547729, "loss": 7.9292, "step": 428300 }, { "epoch": 1.7452074521661816, "grad_norm": 5.172556400299072, "learning_rate": 0.004221604823745414, "loss": 7.8855, "step": 428400 }, { "epoch": 1.745614830189563, "grad_norm": 6.072554111480713, "learning_rate": 0.004221244778076527, "loss": 7.9796, "step": 428500 }, { "epoch": 1.7460222082129446, "grad_norm": 5.145614147186279, "learning_rate": 0.004220884664555305, "loss": 7.8932, "step": 428600 }, { "epoch": 1.7464295862363262, "grad_norm": 3.9118359088897705, "learning_rate": 0.004220524483195977, "loss": 7.9148, "step": 428700 }, { "epoch": 1.7468369642597077, "grad_norm": 7.136172771453857, "learning_rate": 0.0042201642340128015, "loss": 7.9739, "step": 428800 }, { "epoch": 1.747244342283089, "grad_norm": 1.8290817737579346, "learning_rate": 0.004219803917020007, "loss": 7.9246, "step": 428900 }, { "epoch": 1.7476517203064705, "grad_norm": 4.136419296264648, "learning_rate": 0.0042194435322318435, "loss": 7.8832, "step": 429000 }, { "epoch": 1.7476517203064705, "eval_MaskedAccuracy": 0.494982637445449, "eval_loss": 1.6747286319732666, "eval_runtime": 256.3837, "eval_samples_per_second": 247.582, "eval_steps_per_second": 0.967, "step": 429000 }, { "epoch": 1.7480590983298518, "grad_norm": 4.4007062911987305, "learning_rate": 0.0042190830796625545, "loss": 7.905, "step": 429100 }, { "epoch": 1.7484664763532334, "grad_norm": 4.226682662963867, "learning_rate": 0.004218722559326396, "loss": 7.9215, "step": 429200 }, { "epoch": 1.748873854376615, "grad_norm": 1.05141282081604, "learning_rate": 0.004218361971237617, "loss": 7.9446, "step": 429300 }, { "epoch": 1.7492812323999964, "grad_norm": 3.943563461303711, "learning_rate": 0.004218001315410477, "loss": 7.9563, "step": 429400 }, { "epoch": 1.7496886104233778, "grad_norm": 1.5362974405288696, "learning_rate": 0.004217640591859233, "loss": 7.9339, "step": 429500 }, { "epoch": 1.7500959884467593, "grad_norm": 1.417009949684143, "learning_rate": 0.004217279800598145, "loss": 7.9275, "step": 429600 }, { "epoch": 1.7505033664701406, "grad_norm": 2.833244562149048, "learning_rate": 0.0042169189416414775, "loss": 7.9469, "step": 429700 }, { "epoch": 1.7509107444935221, "grad_norm": 3.411043882369995, "learning_rate": 0.0042165580150035015, "loss": 7.9252, "step": 429800 }, { "epoch": 1.7513181225169037, "grad_norm": 2.143798828125, "learning_rate": 0.004216197020698485, "loss": 7.9404, "step": 429900 }, { "epoch": 1.7517255005402852, "grad_norm": 2.492276430130005, "learning_rate": 0.004215835958740691, "loss": 7.9048, "step": 430000 }, { "epoch": 1.7517255005402852, "eval_MaskedAccuracy": 0.49476444358347443, "eval_loss": 1.6649599075317383, "eval_runtime": 168.453, "eval_samples_per_second": 376.817, "eval_steps_per_second": 1.472, "step": 430000 }, { "epoch": 1.7521328785636667, "grad_norm": 1.5841550827026367, "learning_rate": 0.004215474829144404, "loss": 7.9557, "step": 430100 }, { "epoch": 1.752540256587048, "grad_norm": 1.9986809492111206, "learning_rate": 0.0042151136319239, "loss": 7.9075, "step": 430200 }, { "epoch": 1.7529476346104294, "grad_norm": 5.14298677444458, "learning_rate": 0.004214752367093458, "loss": 7.9479, "step": 430300 }, { "epoch": 1.7533550126338109, "grad_norm": 4.060118198394775, "learning_rate": 0.004214391034667361, "loss": 7.9276, "step": 430400 }, { "epoch": 1.7537623906571924, "grad_norm": 6.250939846038818, "learning_rate": 0.004214029634659891, "loss": 7.9274, "step": 430500 }, { "epoch": 1.754169768680574, "grad_norm": 1.6329816579818726, "learning_rate": 0.0042136681670853484, "loss": 7.9203, "step": 430600 }, { "epoch": 1.7545771467039555, "grad_norm": 2.1676533222198486, "learning_rate": 0.0042133066319580085, "loss": 7.9316, "step": 430700 }, { "epoch": 1.7549845247273368, "grad_norm": 3.689631700515747, "learning_rate": 0.00421294502929217, "loss": 7.9029, "step": 430800 }, { "epoch": 1.755391902750718, "grad_norm": 4.070959091186523, "learning_rate": 0.00421258335910213, "loss": 7.9356, "step": 430900 }, { "epoch": 1.7557992807740996, "grad_norm": 2.2822155952453613, "learning_rate": 0.004212221621402184, "loss": 7.935, "step": 431000 }, { "epoch": 1.7557992807740996, "eval_MaskedAccuracy": 0.49453676369755617, "eval_loss": 1.674303650856018, "eval_runtime": 160.2692, "eval_samples_per_second": 396.059, "eval_steps_per_second": 1.547, "step": 431000 }, { "epoch": 1.7562066587974812, "grad_norm": 2.610569953918457, "learning_rate": 0.004211859816206645, "loss": 7.93, "step": 431100 }, { "epoch": 1.7566140368208627, "grad_norm": 2.744990825653076, "learning_rate": 0.004211497943529805, "loss": 7.9149, "step": 431200 }, { "epoch": 1.7570214148442442, "grad_norm": 3.835881233215332, "learning_rate": 0.004211136003385974, "loss": 7.9793, "step": 431300 }, { "epoch": 1.7574287928676255, "grad_norm": 1.9926955699920654, "learning_rate": 0.004210773995789458, "loss": 7.9155, "step": 431400 }, { "epoch": 1.757836170891007, "grad_norm": 4.1470112800598145, "learning_rate": 0.00421041192075458, "loss": 7.9223, "step": 431500 }, { "epoch": 1.7582435489143884, "grad_norm": 4.6428937911987305, "learning_rate": 0.004210049778295644, "loss": 7.9092, "step": 431600 }, { "epoch": 1.75865092693777, "grad_norm": 1.5294262170791626, "learning_rate": 0.004209687568426969, "loss": 7.9197, "step": 431700 }, { "epoch": 1.7590583049611515, "grad_norm": 2.488340377807617, "learning_rate": 0.004209325291162881, "loss": 7.9236, "step": 431800 }, { "epoch": 1.759465682984533, "grad_norm": 1.558867335319519, "learning_rate": 0.004208962946517693, "loss": 7.9305, "step": 431900 }, { "epoch": 1.7598730610079143, "grad_norm": 1.2909411191940308, "learning_rate": 0.004208600534505739, "loss": 7.91, "step": 432000 }, { "epoch": 1.7598730610079143, "eval_MaskedAccuracy": 0.49530208904980155, "eval_loss": 1.6677933931350708, "eval_runtime": 226.9379, "eval_samples_per_second": 279.706, "eval_steps_per_second": 1.093, "step": 432000 }, { "epoch": 1.7602804390312958, "grad_norm": 4.529399394989014, "learning_rate": 0.004208238055141342, "loss": 7.9588, "step": 432100 }, { "epoch": 1.7606878170546771, "grad_norm": 6.088457107543945, "learning_rate": 0.004207875508438836, "loss": 7.9371, "step": 432200 }, { "epoch": 1.7610951950780587, "grad_norm": 4.843735694885254, "learning_rate": 0.004207512894412555, "loss": 7.8799, "step": 432300 }, { "epoch": 1.7615025731014402, "grad_norm": 1.129140019416809, "learning_rate": 0.004207150213076832, "loss": 7.9252, "step": 432400 }, { "epoch": 1.7619099511248217, "grad_norm": 2.9595580101013184, "learning_rate": 0.0042067874644460095, "loss": 7.9051, "step": 432500 }, { "epoch": 1.7623173291482033, "grad_norm": 12.065353393554688, "learning_rate": 0.004206424648534424, "loss": 7.9098, "step": 432600 }, { "epoch": 1.7627247071715846, "grad_norm": 3.2894842624664307, "learning_rate": 0.004206061765356425, "loss": 7.9287, "step": 432700 }, { "epoch": 1.763132085194966, "grad_norm": 2.2475671768188477, "learning_rate": 0.00420569881492636, "loss": 7.9415, "step": 432800 }, { "epoch": 1.7635394632183474, "grad_norm": 4.475327014923096, "learning_rate": 0.004205335797258577, "loss": 7.9633, "step": 432900 }, { "epoch": 1.763946841241729, "grad_norm": 1.3904350996017456, "learning_rate": 0.004204972712367418, "loss": 7.9322, "step": 433000 }, { "epoch": 1.763946841241729, "eval_MaskedAccuracy": 0.4948103956917094, "eval_loss": 1.6721564531326294, "eval_runtime": 202.4683, "eval_samples_per_second": 313.511, "eval_steps_per_second": 1.225, "step": 433000 }, { "epoch": 1.7643542192651105, "grad_norm": 4.464630126953125, "learning_rate": 0.004204609560267249, "loss": 7.9406, "step": 433100 }, { "epoch": 1.764761597288492, "grad_norm": 4.836032867431641, "learning_rate": 0.004204246340972421, "loss": 7.9282, "step": 433200 }, { "epoch": 1.7651689753118733, "grad_norm": 3.3716859817504883, "learning_rate": 0.004203883054497301, "loss": 7.8613, "step": 433300 }, { "epoch": 1.7655763533352546, "grad_norm": 3.8334603309631348, "learning_rate": 0.004203519700856246, "loss": 7.8941, "step": 433400 }, { "epoch": 1.7659837313586362, "grad_norm": 3.0918352603912354, "learning_rate": 0.0042031562800636186, "loss": 7.9405, "step": 433500 }, { "epoch": 1.7663911093820177, "grad_norm": 4.291378498077393, "learning_rate": 0.004202792792133799, "loss": 7.9402, "step": 433600 }, { "epoch": 1.7667984874053992, "grad_norm": 7.552089214324951, "learning_rate": 0.004202429237081146, "loss": 7.9158, "step": 433700 }, { "epoch": 1.7672058654287808, "grad_norm": 3.02734375, "learning_rate": 0.004202065614920036, "loss": 7.955, "step": 433800 }, { "epoch": 1.767613243452162, "grad_norm": 4.439110279083252, "learning_rate": 0.0042017019256648525, "loss": 7.9121, "step": 433900 }, { "epoch": 1.7680206214755436, "grad_norm": 3.56619930267334, "learning_rate": 0.0042013381693299596, "loss": 7.9461, "step": 434000 }, { "epoch": 1.7680206214755436, "eval_MaskedAccuracy": 0.49425697454076883, "eval_loss": 1.674866795539856, "eval_runtime": 168.5457, "eval_samples_per_second": 376.61, "eval_steps_per_second": 1.471, "step": 434000 }, { "epoch": 1.768427999498925, "grad_norm": 6.718011856079102, "learning_rate": 0.004200974345929748, "loss": 7.9562, "step": 434100 }, { "epoch": 1.7688353775223065, "grad_norm": 3.7811686992645264, "learning_rate": 0.004200610455478597, "loss": 7.9433, "step": 434200 }, { "epoch": 1.769242755545688, "grad_norm": 3.410982847213745, "learning_rate": 0.004200246497990897, "loss": 7.9246, "step": 434300 }, { "epoch": 1.7696501335690695, "grad_norm": 2.2441630363464355, "learning_rate": 0.004199882473481041, "loss": 7.9174, "step": 434400 }, { "epoch": 1.7700575115924508, "grad_norm": 5.297140121459961, "learning_rate": 0.004199518381963409, "loss": 7.9172, "step": 434500 }, { "epoch": 1.7704648896158324, "grad_norm": 1.750809907913208, "learning_rate": 0.004199154223452406, "loss": 7.9102, "step": 434600 }, { "epoch": 1.7708722676392137, "grad_norm": 5.267836093902588, "learning_rate": 0.004198789997962424, "loss": 7.9104, "step": 434700 }, { "epoch": 1.7712796456625952, "grad_norm": 3.8845372200012207, "learning_rate": 0.004198425705507866, "loss": 7.9406, "step": 434800 }, { "epoch": 1.7716870236859767, "grad_norm": 5.069277763366699, "learning_rate": 0.004198061346103134, "loss": 7.9008, "step": 434900 }, { "epoch": 1.7720944017093583, "grad_norm": 1.8418042659759521, "learning_rate": 0.004197696919762629, "loss": 7.9198, "step": 435000 }, { "epoch": 1.7720944017093583, "eval_MaskedAccuracy": 0.49537977874568173, "eval_loss": 1.6728644371032715, "eval_runtime": 194.3036, "eval_samples_per_second": 326.685, "eval_steps_per_second": 1.276, "step": 435000 }, { "epoch": 1.7725017797327398, "grad_norm": 2.8373005390167236, "learning_rate": 0.00419733242650076, "loss": 7.9016, "step": 435100 }, { "epoch": 1.7729091577561211, "grad_norm": 3.8157289028167725, "learning_rate": 0.004196967866331938, "loss": 7.8901, "step": 435200 }, { "epoch": 1.7733165357795024, "grad_norm": 3.1491050720214844, "learning_rate": 0.004196603239270578, "loss": 7.925, "step": 435300 }, { "epoch": 1.773723913802884, "grad_norm": 3.543290376663208, "learning_rate": 0.004196238545331096, "loss": 7.9499, "step": 435400 }, { "epoch": 1.7741312918262655, "grad_norm": 1.5468164682388306, "learning_rate": 0.00419587378452791, "loss": 7.9073, "step": 435500 }, { "epoch": 1.774538669849647, "grad_norm": 1.449621319770813, "learning_rate": 0.00419550895687544, "loss": 7.9334, "step": 435600 }, { "epoch": 1.7749460478730286, "grad_norm": 2.957839012145996, "learning_rate": 0.004195144062388103, "loss": 7.9364, "step": 435700 }, { "epoch": 1.7753534258964099, "grad_norm": 1.856015682220459, "learning_rate": 0.00419477910108034, "loss": 7.9517, "step": 435800 }, { "epoch": 1.7757608039197912, "grad_norm": 2.15834641456604, "learning_rate": 0.004194414072966572, "loss": 7.9036, "step": 435900 }, { "epoch": 1.7761681819431727, "grad_norm": 5.483026504516602, "learning_rate": 0.004194048978061223, "loss": 7.8789, "step": 436000 }, { "epoch": 1.7761681819431727, "eval_MaskedAccuracy": 0.4955424357984903, "eval_loss": 1.6680715084075928, "eval_runtime": 333.1299, "eval_samples_per_second": 190.544, "eval_steps_per_second": 0.744, "step": 436000 }, { "epoch": 1.7765755599665543, "grad_norm": 3.4480948448181152, "learning_rate": 0.004193683816378737, "loss": 7.9173, "step": 436100 }, { "epoch": 1.7769829379899358, "grad_norm": 1.7107717990875244, "learning_rate": 0.004193318587933548, "loss": 7.9448, "step": 436200 }, { "epoch": 1.7773903160133173, "grad_norm": 6.747818946838379, "learning_rate": 0.004192953292740097, "loss": 7.9291, "step": 436300 }, { "epoch": 1.7777976940366986, "grad_norm": 3.750035047531128, "learning_rate": 0.004192587930812824, "loss": 7.9411, "step": 436400 }, { "epoch": 1.7782050720600802, "grad_norm": 1.7927358150482178, "learning_rate": 0.004192222502166175, "loss": 7.9195, "step": 436500 }, { "epoch": 1.7786124500834615, "grad_norm": 1.3702152967453003, "learning_rate": 0.004191857006814593, "loss": 7.938, "step": 436600 }, { "epoch": 1.779019828106843, "grad_norm": 2.765705108642578, "learning_rate": 0.0041914914447725386, "loss": 7.9257, "step": 436700 }, { "epoch": 1.7794272061302245, "grad_norm": 2.7728800773620605, "learning_rate": 0.004191125816054456, "loss": 7.9118, "step": 436800 }, { "epoch": 1.779834584153606, "grad_norm": 4.654808044433594, "learning_rate": 0.004190760120674799, "loss": 7.9745, "step": 436900 }, { "epoch": 1.7802419621769874, "grad_norm": 5.276312351226807, "learning_rate": 0.004190394358648029, "loss": 7.9196, "step": 437000 }, { "epoch": 1.7802419621769874, "eval_MaskedAccuracy": 0.49464101729552723, "eval_loss": 1.6730901002883911, "eval_runtime": 180.536, "eval_samples_per_second": 351.597, "eval_steps_per_second": 1.374, "step": 437000 }, { "epoch": 1.780649340200369, "grad_norm": 3.10459566116333, "learning_rate": 0.004190028529988608, "loss": 7.9061, "step": 437100 }, { "epoch": 1.7810567182237502, "grad_norm": 2.878385066986084, "learning_rate": 0.004189662634710997, "loss": 7.9194, "step": 437200 }, { "epoch": 1.7814640962471318, "grad_norm": 4.35692024230957, "learning_rate": 0.004189296672829658, "loss": 7.9196, "step": 437300 }, { "epoch": 1.7818714742705133, "grad_norm": 2.9915459156036377, "learning_rate": 0.0041889306443590665, "loss": 7.9393, "step": 437400 }, { "epoch": 1.7822788522938948, "grad_norm": 6.092617034912109, "learning_rate": 0.0041885645493136905, "loss": 7.9078, "step": 437500 }, { "epoch": 1.7826862303172764, "grad_norm": 5.489753246307373, "learning_rate": 0.004188198387708, "loss": 7.9125, "step": 437600 }, { "epoch": 1.7830936083406577, "grad_norm": 4.809166431427002, "learning_rate": 0.004187832159556479, "loss": 7.9268, "step": 437700 }, { "epoch": 1.783500986364039, "grad_norm": 2.3220558166503906, "learning_rate": 0.004187465864873599, "loss": 7.9213, "step": 437800 }, { "epoch": 1.7839083643874205, "grad_norm": 4.348724365234375, "learning_rate": 0.004187099503673847, "loss": 7.9026, "step": 437900 }, { "epoch": 1.784315742410802, "grad_norm": 1.4235539436340332, "learning_rate": 0.004186733075971701, "loss": 7.911, "step": 438000 }, { "epoch": 1.784315742410802, "eval_MaskedAccuracy": 0.4959748050379288, "eval_loss": 1.666916012763977, "eval_runtime": 389.5847, "eval_samples_per_second": 162.932, "eval_steps_per_second": 0.637, "step": 438000 }, { "epoch": 1.7847231204341836, "grad_norm": 4.689061641693115, "learning_rate": 0.004186366581781657, "loss": 7.9296, "step": 438100 }, { "epoch": 1.785130498457565, "grad_norm": 1.690250039100647, "learning_rate": 0.004186000021118195, "loss": 7.9129, "step": 438200 }, { "epoch": 1.7855378764809464, "grad_norm": 2.6211435794830322, "learning_rate": 0.004185633393995812, "loss": 7.9078, "step": 438300 }, { "epoch": 1.7859452545043277, "grad_norm": 1.5534968376159668, "learning_rate": 0.004185266700429004, "loss": 7.8942, "step": 438400 }, { "epoch": 1.7863526325277093, "grad_norm": 3.143200397491455, "learning_rate": 0.004184899940432262, "loss": 7.9078, "step": 438500 }, { "epoch": 1.7867600105510908, "grad_norm": 2.1769890785217285, "learning_rate": 0.004184533114020093, "loss": 7.9193, "step": 438600 }, { "epoch": 1.7871673885744723, "grad_norm": 7.959547519683838, "learning_rate": 0.004184166221207, "loss": 7.9189, "step": 438700 }, { "epoch": 1.7875747665978539, "grad_norm": 2.8897268772125244, "learning_rate": 0.004183799262007484, "loss": 7.9022, "step": 438800 }, { "epoch": 1.7879821446212352, "grad_norm": 3.0990419387817383, "learning_rate": 0.004183432236436056, "loss": 7.8757, "step": 438900 }, { "epoch": 1.7883895226446167, "grad_norm": 3.6900036334991455, "learning_rate": 0.0041830651445072245, "loss": 7.9574, "step": 439000 }, { "epoch": 1.7883895226446167, "eval_MaskedAccuracy": 0.49563555786652236, "eval_loss": 1.673840880393982, "eval_runtime": 176.5906, "eval_samples_per_second": 359.453, "eval_steps_per_second": 1.404, "step": 439000 }, { "epoch": 1.788796900667998, "grad_norm": 2.1039018630981445, "learning_rate": 0.004182697986235501, "loss": 7.9234, "step": 439100 }, { "epoch": 1.7892042786913795, "grad_norm": 4.542923927307129, "learning_rate": 0.004182330761635402, "loss": 7.9206, "step": 439200 }, { "epoch": 1.789611656714761, "grad_norm": 2.3067336082458496, "learning_rate": 0.00418196347072145, "loss": 7.9344, "step": 439300 }, { "epoch": 1.7900190347381426, "grad_norm": 1.9165045022964478, "learning_rate": 0.004181596113508158, "loss": 7.9013, "step": 439400 }, { "epoch": 1.790426412761524, "grad_norm": 10.063130378723145, "learning_rate": 0.004181228690010059, "loss": 7.8963, "step": 439500 }, { "epoch": 1.7908337907849055, "grad_norm": 7.03554105758667, "learning_rate": 0.004180861200241673, "loss": 7.9011, "step": 439600 }, { "epoch": 1.7912411688082868, "grad_norm": 2.953171730041504, "learning_rate": 0.0041804936442175365, "loss": 7.9171, "step": 439700 }, { "epoch": 1.7916485468316683, "grad_norm": 1.5528123378753662, "learning_rate": 0.0041801260219521685, "loss": 7.9011, "step": 439800 }, { "epoch": 1.7920559248550498, "grad_norm": 3.7841126918792725, "learning_rate": 0.004179758333460114, "loss": 7.9069, "step": 439900 }, { "epoch": 1.7924633028784314, "grad_norm": 3.153717279434204, "learning_rate": 0.004179390578755899, "loss": 7.9109, "step": 440000 }, { "epoch": 1.7924633028784314, "eval_MaskedAccuracy": 0.4956089167070561, "eval_loss": 1.660947561264038, "eval_runtime": 168.1323, "eval_samples_per_second": 377.536, "eval_steps_per_second": 1.475, "step": 440000 }, { "epoch": 1.792870680901813, "grad_norm": 8.45737361907959, "learning_rate": 0.004179022757854072, "loss": 7.8934, "step": 440100 }, { "epoch": 1.7932780589251942, "grad_norm": 2.9157402515411377, "learning_rate": 0.004178654870769175, "loss": 7.9247, "step": 440200 }, { "epoch": 1.7936854369485755, "grad_norm": 4.986844062805176, "learning_rate": 0.0041782869175157515, "loss": 7.9019, "step": 440300 }, { "epoch": 1.794092814971957, "grad_norm": 3.2577908039093018, "learning_rate": 0.004177918898108344, "loss": 7.898, "step": 440400 }, { "epoch": 1.7945001929953386, "grad_norm": 5.337949752807617, "learning_rate": 0.0041775508125615046, "loss": 7.941, "step": 440500 }, { "epoch": 1.7949075710187201, "grad_norm": 4.633530139923096, "learning_rate": 0.004177182660889788, "loss": 7.9278, "step": 440600 }, { "epoch": 1.7953149490421016, "grad_norm": 5.676455974578857, "learning_rate": 0.004176814443107745, "loss": 7.9107, "step": 440700 }, { "epoch": 1.795722327065483, "grad_norm": 3.3541319370269775, "learning_rate": 0.004176446159229939, "loss": 7.9266, "step": 440800 }, { "epoch": 1.7961297050888643, "grad_norm": 8.209612846374512, "learning_rate": 0.004176077809270926, "loss": 7.9407, "step": 440900 }, { "epoch": 1.7965370831122458, "grad_norm": 2.504899263381958, "learning_rate": 0.00417570939324527, "loss": 7.9232, "step": 441000 }, { "epoch": 1.7965370831122458, "eval_MaskedAccuracy": 0.49530813754566594, "eval_loss": 1.6687226295471191, "eval_runtime": 227.0884, "eval_samples_per_second": 279.521, "eval_steps_per_second": 1.092, "step": 441000 }, { "epoch": 1.7969444611356273, "grad_norm": 3.49796199798584, "learning_rate": 0.0041753409111675395, "loss": 7.8928, "step": 441100 }, { "epoch": 1.7973518391590089, "grad_norm": 1.6105597019195557, "learning_rate": 0.004174972363052291, "loss": 7.8804, "step": 441200 }, { "epoch": 1.7977592171823904, "grad_norm": 1.1660513877868652, "learning_rate": 0.0041746037489141, "loss": 7.9187, "step": 441300 }, { "epoch": 1.7981665952057717, "grad_norm": 3.5011515617370605, "learning_rate": 0.00417423506876755, "loss": 7.9022, "step": 441400 }, { "epoch": 1.7985739732291532, "grad_norm": 4.8468403816223145, "learning_rate": 0.004173866322627205, "loss": 7.9085, "step": 441500 }, { "epoch": 1.7989813512525346, "grad_norm": 4.980172634124756, "learning_rate": 0.004173497510507649, "loss": 7.9135, "step": 441600 }, { "epoch": 1.799388729275916, "grad_norm": 1.7658847570419312, "learning_rate": 0.004173128632423463, "loss": 7.8693, "step": 441700 }, { "epoch": 1.7997961072992976, "grad_norm": 3.6838791370391846, "learning_rate": 0.004172759688389234, "loss": 7.916, "step": 441800 }, { "epoch": 1.8002034853226792, "grad_norm": 2.0375492572784424, "learning_rate": 0.0041723906784195425, "loss": 7.921, "step": 441900 }, { "epoch": 1.8006108633460605, "grad_norm": 6.014378547668457, "learning_rate": 0.0041720216025289796, "loss": 7.8806, "step": 442000 }, { "epoch": 1.8006108633460605, "eval_MaskedAccuracy": 0.49440137812230833, "eval_loss": 1.6746090650558472, "eval_runtime": 176.9757, "eval_samples_per_second": 358.671, "eval_steps_per_second": 1.401, "step": 442000 }, { "epoch": 1.801018241369442, "grad_norm": 2.6855621337890625, "learning_rate": 0.004171652460732139, "loss": 7.932, "step": 442100 }, { "epoch": 1.8014256193928233, "grad_norm": 3.248476028442383, "learning_rate": 0.004171283253043618, "loss": 7.8913, "step": 442200 }, { "epoch": 1.8018329974162048, "grad_norm": 2.8973610401153564, "learning_rate": 0.004170913979478002, "loss": 7.9527, "step": 442300 }, { "epoch": 1.8022403754395864, "grad_norm": 2.320071220397949, "learning_rate": 0.0041705446400498985, "loss": 7.9403, "step": 442400 }, { "epoch": 1.802647753462968, "grad_norm": 5.010457992553711, "learning_rate": 0.0041701752347739094, "loss": 7.914, "step": 442500 }, { "epoch": 1.8030551314863492, "grad_norm": 3.502883195877075, "learning_rate": 0.004169805763664631, "loss": 7.9174, "step": 442600 }, { "epoch": 1.8034625095097307, "grad_norm": 1.849295973777771, "learning_rate": 0.004169436226736683, "loss": 7.9389, "step": 442700 }, { "epoch": 1.803869887533112, "grad_norm": 2.2292819023132324, "learning_rate": 0.004169066624004665, "loss": 7.8988, "step": 442800 }, { "epoch": 1.8042772655564936, "grad_norm": 1.607640027999878, "learning_rate": 0.004168696955483185, "loss": 7.8778, "step": 442900 }, { "epoch": 1.8046846435798751, "grad_norm": 3.704303741455078, "learning_rate": 0.004168327221186878, "loss": 7.9121, "step": 443000 }, { "epoch": 1.8046846435798751, "eval_MaskedAccuracy": 0.4949538749421863, "eval_loss": 1.6765481233596802, "eval_runtime": 174.5788, "eval_samples_per_second": 363.595, "eval_steps_per_second": 1.421, "step": 443000 }, { "epoch": 1.8050920216032567, "grad_norm": 3.1414365768432617, "learning_rate": 0.0041679574211303475, "loss": 7.9105, "step": 443100 }, { "epoch": 1.8054993996266382, "grad_norm": 1.1742734909057617, "learning_rate": 0.004167587555328217, "loss": 7.9042, "step": 443200 }, { "epoch": 1.8059067776500195, "grad_norm": 4.276191234588623, "learning_rate": 0.004167217623795106, "loss": 7.9248, "step": 443300 }, { "epoch": 1.8063141556734008, "grad_norm": 3.426767587661743, "learning_rate": 0.004166847626545641, "loss": 7.9353, "step": 443400 }, { "epoch": 1.8067215336967823, "grad_norm": 3.6210439205169678, "learning_rate": 0.004166477563594457, "loss": 7.9257, "step": 443500 }, { "epoch": 1.8071289117201639, "grad_norm": 3.102713108062744, "learning_rate": 0.004166107434956181, "loss": 7.8792, "step": 443600 }, { "epoch": 1.8075362897435454, "grad_norm": 4.327525615692139, "learning_rate": 0.004165737240645438, "loss": 7.8969, "step": 443700 }, { "epoch": 1.807943667766927, "grad_norm": 2.9925131797790527, "learning_rate": 0.0041653669806768654, "loss": 7.9257, "step": 443800 }, { "epoch": 1.8083510457903083, "grad_norm": 4.600265979766846, "learning_rate": 0.00416499665506511, "loss": 7.9311, "step": 443900 }, { "epoch": 1.8087584238136898, "grad_norm": 2.469639778137207, "learning_rate": 0.0041646262638248025, "loss": 7.9158, "step": 444000 }, { "epoch": 1.8087584238136898, "eval_MaskedAccuracy": 0.49692060364473667, "eval_loss": 1.665111780166626, "eval_runtime": 186.8741, "eval_samples_per_second": 339.672, "eval_steps_per_second": 1.327, "step": 444000 }, { "epoch": 1.809165801837071, "grad_norm": 1.4323352575302124, "learning_rate": 0.004164255806970597, "loss": 7.8947, "step": 444100 }, { "epoch": 1.8095731798604526, "grad_norm": 1.980042815208435, "learning_rate": 0.004163885284517136, "loss": 7.9259, "step": 444200 }, { "epoch": 1.8099805578838342, "grad_norm": 4.067874908447266, "learning_rate": 0.0041635146964790724, "loss": 7.9005, "step": 444300 }, { "epoch": 1.8103879359072157, "grad_norm": 2.5142414569854736, "learning_rate": 0.0041631440428710476, "loss": 7.8821, "step": 444400 }, { "epoch": 1.810795313930597, "grad_norm": 3.3953969478607178, "learning_rate": 0.0041627733237077184, "loss": 7.9353, "step": 444500 }, { "epoch": 1.8112026919539785, "grad_norm": 2.7303261756896973, "learning_rate": 0.004162402539003742, "loss": 7.8995, "step": 444600 }, { "epoch": 1.8116100699773598, "grad_norm": 5.998344898223877, "learning_rate": 0.004162031688773784, "loss": 7.9338, "step": 444700 }, { "epoch": 1.8120174480007414, "grad_norm": 2.08536696434021, "learning_rate": 0.004161660773032501, "loss": 7.9245, "step": 444800 }, { "epoch": 1.812424826024123, "grad_norm": 5.068397521972656, "learning_rate": 0.004161289791794559, "loss": 7.9051, "step": 444900 }, { "epoch": 1.8128322040475044, "grad_norm": 3.3044345378875732, "learning_rate": 0.00416091874507462, "loss": 7.8989, "step": 445000 }, { "epoch": 1.8128322040475044, "eval_MaskedAccuracy": 0.49567050833286097, "eval_loss": 1.6687381267547607, "eval_runtime": 262.1765, "eval_samples_per_second": 242.112, "eval_steps_per_second": 0.946, "step": 445000 }, { "epoch": 1.8132395820708858, "grad_norm": 3.2127983570098877, "learning_rate": 0.00416054763288736, "loss": 7.9026, "step": 445100 }, { "epoch": 1.8136469600942673, "grad_norm": 2.71120548248291, "learning_rate": 0.004160176455247446, "loss": 7.8831, "step": 445200 }, { "epoch": 1.8140543381176486, "grad_norm": 3.842801094055176, "learning_rate": 0.004159805212169553, "loss": 7.8957, "step": 445300 }, { "epoch": 1.8144617161410301, "grad_norm": 3.2258753776550293, "learning_rate": 0.0041594339036683595, "loss": 7.8848, "step": 445400 }, { "epoch": 1.8148690941644117, "grad_norm": 1.8990269899368286, "learning_rate": 0.004159062529758543, "loss": 7.9124, "step": 445500 }, { "epoch": 1.8152764721877932, "grad_norm": 3.415687322616577, "learning_rate": 0.004158691090454788, "loss": 7.9171, "step": 445600 }, { "epoch": 1.8156838502111747, "grad_norm": 4.696331977844238, "learning_rate": 0.004158319585771781, "loss": 7.9059, "step": 445700 }, { "epoch": 1.816091228234556, "grad_norm": 7.733753681182861, "learning_rate": 0.004157948015724208, "loss": 7.9152, "step": 445800 }, { "epoch": 1.8164986062579374, "grad_norm": 5.251039505004883, "learning_rate": 0.004157576380326765, "loss": 7.9275, "step": 445900 }, { "epoch": 1.8169059842813189, "grad_norm": 1.9722418785095215, "learning_rate": 0.004157204679594133, "loss": 7.8827, "step": 446000 }, { "epoch": 1.8169059842813189, "eval_MaskedAccuracy": 0.4959173168432975, "eval_loss": 1.6723440885543823, "eval_runtime": 190.1191, "eval_samples_per_second": 333.875, "eval_steps_per_second": 1.304, "step": 446000 }, { "epoch": 1.8173133623047004, "grad_norm": 3.0915658473968506, "learning_rate": 0.004156832913541016, "loss": 7.9025, "step": 446100 }, { "epoch": 1.817720740328082, "grad_norm": 2.651003837585449, "learning_rate": 0.004156461082182111, "loss": 7.8733, "step": 446200 }, { "epoch": 1.8181281183514635, "grad_norm": 1.6605842113494873, "learning_rate": 0.004156089185532116, "loss": 7.8446, "step": 446300 }, { "epoch": 1.8185354963748448, "grad_norm": 2.606069326400757, "learning_rate": 0.004155717223605732, "loss": 7.8974, "step": 446400 }, { "epoch": 1.8189428743982263, "grad_norm": 1.5527215003967285, "learning_rate": 0.004155345196417669, "loss": 7.8842, "step": 446500 }, { "epoch": 1.8193502524216076, "grad_norm": 3.269723653793335, "learning_rate": 0.004154973103982632, "loss": 7.8867, "step": 446600 }, { "epoch": 1.8197576304449892, "grad_norm": 3.024127960205078, "learning_rate": 0.0041546009463153365, "loss": 7.8656, "step": 446700 }, { "epoch": 1.8201650084683707, "grad_norm": 3.980433225631714, "learning_rate": 0.004154228723430491, "loss": 7.8814, "step": 446800 }, { "epoch": 1.8205723864917522, "grad_norm": 2.666149377822876, "learning_rate": 0.004153856435342808, "loss": 7.9254, "step": 446900 }, { "epoch": 1.8209797645151335, "grad_norm": 1.558209776878357, "learning_rate": 0.004153484082067014, "loss": 7.9242, "step": 447000 }, { "epoch": 1.8209797645151335, "eval_MaskedAccuracy": 0.4950336656005353, "eval_loss": 1.6678202152252197, "eval_runtime": 164.6074, "eval_samples_per_second": 385.621, "eval_steps_per_second": 1.507, "step": 447000 }, { "epoch": 1.821387142538515, "grad_norm": 5.244185924530029, "learning_rate": 0.00415311166361783, "loss": 7.9113, "step": 447100 }, { "epoch": 1.8217945205618964, "grad_norm": 2.0754573345184326, "learning_rate": 0.0041527391800099765, "loss": 7.9222, "step": 447200 }, { "epoch": 1.822201898585278, "grad_norm": 2.7704737186431885, "learning_rate": 0.004152366631258185, "loss": 7.9639, "step": 447300 }, { "epoch": 1.8226092766086595, "grad_norm": 8.123115539550781, "learning_rate": 0.004151994017377173, "loss": 7.942, "step": 447400 }, { "epoch": 1.823016654632041, "grad_norm": 2.6104109287261963, "learning_rate": 0.004151621338381678, "loss": 7.9107, "step": 447500 }, { "epoch": 1.8234240326554223, "grad_norm": 1.7403087615966797, "learning_rate": 0.0041512485942864375, "loss": 7.9298, "step": 447600 }, { "epoch": 1.8238314106788038, "grad_norm": 2.766556978225708, "learning_rate": 0.0041508757851061884, "loss": 7.8635, "step": 447700 }, { "epoch": 1.8242387887021851, "grad_norm": 4.337071895599365, "learning_rate": 0.004150502910855666, "loss": 7.9125, "step": 447800 }, { "epoch": 1.8246461667255667, "grad_norm": 2.609506130218506, "learning_rate": 0.004150129971549611, "loss": 7.9264, "step": 447900 }, { "epoch": 1.8250535447489482, "grad_norm": 3.140911102294922, "learning_rate": 0.004149756967202771, "loss": 7.8923, "step": 448000 }, { "epoch": 1.8250535447489482, "eval_MaskedAccuracy": 0.495953354737639, "eval_loss": 1.6645445823669434, "eval_runtime": 178.6428, "eval_samples_per_second": 355.324, "eval_steps_per_second": 1.388, "step": 448000 }, { "epoch": 1.8254609227723297, "grad_norm": 3.31099271774292, "learning_rate": 0.0041493838978298924, "loss": 7.9058, "step": 448100 }, { "epoch": 1.8258683007957113, "grad_norm": 1.3624597787857056, "learning_rate": 0.0041490107634457206, "loss": 7.9254, "step": 448200 }, { "epoch": 1.8262756788190926, "grad_norm": 2.529301404953003, "learning_rate": 0.00414863756406501, "loss": 7.9183, "step": 448300 }, { "epoch": 1.826683056842474, "grad_norm": 6.022654056549072, "learning_rate": 0.004148264299702512, "loss": 7.9112, "step": 448400 }, { "epoch": 1.8270904348658554, "grad_norm": 4.083413600921631, "learning_rate": 0.004147890970372994, "loss": 7.905, "step": 448500 }, { "epoch": 1.827497812889237, "grad_norm": 1.5772367715835571, "learning_rate": 0.004147517576091207, "loss": 7.9241, "step": 448600 }, { "epoch": 1.8279051909126185, "grad_norm": 6.243408203125, "learning_rate": 0.004147144116871915, "loss": 7.9068, "step": 448700 }, { "epoch": 1.828312568936, "grad_norm": 3.281571865081787, "learning_rate": 0.004146770592729879, "loss": 7.8934, "step": 448800 }, { "epoch": 1.8287199469593813, "grad_norm": 5.442648887634277, "learning_rate": 0.004146397003679876, "loss": 7.8683, "step": 448900 }, { "epoch": 1.8291273249827629, "grad_norm": 2.891380548477173, "learning_rate": 0.004146023349736674, "loss": 7.9045, "step": 449000 }, { "epoch": 1.8291273249827629, "eval_MaskedAccuracy": 0.49615328179758333, "eval_loss": 1.6711750030517578, "eval_runtime": 227.1922, "eval_samples_per_second": 279.393, "eval_steps_per_second": 1.092, "step": 449000 }, { "epoch": 1.8295347030061442, "grad_norm": 4.703136920928955, "learning_rate": 0.0041456496309150385, "loss": 7.8932, "step": 449100 }, { "epoch": 1.8299420810295257, "grad_norm": 1.8629577159881592, "learning_rate": 0.0041452758472297485, "loss": 7.9143, "step": 449200 }, { "epoch": 1.8303494590529072, "grad_norm": 2.245495319366455, "learning_rate": 0.004144901998695584, "loss": 7.8996, "step": 449300 }, { "epoch": 1.8307568370762888, "grad_norm": 4.677794456481934, "learning_rate": 0.004144528085327317, "loss": 7.9425, "step": 449400 }, { "epoch": 1.83116421509967, "grad_norm": 3.489626407623291, "learning_rate": 0.004144154107139741, "loss": 7.9098, "step": 449500 }, { "epoch": 1.8315715931230516, "grad_norm": 2.117542028427124, "learning_rate": 0.004143780064147631, "loss": 7.8836, "step": 449600 }, { "epoch": 1.831978971146433, "grad_norm": 2.0413732528686523, "learning_rate": 0.00414340595636578, "loss": 7.8868, "step": 449700 }, { "epoch": 1.8323863491698145, "grad_norm": 2.2001099586486816, "learning_rate": 0.004143031783808978, "loss": 7.8777, "step": 449800 }, { "epoch": 1.832793727193196, "grad_norm": 3.2869749069213867, "learning_rate": 0.004142657546492024, "loss": 7.892, "step": 449900 }, { "epoch": 1.8332011052165775, "grad_norm": 3.861534595489502, "learning_rate": 0.004142283244429706, "loss": 7.941, "step": 450000 }, { "epoch": 1.8332011052165775, "eval_MaskedAccuracy": 0.4951523046993051, "eval_loss": 1.661887764930725, "eval_runtime": 213.694, "eval_samples_per_second": 297.042, "eval_steps_per_second": 1.161, "step": 450000 }, { "epoch": 1.8336084832399588, "grad_norm": 6.784153938293457, "learning_rate": 0.004141908877636822, "loss": 7.9454, "step": 450100 }, { "epoch": 1.8340158612633404, "grad_norm": 3.286696434020996, "learning_rate": 0.004141534446128179, "loss": 7.9103, "step": 450200 }, { "epoch": 1.8344232392867217, "grad_norm": 2.0969045162200928, "learning_rate": 0.00414115994991858, "loss": 7.9031, "step": 450300 }, { "epoch": 1.8348306173101032, "grad_norm": 2.1802902221679688, "learning_rate": 0.0041407853890228245, "loss": 7.9219, "step": 450400 }, { "epoch": 1.8352379953334848, "grad_norm": 11.451210021972656, "learning_rate": 0.0041404107634557255, "loss": 7.8825, "step": 450500 }, { "epoch": 1.8356453733568663, "grad_norm": 2.645416259765625, "learning_rate": 0.004140036073232091, "loss": 7.8948, "step": 450600 }, { "epoch": 1.8360527513802478, "grad_norm": 2.332348108291626, "learning_rate": 0.004139661318366737, "loss": 7.8975, "step": 450700 }, { "epoch": 1.8364601294036291, "grad_norm": 2.727524757385254, "learning_rate": 0.0041392864988744745, "loss": 7.9111, "step": 450800 }, { "epoch": 1.8368675074270104, "grad_norm": 2.6782305240631104, "learning_rate": 0.0041389116147701275, "loss": 7.8691, "step": 450900 }, { "epoch": 1.837274885450392, "grad_norm": 3.461545705795288, "learning_rate": 0.0041385366660685145, "loss": 7.9214, "step": 451000 }, { "epoch": 1.837274885450392, "eval_MaskedAccuracy": 0.4965261188255572, "eval_loss": 1.6573883295059204, "eval_runtime": 220.9948, "eval_samples_per_second": 287.228, "eval_steps_per_second": 1.122, "step": 451000 }, { "epoch": 1.8376822634737735, "grad_norm": 1.2859609127044678, "learning_rate": 0.00413816165278446, "loss": 7.8879, "step": 451100 }, { "epoch": 1.838089641497155, "grad_norm": 5.609396934509277, "learning_rate": 0.004137786574932793, "loss": 7.9193, "step": 451200 }, { "epoch": 1.8384970195205366, "grad_norm": 3.979440212249756, "learning_rate": 0.004137411432528341, "loss": 7.8831, "step": 451300 }, { "epoch": 1.8389043975439179, "grad_norm": 4.6923346519470215, "learning_rate": 0.00413703622558593, "loss": 7.9044, "step": 451400 }, { "epoch": 1.8393117755672994, "grad_norm": 5.283751487731934, "learning_rate": 0.004136660954120404, "loss": 7.9141, "step": 451500 }, { "epoch": 1.8397191535906807, "grad_norm": 5.3510422706604, "learning_rate": 0.004136285618146588, "loss": 7.922, "step": 451600 }, { "epoch": 1.8401265316140623, "grad_norm": 8.144210815429688, "learning_rate": 0.004135910217679331, "loss": 7.8678, "step": 451700 }, { "epoch": 1.8405339096374438, "grad_norm": 2.038489818572998, "learning_rate": 0.004135534752733466, "loss": 7.9156, "step": 451800 }, { "epoch": 1.8409412876608253, "grad_norm": 11.392291069030762, "learning_rate": 0.004135159223323846, "loss": 7.9059, "step": 451900 }, { "epoch": 1.8413486656842066, "grad_norm": 5.011401653289795, "learning_rate": 0.004134783629465308, "loss": 7.9042, "step": 452000 }, { "epoch": 1.8413486656842066, "eval_MaskedAccuracy": 0.4961874772093576, "eval_loss": 1.6660220623016357, "eval_runtime": 159.7033, "eval_samples_per_second": 397.462, "eval_steps_per_second": 1.553, "step": 452000 }, { "epoch": 1.8417560437075882, "grad_norm": 3.6451973915100098, "learning_rate": 0.004134407971172704, "loss": 7.8707, "step": 452100 }, { "epoch": 1.8421634217309695, "grad_norm": 4.899003505706787, "learning_rate": 0.004134032248460891, "loss": 7.8941, "step": 452200 }, { "epoch": 1.842570799754351, "grad_norm": 4.606278896331787, "learning_rate": 0.004133656461344721, "loss": 7.9084, "step": 452300 }, { "epoch": 1.8429781777777325, "grad_norm": 2.0069868564605713, "learning_rate": 0.004133280609839054, "loss": 7.9013, "step": 452400 }, { "epoch": 1.843385555801114, "grad_norm": 2.7358434200286865, "learning_rate": 0.004132904693958741, "loss": 7.8769, "step": 452500 }, { "epoch": 1.8437929338244954, "grad_norm": 1.9440211057662964, "learning_rate": 0.004132528713718655, "loss": 7.917, "step": 452600 }, { "epoch": 1.844200311847877, "grad_norm": 4.830276966094971, "learning_rate": 0.004132152669133653, "loss": 7.923, "step": 452700 }, { "epoch": 1.8446076898712582, "grad_norm": 1.4976750612258911, "learning_rate": 0.004131776560218604, "loss": 7.9158, "step": 452800 }, { "epoch": 1.8450150678946398, "grad_norm": 2.558852195739746, "learning_rate": 0.0041314003869883705, "loss": 7.8968, "step": 452900 }, { "epoch": 1.8454224459180213, "grad_norm": 3.535841464996338, "learning_rate": 0.004131024149457828, "loss": 7.8685, "step": 453000 }, { "epoch": 1.8454224459180213, "eval_MaskedAccuracy": 0.4963606994584085, "eval_loss": 1.6634576320648193, "eval_runtime": 153.1638, "eval_samples_per_second": 414.432, "eval_steps_per_second": 1.619, "step": 453000 }, { "epoch": 1.8458298239414028, "grad_norm": 4.483730792999268, "learning_rate": 0.004130647847641857, "loss": 7.8971, "step": 453100 }, { "epoch": 1.8462372019647844, "grad_norm": 5.824855327606201, "learning_rate": 0.004130271481555332, "loss": 7.8685, "step": 453200 }, { "epoch": 1.8466445799881657, "grad_norm": 4.613622665405273, "learning_rate": 0.00412989505121314, "loss": 7.9381, "step": 453300 }, { "epoch": 1.847051958011547, "grad_norm": 4.366022109985352, "learning_rate": 0.004129518556630148, "loss": 7.8921, "step": 453400 }, { "epoch": 1.8474593360349285, "grad_norm": 2.6968445777893066, "learning_rate": 0.004129141997821257, "loss": 7.8926, "step": 453500 }, { "epoch": 1.84786671405831, "grad_norm": 4.72305965423584, "learning_rate": 0.004128765374801345, "loss": 7.9136, "step": 453600 }, { "epoch": 1.8482740920816916, "grad_norm": 3.338348150253296, "learning_rate": 0.004128388687585303, "loss": 7.894, "step": 453700 }, { "epoch": 1.848681470105073, "grad_norm": 7.44401216506958, "learning_rate": 0.004128011936188016, "loss": 7.9059, "step": 453800 }, { "epoch": 1.8490888481284544, "grad_norm": 4.553011417388916, "learning_rate": 0.0041276351206243934, "loss": 7.9385, "step": 453900 }, { "epoch": 1.849496226151836, "grad_norm": 2.3087642192840576, "learning_rate": 0.0041272582409093254, "loss": 7.891, "step": 454000 }, { "epoch": 1.849496226151836, "eval_MaskedAccuracy": 0.4961357549464838, "eval_loss": 1.6584280729293823, "eval_runtime": 230.442, "eval_samples_per_second": 275.453, "eval_steps_per_second": 1.076, "step": 454000 }, { "epoch": 1.8499036041752173, "grad_norm": 3.6199896335601807, "learning_rate": 0.004126881297057713, "loss": 7.9171, "step": 454100 }, { "epoch": 1.8503109821985988, "grad_norm": 4.018579006195068, "learning_rate": 0.004126504289084462, "loss": 7.9041, "step": 454200 }, { "epoch": 1.8507183602219803, "grad_norm": 3.0367114543914795, "learning_rate": 0.004126127217004473, "loss": 7.8894, "step": 454300 }, { "epoch": 1.8511257382453619, "grad_norm": 4.969758987426758, "learning_rate": 0.004125750080832656, "loss": 7.8856, "step": 454400 }, { "epoch": 1.8515331162687432, "grad_norm": 2.2117040157318115, "learning_rate": 0.004125372880583914, "loss": 7.8793, "step": 454500 }, { "epoch": 1.8519404942921247, "grad_norm": 4.151038646697998, "learning_rate": 0.004124995616273167, "loss": 7.8797, "step": 454600 }, { "epoch": 1.852347872315506, "grad_norm": 1.7162184715270996, "learning_rate": 0.004124618287915334, "loss": 7.8996, "step": 454700 }, { "epoch": 1.8527552503388875, "grad_norm": 1.5427240133285522, "learning_rate": 0.004124240895525328, "loss": 7.9155, "step": 454800 }, { "epoch": 1.853162628362269, "grad_norm": 3.322561502456665, "learning_rate": 0.004123863439118065, "loss": 7.8792, "step": 454900 }, { "epoch": 1.8535700063856506, "grad_norm": 8.798599243164062, "learning_rate": 0.004123485918708471, "loss": 7.8988, "step": 455000 }, { "epoch": 1.8535700063856506, "eval_MaskedAccuracy": 0.4965567334365523, "eval_loss": 1.6644086837768555, "eval_runtime": 287.5826, "eval_samples_per_second": 220.723, "eval_steps_per_second": 0.862, "step": 455000 }, { "epoch": 1.853977384409032, "grad_norm": 1.1775785684585571, "learning_rate": 0.004123108334311479, "loss": 7.924, "step": 455100 }, { "epoch": 1.8543847624324135, "grad_norm": 3.676361560821533, "learning_rate": 0.004122730685942003, "loss": 7.9139, "step": 455200 }, { "epoch": 1.8547921404557948, "grad_norm": 1.9800739288330078, "learning_rate": 0.004122352973614984, "loss": 7.895, "step": 455300 }, { "epoch": 1.8551995184791763, "grad_norm": 2.143259048461914, "learning_rate": 0.004121975197345349, "loss": 7.8782, "step": 455400 }, { "epoch": 1.8556068965025578, "grad_norm": 1.4323902130126953, "learning_rate": 0.004121597357148036, "loss": 7.8973, "step": 455500 }, { "epoch": 1.8560142745259394, "grad_norm": 1.667303204536438, "learning_rate": 0.004121219453037981, "loss": 7.9141, "step": 455600 }, { "epoch": 1.856421652549321, "grad_norm": 2.2656047344207764, "learning_rate": 0.00412084148503013, "loss": 7.9419, "step": 455700 }, { "epoch": 1.8568290305727022, "grad_norm": 2.6010971069335938, "learning_rate": 0.004120463453139422, "loss": 7.9153, "step": 455800 }, { "epoch": 1.8572364085960835, "grad_norm": 8.667475700378418, "learning_rate": 0.004120085357380807, "loss": 7.9037, "step": 455900 }, { "epoch": 1.857643786619465, "grad_norm": 1.9657787084579468, "learning_rate": 0.004119707197769229, "loss": 7.9178, "step": 456000 }, { "epoch": 1.857643786619465, "eval_MaskedAccuracy": 0.496212023289223, "eval_loss": 1.6610968112945557, "eval_runtime": 372.771, "eval_samples_per_second": 170.281, "eval_steps_per_second": 0.665, "step": 456000 }, { "epoch": 1.8580511646428466, "grad_norm": 1.846003770828247, "learning_rate": 0.004119328974319637, "loss": 7.8931, "step": 456100 }, { "epoch": 1.8584585426662281, "grad_norm": 4.922017574310303, "learning_rate": 0.004118950687046992, "loss": 7.865, "step": 456200 }, { "epoch": 1.8588659206896097, "grad_norm": 5.104384422302246, "learning_rate": 0.004118572335966235, "loss": 7.8998, "step": 456300 }, { "epoch": 1.859273298712991, "grad_norm": 3.1299219131469727, "learning_rate": 0.004118193921092341, "loss": 7.8815, "step": 456400 }, { "epoch": 1.8596806767363725, "grad_norm": 3.355180501937866, "learning_rate": 0.004117815442440259, "loss": 7.9069, "step": 456500 }, { "epoch": 1.8600880547597538, "grad_norm": 4.386593341827393, "learning_rate": 0.004117436900024955, "loss": 7.8773, "step": 456600 }, { "epoch": 1.8604954327831353, "grad_norm": 3.226004123687744, "learning_rate": 0.004117058293861398, "loss": 7.8986, "step": 456700 }, { "epoch": 1.8609028108065169, "grad_norm": 1.5177836418151855, "learning_rate": 0.004116679623964555, "loss": 7.8996, "step": 456800 }, { "epoch": 1.8613101888298984, "grad_norm": 2.3991305828094482, "learning_rate": 0.004116300890349398, "loss": 7.9361, "step": 456900 }, { "epoch": 1.8617175668532797, "grad_norm": 2.6616926193237305, "learning_rate": 0.004115922093030896, "loss": 7.8761, "step": 457000 }, { "epoch": 1.8617175668532797, "eval_MaskedAccuracy": 0.4976523812898243, "eval_loss": 1.6497547626495361, "eval_runtime": 179.1595, "eval_samples_per_second": 354.299, "eval_steps_per_second": 1.384, "step": 457000 }, { "epoch": 1.8621249448766612, "grad_norm": 3.0376691818237305, "learning_rate": 0.004115543232024025, "loss": 7.9175, "step": 457100 }, { "epoch": 1.8625323229000426, "grad_norm": 3.201857805252075, "learning_rate": 0.004115164307343773, "loss": 7.853, "step": 457200 }, { "epoch": 1.862939700923424, "grad_norm": 2.287074089050293, "learning_rate": 0.004114785319005115, "loss": 7.8973, "step": 457300 }, { "epoch": 1.8633470789468056, "grad_norm": 4.273191928863525, "learning_rate": 0.004114406267023028, "loss": 7.9027, "step": 457400 }, { "epoch": 1.8637544569701872, "grad_norm": 2.2495110034942627, "learning_rate": 0.004114027151412505, "loss": 7.8978, "step": 457500 }, { "epoch": 1.8641618349935685, "grad_norm": 5.090861797332764, "learning_rate": 0.004113647972188537, "loss": 7.8977, "step": 457600 }, { "epoch": 1.86456921301695, "grad_norm": 2.9712412357330322, "learning_rate": 0.004113268729366111, "loss": 7.914, "step": 457700 }, { "epoch": 1.8649765910403313, "grad_norm": 1.534548044204712, "learning_rate": 0.004112889422960217, "loss": 7.8898, "step": 457800 }, { "epoch": 1.8653839690637128, "grad_norm": 1.4076720476150513, "learning_rate": 0.004112510052985856, "loss": 7.8964, "step": 457900 }, { "epoch": 1.8657913470870944, "grad_norm": 3.8278603553771973, "learning_rate": 0.004112130619458025, "loss": 7.8828, "step": 458000 }, { "epoch": 1.8657913470870944, "eval_MaskedAccuracy": 0.4966755756615507, "eval_loss": 1.6578221321105957, "eval_runtime": 178.832, "eval_samples_per_second": 354.948, "eval_steps_per_second": 1.387, "step": 458000 }, { "epoch": 1.866198725110476, "grad_norm": 5.5682454109191895, "learning_rate": 0.004111751122391722, "loss": 7.8974, "step": 458100 }, { "epoch": 1.8666061031338574, "grad_norm": 2.8451478481292725, "learning_rate": 0.004111371561801957, "loss": 7.8977, "step": 458200 }, { "epoch": 1.8670134811572388, "grad_norm": 3.2823026180267334, "learning_rate": 0.004110991937703733, "loss": 7.8918, "step": 458300 }, { "epoch": 1.86742085918062, "grad_norm": 6.824443340301514, "learning_rate": 0.0041106122501120635, "loss": 7.8635, "step": 458400 }, { "epoch": 1.8678282372040016, "grad_norm": 6.437410354614258, "learning_rate": 0.004110232499041946, "loss": 7.9025, "step": 458500 }, { "epoch": 1.8682356152273831, "grad_norm": 5.972446918487549, "learning_rate": 0.004109852684508407, "loss": 7.8893, "step": 458600 }, { "epoch": 1.8686429932507647, "grad_norm": 3.5736684799194336, "learning_rate": 0.004109472806526464, "loss": 7.8966, "step": 458700 }, { "epoch": 1.8690503712741462, "grad_norm": 3.444176197052002, "learning_rate": 0.0041090928651111224, "loss": 7.8729, "step": 458800 }, { "epoch": 1.8694577492975275, "grad_norm": 7.170397758483887, "learning_rate": 0.004108712860277412, "loss": 7.8855, "step": 458900 }, { "epoch": 1.869865127320909, "grad_norm": 3.737161159515381, "learning_rate": 0.004108332792040354, "loss": 7.9061, "step": 459000 }, { "epoch": 1.869865127320909, "eval_MaskedAccuracy": 0.4972140486354773, "eval_loss": 1.6708306074142456, "eval_runtime": 187.0603, "eval_samples_per_second": 339.334, "eval_steps_per_second": 1.326, "step": 459000 }, { "epoch": 1.8702725053442903, "grad_norm": 5.48805570602417, "learning_rate": 0.004107952660414973, "loss": 7.8729, "step": 459100 }, { "epoch": 1.8706798833676719, "grad_norm": 6.40994930267334, "learning_rate": 0.004107572465416302, "loss": 7.9006, "step": 459200 }, { "epoch": 1.8710872613910534, "grad_norm": 3.788102626800537, "learning_rate": 0.004107192207059372, "loss": 7.8982, "step": 459300 }, { "epoch": 1.871494639414435, "grad_norm": 1.819547176361084, "learning_rate": 0.004106811885359216, "loss": 7.8924, "step": 459400 }, { "epoch": 1.8719020174378163, "grad_norm": 6.286037445068359, "learning_rate": 0.004106431500330866, "loss": 7.9078, "step": 459500 }, { "epoch": 1.8723093954611978, "grad_norm": 2.6707632541656494, "learning_rate": 0.004106051051989363, "loss": 7.8837, "step": 459600 }, { "epoch": 1.872716773484579, "grad_norm": 2.4007577896118164, "learning_rate": 0.004105670540349755, "loss": 7.8965, "step": 459700 }, { "epoch": 1.8731241515079606, "grad_norm": 2.9365899562835693, "learning_rate": 0.0041052899654270706, "loss": 7.8642, "step": 459800 }, { "epoch": 1.8735315295313422, "grad_norm": 5.498284339904785, "learning_rate": 0.004104909327236368, "loss": 7.9186, "step": 459900 }, { "epoch": 1.8739389075547237, "grad_norm": 2.893430233001709, "learning_rate": 0.004104528625792691, "loss": 7.8869, "step": 460000 }, { "epoch": 1.8739389075547237, "eval_MaskedAccuracy": 0.49687180917215784, "eval_loss": 1.670817494392395, "eval_runtime": 215.0947, "eval_samples_per_second": 295.107, "eval_steps_per_second": 1.153, "step": 460000 }, { "epoch": 1.874346285578105, "grad_norm": 7.116004943847656, "learning_rate": 0.004104147861111094, "loss": 7.8856, "step": 460100 }, { "epoch": 1.8747536636014865, "grad_norm": 4.6221208572387695, "learning_rate": 0.004103767033206633, "loss": 7.8595, "step": 460200 }, { "epoch": 1.8751610416248679, "grad_norm": 4.07310676574707, "learning_rate": 0.004103386142094354, "loss": 7.8936, "step": 460300 }, { "epoch": 1.8755684196482494, "grad_norm": 1.851848840713501, "learning_rate": 0.00410300518778932, "loss": 7.9016, "step": 460400 }, { "epoch": 1.875975797671631, "grad_norm": 4.325085163116455, "learning_rate": 0.0041026241703066004, "loss": 7.8796, "step": 460500 }, { "epoch": 1.8763831756950125, "grad_norm": 1.8312495946884155, "learning_rate": 0.004102243089661252, "loss": 7.8858, "step": 460600 }, { "epoch": 1.876790553718394, "grad_norm": 5.251564979553223, "learning_rate": 0.004101861945868342, "loss": 7.8825, "step": 460700 }, { "epoch": 1.8771979317417753, "grad_norm": 6.409215927124023, "learning_rate": 0.004101480738942937, "loss": 7.8694, "step": 460800 }, { "epoch": 1.8776053097651566, "grad_norm": 3.440920829772949, "learning_rate": 0.004101099468900114, "loss": 7.8854, "step": 460900 }, { "epoch": 1.8780126877885381, "grad_norm": 2.75022292137146, "learning_rate": 0.004100718135754942, "loss": 7.9107, "step": 461000 }, { "epoch": 1.8780126877885381, "eval_MaskedAccuracy": 0.4972361158192994, "eval_loss": 1.656812071800232, "eval_runtime": 181.3266, "eval_samples_per_second": 350.064, "eval_steps_per_second": 1.368, "step": 461000 }, { "epoch": 1.8784200658119197, "grad_norm": 1.467983603477478, "learning_rate": 0.004100336739522496, "loss": 7.9035, "step": 461100 }, { "epoch": 1.8788274438353012, "grad_norm": 4.852626323699951, "learning_rate": 0.004099955280217857, "loss": 7.8854, "step": 461200 }, { "epoch": 1.8792348218586827, "grad_norm": 3.740487575531006, "learning_rate": 0.004099573757856104, "loss": 7.9009, "step": 461300 }, { "epoch": 1.879642199882064, "grad_norm": 3.978701114654541, "learning_rate": 0.0040991921724523196, "loss": 7.9037, "step": 461400 }, { "epoch": 1.8800495779054456, "grad_norm": 2.751621723175049, "learning_rate": 0.0040988105240215966, "loss": 7.8742, "step": 461500 }, { "epoch": 1.880456955928827, "grad_norm": 2.0758726596832275, "learning_rate": 0.004098428812579016, "loss": 7.8796, "step": 461600 }, { "epoch": 1.8808643339522084, "grad_norm": 4.8139495849609375, "learning_rate": 0.004098047038139674, "loss": 7.8671, "step": 461700 }, { "epoch": 1.88127171197559, "grad_norm": 3.995478868484497, "learning_rate": 0.004097665200718666, "loss": 7.8402, "step": 461800 }, { "epoch": 1.8816790899989715, "grad_norm": 5.929808616638184, "learning_rate": 0.004097283300331081, "loss": 7.9057, "step": 461900 }, { "epoch": 1.8820864680223528, "grad_norm": 4.032388687133789, "learning_rate": 0.004096901336992025, "loss": 7.867, "step": 462000 }, { "epoch": 1.8820864680223528, "eval_MaskedAccuracy": 0.4966355526760221, "eval_loss": 1.6544089317321777, "eval_runtime": 233.0379, "eval_samples_per_second": 272.385, "eval_steps_per_second": 1.064, "step": 462000 }, { "epoch": 1.8824938460457343, "grad_norm": 1.7739145755767822, "learning_rate": 0.004096519310716591, "loss": 7.8774, "step": 462100 }, { "epoch": 1.8829012240691156, "grad_norm": 1.3551661968231201, "learning_rate": 0.004096137221519888, "loss": 7.914, "step": 462200 }, { "epoch": 1.8833086020924972, "grad_norm": 7.672719478607178, "learning_rate": 0.004095755069417018, "loss": 7.89, "step": 462300 }, { "epoch": 1.8837159801158787, "grad_norm": 5.353662967681885, "learning_rate": 0.004095372854423089, "loss": 7.8856, "step": 462400 }, { "epoch": 1.8841233581392602, "grad_norm": 3.249173641204834, "learning_rate": 0.004094990576553222, "loss": 7.8568, "step": 462500 }, { "epoch": 1.8845307361626416, "grad_norm": 3.1244406700134277, "learning_rate": 0.004094608235822522, "loss": 7.8715, "step": 462600 }, { "epoch": 1.884938114186023, "grad_norm": 2.498183012008667, "learning_rate": 0.0040942258322461044, "loss": 7.879, "step": 462700 }, { "epoch": 1.8853454922094044, "grad_norm": 2.7141098976135254, "learning_rate": 0.004093843365839088, "loss": 7.8946, "step": 462800 }, { "epoch": 1.885752870232786, "grad_norm": 2.811753511428833, "learning_rate": 0.004093460836616596, "loss": 7.8782, "step": 462900 }, { "epoch": 1.8861602482561675, "grad_norm": 7.897110939025879, "learning_rate": 0.004093078244593749, "loss": 7.8573, "step": 463000 }, { "epoch": 1.8861602482561675, "eval_MaskedAccuracy": 0.49610789181503145, "eval_loss": 1.6706278324127197, "eval_runtime": 185.2555, "eval_samples_per_second": 342.64, "eval_steps_per_second": 1.339, "step": 463000 }, { "epoch": 1.886567626279549, "grad_norm": 4.551334857940674, "learning_rate": 0.004092695589785684, "loss": 7.8957, "step": 463100 }, { "epoch": 1.8869750043029305, "grad_norm": 2.587092876434326, "learning_rate": 0.004092312872207516, "loss": 7.8956, "step": 463200 }, { "epoch": 1.8873823823263118, "grad_norm": 2.0081067085266113, "learning_rate": 0.004091930091874383, "loss": 7.877, "step": 463300 }, { "epoch": 1.8877897603496931, "grad_norm": 2.8725426197052, "learning_rate": 0.004091547248801414, "loss": 7.8703, "step": 463400 }, { "epoch": 1.8881971383730747, "grad_norm": 4.049835205078125, "learning_rate": 0.004091164343003749, "loss": 7.9268, "step": 463500 }, { "epoch": 1.8886045163964562, "grad_norm": 5.329404354095459, "learning_rate": 0.004090781374496521, "loss": 7.8657, "step": 463600 }, { "epoch": 1.8890118944198377, "grad_norm": 3.372039556503296, "learning_rate": 0.004090398343294875, "loss": 7.8803, "step": 463700 }, { "epoch": 1.8894192724432193, "grad_norm": 4.426126003265381, "learning_rate": 0.004090015249413954, "loss": 7.8952, "step": 463800 }, { "epoch": 1.8898266504666006, "grad_norm": 2.545288324356079, "learning_rate": 0.0040896320928689054, "loss": 7.9167, "step": 463900 }, { "epoch": 1.890234028489982, "grad_norm": 1.8516414165496826, "learning_rate": 0.004089248873674872, "loss": 7.8635, "step": 464000 }, { "epoch": 1.890234028489982, "eval_MaskedAccuracy": 0.4977357268850934, "eval_loss": 1.646777868270874, "eval_runtime": 179.0391, "eval_samples_per_second": 354.537, "eval_steps_per_second": 1.385, "step": 464000 }, { "epoch": 1.8906414065133634, "grad_norm": 2.3388543128967285, "learning_rate": 0.004088865591847009, "loss": 7.9046, "step": 464100 }, { "epoch": 1.891048784536745, "grad_norm": 2.0005366802215576, "learning_rate": 0.004088482247400468, "loss": 7.8999, "step": 464200 }, { "epoch": 1.8914561625601265, "grad_norm": 4.26557731628418, "learning_rate": 0.0040880988403504005, "loss": 7.8838, "step": 464300 }, { "epoch": 1.891863540583508, "grad_norm": 2.6730682849884033, "learning_rate": 0.004087715370711972, "loss": 7.8887, "step": 464400 }, { "epoch": 1.8922709186068893, "grad_norm": 5.512271404266357, "learning_rate": 0.0040873318385003365, "loss": 7.9034, "step": 464500 }, { "epoch": 1.8926782966302709, "grad_norm": 4.042498588562012, "learning_rate": 0.004086948243730663, "loss": 7.8898, "step": 464600 }, { "epoch": 1.8930856746536522, "grad_norm": 3.528592348098755, "learning_rate": 0.004086564586418112, "loss": 7.892, "step": 464700 }, { "epoch": 1.8934930526770337, "grad_norm": 4.220452308654785, "learning_rate": 0.004086180866577856, "loss": 7.8771, "step": 464800 }, { "epoch": 1.8939004307004152, "grad_norm": 4.932054042816162, "learning_rate": 0.004085797084225067, "loss": 7.8865, "step": 464900 }, { "epoch": 1.8943078087237968, "grad_norm": 4.68433141708374, "learning_rate": 0.00408541323937491, "loss": 7.9124, "step": 465000 }, { "epoch": 1.8943078087237968, "eval_MaskedAccuracy": 0.496761901124384, "eval_loss": 1.6511331796646118, "eval_runtime": 163.5458, "eval_samples_per_second": 388.124, "eval_steps_per_second": 1.516, "step": 465000 }, { "epoch": 1.894715186747178, "grad_norm": 2.8748650550842285, "learning_rate": 0.004085029332042569, "loss": 7.8964, "step": 465100 }, { "epoch": 1.8951225647705596, "grad_norm": 1.8150627613067627, "learning_rate": 0.00408464536224322, "loss": 7.9157, "step": 465200 }, { "epoch": 1.895529942793941, "grad_norm": 2.498502492904663, "learning_rate": 0.0040842613299920385, "loss": 7.882, "step": 465300 }, { "epoch": 1.8959373208173225, "grad_norm": 10.029559135437012, "learning_rate": 0.004083877235304211, "loss": 7.8841, "step": 465400 }, { "epoch": 1.896344698840704, "grad_norm": 1.2890334129333496, "learning_rate": 0.004083493078194922, "loss": 7.8709, "step": 465500 }, { "epoch": 1.8967520768640855, "grad_norm": 4.392586708068848, "learning_rate": 0.004083108858679358, "loss": 7.9104, "step": 465600 }, { "epoch": 1.897159454887467, "grad_norm": 2.6691832542419434, "learning_rate": 0.004082724576772705, "loss": 7.8731, "step": 465700 }, { "epoch": 1.8975668329108484, "grad_norm": 8.658385276794434, "learning_rate": 0.004082340232490165, "loss": 7.8833, "step": 465800 }, { "epoch": 1.8979742109342297, "grad_norm": 3.7266769409179688, "learning_rate": 0.004081955825846923, "loss": 7.9004, "step": 465900 }, { "epoch": 1.8983815889576112, "grad_norm": 2.238558292388916, "learning_rate": 0.004081571356858191, "loss": 7.881, "step": 466000 }, { "epoch": 1.8983815889576112, "eval_MaskedAccuracy": 0.4971370541862614, "eval_loss": 1.659732699394226, "eval_runtime": 170.1739, "eval_samples_per_second": 373.007, "eval_steps_per_second": 1.457, "step": 466000 }, { "epoch": 1.8987889669809928, "grad_norm": 5.282289505004883, "learning_rate": 0.00408118682553916, "loss": 7.8756, "step": 466100 }, { "epoch": 1.8991963450043743, "grad_norm": 6.814397811889648, "learning_rate": 0.004080802231905035, "loss": 7.8595, "step": 466200 }, { "epoch": 1.8996037230277558, "grad_norm": 6.039074420928955, "learning_rate": 0.004080417575971024, "loss": 7.8788, "step": 466300 }, { "epoch": 1.9000111010511371, "grad_norm": 5.3705058097839355, "learning_rate": 0.004080032857752321, "loss": 7.8888, "step": 466400 }, { "epoch": 1.9004184790745184, "grad_norm": 1.4663264751434326, "learning_rate": 0.00407964807726415, "loss": 7.888, "step": 466500 }, { "epoch": 1.9008258570979, "grad_norm": 4.302461624145508, "learning_rate": 0.004079263234521717, "loss": 7.9108, "step": 466600 }, { "epoch": 1.9012332351212815, "grad_norm": 8.045517921447754, "learning_rate": 0.004078878329540241, "loss": 7.921, "step": 466700 }, { "epoch": 1.901640613144663, "grad_norm": 3.6992404460906982, "learning_rate": 0.004078493362334935, "loss": 7.8639, "step": 466800 }, { "epoch": 1.9020479911680446, "grad_norm": 1.4270908832550049, "learning_rate": 0.004078108332921021, "loss": 7.864, "step": 466900 }, { "epoch": 1.9024553691914259, "grad_norm": 3.777878999710083, "learning_rate": 0.004077723241313717, "loss": 7.8879, "step": 467000 }, { "epoch": 1.9024553691914259, "eval_MaskedAccuracy": 0.49775283973197887, "eval_loss": 1.6489890813827515, "eval_runtime": 209.5076, "eval_samples_per_second": 302.977, "eval_steps_per_second": 1.184, "step": 467000 }, { "epoch": 1.9028627472148074, "grad_norm": 3.6529839038848877, "learning_rate": 0.004077338087528257, "loss": 7.9005, "step": 467100 }, { "epoch": 1.9032701252381887, "grad_norm": 4.476478099822998, "learning_rate": 0.004076952871579866, "loss": 7.931, "step": 467200 }, { "epoch": 1.9036775032615703, "grad_norm": 1.4068933725357056, "learning_rate": 0.004076567593483765, "loss": 7.8746, "step": 467300 }, { "epoch": 1.9040848812849518, "grad_norm": 2.029770612716675, "learning_rate": 0.004076182253255195, "loss": 7.8543, "step": 467400 }, { "epoch": 1.9044922593083333, "grad_norm": 2.378837823867798, "learning_rate": 0.004075796850909382, "loss": 7.8565, "step": 467500 }, { "epoch": 1.9048996373317146, "grad_norm": 2.1673130989074707, "learning_rate": 0.00407541138646157, "loss": 7.8698, "step": 467600 }, { "epoch": 1.9053070153550962, "grad_norm": 1.8733528852462769, "learning_rate": 0.004075025859926995, "loss": 7.9058, "step": 467700 }, { "epoch": 1.9057143933784775, "grad_norm": 5.9949951171875, "learning_rate": 0.004074640271320902, "loss": 7.9013, "step": 467800 }, { "epoch": 1.906121771401859, "grad_norm": 1.5321779251098633, "learning_rate": 0.004074254620658537, "loss": 7.8523, "step": 467900 }, { "epoch": 1.9065291494252405, "grad_norm": 7.5324835777282715, "learning_rate": 0.004073868907955139, "loss": 7.8647, "step": 468000 }, { "epoch": 1.9065291494252405, "eval_MaskedAccuracy": 0.4969127485025389, "eval_loss": 1.6676855087280273, "eval_runtime": 192.8688, "eval_samples_per_second": 329.115, "eval_steps_per_second": 1.286, "step": 468000 }, { "epoch": 1.906936527448622, "grad_norm": 1.861028790473938, "learning_rate": 0.004073483133225972, "loss": 7.8922, "step": 468100 }, { "epoch": 1.9073439054720036, "grad_norm": 4.3910908699035645, "learning_rate": 0.004073097296486274, "loss": 7.899, "step": 468200 }, { "epoch": 1.907751283495385, "grad_norm": 1.6898629665374756, "learning_rate": 0.004072711397751302, "loss": 7.8955, "step": 468300 }, { "epoch": 1.9081586615187662, "grad_norm": 2.9406893253326416, "learning_rate": 0.004072325437036316, "loss": 7.889, "step": 468400 }, { "epoch": 1.9085660395421478, "grad_norm": 1.2531869411468506, "learning_rate": 0.00407193941435657, "loss": 7.9048, "step": 468500 }, { "epoch": 1.9089734175655293, "grad_norm": 3.3710033893585205, "learning_rate": 0.004071553329727329, "loss": 7.8771, "step": 468600 }, { "epoch": 1.9093807955889108, "grad_norm": 3.323848247528076, "learning_rate": 0.004071167183163855, "loss": 7.8752, "step": 468700 }, { "epoch": 1.9097881736122924, "grad_norm": 2.5802295207977295, "learning_rate": 0.004070780974681417, "loss": 7.8805, "step": 468800 }, { "epoch": 1.9101955516356737, "grad_norm": 3.3420770168304443, "learning_rate": 0.004070394704295283, "loss": 7.8891, "step": 468900 }, { "epoch": 1.910602929659055, "grad_norm": 2.558593273162842, "learning_rate": 0.004070008372020724, "loss": 7.9017, "step": 469000 }, { "epoch": 1.910602929659055, "eval_MaskedAccuracy": 0.4963021289787646, "eval_loss": 1.6615022420883179, "eval_runtime": 204.791, "eval_samples_per_second": 309.955, "eval_steps_per_second": 1.211, "step": 469000 }, { "epoch": 1.9110103076824365, "grad_norm": 3.3948159217834473, "learning_rate": 0.004069621977873013, "loss": 7.9062, "step": 469100 }, { "epoch": 1.911417685705818, "grad_norm": 4.204961776733398, "learning_rate": 0.004069235521867424, "loss": 7.8834, "step": 469200 }, { "epoch": 1.9118250637291996, "grad_norm": 1.9316223859786987, "learning_rate": 0.004068849004019239, "loss": 7.9044, "step": 469300 }, { "epoch": 1.9122324417525811, "grad_norm": 5.4399094581604, "learning_rate": 0.004068462424343744, "loss": 7.8593, "step": 469400 }, { "epoch": 1.9126398197759624, "grad_norm": 5.390496730804443, "learning_rate": 0.0040680757828562165, "loss": 7.8793, "step": 469500 }, { "epoch": 1.913047197799344, "grad_norm": 4.0297722816467285, "learning_rate": 0.004067689079571947, "loss": 7.8563, "step": 469600 }, { "epoch": 1.9134545758227253, "grad_norm": 5.022799968719482, "learning_rate": 0.004067302314506215, "loss": 7.8751, "step": 469700 }, { "epoch": 1.9138619538461068, "grad_norm": 5.983433723449707, "learning_rate": 0.004066915487674315, "loss": 7.8482, "step": 469800 }, { "epoch": 1.9142693318694883, "grad_norm": 1.6672743558883667, "learning_rate": 0.004066528599091545, "loss": 7.8626, "step": 469900 }, { "epoch": 1.9146767098928699, "grad_norm": 3.781672954559326, "learning_rate": 0.0040661416487731954, "loss": 7.8722, "step": 470000 }, { "epoch": 1.9146767098928699, "eval_MaskedAccuracy": 0.4977106534616291, "eval_loss": 1.6562927961349487, "eval_runtime": 291.0083, "eval_samples_per_second": 218.124, "eval_steps_per_second": 0.852, "step": 470000 }, { "epoch": 1.9150840879162512, "grad_norm": 6.619106292724609, "learning_rate": 0.00406575463673457, "loss": 7.8557, "step": 470100 }, { "epoch": 1.9154914659396327, "grad_norm": 4.2112884521484375, "learning_rate": 0.004065367562990967, "loss": 7.884, "step": 470200 }, { "epoch": 1.915898843963014, "grad_norm": 2.700252056121826, "learning_rate": 0.004064980427557689, "loss": 7.926, "step": 470300 }, { "epoch": 1.9163062219863956, "grad_norm": 1.110736608505249, "learning_rate": 0.004064593230450038, "loss": 7.8823, "step": 470400 }, { "epoch": 1.916713600009777, "grad_norm": 5.880198955535889, "learning_rate": 0.0040642059716833245, "loss": 7.8785, "step": 470500 }, { "epoch": 1.9171209780331586, "grad_norm": 1.161250114440918, "learning_rate": 0.004063818651272862, "loss": 7.8841, "step": 470600 }, { "epoch": 1.9175283560565402, "grad_norm": 3.639497995376587, "learning_rate": 0.004063431269233957, "loss": 7.8756, "step": 470700 }, { "epoch": 1.9179357340799215, "grad_norm": 2.357867479324341, "learning_rate": 0.004063043825581932, "loss": 7.8585, "step": 470800 }, { "epoch": 1.9183431121033028, "grad_norm": 3.1401703357696533, "learning_rate": 0.004062656320332102, "loss": 7.8698, "step": 470900 }, { "epoch": 1.9187504901266843, "grad_norm": 1.814631462097168, "learning_rate": 0.004062268753499785, "loss": 7.869, "step": 471000 }, { "epoch": 1.9187504901266843, "eval_MaskedAccuracy": 0.4971491722059388, "eval_loss": 1.6693766117095947, "eval_runtime": 183.1856, "eval_samples_per_second": 346.512, "eval_steps_per_second": 1.354, "step": 471000 }, { "epoch": 1.9191578681500658, "grad_norm": 1.5636416673660278, "learning_rate": 0.004061881125100306, "loss": 7.8423, "step": 471100 }, { "epoch": 1.9195652461734474, "grad_norm": 6.844422817230225, "learning_rate": 0.004061493435148992, "loss": 7.8685, "step": 471200 }, { "epoch": 1.919972624196829, "grad_norm": 1.8475629091262817, "learning_rate": 0.00406110568366116, "loss": 7.8783, "step": 471300 }, { "epoch": 1.9203800022202102, "grad_norm": 5.86833381652832, "learning_rate": 0.0040607178706521556, "loss": 7.8639, "step": 471400 }, { "epoch": 1.9207873802435915, "grad_norm": 1.7145296335220337, "learning_rate": 0.004060329996137304, "loss": 7.8992, "step": 471500 }, { "epoch": 1.921194758266973, "grad_norm": 1.964928388595581, "learning_rate": 0.004059942060131937, "loss": 7.9292, "step": 471600 }, { "epoch": 1.9216021362903546, "grad_norm": 5.93284797668457, "learning_rate": 0.0040595540626514005, "loss": 7.8975, "step": 471700 }, { "epoch": 1.9220095143137361, "grad_norm": 1.2464560270309448, "learning_rate": 0.004059166003711019, "loss": 7.91, "step": 471800 }, { "epoch": 1.9224168923371177, "grad_norm": 1.7859487533569336, "learning_rate": 0.004058777883326144, "loss": 7.9084, "step": 471900 }, { "epoch": 1.922824270360499, "grad_norm": 6.065908432006836, "learning_rate": 0.004058389701512122, "loss": 7.8726, "step": 472000 }, { "epoch": 1.922824270360499, "eval_MaskedAccuracy": 0.4974083189341227, "eval_loss": 1.6643046140670776, "eval_runtime": 198.8314, "eval_samples_per_second": 319.245, "eval_steps_per_second": 1.247, "step": 472000 }, { "epoch": 1.9232316483838805, "grad_norm": 1.6700581312179565, "learning_rate": 0.0040580014582842945, "loss": 7.8676, "step": 472100 }, { "epoch": 1.9236390264072618, "grad_norm": 2.486999988555908, "learning_rate": 0.004057613153658009, "loss": 7.8765, "step": 472200 }, { "epoch": 1.9240464044306433, "grad_norm": 3.6034107208251953, "learning_rate": 0.0040572247876486285, "loss": 7.8548, "step": 472300 }, { "epoch": 1.9244537824540249, "grad_norm": 3.9016072750091553, "learning_rate": 0.0040568363602714996, "loss": 7.8644, "step": 472400 }, { "epoch": 1.9248611604774064, "grad_norm": 5.163150787353516, "learning_rate": 0.004056447871541978, "loss": 7.8592, "step": 472500 }, { "epoch": 1.9252685385007877, "grad_norm": 1.991635799407959, "learning_rate": 0.004056059321475423, "loss": 7.8754, "step": 472600 }, { "epoch": 1.9256759165241693, "grad_norm": 2.20558762550354, "learning_rate": 0.004055670710087199, "loss": 7.9016, "step": 472700 }, { "epoch": 1.9260832945475506, "grad_norm": 7.398116588592529, "learning_rate": 0.004055282037392666, "loss": 7.8612, "step": 472800 }, { "epoch": 1.926490672570932, "grad_norm": 5.232546806335449, "learning_rate": 0.004054893303407193, "loss": 7.8753, "step": 472900 }, { "epoch": 1.9268980505943136, "grad_norm": 3.3898725509643555, "learning_rate": 0.0040545045081461515, "loss": 7.906, "step": 473000 }, { "epoch": 1.9268980505943136, "eval_MaskedAccuracy": 0.497005546678431, "eval_loss": 1.658578872680664, "eval_runtime": 168.2489, "eval_samples_per_second": 377.274, "eval_steps_per_second": 1.474, "step": 473000 }, { "epoch": 1.9273054286176952, "grad_norm": 1.957472324371338, "learning_rate": 0.004054115651624905, "loss": 7.8652, "step": 473100 }, { "epoch": 1.9277128066410767, "grad_norm": 5.501332759857178, "learning_rate": 0.004053726733858837, "loss": 7.8913, "step": 473200 }, { "epoch": 1.928120184664458, "grad_norm": 3.65632963180542, "learning_rate": 0.0040533377548633166, "loss": 7.8604, "step": 473300 }, { "epoch": 1.9285275626878393, "grad_norm": 1.9907680749893188, "learning_rate": 0.004052948714653717, "loss": 7.8895, "step": 473400 }, { "epoch": 1.9289349407112208, "grad_norm": 3.3675484657287598, "learning_rate": 0.004052559613245433, "loss": 7.9118, "step": 473500 }, { "epoch": 1.9293423187346024, "grad_norm": 4.073822975158691, "learning_rate": 0.004052170450653838, "loss": 7.8552, "step": 473600 }, { "epoch": 1.929749696757984, "grad_norm": 4.024270057678223, "learning_rate": 0.0040517812268943175, "loss": 7.897, "step": 473700 }, { "epoch": 1.9301570747813654, "grad_norm": 2.1195008754730225, "learning_rate": 0.004051391941982263, "loss": 7.8737, "step": 473800 }, { "epoch": 1.9305644528047468, "grad_norm": 6.140163898468018, "learning_rate": 0.004051002595933068, "loss": 7.8937, "step": 473900 }, { "epoch": 1.930971830828128, "grad_norm": 1.4223819971084595, "learning_rate": 0.004050613188762124, "loss": 7.8811, "step": 474000 }, { "epoch": 1.930971830828128, "eval_MaskedAccuracy": 0.4983757766751915, "eval_loss": 1.6546658277511597, "eval_runtime": 180.9313, "eval_samples_per_second": 350.829, "eval_steps_per_second": 1.371, "step": 474000 }, { "epoch": 1.9313792088515096, "grad_norm": 3.788630962371826, "learning_rate": 0.00405022372048482, "loss": 7.8691, "step": 474100 }, { "epoch": 1.9317865868748911, "grad_norm": 1.8217180967330933, "learning_rate": 0.004049834191116557, "loss": 7.8523, "step": 474200 }, { "epoch": 1.9321939648982727, "grad_norm": 5.193520545959473, "learning_rate": 0.004049444600672735, "loss": 7.8737, "step": 474300 }, { "epoch": 1.9326013429216542, "grad_norm": 4.182032585144043, "learning_rate": 0.004049054949168761, "loss": 7.8781, "step": 474400 }, { "epoch": 1.9330087209450355, "grad_norm": 6.593281269073486, "learning_rate": 0.004048665236620033, "loss": 7.8806, "step": 474500 }, { "epoch": 1.933416098968417, "grad_norm": 4.737550735473633, "learning_rate": 0.004048275463041967, "loss": 7.8708, "step": 474600 }, { "epoch": 1.9338234769917984, "grad_norm": 2.312268018722534, "learning_rate": 0.004047885628449966, "loss": 7.8792, "step": 474700 }, { "epoch": 1.9342308550151799, "grad_norm": 2.0605013370513916, "learning_rate": 0.004047495732859442, "loss": 7.8817, "step": 474800 }, { "epoch": 1.9346382330385614, "grad_norm": 10.729848861694336, "learning_rate": 0.0040471057762858165, "loss": 7.888, "step": 474900 }, { "epoch": 1.935045611061943, "grad_norm": 2.2714881896972656, "learning_rate": 0.004046715758744496, "loss": 7.8554, "step": 475000 }, { "epoch": 1.935045611061943, "eval_MaskedAccuracy": 0.49690260572009565, "eval_loss": 1.654526710510254, "eval_runtime": 161.1518, "eval_samples_per_second": 393.89, "eval_steps_per_second": 1.539, "step": 475000 }, { "epoch": 1.9354529890853243, "grad_norm": 3.213550090789795, "learning_rate": 0.004046325680250914, "loss": 7.8573, "step": 475100 }, { "epoch": 1.9358603671087058, "grad_norm": 4.958645820617676, "learning_rate": 0.004045935540820482, "loss": 7.9061, "step": 475200 }, { "epoch": 1.936267745132087, "grad_norm": 1.992046594619751, "learning_rate": 0.004045545340468625, "loss": 7.8761, "step": 475300 }, { "epoch": 1.9366751231554686, "grad_norm": 3.0850229263305664, "learning_rate": 0.0040451550792107716, "loss": 7.8495, "step": 475400 }, { "epoch": 1.9370825011788502, "grad_norm": 2.6831929683685303, "learning_rate": 0.004044764757062348, "loss": 7.8992, "step": 475500 }, { "epoch": 1.9374898792022317, "grad_norm": 5.275318622589111, "learning_rate": 0.004044374374038785, "loss": 7.8351, "step": 475600 }, { "epoch": 1.9378972572256132, "grad_norm": 3.3698630332946777, "learning_rate": 0.004043983930155527, "loss": 7.8287, "step": 475700 }, { "epoch": 1.9383046352489945, "grad_norm": 6.257713317871094, "learning_rate": 0.0040435934254280026, "loss": 7.8632, "step": 475800 }, { "epoch": 1.9387120132723759, "grad_norm": 2.4408538341522217, "learning_rate": 0.004043202859871646, "loss": 7.9321, "step": 475900 }, { "epoch": 1.9391193912957574, "grad_norm": 4.434767723083496, "learning_rate": 0.004042812233501905, "loss": 7.8824, "step": 476000 }, { "epoch": 1.9391193912957574, "eval_MaskedAccuracy": 0.49757411130997103, "eval_loss": 1.6568632125854492, "eval_runtime": 222.6206, "eval_samples_per_second": 285.131, "eval_steps_per_second": 1.114, "step": 476000 }, { "epoch": 1.939526769319139, "grad_norm": 5.1438517570495605, "learning_rate": 0.004042421546334225, "loss": 7.8831, "step": 476100 }, { "epoch": 1.9399341473425205, "grad_norm": 3.293837547302246, "learning_rate": 0.004042030798384047, "loss": 7.8711, "step": 476200 }, { "epoch": 1.940341525365902, "grad_norm": 6.498884201049805, "learning_rate": 0.00404163998966682, "loss": 7.8842, "step": 476300 }, { "epoch": 1.9407489033892833, "grad_norm": 2.4626593589782715, "learning_rate": 0.004041249120197994, "loss": 7.8796, "step": 476400 }, { "epoch": 1.9411562814126646, "grad_norm": 3.8586249351501465, "learning_rate": 0.004040858189993024, "loss": 7.8582, "step": 476500 }, { "epoch": 1.9415636594360461, "grad_norm": 6.778619766235352, "learning_rate": 0.0040404671990673675, "loss": 7.8856, "step": 476600 }, { "epoch": 1.9419710374594277, "grad_norm": 2.4657950401306152, "learning_rate": 0.004040076147436481, "loss": 7.8696, "step": 476700 }, { "epoch": 1.9423784154828092, "grad_norm": 4.460400581359863, "learning_rate": 0.004039685035115825, "loss": 7.8893, "step": 476800 }, { "epoch": 1.9427857935061907, "grad_norm": 4.682711601257324, "learning_rate": 0.004039293862120863, "loss": 7.8663, "step": 476900 }, { "epoch": 1.943193171529572, "grad_norm": 1.674350380897522, "learning_rate": 0.004038902628467053, "loss": 7.8653, "step": 477000 }, { "epoch": 1.943193171529572, "eval_MaskedAccuracy": 0.49762501700375905, "eval_loss": 1.67079758644104, "eval_runtime": 222.9317, "eval_samples_per_second": 284.733, "eval_steps_per_second": 1.112, "step": 477000 }, { "epoch": 1.9436005495529536, "grad_norm": 3.078145980834961, "learning_rate": 0.004038511334169871, "loss": 7.8835, "step": 477100 }, { "epoch": 1.944007927576335, "grad_norm": 4.532050609588623, "learning_rate": 0.004038119979244781, "loss": 7.8699, "step": 477200 }, { "epoch": 1.9444153055997164, "grad_norm": 1.8098946809768677, "learning_rate": 0.0040377285637072605, "loss": 7.8955, "step": 477300 }, { "epoch": 1.944822683623098, "grad_norm": 4.762369632720947, "learning_rate": 0.004037337087572785, "loss": 7.8899, "step": 477400 }, { "epoch": 1.9452300616464795, "grad_norm": 6.186821937561035, "learning_rate": 0.00403694555085683, "loss": 7.8725, "step": 477500 }, { "epoch": 1.9456374396698608, "grad_norm": 1.7627458572387695, "learning_rate": 0.0040365539535748675, "loss": 7.8508, "step": 477600 }, { "epoch": 1.9460448176932423, "grad_norm": 4.719379901885986, "learning_rate": 0.00403616229574239, "loss": 7.8766, "step": 477700 }, { "epoch": 1.9464521957166236, "grad_norm": 1.4635705947875977, "learning_rate": 0.00403577057737488, "loss": 7.8901, "step": 477800 }, { "epoch": 1.9468595737400052, "grad_norm": 2.690516471862793, "learning_rate": 0.00403537879848782, "loss": 7.8986, "step": 477900 }, { "epoch": 1.9472669517633867, "grad_norm": 2.8039186000823975, "learning_rate": 0.0040349869590967065, "loss": 7.8739, "step": 478000 }, { "epoch": 1.9472669517633867, "eval_MaskedAccuracy": 0.49792528945519005, "eval_loss": 1.654314398765564, "eval_runtime": 172.2687, "eval_samples_per_second": 368.471, "eval_steps_per_second": 1.44, "step": 478000 }, { "epoch": 1.9476743297867682, "grad_norm": 2.728649377822876, "learning_rate": 0.004034595059217011, "loss": 7.8701, "step": 478100 }, { "epoch": 1.9480817078101498, "grad_norm": 5.797906875610352, "learning_rate": 0.004034203098864252, "loss": 7.8526, "step": 478200 }, { "epoch": 1.948489085833531, "grad_norm": 8.51126480102539, "learning_rate": 0.004033811078053917, "loss": 7.8827, "step": 478300 }, { "epoch": 1.9488964638569124, "grad_norm": 1.855424404144287, "learning_rate": 0.0040334189968015, "loss": 7.9023, "step": 478400 }, { "epoch": 1.949303841880294, "grad_norm": 1.8557064533233643, "learning_rate": 0.00403302685512251, "loss": 7.8768, "step": 478500 }, { "epoch": 1.9497112199036755, "grad_norm": 2.491926431655884, "learning_rate": 0.0040326346530324434, "loss": 7.877, "step": 478600 }, { "epoch": 1.950118597927057, "grad_norm": 3.1537840366363525, "learning_rate": 0.004032242390546807, "loss": 7.842, "step": 478700 }, { "epoch": 1.9505259759504385, "grad_norm": 6.29361629486084, "learning_rate": 0.004031850067681113, "loss": 7.8575, "step": 478800 }, { "epoch": 1.9509333539738198, "grad_norm": 1.7191801071166992, "learning_rate": 0.004031457684450868, "loss": 7.8868, "step": 478900 }, { "epoch": 1.9513407319972011, "grad_norm": 2.9524221420288086, "learning_rate": 0.004031065240871586, "loss": 7.8854, "step": 479000 }, { "epoch": 1.9513407319972011, "eval_MaskedAccuracy": 0.4979104902478439, "eval_loss": 1.6542531251907349, "eval_runtime": 154.3286, "eval_samples_per_second": 411.304, "eval_steps_per_second": 1.607, "step": 479000 }, { "epoch": 1.9517481100205827, "grad_norm": 2.500034809112549, "learning_rate": 0.004030672736958779, "loss": 7.8849, "step": 479100 }, { "epoch": 1.9521554880439642, "grad_norm": 1.9168018102645874, "learning_rate": 0.004030280172727973, "loss": 7.8631, "step": 479200 }, { "epoch": 1.9525628660673457, "grad_norm": 5.046385765075684, "learning_rate": 0.004029887548194683, "loss": 7.8896, "step": 479300 }, { "epoch": 1.9529702440907273, "grad_norm": 2.453263521194458, "learning_rate": 0.0040294948633744325, "loss": 7.8544, "step": 479400 }, { "epoch": 1.9533776221141086, "grad_norm": 6.771822452545166, "learning_rate": 0.004029102118282749, "loss": 7.8464, "step": 479500 }, { "epoch": 1.9537850001374901, "grad_norm": 1.666183590888977, "learning_rate": 0.00402870931293516, "loss": 7.8639, "step": 479600 }, { "epoch": 1.9541923781608714, "grad_norm": 3.315634250640869, "learning_rate": 0.004028316447347187, "loss": 7.8744, "step": 479700 }, { "epoch": 1.954599756184253, "grad_norm": 4.202087879180908, "learning_rate": 0.004027923521534369, "loss": 7.8341, "step": 479800 }, { "epoch": 1.9550071342076345, "grad_norm": 2.855398654937744, "learning_rate": 0.004027530535512241, "loss": 7.8417, "step": 479900 }, { "epoch": 1.955414512231016, "grad_norm": 1.7981055974960327, "learning_rate": 0.004027137489296336, "loss": 7.8472, "step": 480000 }, { "epoch": 1.955414512231016, "eval_MaskedAccuracy": 0.4980796443724053, "eval_loss": 1.653088927268982, "eval_runtime": 267.2063, "eval_samples_per_second": 237.554, "eval_steps_per_second": 0.928, "step": 480000 }, { "epoch": 1.9558218902543973, "grad_norm": 7.516102313995361, "learning_rate": 0.004026744382902195, "loss": 7.8659, "step": 480100 }, { "epoch": 1.9562292682777789, "grad_norm": 3.189872980117798, "learning_rate": 0.0040263512163453636, "loss": 7.8605, "step": 480200 }, { "epoch": 1.9566366463011602, "grad_norm": 3.456697702407837, "learning_rate": 0.00402595798964138, "loss": 7.9078, "step": 480300 }, { "epoch": 1.9570440243245417, "grad_norm": 1.9788827896118164, "learning_rate": 0.004025564702805789, "loss": 7.8707, "step": 480400 }, { "epoch": 1.9574514023479233, "grad_norm": 4.001372814178467, "learning_rate": 0.004025171355854143, "loss": 7.873, "step": 480500 }, { "epoch": 1.9578587803713048, "grad_norm": 1.7897107601165771, "learning_rate": 0.004024777948801997, "loss": 7.8635, "step": 480600 }, { "epoch": 1.9582661583946863, "grad_norm": 2.17834734916687, "learning_rate": 0.0040243844816649, "loss": 7.8759, "step": 480700 }, { "epoch": 1.9586735364180676, "grad_norm": 4.898231029510498, "learning_rate": 0.004023990954458407, "loss": 7.8657, "step": 480800 }, { "epoch": 1.959080914441449, "grad_norm": 3.996277332305908, "learning_rate": 0.004023597367198079, "loss": 7.8389, "step": 480900 }, { "epoch": 1.9594882924648305, "grad_norm": 2.099057197570801, "learning_rate": 0.004023203719899471, "loss": 7.8278, "step": 481000 }, { "epoch": 1.9594882924648305, "eval_MaskedAccuracy": 0.49734578672849916, "eval_loss": 1.6585326194763184, "eval_runtime": 293.0499, "eval_samples_per_second": 216.605, "eval_steps_per_second": 0.846, "step": 481000 }, { "epoch": 1.959895670488212, "grad_norm": 3.6001155376434326, "learning_rate": 0.0040228100125781485, "loss": 7.8929, "step": 481100 }, { "epoch": 1.9603030485115935, "grad_norm": 4.5417256355285645, "learning_rate": 0.0040224162452496765, "loss": 7.8526, "step": 481200 }, { "epoch": 1.960710426534975, "grad_norm": 3.4753739833831787, "learning_rate": 0.00402202241792963, "loss": 7.8555, "step": 481300 }, { "epoch": 1.9611178045583564, "grad_norm": 3.7852957248687744, "learning_rate": 0.0040216285306335755, "loss": 7.8576, "step": 481400 }, { "epoch": 1.9615251825817377, "grad_norm": 3.904681444168091, "learning_rate": 0.004021234583377078, "loss": 7.8712, "step": 481500 }, { "epoch": 1.9619325606051192, "grad_norm": 7.926792621612549, "learning_rate": 0.004020840576175721, "loss": 7.8576, "step": 481600 }, { "epoch": 1.9623399386285008, "grad_norm": 6.559882640838623, "learning_rate": 0.004020446509045077, "loss": 7.858, "step": 481700 }, { "epoch": 1.9627473166518823, "grad_norm": 2.3641748428344727, "learning_rate": 0.004020052382000732, "loss": 7.8343, "step": 481800 }, { "epoch": 1.9631546946752638, "grad_norm": 2.38405179977417, "learning_rate": 0.004019658195058259, "loss": 7.8967, "step": 481900 }, { "epoch": 1.9635620726986451, "grad_norm": 2.101755142211914, "learning_rate": 0.004019263948233245, "loss": 7.8933, "step": 482000 }, { "epoch": 1.9635620726986451, "eval_MaskedAccuracy": 0.49814752555018316, "eval_loss": 1.6608792543411255, "eval_runtime": 192.4729, "eval_samples_per_second": 329.792, "eval_steps_per_second": 1.288, "step": 482000 }, { "epoch": 1.9639694507220267, "grad_norm": 4.64115571975708, "learning_rate": 0.004018869641541278, "loss": 7.8328, "step": 482100 }, { "epoch": 1.964376828745408, "grad_norm": 1.7008798122406006, "learning_rate": 0.00401847527499795, "loss": 7.8733, "step": 482200 }, { "epoch": 1.9647842067687895, "grad_norm": 4.0245442390441895, "learning_rate": 0.004018080848618849, "loss": 7.8603, "step": 482300 }, { "epoch": 1.965191584792171, "grad_norm": 5.126151084899902, "learning_rate": 0.004017686362419571, "loss": 7.8705, "step": 482400 }, { "epoch": 1.9655989628155526, "grad_norm": 1.8972097635269165, "learning_rate": 0.004017291816415706, "loss": 7.8862, "step": 482500 }, { "epoch": 1.9660063408389339, "grad_norm": 6.24486780166626, "learning_rate": 0.004016897210622862, "loss": 7.9271, "step": 482600 }, { "epoch": 1.9664137188623154, "grad_norm": 4.7259697914123535, "learning_rate": 0.004016502545056628, "loss": 7.871, "step": 482700 }, { "epoch": 1.9668210968856967, "grad_norm": 1.8844248056411743, "learning_rate": 0.004016107819732618, "loss": 7.8451, "step": 482800 }, { "epoch": 1.9672284749090783, "grad_norm": 1.8297553062438965, "learning_rate": 0.004015713034666439, "loss": 7.8553, "step": 482900 }, { "epoch": 1.9676358529324598, "grad_norm": 4.883835792541504, "learning_rate": 0.004015318189873694, "loss": 7.8533, "step": 483000 }, { "epoch": 1.9676358529324598, "eval_MaskedAccuracy": 0.49614832957562766, "eval_loss": 1.6681147813796997, "eval_runtime": 194.7594, "eval_samples_per_second": 325.92, "eval_steps_per_second": 1.273, "step": 483000 }, { "epoch": 1.9680432309558413, "grad_norm": 2.663156509399414, "learning_rate": 0.004014923285369992, "loss": 7.8835, "step": 483100 }, { "epoch": 1.9684506089792229, "grad_norm": 2.0312697887420654, "learning_rate": 0.0040145283211709465, "loss": 7.8721, "step": 483200 }, { "epoch": 1.9688579870026042, "grad_norm": 2.1541903018951416, "learning_rate": 0.004014133297292174, "loss": 7.8663, "step": 483300 }, { "epoch": 1.9692653650259855, "grad_norm": 1.7734742164611816, "learning_rate": 0.0040137382137492884, "loss": 7.8796, "step": 483400 }, { "epoch": 1.969672743049367, "grad_norm": 7.295593738555908, "learning_rate": 0.0040133430705579145, "loss": 7.8988, "step": 483500 }, { "epoch": 1.9700801210727485, "grad_norm": 5.96539831161499, "learning_rate": 0.004012947867733674, "loss": 7.8703, "step": 483600 }, { "epoch": 1.97048749909613, "grad_norm": 4.10883903503418, "learning_rate": 0.004012552605292188, "loss": 7.863, "step": 483700 }, { "epoch": 1.9708948771195116, "grad_norm": 4.31527853012085, "learning_rate": 0.004012157283249081, "loss": 7.8343, "step": 483800 }, { "epoch": 1.971302255142893, "grad_norm": 7.688234329223633, "learning_rate": 0.004011761901619987, "loss": 7.857, "step": 483900 }, { "epoch": 1.9717096331662742, "grad_norm": 5.177247047424316, "learning_rate": 0.004011366460420539, "loss": 7.8835, "step": 484000 }, { "epoch": 1.9717096331662742, "eval_MaskedAccuracy": 0.498085815293933, "eval_loss": 1.6607754230499268, "eval_runtime": 187.5598, "eval_samples_per_second": 338.431, "eval_steps_per_second": 1.322, "step": 484000 }, { "epoch": 1.9721170111896558, "grad_norm": 2.6931183338165283, "learning_rate": 0.004010970959666362, "loss": 7.868, "step": 484100 }, { "epoch": 1.9725243892130373, "grad_norm": 2.1372954845428467, "learning_rate": 0.004010575399373104, "loss": 7.8838, "step": 484200 }, { "epoch": 1.9729317672364188, "grad_norm": 4.982362270355225, "learning_rate": 0.004010179779556398, "loss": 7.8403, "step": 484300 }, { "epoch": 1.9733391452598004, "grad_norm": 2.3827271461486816, "learning_rate": 0.004009784100231887, "loss": 7.8794, "step": 484400 }, { "epoch": 1.9737465232831817, "grad_norm": 4.464802265167236, "learning_rate": 0.004009388361415203, "loss": 7.8898, "step": 484500 }, { "epoch": 1.9741539013065632, "grad_norm": 1.570246934890747, "learning_rate": 0.004008992563122011, "loss": 7.8775, "step": 484600 }, { "epoch": 1.9745612793299445, "grad_norm": 4.5733723640441895, "learning_rate": 0.004008596705367944, "loss": 7.8572, "step": 484700 }, { "epoch": 1.974968657353326, "grad_norm": 2.002183675765991, "learning_rate": 0.004008200788168656, "loss": 7.8508, "step": 484800 }, { "epoch": 1.9753760353767076, "grad_norm": 3.640878438949585, "learning_rate": 0.004007804811539806, "loss": 7.844, "step": 484900 }, { "epoch": 1.9757834134000891, "grad_norm": 5.636162281036377, "learning_rate": 0.004007408775497042, "loss": 7.8452, "step": 485000 }, { "epoch": 1.9757834134000891, "eval_MaskedAccuracy": 0.4974547461623308, "eval_loss": 1.656631588935852, "eval_runtime": 174.8193, "eval_samples_per_second": 363.095, "eval_steps_per_second": 1.419, "step": 485000 }, { "epoch": 1.9761907914234704, "grad_norm": 7.8830037117004395, "learning_rate": 0.004007012680056021, "loss": 7.8794, "step": 485100 }, { "epoch": 1.976598169446852, "grad_norm": 1.603452444076538, "learning_rate": 0.004006616525232406, "loss": 7.8654, "step": 485200 }, { "epoch": 1.9770055474702333, "grad_norm": 2.641059160232544, "learning_rate": 0.004006220311041858, "loss": 7.8737, "step": 485300 }, { "epoch": 1.9774129254936148, "grad_norm": 2.383936643600464, "learning_rate": 0.004005824037500043, "loss": 7.873, "step": 485400 }, { "epoch": 1.9778203035169963, "grad_norm": 4.151749610900879, "learning_rate": 0.004005427704622632, "loss": 7.8806, "step": 485500 }, { "epoch": 1.9782276815403779, "grad_norm": 3.9820024967193604, "learning_rate": 0.004005031312425284, "loss": 7.8726, "step": 485600 }, { "epoch": 1.9786350595637594, "grad_norm": 1.2179168462753296, "learning_rate": 0.004004634860923675, "loss": 7.8509, "step": 485700 }, { "epoch": 1.9790424375871407, "grad_norm": 3.316539764404297, "learning_rate": 0.004004238350133474, "loss": 7.9288, "step": 485800 }, { "epoch": 1.979449815610522, "grad_norm": 1.5485013723373413, "learning_rate": 0.004003841780070365, "loss": 7.8605, "step": 485900 }, { "epoch": 1.9798571936339036, "grad_norm": 7.027229309082031, "learning_rate": 0.004003445150750029, "loss": 7.8808, "step": 486000 }, { "epoch": 1.9798571936339036, "eval_MaskedAccuracy": 0.497335183035008, "eval_loss": 1.665848731994629, "eval_runtime": 244.3163, "eval_samples_per_second": 259.811, "eval_steps_per_second": 1.015, "step": 486000 }, { "epoch": 1.980264571657285, "grad_norm": 3.497182607650757, "learning_rate": 0.004003048462188146, "loss": 7.8623, "step": 486100 }, { "epoch": 1.9806719496806666, "grad_norm": 5.535577774047852, "learning_rate": 0.0040026517144003924, "loss": 7.8717, "step": 486200 }, { "epoch": 1.9810793277040482, "grad_norm": 4.201848983764648, "learning_rate": 0.0040022549074024555, "loss": 7.8874, "step": 486300 }, { "epoch": 1.9814867057274295, "grad_norm": 3.704149007797241, "learning_rate": 0.0040018580412100185, "loss": 7.8675, "step": 486400 }, { "epoch": 1.9818940837508108, "grad_norm": 4.645805358886719, "learning_rate": 0.004001461115838779, "loss": 7.8413, "step": 486500 }, { "epoch": 1.9823014617741923, "grad_norm": 2.235441207885742, "learning_rate": 0.0040010641313044295, "loss": 7.8991, "step": 486600 }, { "epoch": 1.9827088397975738, "grad_norm": 5.017754077911377, "learning_rate": 0.004000667087622661, "loss": 7.8697, "step": 486700 }, { "epoch": 1.9831162178209554, "grad_norm": 2.454075336456299, "learning_rate": 0.004000269984809177, "loss": 7.8887, "step": 486800 }, { "epoch": 1.983523595844337, "grad_norm": 1.3705803155899048, "learning_rate": 0.0039998728228796765, "loss": 7.8627, "step": 486900 }, { "epoch": 1.9839309738677182, "grad_norm": 3.2323405742645264, "learning_rate": 0.003999475601849856, "loss": 7.8695, "step": 487000 }, { "epoch": 1.9839309738677182, "eval_MaskedAccuracy": 0.49819125713116347, "eval_loss": 1.6521326303482056, "eval_runtime": 175.3211, "eval_samples_per_second": 362.056, "eval_steps_per_second": 1.415, "step": 487000 }, { "epoch": 1.9843383518910997, "grad_norm": 2.3861207962036133, "learning_rate": 0.003999078321735422, "loss": 7.843, "step": 487100 }, { "epoch": 1.984745729914481, "grad_norm": 5.130801200866699, "learning_rate": 0.003998680982552084, "loss": 7.8155, "step": 487200 }, { "epoch": 1.9851531079378626, "grad_norm": 5.160671234130859, "learning_rate": 0.0039982835843155465, "loss": 7.892, "step": 487300 }, { "epoch": 1.9855604859612441, "grad_norm": 4.723651885986328, "learning_rate": 0.00399788612704152, "loss": 7.8924, "step": 487400 }, { "epoch": 1.9859678639846257, "grad_norm": 2.9012949466705322, "learning_rate": 0.00399748861074572, "loss": 7.8565, "step": 487500 }, { "epoch": 1.986375242008007, "grad_norm": 4.409277439117432, "learning_rate": 0.0039970910354438635, "loss": 7.8184, "step": 487600 }, { "epoch": 1.9867826200313885, "grad_norm": 4.830262660980225, "learning_rate": 0.003996693401151668, "loss": 7.849, "step": 487700 }, { "epoch": 1.9871899980547698, "grad_norm": 7.154356002807617, "learning_rate": 0.003996295707884856, "loss": 7.8645, "step": 487800 }, { "epoch": 1.9875973760781513, "grad_norm": 2.3467037677764893, "learning_rate": 0.003995897955659146, "loss": 7.8955, "step": 487900 }, { "epoch": 1.9880047541015329, "grad_norm": 4.408042907714844, "learning_rate": 0.003995500144490269, "loss": 7.8682, "step": 488000 }, { "epoch": 1.9880047541015329, "eval_MaskedAccuracy": 0.4977144905082705, "eval_loss": 1.656249761581421, "eval_runtime": 171.7931, "eval_samples_per_second": 369.491, "eval_steps_per_second": 1.444, "step": 488000 }, { "epoch": 1.9884121321249144, "grad_norm": 6.825170516967773, "learning_rate": 0.003995102274393941, "loss": 7.8591, "step": 488100 }, { "epoch": 1.988819510148296, "grad_norm": 4.52269172668457, "learning_rate": 0.003994704345385907, "loss": 7.8472, "step": 488200 }, { "epoch": 1.9892268881716773, "grad_norm": 3.2726306915283203, "learning_rate": 0.0039943063574818995, "loss": 7.8211, "step": 488300 }, { "epoch": 1.9896342661950586, "grad_norm": 2.61883544921875, "learning_rate": 0.003993908310697633, "loss": 7.8732, "step": 488400 }, { "epoch": 1.99004164421844, "grad_norm": 4.642792224884033, "learning_rate": 0.0039935102050488655, "loss": 7.85, "step": 488500 }, { "epoch": 1.9904490222418216, "grad_norm": 2.5082287788391113, "learning_rate": 0.003993112040551326, "loss": 7.8357, "step": 488600 }, { "epoch": 1.9908564002652032, "grad_norm": 4.244477272033691, "learning_rate": 0.003992713817220759, "loss": 7.8668, "step": 488700 }, { "epoch": 1.9912637782885847, "grad_norm": 3.799135208129883, "learning_rate": 0.003992315535072908, "loss": 7.8422, "step": 488800 }, { "epoch": 1.991671156311966, "grad_norm": 2.4758245944976807, "learning_rate": 0.003991917194123517, "loss": 7.8798, "step": 488900 }, { "epoch": 1.9920785343353473, "grad_norm": 3.006605386734009, "learning_rate": 0.003991518794388338, "loss": 7.8743, "step": 489000 }, { "epoch": 1.9920785343353473, "eval_MaskedAccuracy": 0.49804167414093853, "eval_loss": 1.6533012390136719, "eval_runtime": 186.1393, "eval_samples_per_second": 341.014, "eval_steps_per_second": 1.332, "step": 489000 }, { "epoch": 1.9924859123587289, "grad_norm": 5.0802459716796875, "learning_rate": 0.003991120335883123, "loss": 7.8781, "step": 489100 }, { "epoch": 1.9928932903821104, "grad_norm": 3.296128988265991, "learning_rate": 0.003990721818623624, "loss": 7.8878, "step": 489200 }, { "epoch": 1.993300668405492, "grad_norm": 4.713596820831299, "learning_rate": 0.003990323242625591, "loss": 7.8469, "step": 489300 }, { "epoch": 1.9937080464288734, "grad_norm": 7.093138217926025, "learning_rate": 0.003989924607904786, "loss": 7.8772, "step": 489400 }, { "epoch": 1.9941154244522548, "grad_norm": 3.2654430866241455, "learning_rate": 0.0039895259144769706, "loss": 7.869, "step": 489500 }, { "epoch": 1.9945228024756363, "grad_norm": 4.690536975860596, "learning_rate": 0.0039891271623579095, "loss": 7.8711, "step": 489600 }, { "epoch": 1.9949301804990176, "grad_norm": 4.180046081542969, "learning_rate": 0.00398872835156336, "loss": 7.861, "step": 489700 }, { "epoch": 1.9953375585223991, "grad_norm": 1.7061024904251099, "learning_rate": 0.003988329482109093, "loss": 7.8457, "step": 489800 }, { "epoch": 1.9957449365457807, "grad_norm": 8.832228660583496, "learning_rate": 0.003987930554010878, "loss": 7.825, "step": 489900 }, { "epoch": 1.9961523145691622, "grad_norm": 5.911707878112793, "learning_rate": 0.003987531567284494, "loss": 7.8635, "step": 490000 }, { "epoch": 1.9961523145691622, "eval_MaskedAccuracy": 0.4982665301124076, "eval_loss": 1.6461238861083984, "eval_runtime": 186.756, "eval_samples_per_second": 339.887, "eval_steps_per_second": 1.328, "step": 490000 }, { "epoch": 1.9965596925925435, "grad_norm": 3.7599871158599854, "learning_rate": 0.003987132521945702, "loss": 7.8498, "step": 490100 }, { "epoch": 1.996967070615925, "grad_norm": 2.7844014167785645, "learning_rate": 0.003986733418010286, "loss": 7.8577, "step": 490200 }, { "epoch": 1.9973744486393064, "grad_norm": 2.239363193511963, "learning_rate": 0.003986334255494022, "loss": 7.8533, "step": 490300 }, { "epoch": 1.9977818266626879, "grad_norm": 2.896563768386841, "learning_rate": 0.003985935034412689, "loss": 7.8719, "step": 490400 }, { "epoch": 1.9981892046860694, "grad_norm": 3.142861843109131, "learning_rate": 0.003985535754782079, "loss": 7.846, "step": 490500 }, { "epoch": 1.998596582709451, "grad_norm": 1.930145502090454, "learning_rate": 0.003985136416617973, "loss": 7.8546, "step": 490600 }, { "epoch": 1.9990039607328325, "grad_norm": 2.043785572052002, "learning_rate": 0.003984737019936155, "loss": 7.9126, "step": 490700 }, { "epoch": 1.9994113387562138, "grad_norm": 4.26570463180542, "learning_rate": 0.003984337564752418, "loss": 7.8457, "step": 490800 }, { "epoch": 1.999818716779595, "grad_norm": 2.3818819522857666, "learning_rate": 0.003983938051082552, "loss": 7.8458, "step": 490900 }, { "epoch": 2.0002260948029766, "grad_norm": 3.318009614944458, "learning_rate": 0.003983538478942357, "loss": 7.8652, "step": 491000 }, { "epoch": 2.0002260948029766, "eval_MaskedAccuracy": 0.4978805718052601, "eval_loss": 1.6651256084442139, "eval_runtime": 148.2624, "eval_samples_per_second": 428.133, "eval_steps_per_second": 1.673, "step": 491000 }, { "epoch": 2.000633472826358, "grad_norm": 2.78810977935791, "learning_rate": 0.003983138848347631, "loss": 7.8623, "step": 491100 }, { "epoch": 2.0010408508497397, "grad_norm": 3.5467097759246826, "learning_rate": 0.003982739159314167, "loss": 7.8841, "step": 491200 }, { "epoch": 2.0014482288731212, "grad_norm": 10.010056495666504, "learning_rate": 0.003982339411857772, "loss": 7.8765, "step": 491300 }, { "epoch": 2.0018556068965028, "grad_norm": 2.5871756076812744, "learning_rate": 0.003981939605994252, "loss": 7.8858, "step": 491400 }, { "epoch": 2.002262984919884, "grad_norm": 6.318182468414307, "learning_rate": 0.003981539741739408, "loss": 7.8725, "step": 491500 }, { "epoch": 2.0026703629432654, "grad_norm": 2.4176394939422607, "learning_rate": 0.003981139819109054, "loss": 7.8824, "step": 491600 }, { "epoch": 2.003077740966647, "grad_norm": 4.502716064453125, "learning_rate": 0.003980739838118991, "loss": 7.8732, "step": 491700 }, { "epoch": 2.0034851189900285, "grad_norm": 2.8272433280944824, "learning_rate": 0.003980339798785041, "loss": 7.8563, "step": 491800 }, { "epoch": 2.00389249701341, "grad_norm": 2.6620304584503174, "learning_rate": 0.00397993970112302, "loss": 7.8728, "step": 491900 }, { "epoch": 2.0042998750367915, "grad_norm": 4.319948673248291, "learning_rate": 0.003979539545148746, "loss": 7.8687, "step": 492000 }, { "epoch": 2.0042998750367915, "eval_MaskedAccuracy": 0.49793333894702246, "eval_loss": 1.6577565670013428, "eval_runtime": 149.5513, "eval_samples_per_second": 424.443, "eval_steps_per_second": 1.658, "step": 492000 }, { "epoch": 2.0047072530601726, "grad_norm": 1.5957372188568115, "learning_rate": 0.003979139330878037, "loss": 7.8855, "step": 492100 }, { "epoch": 2.005114631083554, "grad_norm": 2.106097936630249, "learning_rate": 0.003978739058326709, "loss": 7.8231, "step": 492200 }, { "epoch": 2.0055220091069357, "grad_norm": 4.910973072052002, "learning_rate": 0.0039783387275105974, "loss": 7.8617, "step": 492300 }, { "epoch": 2.005929387130317, "grad_norm": 13.576258659362793, "learning_rate": 0.003977938338445527, "loss": 7.8684, "step": 492400 }, { "epoch": 2.0063367651536987, "grad_norm": 8.415559768676758, "learning_rate": 0.003977537891147327, "loss": 7.8494, "step": 492500 }, { "epoch": 2.0067441431770803, "grad_norm": 3.934183120727539, "learning_rate": 0.003977137385631824, "loss": 7.867, "step": 492600 }, { "epoch": 2.0071515212004614, "grad_norm": 4.725040435791016, "learning_rate": 0.003976736821914864, "loss": 7.857, "step": 492700 }, { "epoch": 2.007558899223843, "grad_norm": 2.1035566329956055, "learning_rate": 0.00397633620001227, "loss": 7.8536, "step": 492800 }, { "epoch": 2.0079662772472244, "grad_norm": 5.978846073150635, "learning_rate": 0.003975935519939892, "loss": 7.8704, "step": 492900 }, { "epoch": 2.008373655270606, "grad_norm": 2.2010936737060547, "learning_rate": 0.003975534781713568, "loss": 7.8661, "step": 493000 }, { "epoch": 2.008373655270606, "eval_MaskedAccuracy": 0.49859173485021163, "eval_loss": 1.6547033786773682, "eval_runtime": 149.7466, "eval_samples_per_second": 423.89, "eval_steps_per_second": 1.656, "step": 493000 }, { "epoch": 2.0087810332939875, "grad_norm": 5.013829708099365, "learning_rate": 0.003975133985349135, "loss": 7.8827, "step": 493100 }, { "epoch": 2.009188411317369, "grad_norm": 2.1907660961151123, "learning_rate": 0.003974733130862448, "loss": 7.9041, "step": 493200 }, { "epoch": 2.00959578934075, "grad_norm": 1.990098476409912, "learning_rate": 0.003974332218269345, "loss": 7.8904, "step": 493300 }, { "epoch": 2.0100031673641316, "grad_norm": 2.6972572803497314, "learning_rate": 0.003973931247585682, "loss": 7.872, "step": 493400 }, { "epoch": 2.010410545387513, "grad_norm": 7.332125186920166, "learning_rate": 0.003973530218827305, "loss": 7.8676, "step": 493500 }, { "epoch": 2.0108179234108947, "grad_norm": 7.7640299797058105, "learning_rate": 0.00397312913201008, "loss": 7.8457, "step": 493600 }, { "epoch": 2.0112253014342762, "grad_norm": 3.2454802989959717, "learning_rate": 0.003972727987149856, "loss": 7.874, "step": 493700 }, { "epoch": 2.011632679457658, "grad_norm": 4.539496898651123, "learning_rate": 0.003972326784262497, "loss": 7.8526, "step": 493800 }, { "epoch": 2.0120400574810393, "grad_norm": 1.4194692373275757, "learning_rate": 0.003971925523363862, "loss": 7.8871, "step": 493900 }, { "epoch": 2.0124474355044204, "grad_norm": 2.3241472244262695, "learning_rate": 0.003971524204469813, "loss": 7.8561, "step": 494000 }, { "epoch": 2.0124474355044204, "eval_MaskedAccuracy": 0.4992459584295612, "eval_loss": 1.6490083932876587, "eval_runtime": 148.9955, "eval_samples_per_second": 426.026, "eval_steps_per_second": 1.664, "step": 494000 }, { "epoch": 2.012854813527802, "grad_norm": 2.2037127017974854, "learning_rate": 0.003971122827596219, "loss": 7.8539, "step": 494100 }, { "epoch": 2.0132621915511835, "grad_norm": 2.7499496936798096, "learning_rate": 0.003970721392758945, "loss": 7.8661, "step": 494200 }, { "epoch": 2.013669569574565, "grad_norm": 2.684345006942749, "learning_rate": 0.003970319899973867, "loss": 7.8737, "step": 494300 }, { "epoch": 2.0140769475979465, "grad_norm": 2.927569627761841, "learning_rate": 0.003969918349256856, "loss": 7.8905, "step": 494400 }, { "epoch": 2.014484325621328, "grad_norm": 3.2021238803863525, "learning_rate": 0.0039695167406237864, "loss": 7.8662, "step": 494500 }, { "epoch": 2.014891703644709, "grad_norm": 1.8485453128814697, "learning_rate": 0.0039691150740905415, "loss": 7.8778, "step": 494600 }, { "epoch": 2.0152990816680907, "grad_norm": 8.958436965942383, "learning_rate": 0.0039687133496729915, "loss": 7.9229, "step": 494700 }, { "epoch": 2.015706459691472, "grad_norm": 1.9357823133468628, "learning_rate": 0.003968311567387029, "loss": 7.8505, "step": 494800 }, { "epoch": 2.0161138377148538, "grad_norm": 1.8658493757247925, "learning_rate": 0.003967909727248532, "loss": 7.8604, "step": 494900 }, { "epoch": 2.0165212157382353, "grad_norm": 2.6488263607025146, "learning_rate": 0.003967507829273389, "loss": 7.8379, "step": 495000 }, { "epoch": 2.0165212157382353, "eval_MaskedAccuracy": 0.4984190639202395, "eval_loss": 1.6625776290893555, "eval_runtime": 149.651, "eval_samples_per_second": 424.16, "eval_steps_per_second": 1.657, "step": 495000 }, { "epoch": 2.016928593761617, "grad_norm": 3.5286457538604736, "learning_rate": 0.003967105873477492, "loss": 7.8431, "step": 495100 }, { "epoch": 2.017335971784998, "grad_norm": 2.9637646675109863, "learning_rate": 0.003966703859876732, "loss": 7.8244, "step": 495200 }, { "epoch": 2.0177433498083794, "grad_norm": 1.8317160606384277, "learning_rate": 0.0039663017884870025, "loss": 7.8177, "step": 495300 }, { "epoch": 2.018150727831761, "grad_norm": 6.571487903594971, "learning_rate": 0.003965899659324195, "loss": 7.856, "step": 495400 }, { "epoch": 2.0185581058551425, "grad_norm": 4.068114757537842, "learning_rate": 0.003965497472404212, "loss": 7.8958, "step": 495500 }, { "epoch": 2.018965483878524, "grad_norm": 4.081752300262451, "learning_rate": 0.0039650952277429504, "loss": 7.8686, "step": 495600 }, { "epoch": 2.0193728619019056, "grad_norm": 3.2896957397460938, "learning_rate": 0.003964692925356319, "loss": 7.8418, "step": 495700 }, { "epoch": 2.0197802399252867, "grad_norm": 2.8759496212005615, "learning_rate": 0.003964290565260214, "loss": 7.8325, "step": 495800 }, { "epoch": 2.020187617948668, "grad_norm": 6.6146416664123535, "learning_rate": 0.00396388814747055, "loss": 7.8462, "step": 495900 }, { "epoch": 2.0205949959720497, "grad_norm": 3.111217975616455, "learning_rate": 0.003963485672003234, "loss": 7.8373, "step": 496000 }, { "epoch": 2.0205949959720497, "eval_MaskedAccuracy": 0.49895371591193666, "eval_loss": 1.6523817777633667, "eval_runtime": 149.372, "eval_samples_per_second": 424.952, "eval_steps_per_second": 1.66, "step": 496000 }, { "epoch": 2.0210023739954313, "grad_norm": 2.6997292041778564, "learning_rate": 0.003963083138874182, "loss": 7.8901, "step": 496100 }, { "epoch": 2.021409752018813, "grad_norm": 1.9682706594467163, "learning_rate": 0.0039626805480993, "loss": 7.8991, "step": 496200 }, { "epoch": 2.0218171300421943, "grad_norm": 8.418217658996582, "learning_rate": 0.003962277899694509, "loss": 7.8693, "step": 496300 }, { "epoch": 2.022224508065576, "grad_norm": 1.703735113143921, "learning_rate": 0.003961875193675732, "loss": 7.8759, "step": 496400 }, { "epoch": 2.022631886088957, "grad_norm": 3.291315793991089, "learning_rate": 0.003961472430058886, "loss": 7.8827, "step": 496500 }, { "epoch": 2.0230392641123385, "grad_norm": 9.770030975341797, "learning_rate": 0.003961069608859893, "loss": 7.8737, "step": 496600 }, { "epoch": 2.02344664213572, "grad_norm": 2.2468302249908447, "learning_rate": 0.003960666730094681, "loss": 7.8525, "step": 496700 }, { "epoch": 2.0238540201591015, "grad_norm": 2.570030689239502, "learning_rate": 0.003960263793779182, "loss": 7.8559, "step": 496800 }, { "epoch": 2.024261398182483, "grad_norm": 9.67927074432373, "learning_rate": 0.003959860799929313, "loss": 7.8569, "step": 496900 }, { "epoch": 2.0246687762058646, "grad_norm": 5.694683074951172, "learning_rate": 0.0039594577485610235, "loss": 7.8907, "step": 497000 }, { "epoch": 2.0246687762058646, "eval_MaskedAccuracy": 0.49784782853504506, "eval_loss": 1.651322841644287, "eval_runtime": 147.3449, "eval_samples_per_second": 430.799, "eval_steps_per_second": 1.683, "step": 497000 }, { "epoch": 2.0250761542292457, "grad_norm": 5.3224334716796875, "learning_rate": 0.0039590546396902375, "loss": 7.883, "step": 497100 }, { "epoch": 2.0254835322526272, "grad_norm": 1.8543610572814941, "learning_rate": 0.0039586514733329, "loss": 7.8683, "step": 497200 }, { "epoch": 2.0258909102760088, "grad_norm": 2.410493850708008, "learning_rate": 0.003958248249504936, "loss": 7.8893, "step": 497300 }, { "epoch": 2.0262982882993903, "grad_norm": 1.44971764087677, "learning_rate": 0.0039578449682223, "loss": 7.8402, "step": 497400 }, { "epoch": 2.026705666322772, "grad_norm": 6.2107086181640625, "learning_rate": 0.003957441629500934, "loss": 7.861, "step": 497500 }, { "epoch": 2.0271130443461534, "grad_norm": 1.6982815265655518, "learning_rate": 0.00395703823335678, "loss": 7.8832, "step": 497600 }, { "epoch": 2.0275204223695344, "grad_norm": 8.036022186279297, "learning_rate": 0.00395663477980579, "loss": 7.8428, "step": 497700 }, { "epoch": 2.027927800392916, "grad_norm": 4.244318962097168, "learning_rate": 0.003956231268863912, "loss": 7.848, "step": 497800 }, { "epoch": 2.0283351784162975, "grad_norm": 3.505483388900757, "learning_rate": 0.003955827700547101, "loss": 7.8499, "step": 497900 }, { "epoch": 2.028742556439679, "grad_norm": 1.8905874490737915, "learning_rate": 0.003955424074871312, "loss": 7.8887, "step": 498000 }, { "epoch": 2.028742556439679, "eval_MaskedAccuracy": 0.4991002723353723, "eval_loss": 1.649802803993225, "eval_runtime": 154.2681, "eval_samples_per_second": 411.465, "eval_steps_per_second": 1.608, "step": 498000 }, { "epoch": 2.0291499344630606, "grad_norm": 3.2744388580322266, "learning_rate": 0.003955020391852497, "loss": 7.8617, "step": 498100 }, { "epoch": 2.029557312486442, "grad_norm": 2.8778340816497803, "learning_rate": 0.003954616651506626, "loss": 7.8796, "step": 498200 }, { "epoch": 2.029964690509823, "grad_norm": 3.6095352172851562, "learning_rate": 0.003954212853849658, "loss": 7.8673, "step": 498300 }, { "epoch": 2.0303720685332047, "grad_norm": 9.938542366027832, "learning_rate": 0.00395380899889755, "loss": 7.8706, "step": 498400 }, { "epoch": 2.0307794465565863, "grad_norm": 7.641909599304199, "learning_rate": 0.003953405086666274, "loss": 7.853, "step": 498500 }, { "epoch": 2.031186824579968, "grad_norm": 1.5943167209625244, "learning_rate": 0.003953001117171798, "loss": 7.8503, "step": 498600 }, { "epoch": 2.0315942026033493, "grad_norm": 2.4938507080078125, "learning_rate": 0.003952597090430098, "loss": 7.845, "step": 498700 }, { "epoch": 2.032001580626731, "grad_norm": 3.876598834991455, "learning_rate": 0.003952193006457136, "loss": 7.846, "step": 498800 }, { "epoch": 2.0324089586501124, "grad_norm": 7.685597896575928, "learning_rate": 0.003951788865268895, "loss": 7.8387, "step": 498900 }, { "epoch": 2.0328163366734935, "grad_norm": 2.6308696269989014, "learning_rate": 0.003951384666881356, "loss": 7.8479, "step": 499000 }, { "epoch": 2.0328163366734935, "eval_MaskedAccuracy": 0.4985315018264538, "eval_loss": 1.650436282157898, "eval_runtime": 154.2302, "eval_samples_per_second": 411.567, "eval_steps_per_second": 1.608, "step": 499000 }, { "epoch": 2.033223714696875, "grad_norm": 6.766794681549072, "learning_rate": 0.0039509804113104985, "loss": 7.886, "step": 499100 }, { "epoch": 2.0336310927202566, "grad_norm": 3.8292505741119385, "learning_rate": 0.003950576098572298, "loss": 7.8787, "step": 499200 }, { "epoch": 2.034038470743638, "grad_norm": 2.4238884449005127, "learning_rate": 0.003950171728682741, "loss": 7.8691, "step": 499300 }, { "epoch": 2.0344458487670196, "grad_norm": 9.240036964416504, "learning_rate": 0.003949767301657816, "loss": 7.8608, "step": 499400 }, { "epoch": 2.034853226790401, "grad_norm": 3.8402020931243896, "learning_rate": 0.0039493628175135105, "loss": 7.868, "step": 499500 }, { "epoch": 2.0352606048137822, "grad_norm": 2.0422768592834473, "learning_rate": 0.003948958276265822, "loss": 7.8657, "step": 499600 }, { "epoch": 2.0356679828371638, "grad_norm": 2.8022241592407227, "learning_rate": 0.003948553677930741, "loss": 7.8771, "step": 499700 }, { "epoch": 2.0360753608605453, "grad_norm": 5.078003883361816, "learning_rate": 0.003948149022524265, "loss": 7.8755, "step": 499800 }, { "epoch": 2.036482738883927, "grad_norm": 2.3374109268188477, "learning_rate": 0.003947744310062384, "loss": 7.8508, "step": 499900 }, { "epoch": 2.0368901169073084, "grad_norm": 5.849517345428467, "learning_rate": 0.003947339540561103, "loss": 7.8546, "step": 500000 }, { "epoch": 2.0368901169073084, "eval_MaskedAccuracy": 0.4980729870603918, "eval_loss": 1.6496412754058838, "eval_runtime": 150.4494, "eval_samples_per_second": 421.909, "eval_steps_per_second": 1.648, "step": 500000 }, { "epoch": 2.03729749493069, "grad_norm": 1.8953150510787964, "learning_rate": 0.003946934714036427, "loss": 7.8451, "step": 500100 }, { "epoch": 2.037704872954071, "grad_norm": 2.111159563064575, "learning_rate": 0.003946529830504364, "loss": 7.8453, "step": 500200 }, { "epoch": 2.0381122509774525, "grad_norm": 2.262208938598633, "learning_rate": 0.003946124889980916, "loss": 7.8774, "step": 500300 }, { "epoch": 2.038519629000834, "grad_norm": 4.010697841644287, "learning_rate": 0.0039457198924820885, "loss": 7.8669, "step": 500400 }, { "epoch": 2.0389270070242156, "grad_norm": 5.616472244262695, "learning_rate": 0.003945314838023902, "loss": 7.8648, "step": 500500 }, { "epoch": 2.039334385047597, "grad_norm": 5.5384111404418945, "learning_rate": 0.003944909726622365, "loss": 7.845, "step": 500600 }, { "epoch": 2.0397417630709787, "grad_norm": 4.254812717437744, "learning_rate": 0.0039445045582934945, "loss": 7.8772, "step": 500700 }, { "epoch": 2.0401491410943597, "grad_norm": 2.6096653938293457, "learning_rate": 0.003944099333053311, "loss": 7.856, "step": 500800 }, { "epoch": 2.0405565191177413, "grad_norm": 2.5192205905914307, "learning_rate": 0.003943694050917833, "loss": 7.8865, "step": 500900 }, { "epoch": 2.040963897141123, "grad_norm": 1.8438318967819214, "learning_rate": 0.003943288711903082, "loss": 7.8819, "step": 501000 }, { "epoch": 2.040963897141123, "eval_MaskedAccuracy": 0.4982288280269719, "eval_loss": 1.647727370262146, "eval_runtime": 157.4726, "eval_samples_per_second": 403.092, "eval_steps_per_second": 1.575, "step": 501000 }, { "epoch": 2.0413712751645043, "grad_norm": 5.1324076652526855, "learning_rate": 0.003942883316025085, "loss": 7.8547, "step": 501100 }, { "epoch": 2.041778653187886, "grad_norm": 4.713312149047852, "learning_rate": 0.003942477863299873, "loss": 7.8933, "step": 501200 }, { "epoch": 2.0421860312112674, "grad_norm": 3.2050485610961914, "learning_rate": 0.003942072353743475, "loss": 7.8831, "step": 501300 }, { "epoch": 2.042593409234649, "grad_norm": 5.312648773193359, "learning_rate": 0.00394166678737192, "loss": 7.8576, "step": 501400 }, { "epoch": 2.04300078725803, "grad_norm": 1.825149416923523, "learning_rate": 0.003941261164201243, "loss": 7.8245, "step": 501500 }, { "epoch": 2.0434081652814116, "grad_norm": 6.237276554107666, "learning_rate": 0.00394085548424749, "loss": 7.8614, "step": 501600 }, { "epoch": 2.043815543304793, "grad_norm": 3.27168345451355, "learning_rate": 0.003940449747526681, "loss": 7.8625, "step": 501700 }, { "epoch": 2.0442229213281746, "grad_norm": 5.12900447845459, "learning_rate": 0.003940043954054862, "loss": 7.8834, "step": 501800 }, { "epoch": 2.044630299351556, "grad_norm": 3.0178205966949463, "learning_rate": 0.003939638103848086, "loss": 7.9093, "step": 501900 }, { "epoch": 2.0450376773749377, "grad_norm": 2.8164114952087402, "learning_rate": 0.003939232196922395, "loss": 7.8627, "step": 502000 }, { "epoch": 2.0450376773749377, "eval_MaskedAccuracy": 0.49816018406318197, "eval_loss": 1.6543567180633545, "eval_runtime": 163.2898, "eval_samples_per_second": 388.732, "eval_steps_per_second": 1.519, "step": 502000 }, { "epoch": 2.045445055398319, "grad_norm": 5.639309883117676, "learning_rate": 0.003938826233293827, "loss": 7.8648, "step": 502100 }, { "epoch": 2.0458524334217003, "grad_norm": 3.8196420669555664, "learning_rate": 0.003938420212978441, "loss": 7.8399, "step": 502200 }, { "epoch": 2.046259811445082, "grad_norm": 6.940499782562256, "learning_rate": 0.003938014135992289, "loss": 7.86, "step": 502300 }, { "epoch": 2.0466671894684634, "grad_norm": 4.158481121063232, "learning_rate": 0.0039376080023514224, "loss": 7.8347, "step": 502400 }, { "epoch": 2.047074567491845, "grad_norm": 6.34360408782959, "learning_rate": 0.0039372018120719015, "loss": 7.8619, "step": 502500 }, { "epoch": 2.0474819455152264, "grad_norm": 1.8799878358840942, "learning_rate": 0.003936795565169784, "loss": 7.8507, "step": 502600 }, { "epoch": 2.0478893235386075, "grad_norm": 6.622459411621094, "learning_rate": 0.003936389261661131, "loss": 7.8526, "step": 502700 }, { "epoch": 2.048296701561989, "grad_norm": 3.0950870513916016, "learning_rate": 0.003935982901562, "loss": 7.8605, "step": 502800 }, { "epoch": 2.0487040795853706, "grad_norm": 2.631889581680298, "learning_rate": 0.0039355764848884615, "loss": 7.8505, "step": 502900 }, { "epoch": 2.049111457608752, "grad_norm": 5.816521167755127, "learning_rate": 0.003935170011656585, "loss": 7.8576, "step": 503000 }, { "epoch": 2.049111457608752, "eval_MaskedAccuracy": 0.49909299324845074, "eval_loss": 1.6481876373291016, "eval_runtime": 208.8966, "eval_samples_per_second": 303.863, "eval_steps_per_second": 1.187, "step": 503000 }, { "epoch": 2.0495188356321337, "grad_norm": 4.197556018829346, "learning_rate": 0.003934763481882442, "loss": 7.8734, "step": 503100 }, { "epoch": 2.049926213655515, "grad_norm": 4.809361934661865, "learning_rate": 0.003934356895582095, "loss": 7.8417, "step": 503200 }, { "epoch": 2.0503335916788963, "grad_norm": 4.074956893920898, "learning_rate": 0.003933950252771629, "loss": 7.8513, "step": 503300 }, { "epoch": 2.050740969702278, "grad_norm": 6.168447971343994, "learning_rate": 0.0039335435534671144, "loss": 7.8305, "step": 503400 }, { "epoch": 2.0511483477256593, "grad_norm": 2.2486164569854736, "learning_rate": 0.0039331367976846365, "loss": 7.8466, "step": 503500 }, { "epoch": 2.051555725749041, "grad_norm": 2.659501791000366, "learning_rate": 0.003932729985440271, "loss": 7.8763, "step": 503600 }, { "epoch": 2.0519631037724224, "grad_norm": 2.3019237518310547, "learning_rate": 0.003932323116750102, "loss": 7.8584, "step": 503700 }, { "epoch": 2.052370481795804, "grad_norm": 3.082778215408325, "learning_rate": 0.003931916191630222, "loss": 7.8585, "step": 503800 }, { "epoch": 2.0527778598191855, "grad_norm": 5.441107749938965, "learning_rate": 0.003931509210096709, "loss": 7.8785, "step": 503900 }, { "epoch": 2.0531852378425666, "grad_norm": 3.09259295463562, "learning_rate": 0.003931102172165656, "loss": 7.8469, "step": 504000 }, { "epoch": 2.0531852378425666, "eval_MaskedAccuracy": 0.498288683919097, "eval_loss": 1.6587414741516113, "eval_runtime": 165.1973, "eval_samples_per_second": 384.244, "eval_steps_per_second": 1.501, "step": 504000 }, { "epoch": 2.053592615865948, "grad_norm": 2.1826281547546387, "learning_rate": 0.003930695077853157, "loss": 7.8734, "step": 504100 }, { "epoch": 2.0539999938893296, "grad_norm": 4.1279296875, "learning_rate": 0.003930287927175303, "loss": 7.8529, "step": 504200 }, { "epoch": 2.054407371912711, "grad_norm": 1.551027536392212, "learning_rate": 0.003929880720148198, "loss": 7.8445, "step": 504300 }, { "epoch": 2.0548147499360927, "grad_norm": 2.608344793319702, "learning_rate": 0.003929473456787942, "loss": 7.8375, "step": 504400 }, { "epoch": 2.0552221279594742, "grad_norm": 5.23867130279541, "learning_rate": 0.00392906613711063, "loss": 7.8857, "step": 504500 }, { "epoch": 2.0556295059828553, "grad_norm": 4.096222400665283, "learning_rate": 0.003928658761132365, "loss": 7.8897, "step": 504600 }, { "epoch": 2.056036884006237, "grad_norm": 6.972052097320557, "learning_rate": 0.003928251328869257, "loss": 7.8818, "step": 504700 }, { "epoch": 2.0564442620296184, "grad_norm": 3.181206226348877, "learning_rate": 0.003927843840337411, "loss": 7.8621, "step": 504800 }, { "epoch": 2.056851640053, "grad_norm": 1.3935866355895996, "learning_rate": 0.00392743629555294, "loss": 7.8976, "step": 504900 }, { "epoch": 2.0572590180763815, "grad_norm": 4.864538669586182, "learning_rate": 0.003927028694531953, "loss": 7.8811, "step": 505000 }, { "epoch": 2.0572590180763815, "eval_MaskedAccuracy": 0.497431730936525, "eval_loss": 1.6568992137908936, "eval_runtime": 178.3794, "eval_samples_per_second": 355.848, "eval_steps_per_second": 1.39, "step": 505000 }, { "epoch": 2.057666396099763, "grad_norm": 2.557091236114502, "learning_rate": 0.003926621037290565, "loss": 7.8348, "step": 505100 }, { "epoch": 2.058073774123144, "grad_norm": 3.1932973861694336, "learning_rate": 0.003926213323844895, "loss": 7.8379, "step": 505200 }, { "epoch": 2.0584811521465256, "grad_norm": 3.5232841968536377, "learning_rate": 0.003925805554211059, "loss": 7.8917, "step": 505300 }, { "epoch": 2.058888530169907, "grad_norm": 1.8708343505859375, "learning_rate": 0.003925397728405181, "loss": 7.8413, "step": 505400 }, { "epoch": 2.0592959081932887, "grad_norm": 4.613933086395264, "learning_rate": 0.003924989846443382, "loss": 7.8797, "step": 505500 }, { "epoch": 2.05970328621667, "grad_norm": 8.044772148132324, "learning_rate": 0.003924581908341792, "loss": 7.8726, "step": 505600 }, { "epoch": 2.0601106642400517, "grad_norm": 4.191234588623047, "learning_rate": 0.0039241739141165355, "loss": 7.8826, "step": 505700 }, { "epoch": 2.060518042263433, "grad_norm": 3.295250415802002, "learning_rate": 0.003923765863783748, "loss": 7.8648, "step": 505800 }, { "epoch": 2.0609254202868144, "grad_norm": 1.919803500175476, "learning_rate": 0.003923357757359557, "loss": 7.8739, "step": 505900 }, { "epoch": 2.061332798310196, "grad_norm": 3.5921905040740967, "learning_rate": 0.003922949594860091, "loss": 7.8661, "step": 506000 }, { "epoch": 2.061332798310196, "eval_MaskedAccuracy": 0.4990109497480266, "eval_loss": 1.6470351219177246, "eval_runtime": 169.5359, "eval_samples_per_second": 374.41, "eval_steps_per_second": 1.463, "step": 506000 }, { "epoch": 2.0617401763335774, "grad_norm": 4.288949489593506, "learning_rate": 0.003922541376301499, "loss": 7.8576, "step": 506100 }, { "epoch": 2.062147554356959, "grad_norm": 3.8970656394958496, "learning_rate": 0.003922133101699917, "loss": 7.8358, "step": 506200 }, { "epoch": 2.0625549323803405, "grad_norm": 5.335996627807617, "learning_rate": 0.003921724771071478, "loss": 7.8553, "step": 506300 }, { "epoch": 2.062962310403722, "grad_norm": 2.890336036682129, "learning_rate": 0.003921316384432334, "loss": 7.8439, "step": 506400 }, { "epoch": 2.063369688427103, "grad_norm": 2.4204955101013184, "learning_rate": 0.0039209079417986325, "loss": 7.8322, "step": 506500 }, { "epoch": 2.0637770664504846, "grad_norm": 4.1408772468566895, "learning_rate": 0.003920499443186511, "loss": 7.8684, "step": 506600 }, { "epoch": 2.064184444473866, "grad_norm": 5.479893207550049, "learning_rate": 0.003920090888612128, "loss": 7.8955, "step": 506700 }, { "epoch": 2.0645918224972477, "grad_norm": 4.912505626678467, "learning_rate": 0.003919682278091631, "loss": 7.8907, "step": 506800 }, { "epoch": 2.0649992005206292, "grad_norm": 4.852830410003662, "learning_rate": 0.003919273611641179, "loss": 7.8876, "step": 506900 }, { "epoch": 2.0654065785440108, "grad_norm": 8.33307933807373, "learning_rate": 0.003918864889276931, "loss": 7.8647, "step": 507000 }, { "epoch": 2.0654065785440108, "eval_MaskedAccuracy": 0.49786567208399146, "eval_loss": 1.6547155380249023, "eval_runtime": 156.5502, "eval_samples_per_second": 405.467, "eval_steps_per_second": 1.584, "step": 507000 }, { "epoch": 2.065813956567392, "grad_norm": 7.5528483390808105, "learning_rate": 0.003918456111015038, "loss": 7.877, "step": 507100 }, { "epoch": 2.0662213345907734, "grad_norm": 8.536968231201172, "learning_rate": 0.003918047276871674, "loss": 7.8847, "step": 507200 }, { "epoch": 2.066628712614155, "grad_norm": 2.540980339050293, "learning_rate": 0.003917638386862986, "loss": 7.8876, "step": 507300 }, { "epoch": 2.0670360906375365, "grad_norm": 1.9017066955566406, "learning_rate": 0.003917229441005151, "loss": 7.8696, "step": 507400 }, { "epoch": 2.067443468660918, "grad_norm": 4.358914375305176, "learning_rate": 0.003916820439314332, "loss": 7.8921, "step": 507500 }, { "epoch": 2.0678508466842995, "grad_norm": 8.98803997039795, "learning_rate": 0.003916411381806701, "loss": 7.8506, "step": 507600 }, { "epoch": 2.0682582247076806, "grad_norm": 1.7998460531234741, "learning_rate": 0.003916002268498411, "loss": 7.8614, "step": 507700 }, { "epoch": 2.068665602731062, "grad_norm": 4.80010986328125, "learning_rate": 0.0039155930994056514, "loss": 7.8716, "step": 507800 }, { "epoch": 2.0690729807544437, "grad_norm": 4.340163707733154, "learning_rate": 0.003915183874544606, "loss": 7.8666, "step": 507900 }, { "epoch": 2.069480358777825, "grad_norm": 5.179835796356201, "learning_rate": 0.003914774593931448, "loss": 7.8264, "step": 508000 }, { "epoch": 2.069480358777825, "eval_MaskedAccuracy": 0.49846833664187323, "eval_loss": 1.6485658884048462, "eval_runtime": 159.1141, "eval_samples_per_second": 398.934, "eval_steps_per_second": 1.559, "step": 508000 }, { "epoch": 2.0698877368012067, "grad_norm": 2.6034064292907715, "learning_rate": 0.003914365257582357, "loss": 7.8407, "step": 508100 }, { "epoch": 2.0702951148245883, "grad_norm": 3.7942333221435547, "learning_rate": 0.003913955865513524, "loss": 7.8936, "step": 508200 }, { "epoch": 2.0707024928479694, "grad_norm": 5.063714504241943, "learning_rate": 0.003913546417741127, "loss": 7.8372, "step": 508300 }, { "epoch": 2.071109870871351, "grad_norm": 2.7904000282287598, "learning_rate": 0.003913136914281354, "loss": 7.8756, "step": 508400 }, { "epoch": 2.0715172488947324, "grad_norm": 1.6965965032577515, "learning_rate": 0.003912727355150397, "loss": 7.8745, "step": 508500 }, { "epoch": 2.071924626918114, "grad_norm": 2.195662021636963, "learning_rate": 0.003912317740364454, "loss": 7.8828, "step": 508600 }, { "epoch": 2.0723320049414955, "grad_norm": 2.2256686687469482, "learning_rate": 0.003911908069939709, "loss": 7.8735, "step": 508700 }, { "epoch": 2.072739382964877, "grad_norm": 4.389769554138184, "learning_rate": 0.003911498343892368, "loss": 7.8894, "step": 508800 }, { "epoch": 2.0731467609882586, "grad_norm": 4.874904632568359, "learning_rate": 0.003911088562238624, "loss": 7.8908, "step": 508900 }, { "epoch": 2.0735541390116397, "grad_norm": 4.446139812469482, "learning_rate": 0.003910678724994674, "loss": 7.8519, "step": 509000 }, { "epoch": 2.0735541390116397, "eval_MaskedAccuracy": 0.4977517520413056, "eval_loss": 1.6625655889511108, "eval_runtime": 153.8827, "eval_samples_per_second": 412.496, "eval_steps_per_second": 1.612, "step": 509000 }, { "epoch": 2.073961517035021, "grad_norm": 2.1489932537078857, "learning_rate": 0.003910268832176721, "loss": 7.8871, "step": 509100 }, { "epoch": 2.0743688950584027, "grad_norm": 3.3913214206695557, "learning_rate": 0.003909858883800986, "loss": 7.8739, "step": 509200 }, { "epoch": 2.0747762730817843, "grad_norm": 4.963801860809326, "learning_rate": 0.00390944887988366, "loss": 7.8621, "step": 509300 }, { "epoch": 2.075183651105166, "grad_norm": 3.815263271331787, "learning_rate": 0.003909038820440954, "loss": 7.8576, "step": 509400 }, { "epoch": 2.0755910291285473, "grad_norm": 4.6263532638549805, "learning_rate": 0.003908628705489091, "loss": 7.859, "step": 509500 }, { "epoch": 2.0759984071519284, "grad_norm": 7.340849876403809, "learning_rate": 0.0039082185350442725, "loss": 7.8834, "step": 509600 }, { "epoch": 2.07640578517531, "grad_norm": 3.5412895679473877, "learning_rate": 0.003907808309122724, "loss": 7.8686, "step": 509700 }, { "epoch": 2.0768131631986915, "grad_norm": 2.75848650932312, "learning_rate": 0.003907398027740666, "loss": 7.8514, "step": 509800 }, { "epoch": 2.077220541222073, "grad_norm": 2.8344566822052, "learning_rate": 0.003906987690914311, "loss": 7.8755, "step": 509900 }, { "epoch": 2.0776279192454545, "grad_norm": 2.3449015617370605, "learning_rate": 0.003906577298659882, "loss": 7.8879, "step": 510000 }, { "epoch": 2.0776279192454545, "eval_MaskedAccuracy": 0.4976222971445992, "eval_loss": 1.6617436408996582, "eval_runtime": 160.4301, "eval_samples_per_second": 395.661, "eval_steps_per_second": 1.546, "step": 510000 }, { "epoch": 2.078035297268836, "grad_norm": 4.79791784286499, "learning_rate": 0.0039061668509936093, "loss": 7.8708, "step": 510100 }, { "epoch": 2.078442675292217, "grad_norm": 4.853598117828369, "learning_rate": 0.0039057563479317366, "loss": 7.8542, "step": 510200 }, { "epoch": 2.0788500533155987, "grad_norm": 3.1397154331207275, "learning_rate": 0.0039053457894904684, "loss": 7.8636, "step": 510300 }, { "epoch": 2.0792574313389802, "grad_norm": 2.812790632247925, "learning_rate": 0.003904935175686046, "loss": 7.8622, "step": 510400 }, { "epoch": 2.0796648093623618, "grad_norm": 5.004056930541992, "learning_rate": 0.003904524506534699, "loss": 7.8346, "step": 510500 }, { "epoch": 2.0800721873857433, "grad_norm": 5.575089454650879, "learning_rate": 0.003904113782052664, "loss": 7.8355, "step": 510600 }, { "epoch": 2.080479565409125, "grad_norm": 4.813459873199463, "learning_rate": 0.0039037030022561784, "loss": 7.8792, "step": 510700 }, { "epoch": 2.080886943432506, "grad_norm": 2.780196189880371, "learning_rate": 0.0039032921671614796, "loss": 7.8541, "step": 510800 }, { "epoch": 2.0812943214558874, "grad_norm": 2.5562331676483154, "learning_rate": 0.0039028812767848125, "loss": 7.8891, "step": 510900 }, { "epoch": 2.081701699479269, "grad_norm": 2.959256410598755, "learning_rate": 0.0039024703311424267, "loss": 7.9042, "step": 511000 }, { "epoch": 2.081701699479269, "eval_MaskedAccuracy": 0.49948171891703885, "eval_loss": 1.6568866968154907, "eval_runtime": 178.5216, "eval_samples_per_second": 355.565, "eval_steps_per_second": 1.389, "step": 511000 }, { "epoch": 2.0821090775026505, "grad_norm": 2.2464606761932373, "learning_rate": 0.0039020593302505576, "loss": 7.8597, "step": 511100 }, { "epoch": 2.082516455526032, "grad_norm": 7.631260871887207, "learning_rate": 0.0039016482741254656, "loss": 7.8887, "step": 511200 }, { "epoch": 2.0829238335494136, "grad_norm": 5.0504655838012695, "learning_rate": 0.003901237162783396, "loss": 7.8744, "step": 511300 }, { "epoch": 2.083331211572795, "grad_norm": 6.261029243469238, "learning_rate": 0.003900825996240606, "loss": 7.862, "step": 511400 }, { "epoch": 2.083738589596176, "grad_norm": 2.310039520263672, "learning_rate": 0.003900414774513348, "loss": 7.8867, "step": 511500 }, { "epoch": 2.0841459676195577, "grad_norm": 1.6989558935165405, "learning_rate": 0.00390000349761788, "loss": 7.8412, "step": 511600 }, { "epoch": 2.0845533456429393, "grad_norm": 1.8398146629333496, "learning_rate": 0.0038995921655704613, "loss": 7.8892, "step": 511700 }, { "epoch": 2.084960723666321, "grad_norm": 2.726363182067871, "learning_rate": 0.0038991807783873584, "loss": 7.8887, "step": 511800 }, { "epoch": 2.0853681016897023, "grad_norm": 2.66855525970459, "learning_rate": 0.0038987693360848334, "loss": 7.8638, "step": 511900 }, { "epoch": 2.085775479713084, "grad_norm": 2.9646921157836914, "learning_rate": 0.0038983578386791486, "loss": 7.8417, "step": 512000 }, { "epoch": 2.085775479713084, "eval_MaskedAccuracy": 0.49866002346569377, "eval_loss": 1.6616344451904297, "eval_runtime": 162.631, "eval_samples_per_second": 390.307, "eval_steps_per_second": 1.525, "step": 512000 }, { "epoch": 2.086182857736465, "grad_norm": 2.0324671268463135, "learning_rate": 0.003897946286186581, "loss": 7.855, "step": 512100 }, { "epoch": 2.0865902357598465, "grad_norm": 3.845881223678589, "learning_rate": 0.0038975346786233934, "loss": 7.8509, "step": 512200 }, { "epoch": 2.086997613783228, "grad_norm": 5.2466888427734375, "learning_rate": 0.0038971230160058613, "loss": 7.8224, "step": 512300 }, { "epoch": 2.0874049918066095, "grad_norm": 1.7458776235580444, "learning_rate": 0.0038967112983502626, "loss": 7.8584, "step": 512400 }, { "epoch": 2.087812369829991, "grad_norm": 2.2177734375, "learning_rate": 0.0038962995256728726, "loss": 7.8277, "step": 512500 }, { "epoch": 2.0882197478533726, "grad_norm": 4.301079750061035, "learning_rate": 0.0038958876979899707, "loss": 7.8364, "step": 512600 }, { "epoch": 2.0886271258767537, "grad_norm": 8.370576858520508, "learning_rate": 0.0038954758153178404, "loss": 7.8261, "step": 512700 }, { "epoch": 2.0890345039001352, "grad_norm": 2.8205881118774414, "learning_rate": 0.0038950638776727633, "loss": 7.8063, "step": 512800 }, { "epoch": 2.0894418819235168, "grad_norm": 1.931097149848938, "learning_rate": 0.00389465188507103, "loss": 7.8492, "step": 512900 }, { "epoch": 2.0898492599468983, "grad_norm": 1.9785386323928833, "learning_rate": 0.003894239837528921, "loss": 7.8588, "step": 513000 }, { "epoch": 2.0898492599468983, "eval_MaskedAccuracy": 0.49861447942393755, "eval_loss": 1.642656683921814, "eval_runtime": 187.0413, "eval_samples_per_second": 339.369, "eval_steps_per_second": 1.326, "step": 513000 }, { "epoch": 2.09025663797028, "grad_norm": 2.9555182456970215, "learning_rate": 0.0038938277350627315, "loss": 7.8609, "step": 513100 }, { "epoch": 2.0906640159936614, "grad_norm": 5.055253028869629, "learning_rate": 0.003893415577688753, "loss": 7.8673, "step": 513200 }, { "epoch": 2.0910713940170425, "grad_norm": 4.722437858581543, "learning_rate": 0.0038930033654232823, "loss": 7.8351, "step": 513300 }, { "epoch": 2.091478772040424, "grad_norm": 3.5397512912750244, "learning_rate": 0.0038925910982826138, "loss": 7.8484, "step": 513400 }, { "epoch": 2.0918861500638055, "grad_norm": 1.4733760356903076, "learning_rate": 0.0038921787762830483, "loss": 7.8361, "step": 513500 }, { "epoch": 2.092293528087187, "grad_norm": 5.55131721496582, "learning_rate": 0.003891766399440887, "loss": 7.8594, "step": 513600 }, { "epoch": 2.0927009061105686, "grad_norm": 2.943793773651123, "learning_rate": 0.0038913539677724344, "loss": 7.8328, "step": 513700 }, { "epoch": 2.09310828413395, "grad_norm": 2.3448822498321533, "learning_rate": 0.0038909414812939923, "loss": 7.8266, "step": 513800 }, { "epoch": 2.0935156621573316, "grad_norm": 4.41709041595459, "learning_rate": 0.00389052894002187, "loss": 7.8796, "step": 513900 }, { "epoch": 2.0939230401807127, "grad_norm": 6.548536777496338, "learning_rate": 0.0038901163439723795, "loss": 7.8454, "step": 514000 }, { "epoch": 2.0939230401807127, "eval_MaskedAccuracy": 0.4990569694959531, "eval_loss": 1.6530210971832275, "eval_runtime": 173.0732, "eval_samples_per_second": 366.758, "eval_steps_per_second": 1.433, "step": 514000 }, { "epoch": 2.0943304182040943, "grad_norm": 4.337338447570801, "learning_rate": 0.0038897036931618317, "loss": 7.8809, "step": 514100 }, { "epoch": 2.094737796227476, "grad_norm": 2.3941030502319336, "learning_rate": 0.003889290987606542, "loss": 7.8403, "step": 514200 }, { "epoch": 2.0951451742508573, "grad_norm": 5.223713397979736, "learning_rate": 0.0038888782273228287, "loss": 7.8693, "step": 514300 }, { "epoch": 2.095552552274239, "grad_norm": 6.067909240722656, "learning_rate": 0.0038884654123270065, "loss": 7.8672, "step": 514400 }, { "epoch": 2.0959599302976204, "grad_norm": 5.647371768951416, "learning_rate": 0.003888052542635397, "loss": 7.8555, "step": 514500 }, { "epoch": 2.0963673083210015, "grad_norm": 3.459240198135376, "learning_rate": 0.0038876396182643217, "loss": 7.8589, "step": 514600 }, { "epoch": 2.096774686344383, "grad_norm": 3.0185189247131348, "learning_rate": 0.0038872266392301082, "loss": 7.8543, "step": 514700 }, { "epoch": 2.0971820643677646, "grad_norm": 2.327406883239746, "learning_rate": 0.0038868136055490816, "loss": 7.8275, "step": 514800 }, { "epoch": 2.097589442391146, "grad_norm": 4.88460636138916, "learning_rate": 0.0038864005172375734, "loss": 7.8607, "step": 514900 }, { "epoch": 2.0979968204145276, "grad_norm": 3.7140984535217285, "learning_rate": 0.0038859873743119166, "loss": 7.8526, "step": 515000 }, { "epoch": 2.0979968204145276, "eval_MaskedAccuracy": 0.49905880600640645, "eval_loss": 1.6508853435516357, "eval_runtime": 157.7835, "eval_samples_per_second": 402.298, "eval_steps_per_second": 1.572, "step": 515000 }, { "epoch": 2.098404198437909, "grad_norm": 3.6966841220855713, "learning_rate": 0.0038855741767884434, "loss": 7.8116, "step": 515100 }, { "epoch": 2.0988115764612902, "grad_norm": 1.6206698417663574, "learning_rate": 0.003885160924683486, "loss": 7.8647, "step": 515200 }, { "epoch": 2.0992189544846718, "grad_norm": 2.2783801555633545, "learning_rate": 0.003884747618013387, "loss": 7.8496, "step": 515300 }, { "epoch": 2.0996263325080533, "grad_norm": 3.005263566970825, "learning_rate": 0.0038843342567944903, "loss": 7.8593, "step": 515400 }, { "epoch": 2.100033710531435, "grad_norm": 2.094526767730713, "learning_rate": 0.0038839208410431265, "loss": 7.8606, "step": 515500 }, { "epoch": 2.1004410885548164, "grad_norm": 8.773946762084961, "learning_rate": 0.003883507370775649, "loss": 7.8611, "step": 515600 }, { "epoch": 2.100848466578198, "grad_norm": 2.737407684326172, "learning_rate": 0.003883093846008401, "loss": 7.8187, "step": 515700 }, { "epoch": 2.101255844601579, "grad_norm": 2.1632235050201416, "learning_rate": 0.0038826802667577356, "loss": 7.8351, "step": 515800 }, { "epoch": 2.1016632226249605, "grad_norm": 5.39825963973999, "learning_rate": 0.003882266633039998, "loss": 7.8346, "step": 515900 }, { "epoch": 2.102070600648342, "grad_norm": 4.6561174392700195, "learning_rate": 0.003881852944871543, "loss": 7.8485, "step": 516000 }, { "epoch": 2.102070600648342, "eval_MaskedAccuracy": 0.4992994920440185, "eval_loss": 1.6478400230407715, "eval_runtime": 170.1425, "eval_samples_per_second": 373.076, "eval_steps_per_second": 1.458, "step": 516000 }, { "epoch": 2.1024779786717236, "grad_norm": 3.3429903984069824, "learning_rate": 0.0038814392022687274, "loss": 7.8529, "step": 516100 }, { "epoch": 2.102885356695105, "grad_norm": 8.731779098510742, "learning_rate": 0.0038810254052479127, "loss": 7.8222, "step": 516200 }, { "epoch": 2.1032927347184867, "grad_norm": 3.482006311416626, "learning_rate": 0.0038806115538254516, "loss": 7.8448, "step": 516300 }, { "epoch": 2.103700112741868, "grad_norm": 5.820671558380127, "learning_rate": 0.003880197648017711, "loss": 7.8503, "step": 516400 }, { "epoch": 2.1041074907652493, "grad_norm": 3.015876531600952, "learning_rate": 0.003879783687841048, "loss": 7.8595, "step": 516500 }, { "epoch": 2.104514868788631, "grad_norm": 3.7448184490203857, "learning_rate": 0.003879369673311833, "loss": 7.8652, "step": 516600 }, { "epoch": 2.1049222468120123, "grad_norm": 2.8957302570343018, "learning_rate": 0.003878955604446434, "loss": 7.8539, "step": 516700 }, { "epoch": 2.105329624835394, "grad_norm": 3.009982109069824, "learning_rate": 0.0038785414812612233, "loss": 7.8172, "step": 516800 }, { "epoch": 2.1057370028587754, "grad_norm": 7.062688827514648, "learning_rate": 0.003878127303772568, "loss": 7.8541, "step": 516900 }, { "epoch": 2.106144380882157, "grad_norm": 4.3620924949646, "learning_rate": 0.003877713071996851, "loss": 7.8689, "step": 517000 }, { "epoch": 2.106144380882157, "eval_MaskedAccuracy": 0.49807315141597197, "eval_loss": 1.6623533964157104, "eval_runtime": 155.7107, "eval_samples_per_second": 407.653, "eval_steps_per_second": 1.593, "step": 517000 }, { "epoch": 2.106551758905538, "grad_norm": 3.7454354763031006, "learning_rate": 0.0038772987859504423, "loss": 7.8493, "step": 517100 }, { "epoch": 2.1069591369289196, "grad_norm": 5.084829330444336, "learning_rate": 0.0038768844456497247, "loss": 7.8519, "step": 517200 }, { "epoch": 2.107366514952301, "grad_norm": 5.766896724700928, "learning_rate": 0.0038764700511110784, "loss": 7.8476, "step": 517300 }, { "epoch": 2.1077738929756826, "grad_norm": 2.518277883529663, "learning_rate": 0.0038760556023508855, "loss": 7.8506, "step": 517400 }, { "epoch": 2.108181270999064, "grad_norm": 3.968675374984741, "learning_rate": 0.003875641099385529, "loss": 7.842, "step": 517500 }, { "epoch": 2.1085886490224457, "grad_norm": 5.5741705894470215, "learning_rate": 0.0038752265422314002, "loss": 7.867, "step": 517600 }, { "epoch": 2.108996027045827, "grad_norm": 4.208227634429932, "learning_rate": 0.003874811930904893, "loss": 7.8408, "step": 517700 }, { "epoch": 2.1094034050692083, "grad_norm": 3.1576449871063232, "learning_rate": 0.0038743972654223884, "loss": 7.8388, "step": 517800 }, { "epoch": 2.10981078309259, "grad_norm": 4.84845495223999, "learning_rate": 0.003873982545800287, "loss": 7.8128, "step": 517900 }, { "epoch": 2.1102181611159714, "grad_norm": 2.3880467414855957, "learning_rate": 0.0038735677720549842, "loss": 7.8527, "step": 518000 }, { "epoch": 2.1102181611159714, "eval_MaskedAccuracy": 0.5003301768561452, "eval_loss": 1.6516457796096802, "eval_runtime": 161.9699, "eval_samples_per_second": 391.9, "eval_steps_per_second": 1.531, "step": 518000 }, { "epoch": 2.110625539139353, "grad_norm": 5.248900413513184, "learning_rate": 0.003873152944202876, "loss": 7.8289, "step": 518100 }, { "epoch": 2.1110329171627344, "grad_norm": 3.9732167720794678, "learning_rate": 0.003872738062260366, "loss": 7.8636, "step": 518200 }, { "epoch": 2.1114402951861155, "grad_norm": 3.315394639968872, "learning_rate": 0.003872323126243854, "loss": 7.8254, "step": 518300 }, { "epoch": 2.111847673209497, "grad_norm": 7.163773059844971, "learning_rate": 0.0038719081361697476, "loss": 7.8377, "step": 518400 }, { "epoch": 2.1122550512328786, "grad_norm": 3.326075315475464, "learning_rate": 0.0038714930920544484, "loss": 7.8421, "step": 518500 }, { "epoch": 2.11266242925626, "grad_norm": 2.2585558891296387, "learning_rate": 0.0038710779939143706, "loss": 7.865, "step": 518600 }, { "epoch": 2.1130698072796417, "grad_norm": 4.726303577423096, "learning_rate": 0.0038706628417659213, "loss": 7.8414, "step": 518700 }, { "epoch": 2.113477185303023, "grad_norm": 2.197706699371338, "learning_rate": 0.0038702476356255204, "loss": 7.8443, "step": 518800 }, { "epoch": 2.1138845633264047, "grad_norm": 4.723580360412598, "learning_rate": 0.0038698323755095793, "loss": 7.8326, "step": 518900 }, { "epoch": 2.114291941349786, "grad_norm": 2.8791539669036865, "learning_rate": 0.003869417061434514, "loss": 7.8506, "step": 519000 }, { "epoch": 2.114291941349786, "eval_MaskedAccuracy": 0.49916622843293673, "eval_loss": 1.6447579860687256, "eval_runtime": 157.7297, "eval_samples_per_second": 402.435, "eval_steps_per_second": 1.572, "step": 519000 }, { "epoch": 2.1146993193731674, "grad_norm": 6.8712687492370605, "learning_rate": 0.0038690016934167445, "loss": 7.8443, "step": 519100 }, { "epoch": 2.115106697396549, "grad_norm": 1.3840206861495972, "learning_rate": 0.003868586271472687, "loss": 7.839, "step": 519200 }, { "epoch": 2.1155140754199304, "grad_norm": 3.4413862228393555, "learning_rate": 0.003868170795618772, "loss": 7.8617, "step": 519300 }, { "epoch": 2.115921453443312, "grad_norm": 3.5457684993743896, "learning_rate": 0.0038677552658714282, "loss": 7.8538, "step": 519400 }, { "epoch": 2.1163288314666935, "grad_norm": 3.5211522579193115, "learning_rate": 0.003867339682247083, "loss": 7.8487, "step": 519500 }, { "epoch": 2.1167362094900746, "grad_norm": 2.000596046447754, "learning_rate": 0.0038669240447621604, "loss": 7.8483, "step": 519600 }, { "epoch": 2.117143587513456, "grad_norm": 2.7407684326171875, "learning_rate": 0.0038665083534330977, "loss": 7.849, "step": 519700 }, { "epoch": 2.1175509655368376, "grad_norm": 2.1641077995300293, "learning_rate": 0.00386609260827633, "loss": 7.8685, "step": 519800 }, { "epoch": 2.117958343560219, "grad_norm": 4.879209041595459, "learning_rate": 0.003865676809308291, "loss": 7.841, "step": 519900 }, { "epoch": 2.1183657215836007, "grad_norm": 4.267777442932129, "learning_rate": 0.0038652609565454194, "loss": 7.8681, "step": 520000 }, { "epoch": 2.1183657215836007, "eval_MaskedAccuracy": 0.4989128060239453, "eval_loss": 1.6525324583053589, "eval_runtime": 170.4922, "eval_samples_per_second": 372.31, "eval_steps_per_second": 1.455, "step": 520000 }, { "epoch": 2.1187730996069822, "grad_norm": 4.678677558898926, "learning_rate": 0.0038648450500041584, "loss": 7.8279, "step": 520100 }, { "epoch": 2.1191804776303633, "grad_norm": 1.9841251373291016, "learning_rate": 0.003864429089700947, "loss": 7.8761, "step": 520200 }, { "epoch": 2.119587855653745, "grad_norm": 3.339817523956299, "learning_rate": 0.0038640130756522344, "loss": 7.8515, "step": 520300 }, { "epoch": 2.1199952336771264, "grad_norm": 3.953916072845459, "learning_rate": 0.003863597007874463, "loss": 7.8564, "step": 520400 }, { "epoch": 2.120402611700508, "grad_norm": 2.9301578998565674, "learning_rate": 0.003863180886384087, "loss": 7.8552, "step": 520500 }, { "epoch": 2.1208099897238895, "grad_norm": 4.771327495574951, "learning_rate": 0.003862764711197554, "loss": 7.8701, "step": 520600 }, { "epoch": 2.121217367747271, "grad_norm": 9.336111068725586, "learning_rate": 0.0038623484823313205, "loss": 7.844, "step": 520700 }, { "epoch": 2.121624745770652, "grad_norm": 2.9203686714172363, "learning_rate": 0.003861932199801845, "loss": 7.8717, "step": 520800 }, { "epoch": 2.1220321237940336, "grad_norm": 6.97675895690918, "learning_rate": 0.0038615158636255815, "loss": 7.8497, "step": 520900 }, { "epoch": 2.122439501817415, "grad_norm": 3.968886137008667, "learning_rate": 0.0038610994738189942, "loss": 7.8311, "step": 521000 }, { "epoch": 2.122439501817415, "eval_MaskedAccuracy": 0.4993250706805788, "eval_loss": 1.649429202079773, "eval_runtime": 156.1808, "eval_samples_per_second": 406.427, "eval_steps_per_second": 1.588, "step": 521000 }, { "epoch": 2.1228468798407967, "grad_norm": 4.034364700317383, "learning_rate": 0.003860683030398538, "loss": 7.857, "step": 521100 }, { "epoch": 2.123254257864178, "grad_norm": 5.871217727661133, "learning_rate": 0.003860266533380682, "loss": 7.8487, "step": 521200 }, { "epoch": 2.1236616358875597, "grad_norm": 1.6691020727157593, "learning_rate": 0.003859849982781893, "loss": 7.8432, "step": 521300 }, { "epoch": 2.1240690139109413, "grad_norm": 3.405710220336914, "learning_rate": 0.003859433378618635, "loss": 7.8389, "step": 521400 }, { "epoch": 2.1244763919343224, "grad_norm": 3.6802337169647217, "learning_rate": 0.0038590167209073805, "loss": 7.8668, "step": 521500 }, { "epoch": 2.124883769957704, "grad_norm": 2.612476348876953, "learning_rate": 0.0038586000096646057, "loss": 7.8415, "step": 521600 }, { "epoch": 2.1252911479810854, "grad_norm": 2.7740638256073, "learning_rate": 0.0038581832449067805, "loss": 7.8302, "step": 521700 }, { "epoch": 2.125698526004467, "grad_norm": 6.684861183166504, "learning_rate": 0.003857766426650386, "loss": 7.8427, "step": 521800 }, { "epoch": 2.1261059040278485, "grad_norm": 4.789067268371582, "learning_rate": 0.0038573495549119, "loss": 7.8429, "step": 521900 }, { "epoch": 2.12651328205123, "grad_norm": 3.1102423667907715, "learning_rate": 0.003856932629707804, "loss": 7.8439, "step": 522000 }, { "epoch": 2.12651328205123, "eval_MaskedAccuracy": 0.49906116697405856, "eval_loss": 1.6518633365631104, "eval_runtime": 153.0851, "eval_samples_per_second": 414.645, "eval_steps_per_second": 1.62, "step": 522000 }, { "epoch": 2.126920660074611, "grad_norm": 5.182793140411377, "learning_rate": 0.003856515651054577, "loss": 7.8526, "step": 522100 }, { "epoch": 2.1273280380979926, "grad_norm": 4.2344651222229, "learning_rate": 0.0038560986189687075, "loss": 7.8276, "step": 522200 }, { "epoch": 2.127735416121374, "grad_norm": 4.652752876281738, "learning_rate": 0.0038556815334666833, "loss": 7.8345, "step": 522300 }, { "epoch": 2.1281427941447557, "grad_norm": 2.306577444076538, "learning_rate": 0.0038552643945649976, "loss": 7.8185, "step": 522400 }, { "epoch": 2.1285501721681372, "grad_norm": 2.9456610679626465, "learning_rate": 0.003854847202280141, "loss": 7.8668, "step": 522500 }, { "epoch": 2.1289575501915188, "grad_norm": 3.9706289768218994, "learning_rate": 0.0038544299566286053, "loss": 7.8662, "step": 522600 }, { "epoch": 2.1293649282149, "grad_norm": 2.4402737617492676, "learning_rate": 0.00385401265762688, "loss": 7.8397, "step": 522700 }, { "epoch": 2.1297723062382814, "grad_norm": 4.685486316680908, "learning_rate": 0.003853595305291472, "loss": 7.8325, "step": 522800 }, { "epoch": 2.130179684261663, "grad_norm": 3.8829379081726074, "learning_rate": 0.003853177899638879, "loss": 7.8706, "step": 522900 }, { "epoch": 2.1305870622850445, "grad_norm": 7.463794708251953, "learning_rate": 0.003852760440685605, "loss": 7.8707, "step": 523000 }, { "epoch": 2.1305870622850445, "eval_MaskedAccuracy": 0.4990442157171856, "eval_loss": 1.6476261615753174, "eval_runtime": 154.2854, "eval_samples_per_second": 411.419, "eval_steps_per_second": 1.607, "step": 523000 }, { "epoch": 2.130994440308426, "grad_norm": 4.382684707641602, "learning_rate": 0.0038523429284481525, "loss": 7.8549, "step": 523100 }, { "epoch": 2.1314018183318075, "grad_norm": 3.828449010848999, "learning_rate": 0.0038519253629430254, "loss": 7.8547, "step": 523200 }, { "epoch": 2.1318091963551886, "grad_norm": 3.674806833267212, "learning_rate": 0.0038515077441867353, "loss": 7.8408, "step": 523300 }, { "epoch": 2.13221657437857, "grad_norm": 2.4122889041900635, "learning_rate": 0.0038510900721957933, "loss": 7.8451, "step": 523400 }, { "epoch": 2.1326239524019517, "grad_norm": 3.8041248321533203, "learning_rate": 0.0038506723469867083, "loss": 7.8486, "step": 523500 }, { "epoch": 2.133031330425333, "grad_norm": 3.0059142112731934, "learning_rate": 0.0038502545685759997, "loss": 7.8389, "step": 523600 }, { "epoch": 2.1334387084487147, "grad_norm": 5.589702606201172, "learning_rate": 0.003849836736980186, "loss": 7.8639, "step": 523700 }, { "epoch": 2.1338460864720963, "grad_norm": 2.9695308208465576, "learning_rate": 0.0038494188522157804, "loss": 7.8568, "step": 523800 }, { "epoch": 2.134253464495478, "grad_norm": 1.8465338945388794, "learning_rate": 0.0038490009142993047, "loss": 7.8472, "step": 523900 }, { "epoch": 2.134660842518859, "grad_norm": 3.8933956623077393, "learning_rate": 0.0038485829232472878, "loss": 7.8666, "step": 524000 }, { "epoch": 2.134660842518859, "eval_MaskedAccuracy": 0.498521111636213, "eval_loss": 1.6424434185028076, "eval_runtime": 196.697, "eval_samples_per_second": 322.71, "eval_steps_per_second": 1.261, "step": 524000 }, { "epoch": 2.1350682205422404, "grad_norm": 6.770482540130615, "learning_rate": 0.003848164879076251, "loss": 7.8278, "step": 524100 }, { "epoch": 2.135475598565622, "grad_norm": 3.3351528644561768, "learning_rate": 0.003847746781802722, "loss": 7.8047, "step": 524200 }, { "epoch": 2.1358829765890035, "grad_norm": 1.5136771202087402, "learning_rate": 0.0038473286314432316, "loss": 7.8479, "step": 524300 }, { "epoch": 2.136290354612385, "grad_norm": 4.587497711181641, "learning_rate": 0.0038469104280143105, "loss": 7.8619, "step": 524400 }, { "epoch": 2.1366977326357666, "grad_norm": 3.4201977252960205, "learning_rate": 0.0038464921715324948, "loss": 7.8444, "step": 524500 }, { "epoch": 2.1371051106591477, "grad_norm": 2.8858134746551514, "learning_rate": 0.0038460738620143212, "loss": 7.8444, "step": 524600 }, { "epoch": 2.137512488682529, "grad_norm": 4.345208644866943, "learning_rate": 0.0038456554994763235, "loss": 7.8184, "step": 524700 }, { "epoch": 2.1379198667059107, "grad_norm": 3.379286289215088, "learning_rate": 0.0038452370839350426, "loss": 7.8281, "step": 524800 }, { "epoch": 2.1383272447292923, "grad_norm": 4.41142463684082, "learning_rate": 0.003844818615407021, "loss": 7.8362, "step": 524900 }, { "epoch": 2.138734622752674, "grad_norm": 1.7225568294525146, "learning_rate": 0.0038444000939088056, "loss": 7.853, "step": 525000 }, { "epoch": 2.138734622752674, "eval_MaskedAccuracy": 0.4993731993985832, "eval_loss": 1.6521061658859253, "eval_runtime": 154.7934, "eval_samples_per_second": 410.069, "eval_steps_per_second": 1.602, "step": 525000 }, { "epoch": 2.1391420007760553, "grad_norm": 3.65887713432312, "learning_rate": 0.003843981519456943, "loss": 7.8451, "step": 525100 }, { "epoch": 2.1395493787994364, "grad_norm": 1.7522298097610474, "learning_rate": 0.0038435628920679794, "loss": 7.8768, "step": 525200 }, { "epoch": 2.139956756822818, "grad_norm": 3.946798086166382, "learning_rate": 0.0038431442117584608, "loss": 7.8728, "step": 525300 }, { "epoch": 2.1403641348461995, "grad_norm": 2.1456899642944336, "learning_rate": 0.003842725478544948, "loss": 7.8832, "step": 525400 }, { "epoch": 2.140771512869581, "grad_norm": 3.279982328414917, "learning_rate": 0.0038423066924439912, "loss": 7.8431, "step": 525500 }, { "epoch": 2.1411788908929625, "grad_norm": 3.695875406265259, "learning_rate": 0.0038418878534721462, "loss": 7.8462, "step": 525600 }, { "epoch": 2.141586268916344, "grad_norm": 2.8035428524017334, "learning_rate": 0.003841468961645979, "loss": 7.8557, "step": 525700 }, { "epoch": 2.141993646939725, "grad_norm": 1.9271063804626465, "learning_rate": 0.0038410500169820433, "loss": 7.8382, "step": 525800 }, { "epoch": 2.1424010249631067, "grad_norm": 3.3840794563293457, "learning_rate": 0.003840631019496905, "loss": 7.8448, "step": 525900 }, { "epoch": 2.1428084029864882, "grad_norm": 2.669886350631714, "learning_rate": 0.003840211969207129, "loss": 7.8347, "step": 526000 }, { "epoch": 2.1428084029864882, "eval_MaskedAccuracy": 0.4983857915834357, "eval_loss": 1.6485259532928467, "eval_runtime": 157.1212, "eval_samples_per_second": 403.994, "eval_steps_per_second": 1.578, "step": 526000 }, { "epoch": 2.1432157810098698, "grad_norm": 2.8446619510650635, "learning_rate": 0.0038397928661292827, "loss": 7.8434, "step": 526100 }, { "epoch": 2.1436231590332513, "grad_norm": 2.5263686180114746, "learning_rate": 0.003839373710279936, "loss": 7.8465, "step": 526200 }, { "epoch": 2.144030537056633, "grad_norm": 2.31062650680542, "learning_rate": 0.0038389545016756605, "loss": 7.841, "step": 526300 }, { "epoch": 2.1444379150800144, "grad_norm": 1.9868495464324951, "learning_rate": 0.003838535240333024, "loss": 7.7923, "step": 526400 }, { "epoch": 2.1448452931033954, "grad_norm": 1.8735777139663696, "learning_rate": 0.0038381159262686096, "loss": 7.8812, "step": 526500 }, { "epoch": 2.145252671126777, "grad_norm": 2.031118631362915, "learning_rate": 0.0038376965594989977, "loss": 7.8441, "step": 526600 }, { "epoch": 2.1456600491501585, "grad_norm": 3.5911500453948975, "learning_rate": 0.003837277140040757, "loss": 7.8436, "step": 526700 }, { "epoch": 2.14606742717354, "grad_norm": 2.9640281200408936, "learning_rate": 0.0038368576679104777, "loss": 7.8411, "step": 526800 }, { "epoch": 2.1464748051969216, "grad_norm": 1.554478406906128, "learning_rate": 0.003836438143124744, "loss": 7.8642, "step": 526900 }, { "epoch": 2.146882183220303, "grad_norm": 3.397552490234375, "learning_rate": 0.003836018565700135, "loss": 7.8349, "step": 527000 }, { "epoch": 2.146882183220303, "eval_MaskedAccuracy": 0.4983316146545811, "eval_loss": 1.6525747776031494, "eval_runtime": 163.3248, "eval_samples_per_second": 388.649, "eval_steps_per_second": 1.518, "step": 527000 }, { "epoch": 2.147289561243684, "grad_norm": 2.814387321472168, "learning_rate": 0.003835598935653243, "loss": 7.8687, "step": 527100 }, { "epoch": 2.1476969392670657, "grad_norm": 4.296543121337891, "learning_rate": 0.003835179253000662, "loss": 7.8874, "step": 527200 }, { "epoch": 2.1481043172904473, "grad_norm": 3.7360377311706543, "learning_rate": 0.00383475951775898, "loss": 7.8229, "step": 527300 }, { "epoch": 2.148511695313829, "grad_norm": 1.4035084247589111, "learning_rate": 0.003834339729944791, "loss": 7.849, "step": 527400 }, { "epoch": 2.1489190733372103, "grad_norm": 4.194318771362305, "learning_rate": 0.0038339198895746937, "loss": 7.8487, "step": 527500 }, { "epoch": 2.149326451360592, "grad_norm": 3.1967825889587402, "learning_rate": 0.0038334999966652857, "loss": 7.8524, "step": 527600 }, { "epoch": 2.149733829383973, "grad_norm": 10.551984786987305, "learning_rate": 0.003833080051233164, "loss": 7.8903, "step": 527700 }, { "epoch": 2.1501412074073545, "grad_norm": 1.961940050125122, "learning_rate": 0.0038326600532949382, "loss": 7.8301, "step": 527800 }, { "epoch": 2.150548585430736, "grad_norm": 2.9610583782196045, "learning_rate": 0.003832240002867211, "loss": 7.8362, "step": 527900 }, { "epoch": 2.1509559634541175, "grad_norm": 2.6585211753845215, "learning_rate": 0.0038318198999665863, "loss": 7.8502, "step": 528000 }, { "epoch": 2.1509559634541175, "eval_MaskedAccuracy": 0.49930813654902767, "eval_loss": 1.6571317911148071, "eval_runtime": 160.0369, "eval_samples_per_second": 396.634, "eval_steps_per_second": 1.55, "step": 528000 }, { "epoch": 2.151363341477499, "grad_norm": 4.147009372711182, "learning_rate": 0.003831399744609676, "loss": 7.8445, "step": 528100 }, { "epoch": 2.1517707195008806, "grad_norm": 3.613985776901245, "learning_rate": 0.0038309795368130866, "loss": 7.8648, "step": 528200 }, { "epoch": 2.1521780975242617, "grad_norm": 5.673200607299805, "learning_rate": 0.0038305592765934364, "loss": 7.8482, "step": 528300 }, { "epoch": 2.1525854755476432, "grad_norm": 5.594901084899902, "learning_rate": 0.003830138963967339, "loss": 7.8462, "step": 528400 }, { "epoch": 2.1529928535710248, "grad_norm": 3.565707206726074, "learning_rate": 0.0038297185989514104, "loss": 7.8494, "step": 528500 }, { "epoch": 2.1534002315944063, "grad_norm": 10.87060260772705, "learning_rate": 0.003829298181562268, "loss": 7.8588, "step": 528600 }, { "epoch": 2.153807609617788, "grad_norm": 2.678865432739258, "learning_rate": 0.0038288777118165415, "loss": 7.8739, "step": 528700 }, { "epoch": 2.1542149876411694, "grad_norm": 6.544186115264893, "learning_rate": 0.0038284571897308464, "loss": 7.832, "step": 528800 }, { "epoch": 2.154622365664551, "grad_norm": 5.820631980895996, "learning_rate": 0.0038280366153218112, "loss": 7.84, "step": 528900 }, { "epoch": 2.155029743687932, "grad_norm": 2.8094024658203125, "learning_rate": 0.003827615988606059, "loss": 7.8275, "step": 529000 }, { "epoch": 2.155029743687932, "eval_MaskedAccuracy": 0.5000002667794017, "eval_loss": 1.6423031091690063, "eval_runtime": 157.6177, "eval_samples_per_second": 402.721, "eval_steps_per_second": 1.573, "step": 529000 }, { "epoch": 2.1554371217113135, "grad_norm": 3.2447738647460938, "learning_rate": 0.0038271953096002224, "loss": 7.8538, "step": 529100 }, { "epoch": 2.155844499734695, "grad_norm": 2.1335766315460205, "learning_rate": 0.003826774578320937, "loss": 7.8385, "step": 529200 }, { "epoch": 2.1562518777580766, "grad_norm": 8.36882495880127, "learning_rate": 0.0038263537947848324, "loss": 7.8431, "step": 529300 }, { "epoch": 2.156659255781458, "grad_norm": 3.091593027114868, "learning_rate": 0.003825932959008545, "loss": 7.8359, "step": 529400 }, { "epoch": 2.1570666338048397, "grad_norm": 1.5677893161773682, "learning_rate": 0.0038255120710087155, "loss": 7.8607, "step": 529500 }, { "epoch": 2.1574740118282207, "grad_norm": 2.9303829669952393, "learning_rate": 0.0038250911308019757, "loss": 7.8359, "step": 529600 }, { "epoch": 2.1578813898516023, "grad_norm": 3.8750693798065186, "learning_rate": 0.0038246701384049755, "loss": 7.8578, "step": 529700 }, { "epoch": 2.158288767874984, "grad_norm": 2.241936206817627, "learning_rate": 0.0038242490938343553, "loss": 7.8387, "step": 529800 }, { "epoch": 2.1586961458983653, "grad_norm": 3.8586337566375732, "learning_rate": 0.0038238279971067623, "loss": 7.8409, "step": 529900 }, { "epoch": 2.159103523921747, "grad_norm": 3.5696747303009033, "learning_rate": 0.003823406848238848, "loss": 7.8027, "step": 530000 }, { "epoch": 2.159103523921747, "eval_MaskedAccuracy": 0.4998697965700189, "eval_loss": 1.644965648651123, "eval_runtime": 154.371, "eval_samples_per_second": 411.191, "eval_steps_per_second": 1.607, "step": 530000 }, { "epoch": 2.1595109019451284, "grad_norm": 8.334405899047852, "learning_rate": 0.003822985647247259, "loss": 7.8247, "step": 530100 }, { "epoch": 2.1599182799685095, "grad_norm": 4.211894989013672, "learning_rate": 0.0038225643941486426, "loss": 7.817, "step": 530200 }, { "epoch": 2.160325657991891, "grad_norm": 2.0790696144104004, "learning_rate": 0.0038221430889596635, "loss": 7.839, "step": 530300 }, { "epoch": 2.1607330360152726, "grad_norm": 5.07918643951416, "learning_rate": 0.0038217217316969727, "loss": 7.8126, "step": 530400 }, { "epoch": 2.161140414038654, "grad_norm": 5.834768772125244, "learning_rate": 0.00382130032237723, "loss": 7.8014, "step": 530500 }, { "epoch": 2.1615477920620356, "grad_norm": 2.6662633419036865, "learning_rate": 0.003820878861017097, "loss": 7.8394, "step": 530600 }, { "epoch": 2.161955170085417, "grad_norm": 3.3728983402252197, "learning_rate": 0.0038204573476332345, "loss": 7.8006, "step": 530700 }, { "epoch": 2.1623625481087982, "grad_norm": 5.740840911865234, "learning_rate": 0.003820035782242308, "loss": 7.8252, "step": 530800 }, { "epoch": 2.1627699261321798, "grad_norm": 2.832451105117798, "learning_rate": 0.0038196141648609806, "loss": 7.8221, "step": 530900 }, { "epoch": 2.1631773041555613, "grad_norm": 2.4910361766815186, "learning_rate": 0.003819192495505927, "loss": 7.8279, "step": 531000 }, { "epoch": 2.1631773041555613, "eval_MaskedAccuracy": 0.4997051236722038, "eval_loss": 1.6499806642532349, "eval_runtime": 157.8028, "eval_samples_per_second": 402.249, "eval_steps_per_second": 1.572, "step": 531000 }, { "epoch": 2.163584682178943, "grad_norm": 1.957298994064331, "learning_rate": 0.0038187707741938145, "loss": 7.8521, "step": 531100 }, { "epoch": 2.1639920602023244, "grad_norm": 5.408090591430664, "learning_rate": 0.003818349000941315, "loss": 7.8317, "step": 531200 }, { "epoch": 2.164399438225706, "grad_norm": 5.594817638397217, "learning_rate": 0.0038179271757651056, "loss": 7.8408, "step": 531300 }, { "epoch": 2.1648068162490874, "grad_norm": 3.2920899391174316, "learning_rate": 0.003817505298681865, "loss": 7.8267, "step": 531400 }, { "epoch": 2.1652141942724685, "grad_norm": 6.1781439781188965, "learning_rate": 0.0038170833697082677, "loss": 7.8509, "step": 531500 }, { "epoch": 2.16562157229585, "grad_norm": 1.8539202213287354, "learning_rate": 0.0038166613888610005, "loss": 7.8519, "step": 531600 }, { "epoch": 2.1660289503192316, "grad_norm": 3.158538818359375, "learning_rate": 0.003816239356156741, "loss": 7.8044, "step": 531700 }, { "epoch": 2.166436328342613, "grad_norm": 5.459998607635498, "learning_rate": 0.0038158172716121776, "loss": 7.8652, "step": 531800 }, { "epoch": 2.1668437063659947, "grad_norm": 3.2789273262023926, "learning_rate": 0.003815395135243998, "loss": 7.8071, "step": 531900 }, { "epoch": 2.167251084389376, "grad_norm": 1.4929612874984741, "learning_rate": 0.0038149729470688884, "loss": 7.8441, "step": 532000 }, { "epoch": 2.167251084389376, "eval_MaskedAccuracy": 0.49909206600661943, "eval_loss": 1.6480473279953003, "eval_runtime": 158.6008, "eval_samples_per_second": 400.225, "eval_steps_per_second": 1.564, "step": 532000 }, { "epoch": 2.1676584624127573, "grad_norm": 2.944103479385376, "learning_rate": 0.003814550707103539, "loss": 7.8238, "step": 532100 }, { "epoch": 2.168065840436139, "grad_norm": 5.1649651527404785, "learning_rate": 0.0038141284153646498, "loss": 7.8642, "step": 532200 }, { "epoch": 2.1684732184595203, "grad_norm": 3.9113659858703613, "learning_rate": 0.003813706071868912, "loss": 7.843, "step": 532300 }, { "epoch": 2.168880596482902, "grad_norm": 8.139547348022461, "learning_rate": 0.0038132836766330233, "loss": 7.8724, "step": 532400 }, { "epoch": 2.1692879745062834, "grad_norm": 1.2801121473312378, "learning_rate": 0.0038128612296736875, "loss": 7.843, "step": 532500 }, { "epoch": 2.169695352529665, "grad_norm": 2.4380743503570557, "learning_rate": 0.0038124387310075956, "loss": 7.8175, "step": 532600 }, { "epoch": 2.170102730553046, "grad_norm": 8.219067573547363, "learning_rate": 0.003812016180651463, "loss": 7.8253, "step": 532700 }, { "epoch": 2.1705101085764276, "grad_norm": 3.3391449451446533, "learning_rate": 0.003811593578621991, "loss": 7.8213, "step": 532800 }, { "epoch": 2.170917486599809, "grad_norm": 3.765515089035034, "learning_rate": 0.0038111709249358844, "loss": 7.7826, "step": 532900 }, { "epoch": 2.1713248646231906, "grad_norm": 7.724734306335449, "learning_rate": 0.0038107482196098564, "loss": 7.8197, "step": 533000 }, { "epoch": 2.1713248646231906, "eval_MaskedAccuracy": 0.4996287348683913, "eval_loss": 1.6528569459915161, "eval_runtime": 157.3759, "eval_samples_per_second": 403.34, "eval_steps_per_second": 1.576, "step": 533000 }, { "epoch": 2.171732242646572, "grad_norm": 2.16390323638916, "learning_rate": 0.003810325462660616, "loss": 7.8005, "step": 533100 }, { "epoch": 2.1721396206699537, "grad_norm": 6.207178592681885, "learning_rate": 0.0038099026541048783, "loss": 7.8461, "step": 533200 }, { "epoch": 2.172546998693335, "grad_norm": 5.238420009613037, "learning_rate": 0.0038094797939593637, "loss": 7.8158, "step": 533300 }, { "epoch": 2.1729543767167163, "grad_norm": 6.278920650482178, "learning_rate": 0.0038090568822407855, "loss": 7.837, "step": 533400 }, { "epoch": 2.173361754740098, "grad_norm": 7.702585220336914, "learning_rate": 0.003808633918965862, "loss": 7.8248, "step": 533500 }, { "epoch": 2.1737691327634794, "grad_norm": 3.48207426071167, "learning_rate": 0.0038082109041513183, "loss": 7.8354, "step": 533600 }, { "epoch": 2.174176510786861, "grad_norm": 4.544466972351074, "learning_rate": 0.0038077878378138777, "loss": 7.8698, "step": 533700 }, { "epoch": 2.1745838888102424, "grad_norm": 4.2195611000061035, "learning_rate": 0.003807364719970264, "loss": 7.8203, "step": 533800 }, { "epoch": 2.174991266833624, "grad_norm": 11.225996017456055, "learning_rate": 0.0038069415506372085, "loss": 7.8338, "step": 533900 }, { "epoch": 2.175398644857005, "grad_norm": 10.38953971862793, "learning_rate": 0.0038065183298314352, "loss": 7.8005, "step": 534000 }, { "epoch": 2.175398644857005, "eval_MaskedAccuracy": 0.49920180309420636, "eval_loss": 1.6535975933074951, "eval_runtime": 231.6333, "eval_samples_per_second": 274.037, "eval_steps_per_second": 1.071, "step": 534000 }, { "epoch": 2.1758060228803866, "grad_norm": 8.93643569946289, "learning_rate": 0.0038060950575696854, "loss": 7.8441, "step": 534100 }, { "epoch": 2.176213400903768, "grad_norm": 4.134316921234131, "learning_rate": 0.0038056717338686872, "loss": 7.824, "step": 534200 }, { "epoch": 2.1766207789271497, "grad_norm": 3.2579381465911865, "learning_rate": 0.003805248358745182, "loss": 7.8401, "step": 534300 }, { "epoch": 2.177028156950531, "grad_norm": 3.608017921447754, "learning_rate": 0.003804824932215902, "loss": 7.8208, "step": 534400 }, { "epoch": 2.1774355349739127, "grad_norm": 5.360105514526367, "learning_rate": 0.0038044014542975946, "loss": 7.806, "step": 534500 }, { "epoch": 2.177842912997294, "grad_norm": 5.1331586837768555, "learning_rate": 0.003803977925006997, "loss": 7.7993, "step": 534600 }, { "epoch": 2.1782502910206754, "grad_norm": 2.071043014526367, "learning_rate": 0.0038035543443608534, "loss": 7.8678, "step": 534700 }, { "epoch": 2.178657669044057, "grad_norm": 6.327230453491211, "learning_rate": 0.003803130712375911, "loss": 7.8327, "step": 534800 }, { "epoch": 2.1790650470674384, "grad_norm": 6.99228572845459, "learning_rate": 0.00380270702906892, "loss": 7.8138, "step": 534900 }, { "epoch": 2.17947242509082, "grad_norm": 3.339266300201416, "learning_rate": 0.0038022832944566284, "loss": 7.8173, "step": 535000 }, { "epoch": 2.17947242509082, "eval_MaskedAccuracy": 0.49975830231760876, "eval_loss": 1.6462153196334839, "eval_runtime": 173.1718, "eval_samples_per_second": 366.549, "eval_steps_per_second": 1.432, "step": 535000 }, { "epoch": 2.1798798031142015, "grad_norm": 2.3373918533325195, "learning_rate": 0.003801859508555789, "loss": 7.8105, "step": 535100 }, { "epoch": 2.1802871811375826, "grad_norm": 2.4057939052581787, "learning_rate": 0.003801435671383158, "loss": 7.8449, "step": 535200 }, { "epoch": 2.180694559160964, "grad_norm": 2.7518136501312256, "learning_rate": 0.0038010117829554903, "loss": 7.8294, "step": 535300 }, { "epoch": 2.1811019371843456, "grad_norm": 2.5903055667877197, "learning_rate": 0.0038005878432895465, "loss": 7.7928, "step": 535400 }, { "epoch": 2.181509315207727, "grad_norm": 3.3852591514587402, "learning_rate": 0.0038001638524020856, "loss": 7.8247, "step": 535500 }, { "epoch": 2.1819166932311087, "grad_norm": 4.770111083984375, "learning_rate": 0.003799739810309872, "loss": 7.8157, "step": 535600 }, { "epoch": 2.1823240712544902, "grad_norm": 3.0455150604248047, "learning_rate": 0.0037993157170296645, "loss": 7.8099, "step": 535700 }, { "epoch": 2.1827314492778713, "grad_norm": 2.8539605140686035, "learning_rate": 0.003798891572578236, "loss": 7.8076, "step": 535800 }, { "epoch": 2.183138827301253, "grad_norm": 7.808065891265869, "learning_rate": 0.003798467376972358, "loss": 7.8456, "step": 535900 }, { "epoch": 2.1835462053246344, "grad_norm": 4.614112854003906, "learning_rate": 0.0037980431302287974, "loss": 7.8519, "step": 536000 }, { "epoch": 2.1835462053246344, "eval_MaskedAccuracy": 0.4990841277760691, "eval_loss": 1.6543712615966797, "eval_runtime": 162.9235, "eval_samples_per_second": 389.606, "eval_steps_per_second": 1.522, "step": 536000 }, { "epoch": 2.183953583348016, "grad_norm": 4.20052433013916, "learning_rate": 0.003797618832364323, "loss": 7.8395, "step": 536100 }, { "epoch": 2.1843609613713975, "grad_norm": 5.1673784255981445, "learning_rate": 0.0037971944833957074, "loss": 7.8113, "step": 536200 }, { "epoch": 2.184768339394779, "grad_norm": 1.4922406673431396, "learning_rate": 0.0037967700833397375, "loss": 7.8287, "step": 536300 }, { "epoch": 2.1851757174181605, "grad_norm": 4.0081095695495605, "learning_rate": 0.0037963456322131902, "loss": 7.8482, "step": 536400 }, { "epoch": 2.1855830954415416, "grad_norm": 6.309442520141602, "learning_rate": 0.003795921130032841, "loss": 7.8272, "step": 536500 }, { "epoch": 2.185990473464923, "grad_norm": 8.077388763427734, "learning_rate": 0.003795496576815472, "loss": 7.8017, "step": 536600 }, { "epoch": 2.1863978514883047, "grad_norm": 4.251585483551025, "learning_rate": 0.003795071972577877, "loss": 7.8262, "step": 536700 }, { "epoch": 2.186805229511686, "grad_norm": 3.7119550704956055, "learning_rate": 0.003794647317336835, "loss": 7.8292, "step": 536800 }, { "epoch": 2.1872126075350677, "grad_norm": 4.681642055511475, "learning_rate": 0.003794222611109138, "loss": 7.806, "step": 536900 }, { "epoch": 2.1876199855584493, "grad_norm": 2.630150318145752, "learning_rate": 0.0037937978539115736, "loss": 7.8133, "step": 537000 }, { "epoch": 2.1876199855584493, "eval_MaskedAccuracy": 0.500926470716596, "eval_loss": 1.6362963914871216, "eval_runtime": 158.0037, "eval_samples_per_second": 401.737, "eval_steps_per_second": 1.57, "step": 537000 }, { "epoch": 2.1880273635818304, "grad_norm": 3.701378583908081, "learning_rate": 0.0037933730457609376, "loss": 7.8424, "step": 537100 }, { "epoch": 2.188434741605212, "grad_norm": 4.026693344116211, "learning_rate": 0.003792948186674025, "loss": 7.7871, "step": 537200 }, { "epoch": 2.1888421196285934, "grad_norm": 1.4569182395935059, "learning_rate": 0.0037925232766676297, "loss": 7.856, "step": 537300 }, { "epoch": 2.189249497651975, "grad_norm": 2.1214449405670166, "learning_rate": 0.003792098315758554, "loss": 7.8341, "step": 537400 }, { "epoch": 2.1896568756753565, "grad_norm": 4.9652814865112305, "learning_rate": 0.0037916733039635993, "loss": 7.8549, "step": 537500 }, { "epoch": 2.190064253698738, "grad_norm": 1.9439303874969482, "learning_rate": 0.003791248241299567, "loss": 7.8433, "step": 537600 }, { "epoch": 2.190471631722119, "grad_norm": 8.613899230957031, "learning_rate": 0.003790823127783262, "loss": 7.8526, "step": 537700 }, { "epoch": 2.1908790097455006, "grad_norm": 8.970516204833984, "learning_rate": 0.003790397963431496, "loss": 7.8374, "step": 537800 }, { "epoch": 2.191286387768882, "grad_norm": 5.993648529052734, "learning_rate": 0.0037899727482610676, "loss": 7.8458, "step": 537900 }, { "epoch": 2.1916937657922637, "grad_norm": 3.2168002128601074, "learning_rate": 0.003789547482288795, "loss": 7.8049, "step": 538000 }, { "epoch": 2.1916937657922637, "eval_MaskedAccuracy": 0.5003411745718921, "eval_loss": 1.6429698467254639, "eval_runtime": 168.3955, "eval_samples_per_second": 376.946, "eval_steps_per_second": 1.473, "step": 538000 }, { "epoch": 2.1921011438156452, "grad_norm": 3.894731283187866, "learning_rate": 0.003789122165531492, "loss": 7.874, "step": 538100 }, { "epoch": 2.192508521839027, "grad_norm": 2.1121866703033447, "learning_rate": 0.0037886967980059728, "loss": 7.8311, "step": 538200 }, { "epoch": 2.192915899862408, "grad_norm": 4.866525650024414, "learning_rate": 0.0037882713797290565, "loss": 7.8234, "step": 538300 }, { "epoch": 2.1933232778857894, "grad_norm": 3.906606435775757, "learning_rate": 0.003787845910717556, "loss": 7.8283, "step": 538400 }, { "epoch": 2.193730655909171, "grad_norm": 4.288459777832031, "learning_rate": 0.0037874203909882957, "loss": 7.8652, "step": 538500 }, { "epoch": 2.1941380339325525, "grad_norm": 2.739276885986328, "learning_rate": 0.0037869948205580973, "loss": 7.8477, "step": 538600 }, { "epoch": 2.194545411955934, "grad_norm": 5.647121906280518, "learning_rate": 0.003786569199443788, "loss": 7.8288, "step": 538700 }, { "epoch": 2.1949527899793155, "grad_norm": 7.2101826667785645, "learning_rate": 0.0037861435276621926, "loss": 7.8173, "step": 538800 }, { "epoch": 2.195360168002697, "grad_norm": 2.7694876194000244, "learning_rate": 0.0037857178052301405, "loss": 7.8293, "step": 538900 }, { "epoch": 2.195767546026078, "grad_norm": 3.975038766860962, "learning_rate": 0.003785292032164464, "loss": 7.8396, "step": 539000 }, { "epoch": 2.195767546026078, "eval_MaskedAccuracy": 0.4981996990229896, "eval_loss": 1.6436177492141724, "eval_runtime": 166.5758, "eval_samples_per_second": 381.064, "eval_steps_per_second": 1.489, "step": 539000 }, { "epoch": 2.1961749240494597, "grad_norm": 5.791339874267578, "learning_rate": 0.0037848662084819947, "loss": 7.8639, "step": 539100 }, { "epoch": 2.196582302072841, "grad_norm": 2.7626590728759766, "learning_rate": 0.0037844403341995717, "loss": 7.8601, "step": 539200 }, { "epoch": 2.1969896800962228, "grad_norm": 4.960364818572998, "learning_rate": 0.003784014409334029, "loss": 7.8235, "step": 539300 }, { "epoch": 2.1973970581196043, "grad_norm": 6.897753715515137, "learning_rate": 0.0037835884339022046, "loss": 7.8117, "step": 539400 }, { "epoch": 2.197804436142986, "grad_norm": 3.9038569927215576, "learning_rate": 0.0037831624079209423, "loss": 7.8342, "step": 539500 }, { "epoch": 2.198211814166367, "grad_norm": 4.344732761383057, "learning_rate": 0.003782736331407085, "loss": 7.8416, "step": 539600 }, { "epoch": 2.1986191921897484, "grad_norm": 4.075064659118652, "learning_rate": 0.0037823102043774764, "loss": 7.8247, "step": 539700 }, { "epoch": 2.19902657021313, "grad_norm": 2.741217851638794, "learning_rate": 0.0037818840268489654, "loss": 7.8261, "step": 539800 }, { "epoch": 2.1994339482365115, "grad_norm": 6.242165565490723, "learning_rate": 0.0037814577988383967, "loss": 7.8609, "step": 539900 }, { "epoch": 2.199841326259893, "grad_norm": 2.72274112701416, "learning_rate": 0.003781031520362623, "loss": 7.829, "step": 540000 }, { "epoch": 2.199841326259893, "eval_MaskedAccuracy": 0.49986804644871313, "eval_loss": 1.6495589017868042, "eval_runtime": 159.9924, "eval_samples_per_second": 396.744, "eval_steps_per_second": 1.55, "step": 540000 }, { "epoch": 2.2002487042832746, "grad_norm": 3.6194722652435303, "learning_rate": 0.0037806051914385007, "loss": 7.8281, "step": 540100 }, { "epoch": 2.2006560823066557, "grad_norm": 2.2882962226867676, "learning_rate": 0.0037801788120828835, "loss": 7.8213, "step": 540200 }, { "epoch": 2.201063460330037, "grad_norm": 7.555246353149414, "learning_rate": 0.003779752382312627, "loss": 7.8318, "step": 540300 }, { "epoch": 2.2014708383534187, "grad_norm": 2.5695087909698486, "learning_rate": 0.003779325902144591, "loss": 7.8081, "step": 540400 }, { "epoch": 2.2018782163768003, "grad_norm": 4.27829122543335, "learning_rate": 0.003778899371595635, "loss": 7.821, "step": 540500 }, { "epoch": 2.202285594400182, "grad_norm": 2.186814785003662, "learning_rate": 0.0037784727906826233, "loss": 7.8395, "step": 540600 }, { "epoch": 2.2026929724235633, "grad_norm": 1.746558666229248, "learning_rate": 0.003778046159422424, "loss": 7.8174, "step": 540700 }, { "epoch": 2.2031003504469444, "grad_norm": 4.74664831161499, "learning_rate": 0.0037776194778318998, "loss": 7.8418, "step": 540800 }, { "epoch": 2.203507728470326, "grad_norm": 8.453991889953613, "learning_rate": 0.0037771927459279214, "loss": 7.8433, "step": 540900 }, { "epoch": 2.2039151064937075, "grad_norm": 3.422455310821533, "learning_rate": 0.003776765963727358, "loss": 7.8282, "step": 541000 }, { "epoch": 2.2039151064937075, "eval_MaskedAccuracy": 0.4993272886132466, "eval_loss": 1.6447093486785889, "eval_runtime": 157.4615, "eval_samples_per_second": 403.121, "eval_steps_per_second": 1.575, "step": 541000 }, { "epoch": 2.204322484517089, "grad_norm": 7.143071174621582, "learning_rate": 0.0037763391312470866, "loss": 7.8194, "step": 541100 }, { "epoch": 2.2047298625404705, "grad_norm": 3.948322057723999, "learning_rate": 0.0037759122485039763, "loss": 7.8531, "step": 541200 }, { "epoch": 2.205137240563852, "grad_norm": 2.6446802616119385, "learning_rate": 0.0037754853155149088, "loss": 7.8152, "step": 541300 }, { "epoch": 2.2055446185872336, "grad_norm": 2.490008592605591, "learning_rate": 0.0037750583322967617, "loss": 7.8329, "step": 541400 }, { "epoch": 2.2059519966106147, "grad_norm": 3.8207499980926514, "learning_rate": 0.0037746312988664187, "loss": 7.8432, "step": 541500 }, { "epoch": 2.2063593746339962, "grad_norm": 4.180668354034424, "learning_rate": 0.0037742042152407564, "loss": 7.8069, "step": 541600 }, { "epoch": 2.2067667526573778, "grad_norm": 2.0634877681732178, "learning_rate": 0.003773777081436664, "loss": 7.8385, "step": 541700 }, { "epoch": 2.2071741306807593, "grad_norm": 6.988525390625, "learning_rate": 0.0037733498974710285, "loss": 7.8492, "step": 541800 }, { "epoch": 2.207581508704141, "grad_norm": 3.995124340057373, "learning_rate": 0.003772922663360737, "loss": 7.8394, "step": 541900 }, { "epoch": 2.2079888867275224, "grad_norm": 5.8167195320129395, "learning_rate": 0.003772495379122682, "loss": 7.8169, "step": 542000 }, { "epoch": 2.2079888867275224, "eval_MaskedAccuracy": 0.49963134657016, "eval_loss": 1.6503068208694458, "eval_runtime": 160.0787, "eval_samples_per_second": 396.53, "eval_steps_per_second": 1.549, "step": 542000 }, { "epoch": 2.2083962647509034, "grad_norm": 3.727559804916382, "learning_rate": 0.0037720680447737532, "loss": 7.8193, "step": 542100 }, { "epoch": 2.208803642774285, "grad_norm": 3.18107271194458, "learning_rate": 0.0037716406603308518, "loss": 7.8175, "step": 542200 }, { "epoch": 2.2092110207976665, "grad_norm": 3.3818202018737793, "learning_rate": 0.00377121322581087, "loss": 7.8551, "step": 542300 }, { "epoch": 2.209618398821048, "grad_norm": 1.877657175064087, "learning_rate": 0.0037707857412307075, "loss": 7.8386, "step": 542400 }, { "epoch": 2.2100257768444296, "grad_norm": 3.3674402236938477, "learning_rate": 0.003770358206607266, "loss": 7.8581, "step": 542500 }, { "epoch": 2.210433154867811, "grad_norm": 4.645495891571045, "learning_rate": 0.003769930621957445, "loss": 7.8186, "step": 542600 }, { "epoch": 2.210840532891192, "grad_norm": 5.1082024574279785, "learning_rate": 0.0037695029872981513, "loss": 7.8224, "step": 542700 }, { "epoch": 2.2112479109145737, "grad_norm": 2.660203456878662, "learning_rate": 0.0037690753026462893, "loss": 7.8291, "step": 542800 }, { "epoch": 2.2116552889379553, "grad_norm": 7.779784679412842, "learning_rate": 0.0037686475680187713, "loss": 7.7967, "step": 542900 }, { "epoch": 2.212062666961337, "grad_norm": 2.886215925216675, "learning_rate": 0.0037682197834325087, "loss": 7.8453, "step": 543000 }, { "epoch": 2.212062666961337, "eval_MaskedAccuracy": 0.5005089513368042, "eval_loss": 1.6450562477111816, "eval_runtime": 206.8242, "eval_samples_per_second": 306.908, "eval_steps_per_second": 1.199, "step": 543000 }, { "epoch": 2.2124700449847183, "grad_norm": 2.2003118991851807, "learning_rate": 0.0037677919489044116, "loss": 7.84, "step": 543100 }, { "epoch": 2.2128774230081, "grad_norm": 3.7952935695648193, "learning_rate": 0.0037673640644513926, "loss": 7.8513, "step": 543200 }, { "epoch": 2.213284801031481, "grad_norm": 3.5863821506500244, "learning_rate": 0.003766936130090373, "loss": 7.8173, "step": 543300 }, { "epoch": 2.2136921790548625, "grad_norm": 7.687413215637207, "learning_rate": 0.0037665081458382638, "loss": 7.8407, "step": 543400 }, { "epoch": 2.214099557078244, "grad_norm": 2.257976770401001, "learning_rate": 0.0037660801117119945, "loss": 7.8417, "step": 543500 }, { "epoch": 2.2145069351016256, "grad_norm": 9.945231437683105, "learning_rate": 0.003765652027728486, "loss": 7.8569, "step": 543600 }, { "epoch": 2.214914313125007, "grad_norm": 2.041649341583252, "learning_rate": 0.003765223893904657, "loss": 7.8409, "step": 543700 }, { "epoch": 2.2153216911483886, "grad_norm": 4.65684175491333, "learning_rate": 0.003764795710257438, "loss": 7.8805, "step": 543800 }, { "epoch": 2.21572906917177, "grad_norm": 3.468898296356201, "learning_rate": 0.0037643674768037564, "loss": 7.8164, "step": 543900 }, { "epoch": 2.2161364471951512, "grad_norm": 5.240268230438232, "learning_rate": 0.0037639391935605417, "loss": 7.8377, "step": 544000 }, { "epoch": 2.2161364471951512, "eval_MaskedAccuracy": 0.5007431908521538, "eval_loss": 1.647658348083496, "eval_runtime": 168.2637, "eval_samples_per_second": 377.241, "eval_steps_per_second": 1.474, "step": 544000 }, { "epoch": 2.2165438252185328, "grad_norm": 3.825221538543701, "learning_rate": 0.0037635108605447295, "loss": 7.8212, "step": 544100 }, { "epoch": 2.2169512032419143, "grad_norm": 2.2604832649230957, "learning_rate": 0.00376308247777325, "loss": 7.8093, "step": 544200 }, { "epoch": 2.217358581265296, "grad_norm": 4.002956390380859, "learning_rate": 0.003762654045263041, "loss": 7.8089, "step": 544300 }, { "epoch": 2.2177659592886774, "grad_norm": 2.3894317150115967, "learning_rate": 0.003762225563031042, "loss": 7.836, "step": 544400 }, { "epoch": 2.218173337312059, "grad_norm": 3.4666402339935303, "learning_rate": 0.0037617970310941935, "loss": 7.8195, "step": 544500 }, { "epoch": 2.21858071533544, "grad_norm": 10.492304801940918, "learning_rate": 0.003761368449469437, "loss": 7.8253, "step": 544600 }, { "epoch": 2.2189880933588215, "grad_norm": 1.558408498764038, "learning_rate": 0.0037609398181737167, "loss": 7.7988, "step": 544700 }, { "epoch": 2.219395471382203, "grad_norm": 5.022818565368652, "learning_rate": 0.0037605111372239794, "loss": 7.8072, "step": 544800 }, { "epoch": 2.2198028494055846, "grad_norm": 6.653713226318359, "learning_rate": 0.003760082406637168, "loss": 7.8233, "step": 544900 }, { "epoch": 2.220210227428966, "grad_norm": 5.106639862060547, "learning_rate": 0.0037596536264302384, "loss": 7.8141, "step": 545000 }, { "epoch": 2.220210227428966, "eval_MaskedAccuracy": 0.5007562128437728, "eval_loss": 1.6528507471084595, "eval_runtime": 177.6296, "eval_samples_per_second": 357.35, "eval_steps_per_second": 1.396, "step": 545000 }, { "epoch": 2.2206176054523477, "grad_norm": 2.980137586593628, "learning_rate": 0.0037592247966201377, "loss": 7.8296, "step": 545100 }, { "epoch": 2.2210249834757287, "grad_norm": 4.1934332847595215, "learning_rate": 0.0037587959172238243, "loss": 7.8092, "step": 545200 }, { "epoch": 2.2214323614991103, "grad_norm": 3.990290403366089, "learning_rate": 0.0037583669882582525, "loss": 7.8209, "step": 545300 }, { "epoch": 2.221839739522492, "grad_norm": 3.0091090202331543, "learning_rate": 0.0037579380097403775, "loss": 7.825, "step": 545400 }, { "epoch": 2.2222471175458733, "grad_norm": 1.3348726034164429, "learning_rate": 0.0037575089816871614, "loss": 7.803, "step": 545500 }, { "epoch": 2.222654495569255, "grad_norm": 4.475834846496582, "learning_rate": 0.0037570799041155693, "loss": 7.8535, "step": 545600 }, { "epoch": 2.2230618735926364, "grad_norm": 6.8989787101745605, "learning_rate": 0.003756650777042563, "loss": 7.8025, "step": 545700 }, { "epoch": 2.2234692516160175, "grad_norm": 5.077826976776123, "learning_rate": 0.003756221600485106, "loss": 7.8013, "step": 545800 }, { "epoch": 2.223876629639399, "grad_norm": 2.5384793281555176, "learning_rate": 0.0037557923744601656, "loss": 7.8181, "step": 545900 }, { "epoch": 2.2242840076627806, "grad_norm": 4.500861644744873, "learning_rate": 0.0037553630989847144, "loss": 7.8304, "step": 546000 }, { "epoch": 2.2242840076627806, "eval_MaskedAccuracy": 0.5001765546564289, "eval_loss": 1.6489428281784058, "eval_runtime": 157.7882, "eval_samples_per_second": 402.286, "eval_steps_per_second": 1.572, "step": 546000 }, { "epoch": 2.224691385686162, "grad_norm": 3.0855696201324463, "learning_rate": 0.0037549337740757196, "loss": 7.8139, "step": 546100 }, { "epoch": 2.2250987637095436, "grad_norm": 3.060678720474243, "learning_rate": 0.0037545043997501606, "loss": 7.7787, "step": 546200 }, { "epoch": 2.225506141732925, "grad_norm": 3.354079008102417, "learning_rate": 0.0037540749760250143, "loss": 7.8235, "step": 546300 }, { "epoch": 2.2259135197563067, "grad_norm": 2.1660995483398438, "learning_rate": 0.003753645502917249, "loss": 7.814, "step": 546400 }, { "epoch": 2.226320897779688, "grad_norm": 2.482395887374878, "learning_rate": 0.003753215980443851, "loss": 7.7994, "step": 546500 }, { "epoch": 2.2267282758030693, "grad_norm": 5.098010540008545, "learning_rate": 0.0037527864086217954, "loss": 7.8203, "step": 546600 }, { "epoch": 2.227135653826451, "grad_norm": 4.216400623321533, "learning_rate": 0.003752356787468074, "loss": 7.7922, "step": 546700 }, { "epoch": 2.2275430318498324, "grad_norm": 5.281837463378906, "learning_rate": 0.0037519271169996625, "loss": 7.8095, "step": 546800 }, { "epoch": 2.227950409873214, "grad_norm": 8.910423278808594, "learning_rate": 0.003751497397233558, "loss": 7.796, "step": 546900 }, { "epoch": 2.2283577878965954, "grad_norm": 3.28885817527771, "learning_rate": 0.0037510676281867425, "loss": 7.8271, "step": 547000 }, { "epoch": 2.2283577878965954, "eval_MaskedAccuracy": 0.5011636216941383, "eval_loss": 1.6514025926589966, "eval_runtime": 170.1542, "eval_samples_per_second": 373.05, "eval_steps_per_second": 1.458, "step": 547000 }, { "epoch": 2.2287651659199765, "grad_norm": 5.114754676818848, "learning_rate": 0.003750637809876211, "loss": 7.8091, "step": 547100 }, { "epoch": 2.229172543943358, "grad_norm": 3.2427895069122314, "learning_rate": 0.003750207942318954, "loss": 7.8476, "step": 547200 }, { "epoch": 2.2295799219667396, "grad_norm": 3.4154791831970215, "learning_rate": 0.0037497780255319677, "loss": 7.7966, "step": 547300 }, { "epoch": 2.229987299990121, "grad_norm": 5.754019737243652, "learning_rate": 0.003749348059532246, "loss": 7.8493, "step": 547400 }, { "epoch": 2.2303946780135027, "grad_norm": 3.7867672443389893, "learning_rate": 0.0037489180443367877, "loss": 7.793, "step": 547500 }, { "epoch": 2.230802056036884, "grad_norm": 3.4546449184417725, "learning_rate": 0.0037484879799626, "loss": 7.8381, "step": 547600 }, { "epoch": 2.2312094340602653, "grad_norm": 6.485270977020264, "learning_rate": 0.003748057866426681, "loss": 7.8148, "step": 547700 }, { "epoch": 2.231616812083647, "grad_norm": 3.5456833839416504, "learning_rate": 0.003747627703746035, "loss": 7.8025, "step": 547800 }, { "epoch": 2.2320241901070283, "grad_norm": 4.4459967613220215, "learning_rate": 0.0037471974919376714, "loss": 7.8222, "step": 547900 }, { "epoch": 2.23243156813041, "grad_norm": 6.9822001457214355, "learning_rate": 0.0037467672310185973, "loss": 7.8157, "step": 548000 }, { "epoch": 2.23243156813041, "eval_MaskedAccuracy": 0.5004119611941124, "eval_loss": 1.6517724990844727, "eval_runtime": 156.7003, "eval_samples_per_second": 405.079, "eval_steps_per_second": 1.583, "step": 548000 }, { "epoch": 2.2328389461537914, "grad_norm": 3.1292076110839844, "learning_rate": 0.003746336921005827, "loss": 7.8315, "step": 548100 }, { "epoch": 2.233246324177173, "grad_norm": 8.645283699035645, "learning_rate": 0.0037459065619163683, "loss": 7.8157, "step": 548200 }, { "epoch": 2.233653702200554, "grad_norm": 3.8814592361450195, "learning_rate": 0.0037454761537672374, "loss": 7.8482, "step": 548300 }, { "epoch": 2.2340610802239356, "grad_norm": 6.201331615447998, "learning_rate": 0.0037450456965754456, "loss": 7.8313, "step": 548400 }, { "epoch": 2.234468458247317, "grad_norm": 3.4205143451690674, "learning_rate": 0.0037446151903580164, "loss": 7.8251, "step": 548500 }, { "epoch": 2.2348758362706986, "grad_norm": 2.4528141021728516, "learning_rate": 0.0037441846351319668, "loss": 7.8233, "step": 548600 }, { "epoch": 2.23528321429408, "grad_norm": 4.1301093101501465, "learning_rate": 0.003743754030914326, "loss": 7.8346, "step": 548700 }, { "epoch": 2.2356905923174617, "grad_norm": 2.6611547470092773, "learning_rate": 0.0037433233777221108, "loss": 7.8525, "step": 548800 }, { "epoch": 2.2360979703408432, "grad_norm": 3.2765557765960693, "learning_rate": 0.0037428926755723488, "loss": 7.8312, "step": 548900 }, { "epoch": 2.2365053483642243, "grad_norm": 2.1360530853271484, "learning_rate": 0.0037424619244820685, "loss": 7.8125, "step": 549000 }, { "epoch": 2.2365053483642243, "eval_MaskedAccuracy": 0.5003689653270004, "eval_loss": 1.6474647521972656, "eval_runtime": 201.9599, "eval_samples_per_second": 314.3, "eval_steps_per_second": 1.228, "step": 549000 }, { "epoch": 2.236912726387606, "grad_norm": 5.588429927825928, "learning_rate": 0.0037420311244682988, "loss": 7.802, "step": 549100 }, { "epoch": 2.2373201044109874, "grad_norm": 3.0774121284484863, "learning_rate": 0.003741600275548074, "loss": 7.8041, "step": 549200 }, { "epoch": 2.237727482434369, "grad_norm": 4.307383060455322, "learning_rate": 0.0037411693777384242, "loss": 7.8031, "step": 549300 }, { "epoch": 2.2381348604577505, "grad_norm": 4.235053539276123, "learning_rate": 0.003740738431056393, "loss": 7.7924, "step": 549400 }, { "epoch": 2.238542238481132, "grad_norm": 3.976292371749878, "learning_rate": 0.0037403074355190067, "loss": 7.8167, "step": 549500 }, { "epoch": 2.238949616504513, "grad_norm": 11.318830490112305, "learning_rate": 0.003739876391143312, "loss": 7.7853, "step": 549600 }, { "epoch": 2.2393569945278946, "grad_norm": 5.350687026977539, "learning_rate": 0.0037394452979463495, "loss": 7.8366, "step": 549700 }, { "epoch": 2.239764372551276, "grad_norm": 3.1678311824798584, "learning_rate": 0.0037390141559451606, "loss": 7.8202, "step": 549800 }, { "epoch": 2.2401717505746577, "grad_norm": 4.260071277618408, "learning_rate": 0.003738582965156799, "loss": 7.8077, "step": 549900 }, { "epoch": 2.240579128598039, "grad_norm": 2.2095463275909424, "learning_rate": 0.0037381517255982975, "loss": 7.8411, "step": 550000 }, { "epoch": 2.240579128598039, "eval_MaskedAccuracy": 0.5005593729909046, "eval_loss": 1.642432689666748, "eval_runtime": 171.0837, "eval_samples_per_second": 371.023, "eval_steps_per_second": 1.45, "step": 550000 }, { "epoch": 2.2409865066214207, "grad_norm": 6.515848636627197, "learning_rate": 0.0037377204372867168, "loss": 7.8074, "step": 550100 }, { "epoch": 2.241393884644802, "grad_norm": 2.398937940597534, "learning_rate": 0.003737289100239101, "loss": 7.8119, "step": 550200 }, { "epoch": 2.2418012626681834, "grad_norm": 2.9077181816101074, "learning_rate": 0.0037368577144725073, "loss": 7.8166, "step": 550300 }, { "epoch": 2.242208640691565, "grad_norm": 4.874625205993652, "learning_rate": 0.003736426280003991, "loss": 7.8138, "step": 550400 }, { "epoch": 2.2426160187149464, "grad_norm": 5.9073967933654785, "learning_rate": 0.0037359947968506053, "loss": 7.8335, "step": 550500 }, { "epoch": 2.243023396738328, "grad_norm": 3.353095293045044, "learning_rate": 0.0037355632650294137, "loss": 7.792, "step": 550600 }, { "epoch": 2.2434307747617095, "grad_norm": 1.4904179573059082, "learning_rate": 0.0037351316845574715, "loss": 7.8373, "step": 550700 }, { "epoch": 2.2438381527850906, "grad_norm": 1.9586129188537598, "learning_rate": 0.003734700055451846, "loss": 7.8444, "step": 550800 }, { "epoch": 2.244245530808472, "grad_norm": 3.9642250537872314, "learning_rate": 0.0037342683777296013, "loss": 7.8427, "step": 550900 }, { "epoch": 2.2446529088318536, "grad_norm": 3.0593836307525635, "learning_rate": 0.003733836651407803, "loss": 7.7923, "step": 551000 }, { "epoch": 2.2446529088318536, "eval_MaskedAccuracy": 0.501333218923484, "eval_loss": 1.635178565979004, "eval_runtime": 161.8285, "eval_samples_per_second": 392.242, "eval_steps_per_second": 1.532, "step": 551000 }, { "epoch": 2.245060286855235, "grad_norm": 3.8172152042388916, "learning_rate": 0.0037334048765035174, "loss": 7.8236, "step": 551100 }, { "epoch": 2.2454676648786167, "grad_norm": 3.798973321914673, "learning_rate": 0.0037329730530338227, "loss": 7.7827, "step": 551200 }, { "epoch": 2.2458750429019982, "grad_norm": 1.6006982326507568, "learning_rate": 0.003732541181015784, "loss": 7.8251, "step": 551300 }, { "epoch": 2.2462824209253798, "grad_norm": 4.4721198081970215, "learning_rate": 0.0037321092604664773, "loss": 7.8398, "step": 551400 }, { "epoch": 2.246689798948761, "grad_norm": 2.283562421798706, "learning_rate": 0.003731677291402978, "loss": 7.7928, "step": 551500 }, { "epoch": 2.2470971769721424, "grad_norm": 7.920275688171387, "learning_rate": 0.003731245273842359, "loss": 7.8573, "step": 551600 }, { "epoch": 2.247504554995524, "grad_norm": 1.8555209636688232, "learning_rate": 0.0037308132078017133, "loss": 7.8155, "step": 551700 }, { "epoch": 2.2479119330189055, "grad_norm": 2.872659921646118, "learning_rate": 0.0037303810932981163, "loss": 7.8305, "step": 551800 }, { "epoch": 2.248319311042287, "grad_norm": 1.9988043308258057, "learning_rate": 0.00372994893034865, "loss": 7.8389, "step": 551900 }, { "epoch": 2.2487266890656685, "grad_norm": 3.3895010948181152, "learning_rate": 0.0037295167189703984, "loss": 7.843, "step": 552000 }, { "epoch": 2.2487266890656685, "eval_MaskedAccuracy": 0.5008253832687625, "eval_loss": 1.643225908279419, "eval_runtime": 185.1265, "eval_samples_per_second": 342.879, "eval_steps_per_second": 1.34, "step": 552000 }, { "epoch": 2.2491340670890496, "grad_norm": 2.5584287643432617, "learning_rate": 0.0037290844591804516, "loss": 7.8113, "step": 552100 }, { "epoch": 2.249541445112431, "grad_norm": 13.224796295166016, "learning_rate": 0.0037286521509959014, "loss": 7.8325, "step": 552200 }, { "epoch": 2.2499488231358127, "grad_norm": 2.5514280796051025, "learning_rate": 0.0037282197944338326, "loss": 7.8443, "step": 552300 }, { "epoch": 2.250356201159194, "grad_norm": 3.5032153129577637, "learning_rate": 0.0037277873895113453, "loss": 7.794, "step": 552400 }, { "epoch": 2.2507635791825757, "grad_norm": 4.562185287475586, "learning_rate": 0.003727354936245535, "loss": 7.7988, "step": 552500 }, { "epoch": 2.2511709572059573, "grad_norm": 5.177770614624023, "learning_rate": 0.0037269224346534965, "loss": 7.7994, "step": 552600 }, { "epoch": 2.2515783352293384, "grad_norm": 2.4556784629821777, "learning_rate": 0.003726489884752326, "loss": 7.8352, "step": 552700 }, { "epoch": 2.25198571325272, "grad_norm": 7.931498050689697, "learning_rate": 0.0037260572865591276, "loss": 7.8044, "step": 552800 }, { "epoch": 2.2523930912761014, "grad_norm": 4.53995943069458, "learning_rate": 0.003725624640091002, "loss": 7.8239, "step": 552900 }, { "epoch": 2.252800469299483, "grad_norm": 4.165882587432861, "learning_rate": 0.003725191945365057, "loss": 7.7913, "step": 553000 }, { "epoch": 2.252800469299483, "eval_MaskedAccuracy": 0.5001803101334295, "eval_loss": 1.644093632698059, "eval_runtime": 194.6042, "eval_samples_per_second": 326.18, "eval_steps_per_second": 1.274, "step": 553000 }, { "epoch": 2.2532078473228645, "grad_norm": 4.2804179191589355, "learning_rate": 0.003724759202398399, "loss": 7.8301, "step": 553100 }, { "epoch": 2.253615225346246, "grad_norm": 5.424826622009277, "learning_rate": 0.003724326411208135, "loss": 7.8126, "step": 553200 }, { "epoch": 2.254022603369627, "grad_norm": 2.1051621437072754, "learning_rate": 0.0037238935718113776, "loss": 7.8271, "step": 553300 }, { "epoch": 2.2544299813930087, "grad_norm": 2.4027085304260254, "learning_rate": 0.003723460684225241, "loss": 7.8519, "step": 553400 }, { "epoch": 2.25483735941639, "grad_norm": 9.889691352844238, "learning_rate": 0.003723027748466836, "loss": 7.8033, "step": 553500 }, { "epoch": 2.2552447374397717, "grad_norm": 7.366210460662842, "learning_rate": 0.003722594764553277, "loss": 7.8267, "step": 553600 }, { "epoch": 2.2556521154631533, "grad_norm": 6.189861297607422, "learning_rate": 0.0037221617325016855, "loss": 7.8356, "step": 553700 }, { "epoch": 2.256059493486535, "grad_norm": 1.8813151121139526, "learning_rate": 0.0037217286523291786, "loss": 7.7938, "step": 553800 }, { "epoch": 2.2564668715099163, "grad_norm": 1.6196503639221191, "learning_rate": 0.0037212955240528856, "loss": 7.8403, "step": 553900 }, { "epoch": 2.2568742495332974, "grad_norm": 4.88037109375, "learning_rate": 0.0037208623476899226, "loss": 7.8406, "step": 554000 }, { "epoch": 2.2568742495332974, "eval_MaskedAccuracy": 0.5005618339050674, "eval_loss": 1.6498075723648071, "eval_runtime": 159.7862, "eval_samples_per_second": 397.256, "eval_steps_per_second": 1.552, "step": 554000 }, { "epoch": 2.257281627556679, "grad_norm": 3.5858309268951416, "learning_rate": 0.003720429123257416, "loss": 7.834, "step": 554100 }, { "epoch": 2.2576890055800605, "grad_norm": 1.9539397954940796, "learning_rate": 0.0037199958507724957, "loss": 7.7847, "step": 554200 }, { "epoch": 2.258096383603442, "grad_norm": 2.2557578086853027, "learning_rate": 0.0037195625302522935, "loss": 7.7873, "step": 554300 }, { "epoch": 2.2585037616268235, "grad_norm": 3.007535457611084, "learning_rate": 0.003719129161713936, "loss": 7.8134, "step": 554400 }, { "epoch": 2.2589111396502046, "grad_norm": 8.381376266479492, "learning_rate": 0.0037186957451745602, "loss": 7.8051, "step": 554500 }, { "epoch": 2.259318517673586, "grad_norm": 4.962038516998291, "learning_rate": 0.0037182622806513016, "loss": 7.833, "step": 554600 }, { "epoch": 2.2597258956969677, "grad_norm": 3.697455883026123, "learning_rate": 0.0037178287681612944, "loss": 7.7938, "step": 554700 }, { "epoch": 2.2601332737203492, "grad_norm": 6.414445877075195, "learning_rate": 0.003717395207721681, "loss": 7.819, "step": 554800 }, { "epoch": 2.2605406517437308, "grad_norm": 5.8277907371521, "learning_rate": 0.0037169615993495964, "loss": 7.8174, "step": 554900 }, { "epoch": 2.2609480297671123, "grad_norm": 1.5174105167388916, "learning_rate": 0.0037165279430621916, "loss": 7.8382, "step": 555000 }, { "epoch": 2.2609480297671123, "eval_MaskedAccuracy": 0.5009572925655903, "eval_loss": 1.6404433250427246, "eval_runtime": 156.6815, "eval_samples_per_second": 405.128, "eval_steps_per_second": 1.583, "step": 555000 }, { "epoch": 2.261355407790494, "grad_norm": 3.088292121887207, "learning_rate": 0.003716094238876608, "loss": 7.7966, "step": 555100 }, { "epoch": 2.261762785813875, "grad_norm": 3.5430777072906494, "learning_rate": 0.0037156604868099916, "loss": 7.8028, "step": 555200 }, { "epoch": 2.2621701638372564, "grad_norm": 3.1515798568725586, "learning_rate": 0.0037152266868794898, "loss": 7.8386, "step": 555300 }, { "epoch": 2.262577541860638, "grad_norm": 2.4247777462005615, "learning_rate": 0.0037147928391022566, "loss": 7.8397, "step": 555400 }, { "epoch": 2.2629849198840195, "grad_norm": 4.718557834625244, "learning_rate": 0.0037143589434954407, "loss": 7.8009, "step": 555500 }, { "epoch": 2.263392297907401, "grad_norm": 5.006442070007324, "learning_rate": 0.0037139250000761985, "loss": 7.8163, "step": 555600 }, { "epoch": 2.2637996759307826, "grad_norm": 3.265850067138672, "learning_rate": 0.003713491008861681, "loss": 7.7798, "step": 555700 }, { "epoch": 2.2642070539541637, "grad_norm": 7.734994411468506, "learning_rate": 0.003713056969869053, "loss": 7.8296, "step": 555800 }, { "epoch": 2.264614431977545, "grad_norm": 10.472969055175781, "learning_rate": 0.0037126228831154706, "loss": 7.8263, "step": 555900 }, { "epoch": 2.2650218100009267, "grad_norm": 2.4553678035736084, "learning_rate": 0.0037121887486181015, "loss": 7.8158, "step": 556000 }, { "epoch": 2.2650218100009267, "eval_MaskedAccuracy": 0.5005522445873212, "eval_loss": 1.646085500717163, "eval_runtime": 155.4707, "eval_samples_per_second": 408.283, "eval_steps_per_second": 1.595, "step": 556000 }, { "epoch": 2.2654291880243083, "grad_norm": 10.690045356750488, "learning_rate": 0.003711754566394102, "loss": 7.837, "step": 556100 }, { "epoch": 2.26583656604769, "grad_norm": 4.021206855773926, "learning_rate": 0.00371132033646064, "loss": 7.81, "step": 556200 }, { "epoch": 2.2662439440710713, "grad_norm": 8.149909019470215, "learning_rate": 0.003710886058834882, "loss": 7.7881, "step": 556300 }, { "epoch": 2.266651322094453, "grad_norm": 3.954235553741455, "learning_rate": 0.003710451733534001, "loss": 7.8244, "step": 556400 }, { "epoch": 2.267058700117834, "grad_norm": 7.280412673950195, "learning_rate": 0.0037100173605751647, "loss": 7.8122, "step": 556500 }, { "epoch": 2.2674660781412155, "grad_norm": 3.6498842239379883, "learning_rate": 0.003709582939975548, "loss": 7.8038, "step": 556600 }, { "epoch": 2.267873456164597, "grad_norm": 4.224459171295166, "learning_rate": 0.003709148471752326, "loss": 7.8348, "step": 556700 }, { "epoch": 2.2682808341879785, "grad_norm": 5.864927768707275, "learning_rate": 0.003708713955922673, "loss": 7.7983, "step": 556800 }, { "epoch": 2.26868821221136, "grad_norm": 5.801333427429199, "learning_rate": 0.003708279392503769, "loss": 7.8092, "step": 556900 }, { "epoch": 2.269095590234741, "grad_norm": 3.181314468383789, "learning_rate": 0.003707844781512797, "loss": 7.8201, "step": 557000 }, { "epoch": 2.269095590234741, "eval_MaskedAccuracy": 0.50079975482266, "eval_loss": 1.6471331119537354, "eval_runtime": 155.9278, "eval_samples_per_second": 407.086, "eval_steps_per_second": 1.59, "step": 557000 }, { "epoch": 2.2695029682581227, "grad_norm": 5.057437419891357, "learning_rate": 0.0037074101229669356, "loss": 7.7945, "step": 557100 }, { "epoch": 2.2699103462815042, "grad_norm": 3.7329092025756836, "learning_rate": 0.0037069754168833715, "loss": 7.7986, "step": 557200 }, { "epoch": 2.2703177243048858, "grad_norm": 4.425315856933594, "learning_rate": 0.0037065406632792917, "loss": 7.8158, "step": 557300 }, { "epoch": 2.2707251023282673, "grad_norm": 2.3986151218414307, "learning_rate": 0.0037061058621718837, "loss": 7.8124, "step": 557400 }, { "epoch": 2.271132480351649, "grad_norm": 3.9646878242492676, "learning_rate": 0.003705671013578333, "loss": 7.8108, "step": 557500 }, { "epoch": 2.2715398583750304, "grad_norm": 4.580368518829346, "learning_rate": 0.0037052361175158414, "loss": 7.8063, "step": 557600 }, { "epoch": 2.2719472363984115, "grad_norm": 7.307309150695801, "learning_rate": 0.0037048011740015954, "loss": 7.812, "step": 557700 }, { "epoch": 2.272354614421793, "grad_norm": 5.580484867095947, "learning_rate": 0.00370436618305279, "loss": 7.8363, "step": 557800 }, { "epoch": 2.2727619924451745, "grad_norm": 6.419212818145752, "learning_rate": 0.003703931144686628, "loss": 7.8152, "step": 557900 }, { "epoch": 2.273169370468556, "grad_norm": 3.574993133544922, "learning_rate": 0.003703496058920303, "loss": 7.7717, "step": 558000 }, { "epoch": 2.273169370468556, "eval_MaskedAccuracy": 0.5011912879313032, "eval_loss": 1.6435333490371704, "eval_runtime": 160.4227, "eval_samples_per_second": 395.68, "eval_steps_per_second": 1.546, "step": 558000 }, { "epoch": 2.2735767484919376, "grad_norm": 2.877426862716675, "learning_rate": 0.0037030609257710186, "loss": 7.8217, "step": 558100 }, { "epoch": 2.273984126515319, "grad_norm": 4.252274990081787, "learning_rate": 0.003702625745255979, "loss": 7.8042, "step": 558200 }, { "epoch": 2.2743915045387, "grad_norm": 7.433331489562988, "learning_rate": 0.0037021905173923847, "loss": 7.8158, "step": 558300 }, { "epoch": 2.2747988825620817, "grad_norm": 4.4136786460876465, "learning_rate": 0.0037017552421974493, "loss": 7.8343, "step": 558400 }, { "epoch": 2.2752062605854633, "grad_norm": 1.7757295370101929, "learning_rate": 0.0037013199196883778, "loss": 7.8028, "step": 558500 }, { "epoch": 2.275613638608845, "grad_norm": 2.7505078315734863, "learning_rate": 0.00370088454988238, "loss": 7.8028, "step": 558600 }, { "epoch": 2.2760210166322263, "grad_norm": 3.4686386585235596, "learning_rate": 0.0037004491327966697, "loss": 7.7894, "step": 558700 }, { "epoch": 2.276428394655608, "grad_norm": 3.3943593502044678, "learning_rate": 0.003700013668448465, "loss": 7.8202, "step": 558800 }, { "epoch": 2.2768357726789894, "grad_norm": 4.104429244995117, "learning_rate": 0.0036995781568549776, "loss": 7.8157, "step": 558900 }, { "epoch": 2.2772431507023705, "grad_norm": 5.528374195098877, "learning_rate": 0.003699142598033427, "loss": 7.8183, "step": 559000 }, { "epoch": 2.2772431507023705, "eval_MaskedAccuracy": 0.500252366596004, "eval_loss": 1.6462630033493042, "eval_runtime": 181.7661, "eval_samples_per_second": 349.218, "eval_steps_per_second": 1.364, "step": 559000 }, { "epoch": 2.277650528725752, "grad_norm": 3.605768918991089, "learning_rate": 0.0036987069920010316, "loss": 7.7941, "step": 559100 }, { "epoch": 2.2780579067491336, "grad_norm": 4.0323286056518555, "learning_rate": 0.003698271338775016, "loss": 7.8123, "step": 559200 }, { "epoch": 2.278465284772515, "grad_norm": 2.5782217979431152, "learning_rate": 0.003697835638372599, "loss": 7.7876, "step": 559300 }, { "epoch": 2.2788726627958966, "grad_norm": 3.2254178524017334, "learning_rate": 0.0036973998908110114, "loss": 7.8112, "step": 559400 }, { "epoch": 2.2792800408192777, "grad_norm": 6.719834804534912, "learning_rate": 0.003696964096107479, "loss": 7.7808, "step": 559500 }, { "epoch": 2.2796874188426592, "grad_norm": 7.524669647216797, "learning_rate": 0.003696528254279231, "loss": 7.8251, "step": 559600 }, { "epoch": 2.2800947968660408, "grad_norm": 7.806594371795654, "learning_rate": 0.0036960923653435005, "loss": 7.8273, "step": 559700 }, { "epoch": 2.2805021748894223, "grad_norm": 10.59316635131836, "learning_rate": 0.003695656429317515, "loss": 7.7829, "step": 559800 }, { "epoch": 2.280909552912804, "grad_norm": 2.955613374710083, "learning_rate": 0.0036952204462185126, "loss": 7.791, "step": 559900 }, { "epoch": 2.2813169309361854, "grad_norm": 5.315120697021484, "learning_rate": 0.003694784416063736, "loss": 7.81, "step": 560000 }, { "epoch": 2.2813169309361854, "eval_MaskedAccuracy": 0.5001056928718093, "eval_loss": 1.6514586210250854, "eval_runtime": 154.8769, "eval_samples_per_second": 409.848, "eval_steps_per_second": 1.601, "step": 560000 }, { "epoch": 2.281724308959567, "grad_norm": 2.851351737976074, "learning_rate": 0.0036943483388704123, "loss": 7.7882, "step": 560100 }, { "epoch": 2.282131686982948, "grad_norm": 6.286218643188477, "learning_rate": 0.003693912214655789, "loss": 7.8126, "step": 560200 }, { "epoch": 2.2825390650063295, "grad_norm": 7.340025424957275, "learning_rate": 0.003693476043437107, "loss": 7.795, "step": 560300 }, { "epoch": 2.282946443029711, "grad_norm": 5.198561668395996, "learning_rate": 0.0036930398252316113, "loss": 7.8086, "step": 560400 }, { "epoch": 2.2833538210530926, "grad_norm": 2.6386916637420654, "learning_rate": 0.003692603560056541, "loss": 7.8095, "step": 560500 }, { "epoch": 2.283761199076474, "grad_norm": 5.484129428863525, "learning_rate": 0.0036921672479291543, "loss": 7.8065, "step": 560600 }, { "epoch": 2.2841685770998557, "grad_norm": 4.82759952545166, "learning_rate": 0.003691730888866699, "loss": 7.8265, "step": 560700 }, { "epoch": 2.2845759551232367, "grad_norm": 2.8393630981445312, "learning_rate": 0.0036912944828864256, "loss": 7.7741, "step": 560800 }, { "epoch": 2.2849833331466183, "grad_norm": 3.239849090576172, "learning_rate": 0.003690858030005584, "loss": 7.8024, "step": 560900 }, { "epoch": 2.28539071117, "grad_norm": 4.779313087463379, "learning_rate": 0.0036904215302414316, "loss": 7.825, "step": 561000 }, { "epoch": 2.28539071117, "eval_MaskedAccuracy": 0.5005410275150758, "eval_loss": 1.6495428085327148, "eval_runtime": 162.8685, "eval_samples_per_second": 389.738, "eval_steps_per_second": 1.523, "step": 561000 }, { "epoch": 2.2857980891933813, "grad_norm": 9.7829008102417, "learning_rate": 0.003689984983611227, "loss": 7.7901, "step": 561100 }, { "epoch": 2.286205467216763, "grad_norm": 4.617170333862305, "learning_rate": 0.0036895483901322294, "loss": 7.8269, "step": 561200 }, { "epoch": 2.2866128452401444, "grad_norm": 2.352316379547119, "learning_rate": 0.003689111749821697, "loss": 7.842, "step": 561300 }, { "epoch": 2.287020223263526, "grad_norm": 4.032354354858398, "learning_rate": 0.0036886750626968943, "loss": 7.801, "step": 561400 }, { "epoch": 2.287427601286907, "grad_norm": 5.562901973724365, "learning_rate": 0.003688238328775084, "loss": 7.7932, "step": 561500 }, { "epoch": 2.2878349793102886, "grad_norm": 3.7815651893615723, "learning_rate": 0.003687801548073535, "loss": 7.8262, "step": 561600 }, { "epoch": 2.28824235733367, "grad_norm": 6.96209192276001, "learning_rate": 0.0036873647206095155, "loss": 7.8029, "step": 561700 }, { "epoch": 2.2886497353570516, "grad_norm": 2.10092830657959, "learning_rate": 0.0036869278464002927, "loss": 7.8132, "step": 561800 }, { "epoch": 2.289057113380433, "grad_norm": 3.034524917602539, "learning_rate": 0.0036864909254631407, "loss": 7.8082, "step": 561900 }, { "epoch": 2.2894644914038143, "grad_norm": 7.174429416656494, "learning_rate": 0.003686053957815332, "loss": 7.8039, "step": 562000 }, { "epoch": 2.2894644914038143, "eval_MaskedAccuracy": 0.5008579440761833, "eval_loss": 1.64951491355896, "eval_runtime": 169.9335, "eval_samples_per_second": 373.534, "eval_steps_per_second": 1.459, "step": 562000 }, { "epoch": 2.289871869427196, "grad_norm": 2.877821922302246, "learning_rate": 0.0036856169434741444, "loss": 7.7858, "step": 562100 }, { "epoch": 2.2902792474505773, "grad_norm": 3.2900638580322266, "learning_rate": 0.003685179882456856, "loss": 7.8389, "step": 562200 }, { "epoch": 2.290686625473959, "grad_norm": 2.1930456161499023, "learning_rate": 0.0036847427747807456, "loss": 7.774, "step": 562300 }, { "epoch": 2.2910940034973404, "grad_norm": 1.7845197916030884, "learning_rate": 0.0036843056204630973, "loss": 7.7848, "step": 562400 }, { "epoch": 2.291501381520722, "grad_norm": 3.4839022159576416, "learning_rate": 0.0036838684195211868, "loss": 7.8392, "step": 562500 }, { "epoch": 2.2919087595441034, "grad_norm": 3.169654369354248, "learning_rate": 0.0036834311719723045, "loss": 7.8162, "step": 562600 }, { "epoch": 2.2923161375674845, "grad_norm": 5.8147382736206055, "learning_rate": 0.0036829938778337344, "loss": 7.8284, "step": 562700 }, { "epoch": 2.292723515590866, "grad_norm": 3.330246686935425, "learning_rate": 0.003682556537122767, "loss": 7.825, "step": 562800 }, { "epoch": 2.2931308936142476, "grad_norm": 4.501898288726807, "learning_rate": 0.003682119149856691, "loss": 7.8016, "step": 562900 }, { "epoch": 2.293538271637629, "grad_norm": 6.830868721008301, "learning_rate": 0.0036816817160528004, "loss": 7.8366, "step": 563000 }, { "epoch": 2.293538271637629, "eval_MaskedAccuracy": 0.5006073604882039, "eval_loss": 1.6413272619247437, "eval_runtime": 165.8437, "eval_samples_per_second": 382.746, "eval_steps_per_second": 1.495, "step": 563000 }, { "epoch": 2.2939456496610107, "grad_norm": 3.261218786239624, "learning_rate": 0.0036812442357283866, "loss": 7.8113, "step": 563100 }, { "epoch": 2.294353027684392, "grad_norm": 5.727258682250977, "learning_rate": 0.0036808067089007493, "loss": 7.8122, "step": 563200 }, { "epoch": 2.2947604057077733, "grad_norm": 3.5103304386138916, "learning_rate": 0.0036803691355871792, "loss": 7.8079, "step": 563300 }, { "epoch": 2.295167783731155, "grad_norm": 6.098264217376709, "learning_rate": 0.003679931515804987, "loss": 7.7835, "step": 563400 }, { "epoch": 2.2955751617545364, "grad_norm": 7.136855125427246, "learning_rate": 0.0036794938495714642, "loss": 7.7704, "step": 563500 }, { "epoch": 2.295982539777918, "grad_norm": 6.015887260437012, "learning_rate": 0.0036790561369039144, "loss": 7.8326, "step": 563600 }, { "epoch": 2.2963899178012994, "grad_norm": 2.1826488971710205, "learning_rate": 0.0036786183778196473, "loss": 7.7864, "step": 563700 }, { "epoch": 2.296797295824681, "grad_norm": 7.495033264160156, "learning_rate": 0.00367818057233597, "loss": 7.8182, "step": 563800 }, { "epoch": 2.2972046738480625, "grad_norm": 7.047767162322998, "learning_rate": 0.003677742720470191, "loss": 7.8145, "step": 563900 }, { "epoch": 2.2976120518714436, "grad_norm": 6.113947868347168, "learning_rate": 0.003677304822239619, "loss": 7.8133, "step": 564000 }, { "epoch": 2.2976120518714436, "eval_MaskedAccuracy": 0.500017308723155, "eval_loss": 1.6545196771621704, "eval_runtime": 161.5075, "eval_samples_per_second": 393.022, "eval_steps_per_second": 1.536, "step": 564000 }, { "epoch": 2.298019429894825, "grad_norm": 6.169644355773926, "learning_rate": 0.0036768668776615706, "loss": 7.8446, "step": 564100 }, { "epoch": 2.2984268079182066, "grad_norm": 2.7540698051452637, "learning_rate": 0.0036764288867533533, "loss": 7.8195, "step": 564200 }, { "epoch": 2.298834185941588, "grad_norm": 2.0158894062042236, "learning_rate": 0.0036759908495322883, "loss": 7.7954, "step": 564300 }, { "epoch": 2.2992415639649697, "grad_norm": 3.291651964187622, "learning_rate": 0.003675552766015695, "loss": 7.8378, "step": 564400 }, { "epoch": 2.299648941988351, "grad_norm": 6.923490524291992, "learning_rate": 0.003675114636220887, "loss": 7.813, "step": 564500 }, { "epoch": 2.3000563200117323, "grad_norm": 2.207007646560669, "learning_rate": 0.0036746764601651885, "loss": 7.7775, "step": 564600 }, { "epoch": 2.300463698035114, "grad_norm": 6.48522424697876, "learning_rate": 0.003674238237865923, "loss": 7.8255, "step": 564700 }, { "epoch": 2.3008710760584954, "grad_norm": 4.100906848907471, "learning_rate": 0.0036737999693404197, "loss": 7.771, "step": 564800 }, { "epoch": 2.301278454081877, "grad_norm": 6.507815361022949, "learning_rate": 0.0036733616546060005, "loss": 7.8102, "step": 564900 }, { "epoch": 2.3016858321052585, "grad_norm": 3.452202796936035, "learning_rate": 0.003672923293679999, "loss": 7.8209, "step": 565000 }, { "epoch": 2.3016858321052585, "eval_MaskedAccuracy": 0.5015413176382573, "eval_loss": 1.6353548765182495, "eval_runtime": 199.7571, "eval_samples_per_second": 317.766, "eval_steps_per_second": 1.242, "step": 565000 }, { "epoch": 2.30209321012864, "grad_norm": 4.378198623657227, "learning_rate": 0.003672484886579738, "loss": 7.7998, "step": 565100 }, { "epoch": 2.302500588152021, "grad_norm": 2.597689628601074, "learning_rate": 0.003672046433322559, "loss": 7.8292, "step": 565200 }, { "epoch": 2.3029079661754026, "grad_norm": 3.111417055130005, "learning_rate": 0.0036716079339257908, "loss": 7.8239, "step": 565300 }, { "epoch": 2.303315344198784, "grad_norm": 4.451837062835693, "learning_rate": 0.0036711693884067735, "loss": 7.8201, "step": 565400 }, { "epoch": 2.3037227222221657, "grad_norm": 10.4205904006958, "learning_rate": 0.0036707307967828433, "loss": 7.8191, "step": 565500 }, { "epoch": 2.304130100245547, "grad_norm": 1.5484758615493774, "learning_rate": 0.0036702921590713367, "loss": 7.8065, "step": 565600 }, { "epoch": 2.3045374782689287, "grad_norm": 4.571197986602783, "learning_rate": 0.003669853475289599, "loss": 7.8109, "step": 565700 }, { "epoch": 2.30494485629231, "grad_norm": 6.3760247230529785, "learning_rate": 0.003669414745454978, "loss": 7.7971, "step": 565800 }, { "epoch": 2.3053522343156914, "grad_norm": 4.224003791809082, "learning_rate": 0.0036689759695848105, "loss": 7.8078, "step": 565900 }, { "epoch": 2.305759612339073, "grad_norm": 7.778561115264893, "learning_rate": 0.003668537147696447, "loss": 7.8465, "step": 566000 }, { "epoch": 2.305759612339073, "eval_MaskedAccuracy": 0.5002568947516487, "eval_loss": 1.639491319656372, "eval_runtime": 156.5749, "eval_samples_per_second": 405.403, "eval_steps_per_second": 1.584, "step": 566000 }, { "epoch": 2.3061669903624544, "grad_norm": 4.958521842956543, "learning_rate": 0.00366809827980724, "loss": 7.8103, "step": 566100 }, { "epoch": 2.306574368385836, "grad_norm": 10.79844856262207, "learning_rate": 0.0036676593659345346, "loss": 7.818, "step": 566200 }, { "epoch": 2.3069817464092175, "grad_norm": 2.0336101055145264, "learning_rate": 0.003667220406095688, "loss": 7.819, "step": 566300 }, { "epoch": 2.307389124432599, "grad_norm": 1.9391813278198242, "learning_rate": 0.0036667814003080537, "loss": 7.7723, "step": 566400 }, { "epoch": 2.30779650245598, "grad_norm": 5.151214122772217, "learning_rate": 0.0036663423485889897, "loss": 7.8015, "step": 566500 }, { "epoch": 2.3082038804793616, "grad_norm": 7.156179904937744, "learning_rate": 0.003665903250955848, "loss": 7.792, "step": 566600 }, { "epoch": 2.308611258502743, "grad_norm": 7.693875789642334, "learning_rate": 0.003665464107425993, "loss": 7.8119, "step": 566700 }, { "epoch": 2.3090186365261247, "grad_norm": 3.309619665145874, "learning_rate": 0.0036650249180167835, "loss": 7.8418, "step": 566800 }, { "epoch": 2.3094260145495062, "grad_norm": 2.2675909996032715, "learning_rate": 0.003664585682745583, "loss": 7.8168, "step": 566900 }, { "epoch": 2.3098333925728873, "grad_norm": 9.942997932434082, "learning_rate": 0.0036641464016297586, "loss": 7.8399, "step": 567000 }, { "epoch": 2.3098333925728873, "eval_MaskedAccuracy": 0.5007904791842843, "eval_loss": 1.6460387706756592, "eval_runtime": 163.1776, "eval_samples_per_second": 388.999, "eval_steps_per_second": 1.52, "step": 567000 }, { "epoch": 2.310240770596269, "grad_norm": 2.691694974899292, "learning_rate": 0.0036637070746866813, "loss": 7.8004, "step": 567100 }, { "epoch": 2.3106481486196504, "grad_norm": 7.271874904632568, "learning_rate": 0.003663267701933716, "loss": 7.8369, "step": 567200 }, { "epoch": 2.311055526643032, "grad_norm": 6.275662422180176, "learning_rate": 0.003662828283388233, "loss": 7.8187, "step": 567300 }, { "epoch": 2.3114629046664135, "grad_norm": 5.2271881103515625, "learning_rate": 0.0036623888190675984, "loss": 7.8217, "step": 567400 }, { "epoch": 2.311870282689795, "grad_norm": 8.13151741027832, "learning_rate": 0.0036619493089891995, "loss": 7.7958, "step": 567500 }, { "epoch": 2.3122776607131765, "grad_norm": 6.556377410888672, "learning_rate": 0.003661509753170407, "loss": 7.8084, "step": 567600 }, { "epoch": 2.3126850387365576, "grad_norm": 11.249375343322754, "learning_rate": 0.0036610701516285983, "loss": 7.8146, "step": 567700 }, { "epoch": 2.313092416759939, "grad_norm": 4.4853129386901855, "learning_rate": 0.0036606305043811495, "loss": 7.8035, "step": 567800 }, { "epoch": 2.3134997947833207, "grad_norm": 7.194879531860352, "learning_rate": 0.0036601908114454506, "loss": 7.8378, "step": 567900 }, { "epoch": 2.313907172806702, "grad_norm": 4.056886196136475, "learning_rate": 0.0036597510728388766, "loss": 7.7952, "step": 568000 }, { "epoch": 2.313907172806702, "eval_MaskedAccuracy": 0.5013411376071463, "eval_loss": 1.6393439769744873, "eval_runtime": 162.3513, "eval_samples_per_second": 390.979, "eval_steps_per_second": 1.528, "step": 568000 }, { "epoch": 2.3143145508300837, "grad_norm": 1.8405582904815674, "learning_rate": 0.003659311288578821, "loss": 7.8059, "step": 568100 }, { "epoch": 2.3147219288534653, "grad_norm": 7.408936977386475, "learning_rate": 0.0036588714586826636, "loss": 7.811, "step": 568200 }, { "epoch": 2.3151293068768464, "grad_norm": 3.2693755626678467, "learning_rate": 0.0036584315831677977, "loss": 7.8182, "step": 568300 }, { "epoch": 2.315536684900228, "grad_norm": 2.8912346363067627, "learning_rate": 0.0036579916620516116, "loss": 7.754, "step": 568400 }, { "epoch": 2.3159440629236094, "grad_norm": 2.818791151046753, "learning_rate": 0.003657551695351498, "loss": 7.7531, "step": 568500 }, { "epoch": 2.316351440946991, "grad_norm": 5.777654647827148, "learning_rate": 0.0036571116830848537, "loss": 7.8041, "step": 568600 }, { "epoch": 2.3167588189703725, "grad_norm": 2.4717233180999756, "learning_rate": 0.0036566716252690694, "loss": 7.7759, "step": 568700 }, { "epoch": 2.317166196993754, "grad_norm": 4.8509697914123535, "learning_rate": 0.0036562315219215522, "loss": 7.797, "step": 568800 }, { "epoch": 2.3175735750171356, "grad_norm": 2.7276742458343506, "learning_rate": 0.0036557913730596923, "loss": 7.7912, "step": 568900 }, { "epoch": 2.3179809530405167, "grad_norm": 7.866903781890869, "learning_rate": 0.0036553511787008955, "loss": 7.7853, "step": 569000 }, { "epoch": 2.3179809530405167, "eval_MaskedAccuracy": 0.5008868233028845, "eval_loss": 1.6494930982589722, "eval_runtime": 154.7865, "eval_samples_per_second": 410.087, "eval_steps_per_second": 1.602, "step": 569000 }, { "epoch": 2.318388331063898, "grad_norm": 2.6144144535064697, "learning_rate": 0.003654910938862568, "loss": 7.775, "step": 569100 }, { "epoch": 2.3187957090872797, "grad_norm": 3.0788028240203857, "learning_rate": 0.0036544706535621085, "loss": 7.7952, "step": 569200 }, { "epoch": 2.3192030871106613, "grad_norm": 6.774467945098877, "learning_rate": 0.003654030322816921, "loss": 7.7876, "step": 569300 }, { "epoch": 2.319610465134043, "grad_norm": 7.350885391235352, "learning_rate": 0.0036535899466444213, "loss": 7.8123, "step": 569400 }, { "epoch": 2.320017843157424, "grad_norm": 2.695387601852417, "learning_rate": 0.0036531495250620187, "loss": 7.8159, "step": 569500 }, { "epoch": 2.3204252211808054, "grad_norm": 2.6932992935180664, "learning_rate": 0.0036527090580871263, "loss": 7.8055, "step": 569600 }, { "epoch": 2.320832599204187, "grad_norm": 2.527388572692871, "learning_rate": 0.0036522685457371557, "loss": 7.7804, "step": 569700 }, { "epoch": 2.3212399772275685, "grad_norm": 4.48877477645874, "learning_rate": 0.0036518279880295218, "loss": 7.8505, "step": 569800 }, { "epoch": 2.32164735525095, "grad_norm": 5.784459590911865, "learning_rate": 0.0036513873849816428, "loss": 7.8049, "step": 569900 }, { "epoch": 2.3220547332743315, "grad_norm": 7.657225608825684, "learning_rate": 0.0036509467366109393, "loss": 7.8017, "step": 570000 }, { "epoch": 2.3220547332743315, "eval_MaskedAccuracy": 0.5004812969165234, "eval_loss": 1.6518429517745972, "eval_runtime": 183.4467, "eval_samples_per_second": 346.019, "eval_steps_per_second": 1.352, "step": 570000 }, { "epoch": 2.322462111297713, "grad_norm": 5.676888465881348, "learning_rate": 0.0036505060429348345, "loss": 7.787, "step": 570100 }, { "epoch": 2.322869489321094, "grad_norm": 3.2929296493530273, "learning_rate": 0.0036500653039707467, "loss": 7.791, "step": 570200 }, { "epoch": 2.3232768673444757, "grad_norm": 3.6335253715515137, "learning_rate": 0.003649624519736106, "loss": 7.7994, "step": 570300 }, { "epoch": 2.3236842453678572, "grad_norm": 4.016894340515137, "learning_rate": 0.0036491836902483366, "loss": 7.8071, "step": 570400 }, { "epoch": 2.3240916233912388, "grad_norm": 3.255833864212036, "learning_rate": 0.003648742815524863, "loss": 7.8296, "step": 570500 }, { "epoch": 2.3244990014146203, "grad_norm": 3.7515814304351807, "learning_rate": 0.0036483018955831216, "loss": 7.7897, "step": 570600 }, { "epoch": 2.324906379438002, "grad_norm": 3.1917057037353516, "learning_rate": 0.00364786093044054, "loss": 7.7937, "step": 570700 }, { "epoch": 2.325313757461383, "grad_norm": 4.040051460266113, "learning_rate": 0.0036474199201145564, "loss": 7.7956, "step": 570800 }, { "epoch": 2.3257211354847644, "grad_norm": 8.876458168029785, "learning_rate": 0.003646978864622602, "loss": 7.8038, "step": 570900 }, { "epoch": 2.326128513508146, "grad_norm": 6.235292911529541, "learning_rate": 0.0036465377639821137, "loss": 7.8209, "step": 571000 }, { "epoch": 2.326128513508146, "eval_MaskedAccuracy": 0.5000286903679956, "eval_loss": 1.6561836004257202, "eval_runtime": 207.5661, "eval_samples_per_second": 305.811, "eval_steps_per_second": 1.195, "step": 571000 }, { "epoch": 2.3265358915315275, "grad_norm": 4.052027225494385, "learning_rate": 0.0036460966182105324, "loss": 7.8094, "step": 571100 }, { "epoch": 2.326943269554909, "grad_norm": 2.9445784091949463, "learning_rate": 0.0036456554273253004, "loss": 7.7966, "step": 571200 }, { "epoch": 2.3273506475782906, "grad_norm": 5.168513774871826, "learning_rate": 0.00364521419134386, "loss": 7.8173, "step": 571300 }, { "epoch": 2.327758025601672, "grad_norm": 4.562358379364014, "learning_rate": 0.0036447729102836533, "loss": 7.7718, "step": 571400 }, { "epoch": 2.328165403625053, "grad_norm": 5.8594651222229, "learning_rate": 0.0036443315841621256, "loss": 7.8094, "step": 571500 }, { "epoch": 2.3285727816484347, "grad_norm": 5.5165019035339355, "learning_rate": 0.0036438902129967304, "loss": 7.7973, "step": 571600 }, { "epoch": 2.3289801596718163, "grad_norm": 3.9439899921417236, "learning_rate": 0.003643448796804908, "loss": 7.7941, "step": 571700 }, { "epoch": 2.329387537695198, "grad_norm": 3.9785115718841553, "learning_rate": 0.003643007335604119, "loss": 7.7978, "step": 571800 }, { "epoch": 2.3297949157185793, "grad_norm": 9.531577110290527, "learning_rate": 0.003642565829411813, "loss": 7.7614, "step": 571900 }, { "epoch": 2.3302022937419604, "grad_norm": 1.9221528768539429, "learning_rate": 0.003642124278245446, "loss": 7.8186, "step": 572000 }, { "epoch": 2.3302022937419604, "eval_MaskedAccuracy": 0.5009346109963853, "eval_loss": 1.6393803358078003, "eval_runtime": 168.0914, "eval_samples_per_second": 377.628, "eval_steps_per_second": 1.475, "step": 572000 }, { "epoch": 2.330609671765342, "grad_norm": 5.259122848510742, "learning_rate": 0.003641682682122474, "loss": 7.7697, "step": 572100 }, { "epoch": 2.3310170497887235, "grad_norm": 9.222817420959473, "learning_rate": 0.003641241041060359, "loss": 7.8259, "step": 572200 }, { "epoch": 2.331424427812105, "grad_norm": 2.1270833015441895, "learning_rate": 0.003640799355076559, "loss": 7.7948, "step": 572300 }, { "epoch": 2.3318318058354865, "grad_norm": 5.878179550170898, "learning_rate": 0.0036403576241885336, "loss": 7.82, "step": 572400 }, { "epoch": 2.332239183858868, "grad_norm": 5.052548408508301, "learning_rate": 0.003639915848413748, "loss": 7.7924, "step": 572500 }, { "epoch": 2.3326465618822496, "grad_norm": 5.933427333831787, "learning_rate": 0.003639474027769669, "loss": 7.7903, "step": 572600 }, { "epoch": 2.3330539399056307, "grad_norm": 2.792175054550171, "learning_rate": 0.0036390321622737616, "loss": 7.803, "step": 572700 }, { "epoch": 2.3334613179290122, "grad_norm": 3.595899820327759, "learning_rate": 0.003638590251943498, "loss": 7.7795, "step": 572800 }, { "epoch": 2.3338686959523938, "grad_norm": 3.459865093231201, "learning_rate": 0.0036381482967963517, "loss": 7.8132, "step": 572900 }, { "epoch": 2.3342760739757753, "grad_norm": 3.5949485301971436, "learning_rate": 0.0036377062968497883, "loss": 7.8101, "step": 573000 }, { "epoch": 2.3342760739757753, "eval_MaskedAccuracy": 0.5011215973574518, "eval_loss": 1.6427503824234009, "eval_runtime": 152.7683, "eval_samples_per_second": 415.505, "eval_steps_per_second": 1.623, "step": 573000 }, { "epoch": 2.334683451999157, "grad_norm": 3.4251813888549805, "learning_rate": 0.003637264252121289, "loss": 7.7793, "step": 573100 }, { "epoch": 2.3350908300225384, "grad_norm": 3.6462955474853516, "learning_rate": 0.003636822162628328, "loss": 7.8069, "step": 573200 }, { "epoch": 2.3354982080459195, "grad_norm": 2.881012201309204, "learning_rate": 0.0036363800283883843, "loss": 7.7885, "step": 573300 }, { "epoch": 2.335905586069301, "grad_norm": 9.830028533935547, "learning_rate": 0.003635937849418939, "loss": 7.8005, "step": 573400 }, { "epoch": 2.3363129640926825, "grad_norm": 5.260698318481445, "learning_rate": 0.003635495625737469, "loss": 7.7855, "step": 573500 }, { "epoch": 2.336720342116064, "grad_norm": 3.468350410461426, "learning_rate": 0.0036350533573614617, "loss": 7.824, "step": 573600 }, { "epoch": 2.3371277201394456, "grad_norm": 4.699432849884033, "learning_rate": 0.0036346110443083992, "loss": 7.7941, "step": 573700 }, { "epoch": 2.337535098162827, "grad_norm": 2.475425958633423, "learning_rate": 0.0036341686865957756, "loss": 7.7852, "step": 573800 }, { "epoch": 2.3379424761862087, "grad_norm": 3.1460745334625244, "learning_rate": 0.003633726284241068, "loss": 7.7886, "step": 573900 }, { "epoch": 2.3383498542095897, "grad_norm": 2.798072338104248, "learning_rate": 0.0036332838372617774, "loss": 7.7765, "step": 574000 }, { "epoch": 2.3383498542095897, "eval_MaskedAccuracy": 0.5010669093212721, "eval_loss": 1.6389412879943848, "eval_runtime": 152.9499, "eval_samples_per_second": 415.012, "eval_steps_per_second": 1.621, "step": 574000 }, { "epoch": 2.3387572322329713, "grad_norm": 3.818546772003174, "learning_rate": 0.00363284134567539, "loss": 7.7845, "step": 574100 }, { "epoch": 2.339164610256353, "grad_norm": 3.3038523197174072, "learning_rate": 0.003632398809499405, "loss": 7.8184, "step": 574200 }, { "epoch": 2.3395719882797343, "grad_norm": 4.6410908699035645, "learning_rate": 0.0036319562287513156, "loss": 7.788, "step": 574300 }, { "epoch": 2.339979366303116, "grad_norm": 1.9949098825454712, "learning_rate": 0.003631513603448619, "loss": 7.7812, "step": 574400 }, { "epoch": 2.340386744326497, "grad_norm": 2.74983549118042, "learning_rate": 0.0036310709336088132, "loss": 7.787, "step": 574500 }, { "epoch": 2.3407941223498785, "grad_norm": 5.470245361328125, "learning_rate": 0.003630628219249402, "loss": 7.8137, "step": 574600 }, { "epoch": 2.34120150037326, "grad_norm": 2.465470314025879, "learning_rate": 0.0036301854603878864, "loss": 7.8113, "step": 574700 }, { "epoch": 2.3416088783966416, "grad_norm": 4.013788223266602, "learning_rate": 0.003629742657041771, "loss": 7.8333, "step": 574800 }, { "epoch": 2.342016256420023, "grad_norm": 3.817545175552368, "learning_rate": 0.0036292998092285643, "loss": 7.8007, "step": 574900 }, { "epoch": 2.3424236344434046, "grad_norm": 4.242929458618164, "learning_rate": 0.003628856916965773, "loss": 7.7877, "step": 575000 }, { "epoch": 2.3424236344434046, "eval_MaskedAccuracy": 0.5012850973635277, "eval_loss": 1.6421722173690796, "eval_runtime": 158.7684, "eval_samples_per_second": 399.802, "eval_steps_per_second": 1.562, "step": 575000 }, { "epoch": 2.342831012466786, "grad_norm": 3.80789852142334, "learning_rate": 0.0036284139802709053, "loss": 7.7879, "step": 575100 }, { "epoch": 2.3432383904901672, "grad_norm": 4.318171977996826, "learning_rate": 0.0036279709991614735, "loss": 7.8149, "step": 575200 }, { "epoch": 2.3436457685135488, "grad_norm": 2.998563766479492, "learning_rate": 0.003627527973654996, "loss": 7.7942, "step": 575300 }, { "epoch": 2.3440531465369303, "grad_norm": 7.166109561920166, "learning_rate": 0.00362708490376898, "loss": 7.7971, "step": 575400 }, { "epoch": 2.344460524560312, "grad_norm": 3.0089938640594482, "learning_rate": 0.0036266417895209463, "loss": 7.775, "step": 575500 }, { "epoch": 2.3448679025836934, "grad_norm": 4.467271327972412, "learning_rate": 0.0036261986309284186, "loss": 7.801, "step": 575600 }, { "epoch": 2.345275280607075, "grad_norm": 1.9695075750350952, "learning_rate": 0.003625755428008911, "loss": 7.8133, "step": 575700 }, { "epoch": 2.345682658630456, "grad_norm": 2.8095858097076416, "learning_rate": 0.0036253121807799456, "loss": 7.81, "step": 575800 }, { "epoch": 2.3460900366538375, "grad_norm": 7.263975620269775, "learning_rate": 0.0036248688892590464, "loss": 7.805, "step": 575900 }, { "epoch": 2.346497414677219, "grad_norm": 2.4742026329040527, "learning_rate": 0.003624425553463742, "loss": 7.7794, "step": 576000 }, { "epoch": 2.346497414677219, "eval_MaskedAccuracy": 0.5012928785636943, "eval_loss": 1.6390610933303833, "eval_runtime": 157.8489, "eval_samples_per_second": 402.132, "eval_steps_per_second": 1.571, "step": 576000 }, { "epoch": 2.3469047927006006, "grad_norm": 5.670180320739746, "learning_rate": 0.0036239821734115594, "loss": 7.8105, "step": 576100 }, { "epoch": 2.347312170723982, "grad_norm": 3.840780258178711, "learning_rate": 0.0036235387491200246, "loss": 7.8036, "step": 576200 }, { "epoch": 2.3477195487473637, "grad_norm": 1.773376703262329, "learning_rate": 0.0036230952806066757, "loss": 7.7732, "step": 576300 }, { "epoch": 2.348126926770745, "grad_norm": 5.712237358093262, "learning_rate": 0.0036226517678890346, "loss": 7.7694, "step": 576400 }, { "epoch": 2.3485343047941263, "grad_norm": 3.7197134494781494, "learning_rate": 0.003622208210984643, "loss": 7.8039, "step": 576500 }, { "epoch": 2.348941682817508, "grad_norm": 3.3471412658691406, "learning_rate": 0.0036217646099110387, "loss": 7.805, "step": 576600 }, { "epoch": 2.3493490608408893, "grad_norm": 4.894758701324463, "learning_rate": 0.0036213209646857513, "loss": 7.8104, "step": 576700 }, { "epoch": 2.349756438864271, "grad_norm": 3.1351892948150635, "learning_rate": 0.00362087727532633, "loss": 7.8164, "step": 576800 }, { "epoch": 2.3501638168876524, "grad_norm": 1.4317957162857056, "learning_rate": 0.0036204335418503135, "loss": 7.8159, "step": 576900 }, { "epoch": 2.3505711949110335, "grad_norm": 2.3741769790649414, "learning_rate": 0.0036199897642752413, "loss": 7.8004, "step": 577000 }, { "epoch": 2.3505711949110335, "eval_MaskedAccuracy": 0.501224630950296, "eval_loss": 1.6415728330612183, "eval_runtime": 171.3502, "eval_samples_per_second": 370.446, "eval_steps_per_second": 1.447, "step": 577000 }, { "epoch": 2.350978572934415, "grad_norm": 6.531877040863037, "learning_rate": 0.0036195459426186607, "loss": 7.7916, "step": 577100 }, { "epoch": 2.3513859509577966, "grad_norm": 6.963188648223877, "learning_rate": 0.0036191020768981176, "loss": 7.7925, "step": 577200 }, { "epoch": 2.351793328981178, "grad_norm": 3.9958503246307373, "learning_rate": 0.003618658167131158, "loss": 7.7877, "step": 577300 }, { "epoch": 2.3522007070045596, "grad_norm": 3.9696502685546875, "learning_rate": 0.0036182142133353353, "loss": 7.806, "step": 577400 }, { "epoch": 2.352608085027941, "grad_norm": 6.785215377807617, "learning_rate": 0.003617770215528201, "loss": 7.8257, "step": 577500 }, { "epoch": 2.3530154630513227, "grad_norm": 3.5732803344726562, "learning_rate": 0.0036173261737273087, "loss": 7.7939, "step": 577600 }, { "epoch": 2.353422841074704, "grad_norm": 4.2556962966918945, "learning_rate": 0.003616882087950216, "loss": 7.7948, "step": 577700 }, { "epoch": 2.3538302190980853, "grad_norm": 3.27388334274292, "learning_rate": 0.0036164379582144758, "loss": 7.8147, "step": 577800 }, { "epoch": 2.354237597121467, "grad_norm": 3.56278395652771, "learning_rate": 0.003615993784537652, "loss": 7.8115, "step": 577900 }, { "epoch": 2.3546449751448484, "grad_norm": 2.025696039199829, "learning_rate": 0.0036155495669372983, "loss": 7.7848, "step": 578000 }, { "epoch": 2.3546449751448484, "eval_MaskedAccuracy": 0.5011612738226733, "eval_loss": 1.6435589790344238, "eval_runtime": 164.2816, "eval_samples_per_second": 386.385, "eval_steps_per_second": 1.51, "step": 578000 }, { "epoch": 2.35505235316823, "grad_norm": 2.4971704483032227, "learning_rate": 0.003615105305430979, "loss": 7.7912, "step": 578100 }, { "epoch": 2.3554597311916114, "grad_norm": 2.6171789169311523, "learning_rate": 0.0036146610000362617, "loss": 7.7789, "step": 578200 }, { "epoch": 2.3558671092149925, "grad_norm": 2.426166534423828, "learning_rate": 0.003614216650770709, "loss": 7.7846, "step": 578300 }, { "epoch": 2.356274487238374, "grad_norm": 3.547307014465332, "learning_rate": 0.0036137722576518923, "loss": 7.8254, "step": 578400 }, { "epoch": 2.3566818652617556, "grad_norm": 2.7063729763031006, "learning_rate": 0.003613327820697375, "loss": 7.7681, "step": 578500 }, { "epoch": 2.357089243285137, "grad_norm": 3.314094066619873, "learning_rate": 0.0036128833399247324, "loss": 7.8256, "step": 578600 }, { "epoch": 2.3574966213085187, "grad_norm": 2.50994873046875, "learning_rate": 0.0036124388153515354, "loss": 7.8004, "step": 578700 }, { "epoch": 2.3579039993319, "grad_norm": 2.1616404056549072, "learning_rate": 0.003611994246995362, "loss": 7.7966, "step": 578800 }, { "epoch": 2.3583113773552817, "grad_norm": 5.168814659118652, "learning_rate": 0.0036115496348737815, "loss": 7.7673, "step": 578900 }, { "epoch": 2.358718755378663, "grad_norm": 4.807029724121094, "learning_rate": 0.0036111049790043797, "loss": 7.787, "step": 579000 }, { "epoch": 2.358718755378663, "eval_MaskedAccuracy": 0.5011903029437689, "eval_loss": 1.6426823139190674, "eval_runtime": 156.4449, "eval_samples_per_second": 405.74, "eval_steps_per_second": 1.585, "step": 579000 }, { "epoch": 2.3591261334020444, "grad_norm": 4.3301215171813965, "learning_rate": 0.0036106602794047284, "loss": 7.817, "step": 579100 }, { "epoch": 2.359533511425426, "grad_norm": 3.070080518722534, "learning_rate": 0.0036102155360924165, "loss": 7.7913, "step": 579200 }, { "epoch": 2.3599408894488074, "grad_norm": 3.654529094696045, "learning_rate": 0.003609770749085021, "loss": 7.7843, "step": 579300 }, { "epoch": 2.360348267472189, "grad_norm": 2.2287397384643555, "learning_rate": 0.0036093259184001272, "loss": 7.7742, "step": 579400 }, { "epoch": 2.36075564549557, "grad_norm": 7.408300876617432, "learning_rate": 0.003608881044055323, "loss": 7.7914, "step": 579500 }, { "epoch": 2.3611630235189516, "grad_norm": 3.6855971813201904, "learning_rate": 0.003608436126068201, "loss": 7.7836, "step": 579600 }, { "epoch": 2.361570401542333, "grad_norm": 2.677215337753296, "learning_rate": 0.0036079911644563457, "loss": 7.803, "step": 579700 }, { "epoch": 2.3619777795657146, "grad_norm": 2.59047794342041, "learning_rate": 0.003607546159237352, "loss": 7.7641, "step": 579800 }, { "epoch": 2.362385157589096, "grad_norm": 3.900541305541992, "learning_rate": 0.00360710111042881, "loss": 7.7961, "step": 579900 }, { "epoch": 2.3627925356124777, "grad_norm": 3.410102367401123, "learning_rate": 0.003606656018048321, "loss": 7.8124, "step": 580000 }, { "epoch": 2.3627925356124777, "eval_MaskedAccuracy": 0.5012227258391272, "eval_loss": 1.6263668537139893, "eval_runtime": 197.6188, "eval_samples_per_second": 321.204, "eval_steps_per_second": 1.255, "step": 580000 }, { "epoch": 2.3631999136358592, "grad_norm": 4.543544292449951, "learning_rate": 0.003606210882113474, "loss": 7.8132, "step": 580100 }, { "epoch": 2.3636072916592403, "grad_norm": 5.982573986053467, "learning_rate": 0.0036057657026418734, "loss": 7.7747, "step": 580200 }, { "epoch": 2.364014669682622, "grad_norm": 5.714713096618652, "learning_rate": 0.003605320479651115, "loss": 7.8041, "step": 580300 }, { "epoch": 2.3644220477060034, "grad_norm": 3.8331897258758545, "learning_rate": 0.0036048752131588022, "loss": 7.7848, "step": 580400 }, { "epoch": 2.364829425729385, "grad_norm": 2.8905255794525146, "learning_rate": 0.003604429903182543, "loss": 7.8034, "step": 580500 }, { "epoch": 2.3652368037527665, "grad_norm": 7.409701347351074, "learning_rate": 0.0036039845497399365, "loss": 7.8113, "step": 580600 }, { "epoch": 2.365644181776148, "grad_norm": 2.486720323562622, "learning_rate": 0.0036035391528485955, "loss": 7.7664, "step": 580700 }, { "epoch": 2.366051559799529, "grad_norm": 4.313427925109863, "learning_rate": 0.0036030937125261235, "loss": 7.8212, "step": 580800 }, { "epoch": 2.3664589378229106, "grad_norm": 3.203368902206421, "learning_rate": 0.003602648228790135, "loss": 7.7793, "step": 580900 }, { "epoch": 2.366866315846292, "grad_norm": 5.898493766784668, "learning_rate": 0.0036022027016582407, "loss": 7.7956, "step": 581000 }, { "epoch": 2.366866315846292, "eval_MaskedAccuracy": 0.5013717417484989, "eval_loss": 1.6449079513549805, "eval_runtime": 163.9042, "eval_samples_per_second": 387.275, "eval_steps_per_second": 1.513, "step": 581000 }, { "epoch": 2.3672736938696737, "grad_norm": 3.306659460067749, "learning_rate": 0.003601757131148055, "loss": 7.8113, "step": 581100 }, { "epoch": 2.367681071893055, "grad_norm": 7.1116814613342285, "learning_rate": 0.0036013115172771936, "loss": 7.7716, "step": 581200 }, { "epoch": 2.3680884499164367, "grad_norm": 7.003377437591553, "learning_rate": 0.0036008658600632784, "loss": 7.7785, "step": 581300 }, { "epoch": 2.3684958279398183, "grad_norm": 1.9321496486663818, "learning_rate": 0.0036004201595239227, "loss": 7.794, "step": 581400 }, { "epoch": 2.3689032059631994, "grad_norm": 2.224198818206787, "learning_rate": 0.003599974415676748, "loss": 7.8046, "step": 581500 }, { "epoch": 2.369310583986581, "grad_norm": 4.905538082122803, "learning_rate": 0.0035995286285393736, "loss": 7.7696, "step": 581600 }, { "epoch": 2.3697179620099624, "grad_norm": 2.958261013031006, "learning_rate": 0.003599082798129433, "loss": 7.7959, "step": 581700 }, { "epoch": 2.370125340033344, "grad_norm": 4.866700172424316, "learning_rate": 0.0035986369244645497, "loss": 7.8012, "step": 581800 }, { "epoch": 2.3705327180567255, "grad_norm": 6.702052593231201, "learning_rate": 0.003598191007562351, "loss": 7.7836, "step": 581900 }, { "epoch": 2.3709400960801066, "grad_norm": 3.02662992477417, "learning_rate": 0.003597745047440462, "loss": 7.8078, "step": 582000 }, { "epoch": 2.3709400960801066, "eval_MaskedAccuracy": 0.5012330720607717, "eval_loss": 1.6523900032043457, "eval_runtime": 156.2154, "eval_samples_per_second": 406.336, "eval_steps_per_second": 1.588, "step": 582000 }, { "epoch": 2.371347474103488, "grad_norm": 3.6374354362487793, "learning_rate": 0.003597299044116517, "loss": 7.7908, "step": 582100 }, { "epoch": 2.3717548521268697, "grad_norm": 4.827856540679932, "learning_rate": 0.0035968529976081517, "loss": 7.8104, "step": 582200 }, { "epoch": 2.372162230150251, "grad_norm": 6.088109493255615, "learning_rate": 0.0035964069079329943, "loss": 7.7937, "step": 582300 }, { "epoch": 2.3725696081736327, "grad_norm": 2.390738010406494, "learning_rate": 0.0035959607751086843, "loss": 7.7718, "step": 582400 }, { "epoch": 2.3729769861970142, "grad_norm": 1.5880234241485596, "learning_rate": 0.0035955145991528616, "loss": 7.8001, "step": 582500 }, { "epoch": 2.373384364220396, "grad_norm": 5.216249465942383, "learning_rate": 0.003595068380083166, "loss": 7.7796, "step": 582600 }, { "epoch": 2.373791742243777, "grad_norm": 3.8343546390533447, "learning_rate": 0.0035946221179172363, "loss": 7.8048, "step": 582700 }, { "epoch": 2.3741991202671584, "grad_norm": 2.743173360824585, "learning_rate": 0.0035941758126727167, "loss": 7.7865, "step": 582800 }, { "epoch": 2.37460649829054, "grad_norm": 5.641646862030029, "learning_rate": 0.003593729464367252, "loss": 7.7988, "step": 582900 }, { "epoch": 2.3750138763139215, "grad_norm": 4.868067741394043, "learning_rate": 0.0035932830730184867, "loss": 7.8011, "step": 583000 }, { "epoch": 2.3750138763139215, "eval_MaskedAccuracy": 0.5022030759828252, "eval_loss": 1.63442063331604, "eval_runtime": 156.3059, "eval_samples_per_second": 406.101, "eval_steps_per_second": 1.587, "step": 583000 }, { "epoch": 2.375421254337303, "grad_norm": 6.6119608879089355, "learning_rate": 0.003592836638644071, "loss": 7.8056, "step": 583100 }, { "epoch": 2.3758286323606845, "grad_norm": 2.2543911933898926, "learning_rate": 0.0035923901612616538, "loss": 7.7988, "step": 583200 }, { "epoch": 2.3762360103840656, "grad_norm": 2.987464427947998, "learning_rate": 0.0035919436408888886, "loss": 7.8328, "step": 583300 }, { "epoch": 2.376643388407447, "grad_norm": 6.303028106689453, "learning_rate": 0.0035914970775434317, "loss": 7.8182, "step": 583400 }, { "epoch": 2.3770507664308287, "grad_norm": 3.9799399375915527, "learning_rate": 0.0035910504712429324, "loss": 7.7881, "step": 583500 }, { "epoch": 2.37745814445421, "grad_norm": 2.1123669147491455, "learning_rate": 0.0035906038220050503, "loss": 7.7745, "step": 583600 }, { "epoch": 2.3778655224775918, "grad_norm": 4.1016459465026855, "learning_rate": 0.0035901571298474406, "loss": 7.796, "step": 583700 }, { "epoch": 2.3782729005009733, "grad_norm": 5.921614646911621, "learning_rate": 0.0035897103947877674, "loss": 7.8014, "step": 583800 }, { "epoch": 2.378680278524355, "grad_norm": 6.695907115936279, "learning_rate": 0.00358926361684369, "loss": 7.7801, "step": 583900 }, { "epoch": 2.379087656547736, "grad_norm": 4.198784351348877, "learning_rate": 0.003588816796032871, "loss": 7.778, "step": 584000 }, { "epoch": 2.379087656547736, "eval_MaskedAccuracy": 0.501532274833887, "eval_loss": 1.641954779624939, "eval_runtime": 156.1541, "eval_samples_per_second": 406.496, "eval_steps_per_second": 1.588, "step": 584000 }, { "epoch": 2.3794950345711174, "grad_norm": 6.060896396636963, "learning_rate": 0.0035883699323729775, "loss": 7.7869, "step": 584100 }, { "epoch": 2.379902412594499, "grad_norm": 5.873023986816406, "learning_rate": 0.003587923025881678, "loss": 7.8, "step": 584200 }, { "epoch": 2.3803097906178805, "grad_norm": 4.626306533813477, "learning_rate": 0.0035874760765766387, "loss": 7.7987, "step": 584300 }, { "epoch": 2.380717168641262, "grad_norm": 5.819188594818115, "learning_rate": 0.003587029084475532, "loss": 7.8126, "step": 584400 }, { "epoch": 2.381124546664643, "grad_norm": 6.103301525115967, "learning_rate": 0.0035865820495960287, "loss": 7.8009, "step": 584500 }, { "epoch": 2.3815319246880247, "grad_norm": 2.5972049236297607, "learning_rate": 0.003586134971955799, "loss": 7.7842, "step": 584600 }, { "epoch": 2.381939302711406, "grad_norm": 3.71113920211792, "learning_rate": 0.003585687851572524, "loss": 7.8145, "step": 584700 }, { "epoch": 2.3823466807347877, "grad_norm": 2.0780303478240967, "learning_rate": 0.0035852406884638774, "loss": 7.7889, "step": 584800 }, { "epoch": 2.3827540587581693, "grad_norm": 8.88117790222168, "learning_rate": 0.0035847934826475384, "loss": 7.7979, "step": 584900 }, { "epoch": 2.383161436781551, "grad_norm": 2.3754899501800537, "learning_rate": 0.003584346234141191, "loss": 7.7864, "step": 585000 }, { "epoch": 2.383161436781551, "eval_MaskedAccuracy": 0.5022698361572302, "eval_loss": 1.6313894987106323, "eval_runtime": 175.7067, "eval_samples_per_second": 361.261, "eval_steps_per_second": 1.411, "step": 585000 }, { "epoch": 2.3835688148049323, "grad_norm": 3.1766247749328613, "learning_rate": 0.003583898942962508, "loss": 7.829, "step": 585100 }, { "epoch": 2.3839761928283134, "grad_norm": 2.9906697273254395, "learning_rate": 0.0035834516091291855, "loss": 7.8102, "step": 585200 }, { "epoch": 2.384383570851695, "grad_norm": 4.052495956420898, "learning_rate": 0.003583004232658902, "loss": 7.7815, "step": 585300 }, { "epoch": 2.3847909488750765, "grad_norm": 5.7117509841918945, "learning_rate": 0.0035825568135693445, "loss": 7.7914, "step": 585400 }, { "epoch": 2.385198326898458, "grad_norm": 3.1434757709503174, "learning_rate": 0.003582109351878203, "loss": 7.7874, "step": 585500 }, { "epoch": 2.3856057049218395, "grad_norm": 4.065485000610352, "learning_rate": 0.003581661847603165, "loss": 7.7814, "step": 585600 }, { "epoch": 2.386013082945221, "grad_norm": 3.9136483669281006, "learning_rate": 0.0035812143007619285, "loss": 7.7996, "step": 585700 }, { "epoch": 2.386420460968602, "grad_norm": 3.7829060554504395, "learning_rate": 0.0035807667113721842, "loss": 7.7671, "step": 585800 }, { "epoch": 2.3868278389919837, "grad_norm": 8.514617919921875, "learning_rate": 0.00358031907945163, "loss": 7.779, "step": 585900 }, { "epoch": 2.3872352170153652, "grad_norm": 9.994239807128906, "learning_rate": 0.0035798714050179563, "loss": 7.8028, "step": 586000 }, { "epoch": 2.3872352170153652, "eval_MaskedAccuracy": 0.5006644734791491, "eval_loss": 1.6454873085021973, "eval_runtime": 157.0633, "eval_samples_per_second": 404.143, "eval_steps_per_second": 1.579, "step": 586000 }, { "epoch": 2.3876425950387468, "grad_norm": 3.2327418327331543, "learning_rate": 0.0035794236880888684, "loss": 7.7905, "step": 586100 }, { "epoch": 2.3880499730621283, "grad_norm": 4.871578693389893, "learning_rate": 0.0035789759286820708, "loss": 7.8118, "step": 586200 }, { "epoch": 2.38845735108551, "grad_norm": 9.962044715881348, "learning_rate": 0.0035785281268152593, "loss": 7.7867, "step": 586300 }, { "epoch": 2.3888647291088914, "grad_norm": 7.3953118324279785, "learning_rate": 0.0035780802825061396, "loss": 7.7939, "step": 586400 }, { "epoch": 2.3892721071322724, "grad_norm": 2.738851308822632, "learning_rate": 0.0035776323957724177, "loss": 7.7937, "step": 586500 }, { "epoch": 2.389679485155654, "grad_norm": 2.027208089828491, "learning_rate": 0.0035771844666317963, "loss": 7.815, "step": 586600 }, { "epoch": 2.3900868631790355, "grad_norm": 4.308239459991455, "learning_rate": 0.003576736495101991, "loss": 7.799, "step": 586700 }, { "epoch": 2.390494241202417, "grad_norm": 5.500651836395264, "learning_rate": 0.0035762884812007107, "loss": 7.792, "step": 586800 }, { "epoch": 2.3909016192257986, "grad_norm": 4.689356803894043, "learning_rate": 0.0035758404249456644, "loss": 7.7815, "step": 586900 }, { "epoch": 2.3913089972491797, "grad_norm": 9.67606258392334, "learning_rate": 0.0035753923263545694, "loss": 7.7685, "step": 587000 }, { "epoch": 2.3913089972491797, "eval_MaskedAccuracy": 0.5009284097804791, "eval_loss": 1.6510730981826782, "eval_runtime": 155.2204, "eval_samples_per_second": 408.941, "eval_steps_per_second": 1.598, "step": 587000 }, { "epoch": 2.391716375272561, "grad_norm": 6.0829243659973145, "learning_rate": 0.003574944185445143, "loss": 7.7713, "step": 587100 }, { "epoch": 2.3921237532959427, "grad_norm": 7.072274684906006, "learning_rate": 0.003574496002235102, "loss": 7.776, "step": 587200 }, { "epoch": 2.3925311313193243, "grad_norm": 2.3124594688415527, "learning_rate": 0.0035740477767421605, "loss": 7.7851, "step": 587300 }, { "epoch": 2.392938509342706, "grad_norm": 5.689125061035156, "learning_rate": 0.0035735995089840426, "loss": 7.8035, "step": 587400 }, { "epoch": 2.3933458873660873, "grad_norm": 3.5439541339874268, "learning_rate": 0.0035731511989784727, "loss": 7.7895, "step": 587500 }, { "epoch": 2.393753265389469, "grad_norm": 9.96220874786377, "learning_rate": 0.0035727028467431705, "loss": 7.7954, "step": 587600 }, { "epoch": 2.39416064341285, "grad_norm": 3.481572151184082, "learning_rate": 0.0035722544522958696, "loss": 7.731, "step": 587700 }, { "epoch": 2.3945680214362315, "grad_norm": 5.532615661621094, "learning_rate": 0.0035718060156542867, "loss": 7.7983, "step": 587800 }, { "epoch": 2.394975399459613, "grad_norm": 3.2027766704559326, "learning_rate": 0.003571357536836159, "loss": 7.7861, "step": 587900 }, { "epoch": 2.3953827774829946, "grad_norm": 8.927424430847168, "learning_rate": 0.0035709090158592156, "loss": 7.7831, "step": 588000 }, { "epoch": 2.3953827774829946, "eval_MaskedAccuracy": 0.5007482981894489, "eval_loss": 1.6479322910308838, "eval_runtime": 155.9414, "eval_samples_per_second": 407.05, "eval_steps_per_second": 1.59, "step": 588000 }, { "epoch": 2.395790155506376, "grad_norm": 3.5374948978424072, "learning_rate": 0.003570460452741187, "loss": 7.7869, "step": 588100 }, { "epoch": 2.3961975335297576, "grad_norm": 5.209863662719727, "learning_rate": 0.003570011847499804, "loss": 7.7911, "step": 588200 }, { "epoch": 2.3966049115531387, "grad_norm": 4.747452259063721, "learning_rate": 0.0035695632001528075, "loss": 7.7698, "step": 588300 }, { "epoch": 2.3970122895765202, "grad_norm": 6.2473464012146, "learning_rate": 0.003569114510717937, "loss": 7.7797, "step": 588400 }, { "epoch": 2.3974196675999018, "grad_norm": 5.453723907470703, "learning_rate": 0.003568665779212927, "loss": 7.7949, "step": 588500 }, { "epoch": 2.3978270456232833, "grad_norm": 2.2264554500579834, "learning_rate": 0.0035682170056555228, "loss": 7.785, "step": 588600 }, { "epoch": 2.398234423646665, "grad_norm": 2.996682643890381, "learning_rate": 0.0035677681900634613, "loss": 7.7861, "step": 588700 }, { "epoch": 2.3986418016700464, "grad_norm": 5.559460163116455, "learning_rate": 0.0035673193324544912, "loss": 7.7806, "step": 588800 }, { "epoch": 2.399049179693428, "grad_norm": 4.804593086242676, "learning_rate": 0.003566870432846356, "loss": 7.7879, "step": 588900 }, { "epoch": 2.399456557716809, "grad_norm": 4.812346935272217, "learning_rate": 0.0035664214912568015, "loss": 7.7717, "step": 589000 }, { "epoch": 2.399456557716809, "eval_MaskedAccuracy": 0.5014329700923861, "eval_loss": 1.6416527032852173, "eval_runtime": 245.9242, "eval_samples_per_second": 258.112, "eval_steps_per_second": 1.008, "step": 589000 }, { "epoch": 2.3998639357401905, "grad_norm": 10.421682357788086, "learning_rate": 0.0035659725077035794, "loss": 7.7638, "step": 589100 }, { "epoch": 2.400271313763572, "grad_norm": 5.037592887878418, "learning_rate": 0.003565523482204439, "loss": 7.8033, "step": 589200 }, { "epoch": 2.4006786917869536, "grad_norm": 4.288503170013428, "learning_rate": 0.0035650744147771346, "loss": 7.7643, "step": 589300 }, { "epoch": 2.401086069810335, "grad_norm": 4.179448127746582, "learning_rate": 0.00356462530543942, "loss": 7.7669, "step": 589400 }, { "epoch": 2.401493447833716, "grad_norm": 5.028454303741455, "learning_rate": 0.003564176154209048, "loss": 7.8345, "step": 589500 }, { "epoch": 2.4019008258570977, "grad_norm": 7.250356197357178, "learning_rate": 0.0035637269611037744, "loss": 7.7684, "step": 589600 }, { "epoch": 2.4023082038804793, "grad_norm": 6.912049770355225, "learning_rate": 0.0035632777261413633, "loss": 7.7781, "step": 589700 }, { "epoch": 2.402715581903861, "grad_norm": 4.628445148468018, "learning_rate": 0.003562828449339571, "loss": 7.7719, "step": 589800 }, { "epoch": 2.4031229599272423, "grad_norm": 5.262722015380859, "learning_rate": 0.0035623791307161654, "loss": 7.7818, "step": 589900 }, { "epoch": 2.403530337950624, "grad_norm": 2.250422239303589, "learning_rate": 0.0035619297702889083, "loss": 7.7787, "step": 590000 }, { "epoch": 2.403530337950624, "eval_MaskedAccuracy": 0.5023515761858598, "eval_loss": 1.637993335723877, "eval_runtime": 165.8963, "eval_samples_per_second": 382.624, "eval_steps_per_second": 1.495, "step": 590000 }, { "epoch": 2.4039377159740054, "grad_norm": 2.7703800201416016, "learning_rate": 0.003561480368075565, "loss": 7.7831, "step": 590100 }, { "epoch": 2.4043450939973865, "grad_norm": 3.4823758602142334, "learning_rate": 0.0035610309240939005, "loss": 7.79, "step": 590200 }, { "epoch": 2.404752472020768, "grad_norm": 2.857991933822632, "learning_rate": 0.0035605814383616844, "loss": 7.7733, "step": 590300 }, { "epoch": 2.4051598500441496, "grad_norm": 2.695787191390991, "learning_rate": 0.0035601319108966916, "loss": 7.8119, "step": 590400 }, { "epoch": 2.405567228067531, "grad_norm": 5.409249305725098, "learning_rate": 0.0035596823417166886, "loss": 7.8127, "step": 590500 }, { "epoch": 2.4059746060909126, "grad_norm": 9.495888710021973, "learning_rate": 0.0035592327308394548, "loss": 7.756, "step": 590600 }, { "epoch": 2.406381984114294, "grad_norm": 15.420244216918945, "learning_rate": 0.003558783078282761, "loss": 7.82, "step": 590700 }, { "epoch": 2.4067893621376752, "grad_norm": 4.739377498626709, "learning_rate": 0.0035583333840643883, "loss": 7.804, "step": 590800 }, { "epoch": 2.407196740161057, "grad_norm": 4.556971073150635, "learning_rate": 0.003557883648202112, "loss": 7.7701, "step": 590900 }, { "epoch": 2.4076041181844383, "grad_norm": 3.1656908988952637, "learning_rate": 0.003557433870713714, "loss": 7.775, "step": 591000 }, { "epoch": 2.4076041181844383, "eval_MaskedAccuracy": 0.502121768862807, "eval_loss": 1.6379399299621582, "eval_runtime": 160.9565, "eval_samples_per_second": 394.367, "eval_steps_per_second": 1.541, "step": 591000 }, { "epoch": 2.40801149620782, "grad_norm": 3.7673425674438477, "learning_rate": 0.003556984051616979, "loss": 7.7967, "step": 591100 }, { "epoch": 2.4084188742312014, "grad_norm": 3.8414700031280518, "learning_rate": 0.003556534190929685, "loss": 7.7876, "step": 591200 }, { "epoch": 2.408826252254583, "grad_norm": 3.038132429122925, "learning_rate": 0.0035560842886696174, "loss": 7.7915, "step": 591300 }, { "epoch": 2.4092336302779644, "grad_norm": 3.504563570022583, "learning_rate": 0.0035556343448545696, "loss": 7.7801, "step": 591400 }, { "epoch": 2.4096410083013455, "grad_norm": 3.4391419887542725, "learning_rate": 0.003555184359502326, "loss": 7.7885, "step": 591500 }, { "epoch": 2.410048386324727, "grad_norm": 5.2733306884765625, "learning_rate": 0.0035547343326306786, "loss": 7.7761, "step": 591600 }, { "epoch": 2.4104557643481086, "grad_norm": 4.036264419555664, "learning_rate": 0.0035542842642574198, "loss": 7.7725, "step": 591700 }, { "epoch": 2.41086314237149, "grad_norm": 2.197981357574463, "learning_rate": 0.0035538341544003437, "loss": 7.7995, "step": 591800 }, { "epoch": 2.4112705203948717, "grad_norm": 1.739732265472412, "learning_rate": 0.003553384003077243, "loss": 7.7964, "step": 591900 }, { "epoch": 2.4116778984182528, "grad_norm": 8.783902168273926, "learning_rate": 0.003552933810305913, "loss": 7.7825, "step": 592000 }, { "epoch": 2.4116778984182528, "eval_MaskedAccuracy": 0.5010911627700765, "eval_loss": 1.634800910949707, "eval_runtime": 156.512, "eval_samples_per_second": 405.566, "eval_steps_per_second": 1.585, "step": 592000 }, { "epoch": 2.4120852764416343, "grad_norm": 2.94250750541687, "learning_rate": 0.003552483576104157, "loss": 7.8049, "step": 592100 }, { "epoch": 2.412492654465016, "grad_norm": 8.064268112182617, "learning_rate": 0.0035520333004897733, "loss": 7.7787, "step": 592200 }, { "epoch": 2.4129000324883974, "grad_norm": 2.6121816635131836, "learning_rate": 0.003551582983480563, "loss": 7.794, "step": 592300 }, { "epoch": 2.413307410511779, "grad_norm": 3.7331557273864746, "learning_rate": 0.003551132625094326, "loss": 7.7651, "step": 592400 }, { "epoch": 2.4137147885351604, "grad_norm": 3.1750247478485107, "learning_rate": 0.0035506822253488727, "loss": 7.8189, "step": 592500 }, { "epoch": 2.414122166558542, "grad_norm": 3.3395748138427734, "learning_rate": 0.00355023178426201, "loss": 7.8012, "step": 592600 }, { "epoch": 2.414529544581923, "grad_norm": 2.3670284748077393, "learning_rate": 0.003549781301851543, "loss": 7.7763, "step": 592700 }, { "epoch": 2.4149369226053046, "grad_norm": 3.7103707790374756, "learning_rate": 0.003549330778135285, "loss": 7.7952, "step": 592800 }, { "epoch": 2.415344300628686, "grad_norm": 2.7309718132019043, "learning_rate": 0.0035488802131310444, "loss": 7.7881, "step": 592900 }, { "epoch": 2.4157516786520676, "grad_norm": 3.794459104537964, "learning_rate": 0.0035484296068566336, "loss": 7.7456, "step": 593000 }, { "epoch": 2.4157516786520676, "eval_MaskedAccuracy": 0.5026838307102172, "eval_loss": 1.6401876211166382, "eval_runtime": 214.8496, "eval_samples_per_second": 295.444, "eval_steps_per_second": 1.154, "step": 593000 }, { "epoch": 2.416159056675449, "grad_norm": 3.176884889602661, "learning_rate": 0.003547978959329874, "loss": 7.7424, "step": 593100 }, { "epoch": 2.4165664346988307, "grad_norm": 10.780375480651855, "learning_rate": 0.0035475282705685757, "loss": 7.7992, "step": 593200 }, { "epoch": 2.416973812722212, "grad_norm": 7.064286231994629, "learning_rate": 0.003547077540590559, "loss": 7.8173, "step": 593300 }, { "epoch": 2.4173811907455933, "grad_norm": 9.125280380249023, "learning_rate": 0.003546626769413643, "loss": 7.7634, "step": 593400 }, { "epoch": 2.417788568768975, "grad_norm": 2.350525379180908, "learning_rate": 0.0035461759570556504, "loss": 7.8059, "step": 593500 }, { "epoch": 2.4181959467923564, "grad_norm": 2.1609630584716797, "learning_rate": 0.0035457251035344037, "loss": 7.7845, "step": 593600 }, { "epoch": 2.418603324815738, "grad_norm": 4.145672798156738, "learning_rate": 0.003545274208867726, "loss": 7.813, "step": 593700 }, { "epoch": 2.4190107028391195, "grad_norm": 1.9367409944534302, "learning_rate": 0.0035448232730734457, "loss": 7.7629, "step": 593800 }, { "epoch": 2.419418080862501, "grad_norm": 4.147180080413818, "learning_rate": 0.003544372296169388, "loss": 7.7736, "step": 593900 }, { "epoch": 2.419825458885882, "grad_norm": 4.469532489776611, "learning_rate": 0.003543921278173388, "loss": 7.808, "step": 594000 }, { "epoch": 2.419825458885882, "eval_MaskedAccuracy": 0.5020240601705928, "eval_loss": 1.6414042711257935, "eval_runtime": 154.9602, "eval_samples_per_second": 409.628, "eval_steps_per_second": 1.6, "step": 594000 }, { "epoch": 2.4202328369092636, "grad_norm": 5.342765808105469, "learning_rate": 0.003543470219103273, "loss": 7.8297, "step": 594100 }, { "epoch": 2.420640214932645, "grad_norm": 3.777756929397583, "learning_rate": 0.0035430191189768753, "loss": 7.7988, "step": 594200 }, { "epoch": 2.4210475929560267, "grad_norm": 5.757164478302002, "learning_rate": 0.003542567977812031, "loss": 7.8026, "step": 594300 }, { "epoch": 2.421454970979408, "grad_norm": 2.53171706199646, "learning_rate": 0.003542116795626575, "loss": 7.7985, "step": 594400 }, { "epoch": 2.4218623490027893, "grad_norm": 3.1694672107696533, "learning_rate": 0.003541665572438345, "loss": 7.793, "step": 594500 }, { "epoch": 2.422269727026171, "grad_norm": 5.521478176116943, "learning_rate": 0.003541214308265178, "loss": 7.7971, "step": 594600 }, { "epoch": 2.4226771050495524, "grad_norm": 4.49680757522583, "learning_rate": 0.0035407630031249215, "loss": 7.8049, "step": 594700 }, { "epoch": 2.423084483072934, "grad_norm": 3.1610946655273438, "learning_rate": 0.0035403116570354135, "loss": 7.772, "step": 594800 }, { "epoch": 2.4234918610963154, "grad_norm": 2.836824893951416, "learning_rate": 0.003539860270014498, "loss": 7.8031, "step": 594900 }, { "epoch": 2.423899239119697, "grad_norm": 8.358927726745605, "learning_rate": 0.0035394088420800194, "loss": 7.7444, "step": 595000 }, { "epoch": 2.423899239119697, "eval_MaskedAccuracy": 0.501722756180028, "eval_loss": 1.6437461376190186, "eval_runtime": 161.375, "eval_samples_per_second": 393.345, "eval_steps_per_second": 1.537, "step": 595000 }, { "epoch": 2.4243066171430785, "grad_norm": 5.089084148406982, "learning_rate": 0.003538957373249825, "loss": 7.7766, "step": 595100 }, { "epoch": 2.4247139951664596, "grad_norm": 4.644046783447266, "learning_rate": 0.0035385058635417715, "loss": 7.7713, "step": 595200 }, { "epoch": 2.425121373189841, "grad_norm": 3.160179376602173, "learning_rate": 0.0035380543129736996, "loss": 7.8105, "step": 595300 }, { "epoch": 2.4255287512132226, "grad_norm": 4.421177387237549, "learning_rate": 0.003537602721563468, "loss": 7.7514, "step": 595400 }, { "epoch": 2.425936129236604, "grad_norm": 10.34675407409668, "learning_rate": 0.0035371510893289276, "loss": 7.7565, "step": 595500 }, { "epoch": 2.4263435072599857, "grad_norm": 4.81442403793335, "learning_rate": 0.0035366994162879355, "loss": 7.7428, "step": 595600 }, { "epoch": 2.4267508852833672, "grad_norm": 6.543236255645752, "learning_rate": 0.003536247702458345, "loss": 7.8133, "step": 595700 }, { "epoch": 2.4271582633067483, "grad_norm": 2.53407883644104, "learning_rate": 0.003535795947858019, "loss": 7.7965, "step": 595800 }, { "epoch": 2.42756564133013, "grad_norm": 6.635446548461914, "learning_rate": 0.003535344152504815, "loss": 7.7983, "step": 595900 }, { "epoch": 2.4279730193535114, "grad_norm": 4.083710670471191, "learning_rate": 0.0035348923164165956, "loss": 7.7524, "step": 596000 }, { "epoch": 2.4279730193535114, "eval_MaskedAccuracy": 0.5013060614278118, "eval_loss": 1.6439677476882935, "eval_runtime": 157.3899, "eval_samples_per_second": 403.304, "eval_steps_per_second": 1.576, "step": 596000 }, { "epoch": 2.428380397376893, "grad_norm": 6.001681327819824, "learning_rate": 0.003534440439611226, "loss": 7.7901, "step": 596100 }, { "epoch": 2.4287877754002745, "grad_norm": 3.3321585655212402, "learning_rate": 0.0035339885221065694, "loss": 7.7784, "step": 596200 }, { "epoch": 2.429195153423656, "grad_norm": 10.198421478271484, "learning_rate": 0.003533536563920495, "loss": 7.7841, "step": 596300 }, { "epoch": 2.4296025314470375, "grad_norm": 4.937011241912842, "learning_rate": 0.0035330845650708674, "loss": 7.8041, "step": 596400 }, { "epoch": 2.4300099094704186, "grad_norm": 3.839660406112671, "learning_rate": 0.003532632525575559, "loss": 7.7741, "step": 596500 }, { "epoch": 2.4304172874938, "grad_norm": 4.8967061042785645, "learning_rate": 0.003532180445452442, "loss": 7.7967, "step": 596600 }, { "epoch": 2.4308246655171817, "grad_norm": 3.739058256149292, "learning_rate": 0.003531728324719386, "loss": 7.7678, "step": 596700 }, { "epoch": 2.431232043540563, "grad_norm": 3.3789188861846924, "learning_rate": 0.0035312761633942667, "loss": 7.8127, "step": 596800 }, { "epoch": 2.4316394215639447, "grad_norm": 3.7414557933807373, "learning_rate": 0.0035308239614949646, "loss": 7.7633, "step": 596900 }, { "epoch": 2.432046799587326, "grad_norm": 2.8671152591705322, "learning_rate": 0.003530371719039353, "loss": 7.7577, "step": 597000 }, { "epoch": 2.432046799587326, "eval_MaskedAccuracy": 0.5013253448657601, "eval_loss": 1.6389906406402588, "eval_runtime": 157.3848, "eval_samples_per_second": 403.317, "eval_steps_per_second": 1.576, "step": 597000 }, { "epoch": 2.4324541776107074, "grad_norm": 4.212960243225098, "learning_rate": 0.003529919436045315, "loss": 7.8131, "step": 597100 }, { "epoch": 2.432861555634089, "grad_norm": 3.658583402633667, "learning_rate": 0.0035294671125307266, "loss": 7.7826, "step": 597200 }, { "epoch": 2.4332689336574704, "grad_norm": 3.706495761871338, "learning_rate": 0.003529014748513475, "loss": 7.782, "step": 597300 }, { "epoch": 2.433676311680852, "grad_norm": 2.5128815174102783, "learning_rate": 0.003528562344011446, "loss": 7.8014, "step": 597400 }, { "epoch": 2.4340836897042335, "grad_norm": 1.9968478679656982, "learning_rate": 0.0035281098990425186, "loss": 7.7877, "step": 597500 }, { "epoch": 2.434491067727615, "grad_norm": 7.746984958648682, "learning_rate": 0.0035276574136245873, "loss": 7.7932, "step": 597600 }, { "epoch": 2.434898445750996, "grad_norm": 2.634483814239502, "learning_rate": 0.003527204887775543, "loss": 7.7713, "step": 597700 }, { "epoch": 2.4353058237743777, "grad_norm": 9.012521743774414, "learning_rate": 0.00352675232151327, "loss": 7.7765, "step": 597800 }, { "epoch": 2.435713201797759, "grad_norm": 3.6214818954467773, "learning_rate": 0.003526299714855664, "loss": 7.7699, "step": 597900 }, { "epoch": 2.4361205798211407, "grad_norm": 7.51224946975708, "learning_rate": 0.0035258470678206154, "loss": 7.8023, "step": 598000 }, { "epoch": 2.4361205798211407, "eval_MaskedAccuracy": 0.5020065258373666, "eval_loss": 1.6350057125091553, "eval_runtime": 152.2062, "eval_samples_per_second": 417.039, "eval_steps_per_second": 1.629, "step": 598000 }, { "epoch": 2.4365279578445223, "grad_norm": 4.046254634857178, "learning_rate": 0.0035253943804260213, "loss": 7.7911, "step": 598100 }, { "epoch": 2.436935335867904, "grad_norm": 5.190164089202881, "learning_rate": 0.0035249416526897804, "loss": 7.7615, "step": 598200 }, { "epoch": 2.437342713891285, "grad_norm": 12.994916915893555, "learning_rate": 0.0035244888846297953, "loss": 7.7658, "step": 598300 }, { "epoch": 2.4377500919146664, "grad_norm": 4.974771499633789, "learning_rate": 0.003524036076263957, "loss": 7.7725, "step": 598400 }, { "epoch": 2.438157469938048, "grad_norm": 2.5542399883270264, "learning_rate": 0.0035235832276101766, "loss": 7.783, "step": 598500 }, { "epoch": 2.4385648479614295, "grad_norm": 4.832586765289307, "learning_rate": 0.0035231303386863526, "loss": 7.7698, "step": 598600 }, { "epoch": 2.438972225984811, "grad_norm": 5.31456995010376, "learning_rate": 0.003522677409510389, "loss": 7.7903, "step": 598700 }, { "epoch": 2.4393796040081925, "grad_norm": 3.226726770401001, "learning_rate": 0.0035222244401001963, "loss": 7.7941, "step": 598800 }, { "epoch": 2.439786982031574, "grad_norm": 6.057314872741699, "learning_rate": 0.0035217714304736775, "loss": 7.7717, "step": 598900 }, { "epoch": 2.440194360054955, "grad_norm": 5.601971626281738, "learning_rate": 0.0035213183806487463, "loss": 7.7683, "step": 599000 }, { "epoch": 2.440194360054955, "eval_MaskedAccuracy": 0.5013334530629393, "eval_loss": 1.6420518159866333, "eval_runtime": 160.0676, "eval_samples_per_second": 396.557, "eval_steps_per_second": 1.549, "step": 599000 }, { "epoch": 2.4406017380783367, "grad_norm": 11.546951293945312, "learning_rate": 0.0035208652906433186, "loss": 7.7542, "step": 599100 }, { "epoch": 2.4410091161017182, "grad_norm": 2.4373414516448975, "learning_rate": 0.0035204121604753003, "loss": 7.7695, "step": 599200 }, { "epoch": 2.4414164941250998, "grad_norm": 2.8253660202026367, "learning_rate": 0.00351995899016261, "loss": 7.7759, "step": 599300 }, { "epoch": 2.4418238721484813, "grad_norm": 3.2685346603393555, "learning_rate": 0.0035195057797231615, "loss": 7.7881, "step": 599400 }, { "epoch": 2.4422312501718624, "grad_norm": 3.1104533672332764, "learning_rate": 0.0035190525291748705, "loss": 7.7807, "step": 599500 }, { "epoch": 2.442638628195244, "grad_norm": 9.3849515914917, "learning_rate": 0.0035185992385356585, "loss": 7.7639, "step": 599600 }, { "epoch": 2.4430460062186254, "grad_norm": 3.7798829078674316, "learning_rate": 0.0035181459078234504, "loss": 7.785, "step": 599700 }, { "epoch": 2.443453384242007, "grad_norm": 2.5248496532440186, "learning_rate": 0.0035176925370561667, "loss": 7.7743, "step": 599800 }, { "epoch": 2.4438607622653885, "grad_norm": 2.904284954071045, "learning_rate": 0.00351723912625173, "loss": 7.7652, "step": 599900 }, { "epoch": 2.44426814028877, "grad_norm": 6.10533332824707, "learning_rate": 0.0035167856754280678, "loss": 7.7785, "step": 600000 }, { "epoch": 2.44426814028877, "eval_MaskedAccuracy": 0.5014401785964906, "eval_loss": 1.6428192853927612, "eval_runtime": 161.7854, "eval_samples_per_second": 392.347, "eval_steps_per_second": 1.533, "step": 600000 }, { "epoch": 2.4446755183121516, "grad_norm": 3.563767194747925, "learning_rate": 0.0035163321846031046, "loss": 7.7794, "step": 600100 }, { "epoch": 2.4450828963355327, "grad_norm": 3.370847225189209, "learning_rate": 0.0035158786537947723, "loss": 7.7974, "step": 600200 }, { "epoch": 2.445490274358914, "grad_norm": 4.457855224609375, "learning_rate": 0.003515425083021001, "loss": 7.7743, "step": 600300 }, { "epoch": 2.4458976523822957, "grad_norm": 4.694652557373047, "learning_rate": 0.003514971472299721, "loss": 7.7955, "step": 600400 }, { "epoch": 2.4463050304056773, "grad_norm": 3.1714799404144287, "learning_rate": 0.003514517821648867, "loss": 7.7949, "step": 600500 }, { "epoch": 2.446712408429059, "grad_norm": 3.201887607574463, "learning_rate": 0.0035140641310863756, "loss": 7.7856, "step": 600600 }, { "epoch": 2.4471197864524403, "grad_norm": 6.544091701507568, "learning_rate": 0.0035136104006301826, "loss": 7.799, "step": 600700 }, { "epoch": 2.4475271644758214, "grad_norm": 3.627216100692749, "learning_rate": 0.003513156630298223, "loss": 7.8096, "step": 600800 }, { "epoch": 2.447934542499203, "grad_norm": 3.186779499053955, "learning_rate": 0.0035127028201084394, "loss": 7.7801, "step": 600900 }, { "epoch": 2.4483419205225845, "grad_norm": 9.663773536682129, "learning_rate": 0.003512248970078778, "loss": 7.7877, "step": 601000 }, { "epoch": 2.4483419205225845, "eval_MaskedAccuracy": 0.5014117117366939, "eval_loss": 1.6410915851593018, "eval_runtime": 205.3947, "eval_samples_per_second": 309.044, "eval_steps_per_second": 1.207, "step": 601000 }, { "epoch": 2.448749298545966, "grad_norm": 6.464639663696289, "learning_rate": 0.0035117950802271733, "loss": 7.82, "step": 601100 }, { "epoch": 2.4491566765693475, "grad_norm": 3.021066904067993, "learning_rate": 0.0035113411505715746, "loss": 7.7605, "step": 601200 }, { "epoch": 2.449564054592729, "grad_norm": 1.7831275463104248, "learning_rate": 0.0035108871811299286, "loss": 7.7613, "step": 601300 }, { "epoch": 2.4499714326161106, "grad_norm": 1.8714038133621216, "learning_rate": 0.00351043317192018, "loss": 7.7519, "step": 601400 }, { "epoch": 2.4503788106394917, "grad_norm": 2.989990472793579, "learning_rate": 0.003509979122960283, "loss": 7.7641, "step": 601500 }, { "epoch": 2.4507861886628732, "grad_norm": 2.905515670776367, "learning_rate": 0.003509525034268185, "loss": 7.7724, "step": 601600 }, { "epoch": 2.4511935666862548, "grad_norm": 1.7551355361938477, "learning_rate": 0.003509070905861842, "loss": 7.7868, "step": 601700 }, { "epoch": 2.4516009447096363, "grad_norm": 2.576457977294922, "learning_rate": 0.003508616737759204, "loss": 7.7956, "step": 601800 }, { "epoch": 2.452008322733018, "grad_norm": 3.6266062259674072, "learning_rate": 0.0035081625299782284, "loss": 7.7604, "step": 601900 }, { "epoch": 2.452415700756399, "grad_norm": 3.1121485233306885, "learning_rate": 0.0035077082825368724, "loss": 7.7806, "step": 602000 }, { "epoch": 2.452415700756399, "eval_MaskedAccuracy": 0.502414683474927, "eval_loss": 1.6362465620040894, "eval_runtime": 162.692, "eval_samples_per_second": 390.16, "eval_steps_per_second": 1.524, "step": 602000 }, { "epoch": 2.4528230787797805, "grad_norm": 11.392309188842773, "learning_rate": 0.0035072539954530977, "loss": 7.7646, "step": 602100 }, { "epoch": 2.453230456803162, "grad_norm": 5.102611541748047, "learning_rate": 0.003506799668744863, "loss": 7.8046, "step": 602200 }, { "epoch": 2.4536378348265435, "grad_norm": 8.4649076461792, "learning_rate": 0.0035063453024301297, "loss": 7.763, "step": 602300 }, { "epoch": 2.454045212849925, "grad_norm": 7.183914661407471, "learning_rate": 0.0035058908965268625, "loss": 7.7853, "step": 602400 }, { "epoch": 2.4544525908733066, "grad_norm": 9.594482421875, "learning_rate": 0.0035054364510530207, "loss": 7.7438, "step": 602500 }, { "epoch": 2.454859968896688, "grad_norm": 3.214622974395752, "learning_rate": 0.0035049819660265768, "loss": 7.7968, "step": 602600 }, { "epoch": 2.455267346920069, "grad_norm": 3.7919723987579346, "learning_rate": 0.003504527441465497, "loss": 7.7411, "step": 602700 }, { "epoch": 2.4556747249434507, "grad_norm": 4.053675651550293, "learning_rate": 0.0035040728773877475, "loss": 7.7886, "step": 602800 }, { "epoch": 2.4560821029668323, "grad_norm": 3.9221904277801514, "learning_rate": 0.0035036182738113083, "loss": 7.7377, "step": 602900 }, { "epoch": 2.456489480990214, "grad_norm": 2.673964500427246, "learning_rate": 0.0035031636307541487, "loss": 7.7565, "step": 603000 }, { "epoch": 2.456489480990214, "eval_MaskedAccuracy": 0.5020966775888541, "eval_loss": 1.6324012279510498, "eval_runtime": 234.3925, "eval_samples_per_second": 270.811, "eval_steps_per_second": 1.058, "step": 603000 }, { "epoch": 2.4568968590135953, "grad_norm": 2.3325867652893066, "learning_rate": 0.0035027089482342413, "loss": 7.7874, "step": 603100 }, { "epoch": 2.457304237036977, "grad_norm": 4.30885648727417, "learning_rate": 0.003502254226269564, "loss": 7.7754, "step": 603200 }, { "epoch": 2.457711615060358, "grad_norm": 4.349258899688721, "learning_rate": 0.003501799464878091, "loss": 7.7988, "step": 603300 }, { "epoch": 2.4581189930837395, "grad_norm": 11.04083251953125, "learning_rate": 0.0035013446640778027, "loss": 7.7695, "step": 603400 }, { "epoch": 2.458526371107121, "grad_norm": 4.791848659515381, "learning_rate": 0.0035008898238866864, "loss": 7.7853, "step": 603500 }, { "epoch": 2.4589337491305026, "grad_norm": 5.2664408683776855, "learning_rate": 0.003500434944322717, "loss": 7.7757, "step": 603600 }, { "epoch": 2.459341127153884, "grad_norm": 6.127841472625732, "learning_rate": 0.0034999800254038814, "loss": 7.7905, "step": 603700 }, { "epoch": 2.4597485051772656, "grad_norm": 4.613772869110107, "learning_rate": 0.0034995250671481623, "loss": 7.7382, "step": 603800 }, { "epoch": 2.460155883200647, "grad_norm": 3.5152437686920166, "learning_rate": 0.003499070069573547, "loss": 7.7811, "step": 603900 }, { "epoch": 2.4605632612240282, "grad_norm": 8.655340194702148, "learning_rate": 0.0034986150326980275, "loss": 7.7644, "step": 604000 }, { "epoch": 2.4605632612240282, "eval_MaskedAccuracy": 0.5018208698885417, "eval_loss": 1.6410335302352905, "eval_runtime": 167.0622, "eval_samples_per_second": 379.954, "eval_steps_per_second": 1.484, "step": 604000 }, { "epoch": 2.4609706392474098, "grad_norm": 2.4539482593536377, "learning_rate": 0.0034981599565395904, "loss": 7.7642, "step": 604100 }, { "epoch": 2.4613780172707913, "grad_norm": 2.8700265884399414, "learning_rate": 0.0034977048411162274, "loss": 7.799, "step": 604200 }, { "epoch": 2.461785395294173, "grad_norm": 7.467411994934082, "learning_rate": 0.0034972496864459357, "loss": 7.8224, "step": 604300 }, { "epoch": 2.4621927733175544, "grad_norm": 2.57955002784729, "learning_rate": 0.0034967944925467045, "loss": 7.7487, "step": 604400 }, { "epoch": 2.4626001513409355, "grad_norm": 5.648074626922607, "learning_rate": 0.0034963392594365354, "loss": 7.7588, "step": 604500 }, { "epoch": 2.463007529364317, "grad_norm": 3.960665225982666, "learning_rate": 0.0034958839871334236, "loss": 7.7721, "step": 604600 }, { "epoch": 2.4634149073876985, "grad_norm": 7.0501275062561035, "learning_rate": 0.003495428675655364, "loss": 7.7589, "step": 604700 }, { "epoch": 2.46382228541108, "grad_norm": 6.553703784942627, "learning_rate": 0.0034949733250203667, "loss": 7.7747, "step": 604800 }, { "epoch": 2.4642296634344616, "grad_norm": 5.630553722381592, "learning_rate": 0.003494517935246427, "loss": 7.77, "step": 604900 }, { "epoch": 2.464637041457843, "grad_norm": 2.755204439163208, "learning_rate": 0.0034940625063515523, "loss": 7.7734, "step": 605000 }, { "epoch": 2.464637041457843, "eval_MaskedAccuracy": 0.5022833296206384, "eval_loss": 1.6339539289474487, "eval_runtime": 194.94, "eval_samples_per_second": 325.618, "eval_steps_per_second": 1.272, "step": 605000 }, { "epoch": 2.4650444194812247, "grad_norm": 2.969895601272583, "learning_rate": 0.003493607038353743, "loss": 7.783, "step": 605100 }, { "epoch": 2.4654517975046057, "grad_norm": 2.7398605346679688, "learning_rate": 0.0034931515312710123, "loss": 7.7843, "step": 605200 }, { "epoch": 2.4658591755279873, "grad_norm": 3.5312838554382324, "learning_rate": 0.003492695985121364, "loss": 7.7612, "step": 605300 }, { "epoch": 2.466266553551369, "grad_norm": 8.26854133605957, "learning_rate": 0.003492240399922813, "loss": 7.7444, "step": 605400 }, { "epoch": 2.4666739315747503, "grad_norm": 4.664064884185791, "learning_rate": 0.0034917847756933697, "loss": 7.7587, "step": 605500 }, { "epoch": 2.467081309598132, "grad_norm": 3.2515950202941895, "learning_rate": 0.003491329112451045, "loss": 7.7686, "step": 605600 }, { "epoch": 2.4674886876215134, "grad_norm": 4.394134998321533, "learning_rate": 0.003490873410213856, "loss": 7.7707, "step": 605700 }, { "epoch": 2.4678960656448945, "grad_norm": 5.289269924163818, "learning_rate": 0.003490417668999815, "loss": 7.7599, "step": 605800 }, { "epoch": 2.468303443668276, "grad_norm": 3.2438066005706787, "learning_rate": 0.003489961888826945, "loss": 7.756, "step": 605900 }, { "epoch": 2.4687108216916576, "grad_norm": 3.645613193511963, "learning_rate": 0.00348950606971326, "loss": 7.7876, "step": 606000 }, { "epoch": 2.4687108216916576, "eval_MaskedAccuracy": 0.5029190005682133, "eval_loss": 1.6357187032699585, "eval_runtime": 165.3427, "eval_samples_per_second": 383.906, "eval_steps_per_second": 1.5, "step": 606000 }, { "epoch": 2.469118199715039, "grad_norm": 10.071039199829102, "learning_rate": 0.003489050211676785, "loss": 7.7758, "step": 606100 }, { "epoch": 2.4695255777384206, "grad_norm": 3.530179738998413, "learning_rate": 0.003488594314735545, "loss": 7.802, "step": 606200 }, { "epoch": 2.469932955761802, "grad_norm": 2.915921211242676, "learning_rate": 0.003488138378907564, "loss": 7.7942, "step": 606300 }, { "epoch": 2.4703403337851837, "grad_norm": 5.038060188293457, "learning_rate": 0.003487682404210863, "loss": 7.7954, "step": 606400 }, { "epoch": 2.470747711808565, "grad_norm": 7.178977012634277, "learning_rate": 0.0034872263906634672, "loss": 7.7791, "step": 606500 }, { "epoch": 2.4711550898319463, "grad_norm": 3.2906036376953125, "learning_rate": 0.0034867703382834157, "loss": 7.7613, "step": 606600 }, { "epoch": 2.471562467855328, "grad_norm": 4.418756008148193, "learning_rate": 0.0034863142470887274, "loss": 7.8086, "step": 606700 }, { "epoch": 2.4719698458787094, "grad_norm": 2.7068023681640625, "learning_rate": 0.00348585811709744, "loss": 7.7814, "step": 606800 }, { "epoch": 2.472377223902091, "grad_norm": 3.6483817100524902, "learning_rate": 0.003485401948327583, "loss": 7.7869, "step": 606900 }, { "epoch": 2.472784601925472, "grad_norm": 2.069129705429077, "learning_rate": 0.0034849457407971933, "loss": 7.7796, "step": 607000 }, { "epoch": 2.472784601925472, "eval_MaskedAccuracy": 0.5023630049503295, "eval_loss": 1.6310961246490479, "eval_runtime": 164.6139, "eval_samples_per_second": 385.605, "eval_steps_per_second": 1.507, "step": 607000 }, { "epoch": 2.4731919799488535, "grad_norm": 2.366436004638672, "learning_rate": 0.003484489494524309, "loss": 7.7938, "step": 607100 }, { "epoch": 2.473599357972235, "grad_norm": 5.079567909240723, "learning_rate": 0.003484033209526966, "loss": 7.7741, "step": 607200 }, { "epoch": 2.4740067359956166, "grad_norm": 7.488637924194336, "learning_rate": 0.003483576885823208, "loss": 7.7635, "step": 607300 }, { "epoch": 2.474414114018998, "grad_norm": 2.4173405170440674, "learning_rate": 0.003483120523431071, "loss": 7.8059, "step": 607400 }, { "epoch": 2.4748214920423797, "grad_norm": 4.786422252655029, "learning_rate": 0.0034826641223685982, "loss": 7.762, "step": 607500 }, { "epoch": 2.475228870065761, "grad_norm": 4.182760238647461, "learning_rate": 0.003482207682653829, "loss": 7.7919, "step": 607600 }, { "epoch": 2.4756362480891423, "grad_norm": 4.433559417724609, "learning_rate": 0.003481751204304817, "loss": 7.8114, "step": 607700 }, { "epoch": 2.476043626112524, "grad_norm": 2.670544147491455, "learning_rate": 0.003481294687339606, "loss": 7.8075, "step": 607800 }, { "epoch": 2.4764510041359054, "grad_norm": 2.4868991374969482, "learning_rate": 0.0034808381317762388, "loss": 7.7655, "step": 607900 }, { "epoch": 2.476858382159287, "grad_norm": 2.5362184047698975, "learning_rate": 0.0034803815376327736, "loss": 7.7779, "step": 608000 }, { "epoch": 2.476858382159287, "eval_MaskedAccuracy": 0.502517082019846, "eval_loss": 1.635817050933838, "eval_runtime": 167.187, "eval_samples_per_second": 379.671, "eval_steps_per_second": 1.483, "step": 608000 }, { "epoch": 2.4772657601826684, "grad_norm": 8.652792930603027, "learning_rate": 0.003479924904927261, "loss": 7.7412, "step": 608100 }, { "epoch": 2.47767313820605, "grad_norm": 3.2159080505371094, "learning_rate": 0.0034794682336777532, "loss": 7.8121, "step": 608200 }, { "epoch": 2.478080516229431, "grad_norm": 2.7276620864868164, "learning_rate": 0.0034790115239023045, "loss": 7.798, "step": 608300 }, { "epoch": 2.4784878942528126, "grad_norm": 2.7427170276641846, "learning_rate": 0.003478554775618969, "loss": 7.759, "step": 608400 }, { "epoch": 2.478895272276194, "grad_norm": 3.3964476585388184, "learning_rate": 0.003478097988845807, "loss": 7.7567, "step": 608500 }, { "epoch": 2.4793026502995756, "grad_norm": 2.7253124713897705, "learning_rate": 0.0034776411636008803, "loss": 7.8012, "step": 608600 }, { "epoch": 2.479710028322957, "grad_norm": 6.021406650543213, "learning_rate": 0.0034771842999022444, "loss": 7.7693, "step": 608700 }, { "epoch": 2.4801174063463387, "grad_norm": 5.724459648132324, "learning_rate": 0.0034767273977679616, "loss": 7.7649, "step": 608800 }, { "epoch": 2.4805247843697202, "grad_norm": 3.1598286628723145, "learning_rate": 0.003476270457216099, "loss": 7.7503, "step": 608900 }, { "epoch": 2.4809321623931013, "grad_norm": 6.052318572998047, "learning_rate": 0.003475813478264721, "loss": 7.7651, "step": 609000 }, { "epoch": 2.4809321623931013, "eval_MaskedAccuracy": 0.5026950562611281, "eval_loss": 1.6452536582946777, "eval_runtime": 157.0128, "eval_samples_per_second": 404.273, "eval_steps_per_second": 1.579, "step": 609000 }, { "epoch": 2.481339540416483, "grad_norm": 2.323472499847412, "learning_rate": 0.0034753564609318953, "loss": 7.7643, "step": 609100 }, { "epoch": 2.4817469184398644, "grad_norm": 3.980980157852173, "learning_rate": 0.003474899405235685, "loss": 7.766, "step": 609200 }, { "epoch": 2.482154296463246, "grad_norm": 5.836856365203857, "learning_rate": 0.0034744423111941615, "loss": 7.7902, "step": 609300 }, { "epoch": 2.4825616744866275, "grad_norm": 3.978548526763916, "learning_rate": 0.0034739851788254056, "loss": 7.7556, "step": 609400 }, { "epoch": 2.4829690525100085, "grad_norm": 2.193514585494995, "learning_rate": 0.003473528008147481, "loss": 7.7771, "step": 609500 }, { "epoch": 2.48337643053339, "grad_norm": 4.373281478881836, "learning_rate": 0.0034730707991784604, "loss": 7.7665, "step": 609600 }, { "epoch": 2.4837838085567716, "grad_norm": 3.0255274772644043, "learning_rate": 0.0034726135519364274, "loss": 7.7561, "step": 609700 }, { "epoch": 2.484191186580153, "grad_norm": 3.5589005947113037, "learning_rate": 0.0034721562664394565, "loss": 7.7837, "step": 609800 }, { "epoch": 2.4845985646035347, "grad_norm": 3.6775569915771484, "learning_rate": 0.0034716989427056237, "loss": 7.7502, "step": 609900 }, { "epoch": 2.485005942626916, "grad_norm": 10.020858764648438, "learning_rate": 0.003471241580753008, "loss": 7.805, "step": 610000 }, { "epoch": 2.485005942626916, "eval_MaskedAccuracy": 0.5018309697365412, "eval_loss": 1.6496104001998901, "eval_runtime": 159.4095, "eval_samples_per_second": 398.195, "eval_steps_per_second": 1.556, "step": 610000 }, { "epoch": 2.4854133206502977, "grad_norm": 6.619731426239014, "learning_rate": 0.0034707841805996986, "loss": 7.7667, "step": 610100 }, { "epoch": 2.485820698673679, "grad_norm": 5.1112542152404785, "learning_rate": 0.003470326742263774, "loss": 7.7172, "step": 610200 }, { "epoch": 2.4862280766970604, "grad_norm": 6.065150737762451, "learning_rate": 0.0034698692657633245, "loss": 7.7433, "step": 610300 }, { "epoch": 2.486635454720442, "grad_norm": 4.601003646850586, "learning_rate": 0.003469411751116425, "loss": 7.7615, "step": 610400 }, { "epoch": 2.4870428327438234, "grad_norm": 5.6820244789123535, "learning_rate": 0.0034689541983411744, "loss": 7.7956, "step": 610500 }, { "epoch": 2.487450210767205, "grad_norm": 2.847165107727051, "learning_rate": 0.003468496607455661, "loss": 7.7725, "step": 610600 }, { "epoch": 2.4878575887905865, "grad_norm": 4.603565216064453, "learning_rate": 0.003468038978477969, "loss": 7.788, "step": 610700 }, { "epoch": 2.4882649668139676, "grad_norm": 4.942103862762451, "learning_rate": 0.0034675813114261956, "loss": 7.7807, "step": 610800 }, { "epoch": 2.488672344837349, "grad_norm": 4.010811805725098, "learning_rate": 0.0034671236063184355, "loss": 7.7754, "step": 610900 }, { "epoch": 2.4890797228607306, "grad_norm": 4.431800842285156, "learning_rate": 0.00346666586317279, "loss": 7.7784, "step": 611000 }, { "epoch": 2.4890797228607306, "eval_MaskedAccuracy": 0.5031158369280649, "eval_loss": 1.6375545263290405, "eval_runtime": 154.7681, "eval_samples_per_second": 410.136, "eval_steps_per_second": 1.602, "step": 611000 }, { "epoch": 2.489487100884112, "grad_norm": 3.4358301162719727, "learning_rate": 0.0034662080820073472, "loss": 7.7893, "step": 611100 }, { "epoch": 2.4898944789074937, "grad_norm": 5.371962547302246, "learning_rate": 0.0034657502628402067, "loss": 7.7254, "step": 611200 }, { "epoch": 2.4903018569308752, "grad_norm": 5.027492523193359, "learning_rate": 0.003465292405689468, "loss": 7.7705, "step": 611300 }, { "epoch": 2.490709234954257, "grad_norm": 5.015530109405518, "learning_rate": 0.0034648345105732373, "loss": 7.7824, "step": 611400 }, { "epoch": 2.491116612977638, "grad_norm": 4.747501850128174, "learning_rate": 0.003464376577509613, "loss": 7.7949, "step": 611500 }, { "epoch": 2.4915239910010194, "grad_norm": 3.3777360916137695, "learning_rate": 0.0034639186065166996, "loss": 7.8029, "step": 611600 }, { "epoch": 2.491931369024401, "grad_norm": 2.6882739067077637, "learning_rate": 0.003463460597612606, "loss": 7.7576, "step": 611700 }, { "epoch": 2.4923387470477825, "grad_norm": 6.328719139099121, "learning_rate": 0.003463002550815439, "loss": 7.7791, "step": 611800 }, { "epoch": 2.492746125071164, "grad_norm": 1.7057703733444214, "learning_rate": 0.0034625444661433064, "loss": 7.7636, "step": 611900 }, { "epoch": 2.493153503094545, "grad_norm": 2.849590301513672, "learning_rate": 0.003462086343614317, "loss": 7.7619, "step": 612000 }, { "epoch": 2.493153503094545, "eval_MaskedAccuracy": 0.5034365202158351, "eval_loss": 1.6304341554641724, "eval_runtime": 218.3404, "eval_samples_per_second": 290.72, "eval_steps_per_second": 1.136, "step": 612000 }, { "epoch": 2.4935608811179266, "grad_norm": 3.324779748916626, "learning_rate": 0.0034616281832465937, "loss": 7.7423, "step": 612100 }, { "epoch": 2.493968259141308, "grad_norm": 8.73055362701416, "learning_rate": 0.0034611699850582373, "loss": 7.7453, "step": 612200 }, { "epoch": 2.4943756371646897, "grad_norm": 8.771262168884277, "learning_rate": 0.0034607117490673663, "loss": 7.7936, "step": 612300 }, { "epoch": 2.494783015188071, "grad_norm": 4.403766632080078, "learning_rate": 0.003460253475292096, "loss": 7.8016, "step": 612400 }, { "epoch": 2.4951903932114528, "grad_norm": 8.34892463684082, "learning_rate": 0.0034597951637505508, "loss": 7.7638, "step": 612500 }, { "epoch": 2.4955977712348343, "grad_norm": 5.161684036254883, "learning_rate": 0.0034593368144608485, "loss": 7.7557, "step": 612600 }, { "epoch": 2.4960051492582154, "grad_norm": 5.6892876625061035, "learning_rate": 0.0034588784274411062, "loss": 7.8037, "step": 612700 }, { "epoch": 2.496412527281597, "grad_norm": 3.6872031688690186, "learning_rate": 0.003458420002709448, "loss": 7.7857, "step": 612800 }, { "epoch": 2.4968199053049784, "grad_norm": 3.777289628982544, "learning_rate": 0.003457961540283996, "loss": 7.7583, "step": 612900 }, { "epoch": 2.49722728332836, "grad_norm": 2.7119300365448, "learning_rate": 0.00345750304018288, "loss": 7.7738, "step": 613000 }, { "epoch": 2.49722728332836, "eval_MaskedAccuracy": 0.5022991251156033, "eval_loss": 1.636875867843628, "eval_runtime": 162.3094, "eval_samples_per_second": 391.08, "eval_steps_per_second": 1.528, "step": 613000 }, { "epoch": 2.4976346613517415, "grad_norm": 3.9631919860839844, "learning_rate": 0.0034570445024242214, "loss": 7.7678, "step": 613100 }, { "epoch": 2.498042039375123, "grad_norm": 3.6911187171936035, "learning_rate": 0.003456585927026153, "loss": 7.7789, "step": 613200 }, { "epoch": 2.498449417398504, "grad_norm": 3.555579900741577, "learning_rate": 0.0034561273140068013, "loss": 7.795, "step": 613300 }, { "epoch": 2.4988567954218857, "grad_norm": 5.924427509307861, "learning_rate": 0.0034556686633843026, "loss": 7.7343, "step": 613400 }, { "epoch": 2.499264173445267, "grad_norm": 2.3663249015808105, "learning_rate": 0.003455209975176787, "loss": 7.7641, "step": 613500 }, { "epoch": 2.4996715514686487, "grad_norm": 2.5663599967956543, "learning_rate": 0.00345475124940239, "loss": 7.775, "step": 613600 }, { "epoch": 2.5000789294920303, "grad_norm": 4.334702014923096, "learning_rate": 0.003454292486079246, "loss": 7.7715, "step": 613700 }, { "epoch": 2.500486307515412, "grad_norm": 3.6804959774017334, "learning_rate": 0.0034538336852254913, "loss": 7.8049, "step": 613800 }, { "epoch": 2.5008936855387933, "grad_norm": 14.971857070922852, "learning_rate": 0.003453374846859268, "loss": 7.79, "step": 613900 }, { "epoch": 2.5013010635621744, "grad_norm": 5.183899402618408, "learning_rate": 0.0034529159709987154, "loss": 7.7578, "step": 614000 }, { "epoch": 2.5013010635621744, "eval_MaskedAccuracy": 0.5023197603573429, "eval_loss": 1.6355232000350952, "eval_runtime": 187.9687, "eval_samples_per_second": 337.694, "eval_steps_per_second": 1.319, "step": 614000 }, { "epoch": 2.501708441585556, "grad_norm": 2.179237127304077, "learning_rate": 0.003452457057661972, "loss": 7.7526, "step": 614100 }, { "epoch": 2.5021158196089375, "grad_norm": 5.896249294281006, "learning_rate": 0.0034519981068671848, "loss": 7.781, "step": 614200 }, { "epoch": 2.502523197632319, "grad_norm": 3.0926969051361084, "learning_rate": 0.0034515391186324975, "loss": 7.7922, "step": 614300 }, { "epoch": 2.5029305756557005, "grad_norm": 5.69252872467041, "learning_rate": 0.003451080092976058, "loss": 7.7848, "step": 614400 }, { "epoch": 2.5033379536790816, "grad_norm": 5.180838584899902, "learning_rate": 0.0034506210299160116, "loss": 7.7828, "step": 614500 }, { "epoch": 2.503745331702463, "grad_norm": 4.583305835723877, "learning_rate": 0.0034501619294705094, "loss": 7.7746, "step": 614600 }, { "epoch": 2.5041527097258447, "grad_norm": 5.0666279792785645, "learning_rate": 0.003449702791657699, "loss": 7.777, "step": 614700 }, { "epoch": 2.5045600877492262, "grad_norm": 5.836339950561523, "learning_rate": 0.0034492436164957367, "loss": 7.7332, "step": 614800 }, { "epoch": 2.5049674657726078, "grad_norm": 2.6494479179382324, "learning_rate": 0.0034487844040027695, "loss": 7.7607, "step": 614900 }, { "epoch": 2.5053748437959893, "grad_norm": 2.540177583694458, "learning_rate": 0.003448325154196961, "loss": 7.7808, "step": 615000 }, { "epoch": 2.5053748437959893, "eval_MaskedAccuracy": 0.5018629976135593, "eval_loss": 1.6425836086273193, "eval_runtime": 162.095, "eval_samples_per_second": 391.597, "eval_steps_per_second": 1.53, "step": 615000 }, { "epoch": 2.505782221819371, "grad_norm": 8.705368041992188, "learning_rate": 0.0034478658670964676, "loss": 7.7685, "step": 615100 }, { "epoch": 2.5061895998427524, "grad_norm": 7.735439300537109, "learning_rate": 0.0034474065427194424, "loss": 7.7816, "step": 615200 }, { "epoch": 2.5065969778661334, "grad_norm": 3.698718547821045, "learning_rate": 0.0034469471810840446, "loss": 7.7528, "step": 615300 }, { "epoch": 2.507004355889515, "grad_norm": 2.3210785388946533, "learning_rate": 0.0034464877822084354, "loss": 7.7778, "step": 615400 }, { "epoch": 2.5074117339128965, "grad_norm": 4.6606125831604, "learning_rate": 0.0034460283461107796, "loss": 7.7883, "step": 615500 }, { "epoch": 2.507819111936278, "grad_norm": 1.963275671005249, "learning_rate": 0.0034455688728092402, "loss": 7.7587, "step": 615600 }, { "epoch": 2.508226489959659, "grad_norm": 5.2963361740112305, "learning_rate": 0.003445109362321983, "loss": 7.7516, "step": 615700 }, { "epoch": 2.5086338679830407, "grad_norm": 4.963475704193115, "learning_rate": 0.0034446498146671754, "loss": 7.7896, "step": 615800 }, { "epoch": 2.509041246006422, "grad_norm": 3.43817400932312, "learning_rate": 0.0034441902298629826, "loss": 7.7537, "step": 615900 }, { "epoch": 2.5094486240298037, "grad_norm": 5.425559997558594, "learning_rate": 0.0034437306079275753, "loss": 7.7537, "step": 616000 }, { "epoch": 2.5094486240298037, "eval_MaskedAccuracy": 0.5025626337645198, "eval_loss": 1.6352336406707764, "eval_runtime": 241.6423, "eval_samples_per_second": 262.686, "eval_steps_per_second": 1.026, "step": 616000 }, { "epoch": 2.5098560020531853, "grad_norm": 9.041913032531738, "learning_rate": 0.003443270948879127, "loss": 7.763, "step": 616100 }, { "epoch": 2.510263380076567, "grad_norm": 3.324021339416504, "learning_rate": 0.003442811252735811, "loss": 7.7677, "step": 616200 }, { "epoch": 2.5106707580999483, "grad_norm": 7.944117069244385, "learning_rate": 0.0034423515195157986, "loss": 7.7356, "step": 616300 }, { "epoch": 2.51107813612333, "grad_norm": 7.065430641174316, "learning_rate": 0.003441891749237269, "loss": 7.7934, "step": 616400 }, { "epoch": 2.511485514146711, "grad_norm": 3.643815040588379, "learning_rate": 0.0034414319419183983, "loss": 7.7855, "step": 616500 }, { "epoch": 2.5118928921700925, "grad_norm": 3.610201597213745, "learning_rate": 0.0034409720975773664, "loss": 7.7476, "step": 616600 }, { "epoch": 2.512300270193474, "grad_norm": 5.183040142059326, "learning_rate": 0.0034405122162323477, "loss": 7.7557, "step": 616700 }, { "epoch": 2.5127076482168555, "grad_norm": 6.824285507202148, "learning_rate": 0.0034400522979015297, "loss": 7.7853, "step": 616800 }, { "epoch": 2.513115026240237, "grad_norm": 10.065016746520996, "learning_rate": 0.0034395923426030917, "loss": 7.773, "step": 616900 }, { "epoch": 2.513522404263618, "grad_norm": 4.384364604949951, "learning_rate": 0.0034391323503552207, "loss": 7.7912, "step": 617000 }, { "epoch": 2.513522404263618, "eval_MaskedAccuracy": 0.5024734477734989, "eval_loss": 1.639122486114502, "eval_runtime": 161.8896, "eval_samples_per_second": 392.094, "eval_steps_per_second": 1.532, "step": 617000 }, { "epoch": 2.5139297822869997, "grad_norm": 2.951051712036133, "learning_rate": 0.003438672321176104, "loss": 7.7722, "step": 617100 }, { "epoch": 2.5143371603103812, "grad_norm": 11.248997688293457, "learning_rate": 0.0034382122550839253, "loss": 7.7819, "step": 617200 }, { "epoch": 2.5147445383337628, "grad_norm": 4.652326583862305, "learning_rate": 0.003437752152096874, "loss": 7.7744, "step": 617300 }, { "epoch": 2.5151519163571443, "grad_norm": 2.3746697902679443, "learning_rate": 0.0034372920122331435, "loss": 7.7547, "step": 617400 }, { "epoch": 2.515559294380526, "grad_norm": 7.469295978546143, "learning_rate": 0.003436831835510922, "loss": 7.7722, "step": 617500 }, { "epoch": 2.5159666724039074, "grad_norm": 5.900223731994629, "learning_rate": 0.0034363716219484065, "loss": 7.7933, "step": 617600 }, { "epoch": 2.516374050427289, "grad_norm": 6.131950855255127, "learning_rate": 0.0034359113715637856, "loss": 7.7672, "step": 617700 }, { "epoch": 2.51678142845067, "grad_norm": 6.427236557006836, "learning_rate": 0.003435451084375261, "loss": 7.7507, "step": 617800 }, { "epoch": 2.5171888064740515, "grad_norm": 7.460343360900879, "learning_rate": 0.003434990760401027, "loss": 7.7422, "step": 617900 }, { "epoch": 2.517596184497433, "grad_norm": 3.753558397293091, "learning_rate": 0.0034345303996592834, "loss": 7.7799, "step": 618000 }, { "epoch": 2.517596184497433, "eval_MaskedAccuracy": 0.5025722323480888, "eval_loss": 1.6341897249221802, "eval_runtime": 185.6726, "eval_samples_per_second": 341.871, "eval_steps_per_second": 1.336, "step": 618000 }, { "epoch": 2.5180035625208146, "grad_norm": 3.9119603633880615, "learning_rate": 0.003434070002168237, "loss": 7.7853, "step": 618100 }, { "epoch": 2.5184109405441957, "grad_norm": 5.590705394744873, "learning_rate": 0.0034336095679460815, "loss": 7.7477, "step": 618200 }, { "epoch": 2.518818318567577, "grad_norm": 4.970400810241699, "learning_rate": 0.003433149097011025, "loss": 7.7607, "step": 618300 }, { "epoch": 2.5192256965909587, "grad_norm": 5.685876846313477, "learning_rate": 0.0034326885893812696, "loss": 7.7934, "step": 618400 }, { "epoch": 2.5196330746143403, "grad_norm": 7.493686199188232, "learning_rate": 0.0034322280450750234, "loss": 7.7888, "step": 618500 }, { "epoch": 2.520040452637722, "grad_norm": 3.7989518642425537, "learning_rate": 0.003431767464110492, "loss": 7.7608, "step": 618600 }, { "epoch": 2.5204478306611033, "grad_norm": 2.843581199645996, "learning_rate": 0.0034313068465058843, "loss": 7.7442, "step": 618700 }, { "epoch": 2.520855208684485, "grad_norm": 3.6405067443847656, "learning_rate": 0.0034308461922794168, "loss": 7.7532, "step": 618800 }, { "epoch": 2.5212625867078664, "grad_norm": 6.370888710021973, "learning_rate": 0.0034303855014492964, "loss": 7.7689, "step": 618900 }, { "epoch": 2.5216699647312475, "grad_norm": 4.907922744750977, "learning_rate": 0.0034299247740337407, "loss": 7.7687, "step": 619000 }, { "epoch": 2.5216699647312475, "eval_MaskedAccuracy": 0.5023274693518835, "eval_loss": 1.6362217664718628, "eval_runtime": 174.3653, "eval_samples_per_second": 364.04, "eval_steps_per_second": 1.422, "step": 619000 }, { "epoch": 2.522077342754629, "grad_norm": 3.8803508281707764, "learning_rate": 0.003429464010050957, "loss": 7.7692, "step": 619100 }, { "epoch": 2.5224847207780106, "grad_norm": 3.016896963119507, "learning_rate": 0.00342900320951917, "loss": 7.7835, "step": 619200 }, { "epoch": 2.522892098801392, "grad_norm": 7.485684394836426, "learning_rate": 0.003428542372456591, "loss": 7.7638, "step": 619300 }, { "epoch": 2.5232994768247736, "grad_norm": 3.4870240688323975, "learning_rate": 0.0034280814988814435, "loss": 7.7549, "step": 619400 }, { "epoch": 2.5237068548481547, "grad_norm": 2.594203233718872, "learning_rate": 0.003427620588811948, "loss": 7.7738, "step": 619500 }, { "epoch": 2.5241142328715362, "grad_norm": 4.025074481964111, "learning_rate": 0.0034271596422663286, "loss": 7.766, "step": 619600 }, { "epoch": 2.524521610894918, "grad_norm": 15.031338691711426, "learning_rate": 0.0034266986592628034, "loss": 7.7444, "step": 619700 }, { "epoch": 2.5249289889182993, "grad_norm": 3.5820491313934326, "learning_rate": 0.0034262376398196013, "loss": 7.7875, "step": 619800 }, { "epoch": 2.525336366941681, "grad_norm": 3.851367473602295, "learning_rate": 0.0034257765839549475, "loss": 7.7547, "step": 619900 }, { "epoch": 2.5257437449650624, "grad_norm": 4.407445907592773, "learning_rate": 0.0034253154916870695, "loss": 7.7818, "step": 620000 }, { "epoch": 2.5257437449650624, "eval_MaskedAccuracy": 0.5017868783965054, "eval_loss": 1.6382781267166138, "eval_runtime": 158.9167, "eval_samples_per_second": 399.429, "eval_steps_per_second": 1.561, "step": 620000 }, { "epoch": 2.526151122988444, "grad_norm": 5.073345184326172, "learning_rate": 0.0034248543630341992, "loss": 7.7495, "step": 620100 }, { "epoch": 2.5265585010118254, "grad_norm": 4.661971569061279, "learning_rate": 0.0034243931980145687, "loss": 7.751, "step": 620200 }, { "epoch": 2.5269658790352065, "grad_norm": 3.7760164737701416, "learning_rate": 0.0034239319966464037, "loss": 7.7607, "step": 620300 }, { "epoch": 2.527373257058588, "grad_norm": 5.625454425811768, "learning_rate": 0.003423470758947944, "loss": 7.7581, "step": 620400 }, { "epoch": 2.5277806350819696, "grad_norm": 3.6919190883636475, "learning_rate": 0.003423009484937423, "loss": 7.7946, "step": 620500 }, { "epoch": 2.528188013105351, "grad_norm": 9.471922874450684, "learning_rate": 0.003422548174633077, "loss": 7.7613, "step": 620600 }, { "epoch": 2.528595391128732, "grad_norm": 2.955620288848877, "learning_rate": 0.003422086828053144, "loss": 7.7491, "step": 620700 }, { "epoch": 2.5290027691521137, "grad_norm": 3.1450116634368896, "learning_rate": 0.0034216254452158652, "loss": 7.7672, "step": 620800 }, { "epoch": 2.5294101471754953, "grad_norm": 2.3749918937683105, "learning_rate": 0.0034211640261394777, "loss": 7.7367, "step": 620900 }, { "epoch": 2.529817525198877, "grad_norm": 3.8808748722076416, "learning_rate": 0.0034207025708422258, "loss": 7.781, "step": 621000 }, { "epoch": 2.529817525198877, "eval_MaskedAccuracy": 0.503201223599316, "eval_loss": 1.63099205493927, "eval_runtime": 160.6549, "eval_samples_per_second": 395.108, "eval_steps_per_second": 1.544, "step": 621000 }, { "epoch": 2.5302249032222583, "grad_norm": 4.164322376251221, "learning_rate": 0.0034202410793423544, "loss": 7.7878, "step": 621100 }, { "epoch": 2.53063228124564, "grad_norm": 5.7266645431518555, "learning_rate": 0.003419779551658109, "loss": 7.7607, "step": 621200 }, { "epoch": 2.5310396592690214, "grad_norm": 2.7845304012298584, "learning_rate": 0.003419317987807737, "loss": 7.7895, "step": 621300 }, { "epoch": 2.531447037292403, "grad_norm": 4.079392910003662, "learning_rate": 0.003418856387809479, "loss": 7.7829, "step": 621400 }, { "epoch": 2.531854415315784, "grad_norm": 6.282017707824707, "learning_rate": 0.00341839475168159, "loss": 7.7442, "step": 621500 }, { "epoch": 2.5322617933391656, "grad_norm": 3.3576087951660156, "learning_rate": 0.0034179330794423257, "loss": 7.7694, "step": 621600 }, { "epoch": 2.532669171362547, "grad_norm": 3.2473766803741455, "learning_rate": 0.003417471371109934, "loss": 7.7467, "step": 621700 }, { "epoch": 2.5330765493859286, "grad_norm": 3.8091328144073486, "learning_rate": 0.003417009626702671, "loss": 7.7745, "step": 621800 }, { "epoch": 2.53348392740931, "grad_norm": 1.907678484916687, "learning_rate": 0.0034165478462387894, "loss": 7.7477, "step": 621900 }, { "epoch": 2.5338913054326913, "grad_norm": 2.0353424549102783, "learning_rate": 0.0034160860297365414, "loss": 7.7626, "step": 622000 }, { "epoch": 2.5338913054326913, "eval_MaskedAccuracy": 0.5024660503472291, "eval_loss": 1.6323477029800415, "eval_runtime": 158.4253, "eval_samples_per_second": 400.668, "eval_steps_per_second": 1.565, "step": 622000 }, { "epoch": 2.534298683456073, "grad_norm": 8.999128341674805, "learning_rate": 0.0034156241772141933, "loss": 7.7751, "step": 622100 }, { "epoch": 2.5347060614794543, "grad_norm": 5.298986434936523, "learning_rate": 0.00341516228869, "loss": 7.7877, "step": 622200 }, { "epoch": 2.535113439502836, "grad_norm": 6.519100189208984, "learning_rate": 0.0034147003641822236, "loss": 7.7697, "step": 622300 }, { "epoch": 2.5355208175262174, "grad_norm": 5.478178024291992, "learning_rate": 0.0034142384037091256, "loss": 7.745, "step": 622400 }, { "epoch": 2.535928195549599, "grad_norm": 3.7654266357421875, "learning_rate": 0.0034137764072889724, "loss": 7.7622, "step": 622500 }, { "epoch": 2.5363355735729805, "grad_norm": 4.415789604187012, "learning_rate": 0.0034133143749400254, "loss": 7.7689, "step": 622600 }, { "epoch": 2.536742951596362, "grad_norm": 2.717233180999756, "learning_rate": 0.003412852306680553, "loss": 7.7383, "step": 622700 }, { "epoch": 2.537150329619743, "grad_norm": 10.035981178283691, "learning_rate": 0.0034123902025288227, "loss": 7.7406, "step": 622800 }, { "epoch": 2.5375577076431246, "grad_norm": 7.083771705627441, "learning_rate": 0.0034119280625031063, "loss": 7.7589, "step": 622900 }, { "epoch": 2.537965085666506, "grad_norm": 2.220515727996826, "learning_rate": 0.0034114658866216743, "loss": 7.7809, "step": 623000 }, { "epoch": 2.537965085666506, "eval_MaskedAccuracy": 0.5033736089149096, "eval_loss": 1.6255074739456177, "eval_runtime": 162.0465, "eval_samples_per_second": 391.715, "eval_steps_per_second": 1.53, "step": 623000 }, { "epoch": 2.5383724636898877, "grad_norm": 3.3882949352264404, "learning_rate": 0.0034110036749027957, "loss": 7.7483, "step": 623100 }, { "epoch": 2.5387798417132688, "grad_norm": 4.393842697143555, "learning_rate": 0.003410541427364745, "loss": 7.7839, "step": 623200 }, { "epoch": 2.5391872197366503, "grad_norm": 4.683650970458984, "learning_rate": 0.0034100791440258007, "loss": 7.7726, "step": 623300 }, { "epoch": 2.539594597760032, "grad_norm": 3.0677711963653564, "learning_rate": 0.0034096168249042377, "loss": 7.752, "step": 623400 }, { "epoch": 2.5400019757834134, "grad_norm": 7.665134906768799, "learning_rate": 0.0034091544700183335, "loss": 7.7489, "step": 623500 }, { "epoch": 2.540409353806795, "grad_norm": 2.6495871543884277, "learning_rate": 0.003408692079386368, "loss": 7.7649, "step": 623600 }, { "epoch": 2.5408167318301764, "grad_norm": 6.374454021453857, "learning_rate": 0.0034082296530266185, "loss": 7.7857, "step": 623700 }, { "epoch": 2.541224109853558, "grad_norm": 4.619896411895752, "learning_rate": 0.00340776719095737, "loss": 7.7247, "step": 623800 }, { "epoch": 2.5416314878769395, "grad_norm": 5.068841457366943, "learning_rate": 0.0034073046931969064, "loss": 7.7504, "step": 623900 }, { "epoch": 2.5420388659003206, "grad_norm": 2.5633034706115723, "learning_rate": 0.003406842159763513, "loss": 7.7569, "step": 624000 }, { "epoch": 2.5420388659003206, "eval_MaskedAccuracy": 0.5031453808360214, "eval_loss": 1.6318475008010864, "eval_runtime": 156.4185, "eval_samples_per_second": 405.809, "eval_steps_per_second": 1.585, "step": 624000 }, { "epoch": 2.542446243923702, "grad_norm": 3.5641820430755615, "learning_rate": 0.0034063795906754744, "loss": 7.7768, "step": 624100 }, { "epoch": 2.5428536219470836, "grad_norm": 4.2368879318237305, "learning_rate": 0.00340591698595108, "loss": 7.7521, "step": 624200 }, { "epoch": 2.543260999970465, "grad_norm": 3.035111665725708, "learning_rate": 0.0034054543456086207, "loss": 7.7471, "step": 624300 }, { "epoch": 2.5436683779938467, "grad_norm": 2.8946642875671387, "learning_rate": 0.003404991669666385, "loss": 7.7628, "step": 624400 }, { "epoch": 2.544075756017228, "grad_norm": 3.1361265182495117, "learning_rate": 0.0034045289581426615, "loss": 7.7874, "step": 624500 }, { "epoch": 2.5444831340406093, "grad_norm": 4.589052677154541, "learning_rate": 0.003404066211055753, "loss": 7.7339, "step": 624600 }, { "epoch": 2.544890512063991, "grad_norm": 9.63264274597168, "learning_rate": 0.0034036034284239446, "loss": 7.7424, "step": 624700 }, { "epoch": 2.5452978900873724, "grad_norm": 5.131928443908691, "learning_rate": 0.003403140610265541, "loss": 7.7656, "step": 624800 }, { "epoch": 2.545705268110754, "grad_norm": 4.377277851104736, "learning_rate": 0.0034026777565988302, "loss": 7.7772, "step": 624900 }, { "epoch": 2.5461126461341355, "grad_norm": 2.270489454269409, "learning_rate": 0.003402214867442119, "loss": 7.727, "step": 625000 }, { "epoch": 2.5461126461341355, "eval_MaskedAccuracy": 0.5028148942312095, "eval_loss": 1.6352275609970093, "eval_runtime": 163.2975, "eval_samples_per_second": 388.714, "eval_steps_per_second": 1.519, "step": 625000 }, { "epoch": 2.546520024157517, "grad_norm": 6.795170307159424, "learning_rate": 0.0034017519428137075, "loss": 7.76, "step": 625100 }, { "epoch": 2.5469274021808985, "grad_norm": 4.163559913635254, "learning_rate": 0.0034012889827318934, "loss": 7.7531, "step": 625200 }, { "epoch": 2.5473347802042796, "grad_norm": 4.35445499420166, "learning_rate": 0.003400825987214981, "loss": 7.775, "step": 625300 }, { "epoch": 2.547742158227661, "grad_norm": 5.694300174713135, "learning_rate": 0.003400362956281278, "loss": 7.7588, "step": 625400 }, { "epoch": 2.5481495362510427, "grad_norm": 11.355413436889648, "learning_rate": 0.0033998998899490867, "loss": 7.7714, "step": 625500 }, { "epoch": 2.548556914274424, "grad_norm": 4.696236610412598, "learning_rate": 0.0033994367882367177, "loss": 7.736, "step": 625600 }, { "epoch": 2.5489642922978053, "grad_norm": 3.539301872253418, "learning_rate": 0.0033989736511624777, "loss": 7.752, "step": 625700 }, { "epoch": 2.549371670321187, "grad_norm": 4.2535223960876465, "learning_rate": 0.0033985104787446767, "loss": 7.7551, "step": 625800 }, { "epoch": 2.5497790483445684, "grad_norm": 3.1228325366973877, "learning_rate": 0.003398047271001628, "loss": 7.7513, "step": 625900 }, { "epoch": 2.55018642636795, "grad_norm": 5.040442943572998, "learning_rate": 0.003397584027951647, "loss": 7.7251, "step": 626000 }, { "epoch": 2.55018642636795, "eval_MaskedAccuracy": 0.5020356942354235, "eval_loss": 1.630746841430664, "eval_runtime": 201.9807, "eval_samples_per_second": 314.268, "eval_steps_per_second": 1.228, "step": 626000 }, { "epoch": 2.5505938043913314, "grad_norm": 3.274500608444214, "learning_rate": 0.003397120749613042, "loss": 7.764, "step": 626100 }, { "epoch": 2.551001182414713, "grad_norm": 5.133733749389648, "learning_rate": 0.0033966574360041347, "loss": 7.7578, "step": 626200 }, { "epoch": 2.5514085604380945, "grad_norm": 3.882274866104126, "learning_rate": 0.00339619408714324, "loss": 7.7649, "step": 626300 }, { "epoch": 2.551815938461476, "grad_norm": 2.9392213821411133, "learning_rate": 0.003395730703048678, "loss": 7.7526, "step": 626400 }, { "epoch": 2.552223316484857, "grad_norm": 5.912354946136475, "learning_rate": 0.003395267283738765, "loss": 7.7781, "step": 626500 }, { "epoch": 2.5526306945082387, "grad_norm": 4.403803825378418, "learning_rate": 0.0033948038292318243, "loss": 7.7613, "step": 626600 }, { "epoch": 2.55303807253162, "grad_norm": 4.239830017089844, "learning_rate": 0.003394340339546181, "loss": 7.7316, "step": 626700 }, { "epoch": 2.5534454505550017, "grad_norm": 2.795454978942871, "learning_rate": 0.003393876814700157, "loss": 7.7912, "step": 626800 }, { "epoch": 2.5538528285783832, "grad_norm": 4.8407793045043945, "learning_rate": 0.0033934132547120796, "loss": 7.7749, "step": 626900 }, { "epoch": 2.5542602066017643, "grad_norm": 3.2152836322784424, "learning_rate": 0.0033929496596002756, "loss": 7.7589, "step": 627000 }, { "epoch": 2.5542602066017643, "eval_MaskedAccuracy": 0.5028559796056031, "eval_loss": 1.6388236284255981, "eval_runtime": 199.451, "eval_samples_per_second": 318.254, "eval_steps_per_second": 1.243, "step": 627000 }, { "epoch": 2.554667584625146, "grad_norm": 4.629742622375488, "learning_rate": 0.003392486029383074, "loss": 7.7668, "step": 627100 }, { "epoch": 2.5550749626485274, "grad_norm": 4.139995574951172, "learning_rate": 0.0033920223640788033, "loss": 7.7629, "step": 627200 }, { "epoch": 2.555482340671909, "grad_norm": 7.941045761108398, "learning_rate": 0.0033915586637057937, "loss": 7.745, "step": 627300 }, { "epoch": 2.5558897186952905, "grad_norm": 3.0665557384490967, "learning_rate": 0.003391094928282374, "loss": 7.7789, "step": 627400 }, { "epoch": 2.556297096718672, "grad_norm": 3.5421810150146484, "learning_rate": 0.0033906311578268842, "loss": 7.7454, "step": 627500 }, { "epoch": 2.5567044747420535, "grad_norm": 2.8414247035980225, "learning_rate": 0.0033901673523576573, "loss": 7.7498, "step": 627600 }, { "epoch": 2.5571118527654346, "grad_norm": 2.641085147857666, "learning_rate": 0.0033897035118930306, "loss": 7.7597, "step": 627700 }, { "epoch": 2.557519230788816, "grad_norm": 4.267969131469727, "learning_rate": 0.003389239636451345, "loss": 7.7813, "step": 627800 }, { "epoch": 2.5579266088121977, "grad_norm": 4.983828067779541, "learning_rate": 0.0033887757260509326, "loss": 7.7749, "step": 627900 }, { "epoch": 2.558333986835579, "grad_norm": 2.1012320518493652, "learning_rate": 0.003388311780710142, "loss": 7.7959, "step": 628000 }, { "epoch": 2.558333986835579, "eval_MaskedAccuracy": 0.5024192944190107, "eval_loss": 1.6422089338302612, "eval_runtime": 168.2914, "eval_samples_per_second": 377.179, "eval_steps_per_second": 1.474, "step": 628000 }, { "epoch": 2.5587413648589608, "grad_norm": 7.516231536865234, "learning_rate": 0.0033878478004473124, "loss": 7.7633, "step": 628100 }, { "epoch": 2.559148742882342, "grad_norm": 4.119853496551514, "learning_rate": 0.0033873837852807865, "loss": 7.7588, "step": 628200 }, { "epoch": 2.5595561209057234, "grad_norm": 3.6812572479248047, "learning_rate": 0.0033869197352289137, "loss": 7.7777, "step": 628300 }, { "epoch": 2.559963498929105, "grad_norm": 5.309595584869385, "learning_rate": 0.003386455650310039, "loss": 7.7878, "step": 628400 }, { "epoch": 2.5603708769524864, "grad_norm": 6.48313570022583, "learning_rate": 0.0033859915305425037, "loss": 7.7696, "step": 628500 }, { "epoch": 2.560778254975868, "grad_norm": 7.021803855895996, "learning_rate": 0.00338552737594466, "loss": 7.7929, "step": 628600 }, { "epoch": 2.5611856329992495, "grad_norm": 4.681790351867676, "learning_rate": 0.003385063186534861, "loss": 7.7431, "step": 628700 }, { "epoch": 2.561593011022631, "grad_norm": 7.020096302032471, "learning_rate": 0.00338459896233146, "loss": 7.7527, "step": 628800 }, { "epoch": 2.5620003890460126, "grad_norm": 6.3975725173950195, "learning_rate": 0.0033841347033528057, "loss": 7.7605, "step": 628900 }, { "epoch": 2.5624077670693937, "grad_norm": 8.613045692443848, "learning_rate": 0.0033836704096172547, "loss": 7.7299, "step": 629000 }, { "epoch": 2.5624077670693937, "eval_MaskedAccuracy": 0.5020393620005411, "eval_loss": 1.6370080709457397, "eval_runtime": 200.737, "eval_samples_per_second": 316.215, "eval_steps_per_second": 1.235, "step": 629000 }, { "epoch": 2.562815145092775, "grad_norm": 4.094930648803711, "learning_rate": 0.003383206081143165, "loss": 7.7384, "step": 629100 }, { "epoch": 2.5632225231161567, "grad_norm": 3.1509199142456055, "learning_rate": 0.0033827417179488907, "loss": 7.7706, "step": 629200 }, { "epoch": 2.5636299011395383, "grad_norm": 5.84613561630249, "learning_rate": 0.0033822773200527898, "loss": 7.7464, "step": 629300 }, { "epoch": 2.56403727916292, "grad_norm": 2.2058050632476807, "learning_rate": 0.003381812887473227, "loss": 7.737, "step": 629400 }, { "epoch": 2.564444657186301, "grad_norm": 3.3693466186523438, "learning_rate": 0.0033813484202285596, "loss": 7.7523, "step": 629500 }, { "epoch": 2.5648520352096824, "grad_norm": 10.856343269348145, "learning_rate": 0.0033808839183371527, "loss": 7.7631, "step": 629600 }, { "epoch": 2.565259413233064, "grad_norm": 3.346740484237671, "learning_rate": 0.0033804193818173706, "loss": 7.7421, "step": 629700 }, { "epoch": 2.5656667912564455, "grad_norm": 5.470869541168213, "learning_rate": 0.0033799548106875745, "loss": 7.7382, "step": 629800 }, { "epoch": 2.566074169279827, "grad_norm": 4.04727840423584, "learning_rate": 0.003379490204966138, "loss": 7.7826, "step": 629900 }, { "epoch": 2.5664815473032085, "grad_norm": 3.49943208694458, "learning_rate": 0.0033790255646714245, "loss": 7.7395, "step": 630000 }, { "epoch": 2.5664815473032085, "eval_MaskedAccuracy": 0.5026848141215671, "eval_loss": 1.6341649293899536, "eval_runtime": 218.4629, "eval_samples_per_second": 290.557, "eval_steps_per_second": 1.135, "step": 630000 }, { "epoch": 2.56688892532659, "grad_norm": 4.030453205108643, "learning_rate": 0.0033785608898218083, "loss": 7.7338, "step": 630100 }, { "epoch": 2.567296303349971, "grad_norm": 4.446766376495361, "learning_rate": 0.0033780961804356596, "loss": 7.7699, "step": 630200 }, { "epoch": 2.5677036813733527, "grad_norm": 3.2295103073120117, "learning_rate": 0.0033776314365313452, "loss": 7.7671, "step": 630300 }, { "epoch": 2.5681110593967342, "grad_norm": 3.9324440956115723, "learning_rate": 0.003377166658127242, "loss": 7.7481, "step": 630400 }, { "epoch": 2.5685184374201158, "grad_norm": 3.631042003631592, "learning_rate": 0.003376701845241731, "loss": 7.7388, "step": 630500 }, { "epoch": 2.5689258154434973, "grad_norm": 4.721069812774658, "learning_rate": 0.0033762369978931795, "loss": 7.7604, "step": 630600 }, { "epoch": 2.5693331934668784, "grad_norm": 7.176989555358887, "learning_rate": 0.003375772116099972, "loss": 7.7796, "step": 630700 }, { "epoch": 2.56974057149026, "grad_norm": 5.375190734863281, "learning_rate": 0.003375307199880481, "loss": 7.7791, "step": 630800 }, { "epoch": 2.5701479495136414, "grad_norm": 5.220234394073486, "learning_rate": 0.0033748422492530955, "loss": 7.7767, "step": 630900 }, { "epoch": 2.570555327537023, "grad_norm": 2.8575234413146973, "learning_rate": 0.0033743772642361946, "loss": 7.7608, "step": 631000 }, { "epoch": 2.570555327537023, "eval_MaskedAccuracy": 0.5033651866433944, "eval_loss": 1.6335687637329102, "eval_runtime": 168.7379, "eval_samples_per_second": 376.181, "eval_steps_per_second": 1.47, "step": 631000 }, { "epoch": 2.5709627055604045, "grad_norm": 3.3210229873657227, "learning_rate": 0.003373912244848161, "loss": 7.7667, "step": 631100 }, { "epoch": 2.571370083583786, "grad_norm": 3.0266685485839844, "learning_rate": 0.0033734471911073777, "loss": 7.7601, "step": 631200 }, { "epoch": 2.5717774616071676, "grad_norm": 1.9625940322875977, "learning_rate": 0.00337298210303223, "loss": 7.779, "step": 631300 }, { "epoch": 2.572184839630549, "grad_norm": 2.838864326477051, "learning_rate": 0.003372516980641106, "loss": 7.7817, "step": 631400 }, { "epoch": 2.57259221765393, "grad_norm": 4.446281433105469, "learning_rate": 0.003372051823952398, "loss": 7.7641, "step": 631500 }, { "epoch": 2.5729995956773117, "grad_norm": 2.1418299674987793, "learning_rate": 0.003371586632984494, "loss": 7.7803, "step": 631600 }, { "epoch": 2.5734069737006933, "grad_norm": 6.692812919616699, "learning_rate": 0.0033711214077557865, "loss": 7.7374, "step": 631700 }, { "epoch": 2.573814351724075, "grad_norm": 3.6473629474639893, "learning_rate": 0.00337065614828467, "loss": 7.7772, "step": 631800 }, { "epoch": 2.5742217297474563, "grad_norm": 5.349830627441406, "learning_rate": 0.0033701908545895314, "loss": 7.7521, "step": 631900 }, { "epoch": 2.5746291077708374, "grad_norm": 3.0547399520874023, "learning_rate": 0.0033697255266887743, "loss": 7.7611, "step": 632000 }, { "epoch": 2.5746291077708374, "eval_MaskedAccuracy": 0.5032770196930922, "eval_loss": 1.635548710823059, "eval_runtime": 201.8086, "eval_samples_per_second": 314.536, "eval_steps_per_second": 1.229, "step": 632000 }, { "epoch": 2.575036485794219, "grad_norm": 2.5424275398254395, "learning_rate": 0.003369260164600792, "loss": 7.7414, "step": 632100 }, { "epoch": 2.5754438638176005, "grad_norm": 7.3950395584106445, "learning_rate": 0.0033687947683439805, "loss": 7.7431, "step": 632200 }, { "epoch": 2.575851241840982, "grad_norm": 2.375016689300537, "learning_rate": 0.0033683293379367448, "loss": 7.7477, "step": 632300 }, { "epoch": 2.5762586198643636, "grad_norm": 3.9684011936187744, "learning_rate": 0.0033678638733974824, "loss": 7.7538, "step": 632400 }, { "epoch": 2.576665997887745, "grad_norm": 10.618762016296387, "learning_rate": 0.003367398374744596, "loss": 7.7222, "step": 632500 }, { "epoch": 2.5770733759111266, "grad_norm": 2.8266074657440186, "learning_rate": 0.0033669328419964908, "loss": 7.7424, "step": 632600 }, { "epoch": 2.5774807539345077, "grad_norm": 4.158894062042236, "learning_rate": 0.0033664672751715662, "loss": 7.7364, "step": 632700 }, { "epoch": 2.5778881319578892, "grad_norm": 3.104992628097534, "learning_rate": 0.0033660016742882346, "loss": 7.7622, "step": 632800 }, { "epoch": 2.5782955099812708, "grad_norm": 8.103853225708008, "learning_rate": 0.0033655360393649004, "loss": 7.7499, "step": 632900 }, { "epoch": 2.5787028880046523, "grad_norm": 7.338804721832275, "learning_rate": 0.0033650703704199768, "loss": 7.7608, "step": 633000 }, { "epoch": 2.5787028880046523, "eval_MaskedAccuracy": 0.5028308156270933, "eval_loss": 1.6374253034591675, "eval_runtime": 155.7305, "eval_samples_per_second": 407.602, "eval_steps_per_second": 1.592, "step": 633000 }, { "epoch": 2.579110266028034, "grad_norm": 4.880356788635254, "learning_rate": 0.0033646046674718757, "loss": 7.764, "step": 633100 }, { "epoch": 2.579517644051415, "grad_norm": 7.644883632659912, "learning_rate": 0.003364138930539002, "loss": 7.7473, "step": 633200 }, { "epoch": 2.5799250220747965, "grad_norm": 2.9873547554016113, "learning_rate": 0.0033636731596397708, "loss": 7.7226, "step": 633300 }, { "epoch": 2.580332400098178, "grad_norm": 2.4233789443969727, "learning_rate": 0.0033632073547925955, "loss": 7.7255, "step": 633400 }, { "epoch": 2.5807397781215595, "grad_norm": 5.47147274017334, "learning_rate": 0.003362741516015892, "loss": 7.7682, "step": 633500 }, { "epoch": 2.581147156144941, "grad_norm": 7.26539421081543, "learning_rate": 0.0033622756433280784, "loss": 7.7674, "step": 633600 }, { "epoch": 2.5815545341683226, "grad_norm": 3.6344358921051025, "learning_rate": 0.0033618097367475756, "loss": 7.8084, "step": 633700 }, { "epoch": 2.581961912191704, "grad_norm": 5.889102935791016, "learning_rate": 0.0033613437962927996, "loss": 7.7693, "step": 633800 }, { "epoch": 2.5823692902150857, "grad_norm": 2.740557909011841, "learning_rate": 0.0033608778219821763, "loss": 7.7431, "step": 633900 }, { "epoch": 2.5827766682384667, "grad_norm": 5.892124176025391, "learning_rate": 0.003360411813834124, "loss": 7.7568, "step": 634000 }, { "epoch": 2.5827766682384667, "eval_MaskedAccuracy": 0.5030534435764971, "eval_loss": 1.6318116188049316, "eval_runtime": 160.8996, "eval_samples_per_second": 394.507, "eval_steps_per_second": 1.541, "step": 634000 }, { "epoch": 2.5831840462618483, "grad_norm": 7.4653778076171875, "learning_rate": 0.0033599457718670673, "loss": 7.7416, "step": 634100 }, { "epoch": 2.58359142428523, "grad_norm": 2.5234272480010986, "learning_rate": 0.003359479696099433, "loss": 7.7468, "step": 634200 }, { "epoch": 2.5839988023086113, "grad_norm": 3.5666096210479736, "learning_rate": 0.0033590135865496484, "loss": 7.7775, "step": 634300 }, { "epoch": 2.584406180331993, "grad_norm": 4.852608680725098, "learning_rate": 0.003358547443236134, "loss": 7.7654, "step": 634400 }, { "epoch": 2.584813558355374, "grad_norm": 3.440246105194092, "learning_rate": 0.0033580812661773243, "loss": 7.7464, "step": 634500 }, { "epoch": 2.5852209363787555, "grad_norm": 4.503756999969482, "learning_rate": 0.0033576150553916487, "loss": 7.7556, "step": 634600 }, { "epoch": 2.585628314402137, "grad_norm": 6.754344463348389, "learning_rate": 0.003357148810897542, "loss": 7.7561, "step": 634700 }, { "epoch": 2.5860356924255186, "grad_norm": 3.8878188133239746, "learning_rate": 0.0033566825327134404, "loss": 7.7432, "step": 634800 }, { "epoch": 2.5864430704489, "grad_norm": 3.3473639488220215, "learning_rate": 0.00335621622085777, "loss": 7.7797, "step": 634900 }, { "epoch": 2.5868504484722816, "grad_norm": 3.869795799255371, "learning_rate": 0.003355749875348968, "loss": 7.7359, "step": 635000 }, { "epoch": 2.5868504484722816, "eval_MaskedAccuracy": 0.5022740854793031, "eval_loss": 1.6272011995315552, "eval_runtime": 169.6625, "eval_samples_per_second": 374.131, "eval_steps_per_second": 1.462, "step": 635000 }, { "epoch": 2.587257826495663, "grad_norm": 3.1247730255126953, "learning_rate": 0.003355283496205473, "loss": 7.7528, "step": 635100 }, { "epoch": 2.5876652045190442, "grad_norm": 3.1590864658355713, "learning_rate": 0.003354817083445727, "loss": 7.7502, "step": 635200 }, { "epoch": 2.588072582542426, "grad_norm": 11.75013542175293, "learning_rate": 0.0033543506370881638, "loss": 7.7724, "step": 635300 }, { "epoch": 2.5884799605658073, "grad_norm": 10.730052947998047, "learning_rate": 0.003353884157151228, "loss": 7.7249, "step": 635400 }, { "epoch": 2.588887338589189, "grad_norm": 5.237130641937256, "learning_rate": 0.0033534176436533602, "loss": 7.7701, "step": 635500 }, { "epoch": 2.5892947166125704, "grad_norm": 5.012698650360107, "learning_rate": 0.003352951096613007, "loss": 7.7378, "step": 635600 }, { "epoch": 2.5897020946359515, "grad_norm": 3.3394856452941895, "learning_rate": 0.0033524845160486146, "loss": 7.7727, "step": 635700 }, { "epoch": 2.590109472659333, "grad_norm": 4.111293315887451, "learning_rate": 0.0033520179019786243, "loss": 7.7264, "step": 635800 }, { "epoch": 2.5905168506827145, "grad_norm": 3.9242944717407227, "learning_rate": 0.0033515512544214842, "loss": 7.7592, "step": 635900 }, { "epoch": 2.590924228706096, "grad_norm": 3.3268682956695557, "learning_rate": 0.003351084573395647, "loss": 7.7419, "step": 636000 }, { "epoch": 2.590924228706096, "eval_MaskedAccuracy": 0.5030383245427981, "eval_loss": 1.6388341188430786, "eval_runtime": 155.5487, "eval_samples_per_second": 408.078, "eval_steps_per_second": 1.594, "step": 636000 }, { "epoch": 2.5913316067294776, "grad_norm": 6.217716693878174, "learning_rate": 0.0033506178589195615, "loss": 7.7713, "step": 636100 }, { "epoch": 2.591738984752859, "grad_norm": 3.2350013256073, "learning_rate": 0.0033501511110116777, "loss": 7.754, "step": 636200 }, { "epoch": 2.5921463627762407, "grad_norm": 6.423516273498535, "learning_rate": 0.0033496843296904506, "loss": 7.7939, "step": 636300 }, { "epoch": 2.592553740799622, "grad_norm": 5.533207893371582, "learning_rate": 0.003349217514974333, "loss": 7.768, "step": 636400 }, { "epoch": 2.5929611188230033, "grad_norm": 3.31998610496521, "learning_rate": 0.0033487506668817835, "loss": 7.743, "step": 636500 }, { "epoch": 2.593368496846385, "grad_norm": 3.193930149078369, "learning_rate": 0.0033482837854312574, "loss": 7.7569, "step": 636600 }, { "epoch": 2.5937758748697664, "grad_norm": 7.648054599761963, "learning_rate": 0.0033478168706412115, "loss": 7.7484, "step": 636700 }, { "epoch": 2.594183252893148, "grad_norm": 4.330843925476074, "learning_rate": 0.0033473499225301073, "loss": 7.7393, "step": 636800 }, { "epoch": 2.5945906309165294, "grad_norm": 2.8680262565612793, "learning_rate": 0.0033468829411164073, "loss": 7.7703, "step": 636900 }, { "epoch": 2.5949980089399105, "grad_norm": 3.8069069385528564, "learning_rate": 0.0033464159264185705, "loss": 7.736, "step": 637000 }, { "epoch": 2.5949980089399105, "eval_MaskedAccuracy": 0.5022587825609062, "eval_loss": 1.6388996839523315, "eval_runtime": 185.8618, "eval_samples_per_second": 341.523, "eval_steps_per_second": 1.334, "step": 637000 }, { "epoch": 2.595405386963292, "grad_norm": 5.330597877502441, "learning_rate": 0.003345948878455062, "loss": 7.7636, "step": 637100 }, { "epoch": 2.5958127649866736, "grad_norm": 3.6598243713378906, "learning_rate": 0.0033454817972443504, "loss": 7.778, "step": 637200 }, { "epoch": 2.596220143010055, "grad_norm": 2.821416139602661, "learning_rate": 0.003345014682804897, "loss": 7.7682, "step": 637300 }, { "epoch": 2.5966275210334366, "grad_norm": 1.9124808311462402, "learning_rate": 0.0033445475351551704, "loss": 7.7222, "step": 637400 }, { "epoch": 2.597034899056818, "grad_norm": 2.547819137573242, "learning_rate": 0.0033440803543136385, "loss": 7.7314, "step": 637500 }, { "epoch": 2.5974422770801997, "grad_norm": 3.20186448097229, "learning_rate": 0.00334361314029877, "loss": 7.7568, "step": 637600 }, { "epoch": 2.597849655103581, "grad_norm": 4.085344314575195, "learning_rate": 0.0033431458931290426, "loss": 7.7625, "step": 637700 }, { "epoch": 2.5982570331269623, "grad_norm": 2.124180555343628, "learning_rate": 0.003342678612822924, "loss": 7.7627, "step": 637800 }, { "epoch": 2.598664411150344, "grad_norm": 9.462471008300781, "learning_rate": 0.0033422112993988884, "loss": 7.7629, "step": 637900 }, { "epoch": 2.5990717891737254, "grad_norm": 2.8271639347076416, "learning_rate": 0.0033417439528754134, "loss": 7.7276, "step": 638000 }, { "epoch": 2.5990717891737254, "eval_MaskedAccuracy": 0.5034621723905387, "eval_loss": 1.634547233581543, "eval_runtime": 151.1659, "eval_samples_per_second": 419.91, "eval_steps_per_second": 1.641, "step": 638000 }, { "epoch": 2.599479167197107, "grad_norm": 6.620189666748047, "learning_rate": 0.0033412765732709765, "loss": 7.7502, "step": 638100 }, { "epoch": 2.599886545220488, "grad_norm": 2.910623788833618, "learning_rate": 0.003340809160604051, "loss": 7.7625, "step": 638200 }, { "epoch": 2.6002939232438695, "grad_norm": 3.306079149246216, "learning_rate": 0.0033403417148931197, "loss": 7.7523, "step": 638300 }, { "epoch": 2.600701301267251, "grad_norm": 4.400734901428223, "learning_rate": 0.0033398742361566616, "loss": 7.7739, "step": 638400 }, { "epoch": 2.6011086792906326, "grad_norm": 3.394253969192505, "learning_rate": 0.00333940672441316, "loss": 7.742, "step": 638500 }, { "epoch": 2.601516057314014, "grad_norm": 3.1092782020568848, "learning_rate": 0.003338939179681099, "loss": 7.7571, "step": 638600 }, { "epoch": 2.6019234353373957, "grad_norm": 2.225964307785034, "learning_rate": 0.00333847160197896, "loss": 7.7552, "step": 638700 }, { "epoch": 2.602330813360777, "grad_norm": 3.435316562652588, "learning_rate": 0.00333800399132523, "loss": 7.7401, "step": 638800 }, { "epoch": 2.6027381913841587, "grad_norm": 10.924406051635742, "learning_rate": 0.0033375363477384, "loss": 7.7392, "step": 638900 }, { "epoch": 2.60314556940754, "grad_norm": 4.353061199188232, "learning_rate": 0.0033370686712369552, "loss": 7.7537, "step": 639000 }, { "epoch": 2.60314556940754, "eval_MaskedAccuracy": 0.5024711280498285, "eval_loss": 1.6405030488967896, "eval_runtime": 158.0755, "eval_samples_per_second": 401.555, "eval_steps_per_second": 1.569, "step": 639000 }, { "epoch": 2.6035529474309214, "grad_norm": 5.042884826660156, "learning_rate": 0.0033366009618393836, "loss": 7.7651, "step": 639100 }, { "epoch": 2.603960325454303, "grad_norm": 3.777416229248047, "learning_rate": 0.0033361332195641826, "loss": 7.7392, "step": 639200 }, { "epoch": 2.6043677034776844, "grad_norm": 3.00793719291687, "learning_rate": 0.0033356654444298356, "loss": 7.7564, "step": 639300 }, { "epoch": 2.604775081501066, "grad_norm": 5.785336017608643, "learning_rate": 0.003335197636454842, "loss": 7.7292, "step": 639400 }, { "epoch": 2.605182459524447, "grad_norm": 1.8484314680099487, "learning_rate": 0.0033347297956576975, "loss": 7.7556, "step": 639500 }, { "epoch": 2.6055898375478286, "grad_norm": 3.6654958724975586, "learning_rate": 0.003334261922056897, "loss": 7.7618, "step": 639600 }, { "epoch": 2.60599721557121, "grad_norm": 2.9968793392181396, "learning_rate": 0.003333794015670936, "loss": 7.7455, "step": 639700 }, { "epoch": 2.6064045935945916, "grad_norm": 2.7247536182403564, "learning_rate": 0.003333326076518316, "loss": 7.7662, "step": 639800 }, { "epoch": 2.606811971617973, "grad_norm": 2.114069938659668, "learning_rate": 0.003332858104617532, "loss": 7.7438, "step": 639900 }, { "epoch": 2.6072193496413547, "grad_norm": 3.1880838871002197, "learning_rate": 0.003332390099987095, "loss": 7.7387, "step": 640000 }, { "epoch": 2.6072193496413547, "eval_MaskedAccuracy": 0.5025778422856783, "eval_loss": 1.6369432210922241, "eval_runtime": 156.6582, "eval_samples_per_second": 405.188, "eval_steps_per_second": 1.583, "step": 640000 }, { "epoch": 2.6076267276647362, "grad_norm": 8.58343505859375, "learning_rate": 0.0033319220626455014, "loss": 7.775, "step": 640100 }, { "epoch": 2.6080341056881173, "grad_norm": 4.06058406829834, "learning_rate": 0.003331453992611252, "loss": 7.7449, "step": 640200 }, { "epoch": 2.608441483711499, "grad_norm": 4.162612438201904, "learning_rate": 0.003330985889902857, "loss": 7.7489, "step": 640300 }, { "epoch": 2.6088488617348804, "grad_norm": 6.647921085357666, "learning_rate": 0.0033305177545388223, "loss": 7.7241, "step": 640400 }, { "epoch": 2.609256239758262, "grad_norm": 2.6385583877563477, "learning_rate": 0.003330049586537656, "loss": 7.7574, "step": 640500 }, { "epoch": 2.6096636177816435, "grad_norm": 3.011281967163086, "learning_rate": 0.0033295813859178696, "loss": 7.7616, "step": 640600 }, { "epoch": 2.6100709958050246, "grad_norm": 6.106122970581055, "learning_rate": 0.003329113152697967, "loss": 7.7396, "step": 640700 }, { "epoch": 2.610478373828406, "grad_norm": 3.221099853515625, "learning_rate": 0.003328644886896465, "loss": 7.7382, "step": 640800 }, { "epoch": 2.6108857518517876, "grad_norm": 4.115914344787598, "learning_rate": 0.003328176588531876, "loss": 7.777, "step": 640900 }, { "epoch": 2.611293129875169, "grad_norm": 10.452240943908691, "learning_rate": 0.003327708257622711, "loss": 7.7523, "step": 641000 }, { "epoch": 2.611293129875169, "eval_MaskedAccuracy": 0.5026915635817152, "eval_loss": 1.6318708658218384, "eval_runtime": 153.341, "eval_samples_per_second": 413.953, "eval_steps_per_second": 1.617, "step": 641000 }, { "epoch": 2.6117005078985507, "grad_norm": 3.1379988193511963, "learning_rate": 0.003327239894187492, "loss": 7.7272, "step": 641100 }, { "epoch": 2.612107885921932, "grad_norm": 3.4544689655303955, "learning_rate": 0.003326771498244731, "loss": 7.749, "step": 641200 }, { "epoch": 2.6125152639453137, "grad_norm": 3.4039297103881836, "learning_rate": 0.0033263030698129485, "loss": 7.7518, "step": 641300 }, { "epoch": 2.6129226419686953, "grad_norm": 4.055280685424805, "learning_rate": 0.003325834608910663, "loss": 7.7408, "step": 641400 }, { "epoch": 2.6133300199920764, "grad_norm": 3.9467811584472656, "learning_rate": 0.0033253661155563936, "loss": 7.7629, "step": 641500 }, { "epoch": 2.613737398015458, "grad_norm": 4.436812877655029, "learning_rate": 0.003324897589768664, "loss": 7.7476, "step": 641600 }, { "epoch": 2.6141447760388394, "grad_norm": 5.523942470550537, "learning_rate": 0.003324429031566, "loss": 7.763, "step": 641700 }, { "epoch": 2.614552154062221, "grad_norm": 3.8268330097198486, "learning_rate": 0.003323960440966925, "loss": 7.7348, "step": 641800 }, { "epoch": 2.6149595320856025, "grad_norm": 3.319409132003784, "learning_rate": 0.0033234918179899623, "loss": 7.7337, "step": 641900 }, { "epoch": 2.6153669101089836, "grad_norm": 3.136472463607788, "learning_rate": 0.0033230231626536428, "loss": 7.7086, "step": 642000 }, { "epoch": 2.6153669101089836, "eval_MaskedAccuracy": 0.5035448803106163, "eval_loss": 1.6341197490692139, "eval_runtime": 257.8807, "eval_samples_per_second": 246.145, "eval_steps_per_second": 0.962, "step": 642000 }, { "epoch": 2.615774288132365, "grad_norm": 5.561611652374268, "learning_rate": 0.0033225544749764914, "loss": 7.7543, "step": 642100 }, { "epoch": 2.6161816661557467, "grad_norm": 6.583291053771973, "learning_rate": 0.0033220857549770377, "loss": 7.7651, "step": 642200 }, { "epoch": 2.616589044179128, "grad_norm": 4.528241157531738, "learning_rate": 0.0033216170026738163, "loss": 7.7199, "step": 642300 }, { "epoch": 2.6169964222025097, "grad_norm": 10.77587890625, "learning_rate": 0.0033211482180853535, "loss": 7.7579, "step": 642400 }, { "epoch": 2.6174038002258913, "grad_norm": 3.0642378330230713, "learning_rate": 0.0033206794012301867, "loss": 7.782, "step": 642500 }, { "epoch": 2.617811178249273, "grad_norm": 7.352479934692383, "learning_rate": 0.0033202105521268526, "loss": 7.7674, "step": 642600 }, { "epoch": 2.618218556272654, "grad_norm": 4.312407493591309, "learning_rate": 0.003319741670793884, "loss": 7.7521, "step": 642700 }, { "epoch": 2.6186259342960354, "grad_norm": 4.083638668060303, "learning_rate": 0.0033192727572498196, "loss": 7.7394, "step": 642800 }, { "epoch": 2.619033312319417, "grad_norm": 2.7038230895996094, "learning_rate": 0.0033188038115131986, "loss": 7.736, "step": 642900 }, { "epoch": 2.6194406903427985, "grad_norm": 4.815116882324219, "learning_rate": 0.0033183348336025603, "loss": 7.7477, "step": 643000 }, { "epoch": 2.6194406903427985, "eval_MaskedAccuracy": 0.5028943727183118, "eval_loss": 1.6297352313995361, "eval_runtime": 153.1634, "eval_samples_per_second": 414.433, "eval_steps_per_second": 1.619, "step": 643000 }, { "epoch": 2.61984806836618, "grad_norm": 7.506451606750488, "learning_rate": 0.0033178658235364466, "loss": 7.7345, "step": 643100 }, { "epoch": 2.620255446389561, "grad_norm": 3.9260177612304688, "learning_rate": 0.003317396781333397, "loss": 7.7586, "step": 643200 }, { "epoch": 2.6206628244129426, "grad_norm": 3.6912059783935547, "learning_rate": 0.0033169277070119576, "loss": 7.7536, "step": 643300 }, { "epoch": 2.621070202436324, "grad_norm": 3.7203218936920166, "learning_rate": 0.0033164586005906744, "loss": 7.7613, "step": 643400 }, { "epoch": 2.6214775804597057, "grad_norm": 4.618086814880371, "learning_rate": 0.0033159894620880923, "loss": 7.7718, "step": 643500 }, { "epoch": 2.6218849584830872, "grad_norm": 8.803803443908691, "learning_rate": 0.0033155202915227584, "loss": 7.7613, "step": 643600 }, { "epoch": 2.6222923365064688, "grad_norm": 4.569681167602539, "learning_rate": 0.0033150510889132243, "loss": 7.7379, "step": 643700 }, { "epoch": 2.6226997145298503, "grad_norm": 6.2448015213012695, "learning_rate": 0.0033145818542780385, "loss": 7.7458, "step": 643800 }, { "epoch": 2.623107092553232, "grad_norm": 6.161468505859375, "learning_rate": 0.0033141125876357514, "loss": 7.7255, "step": 643900 }, { "epoch": 2.623514470576613, "grad_norm": 4.657154560089111, "learning_rate": 0.0033136432890049166, "loss": 7.7212, "step": 644000 }, { "epoch": 2.623514470576613, "eval_MaskedAccuracy": 0.5034150424478206, "eval_loss": 1.629897117614746, "eval_runtime": 150.5706, "eval_samples_per_second": 421.57, "eval_steps_per_second": 1.647, "step": 644000 }, { "epoch": 2.6239218485999944, "grad_norm": 9.369250297546387, "learning_rate": 0.003313173958404086, "loss": 7.7378, "step": 644100 }, { "epoch": 2.624329226623376, "grad_norm": 3.7428979873657227, "learning_rate": 0.0033127045958518168, "loss": 7.7266, "step": 644200 }, { "epoch": 2.6247366046467575, "grad_norm": 3.641878604888916, "learning_rate": 0.003312235201366667, "loss": 7.727, "step": 644300 }, { "epoch": 2.625143982670139, "grad_norm": 7.004256248474121, "learning_rate": 0.003311765774967191, "loss": 7.7331, "step": 644400 }, { "epoch": 2.62555136069352, "grad_norm": 8.118968963623047, "learning_rate": 0.003311296316671948, "loss": 7.7447, "step": 644500 }, { "epoch": 2.6259587387169017, "grad_norm": 5.203165054321289, "learning_rate": 0.003310826826499498, "loss": 7.7414, "step": 644600 }, { "epoch": 2.626366116740283, "grad_norm": 4.296602725982666, "learning_rate": 0.0033103573044684083, "loss": 7.7464, "step": 644700 }, { "epoch": 2.6267734947636647, "grad_norm": 2.9566855430603027, "learning_rate": 0.003309887750597233, "loss": 7.739, "step": 644800 }, { "epoch": 2.6271808727870463, "grad_norm": 6.859632968902588, "learning_rate": 0.0033094181649045424, "loss": 7.7531, "step": 644900 }, { "epoch": 2.627588250810428, "grad_norm": 5.904131889343262, "learning_rate": 0.0033089485474088985, "loss": 7.7462, "step": 645000 }, { "epoch": 2.627588250810428, "eval_MaskedAccuracy": 0.5030494847748717, "eval_loss": 1.6355324983596802, "eval_runtime": 198.0358, "eval_samples_per_second": 320.528, "eval_steps_per_second": 1.252, "step": 645000 }, { "epoch": 2.6279956288338093, "grad_norm": 6.6715593338012695, "learning_rate": 0.003308478898128868, "loss": 7.7265, "step": 645100 }, { "epoch": 2.6284030068571904, "grad_norm": 9.638071060180664, "learning_rate": 0.0033080092170830176, "loss": 7.7136, "step": 645200 }, { "epoch": 2.628810384880572, "grad_norm": 4.502671241760254, "learning_rate": 0.0033075395042899223, "loss": 7.7503, "step": 645300 }, { "epoch": 2.6292177629039535, "grad_norm": 3.2229530811309814, "learning_rate": 0.003307069759768146, "loss": 7.7282, "step": 645400 }, { "epoch": 2.629625140927335, "grad_norm": 4.425695896148682, "learning_rate": 0.0033065999835362636, "loss": 7.7401, "step": 645500 }, { "epoch": 2.6300325189507165, "grad_norm": 4.474765300750732, "learning_rate": 0.003306130175612846, "loss": 7.7369, "step": 645600 }, { "epoch": 2.6304398969740976, "grad_norm": 3.017754554748535, "learning_rate": 0.00330566033601647, "loss": 7.7343, "step": 645700 }, { "epoch": 2.630847274997479, "grad_norm": 4.0706706047058105, "learning_rate": 0.0033051904647657076, "loss": 7.7278, "step": 645800 }, { "epoch": 2.6312546530208607, "grad_norm": 3.418870449066162, "learning_rate": 0.0033047205618791375, "loss": 7.7664, "step": 645900 }, { "epoch": 2.6316620310442422, "grad_norm": 4.286476135253906, "learning_rate": 0.003304250627375334, "loss": 7.7521, "step": 646000 }, { "epoch": 2.6316620310442422, "eval_MaskedAccuracy": 0.5034081972867275, "eval_loss": 1.6310018301010132, "eval_runtime": 189.7994, "eval_samples_per_second": 334.437, "eval_steps_per_second": 1.307, "step": 646000 }, { "epoch": 2.6320694090676238, "grad_norm": 3.2341248989105225, "learning_rate": 0.0033037806612728816, "loss": 7.7632, "step": 646100 }, { "epoch": 2.6324767870910053, "grad_norm": 4.886885166168213, "learning_rate": 0.0033033106635903535, "loss": 7.7327, "step": 646200 }, { "epoch": 2.632884165114387, "grad_norm": 5.385007381439209, "learning_rate": 0.0033028406343463346, "loss": 7.7425, "step": 646300 }, { "epoch": 2.6332915431377684, "grad_norm": 4.208031177520752, "learning_rate": 0.0033023705735594116, "loss": 7.7504, "step": 646400 }, { "epoch": 2.6336989211611495, "grad_norm": 5.101689338684082, "learning_rate": 0.003301900481248163, "loss": 7.7287, "step": 646500 }, { "epoch": 2.634106299184531, "grad_norm": 4.291695594787598, "learning_rate": 0.003301430357431178, "loss": 7.7218, "step": 646600 }, { "epoch": 2.6345136772079125, "grad_norm": 4.769300937652588, "learning_rate": 0.003300960202127043, "loss": 7.7584, "step": 646700 }, { "epoch": 2.634921055231294, "grad_norm": 3.5957601070404053, "learning_rate": 0.003300490015354342, "loss": 7.741, "step": 646800 }, { "epoch": 2.6353284332546756, "grad_norm": 2.1788899898529053, "learning_rate": 0.0033000197971316636, "loss": 7.7314, "step": 646900 }, { "epoch": 2.6357358112780567, "grad_norm": 2.3784098625183105, "learning_rate": 0.003299549547477601, "loss": 7.766, "step": 647000 }, { "epoch": 2.6357358112780567, "eval_MaskedAccuracy": 0.502821646416057, "eval_loss": 1.6343514919281006, "eval_runtime": 153.2288, "eval_samples_per_second": 414.256, "eval_steps_per_second": 1.618, "step": 647000 }, { "epoch": 2.636143189301438, "grad_norm": 5.536612033843994, "learning_rate": 0.003299079266410749, "loss": 7.7294, "step": 647100 }, { "epoch": 2.6365505673248197, "grad_norm": 3.7842025756835938, "learning_rate": 0.0032986089539496946, "loss": 7.7352, "step": 647200 }, { "epoch": 2.6369579453482013, "grad_norm": 3.101235866546631, "learning_rate": 0.0032981386101130347, "loss": 7.7491, "step": 647300 }, { "epoch": 2.637365323371583, "grad_norm": 4.833268165588379, "learning_rate": 0.0032976682349193627, "loss": 7.7601, "step": 647400 }, { "epoch": 2.6377727013949643, "grad_norm": 3.6589972972869873, "learning_rate": 0.0032971978283872776, "loss": 7.7311, "step": 647500 }, { "epoch": 2.638180079418346, "grad_norm": 3.490777015686035, "learning_rate": 0.003296727390535375, "loss": 7.7495, "step": 647600 }, { "epoch": 2.638587457441727, "grad_norm": 2.925628900527954, "learning_rate": 0.0032962569213822487, "loss": 7.7641, "step": 647700 }, { "epoch": 2.6389948354651085, "grad_norm": 2.6034014225006104, "learning_rate": 0.0032957864209465088, "loss": 7.7402, "step": 647800 }, { "epoch": 2.63940221348849, "grad_norm": 3.1383039951324463, "learning_rate": 0.003295315889246752, "loss": 7.7217, "step": 647900 }, { "epoch": 2.6398095915118716, "grad_norm": 2.6625351905822754, "learning_rate": 0.0032948453263015784, "loss": 7.7506, "step": 648000 }, { "epoch": 2.6398095915118716, "eval_MaskedAccuracy": 0.5032627812556223, "eval_loss": 1.633973240852356, "eval_runtime": 157.4257, "eval_samples_per_second": 403.212, "eval_steps_per_second": 1.575, "step": 648000 }, { "epoch": 2.640216969535253, "grad_norm": 4.2527031898498535, "learning_rate": 0.0032943747321295984, "loss": 7.7224, "step": 648100 }, { "epoch": 2.640624347558634, "grad_norm": 4.225713729858398, "learning_rate": 0.0032939041067494117, "loss": 7.7487, "step": 648200 }, { "epoch": 2.6410317255820157, "grad_norm": 4.850101947784424, "learning_rate": 0.0032934334501796268, "loss": 7.7383, "step": 648300 }, { "epoch": 2.6414391036053972, "grad_norm": 3.3083293437957764, "learning_rate": 0.003292962762438849, "loss": 7.7351, "step": 648400 }, { "epoch": 2.6418464816287788, "grad_norm": 5.9677042961120605, "learning_rate": 0.0032924920435456894, "loss": 7.7441, "step": 648500 }, { "epoch": 2.6422538596521603, "grad_norm": 4.312749862670898, "learning_rate": 0.0032920212935187584, "loss": 7.7333, "step": 648600 }, { "epoch": 2.642661237675542, "grad_norm": 4.137760162353516, "learning_rate": 0.0032915505123766687, "loss": 7.734, "step": 648700 }, { "epoch": 2.6430686156989234, "grad_norm": 4.529685974121094, "learning_rate": 0.003291079700138029, "loss": 7.7682, "step": 648800 }, { "epoch": 2.643475993722305, "grad_norm": 2.5717289447784424, "learning_rate": 0.003290608856821454, "loss": 7.7284, "step": 648900 }, { "epoch": 2.643883371745686, "grad_norm": 7.729617595672607, "learning_rate": 0.0032901379824455566, "loss": 7.7786, "step": 649000 }, { "epoch": 2.643883371745686, "eval_MaskedAccuracy": 0.5025726633766079, "eval_loss": 1.6284892559051514, "eval_runtime": 158.623, "eval_samples_per_second": 400.169, "eval_steps_per_second": 1.563, "step": 649000 }, { "epoch": 2.6442907497690675, "grad_norm": 5.148902416229248, "learning_rate": 0.0032896670770289588, "loss": 7.7386, "step": 649100 }, { "epoch": 2.644698127792449, "grad_norm": 3.575874090194702, "learning_rate": 0.003289196140590273, "loss": 7.7559, "step": 649200 }, { "epoch": 2.6451055058158306, "grad_norm": 4.578037738800049, "learning_rate": 0.003288725173148119, "loss": 7.7321, "step": 649300 }, { "epoch": 2.645512883839212, "grad_norm": 3.5895893573760986, "learning_rate": 0.003288254174721118, "loss": 7.7578, "step": 649400 }, { "epoch": 2.645920261862593, "grad_norm": 2.84005069732666, "learning_rate": 0.003287783145327894, "loss": 7.7407, "step": 649500 }, { "epoch": 2.6463276398859747, "grad_norm": 2.76251482963562, "learning_rate": 0.003287312084987063, "loss": 7.7272, "step": 649600 }, { "epoch": 2.6467350179093563, "grad_norm": 5.468085289001465, "learning_rate": 0.003286840993717252, "loss": 7.7274, "step": 649700 }, { "epoch": 2.647142395932738, "grad_norm": 4.382936954498291, "learning_rate": 0.003286369871537083, "loss": 7.7303, "step": 649800 }, { "epoch": 2.6475497739561193, "grad_norm": 5.057107448577881, "learning_rate": 0.0032858987184651816, "loss": 7.7261, "step": 649900 }, { "epoch": 2.647957151979501, "grad_norm": 3.6791536808013916, "learning_rate": 0.0032854275345201777, "loss": 7.7577, "step": 650000 }, { "epoch": 2.647957151979501, "eval_MaskedAccuracy": 0.5033928448150526, "eval_loss": 1.6409040689468384, "eval_runtime": 168.8179, "eval_samples_per_second": 376.003, "eval_steps_per_second": 1.469, "step": 650000 }, { "epoch": 2.6483645300028824, "grad_norm": 4.990233898162842, "learning_rate": 0.0032849563197207016, "loss": 7.7639, "step": 650100 }, { "epoch": 2.6487719080262635, "grad_norm": 3.060655117034912, "learning_rate": 0.003284485074085381, "loss": 7.7528, "step": 650200 }, { "epoch": 2.649179286049645, "grad_norm": 4.236000061035156, "learning_rate": 0.0032840137976328453, "loss": 7.7395, "step": 650300 }, { "epoch": 2.6495866640730266, "grad_norm": 4.522440433502197, "learning_rate": 0.0032835424903817283, "loss": 7.7816, "step": 650400 }, { "epoch": 2.649994042096408, "grad_norm": 3.9909865856170654, "learning_rate": 0.0032830711523506666, "loss": 7.7378, "step": 650500 }, { "epoch": 2.6504014201197896, "grad_norm": 3.386911392211914, "learning_rate": 0.003282599783558287, "loss": 7.7474, "step": 650600 }, { "epoch": 2.6508087981431707, "grad_norm": 2.55659818649292, "learning_rate": 0.0032821283840232278, "loss": 7.7695, "step": 650700 }, { "epoch": 2.6512161761665523, "grad_norm": 5.9968061447143555, "learning_rate": 0.0032816569537641284, "loss": 7.7578, "step": 650800 }, { "epoch": 2.651623554189934, "grad_norm": 6.817568302154541, "learning_rate": 0.00328118549279963, "loss": 7.7295, "step": 650900 }, { "epoch": 2.6520309322133153, "grad_norm": 9.927818298339844, "learning_rate": 0.0032807140011483628, "loss": 7.7438, "step": 651000 }, { "epoch": 2.6520309322133153, "eval_MaskedAccuracy": 0.5034037926576029, "eval_loss": 1.6238354444503784, "eval_runtime": 161.4741, "eval_samples_per_second": 393.103, "eval_steps_per_second": 1.536, "step": 651000 }, { "epoch": 2.652438310236697, "grad_norm": 4.887489318847656, "learning_rate": 0.003280242478828971, "loss": 7.7562, "step": 651100 }, { "epoch": 2.6528456882600784, "grad_norm": 6.656474590301514, "learning_rate": 0.003279770925860098, "loss": 7.7341, "step": 651200 }, { "epoch": 2.65325306628346, "grad_norm": 4.37863302230835, "learning_rate": 0.003279299342260389, "loss": 7.7249, "step": 651300 }, { "epoch": 2.6536604443068414, "grad_norm": 3.029027223587036, "learning_rate": 0.003278827728048482, "loss": 7.7519, "step": 651400 }, { "epoch": 2.6540678223302225, "grad_norm": 5.147206783294678, "learning_rate": 0.0032783560832430284, "loss": 7.736, "step": 651500 }, { "epoch": 2.654475200353604, "grad_norm": 4.042923450469971, "learning_rate": 0.0032778844078626738, "loss": 7.7345, "step": 651600 }, { "epoch": 2.6548825783769856, "grad_norm": 5.513915061950684, "learning_rate": 0.0032774127019260637, "loss": 7.7615, "step": 651700 }, { "epoch": 2.655289956400367, "grad_norm": 3.6763839721679688, "learning_rate": 0.00327694096545185, "loss": 7.7258, "step": 651800 }, { "epoch": 2.6556973344237487, "grad_norm": 2.0926759243011475, "learning_rate": 0.0032764691984586824, "loss": 7.7838, "step": 651900 }, { "epoch": 2.6561047124471298, "grad_norm": 8.465835571289062, "learning_rate": 0.003275997400965211, "loss": 7.7634, "step": 652000 }, { "epoch": 2.6561047124471298, "eval_MaskedAccuracy": 0.5026738480989897, "eval_loss": 1.6305789947509766, "eval_runtime": 158.411, "eval_samples_per_second": 400.705, "eval_steps_per_second": 1.566, "step": 652000 }, { "epoch": 2.6565120904705113, "grad_norm": 4.270816326141357, "learning_rate": 0.003275525572990089, "loss": 7.7325, "step": 652100 }, { "epoch": 2.656919468493893, "grad_norm": 3.6876659393310547, "learning_rate": 0.00327505371455197, "loss": 7.7261, "step": 652200 }, { "epoch": 2.6573268465172744, "grad_norm": 3.0237393379211426, "learning_rate": 0.0032745818256695093, "loss": 7.7233, "step": 652300 }, { "epoch": 2.657734224540656, "grad_norm": 4.164248943328857, "learning_rate": 0.0032741099063613587, "loss": 7.7374, "step": 652400 }, { "epoch": 2.6581416025640374, "grad_norm": 5.732575416564941, "learning_rate": 0.003273637956646182, "loss": 7.7519, "step": 652500 }, { "epoch": 2.658548980587419, "grad_norm": 3.6276843547821045, "learning_rate": 0.0032731659765426343, "loss": 7.7491, "step": 652600 }, { "epoch": 2.6589563586108, "grad_norm": 3.634322166442871, "learning_rate": 0.0032726939660693787, "loss": 7.744, "step": 652700 }, { "epoch": 2.6593637366341816, "grad_norm": 6.3059563636779785, "learning_rate": 0.003272221925245076, "loss": 7.7371, "step": 652800 }, { "epoch": 2.659771114657563, "grad_norm": 6.109255313873291, "learning_rate": 0.00327174985408839, "loss": 7.746, "step": 652900 }, { "epoch": 2.6601784926809446, "grad_norm": 6.7755208015441895, "learning_rate": 0.0032712777526179783, "loss": 7.7462, "step": 653000 }, { "epoch": 2.6601784926809446, "eval_MaskedAccuracy": 0.5039142799604575, "eval_loss": 1.636847734451294, "eval_runtime": 158.8607, "eval_samples_per_second": 399.57, "eval_steps_per_second": 1.561, "step": 653000 }, { "epoch": 2.660585870704326, "grad_norm": 4.314255237579346, "learning_rate": 0.003270805620852508, "loss": 7.7527, "step": 653100 }, { "epoch": 2.6609932487277073, "grad_norm": 3.098864793777466, "learning_rate": 0.003270333458810645, "loss": 7.7721, "step": 653200 }, { "epoch": 2.661400626751089, "grad_norm": 3.9847843647003174, "learning_rate": 0.003269861266511057, "loss": 7.7402, "step": 653300 }, { "epoch": 2.6618080047744703, "grad_norm": 12.476365089416504, "learning_rate": 0.003269389043972412, "loss": 7.7257, "step": 653400 }, { "epoch": 2.662215382797852, "grad_norm": 2.452924966812134, "learning_rate": 0.003268916791213377, "loss": 7.7618, "step": 653500 }, { "epoch": 2.6626227608212334, "grad_norm": 4.939070224761963, "learning_rate": 0.00326844450825263, "loss": 7.7161, "step": 653600 }, { "epoch": 2.663030138844615, "grad_norm": 4.919296741485596, "learning_rate": 0.003267972195108836, "loss": 7.7501, "step": 653700 }, { "epoch": 2.6634375168679965, "grad_norm": 2.7469310760498047, "learning_rate": 0.003267499851800673, "loss": 7.7293, "step": 653800 }, { "epoch": 2.663844894891378, "grad_norm": 4.047046184539795, "learning_rate": 0.003267027478346811, "loss": 7.7394, "step": 653900 }, { "epoch": 2.664252272914759, "grad_norm": 3.699885368347168, "learning_rate": 0.0032665550747659282, "loss": 7.7607, "step": 654000 }, { "epoch": 2.664252272914759, "eval_MaskedAccuracy": 0.5038251701037562, "eval_loss": 1.6308118104934692, "eval_runtime": 156.6225, "eval_samples_per_second": 405.28, "eval_steps_per_second": 1.583, "step": 654000 }, { "epoch": 2.6646596509381406, "grad_norm": 2.383355140686035, "learning_rate": 0.003266082641076702, "loss": 7.721, "step": 654100 }, { "epoch": 2.665067028961522, "grad_norm": 5.599305152893066, "learning_rate": 0.003265610177297807, "loss": 7.7281, "step": 654200 }, { "epoch": 2.6654744069849037, "grad_norm": 9.50638484954834, "learning_rate": 0.003265137683447925, "loss": 7.7655, "step": 654300 }, { "epoch": 2.665881785008285, "grad_norm": 5.803840637207031, "learning_rate": 0.0032646651595457307, "loss": 7.7634, "step": 654400 }, { "epoch": 2.6662891630316663, "grad_norm": 2.774467945098877, "learning_rate": 0.003264192605609913, "loss": 7.7026, "step": 654500 }, { "epoch": 2.666696541055048, "grad_norm": 3.172819137573242, "learning_rate": 0.003263720021659151, "loss": 7.7291, "step": 654600 }, { "epoch": 2.6671039190784294, "grad_norm": 6.5416975021362305, "learning_rate": 0.003263247407712128, "loss": 7.7583, "step": 654700 }, { "epoch": 2.667511297101811, "grad_norm": 2.694756507873535, "learning_rate": 0.003262774763787526, "loss": 7.7231, "step": 654800 }, { "epoch": 2.6679186751251924, "grad_norm": 7.714715480804443, "learning_rate": 0.003262302089904039, "loss": 7.7305, "step": 654900 }, { "epoch": 2.668326053148574, "grad_norm": 3.5213520526885986, "learning_rate": 0.0032618293860803473, "loss": 7.7225, "step": 655000 }, { "epoch": 2.668326053148574, "eval_MaskedAccuracy": 0.5033319548576939, "eval_loss": 1.6350668668746948, "eval_runtime": 157.4558, "eval_samples_per_second": 403.135, "eval_steps_per_second": 1.575, "step": 655000 }, { "epoch": 2.6687334311719555, "grad_norm": 6.717520713806152, "learning_rate": 0.0032613566523351446, "loss": 7.7301, "step": 655100 }, { "epoch": 2.6691408091953366, "grad_norm": 1.9514811038970947, "learning_rate": 0.003260883888687118, "loss": 7.7179, "step": 655200 }, { "epoch": 2.669548187218718, "grad_norm": 6.654797077178955, "learning_rate": 0.0032604110951549594, "loss": 7.733, "step": 655300 }, { "epoch": 2.6699555652420996, "grad_norm": 3.4241044521331787, "learning_rate": 0.0032599382717573597, "loss": 7.7225, "step": 655400 }, { "epoch": 2.670362943265481, "grad_norm": 3.2960762977600098, "learning_rate": 0.003259465418513011, "loss": 7.7223, "step": 655500 }, { "epoch": 2.6707703212888627, "grad_norm": 4.2361674308776855, "learning_rate": 0.0032589925354406094, "loss": 7.741, "step": 655600 }, { "epoch": 2.671177699312244, "grad_norm": 4.492542266845703, "learning_rate": 0.0032585196225588483, "loss": 7.7582, "step": 655700 }, { "epoch": 2.6715850773356253, "grad_norm": 3.8159728050231934, "learning_rate": 0.003258046679886431, "loss": 7.7834, "step": 655800 }, { "epoch": 2.671992455359007, "grad_norm": 8.345585823059082, "learning_rate": 0.0032575737074420495, "loss": 7.7202, "step": 655900 }, { "epoch": 2.6723998333823884, "grad_norm": 6.983959674835205, "learning_rate": 0.0032571007052444047, "loss": 7.7417, "step": 656000 }, { "epoch": 2.6723998333823884, "eval_MaskedAccuracy": 0.5041212221981818, "eval_loss": 1.6257683038711548, "eval_runtime": 167.801, "eval_samples_per_second": 378.281, "eval_steps_per_second": 1.478, "step": 656000 }, { "epoch": 2.67280721140577, "grad_norm": 3.0840566158294678, "learning_rate": 0.0032566276733121966, "loss": 7.7297, "step": 656100 }, { "epoch": 2.6732145894291515, "grad_norm": 5.3380279541015625, "learning_rate": 0.0032561546116641263, "loss": 7.7449, "step": 656200 }, { "epoch": 2.673621967452533, "grad_norm": 6.754693508148193, "learning_rate": 0.003255681520318896, "loss": 7.7088, "step": 656300 }, { "epoch": 2.6740293454759145, "grad_norm": 5.879538059234619, "learning_rate": 0.0032552083992952096, "loss": 7.74, "step": 656400 }, { "epoch": 2.6744367234992956, "grad_norm": 5.780153274536133, "learning_rate": 0.0032547352486117747, "loss": 7.7345, "step": 656500 }, { "epoch": 2.674844101522677, "grad_norm": 4.357320785522461, "learning_rate": 0.0032542620682872946, "loss": 7.7305, "step": 656600 }, { "epoch": 2.6752514795460587, "grad_norm": 3.5099992752075195, "learning_rate": 0.003253788858340474, "loss": 7.7745, "step": 656700 }, { "epoch": 2.67565885756944, "grad_norm": 6.600289344787598, "learning_rate": 0.003253315618790031, "loss": 7.738, "step": 656800 }, { "epoch": 2.6760662355928218, "grad_norm": 10.957653999328613, "learning_rate": 0.00325284234965467, "loss": 7.7517, "step": 656900 }, { "epoch": 2.676473613616203, "grad_norm": 4.545962333679199, "learning_rate": 0.0032523690509530974, "loss": 7.7583, "step": 657000 }, { "epoch": 2.676473613616203, "eval_MaskedAccuracy": 0.5034654201527211, "eval_loss": 1.6379927396774292, "eval_runtime": 226.1231, "eval_samples_per_second": 280.714, "eval_steps_per_second": 1.097, "step": 657000 }, { "epoch": 2.6768809916395844, "grad_norm": 2.678373336791992, "learning_rate": 0.003251895722704028, "loss": 7.7635, "step": 657100 }, { "epoch": 2.677288369662966, "grad_norm": 5.181834697723389, "learning_rate": 0.0032514223649261777, "loss": 7.7028, "step": 657200 }, { "epoch": 2.6776957476863474, "grad_norm": 9.015517234802246, "learning_rate": 0.0032509489776382564, "loss": 7.7647, "step": 657300 }, { "epoch": 2.678103125709729, "grad_norm": 4.9846720695495605, "learning_rate": 0.003250475560858984, "loss": 7.7519, "step": 657400 }, { "epoch": 2.6785105037331105, "grad_norm": 4.39346981048584, "learning_rate": 0.0032500021146070752, "loss": 7.7089, "step": 657500 }, { "epoch": 2.678917881756492, "grad_norm": 6.527012825012207, "learning_rate": 0.003249528638901249, "loss": 7.751, "step": 657600 }, { "epoch": 2.679325259779873, "grad_norm": 5.306674003601074, "learning_rate": 0.003249055133760224, "loss": 7.7623, "step": 657700 }, { "epoch": 2.6797326378032547, "grad_norm": 2.8503808975219727, "learning_rate": 0.0032485815992027212, "loss": 7.7665, "step": 657800 }, { "epoch": 2.680140015826636, "grad_norm": 5.299534797668457, "learning_rate": 0.003248108035247459, "loss": 7.7183, "step": 657900 }, { "epoch": 2.6805473938500177, "grad_norm": 5.834193706512451, "learning_rate": 0.0032476344419131633, "loss": 7.7479, "step": 658000 }, { "epoch": 2.6805473938500177, "eval_MaskedAccuracy": 0.5036503069933103, "eval_loss": 1.630007028579712, "eval_runtime": 175.1958, "eval_samples_per_second": 362.315, "eval_steps_per_second": 1.416, "step": 658000 }, { "epoch": 2.6809547718733993, "grad_norm": 3.148181438446045, "learning_rate": 0.003247160819218553, "loss": 7.7387, "step": 658100 }, { "epoch": 2.6813621498967803, "grad_norm": 4.28448486328125, "learning_rate": 0.0032466871671823585, "loss": 7.7497, "step": 658200 }, { "epoch": 2.681769527920162, "grad_norm": 6.779094696044922, "learning_rate": 0.0032462134858233034, "loss": 7.7273, "step": 658300 }, { "epoch": 2.6821769059435434, "grad_norm": 3.783496856689453, "learning_rate": 0.003245739775160116, "loss": 7.6995, "step": 658400 }, { "epoch": 2.682584283966925, "grad_norm": 2.3883776664733887, "learning_rate": 0.003245266035211523, "loss": 7.7321, "step": 658500 }, { "epoch": 2.6829916619903065, "grad_norm": 9.007153511047363, "learning_rate": 0.003244792265996254, "loss": 7.7263, "step": 658600 }, { "epoch": 2.683399040013688, "grad_norm": 4.153413772583008, "learning_rate": 0.0032443184675330404, "loss": 7.7335, "step": 658700 }, { "epoch": 2.6838064180370695, "grad_norm": 5.07761287689209, "learning_rate": 0.0032438446398406133, "loss": 7.7478, "step": 658800 }, { "epoch": 2.684213796060451, "grad_norm": 5.622042655944824, "learning_rate": 0.0032433707829377044, "loss": 7.7436, "step": 658900 }, { "epoch": 2.684621174083832, "grad_norm": 2.3242077827453613, "learning_rate": 0.0032428968968430507, "loss": 7.7153, "step": 659000 }, { "epoch": 2.684621174083832, "eval_MaskedAccuracy": 0.5030991623659978, "eval_loss": 1.6299339532852173, "eval_runtime": 160.5434, "eval_samples_per_second": 395.382, "eval_steps_per_second": 1.545, "step": 659000 }, { "epoch": 2.6850285521072137, "grad_norm": 3.4865944385528564, "learning_rate": 0.003242422981575384, "loss": 7.7329, "step": 659100 }, { "epoch": 2.6854359301305952, "grad_norm": 2.9826807975769043, "learning_rate": 0.0032419490371534436, "loss": 7.7384, "step": 659200 }, { "epoch": 2.6858433081539768, "grad_norm": 6.728859901428223, "learning_rate": 0.003241475063595969, "loss": 7.739, "step": 659300 }, { "epoch": 2.6862506861773583, "grad_norm": 3.132185697555542, "learning_rate": 0.0032410010609216972, "loss": 7.7441, "step": 659400 }, { "epoch": 2.6866580642007394, "grad_norm": 4.959762096405029, "learning_rate": 0.003240527029149367, "loss": 7.7033, "step": 659500 }, { "epoch": 2.687065442224121, "grad_norm": 3.4063496589660645, "learning_rate": 0.003240052968297717, "loss": 7.7615, "step": 659600 }, { "epoch": 2.6874728202475024, "grad_norm": 10.797226905822754, "learning_rate": 0.0032395788783854945, "loss": 7.7321, "step": 659700 }, { "epoch": 2.687880198270884, "grad_norm": 11.4490385055542, "learning_rate": 0.003239104759431438, "loss": 7.7274, "step": 659800 }, { "epoch": 2.6882875762942655, "grad_norm": 3.827448844909668, "learning_rate": 0.0032386306114542927, "loss": 7.7157, "step": 659900 }, { "epoch": 2.688694954317647, "grad_norm": 3.8741378784179688, "learning_rate": 0.003238156434472808, "loss": 7.7169, "step": 660000 }, { "epoch": 2.688694954317647, "eval_MaskedAccuracy": 0.503351968900708, "eval_loss": 1.6341768503189087, "eval_runtime": 159.4772, "eval_samples_per_second": 398.026, "eval_steps_per_second": 1.555, "step": 660000 }, { "epoch": 2.6891023323410286, "grad_norm": 106.6436538696289, "learning_rate": 0.003237682228505726, "loss": 7.7472, "step": 660100 }, { "epoch": 2.6895097103644097, "grad_norm": 4.269049167633057, "learning_rate": 0.003237207993571798, "loss": 7.7843, "step": 660200 }, { "epoch": 2.689917088387791, "grad_norm": 6.576282501220703, "learning_rate": 0.0032367337296897707, "loss": 7.7143, "step": 660300 }, { "epoch": 2.6903244664111727, "grad_norm": 9.505963325500488, "learning_rate": 0.003236259436878399, "loss": 7.7229, "step": 660400 }, { "epoch": 2.6907318444345543, "grad_norm": 6.678642749786377, "learning_rate": 0.003235785115156429, "loss": 7.7638, "step": 660500 }, { "epoch": 2.691139222457936, "grad_norm": 3.550166606903076, "learning_rate": 0.003235310764542615, "loss": 7.7644, "step": 660600 }, { "epoch": 2.691546600481317, "grad_norm": 10.332160949707031, "learning_rate": 0.0032348363850557113, "loss": 7.754, "step": 660700 }, { "epoch": 2.6919539785046984, "grad_norm": 4.420124530792236, "learning_rate": 0.00323436197671447, "loss": 7.7474, "step": 660800 }, { "epoch": 2.69236135652808, "grad_norm": 12.303844451904297, "learning_rate": 0.0032338875395376467, "loss": 7.7464, "step": 660900 }, { "epoch": 2.6927687345514615, "grad_norm": 10.578495979309082, "learning_rate": 0.003233413073544003, "loss": 7.7469, "step": 661000 }, { "epoch": 2.6927687345514615, "eval_MaskedAccuracy": 0.5034515531390799, "eval_loss": 1.6318436861038208, "eval_runtime": 148.7141, "eval_samples_per_second": 426.832, "eval_steps_per_second": 1.668, "step": 661000 }, { "epoch": 2.693176112574843, "grad_norm": 3.2199487686157227, "learning_rate": 0.0032329385787522934, "loss": 7.7408, "step": 661100 }, { "epoch": 2.6935834905982246, "grad_norm": 9.546009063720703, "learning_rate": 0.0032324640551812827, "loss": 7.7545, "step": 661200 }, { "epoch": 2.693990868621606, "grad_norm": 2.9559309482574463, "learning_rate": 0.0032319895028497226, "loss": 7.7454, "step": 661300 }, { "epoch": 2.6943982466449876, "grad_norm": 3.469130754470825, "learning_rate": 0.0032315149217763822, "loss": 7.7414, "step": 661400 }, { "epoch": 2.6948056246683687, "grad_norm": 4.411319255828857, "learning_rate": 0.003231040311980017, "loss": 7.7061, "step": 661500 }, { "epoch": 2.6952130026917502, "grad_norm": 3.295698881149292, "learning_rate": 0.003230565673479398, "loss": 7.7346, "step": 661600 }, { "epoch": 2.6956203807151318, "grad_norm": 10.868988037109375, "learning_rate": 0.0032300910062932893, "loss": 7.7283, "step": 661700 }, { "epoch": 2.6960277587385133, "grad_norm": 9.993134498596191, "learning_rate": 0.0032296163104404514, "loss": 7.7633, "step": 661800 }, { "epoch": 2.696435136761895, "grad_norm": 8.998684883117676, "learning_rate": 0.0032291415859396605, "loss": 7.7485, "step": 661900 }, { "epoch": 2.696842514785276, "grad_norm": 3.220615863800049, "learning_rate": 0.003228666832809676, "loss": 7.7697, "step": 662000 }, { "epoch": 2.696842514785276, "eval_MaskedAccuracy": 0.5040962974919867, "eval_loss": 1.6281524896621704, "eval_runtime": 148.3752, "eval_samples_per_second": 427.807, "eval_steps_per_second": 1.671, "step": 662000 }, { "epoch": 2.6972498928086575, "grad_norm": 3.631373643875122, "learning_rate": 0.003228192051069267, "loss": 7.7254, "step": 662100 }, { "epoch": 2.697657270832039, "grad_norm": 5.53477144241333, "learning_rate": 0.0032277172407372146, "loss": 7.7484, "step": 662200 }, { "epoch": 2.6980646488554205, "grad_norm": 5.845667362213135, "learning_rate": 0.0032272424018322814, "loss": 7.7308, "step": 662300 }, { "epoch": 2.698472026878802, "grad_norm": 3.323057174682617, "learning_rate": 0.003226767534373242, "loss": 7.724, "step": 662400 }, { "epoch": 2.6988794049021836, "grad_norm": 2.7929320335388184, "learning_rate": 0.0032262926383788753, "loss": 7.7006, "step": 662500 }, { "epoch": 2.699286782925565, "grad_norm": 7.5049896240234375, "learning_rate": 0.003225817713867954, "loss": 7.73, "step": 662600 }, { "epoch": 2.699694160948946, "grad_norm": 4.794573783874512, "learning_rate": 0.0032253427608592514, "loss": 7.7093, "step": 662700 }, { "epoch": 2.7001015389723277, "grad_norm": 4.7356696128845215, "learning_rate": 0.003224867779371547, "loss": 7.7181, "step": 662800 }, { "epoch": 2.7005089169957093, "grad_norm": 2.7196238040924072, "learning_rate": 0.0032243927694236185, "loss": 7.7176, "step": 662900 }, { "epoch": 2.700916295019091, "grad_norm": 5.0205159187316895, "learning_rate": 0.003223917731034248, "loss": 7.7623, "step": 663000 }, { "epoch": 2.700916295019091, "eval_MaskedAccuracy": 0.5038521830246014, "eval_loss": 1.6336082220077515, "eval_runtime": 149.6745, "eval_samples_per_second": 424.093, "eval_steps_per_second": 1.657, "step": 663000 }, { "epoch": 2.7013236730424723, "grad_norm": 3.996946334838867, "learning_rate": 0.0032234426642222137, "loss": 7.7267, "step": 663100 }, { "epoch": 2.7017310510658534, "grad_norm": 3.2087459564208984, "learning_rate": 0.0032229675690062997, "loss": 7.7654, "step": 663200 }, { "epoch": 2.702138429089235, "grad_norm": 3.582521438598633, "learning_rate": 0.0032224924454052855, "loss": 7.7521, "step": 663300 }, { "epoch": 2.7025458071126165, "grad_norm": 7.063210964202881, "learning_rate": 0.0032220172934379556, "loss": 7.7552, "step": 663400 }, { "epoch": 2.702953185135998, "grad_norm": 7.65894079208374, "learning_rate": 0.003221542113123099, "loss": 7.738, "step": 663500 }, { "epoch": 2.7033605631593796, "grad_norm": 5.349039077758789, "learning_rate": 0.0032210669044795, "loss": 7.7524, "step": 663600 }, { "epoch": 2.703767941182761, "grad_norm": 4.374447822570801, "learning_rate": 0.003220591667525946, "loss": 7.7433, "step": 663700 }, { "epoch": 2.7041753192061426, "grad_norm": 4.791806221008301, "learning_rate": 0.0032201164022812273, "loss": 7.7792, "step": 663800 }, { "epoch": 2.704582697229524, "grad_norm": 8.21108627319336, "learning_rate": 0.003219641108764132, "loss": 7.7342, "step": 663900 }, { "epoch": 2.7049900752529052, "grad_norm": 3.945631742477417, "learning_rate": 0.003219165786993452, "loss": 7.709, "step": 664000 }, { "epoch": 2.7049900752529052, "eval_MaskedAccuracy": 0.5037033042070721, "eval_loss": 1.6292518377304077, "eval_runtime": 149.5575, "eval_samples_per_second": 424.425, "eval_steps_per_second": 1.658, "step": 664000 }, { "epoch": 2.705397453276287, "grad_norm": 3.24385142326355, "learning_rate": 0.003218690436987978, "loss": 7.7348, "step": 664100 }, { "epoch": 2.7058048312996683, "grad_norm": 7.810568332672119, "learning_rate": 0.0032182150587665064, "loss": 7.7509, "step": 664200 }, { "epoch": 2.70621220932305, "grad_norm": 9.91736888885498, "learning_rate": 0.0032177396523478292, "loss": 7.7213, "step": 664300 }, { "epoch": 2.7066195873464314, "grad_norm": 8.979564666748047, "learning_rate": 0.0032172642177507385, "loss": 7.7312, "step": 664400 }, { "epoch": 2.7070269653698125, "grad_norm": 3.256312370300293, "learning_rate": 0.0032167887549940355, "loss": 7.7632, "step": 664500 }, { "epoch": 2.707434343393194, "grad_norm": 6.67056131362915, "learning_rate": 0.0032163132640965165, "loss": 7.7438, "step": 664600 }, { "epoch": 2.7078417214165755, "grad_norm": 3.1909127235412598, "learning_rate": 0.003215837745076978, "loss": 7.733, "step": 664700 }, { "epoch": 2.708249099439957, "grad_norm": 4.2988715171813965, "learning_rate": 0.0032153621979542223, "loss": 7.7362, "step": 664800 }, { "epoch": 2.7086564774633386, "grad_norm": 3.895310878753662, "learning_rate": 0.00321488662274705, "loss": 7.7339, "step": 664900 }, { "epoch": 2.70906385548672, "grad_norm": 6.113119125366211, "learning_rate": 0.0032144110194742624, "loss": 7.7264, "step": 665000 }, { "epoch": 2.70906385548672, "eval_MaskedAccuracy": 0.503768846450946, "eval_loss": 1.6280908584594727, "eval_runtime": 148.2173, "eval_samples_per_second": 428.263, "eval_steps_per_second": 1.673, "step": 665000 }, { "epoch": 2.7094712335101017, "grad_norm": 4.171345233917236, "learning_rate": 0.0032139353881546636, "loss": 7.7229, "step": 665100 }, { "epoch": 2.7098786115334828, "grad_norm": 6.491979598999023, "learning_rate": 0.0032134597288070556, "loss": 7.7482, "step": 665200 }, { "epoch": 2.7102859895568643, "grad_norm": 10.674572944641113, "learning_rate": 0.0032129840414502463, "loss": 7.7533, "step": 665300 }, { "epoch": 2.710693367580246, "grad_norm": 5.564167022705078, "learning_rate": 0.00321250832610304, "loss": 7.7534, "step": 665400 }, { "epoch": 2.7111007456036273, "grad_norm": 7.038264274597168, "learning_rate": 0.0032120325827842463, "loss": 7.7142, "step": 665500 }, { "epoch": 2.711508123627009, "grad_norm": 2.9987425804138184, "learning_rate": 0.0032115568115126743, "loss": 7.7232, "step": 665600 }, { "epoch": 2.71191550165039, "grad_norm": 3.360598087310791, "learning_rate": 0.0032110810123071308, "loss": 7.7358, "step": 665700 }, { "epoch": 2.7123228796737715, "grad_norm": 11.335540771484375, "learning_rate": 0.0032106051851864313, "loss": 7.7117, "step": 665800 }, { "epoch": 2.712730257697153, "grad_norm": 3.6695189476013184, "learning_rate": 0.0032101293301693866, "loss": 7.758, "step": 665900 }, { "epoch": 2.7131376357205346, "grad_norm": 5.335036277770996, "learning_rate": 0.0032096534472748055, "loss": 7.6919, "step": 666000 }, { "epoch": 2.7131376357205346, "eval_MaskedAccuracy": 0.5040659751029255, "eval_loss": 1.628699541091919, "eval_runtime": 150.9461, "eval_samples_per_second": 420.521, "eval_steps_per_second": 1.643, "step": 666000 }, { "epoch": 2.713545013743916, "grad_norm": 3.3608007431030273, "learning_rate": 0.003209177536521505, "loss": 7.7517, "step": 666100 }, { "epoch": 2.7139523917672976, "grad_norm": 2.3203938007354736, "learning_rate": 0.003208701597928301, "loss": 7.7089, "step": 666200 }, { "epoch": 2.714359769790679, "grad_norm": 2.199331283569336, "learning_rate": 0.0032082256315140063, "loss": 7.7258, "step": 666300 }, { "epoch": 2.7147671478140607, "grad_norm": 5.572924613952637, "learning_rate": 0.0032077496372974394, "loss": 7.7371, "step": 666400 }, { "epoch": 2.715174525837442, "grad_norm": 5.677365303039551, "learning_rate": 0.003207273615297422, "loss": 7.7777, "step": 666500 }, { "epoch": 2.7155819038608233, "grad_norm": 3.157910108566284, "learning_rate": 0.0032067975655327764, "loss": 7.7324, "step": 666600 }, { "epoch": 2.715989281884205, "grad_norm": 5.038809299468994, "learning_rate": 0.0032063214880223173, "loss": 7.7197, "step": 666700 }, { "epoch": 2.7163966599075864, "grad_norm": 9.037930488586426, "learning_rate": 0.003205845382784871, "loss": 7.7173, "step": 666800 }, { "epoch": 2.716804037930968, "grad_norm": 2.4714245796203613, "learning_rate": 0.0032053692498392548, "loss": 7.7198, "step": 666900 }, { "epoch": 2.717211415954349, "grad_norm": 5.054344654083252, "learning_rate": 0.0032048930892042975, "loss": 7.7455, "step": 667000 }, { "epoch": 2.717211415954349, "eval_MaskedAccuracy": 0.5031529451686492, "eval_loss": 1.6368383169174194, "eval_runtime": 148.3692, "eval_samples_per_second": 427.825, "eval_steps_per_second": 1.672, "step": 667000 }, { "epoch": 2.7176187939777305, "grad_norm": 5.298316478729248, "learning_rate": 0.003204416900898827, "loss": 7.7285, "step": 667100 }, { "epoch": 2.718026172001112, "grad_norm": 10.430736541748047, "learning_rate": 0.0032039406849416643, "loss": 7.775, "step": 667200 }, { "epoch": 2.7184335500244936, "grad_norm": 8.048696517944336, "learning_rate": 0.003203464441351639, "loss": 7.7378, "step": 667300 }, { "epoch": 2.718840928047875, "grad_norm": 4.4530253410339355, "learning_rate": 0.0032029881701475743, "loss": 7.7577, "step": 667400 }, { "epoch": 2.7192483060712567, "grad_norm": 3.222097158432007, "learning_rate": 0.0032025118713483063, "loss": 7.7375, "step": 667500 }, { "epoch": 2.719655684094638, "grad_norm": 3.4581751823425293, "learning_rate": 0.003202035544972665, "loss": 7.7477, "step": 667600 }, { "epoch": 2.7200630621180193, "grad_norm": 4.972837448120117, "learning_rate": 0.0032015591910394808, "loss": 7.7218, "step": 667700 }, { "epoch": 2.720470440141401, "grad_norm": 6.326439380645752, "learning_rate": 0.0032010828095675858, "loss": 7.7323, "step": 667800 }, { "epoch": 2.7208778181647824, "grad_norm": 3.2040696144104004, "learning_rate": 0.0032006064005758133, "loss": 7.7626, "step": 667900 }, { "epoch": 2.721285196188164, "grad_norm": 5.238594055175781, "learning_rate": 0.003200129964082997, "loss": 7.7036, "step": 668000 }, { "epoch": 2.721285196188164, "eval_MaskedAccuracy": 0.5033877847279113, "eval_loss": 1.6351256370544434, "eval_runtime": 148.8788, "eval_samples_per_second": 426.36, "eval_steps_per_second": 1.666, "step": 668000 }, { "epoch": 2.7216925742115454, "grad_norm": 3.157193183898926, "learning_rate": 0.0031996535001079783, "loss": 7.7385, "step": 668100 }, { "epoch": 2.7220999522349265, "grad_norm": 4.541149139404297, "learning_rate": 0.00319917700866959, "loss": 7.7382, "step": 668200 }, { "epoch": 2.722507330258308, "grad_norm": 3.2123706340789795, "learning_rate": 0.0031987004897866727, "loss": 7.7155, "step": 668300 }, { "epoch": 2.7229147082816896, "grad_norm": 8.792269706726074, "learning_rate": 0.0031982239434780652, "loss": 7.7058, "step": 668400 }, { "epoch": 2.723322086305071, "grad_norm": 3.3838703632354736, "learning_rate": 0.0031977473697626054, "loss": 7.7268, "step": 668500 }, { "epoch": 2.7237294643284526, "grad_norm": 4.612702369689941, "learning_rate": 0.0031972707686591375, "loss": 7.7645, "step": 668600 }, { "epoch": 2.724136842351834, "grad_norm": 6.421452045440674, "learning_rate": 0.0031967941401865045, "loss": 7.7529, "step": 668700 }, { "epoch": 2.7245442203752157, "grad_norm": 6.113936424255371, "learning_rate": 0.0031963174843635507, "loss": 7.7221, "step": 668800 }, { "epoch": 2.7249515983985972, "grad_norm": 7.661069393157959, "learning_rate": 0.0031958408012091174, "loss": 7.7416, "step": 668900 }, { "epoch": 2.7253589764219783, "grad_norm": 3.8138558864593506, "learning_rate": 0.003195364090742052, "loss": 7.7242, "step": 669000 }, { "epoch": 2.7253589764219783, "eval_MaskedAccuracy": 0.5034439907782373, "eval_loss": 1.6119866371154785, "eval_runtime": 149.2263, "eval_samples_per_second": 425.367, "eval_steps_per_second": 1.662, "step": 669000 }, { "epoch": 2.72576635444536, "grad_norm": 5.3097734451293945, "learning_rate": 0.0031948873529812015, "loss": 7.7515, "step": 669100 }, { "epoch": 2.7261737324687414, "grad_norm": 3.244429588317871, "learning_rate": 0.0031944105879454137, "loss": 7.7197, "step": 669200 }, { "epoch": 2.726581110492123, "grad_norm": 4.060519218444824, "learning_rate": 0.0031939337956535395, "loss": 7.7415, "step": 669300 }, { "epoch": 2.7269884885155045, "grad_norm": 10.333324432373047, "learning_rate": 0.0031934569761244255, "loss": 7.6973, "step": 669400 }, { "epoch": 2.7273958665388855, "grad_norm": 4.821107387542725, "learning_rate": 0.003192980129376926, "loss": 7.7379, "step": 669500 }, { "epoch": 2.727803244562267, "grad_norm": 4.046305179595947, "learning_rate": 0.0031925032554298905, "loss": 7.7052, "step": 669600 }, { "epoch": 2.7282106225856486, "grad_norm": 3.4669225215911865, "learning_rate": 0.003192026354302181, "loss": 7.75, "step": 669700 }, { "epoch": 2.72861800060903, "grad_norm": 9.231145858764648, "learning_rate": 0.0031915494260126393, "loss": 7.7366, "step": 669800 }, { "epoch": 2.7290253786324117, "grad_norm": 2.925002098083496, "learning_rate": 0.003191072470580129, "loss": 7.7343, "step": 669900 }, { "epoch": 2.729432756655793, "grad_norm": 11.176908493041992, "learning_rate": 0.003190595488023502, "loss": 7.7285, "step": 670000 }, { "epoch": 2.729432756655793, "eval_MaskedAccuracy": 0.5037509228227985, "eval_loss": 1.6253544092178345, "eval_runtime": 148.6907, "eval_samples_per_second": 426.9, "eval_steps_per_second": 1.668, "step": 670000 }, { "epoch": 2.7298401346791747, "grad_norm": 4.042506217956543, "learning_rate": 0.003190118478361622, "loss": 7.7288, "step": 670100 }, { "epoch": 2.730247512702556, "grad_norm": 4.371574401855469, "learning_rate": 0.003189641441613343, "loss": 7.7238, "step": 670200 }, { "epoch": 2.7306548907259374, "grad_norm": 3.266829252243042, "learning_rate": 0.003189164377797526, "loss": 7.7006, "step": 670300 }, { "epoch": 2.731062268749319, "grad_norm": 9.816326141357422, "learning_rate": 0.003188687286933032, "loss": 7.7063, "step": 670400 }, { "epoch": 2.7314696467727004, "grad_norm": 3.488961935043335, "learning_rate": 0.0031882101690387248, "loss": 7.74, "step": 670500 }, { "epoch": 2.731877024796082, "grad_norm": 2.7548892498016357, "learning_rate": 0.003187733024133465, "loss": 7.7267, "step": 670600 }, { "epoch": 2.732284402819463, "grad_norm": 5.342942714691162, "learning_rate": 0.0031872558522361163, "loss": 7.7698, "step": 670700 }, { "epoch": 2.7326917808428446, "grad_norm": 2.250199556350708, "learning_rate": 0.003186778653365544, "loss": 7.7258, "step": 670800 }, { "epoch": 2.733099158866226, "grad_norm": 2.5757293701171875, "learning_rate": 0.003186301427540618, "loss": 7.7061, "step": 670900 }, { "epoch": 2.7335065368896077, "grad_norm": 3.861353874206543, "learning_rate": 0.003185824174780201, "loss": 7.7477, "step": 671000 }, { "epoch": 2.7335065368896077, "eval_MaskedAccuracy": 0.5030542218199967, "eval_loss": 1.6270012855529785, "eval_runtime": 149.4202, "eval_samples_per_second": 424.815, "eval_steps_per_second": 1.66, "step": 671000 }, { "epoch": 2.733913914912989, "grad_norm": 5.667209625244141, "learning_rate": 0.0031853468951031617, "loss": 7.7407, "step": 671100 }, { "epoch": 2.7343212929363707, "grad_norm": 6.801764011383057, "learning_rate": 0.0031848695885283713, "loss": 7.7014, "step": 671200 }, { "epoch": 2.7347286709597523, "grad_norm": 3.8669235706329346, "learning_rate": 0.0031843922550747004, "loss": 7.716, "step": 671300 }, { "epoch": 2.735136048983134, "grad_norm": 4.433848857879639, "learning_rate": 0.003183914894761019, "loss": 7.7218, "step": 671400 }, { "epoch": 2.735543427006515, "grad_norm": 3.301412582397461, "learning_rate": 0.003183437507606204, "loss": 7.7504, "step": 671500 }, { "epoch": 2.7359508050298964, "grad_norm": 4.774487018585205, "learning_rate": 0.0031829600936291286, "loss": 7.7406, "step": 671600 }, { "epoch": 2.736358183053278, "grad_norm": 4.506502628326416, "learning_rate": 0.003182482652848658, "loss": 7.7439, "step": 671700 }, { "epoch": 2.7367655610766595, "grad_norm": 2.380566358566284, "learning_rate": 0.0031820051852836776, "loss": 7.7247, "step": 671800 }, { "epoch": 2.737172939100041, "grad_norm": 2.402787685394287, "learning_rate": 0.0031815276909530617, "loss": 7.7408, "step": 671900 }, { "epoch": 2.737580317123422, "grad_norm": 4.9035491943359375, "learning_rate": 0.0031810501698756838, "loss": 7.7508, "step": 672000 }, { "epoch": 2.737580317123422, "eval_MaskedAccuracy": 0.504237015677938, "eval_loss": 1.6274861097335815, "eval_runtime": 148.0721, "eval_samples_per_second": 428.683, "eval_steps_per_second": 1.675, "step": 672000 }, { "epoch": 2.7379876951468036, "grad_norm": 4.066720962524414, "learning_rate": 0.0031805726220704295, "loss": 7.7401, "step": 672100 }, { "epoch": 2.738395073170185, "grad_norm": 7.2215142250061035, "learning_rate": 0.003180095047556176, "loss": 7.7269, "step": 672200 }, { "epoch": 2.7388024511935667, "grad_norm": 6.007561206817627, "learning_rate": 0.003179617446351804, "loss": 7.7329, "step": 672300 }, { "epoch": 2.7392098292169482, "grad_norm": 6.568578720092773, "learning_rate": 0.003179139818476196, "loss": 7.7144, "step": 672400 }, { "epoch": 2.7396172072403298, "grad_norm": 7.652806758880615, "learning_rate": 0.0031786621639482387, "loss": 7.7502, "step": 672500 }, { "epoch": 2.7400245852637113, "grad_norm": 3.3555850982666016, "learning_rate": 0.00317818448278681, "loss": 7.6896, "step": 672600 }, { "epoch": 2.7404319632870924, "grad_norm": 5.025359630584717, "learning_rate": 0.003177706775010799, "loss": 7.7452, "step": 672700 }, { "epoch": 2.740839341310474, "grad_norm": 5.02529239654541, "learning_rate": 0.0031772290406390908, "loss": 7.7531, "step": 672800 }, { "epoch": 2.7412467193338554, "grad_norm": 5.380584239959717, "learning_rate": 0.003176751279690576, "loss": 7.7413, "step": 672900 }, { "epoch": 2.741654097357237, "grad_norm": 6.14848518371582, "learning_rate": 0.003176273492184139, "loss": 7.7432, "step": 673000 }, { "epoch": 2.741654097357237, "eval_MaskedAccuracy": 0.5031924432503198, "eval_loss": 1.6348071098327637, "eval_runtime": 149.5255, "eval_samples_per_second": 424.516, "eval_steps_per_second": 1.659, "step": 673000 }, { "epoch": 2.7420614753806185, "grad_norm": 3.4233415126800537, "learning_rate": 0.003175795678138673, "loss": 7.7337, "step": 673100 }, { "epoch": 2.7424688534039996, "grad_norm": 3.1220381259918213, "learning_rate": 0.0031753178375730625, "loss": 7.7318, "step": 673200 }, { "epoch": 2.742876231427381, "grad_norm": 5.210653781890869, "learning_rate": 0.0031748399705062032, "loss": 7.7346, "step": 673300 }, { "epoch": 2.7432836094507627, "grad_norm": 3.6905648708343506, "learning_rate": 0.0031743620769569877, "loss": 7.7322, "step": 673400 }, { "epoch": 2.743690987474144, "grad_norm": 3.794119358062744, "learning_rate": 0.0031738841569443096, "loss": 7.7447, "step": 673500 }, { "epoch": 2.7440983654975257, "grad_norm": 9.346611976623535, "learning_rate": 0.0031734062104870654, "loss": 7.7172, "step": 673600 }, { "epoch": 2.7445057435209073, "grad_norm": 8.45644760131836, "learning_rate": 0.003172928237604145, "loss": 7.7499, "step": 673700 }, { "epoch": 2.744913121544289, "grad_norm": 11.964926719665527, "learning_rate": 0.0031724502383144483, "loss": 7.7178, "step": 673800 }, { "epoch": 2.7453204995676703, "grad_norm": 9.513945579528809, "learning_rate": 0.0031719722126368776, "loss": 7.726, "step": 673900 }, { "epoch": 2.7457278775910514, "grad_norm": 3.0402848720550537, "learning_rate": 0.003171494160590327, "loss": 7.7057, "step": 674000 }, { "epoch": 2.7457278775910514, "eval_MaskedAccuracy": 0.5034982886733228, "eval_loss": 1.6308186054229736, "eval_runtime": 148.9235, "eval_samples_per_second": 426.232, "eval_steps_per_second": 1.665, "step": 674000 }, { "epoch": 2.746135255614433, "grad_norm": 2.512981414794922, "learning_rate": 0.003171016082193699, "loss": 7.7149, "step": 674100 }, { "epoch": 2.7465426336378145, "grad_norm": 2.6818249225616455, "learning_rate": 0.0031705379774658936, "loss": 7.719, "step": 674200 }, { "epoch": 2.746950011661196, "grad_norm": 3.6553287506103516, "learning_rate": 0.0031700598464258103, "loss": 7.7353, "step": 674300 }, { "epoch": 2.7473573896845775, "grad_norm": 3.668513059616089, "learning_rate": 0.003169581689092354, "loss": 7.6893, "step": 674400 }, { "epoch": 2.7477647677079586, "grad_norm": 2.452850341796875, "learning_rate": 0.003169103505484428, "loss": 7.729, "step": 674500 }, { "epoch": 2.74817214573134, "grad_norm": 4.9898905754089355, "learning_rate": 0.003168625295620942, "loss": 7.7586, "step": 674600 }, { "epoch": 2.7485795237547217, "grad_norm": 3.0301427841186523, "learning_rate": 0.003168147059520797, "loss": 7.6959, "step": 674700 }, { "epoch": 2.7489869017781032, "grad_norm": 2.4823179244995117, "learning_rate": 0.0031676687972029007, "loss": 7.7184, "step": 674800 }, { "epoch": 2.7493942798014848, "grad_norm": 4.111626625061035, "learning_rate": 0.003167190508686164, "loss": 7.7334, "step": 674900 }, { "epoch": 2.7498016578248663, "grad_norm": 2.7121200561523438, "learning_rate": 0.0031667121939894937, "loss": 7.7417, "step": 675000 }, { "epoch": 2.7498016578248663, "eval_MaskedAccuracy": 0.5036854018838389, "eval_loss": 1.6386982202529907, "eval_runtime": 148.7873, "eval_samples_per_second": 426.622, "eval_steps_per_second": 1.667, "step": 675000 }, { "epoch": 2.750209035848248, "grad_norm": 6.808712959289551, "learning_rate": 0.0031662338531318005, "loss": 7.7388, "step": 675100 }, { "epoch": 2.750616413871629, "grad_norm": 3.6436800956726074, "learning_rate": 0.003165755486132, "loss": 7.7042, "step": 675200 }, { "epoch": 2.7510237918950105, "grad_norm": 3.008739709854126, "learning_rate": 0.0031652770930089974, "loss": 7.7218, "step": 675300 }, { "epoch": 2.751431169918392, "grad_norm": 3.265902519226074, "learning_rate": 0.0031647986737817135, "loss": 7.719, "step": 675400 }, { "epoch": 2.7518385479417735, "grad_norm": 12.43411922454834, "learning_rate": 0.0031643202284690575, "loss": 7.7321, "step": 675500 }, { "epoch": 2.752245925965155, "grad_norm": 4.71318244934082, "learning_rate": 0.003163841757089949, "loss": 7.7354, "step": 675600 }, { "epoch": 2.752653303988536, "grad_norm": 5.173762321472168, "learning_rate": 0.0031633632596632997, "loss": 7.7349, "step": 675700 }, { "epoch": 2.7530606820119177, "grad_norm": 7.2542877197265625, "learning_rate": 0.0031628847362080294, "loss": 7.7312, "step": 675800 }, { "epoch": 2.753468060035299, "grad_norm": 2.1073505878448486, "learning_rate": 0.0031624061867430594, "loss": 7.7359, "step": 675900 }, { "epoch": 2.7538754380586807, "grad_norm": 4.097883701324463, "learning_rate": 0.003161927611287303, "loss": 7.7634, "step": 676000 }, { "epoch": 2.7538754380586807, "eval_MaskedAccuracy": 0.5040784739991235, "eval_loss": 1.6261121034622192, "eval_runtime": 148.1031, "eval_samples_per_second": 428.593, "eval_steps_per_second": 1.675, "step": 676000 }, { "epoch": 2.7542828160820623, "grad_norm": 9.5602388381958, "learning_rate": 0.003161449009859684, "loss": 7.7123, "step": 676100 }, { "epoch": 2.754690194105444, "grad_norm": 4.921302318572998, "learning_rate": 0.0031609703824791264, "loss": 7.7375, "step": 676200 }, { "epoch": 2.7550975721288253, "grad_norm": 7.506611347198486, "learning_rate": 0.003160491729164552, "loss": 7.7179, "step": 676300 }, { "epoch": 2.755504950152207, "grad_norm": 5.155307769775391, "learning_rate": 0.003160013049934881, "loss": 7.74, "step": 676400 }, { "epoch": 2.755912328175588, "grad_norm": 6.343349933624268, "learning_rate": 0.0031595343448090436, "loss": 7.7557, "step": 676500 }, { "epoch": 2.7563197061989695, "grad_norm": 5.314796447753906, "learning_rate": 0.0031590556138059605, "loss": 7.7697, "step": 676600 }, { "epoch": 2.756727084222351, "grad_norm": 9.024616241455078, "learning_rate": 0.0031585768569445593, "loss": 7.7188, "step": 676700 }, { "epoch": 2.7571344622457326, "grad_norm": 4.857398509979248, "learning_rate": 0.00315809807424377, "loss": 7.6998, "step": 676800 }, { "epoch": 2.757541840269114, "grad_norm": 3.59858775138855, "learning_rate": 0.0031576192657225234, "loss": 7.7389, "step": 676900 }, { "epoch": 2.757949218292495, "grad_norm": 2.3827154636383057, "learning_rate": 0.003157140431399746, "loss": 7.6996, "step": 677000 }, { "epoch": 2.757949218292495, "eval_MaskedAccuracy": 0.5043865721423844, "eval_loss": 1.6320961713790894, "eval_runtime": 148.277, "eval_samples_per_second": 428.091, "eval_steps_per_second": 1.673, "step": 677000 }, { "epoch": 2.7583565963158767, "grad_norm": 3.564591646194458, "learning_rate": 0.0031566615712943706, "loss": 7.7032, "step": 677100 }, { "epoch": 2.7587639743392582, "grad_norm": 4.032743453979492, "learning_rate": 0.003156182685425328, "loss": 7.7725, "step": 677200 }, { "epoch": 2.7591713523626398, "grad_norm": 7.908647537231445, "learning_rate": 0.0031557037738115536, "loss": 7.6997, "step": 677300 }, { "epoch": 2.7595787303860213, "grad_norm": 2.619904041290283, "learning_rate": 0.0031552248364719766, "loss": 7.7045, "step": 677400 }, { "epoch": 2.759986108409403, "grad_norm": 6.5481061935424805, "learning_rate": 0.0031547458734255338, "loss": 7.7437, "step": 677500 }, { "epoch": 2.7603934864327844, "grad_norm": 3.4915366172790527, "learning_rate": 0.0031542668846911612, "loss": 7.7467, "step": 677600 }, { "epoch": 2.7608008644561655, "grad_norm": 4.60849142074585, "learning_rate": 0.003153787870287798, "loss": 7.7135, "step": 677700 }, { "epoch": 2.761208242479547, "grad_norm": 5.239776134490967, "learning_rate": 0.003153308830234378, "loss": 7.7504, "step": 677800 }, { "epoch": 2.7616156205029285, "grad_norm": 8.834480285644531, "learning_rate": 0.0031528297645498425, "loss": 7.722, "step": 677900 }, { "epoch": 2.76202299852631, "grad_norm": 2.389150619506836, "learning_rate": 0.003152350673253132, "loss": 7.7123, "step": 678000 }, { "epoch": 2.76202299852631, "eval_MaskedAccuracy": 0.5047657486651792, "eval_loss": 1.6233168840408325, "eval_runtime": 148.873, "eval_samples_per_second": 426.377, "eval_steps_per_second": 1.666, "step": 678000 }, { "epoch": 2.7624303765496916, "grad_norm": 7.212696552276611, "learning_rate": 0.003151871556363187, "loss": 7.6981, "step": 678100 }, { "epoch": 2.7628377545730727, "grad_norm": 5.194124698638916, "learning_rate": 0.0031513924138989453, "loss": 7.7247, "step": 678200 }, { "epoch": 2.763245132596454, "grad_norm": 3.673970937728882, "learning_rate": 0.0031509132458793594, "loss": 7.7149, "step": 678300 }, { "epoch": 2.7636525106198357, "grad_norm": 2.1644327640533447, "learning_rate": 0.003150434052323366, "loss": 7.7381, "step": 678400 }, { "epoch": 2.7640598886432173, "grad_norm": 7.227392673492432, "learning_rate": 0.0031499548332499156, "loss": 7.7276, "step": 678500 }, { "epoch": 2.764467266666599, "grad_norm": 6.203690528869629, "learning_rate": 0.003149475588677948, "loss": 7.7101, "step": 678600 }, { "epoch": 2.7648746446899803, "grad_norm": 2.6559998989105225, "learning_rate": 0.003148996318626418, "loss": 7.7048, "step": 678700 }, { "epoch": 2.765282022713362, "grad_norm": 2.9390740394592285, "learning_rate": 0.0031485170231142687, "loss": 7.719, "step": 678800 }, { "epoch": 2.7656894007367434, "grad_norm": 2.8324954509735107, "learning_rate": 0.0031480377021604502, "loss": 7.717, "step": 678900 }, { "epoch": 2.7660967787601245, "grad_norm": 5.291262149810791, "learning_rate": 0.0031475583557839084, "loss": 7.7237, "step": 679000 }, { "epoch": 2.7660967787601245, "eval_MaskedAccuracy": 0.5034864006410433, "eval_loss": 1.6280428171157837, "eval_runtime": 149.0766, "eval_samples_per_second": 425.795, "eval_steps_per_second": 1.664, "step": 679000 }, { "epoch": 2.766504156783506, "grad_norm": 3.3246068954467773, "learning_rate": 0.003147078984003601, "loss": 7.7369, "step": 679100 }, { "epoch": 2.7669115348068876, "grad_norm": 7.084670543670654, "learning_rate": 0.003146599586838473, "loss": 7.7349, "step": 679200 }, { "epoch": 2.767318912830269, "grad_norm": 7.184365272521973, "learning_rate": 0.0031461201643074826, "loss": 7.7123, "step": 679300 }, { "epoch": 2.7677262908536506, "grad_norm": 2.9993152618408203, "learning_rate": 0.003145640716429583, "loss": 7.7133, "step": 679400 }, { "epoch": 2.7681336688770317, "grad_norm": 4.63287878036499, "learning_rate": 0.0031451612432237307, "loss": 7.7412, "step": 679500 }, { "epoch": 2.7685410469004132, "grad_norm": 4.111084461212158, "learning_rate": 0.0031446817447088825, "loss": 7.7259, "step": 679600 }, { "epoch": 2.768948424923795, "grad_norm": 3.8003876209259033, "learning_rate": 0.0031442022209039913, "loss": 7.7123, "step": 679700 }, { "epoch": 2.7693558029471763, "grad_norm": 5.517698764801025, "learning_rate": 0.003143722671828017, "loss": 7.7237, "step": 679800 }, { "epoch": 2.769763180970558, "grad_norm": 7.103514671325684, "learning_rate": 0.0031432430974999218, "loss": 7.7492, "step": 679900 }, { "epoch": 2.7701705589939394, "grad_norm": 2.8233470916748047, "learning_rate": 0.00314276349793866, "loss": 7.7166, "step": 680000 }, { "epoch": 2.7701705589939394, "eval_MaskedAccuracy": 0.5041070418044662, "eval_loss": 1.6354745626449585, "eval_runtime": 148.7404, "eval_samples_per_second": 426.757, "eval_steps_per_second": 1.667, "step": 680000 }, { "epoch": 2.770577937017321, "grad_norm": 3.1389245986938477, "learning_rate": 0.0031422838731632002, "loss": 7.728, "step": 680100 }, { "epoch": 2.770985315040702, "grad_norm": 7.308162689208984, "learning_rate": 0.0031418042231924977, "loss": 7.7258, "step": 680200 }, { "epoch": 2.7713926930640835, "grad_norm": 3.361658811569214, "learning_rate": 0.003141324548045524, "loss": 7.7189, "step": 680300 }, { "epoch": 2.771800071087465, "grad_norm": 2.921330690383911, "learning_rate": 0.0031408448477412325, "loss": 7.7132, "step": 680400 }, { "epoch": 2.7722074491108466, "grad_norm": 5.913390159606934, "learning_rate": 0.0031403651222985966, "loss": 7.712, "step": 680500 }, { "epoch": 2.772614827134228, "grad_norm": 4.974047660827637, "learning_rate": 0.0031398853717365745, "loss": 7.721, "step": 680600 }, { "epoch": 2.773022205157609, "grad_norm": 2.729994297027588, "learning_rate": 0.0031394055960741398, "loss": 7.7278, "step": 680700 }, { "epoch": 2.7734295831809908, "grad_norm": 8.038491249084473, "learning_rate": 0.0031389257953302603, "loss": 7.7267, "step": 680800 }, { "epoch": 2.7738369612043723, "grad_norm": 3.782606363296509, "learning_rate": 0.0031384459695239027, "loss": 7.7274, "step": 680900 }, { "epoch": 2.774244339227754, "grad_norm": 3.078270196914673, "learning_rate": 0.003137966118674038, "loss": 7.7161, "step": 681000 }, { "epoch": 2.774244339227754, "eval_MaskedAccuracy": 0.5041785264991338, "eval_loss": 1.6250332593917847, "eval_runtime": 148.5202, "eval_samples_per_second": 427.39, "eval_steps_per_second": 1.67, "step": 681000 }, { "epoch": 2.7746517172511354, "grad_norm": 5.447459697723389, "learning_rate": 0.003137486242799641, "loss": 7.7379, "step": 681100 }, { "epoch": 2.775059095274517, "grad_norm": 4.432352542877197, "learning_rate": 0.003137006341919675, "loss": 7.7326, "step": 681200 }, { "epoch": 2.7754664732978984, "grad_norm": 2.1805579662323, "learning_rate": 0.003136526416053121, "loss": 7.7204, "step": 681300 }, { "epoch": 2.77587385132128, "grad_norm": 3.008263111114502, "learning_rate": 0.0031360464652189484, "loss": 7.7275, "step": 681400 }, { "epoch": 2.776281229344661, "grad_norm": 4.100875377655029, "learning_rate": 0.0031355664894361307, "loss": 7.7247, "step": 681500 }, { "epoch": 2.7766886073680426, "grad_norm": 4.5805253982543945, "learning_rate": 0.0031350864887236514, "loss": 7.722, "step": 681600 }, { "epoch": 2.777095985391424, "grad_norm": 4.6138763427734375, "learning_rate": 0.0031346064631004827, "loss": 7.7396, "step": 681700 }, { "epoch": 2.7775033634148056, "grad_norm": 3.771960973739624, "learning_rate": 0.0031341264125856025, "loss": 7.7384, "step": 681800 }, { "epoch": 2.777910741438187, "grad_norm": 6.917104244232178, "learning_rate": 0.0031336463371979886, "loss": 7.7311, "step": 681900 }, { "epoch": 2.7783181194615683, "grad_norm": 4.290818214416504, "learning_rate": 0.0031331662369566272, "loss": 7.7459, "step": 682000 }, { "epoch": 2.7783181194615683, "eval_MaskedAccuracy": 0.5034832038612592, "eval_loss": 1.6305007934570312, "eval_runtime": 149.1055, "eval_samples_per_second": 425.712, "eval_steps_per_second": 1.663, "step": 682000 }, { "epoch": 2.77872549748495, "grad_norm": 3.2923989295959473, "learning_rate": 0.0031326861118804914, "loss": 7.717, "step": 682100 }, { "epoch": 2.7791328755083313, "grad_norm": 3.507107973098755, "learning_rate": 0.003132205961988569, "loss": 7.7414, "step": 682200 }, { "epoch": 2.779540253531713, "grad_norm": 8.689350128173828, "learning_rate": 0.003131725787299842, "loss": 7.7122, "step": 682300 }, { "epoch": 2.7799476315550944, "grad_norm": 5.283635139465332, "learning_rate": 0.0031312455878332924, "loss": 7.7247, "step": 682400 }, { "epoch": 2.780355009578476, "grad_norm": 4.201075553894043, "learning_rate": 0.003130765363607904, "loss": 7.7141, "step": 682500 }, { "epoch": 2.7807623876018575, "grad_norm": 2.8721766471862793, "learning_rate": 0.0031302851146426626, "loss": 7.7778, "step": 682600 }, { "epoch": 2.7811697656252385, "grad_norm": 3.2971625328063965, "learning_rate": 0.0031298048409565556, "loss": 7.7198, "step": 682700 }, { "epoch": 2.78157714364862, "grad_norm": 4.196006774902344, "learning_rate": 0.003129324542568573, "loss": 7.7237, "step": 682800 }, { "epoch": 2.7819845216720016, "grad_norm": 2.078606128692627, "learning_rate": 0.0031288442194977043, "loss": 7.7405, "step": 682900 }, { "epoch": 2.782391899695383, "grad_norm": 9.211048126220703, "learning_rate": 0.0031283638717629317, "loss": 7.7066, "step": 683000 }, { "epoch": 2.782391899695383, "eval_MaskedAccuracy": 0.5041814404864644, "eval_loss": 1.627467155456543, "eval_runtime": 148.6688, "eval_samples_per_second": 426.962, "eval_steps_per_second": 1.668, "step": 683000 }, { "epoch": 2.7827992777187647, "grad_norm": 5.170778751373291, "learning_rate": 0.003127883499383252, "loss": 7.6953, "step": 683100 }, { "epoch": 2.7832066557421458, "grad_norm": 3.7099454402923584, "learning_rate": 0.0031274031023776606, "loss": 7.743, "step": 683200 }, { "epoch": 2.7836140337655273, "grad_norm": 4.783493518829346, "learning_rate": 0.0031269226807651438, "loss": 7.7147, "step": 683300 }, { "epoch": 2.784021411788909, "grad_norm": 6.044512748718262, "learning_rate": 0.0031264422345646976, "loss": 7.7199, "step": 683400 }, { "epoch": 2.7844287898122904, "grad_norm": 6.170932292938232, "learning_rate": 0.003125961763795313, "loss": 7.731, "step": 683500 }, { "epoch": 2.784836167835672, "grad_norm": 4.316214084625244, "learning_rate": 0.0031254812684759895, "loss": 7.6988, "step": 683600 }, { "epoch": 2.7852435458590534, "grad_norm": 6.521294593811035, "learning_rate": 0.003125000748625722, "loss": 7.7277, "step": 683700 }, { "epoch": 2.785650923882435, "grad_norm": 5.507486820220947, "learning_rate": 0.003124520204263514, "loss": 7.7474, "step": 683800 }, { "epoch": 2.7860583019058165, "grad_norm": 4.277698516845703, "learning_rate": 0.0031240396354083578, "loss": 7.711, "step": 683900 }, { "epoch": 2.7864656799291976, "grad_norm": 8.897061347961426, "learning_rate": 0.003123559042079254, "loss": 7.7319, "step": 684000 }, { "epoch": 2.7864656799291976, "eval_MaskedAccuracy": 0.5039483355498073, "eval_loss": 1.6266154050827026, "eval_runtime": 148.4396, "eval_samples_per_second": 427.622, "eval_steps_per_second": 1.671, "step": 684000 }, { "epoch": 2.786873057952579, "grad_norm": 6.696630001068115, "learning_rate": 0.003123078424295201, "loss": 7.7096, "step": 684100 }, { "epoch": 2.7872804359759606, "grad_norm": 5.411538124084473, "learning_rate": 0.0031225977820752023, "loss": 7.7234, "step": 684200 }, { "epoch": 2.787687813999342, "grad_norm": 3.5579845905303955, "learning_rate": 0.0031221171154382573, "loss": 7.6876, "step": 684300 }, { "epoch": 2.7880951920227237, "grad_norm": 2.8582041263580322, "learning_rate": 0.0031216364244033737, "loss": 7.7063, "step": 684400 }, { "epoch": 2.788502570046105, "grad_norm": 6.498418807983398, "learning_rate": 0.0031211557089895585, "loss": 7.7511, "step": 684500 }, { "epoch": 2.7889099480694863, "grad_norm": 4.134893417358398, "learning_rate": 0.0031206749692158087, "loss": 7.704, "step": 684600 }, { "epoch": 2.789317326092868, "grad_norm": 2.3614683151245117, "learning_rate": 0.0031201942051011355, "loss": 7.756, "step": 684700 }, { "epoch": 2.7897247041162494, "grad_norm": 2.780640125274658, "learning_rate": 0.0031197134166645445, "loss": 7.7161, "step": 684800 }, { "epoch": 2.790132082139631, "grad_norm": 5.244045257568359, "learning_rate": 0.0031192326039250445, "loss": 7.7454, "step": 684900 }, { "epoch": 2.7905394601630125, "grad_norm": 8.637303352355957, "learning_rate": 0.0031187517669016454, "loss": 7.7301, "step": 685000 }, { "epoch": 2.7905394601630125, "eval_MaskedAccuracy": 0.5043351051933935, "eval_loss": 1.6246445178985596, "eval_runtime": 148.7432, "eval_samples_per_second": 426.749, "eval_steps_per_second": 1.667, "step": 685000 }, { "epoch": 2.790946838186394, "grad_norm": 7.704771041870117, "learning_rate": 0.0031182709056133554, "loss": 7.7314, "step": 685100 }, { "epoch": 2.791354216209775, "grad_norm": 3.6072418689727783, "learning_rate": 0.0031177900200791845, "loss": 7.728, "step": 685200 }, { "epoch": 2.7917615942331566, "grad_norm": 3.866476535797119, "learning_rate": 0.0031173091103181504, "loss": 7.7149, "step": 685300 }, { "epoch": 2.792168972256538, "grad_norm": 3.703122854232788, "learning_rate": 0.0031168281763492606, "loss": 7.7016, "step": 685400 }, { "epoch": 2.7925763502799197, "grad_norm": 3.259774923324585, "learning_rate": 0.003116347218191534, "loss": 7.715, "step": 685500 }, { "epoch": 2.792983728303301, "grad_norm": 4.0200324058532715, "learning_rate": 0.00311586623586398, "loss": 7.7021, "step": 685600 }, { "epoch": 2.7933911063266823, "grad_norm": 3.2888174057006836, "learning_rate": 0.0031153852293856177, "loss": 7.7336, "step": 685700 }, { "epoch": 2.793798484350064, "grad_norm": 4.4348368644714355, "learning_rate": 0.0031149041987754617, "loss": 7.7349, "step": 685800 }, { "epoch": 2.7942058623734454, "grad_norm": 2.963186502456665, "learning_rate": 0.0031144231440525315, "loss": 7.6976, "step": 685900 }, { "epoch": 2.794613240396827, "grad_norm": 5.670172214508057, "learning_rate": 0.003113942065235843, "loss": 7.7188, "step": 686000 }, { "epoch": 2.794613240396827, "eval_MaskedAccuracy": 0.504245478068693, "eval_loss": 1.6286091804504395, "eval_runtime": 149.342, "eval_samples_per_second": 425.038, "eval_steps_per_second": 1.661, "step": 686000 }, { "epoch": 2.7950206184202084, "grad_norm": 2.3668212890625, "learning_rate": 0.003113460962344421, "loss": 7.6912, "step": 686100 }, { "epoch": 2.79542799644359, "grad_norm": 4.4726786613464355, "learning_rate": 0.0031129798353972784, "loss": 7.7345, "step": 686200 }, { "epoch": 2.7958353744669715, "grad_norm": 4.388908386230469, "learning_rate": 0.0031124986844134443, "loss": 7.753, "step": 686300 }, { "epoch": 2.796242752490353, "grad_norm": 7.050173282623291, "learning_rate": 0.0031120175094119344, "loss": 7.7281, "step": 686400 }, { "epoch": 2.796650130513734, "grad_norm": 4.756292819976807, "learning_rate": 0.0031115363104117773, "loss": 7.72, "step": 686500 }, { "epoch": 2.7970575085371157, "grad_norm": 5.236794948577881, "learning_rate": 0.0031110550874319976, "loss": 7.7203, "step": 686600 }, { "epoch": 2.797464886560497, "grad_norm": 6.344773769378662, "learning_rate": 0.0031105738404916166, "loss": 7.7042, "step": 686700 }, { "epoch": 2.7978722645838787, "grad_norm": 6.981508255004883, "learning_rate": 0.003110092569609664, "loss": 7.7147, "step": 686800 }, { "epoch": 2.7982796426072603, "grad_norm": 4.591001987457275, "learning_rate": 0.003109611274805169, "loss": 7.721, "step": 686900 }, { "epoch": 2.7986870206306413, "grad_norm": 5.013411045074463, "learning_rate": 0.003109129956097154, "loss": 7.7221, "step": 687000 }, { "epoch": 2.7986870206306413, "eval_MaskedAccuracy": 0.5041233797290112, "eval_loss": 1.6214803457260132, "eval_runtime": 150.2376, "eval_samples_per_second": 422.504, "eval_steps_per_second": 1.651, "step": 687000 }, { "epoch": 2.799094398654023, "grad_norm": 4.680105686187744, "learning_rate": 0.003108648613504652, "loss": 7.7321, "step": 687100 }, { "epoch": 2.7995017766774044, "grad_norm": 4.8180389404296875, "learning_rate": 0.0031081672470466906, "loss": 7.7019, "step": 687200 }, { "epoch": 2.799909154700786, "grad_norm": 3.550044536590576, "learning_rate": 0.0031076858567423006, "loss": 7.7062, "step": 687300 }, { "epoch": 2.8003165327241675, "grad_norm": 2.391879081726074, "learning_rate": 0.0031072044426105117, "loss": 7.7312, "step": 687400 }, { "epoch": 2.800723910747549, "grad_norm": 5.2197041511535645, "learning_rate": 0.003106723004670364, "loss": 7.7202, "step": 687500 }, { "epoch": 2.8011312887709305, "grad_norm": 4.447170734405518, "learning_rate": 0.0031062415429408898, "loss": 7.7159, "step": 687600 }, { "epoch": 2.8015386667943116, "grad_norm": 4.965562343597412, "learning_rate": 0.0031057600574411217, "loss": 7.7362, "step": 687700 }, { "epoch": 2.801946044817693, "grad_norm": 5.058368682861328, "learning_rate": 0.0031052785481900927, "loss": 7.7302, "step": 687800 }, { "epoch": 2.8023534228410747, "grad_norm": 3.9994418621063232, "learning_rate": 0.003104797015206845, "loss": 7.7124, "step": 687900 }, { "epoch": 2.8027608008644562, "grad_norm": 3.2288706302642822, "learning_rate": 0.0031043154585104105, "loss": 7.6999, "step": 688000 }, { "epoch": 2.8027608008644562, "eval_MaskedAccuracy": 0.5048591890666292, "eval_loss": 1.6219958066940308, "eval_runtime": 149.2094, "eval_samples_per_second": 425.416, "eval_steps_per_second": 1.662, "step": 688000 }, { "epoch": 2.8031681788878378, "grad_norm": 3.697815418243408, "learning_rate": 0.0031038338781198328, "loss": 7.7462, "step": 688100 }, { "epoch": 2.803575556911219, "grad_norm": 6.208542346954346, "learning_rate": 0.003103352274054148, "loss": 7.7115, "step": 688200 }, { "epoch": 2.8039829349346004, "grad_norm": 4.559090614318848, "learning_rate": 0.003102870646332399, "loss": 7.7318, "step": 688300 }, { "epoch": 2.804390312957982, "grad_norm": 3.0643157958984375, "learning_rate": 0.0031023889949736245, "loss": 7.7203, "step": 688400 }, { "epoch": 2.8047976909813634, "grad_norm": 5.9461669921875, "learning_rate": 0.0031019073199968676, "loss": 7.7279, "step": 688500 }, { "epoch": 2.805205069004745, "grad_norm": 8.562631607055664, "learning_rate": 0.003101425621421172, "loss": 7.7171, "step": 688600 }, { "epoch": 2.8056124470281265, "grad_norm": 8.356283187866211, "learning_rate": 0.0031009438992655844, "loss": 7.6994, "step": 688700 }, { "epoch": 2.806019825051508, "grad_norm": 10.357708930969238, "learning_rate": 0.003100462153549147, "loss": 7.7289, "step": 688800 }, { "epoch": 2.8064272030748896, "grad_norm": 5.205289363861084, "learning_rate": 0.0030999803842909034, "loss": 7.7502, "step": 688900 }, { "epoch": 2.8068345810982707, "grad_norm": 7.57562780380249, "learning_rate": 0.0030994985915099036, "loss": 7.7263, "step": 689000 }, { "epoch": 2.8068345810982707, "eval_MaskedAccuracy": 0.504349269193066, "eval_loss": 1.62308669090271, "eval_runtime": 149.1934, "eval_samples_per_second": 425.461, "eval_steps_per_second": 1.662, "step": 689000 }, { "epoch": 2.807241959121652, "grad_norm": 5.566107273101807, "learning_rate": 0.003099016775225192, "loss": 7.7312, "step": 689100 }, { "epoch": 2.8076493371450337, "grad_norm": 5.995349407196045, "learning_rate": 0.0030985349354558205, "loss": 7.7518, "step": 689200 }, { "epoch": 2.8080567151684153, "grad_norm": 3.4671058654785156, "learning_rate": 0.0030980530722208442, "loss": 7.7189, "step": 689300 }, { "epoch": 2.8084640931917964, "grad_norm": 3.8707988262176514, "learning_rate": 0.003097571185539305, "loss": 7.6957, "step": 689400 }, { "epoch": 2.808871471215178, "grad_norm": 3.558927059173584, "learning_rate": 0.0030970892754302592, "loss": 7.7169, "step": 689500 }, { "epoch": 2.8092788492385594, "grad_norm": 2.6028566360473633, "learning_rate": 0.0030966073419127584, "loss": 7.7107, "step": 689600 }, { "epoch": 2.809686227261941, "grad_norm": 4.1595940589904785, "learning_rate": 0.0030961253850058526, "loss": 7.7297, "step": 689700 }, { "epoch": 2.8100936052853225, "grad_norm": 3.851271390914917, "learning_rate": 0.003095643404728598, "loss": 7.7341, "step": 689800 }, { "epoch": 2.810500983308704, "grad_norm": 6.789198875427246, "learning_rate": 0.0030951614011000507, "loss": 7.7163, "step": 689900 }, { "epoch": 2.8109083613320855, "grad_norm": 4.078215599060059, "learning_rate": 0.00309467937413927, "loss": 7.6922, "step": 690000 }, { "epoch": 2.8109083613320855, "eval_MaskedAccuracy": 0.5041951639695755, "eval_loss": 1.623473882675171, "eval_runtime": 148.8146, "eval_samples_per_second": 426.544, "eval_steps_per_second": 1.667, "step": 690000 }, { "epoch": 2.811315739355467, "grad_norm": 5.313997745513916, "learning_rate": 0.00309419732386531, "loss": 7.6959, "step": 690100 }, { "epoch": 2.811723117378848, "grad_norm": 18.729393005371094, "learning_rate": 0.0030937152502972243, "loss": 7.7084, "step": 690200 }, { "epoch": 2.8121304954022297, "grad_norm": 4.033646583557129, "learning_rate": 0.0030932331534540753, "loss": 7.7469, "step": 690300 }, { "epoch": 2.8125378734256112, "grad_norm": 2.9967758655548096, "learning_rate": 0.0030927510333549277, "loss": 7.7383, "step": 690400 }, { "epoch": 2.8129452514489928, "grad_norm": 5.653675079345703, "learning_rate": 0.0030922688900188386, "loss": 7.7156, "step": 690500 }, { "epoch": 2.8133526294723743, "grad_norm": 5.748961925506592, "learning_rate": 0.0030917867234648666, "loss": 7.7241, "step": 690600 }, { "epoch": 2.8137600074957554, "grad_norm": 4.450202465057373, "learning_rate": 0.0030913045337120776, "loss": 7.7102, "step": 690700 }, { "epoch": 2.814167385519137, "grad_norm": 5.869491100311279, "learning_rate": 0.0030908223207795366, "loss": 7.7178, "step": 690800 }, { "epoch": 2.8145747635425185, "grad_norm": 12.048808097839355, "learning_rate": 0.0030903400846863053, "loss": 7.7094, "step": 690900 }, { "epoch": 2.8149821415659, "grad_norm": 6.04811429977417, "learning_rate": 0.003089857825451448, "loss": 7.7201, "step": 691000 }, { "epoch": 2.8149821415659, "eval_MaskedAccuracy": 0.5040807661085703, "eval_loss": 1.6305480003356934, "eval_runtime": 149.2591, "eval_samples_per_second": 425.274, "eval_steps_per_second": 1.662, "step": 691000 }, { "epoch": 2.8153895195892815, "grad_norm": 5.337472915649414, "learning_rate": 0.0030893755430940334, "loss": 7.7292, "step": 691100 }, { "epoch": 2.815796897612663, "grad_norm": 5.841711521148682, "learning_rate": 0.0030888932376331316, "loss": 7.7428, "step": 691200 }, { "epoch": 2.8162042756360446, "grad_norm": 6.443739891052246, "learning_rate": 0.0030884109090878065, "loss": 7.7095, "step": 691300 }, { "epoch": 2.816611653659426, "grad_norm": 3.8162341117858887, "learning_rate": 0.0030879285574771263, "loss": 7.711, "step": 691400 }, { "epoch": 2.817019031682807, "grad_norm": 3.2067131996154785, "learning_rate": 0.003087446182820161, "loss": 7.7339, "step": 691500 }, { "epoch": 2.8174264097061887, "grad_norm": 3.846188545227051, "learning_rate": 0.003086963785135982, "loss": 7.7297, "step": 691600 }, { "epoch": 2.8178337877295703, "grad_norm": 7.100821018218994, "learning_rate": 0.0030864813644436613, "loss": 7.7268, "step": 691700 }, { "epoch": 2.818241165752952, "grad_norm": 8.114201545715332, "learning_rate": 0.0030859989207622737, "loss": 7.7188, "step": 691800 }, { "epoch": 2.818648543776333, "grad_norm": 4.804603576660156, "learning_rate": 0.0030855164541108916, "loss": 7.6748, "step": 691900 }, { "epoch": 2.8190559217997144, "grad_norm": 9.302783012390137, "learning_rate": 0.003085033964508592, "loss": 7.7527, "step": 692000 }, { "epoch": 2.8190559217997144, "eval_MaskedAccuracy": 0.503542860675209, "eval_loss": 1.6362484693527222, "eval_runtime": 149.521, "eval_samples_per_second": 424.529, "eval_steps_per_second": 1.659, "step": 692000 }, { "epoch": 2.819463299823096, "grad_norm": 11.042616844177246, "learning_rate": 0.0030845514519744458, "loss": 7.7349, "step": 692100 }, { "epoch": 2.8198706778464775, "grad_norm": 7.054125785827637, "learning_rate": 0.0030840689165275302, "loss": 7.7114, "step": 692200 }, { "epoch": 2.820278055869859, "grad_norm": 8.906975746154785, "learning_rate": 0.0030835863581869247, "loss": 7.7168, "step": 692300 }, { "epoch": 2.8206854338932406, "grad_norm": 2.9098007678985596, "learning_rate": 0.0030831037769717055, "loss": 7.7402, "step": 692400 }, { "epoch": 2.821092811916622, "grad_norm": 3.044633388519287, "learning_rate": 0.003082621172900954, "loss": 7.7195, "step": 692500 }, { "epoch": 2.8215001899400036, "grad_norm": 2.5839884281158447, "learning_rate": 0.0030821385459937436, "loss": 7.7109, "step": 692600 }, { "epoch": 2.8219075679633847, "grad_norm": 7.343106746673584, "learning_rate": 0.0030816558962691604, "loss": 7.7065, "step": 692700 }, { "epoch": 2.8223149459867662, "grad_norm": 3.198737621307373, "learning_rate": 0.003081173223746287, "loss": 7.7233, "step": 692800 }, { "epoch": 2.8227223240101478, "grad_norm": 2.686126708984375, "learning_rate": 0.003080690528444206, "loss": 7.7205, "step": 692900 }, { "epoch": 2.8231297020335293, "grad_norm": 2.5314345359802246, "learning_rate": 0.003080207810381999, "loss": 7.7184, "step": 693000 }, { "epoch": 2.8231297020335293, "eval_MaskedAccuracy": 0.504285800770228, "eval_loss": 1.6302218437194824, "eval_runtime": 149.4239, "eval_samples_per_second": 424.805, "eval_steps_per_second": 1.66, "step": 693000 }, { "epoch": 2.823537080056911, "grad_norm": 7.544650077819824, "learning_rate": 0.0030797250695787496, "loss": 7.7302, "step": 693100 }, { "epoch": 2.823944458080292, "grad_norm": 4.015535354614258, "learning_rate": 0.0030792423060535426, "loss": 7.7167, "step": 693200 }, { "epoch": 2.8243518361036735, "grad_norm": 2.9468696117401123, "learning_rate": 0.0030787595198254686, "loss": 7.7445, "step": 693300 }, { "epoch": 2.824759214127055, "grad_norm": 8.753650665283203, "learning_rate": 0.003078276710913611, "loss": 7.7316, "step": 693400 }, { "epoch": 2.8251665921504365, "grad_norm": 5.600179672241211, "learning_rate": 0.003077793879337059, "loss": 7.7287, "step": 693500 }, { "epoch": 2.825573970173818, "grad_norm": 5.177038669586182, "learning_rate": 0.003077311025114899, "loss": 7.6927, "step": 693600 }, { "epoch": 2.8259813481971996, "grad_norm": 8.811504364013672, "learning_rate": 0.0030768281482662263, "loss": 7.679, "step": 693700 }, { "epoch": 2.826388726220581, "grad_norm": 6.328119277954102, "learning_rate": 0.003076345248810129, "loss": 7.7264, "step": 693800 }, { "epoch": 2.8267961042439627, "grad_norm": 4.487634658813477, "learning_rate": 0.0030758623267656974, "loss": 7.7199, "step": 693900 }, { "epoch": 2.8272034822673437, "grad_norm": 4.589730739593506, "learning_rate": 0.003075379382152023, "loss": 7.7223, "step": 694000 }, { "epoch": 2.8272034822673437, "eval_MaskedAccuracy": 0.5045122670012522, "eval_loss": 1.6221905946731567, "eval_runtime": 148.9319, "eval_samples_per_second": 426.208, "eval_steps_per_second": 1.665, "step": 694000 }, { "epoch": 2.8276108602907253, "grad_norm": 3.289022207260132, "learning_rate": 0.003074896414988207, "loss": 7.7231, "step": 694100 }, { "epoch": 2.828018238314107, "grad_norm": 6.647270202636719, "learning_rate": 0.0030744134252933333, "loss": 7.7364, "step": 694200 }, { "epoch": 2.8284256163374883, "grad_norm": 6.761031150817871, "learning_rate": 0.0030739304130865054, "loss": 7.721, "step": 694300 }, { "epoch": 2.8288329943608694, "grad_norm": 4.394124984741211, "learning_rate": 0.0030734473783868123, "loss": 7.7176, "step": 694400 }, { "epoch": 2.829240372384251, "grad_norm": 6.165120601654053, "learning_rate": 0.003072964321213357, "loss": 7.7101, "step": 694500 }, { "epoch": 2.8296477504076325, "grad_norm": 4.580467700958252, "learning_rate": 0.0030724812415852325, "loss": 7.6946, "step": 694600 }, { "epoch": 2.830055128431014, "grad_norm": 3.2740745544433594, "learning_rate": 0.003071998139521537, "loss": 7.6949, "step": 694700 }, { "epoch": 2.8304625064543956, "grad_norm": 8.765399932861328, "learning_rate": 0.0030715150150413748, "loss": 7.6705, "step": 694800 }, { "epoch": 2.830869884477777, "grad_norm": 3.4015614986419678, "learning_rate": 0.0030710318681638447, "loss": 7.7404, "step": 694900 }, { "epoch": 2.8312772625011586, "grad_norm": 3.995572805404663, "learning_rate": 0.0030705486989080454, "loss": 7.7388, "step": 695000 }, { "epoch": 2.8312772625011586, "eval_MaskedAccuracy": 0.5046110943227539, "eval_loss": 1.633867621421814, "eval_runtime": 149.1334, "eval_samples_per_second": 425.632, "eval_steps_per_second": 1.663, "step": 695000 }, { "epoch": 2.83168464052454, "grad_norm": 6.184289455413818, "learning_rate": 0.0030700655072930824, "loss": 7.7401, "step": 695100 }, { "epoch": 2.8320920185479213, "grad_norm": 7.292242050170898, "learning_rate": 0.0030695822933380574, "loss": 7.747, "step": 695200 }, { "epoch": 2.832499396571303, "grad_norm": 6.968306064605713, "learning_rate": 0.0030690990570620713, "loss": 7.6917, "step": 695300 }, { "epoch": 2.8329067745946843, "grad_norm": 4.081552982330322, "learning_rate": 0.0030686157984842356, "loss": 7.7035, "step": 695400 }, { "epoch": 2.833314152618066, "grad_norm": 6.458633899688721, "learning_rate": 0.003068132517623655, "loss": 7.6979, "step": 695500 }, { "epoch": 2.8337215306414474, "grad_norm": 3.7281322479248047, "learning_rate": 0.003067649214499431, "loss": 7.7263, "step": 695600 }, { "epoch": 2.8341289086648285, "grad_norm": 7.177165985107422, "learning_rate": 0.0030671658891306755, "loss": 7.6904, "step": 695700 }, { "epoch": 2.83453628668821, "grad_norm": 7.323238849639893, "learning_rate": 0.0030666825415364965, "loss": 7.6749, "step": 695800 }, { "epoch": 2.8349436647115915, "grad_norm": 7.35430908203125, "learning_rate": 0.003066199171736003, "loss": 7.719, "step": 695900 }, { "epoch": 2.835351042734973, "grad_norm": 7.7270073890686035, "learning_rate": 0.003065715779748306, "loss": 7.708, "step": 696000 }, { "epoch": 2.835351042734973, "eval_MaskedAccuracy": 0.5048210869445112, "eval_loss": 1.623809576034546, "eval_runtime": 148.8926, "eval_samples_per_second": 426.321, "eval_steps_per_second": 1.666, "step": 696000 }, { "epoch": 2.8357584207583546, "grad_norm": 3.1805379390716553, "learning_rate": 0.0030652323655925118, "loss": 7.6836, "step": 696100 }, { "epoch": 2.836165798781736, "grad_norm": 3.5719704627990723, "learning_rate": 0.003064748929287736, "loss": 7.7184, "step": 696200 }, { "epoch": 2.8365731768051177, "grad_norm": 2.7994565963745117, "learning_rate": 0.0030642654708530905, "loss": 7.7149, "step": 696300 }, { "epoch": 2.836980554828499, "grad_norm": 5.4765625, "learning_rate": 0.003063781990307692, "loss": 7.6906, "step": 696400 }, { "epoch": 2.8373879328518803, "grad_norm": 3.522538423538208, "learning_rate": 0.003063298487670654, "loss": 7.6981, "step": 696500 }, { "epoch": 2.837795310875262, "grad_norm": 2.3591034412384033, "learning_rate": 0.003062814962961088, "loss": 7.7021, "step": 696600 }, { "epoch": 2.8382026888986434, "grad_norm": 5.188504219055176, "learning_rate": 0.0030623314161981148, "loss": 7.6768, "step": 696700 }, { "epoch": 2.838610066922025, "grad_norm": 12.307287216186523, "learning_rate": 0.0030618478474008465, "loss": 7.6884, "step": 696800 }, { "epoch": 2.839017444945406, "grad_norm": 4.71583890914917, "learning_rate": 0.0030613642565884077, "loss": 7.7258, "step": 696900 }, { "epoch": 2.8394248229687875, "grad_norm": 7.0869832038879395, "learning_rate": 0.003060880643779911, "loss": 7.6911, "step": 697000 }, { "epoch": 2.8394248229687875, "eval_MaskedAccuracy": 0.5041891362388816, "eval_loss": 1.6216367483139038, "eval_runtime": 149.287, "eval_samples_per_second": 425.195, "eval_steps_per_second": 1.661, "step": 697000 }, { "epoch": 2.839832200992169, "grad_norm": 3.5789458751678467, "learning_rate": 0.0030603970089944812, "loss": 7.7058, "step": 697100 }, { "epoch": 2.8402395790155506, "grad_norm": 7.403262615203857, "learning_rate": 0.0030599133522512378, "loss": 7.6868, "step": 697200 }, { "epoch": 2.840646957038932, "grad_norm": 7.532144546508789, "learning_rate": 0.0030594296735692956, "loss": 7.7527, "step": 697300 }, { "epoch": 2.8410543350623136, "grad_norm": 3.671082019805908, "learning_rate": 0.003058945972967789, "loss": 7.7012, "step": 697400 }, { "epoch": 2.841461713085695, "grad_norm": 8.792901992797852, "learning_rate": 0.003058462250465834, "loss": 7.6974, "step": 697500 }, { "epoch": 2.8418690911090767, "grad_norm": 6.278231620788574, "learning_rate": 0.0030579785060825546, "loss": 7.7238, "step": 697600 }, { "epoch": 2.842276469132458, "grad_norm": 2.9636943340301514, "learning_rate": 0.0030574947398370805, "loss": 7.7156, "step": 697700 }, { "epoch": 2.8426838471558393, "grad_norm": 5.388239860534668, "learning_rate": 0.0030570109517485348, "loss": 7.7217, "step": 697800 }, { "epoch": 2.843091225179221, "grad_norm": 13.538797378540039, "learning_rate": 0.003056527141836042, "loss": 7.6908, "step": 697900 }, { "epoch": 2.8434986032026024, "grad_norm": 7.257206439971924, "learning_rate": 0.0030560433101187337, "loss": 7.7442, "step": 698000 }, { "epoch": 2.8434986032026024, "eval_MaskedAccuracy": 0.5049515498740003, "eval_loss": 1.6278083324432373, "eval_runtime": 149.3787, "eval_samples_per_second": 424.933, "eval_steps_per_second": 1.66, "step": 698000 }, { "epoch": 2.843905981225984, "grad_norm": 3.598336935043335, "learning_rate": 0.0030555594566157355, "loss": 7.7023, "step": 698100 }, { "epoch": 2.844313359249365, "grad_norm": 6.302800178527832, "learning_rate": 0.003055075581346176, "loss": 7.7309, "step": 698200 }, { "epoch": 2.8447207372727465, "grad_norm": 3.172574043273926, "learning_rate": 0.0030545916843291907, "loss": 7.7402, "step": 698300 }, { "epoch": 2.845128115296128, "grad_norm": 3.85713267326355, "learning_rate": 0.0030541077655839035, "loss": 7.7114, "step": 698400 }, { "epoch": 2.8455354933195096, "grad_norm": 3.4408833980560303, "learning_rate": 0.0030536238251294514, "loss": 7.7179, "step": 698500 }, { "epoch": 2.845942871342891, "grad_norm": 6.391783714294434, "learning_rate": 0.0030531398629849655, "loss": 7.7357, "step": 698600 }, { "epoch": 2.8463502493662727, "grad_norm": 7.269733905792236, "learning_rate": 0.0030526558791695775, "loss": 7.7042, "step": 698700 }, { "epoch": 2.846757627389654, "grad_norm": 7.072800636291504, "learning_rate": 0.0030521718737024236, "loss": 7.6934, "step": 698800 }, { "epoch": 2.8471650054130357, "grad_norm": 6.533130168914795, "learning_rate": 0.0030516878466026382, "loss": 7.7217, "step": 698900 }, { "epoch": 2.847572383436417, "grad_norm": 5.605027198791504, "learning_rate": 0.0030512037978893567, "loss": 7.7091, "step": 699000 }, { "epoch": 2.847572383436417, "eval_MaskedAccuracy": 0.5045449432806524, "eval_loss": 1.6316790580749512, "eval_runtime": 149.5385, "eval_samples_per_second": 424.479, "eval_steps_per_second": 1.658, "step": 699000 }, { "epoch": 2.8479797614597984, "grad_norm": 10.6749267578125, "learning_rate": 0.003050719727581721, "loss": 7.7156, "step": 699100 }, { "epoch": 2.84838713948318, "grad_norm": 6.989471435546875, "learning_rate": 0.003050235635698865, "loss": 7.7154, "step": 699200 }, { "epoch": 2.8487945175065614, "grad_norm": 7.544661045074463, "learning_rate": 0.0030497515222599274, "loss": 7.7293, "step": 699300 }, { "epoch": 2.8492018955299425, "grad_norm": 4.551535129547119, "learning_rate": 0.0030492673872840525, "loss": 7.7206, "step": 699400 }, { "epoch": 2.849609273553324, "grad_norm": 5.39074182510376, "learning_rate": 0.0030487832307903727, "loss": 7.7176, "step": 699500 }, { "epoch": 2.8500166515767056, "grad_norm": 4.181148529052734, "learning_rate": 0.003048299052798037, "loss": 7.7149, "step": 699600 }, { "epoch": 2.850424029600087, "grad_norm": 3.8611361980438232, "learning_rate": 0.003047814853326184, "loss": 7.7027, "step": 699700 }, { "epoch": 2.8508314076234686, "grad_norm": 3.8873724937438965, "learning_rate": 0.0030473306323939536, "loss": 7.7517, "step": 699800 }, { "epoch": 2.85123878564685, "grad_norm": 3.9484033584594727, "learning_rate": 0.003046846390020493, "loss": 7.7039, "step": 699900 }, { "epoch": 2.8516461636702317, "grad_norm": 9.328553199768066, "learning_rate": 0.0030463621262249454, "loss": 7.7465, "step": 700000 }, { "epoch": 2.8516461636702317, "eval_MaskedAccuracy": 0.503998148291027, "eval_loss": 1.63697350025177, "eval_runtime": 150.2187, "eval_samples_per_second": 422.557, "eval_steps_per_second": 1.651, "step": 700000 }, { "epoch": 2.8520535416936132, "grad_norm": 3.6931982040405273, "learning_rate": 0.003045877841026461, "loss": 7.6942, "step": 700100 }, { "epoch": 2.8524609197169943, "grad_norm": 5.414756774902344, "learning_rate": 0.0030453935344441794, "loss": 7.7076, "step": 700200 }, { "epoch": 2.852868297740376, "grad_norm": 3.762303590774536, "learning_rate": 0.0030449092064972517, "loss": 7.6774, "step": 700300 }, { "epoch": 2.8532756757637574, "grad_norm": 4.2372283935546875, "learning_rate": 0.0030444248572048254, "loss": 7.7182, "step": 700400 }, { "epoch": 2.853683053787139, "grad_norm": 8.580768585205078, "learning_rate": 0.0030439404865860525, "loss": 7.7171, "step": 700500 }, { "epoch": 2.8540904318105205, "grad_norm": 6.197897434234619, "learning_rate": 0.0030434560946600766, "loss": 7.7389, "step": 700600 }, { "epoch": 2.8544978098339016, "grad_norm": 7.621669769287109, "learning_rate": 0.0030429716814460547, "loss": 7.6687, "step": 700700 }, { "epoch": 2.854905187857283, "grad_norm": 3.5647974014282227, "learning_rate": 0.0030424872469631318, "loss": 7.6928, "step": 700800 }, { "epoch": 2.8553125658806646, "grad_norm": 8.431017875671387, "learning_rate": 0.003042002791230462, "loss": 7.7091, "step": 700900 }, { "epoch": 2.855719943904046, "grad_norm": 4.402780532836914, "learning_rate": 0.003041518314267202, "loss": 7.7249, "step": 701000 }, { "epoch": 2.855719943904046, "eval_MaskedAccuracy": 0.5047495204685684, "eval_loss": 1.6300948858261108, "eval_runtime": 148.4825, "eval_samples_per_second": 427.498, "eval_steps_per_second": 1.67, "step": 701000 }, { "epoch": 2.8561273219274277, "grad_norm": 3.092787504196167, "learning_rate": 0.0030410338160925037, "loss": 7.691, "step": 701100 }, { "epoch": 2.856534699950809, "grad_norm": 6.276670932769775, "learning_rate": 0.0030405492967255214, "loss": 7.724, "step": 701200 }, { "epoch": 2.8569420779741908, "grad_norm": 7.114150047302246, "learning_rate": 0.0030400647561854084, "loss": 7.6765, "step": 701300 }, { "epoch": 2.8573494559975723, "grad_norm": 9.160008430480957, "learning_rate": 0.0030395801944913267, "loss": 7.698, "step": 701400 }, { "epoch": 2.8577568340209534, "grad_norm": 3.681234359741211, "learning_rate": 0.0030390956116624307, "loss": 7.7127, "step": 701500 }, { "epoch": 2.858164212044335, "grad_norm": 3.5404510498046875, "learning_rate": 0.003038611007717878, "loss": 7.6865, "step": 701600 }, { "epoch": 2.8585715900677164, "grad_norm": 4.3910040855407715, "learning_rate": 0.0030381263826768294, "loss": 7.7255, "step": 701700 }, { "epoch": 2.858978968091098, "grad_norm": 4.79788064956665, "learning_rate": 0.003037641736558443, "loss": 7.7295, "step": 701800 }, { "epoch": 2.859386346114479, "grad_norm": 4.587947845458984, "learning_rate": 0.003037157069381879, "loss": 7.684, "step": 701900 }, { "epoch": 2.8597937241378606, "grad_norm": 9.907379150390625, "learning_rate": 0.003036672381166298, "loss": 7.7106, "step": 702000 }, { "epoch": 2.8597937241378606, "eval_MaskedAccuracy": 0.5044675331927309, "eval_loss": 1.6207369565963745, "eval_runtime": 148.9726, "eval_samples_per_second": 426.092, "eval_steps_per_second": 1.665, "step": 702000 }, { "epoch": 2.860201102161242, "grad_norm": 4.197406768798828, "learning_rate": 0.0030361876719308682, "loss": 7.7149, "step": 702100 }, { "epoch": 2.8606084801846237, "grad_norm": 3.471236228942871, "learning_rate": 0.0030357029416947477, "loss": 7.7045, "step": 702200 }, { "epoch": 2.861015858208005, "grad_norm": 3.9093289375305176, "learning_rate": 0.0030352181904771012, "loss": 7.7128, "step": 702300 }, { "epoch": 2.8614232362313867, "grad_norm": 8.871551513671875, "learning_rate": 0.003034733418297096, "loss": 7.7123, "step": 702400 }, { "epoch": 2.8618306142547683, "grad_norm": 4.724961280822754, "learning_rate": 0.0030342486251738942, "loss": 7.7155, "step": 702500 }, { "epoch": 2.86223799227815, "grad_norm": 5.354717254638672, "learning_rate": 0.0030337638111266616, "loss": 7.6495, "step": 702600 }, { "epoch": 2.862645370301531, "grad_norm": 4.442861557006836, "learning_rate": 0.003033278976174568, "loss": 7.7297, "step": 702700 }, { "epoch": 2.8630527483249124, "grad_norm": 5.711311340332031, "learning_rate": 0.003032794120336781, "loss": 7.723, "step": 702800 }, { "epoch": 2.863460126348294, "grad_norm": 6.135890007019043, "learning_rate": 0.003032309243632472, "loss": 7.7171, "step": 702900 }, { "epoch": 2.8638675043716755, "grad_norm": 5.509307384490967, "learning_rate": 0.0030318243460808044, "loss": 7.7359, "step": 703000 }, { "epoch": 2.8638675043716755, "eval_MaskedAccuracy": 0.5043071706895894, "eval_loss": 1.6213853359222412, "eval_runtime": 149.5854, "eval_samples_per_second": 424.346, "eval_steps_per_second": 1.658, "step": 703000 }, { "epoch": 2.864274882395057, "grad_norm": 3.526852607727051, "learning_rate": 0.0030313394277009568, "loss": 7.6897, "step": 703100 }, { "epoch": 2.864682260418438, "grad_norm": 3.783799171447754, "learning_rate": 0.003030854488512097, "loss": 7.7135, "step": 703200 }, { "epoch": 2.8650896384418196, "grad_norm": 3.1931519508361816, "learning_rate": 0.0030303695285333948, "loss": 7.6781, "step": 703300 }, { "epoch": 2.865497016465201, "grad_norm": 6.136069297790527, "learning_rate": 0.0030298845477840252, "loss": 7.7244, "step": 703400 }, { "epoch": 2.8659043944885827, "grad_norm": 5.632696151733398, "learning_rate": 0.003029399546283161, "loss": 7.7179, "step": 703500 }, { "epoch": 2.8663117725119642, "grad_norm": 5.666800022125244, "learning_rate": 0.003028914524049981, "loss": 7.7362, "step": 703600 }, { "epoch": 2.8667191505353458, "grad_norm": 5.373630523681641, "learning_rate": 0.003028429481103657, "loss": 7.7143, "step": 703700 }, { "epoch": 2.8671265285587273, "grad_norm": 4.59381628036499, "learning_rate": 0.0030279444174633632, "loss": 7.7349, "step": 703800 }, { "epoch": 2.867533906582109, "grad_norm": 4.590327262878418, "learning_rate": 0.0030274593331482817, "loss": 7.7045, "step": 703900 }, { "epoch": 2.86794128460549, "grad_norm": 6.63765811920166, "learning_rate": 0.003026974228177588, "loss": 7.7145, "step": 704000 }, { "epoch": 2.86794128460549, "eval_MaskedAccuracy": 0.5050363644742049, "eval_loss": 1.631582260131836, "eval_runtime": 149.3218, "eval_samples_per_second": 425.095, "eval_steps_per_second": 1.661, "step": 704000 }, { "epoch": 2.8683486626288714, "grad_norm": 2.6514902114868164, "learning_rate": 0.0030264891025704615, "loss": 7.7125, "step": 704100 }, { "epoch": 2.868756040652253, "grad_norm": 3.9465410709381104, "learning_rate": 0.003026003956346082, "loss": 7.7155, "step": 704200 }, { "epoch": 2.8691634186756345, "grad_norm": 14.424043655395508, "learning_rate": 0.0030255187895236305, "loss": 7.7102, "step": 704300 }, { "epoch": 2.8695707966990156, "grad_norm": 5.492244243621826, "learning_rate": 0.0030250336021222872, "loss": 7.7153, "step": 704400 }, { "epoch": 2.869978174722397, "grad_norm": 5.44766092300415, "learning_rate": 0.0030245483941612338, "loss": 7.7064, "step": 704500 }, { "epoch": 2.8703855527457787, "grad_norm": 13.81653118133545, "learning_rate": 0.003024063165659656, "loss": 7.7101, "step": 704600 }, { "epoch": 2.87079293076916, "grad_norm": 5.759309768676758, "learning_rate": 0.0030235779166367317, "loss": 7.7331, "step": 704700 }, { "epoch": 2.8712003087925417, "grad_norm": 5.606010913848877, "learning_rate": 0.003023092647111651, "loss": 7.6919, "step": 704800 }, { "epoch": 2.8716076868159233, "grad_norm": 3.6325557231903076, "learning_rate": 0.0030226073571035968, "loss": 7.7012, "step": 704900 }, { "epoch": 2.872015064839305, "grad_norm": 5.615848541259766, "learning_rate": 0.0030221220466317552, "loss": 7.7007, "step": 705000 }, { "epoch": 2.872015064839305, "eval_MaskedAccuracy": 0.5040617252538367, "eval_loss": 1.6279255151748657, "eval_runtime": 150.6296, "eval_samples_per_second": 421.405, "eval_steps_per_second": 1.646, "step": 705000 }, { "epoch": 2.8724224428626863, "grad_norm": 5.490151882171631, "learning_rate": 0.003021636715715311, "loss": 7.7074, "step": 705100 }, { "epoch": 2.8728298208860674, "grad_norm": 4.993218898773193, "learning_rate": 0.0030211513643734547, "loss": 7.6761, "step": 705200 }, { "epoch": 2.873237198909449, "grad_norm": 6.155918598175049, "learning_rate": 0.0030206659926253756, "loss": 7.7004, "step": 705300 }, { "epoch": 2.8736445769328305, "grad_norm": 4.703640937805176, "learning_rate": 0.003020180600490261, "loss": 7.7017, "step": 705400 }, { "epoch": 2.874051954956212, "grad_norm": 6.884881496429443, "learning_rate": 0.0030196951879873017, "loss": 7.7238, "step": 705500 }, { "epoch": 2.8744593329795936, "grad_norm": 8.275361061096191, "learning_rate": 0.00301920975513569, "loss": 7.7167, "step": 705600 }, { "epoch": 2.8748667110029746, "grad_norm": 6.780029296875, "learning_rate": 0.0030187243019546162, "loss": 7.7141, "step": 705700 }, { "epoch": 2.875274089026356, "grad_norm": 3.2327849864959717, "learning_rate": 0.0030182388284632745, "loss": 7.7281, "step": 705800 }, { "epoch": 2.8756814670497377, "grad_norm": 4.052231788635254, "learning_rate": 0.003017753334680855, "loss": 7.6799, "step": 705900 }, { "epoch": 2.8760888450731192, "grad_norm": 6.708811283111572, "learning_rate": 0.0030172678206265525, "loss": 7.7126, "step": 706000 }, { "epoch": 2.8760888450731192, "eval_MaskedAccuracy": 0.5049096995201287, "eval_loss": 1.618673324584961, "eval_runtime": 149.8397, "eval_samples_per_second": 423.626, "eval_steps_per_second": 1.655, "step": 706000 }, { "epoch": 2.8764962230965008, "grad_norm": 2.240532875061035, "learning_rate": 0.003016782286319563, "loss": 7.7333, "step": 706100 }, { "epoch": 2.8769036011198823, "grad_norm": 7.174434661865234, "learning_rate": 0.0030162967317790826, "loss": 7.7014, "step": 706200 }, { "epoch": 2.877310979143264, "grad_norm": 7.849184989929199, "learning_rate": 0.003015811157024309, "loss": 7.6906, "step": 706300 }, { "epoch": 2.8777183571666454, "grad_norm": 3.4370803833007812, "learning_rate": 0.0030153255620744357, "loss": 7.705, "step": 706400 }, { "epoch": 2.8781257351900265, "grad_norm": 3.1505165100097656, "learning_rate": 0.003014839946948669, "loss": 7.7099, "step": 706500 }, { "epoch": 2.878533113213408, "grad_norm": 3.3950932025909424, "learning_rate": 0.0030143543116662, "loss": 7.7487, "step": 706600 }, { "epoch": 2.8789404912367895, "grad_norm": 5.6509222984313965, "learning_rate": 0.003013868656246233, "loss": 7.7353, "step": 706700 }, { "epoch": 2.879347869260171, "grad_norm": 4.527373790740967, "learning_rate": 0.0030133829807079643, "loss": 7.73, "step": 706800 }, { "epoch": 2.879755247283552, "grad_norm": 5.177055835723877, "learning_rate": 0.0030128972850705974, "loss": 7.6888, "step": 706900 }, { "epoch": 2.8801626253069337, "grad_norm": 6.955810546875, "learning_rate": 0.0030124115693533357, "loss": 7.6916, "step": 707000 }, { "epoch": 2.8801626253069337, "eval_MaskedAccuracy": 0.5048821537289734, "eval_loss": 1.6189310550689697, "eval_runtime": 151.7201, "eval_samples_per_second": 418.376, "eval_steps_per_second": 1.635, "step": 707000 }, { "epoch": 2.880570003330315, "grad_norm": 3.485917568206787, "learning_rate": 0.003011925833575381, "loss": 7.6974, "step": 707100 }, { "epoch": 2.8809773813536967, "grad_norm": 5.959170818328857, "learning_rate": 0.003011440077755936, "loss": 7.7137, "step": 707200 }, { "epoch": 2.8813847593770783, "grad_norm": 6.006525039672852, "learning_rate": 0.003010954301914205, "loss": 7.7283, "step": 707300 }, { "epoch": 2.88179213740046, "grad_norm": 5.056791305541992, "learning_rate": 0.003010468506069395, "loss": 7.7301, "step": 707400 }, { "epoch": 2.8821995154238413, "grad_norm": 3.9427425861358643, "learning_rate": 0.0030099826902407097, "loss": 7.6731, "step": 707500 }, { "epoch": 2.882606893447223, "grad_norm": 11.1943998336792, "learning_rate": 0.0030094968544473583, "loss": 7.7174, "step": 707600 }, { "epoch": 2.883014271470604, "grad_norm": 4.588169097900391, "learning_rate": 0.003009010998708546, "loss": 7.7047, "step": 707700 }, { "epoch": 2.8834216494939855, "grad_norm": 8.373878479003906, "learning_rate": 0.0030085251230434875, "loss": 7.6711, "step": 707800 }, { "epoch": 2.883829027517367, "grad_norm": 6.053554534912109, "learning_rate": 0.003008039227471388, "loss": 7.7386, "step": 707900 }, { "epoch": 2.8842364055407486, "grad_norm": 6.039653778076172, "learning_rate": 0.003007553312011456, "loss": 7.6898, "step": 708000 }, { "epoch": 2.8842364055407486, "eval_MaskedAccuracy": 0.5048192362638789, "eval_loss": 1.631282925605774, "eval_runtime": 149.2734, "eval_samples_per_second": 425.233, "eval_steps_per_second": 1.661, "step": 708000 }, { "epoch": 2.88464378356413, "grad_norm": 6.733051776885986, "learning_rate": 0.0030070673766829014, "loss": 7.7225, "step": 708100 }, { "epoch": 2.885051161587511, "grad_norm": 9.6881103515625, "learning_rate": 0.0030065814215049385, "loss": 7.6886, "step": 708200 }, { "epoch": 2.8854585396108927, "grad_norm": 8.552311897277832, "learning_rate": 0.0030060954464967817, "loss": 7.7346, "step": 708300 }, { "epoch": 2.8858659176342742, "grad_norm": 4.68634557723999, "learning_rate": 0.0030056094516776417, "loss": 7.6908, "step": 708400 }, { "epoch": 2.886273295657656, "grad_norm": 3.3616514205932617, "learning_rate": 0.003005123437066729, "loss": 7.718, "step": 708500 }, { "epoch": 2.8866806736810373, "grad_norm": 7.538613319396973, "learning_rate": 0.003004637402683269, "loss": 7.6986, "step": 708600 }, { "epoch": 2.887088051704419, "grad_norm": 7.794840335845947, "learning_rate": 0.0030041513485464634, "loss": 7.7061, "step": 708700 }, { "epoch": 2.8874954297278004, "grad_norm": 4.480391502380371, "learning_rate": 0.0030036652746755384, "loss": 7.685, "step": 708800 }, { "epoch": 2.887902807751182, "grad_norm": 7.3373847007751465, "learning_rate": 0.0030031791810897048, "loss": 7.6954, "step": 708900 }, { "epoch": 2.888310185774563, "grad_norm": 6.190589427947998, "learning_rate": 0.0030026930678081877, "loss": 7.7198, "step": 709000 }, { "epoch": 2.888310185774563, "eval_MaskedAccuracy": 0.5039708538074354, "eval_loss": 1.6308400630950928, "eval_runtime": 152.1451, "eval_samples_per_second": 417.207, "eval_steps_per_second": 1.63, "step": 709000 }, { "epoch": 2.8887175637979445, "grad_norm": 8.154083251953125, "learning_rate": 0.003002206934850202, "loss": 7.7233, "step": 709100 }, { "epoch": 2.889124941821326, "grad_norm": 2.867553949356079, "learning_rate": 0.0030017207822349663, "loss": 7.6865, "step": 709200 }, { "epoch": 2.8895323198447076, "grad_norm": 5.089413642883301, "learning_rate": 0.0030012346099817003, "loss": 7.7197, "step": 709300 }, { "epoch": 2.8899396978680887, "grad_norm": 3.1376349925994873, "learning_rate": 0.003000748418109628, "loss": 7.7022, "step": 709400 }, { "epoch": 2.89034707589147, "grad_norm": 3.189527988433838, "learning_rate": 0.00300026220663797, "loss": 7.7078, "step": 709500 }, { "epoch": 2.8907544539148518, "grad_norm": 2.2897450923919678, "learning_rate": 0.002999775975585948, "loss": 7.7159, "step": 709600 }, { "epoch": 2.8911618319382333, "grad_norm": 4.6041669845581055, "learning_rate": 0.002999289724972783, "loss": 7.7472, "step": 709700 }, { "epoch": 2.891569209961615, "grad_norm": 4.715603351593018, "learning_rate": 0.0029988034548177, "loss": 7.7378, "step": 709800 }, { "epoch": 2.8919765879849963, "grad_norm": 8.691732406616211, "learning_rate": 0.0029983171651399325, "loss": 7.6861, "step": 709900 }, { "epoch": 2.892383966008378, "grad_norm": 2.508530378341675, "learning_rate": 0.002997830855958693, "loss": 7.7132, "step": 710000 }, { "epoch": 2.892383966008378, "eval_MaskedAccuracy": 0.5058309831015211, "eval_loss": 1.6112987995147705, "eval_runtime": 150.3306, "eval_samples_per_second": 422.243, "eval_steps_per_second": 1.65, "step": 710000 }, { "epoch": 2.8927913440317594, "grad_norm": 10.949094772338867, "learning_rate": 0.0029973445272932116, "loss": 7.7088, "step": 710100 }, { "epoch": 2.8931987220551405, "grad_norm": 7.096635818481445, "learning_rate": 0.0029968581791627196, "loss": 7.7207, "step": 710200 }, { "epoch": 2.893606100078522, "grad_norm": 6.554495334625244, "learning_rate": 0.002996371811586448, "loss": 7.694, "step": 710300 }, { "epoch": 2.8940134781019036, "grad_norm": 4.622104644775391, "learning_rate": 0.002995885424583617, "loss": 7.7164, "step": 710400 }, { "epoch": 2.894420856125285, "grad_norm": 4.340527057647705, "learning_rate": 0.0029953990181734594, "loss": 7.7106, "step": 710500 }, { "epoch": 2.8948282341486666, "grad_norm": 4.099761962890625, "learning_rate": 0.00299491259237521, "loss": 7.7194, "step": 710600 }, { "epoch": 2.8952356121720477, "grad_norm": 5.374207496643066, "learning_rate": 0.0029944261472080913, "loss": 7.7074, "step": 710700 }, { "epoch": 2.8956429901954293, "grad_norm": 2.7654128074645996, "learning_rate": 0.0029939396826913407, "loss": 7.7106, "step": 710800 }, { "epoch": 2.896050368218811, "grad_norm": 1.8438504934310913, "learning_rate": 0.002993453198844185, "loss": 7.7102, "step": 710900 }, { "epoch": 2.8964577462421923, "grad_norm": 3.5522167682647705, "learning_rate": 0.002992966695685864, "loss": 7.722, "step": 711000 }, { "epoch": 2.8964577462421923, "eval_MaskedAccuracy": 0.5050682224615328, "eval_loss": 1.6337037086486816, "eval_runtime": 162.347, "eval_samples_per_second": 390.99, "eval_steps_per_second": 1.528, "step": 711000 }, { "epoch": 2.896865124265574, "grad_norm": 7.992817401885986, "learning_rate": 0.0029924801732356104, "loss": 7.6995, "step": 711100 }, { "epoch": 2.8972725022889554, "grad_norm": 2.883950710296631, "learning_rate": 0.002991993631512658, "loss": 7.7226, "step": 711200 }, { "epoch": 2.897679880312337, "grad_norm": 6.727240085601807, "learning_rate": 0.002991507070536246, "loss": 7.7159, "step": 711300 }, { "epoch": 2.8980872583357185, "grad_norm": 7.277162551879883, "learning_rate": 0.002991020490325604, "loss": 7.739, "step": 711400 }, { "epoch": 2.8984946363590995, "grad_norm": 4.6611151695251465, "learning_rate": 0.0029905338908999776, "loss": 7.7172, "step": 711500 }, { "epoch": 2.898902014382481, "grad_norm": 2.6746129989624023, "learning_rate": 0.002990047272278596, "loss": 7.7064, "step": 711600 }, { "epoch": 2.8993093924058626, "grad_norm": 2.9171528816223145, "learning_rate": 0.0029895606344807007, "loss": 7.7234, "step": 711700 }, { "epoch": 2.899716770429244, "grad_norm": 5.212060928344727, "learning_rate": 0.002989073977525534, "loss": 7.7135, "step": 711800 }, { "epoch": 2.9001241484526252, "grad_norm": 3.189424753189087, "learning_rate": 0.002988587301432333, "loss": 7.7212, "step": 711900 }, { "epoch": 2.9005315264760068, "grad_norm": 3.9328625202178955, "learning_rate": 0.002988100606220339, "loss": 7.6914, "step": 712000 }, { "epoch": 2.9005315264760068, "eval_MaskedAccuracy": 0.5044705051542822, "eval_loss": 1.6272234916687012, "eval_runtime": 149.5052, "eval_samples_per_second": 424.574, "eval_steps_per_second": 1.659, "step": 712000 }, { "epoch": 2.9009389044993883, "grad_norm": 6.704166412353516, "learning_rate": 0.0029876138919087947, "loss": 7.6884, "step": 712100 }, { "epoch": 2.90134628252277, "grad_norm": 5.157271862030029, "learning_rate": 0.0029871271585169444, "loss": 7.6892, "step": 712200 }, { "epoch": 2.9017536605461514, "grad_norm": 4.0318284034729, "learning_rate": 0.0029866404060640245, "loss": 7.7036, "step": 712300 }, { "epoch": 2.902161038569533, "grad_norm": 5.164078235626221, "learning_rate": 0.002986153634569284, "loss": 7.7161, "step": 712400 }, { "epoch": 2.9025684165929144, "grad_norm": 2.2458889484405518, "learning_rate": 0.0029856668440519695, "loss": 7.6755, "step": 712500 }, { "epoch": 2.902975794616296, "grad_norm": 3.4589650630950928, "learning_rate": 0.002985180034531323, "loss": 7.7568, "step": 712600 }, { "epoch": 2.903383172639677, "grad_norm": 6.620082378387451, "learning_rate": 0.0029846932060265943, "loss": 7.7145, "step": 712700 }, { "epoch": 2.9037905506630586, "grad_norm": 3.3758513927459717, "learning_rate": 0.002984206358557023, "loss": 7.6961, "step": 712800 }, { "epoch": 2.90419792868644, "grad_norm": 2.730759859085083, "learning_rate": 0.0029837194921418647, "loss": 7.7228, "step": 712900 }, { "epoch": 2.9046053067098216, "grad_norm": 4.694217205047607, "learning_rate": 0.0029832326068003623, "loss": 7.7067, "step": 713000 }, { "epoch": 2.9046053067098216, "eval_MaskedAccuracy": 0.5045022081028918, "eval_loss": 1.6237282752990723, "eval_runtime": 149.0646, "eval_samples_per_second": 425.829, "eval_steps_per_second": 1.664, "step": 713000 }, { "epoch": 2.905012684733203, "grad_norm": 3.580455780029297, "learning_rate": 0.0029827457025517696, "loss": 7.7108, "step": 713100 }, { "epoch": 2.9054200627565843, "grad_norm": 4.593750953674316, "learning_rate": 0.0029822587794153315, "loss": 7.7094, "step": 713200 }, { "epoch": 2.905827440779966, "grad_norm": 8.8233060836792, "learning_rate": 0.0029817718374103006, "loss": 7.6925, "step": 713300 }, { "epoch": 2.9062348188033473, "grad_norm": 5.019341945648193, "learning_rate": 0.0029812848765559327, "loss": 7.7098, "step": 713400 }, { "epoch": 2.906642196826729, "grad_norm": 16.843360900878906, "learning_rate": 0.002980797896871476, "loss": 7.724, "step": 713500 }, { "epoch": 2.9070495748501104, "grad_norm": 4.91794490814209, "learning_rate": 0.0029803108983761798, "loss": 7.7207, "step": 713600 }, { "epoch": 2.907456952873492, "grad_norm": 4.787856101989746, "learning_rate": 0.002979823881089304, "loss": 7.7097, "step": 713700 }, { "epoch": 2.9078643308968735, "grad_norm": 5.718859672546387, "learning_rate": 0.0029793368450301014, "loss": 7.6908, "step": 713800 }, { "epoch": 2.908271708920255, "grad_norm": 3.3372747898101807, "learning_rate": 0.002978849790217824, "loss": 7.7173, "step": 713900 }, { "epoch": 2.908679086943636, "grad_norm": 12.192091941833496, "learning_rate": 0.0029783627166717315, "loss": 7.6694, "step": 714000 }, { "epoch": 2.908679086943636, "eval_MaskedAccuracy": 0.5045802707806957, "eval_loss": 1.626402735710144, "eval_runtime": 149.4493, "eval_samples_per_second": 424.733, "eval_steps_per_second": 1.659, "step": 714000 }, { "epoch": 2.9090864649670176, "grad_norm": 9.249349594116211, "learning_rate": 0.002977875624411079, "loss": 7.7004, "step": 714100 }, { "epoch": 2.909493842990399, "grad_norm": 2.8941810131073, "learning_rate": 0.002977388513455125, "loss": 7.6865, "step": 714200 }, { "epoch": 2.9099012210137807, "grad_norm": 2.450652837753296, "learning_rate": 0.002976901383823124, "loss": 7.6643, "step": 714300 }, { "epoch": 2.9103085990371618, "grad_norm": 3.3011999130249023, "learning_rate": 0.0029764142355343383, "loss": 7.7161, "step": 714400 }, { "epoch": 2.9107159770605433, "grad_norm": 6.105404853820801, "learning_rate": 0.0029759270686080252, "loss": 7.7055, "step": 714500 }, { "epoch": 2.911123355083925, "grad_norm": 4.215657711029053, "learning_rate": 0.0029754398830634447, "loss": 7.7178, "step": 714600 }, { "epoch": 2.9115307331073064, "grad_norm": 4.54106330871582, "learning_rate": 0.0029749526789198686, "loss": 7.6772, "step": 714700 }, { "epoch": 2.911938111130688, "grad_norm": 9.177712440490723, "learning_rate": 0.002974465456196543, "loss": 7.6919, "step": 714800 }, { "epoch": 2.9123454891540694, "grad_norm": 5.482707500457764, "learning_rate": 0.0029739782149127395, "loss": 7.7002, "step": 714900 }, { "epoch": 2.912752867177451, "grad_norm": 8.724689483642578, "learning_rate": 0.0029734909550877156, "loss": 7.6898, "step": 715000 }, { "epoch": 2.912752867177451, "eval_MaskedAccuracy": 0.5038898235956852, "eval_loss": 1.6259732246398926, "eval_runtime": 149.5821, "eval_samples_per_second": 424.356, "eval_steps_per_second": 1.658, "step": 715000 }, { "epoch": 2.9131602452008325, "grad_norm": 2.1611859798431396, "learning_rate": 0.0029730036767407412, "loss": 7.729, "step": 715100 }, { "epoch": 2.9135676232242136, "grad_norm": 5.355795860290527, "learning_rate": 0.0029725163798910797, "loss": 7.6732, "step": 715200 }, { "epoch": 2.913975001247595, "grad_norm": 6.048969745635986, "learning_rate": 0.0029720290645579945, "loss": 7.7244, "step": 715300 }, { "epoch": 2.9143823792709767, "grad_norm": 4.974484920501709, "learning_rate": 0.00297154173076075, "loss": 7.7006, "step": 715400 }, { "epoch": 2.914789757294358, "grad_norm": 6.803608417510986, "learning_rate": 0.0029710543785186174, "loss": 7.7058, "step": 715500 }, { "epoch": 2.9151971353177397, "grad_norm": 5.492311000823975, "learning_rate": 0.002970567007850859, "loss": 7.665, "step": 715600 }, { "epoch": 2.915604513341121, "grad_norm": 3.787933349609375, "learning_rate": 0.0029700796187767473, "loss": 7.7027, "step": 715700 }, { "epoch": 2.9160118913645023, "grad_norm": 4.391969203948975, "learning_rate": 0.0029695922113155524, "loss": 7.7234, "step": 715800 }, { "epoch": 2.916419269387884, "grad_norm": 6.197183609008789, "learning_rate": 0.0029691047854865445, "loss": 7.7195, "step": 715900 }, { "epoch": 2.9168266474112654, "grad_norm": 4.438564777374268, "learning_rate": 0.002968617341308992, "loss": 7.6584, "step": 716000 }, { "epoch": 2.9168266474112654, "eval_MaskedAccuracy": 0.5050919345665351, "eval_loss": 1.617082953453064, "eval_runtime": 149.1154, "eval_samples_per_second": 425.684, "eval_steps_per_second": 1.663, "step": 716000 }, { "epoch": 2.917234025434647, "grad_norm": 3.1807377338409424, "learning_rate": 0.002968129878802167, "loss": 7.7373, "step": 716100 }, { "epoch": 2.9176414034580285, "grad_norm": 3.906308889389038, "learning_rate": 0.0029676423979853378, "loss": 7.6929, "step": 716200 }, { "epoch": 2.91804878148141, "grad_norm": 5.037497520446777, "learning_rate": 0.0029671548988777817, "loss": 7.6899, "step": 716300 }, { "epoch": 2.9184561595047915, "grad_norm": 11.539466857910156, "learning_rate": 0.0029666673814987723, "loss": 7.7106, "step": 716400 }, { "epoch": 2.9188635375281726, "grad_norm": 4.352266788482666, "learning_rate": 0.002966179845867578, "loss": 7.7163, "step": 716500 }, { "epoch": 2.919270915551554, "grad_norm": 2.791508913040161, "learning_rate": 0.0029656922920034802, "loss": 7.7151, "step": 716600 }, { "epoch": 2.9196782935749357, "grad_norm": 2.868605852127075, "learning_rate": 0.0029652047199257544, "loss": 7.6685, "step": 716700 }, { "epoch": 2.9200856715983172, "grad_norm": 2.7636594772338867, "learning_rate": 0.0029647171296536736, "loss": 7.7198, "step": 716800 }, { "epoch": 2.9204930496216983, "grad_norm": 10.295610427856445, "learning_rate": 0.0029642295212065142, "loss": 7.7159, "step": 716900 }, { "epoch": 2.92090042764508, "grad_norm": 4.25411319732666, "learning_rate": 0.0029637418946035572, "loss": 7.7139, "step": 717000 }, { "epoch": 2.92090042764508, "eval_MaskedAccuracy": 0.5046538472804063, "eval_loss": 1.63292396068573, "eval_runtime": 149.3438, "eval_samples_per_second": 425.033, "eval_steps_per_second": 1.661, "step": 717000 }, { "epoch": 2.9213078056684614, "grad_norm": 5.864894390106201, "learning_rate": 0.002963254249864079, "loss": 7.7272, "step": 717100 }, { "epoch": 2.921715183691843, "grad_norm": 8.386134147644043, "learning_rate": 0.0029627665870073597, "loss": 7.7314, "step": 717200 }, { "epoch": 2.9221225617152244, "grad_norm": 4.239066123962402, "learning_rate": 0.0029622789060526787, "loss": 7.7046, "step": 717300 }, { "epoch": 2.922529939738606, "grad_norm": 3.709198236465454, "learning_rate": 0.0029617912070193154, "loss": 7.6977, "step": 717400 }, { "epoch": 2.9229373177619875, "grad_norm": 3.645662784576416, "learning_rate": 0.0029613034899265514, "loss": 7.7026, "step": 717500 }, { "epoch": 2.923344695785369, "grad_norm": 3.1036386489868164, "learning_rate": 0.0029608157547936684, "loss": 7.6985, "step": 717600 }, { "epoch": 2.92375207380875, "grad_norm": 3.0938498973846436, "learning_rate": 0.002960328001639957, "loss": 7.6977, "step": 717700 }, { "epoch": 2.9241594518321317, "grad_norm": 6.18734073638916, "learning_rate": 0.002959840230484692, "loss": 7.7267, "step": 717800 }, { "epoch": 2.924566829855513, "grad_norm": 4.151284694671631, "learning_rate": 0.0029593524413471612, "loss": 7.6832, "step": 717900 }, { "epoch": 2.9249742078788947, "grad_norm": 4.311347961425781, "learning_rate": 0.0029588646342466485, "loss": 7.6803, "step": 718000 }, { "epoch": 2.9249742078788947, "eval_MaskedAccuracy": 0.5049609116665331, "eval_loss": 1.6250135898590088, "eval_runtime": 152.4868, "eval_samples_per_second": 416.272, "eval_steps_per_second": 1.626, "step": 718000 }, { "epoch": 2.9253815859022763, "grad_norm": 2.8787670135498047, "learning_rate": 0.0029583768092024364, "loss": 7.6815, "step": 718100 }, { "epoch": 2.9257889639256573, "grad_norm": 5.31884241104126, "learning_rate": 0.0029578889662338183, "loss": 7.7168, "step": 718200 }, { "epoch": 2.926196341949039, "grad_norm": 12.058183670043945, "learning_rate": 0.002957401105360075, "loss": 7.6968, "step": 718300 }, { "epoch": 2.9266037199724204, "grad_norm": 6.33865213394165, "learning_rate": 0.002956913226600497, "loss": 7.703, "step": 718400 }, { "epoch": 2.927011097995802, "grad_norm": 3.841931104660034, "learning_rate": 0.002956425329974373, "loss": 7.7032, "step": 718500 }, { "epoch": 2.9274184760191835, "grad_norm": 5.193235397338867, "learning_rate": 0.002955937415500991, "loss": 7.6898, "step": 718600 }, { "epoch": 2.927825854042565, "grad_norm": 4.908641815185547, "learning_rate": 0.0029554494831996406, "loss": 7.7301, "step": 718700 }, { "epoch": 2.9282332320659465, "grad_norm": 2.531991720199585, "learning_rate": 0.0029549615330896092, "loss": 7.7017, "step": 718800 }, { "epoch": 2.928640610089328, "grad_norm": 5.351516246795654, "learning_rate": 0.0029544735651901946, "loss": 7.6722, "step": 718900 }, { "epoch": 2.929047988112709, "grad_norm": 6.742931842803955, "learning_rate": 0.0029539855795206868, "loss": 7.6868, "step": 719000 }, { "epoch": 2.929047988112709, "eval_MaskedAccuracy": 0.5054865291604324, "eval_loss": 1.617951512336731, "eval_runtime": 149.5237, "eval_samples_per_second": 424.521, "eval_steps_per_second": 1.659, "step": 719000 }, { "epoch": 2.9294553661360907, "grad_norm": 6.731724739074707, "learning_rate": 0.0029534975761003764, "loss": 7.7216, "step": 719100 }, { "epoch": 2.9298627441594722, "grad_norm": 9.921653747558594, "learning_rate": 0.0029530095549485575, "loss": 7.6906, "step": 719200 }, { "epoch": 2.9302701221828538, "grad_norm": 4.2861409187316895, "learning_rate": 0.002952521516084528, "loss": 7.7046, "step": 719300 }, { "epoch": 2.930677500206235, "grad_norm": 4.969663143157959, "learning_rate": 0.0029520334595275776, "loss": 7.6951, "step": 719400 }, { "epoch": 2.9310848782296164, "grad_norm": 4.280135154724121, "learning_rate": 0.0029515453852970034, "loss": 7.7229, "step": 719500 }, { "epoch": 2.931492256252998, "grad_norm": 5.155004978179932, "learning_rate": 0.0029510572934121015, "loss": 7.7359, "step": 719600 }, { "epoch": 2.9318996342763795, "grad_norm": 7.241194248199463, "learning_rate": 0.0029505691838921672, "loss": 7.7205, "step": 719700 }, { "epoch": 2.932307012299761, "grad_norm": 3.899705410003662, "learning_rate": 0.002950081056756502, "loss": 7.6606, "step": 719800 }, { "epoch": 2.9327143903231425, "grad_norm": 4.1715264320373535, "learning_rate": 0.002949592912024402, "loss": 7.6871, "step": 719900 }, { "epoch": 2.933121768346524, "grad_norm": 3.1270625591278076, "learning_rate": 0.0029491047497151707, "loss": 7.7235, "step": 720000 }, { "epoch": 2.933121768346524, "eval_MaskedAccuracy": 0.5054259339308764, "eval_loss": 1.6230510473251343, "eval_runtime": 150.2205, "eval_samples_per_second": 422.552, "eval_steps_per_second": 1.651, "step": 720000 }, { "epoch": 2.9335291463699056, "grad_norm": 8.253684997558594, "learning_rate": 0.0029486165698481013, "loss": 7.7145, "step": 720100 }, { "epoch": 2.9339365243932867, "grad_norm": 10.558189392089844, "learning_rate": 0.0029481283724424948, "loss": 7.6804, "step": 720200 }, { "epoch": 2.934343902416668, "grad_norm": 8.32635498046875, "learning_rate": 0.0029476401575176544, "loss": 7.7117, "step": 720300 }, { "epoch": 2.9347512804400497, "grad_norm": 6.75897741317749, "learning_rate": 0.0029471519250928785, "loss": 7.7012, "step": 720400 }, { "epoch": 2.9351586584634313, "grad_norm": 3.6416022777557373, "learning_rate": 0.0029466636751874758, "loss": 7.7006, "step": 720500 }, { "epoch": 2.935566036486813, "grad_norm": 8.695493698120117, "learning_rate": 0.002946175407820745, "loss": 7.7098, "step": 720600 }, { "epoch": 2.935973414510194, "grad_norm": 10.675243377685547, "learning_rate": 0.002945687123011988, "loss": 7.721, "step": 720700 }, { "epoch": 2.9363807925335754, "grad_norm": 4.8481574058532715, "learning_rate": 0.0029451988207805134, "loss": 7.6941, "step": 720800 }, { "epoch": 2.936788170556957, "grad_norm": 8.68506145477295, "learning_rate": 0.002944710501145626, "loss": 7.6994, "step": 720900 }, { "epoch": 2.9371955485803385, "grad_norm": 7.147097587585449, "learning_rate": 0.0029442221641266305, "loss": 7.6612, "step": 721000 }, { "epoch": 2.9371955485803385, "eval_MaskedAccuracy": 0.5051493795452836, "eval_loss": 1.6206094026565552, "eval_runtime": 149.6584, "eval_samples_per_second": 424.139, "eval_steps_per_second": 1.657, "step": 721000 }, { "epoch": 2.93760292660372, "grad_norm": 9.032767295837402, "learning_rate": 0.0029437338097428327, "loss": 7.693, "step": 721100 }, { "epoch": 2.9380103046271016, "grad_norm": 4.130688667297363, "learning_rate": 0.0029432454380135427, "loss": 7.6987, "step": 721200 }, { "epoch": 2.938417682650483, "grad_norm": 6.816453456878662, "learning_rate": 0.002942757048958063, "loss": 7.6833, "step": 721300 }, { "epoch": 2.9388250606738646, "grad_norm": 3.935292959213257, "learning_rate": 0.0029422686425957093, "loss": 7.6942, "step": 721400 }, { "epoch": 2.9392324386972457, "grad_norm": 3.9741551876068115, "learning_rate": 0.0029417802189457888, "loss": 7.7356, "step": 721500 }, { "epoch": 2.9396398167206272, "grad_norm": 3.3311500549316406, "learning_rate": 0.002941291778027609, "loss": 7.6762, "step": 721600 }, { "epoch": 2.9400471947440088, "grad_norm": 4.277311325073242, "learning_rate": 0.002940803319860483, "loss": 7.7096, "step": 721700 }, { "epoch": 2.9404545727673903, "grad_norm": 3.4445302486419678, "learning_rate": 0.0029403148444637167, "loss": 7.6713, "step": 721800 }, { "epoch": 2.9408619507907714, "grad_norm": 8.501815795898438, "learning_rate": 0.002939826351856633, "loss": 7.7114, "step": 721900 }, { "epoch": 2.941269328814153, "grad_norm": 3.7861318588256836, "learning_rate": 0.002939337842058535, "loss": 7.6802, "step": 722000 }, { "epoch": 2.941269328814153, "eval_MaskedAccuracy": 0.5052984105124683, "eval_loss": 1.6163740158081055, "eval_runtime": 152.9085, "eval_samples_per_second": 415.124, "eval_steps_per_second": 1.622, "step": 722000 }, { "epoch": 2.9416767068375345, "grad_norm": 5.100472450256348, "learning_rate": 0.002938849315088736, "loss": 7.6613, "step": 722100 }, { "epoch": 2.942084084860916, "grad_norm": 7.634149074554443, "learning_rate": 0.002938360770966554, "loss": 7.6993, "step": 722200 }, { "epoch": 2.9424914628842975, "grad_norm": 3.8841817378997803, "learning_rate": 0.002937872209711308, "loss": 7.6613, "step": 722300 }, { "epoch": 2.942898840907679, "grad_norm": 7.857929229736328, "learning_rate": 0.0029373836313423027, "loss": 7.7037, "step": 722400 }, { "epoch": 2.9433062189310606, "grad_norm": 5.007296562194824, "learning_rate": 0.002936895035878861, "loss": 7.7167, "step": 722500 }, { "epoch": 2.943713596954442, "grad_norm": 5.84328031539917, "learning_rate": 0.0029364064233403054, "loss": 7.6694, "step": 722600 }, { "epoch": 2.944120974977823, "grad_norm": 4.893817901611328, "learning_rate": 0.0029359177937459432, "loss": 7.7198, "step": 722700 }, { "epoch": 2.9445283530012047, "grad_norm": 4.34383487701416, "learning_rate": 0.002935429147115091, "loss": 7.7282, "step": 722800 }, { "epoch": 2.9449357310245863, "grad_norm": 3.5883665084838867, "learning_rate": 0.002934940483467072, "loss": 7.7415, "step": 722900 }, { "epoch": 2.945343109047968, "grad_norm": 8.807660102844238, "learning_rate": 0.0029344518028212078, "loss": 7.7001, "step": 723000 }, { "epoch": 2.945343109047968, "eval_MaskedAccuracy": 0.5059133828639059, "eval_loss": 1.615801215171814, "eval_runtime": 151.4336, "eval_samples_per_second": 419.167, "eval_steps_per_second": 1.638, "step": 723000 }, { "epoch": 2.9457504870713493, "grad_norm": 6.665395259857178, "learning_rate": 0.002933963105196817, "loss": 7.6794, "step": 723100 }, { "epoch": 2.9461578650947304, "grad_norm": 7.352250099182129, "learning_rate": 0.0029334743906132203, "loss": 7.721, "step": 723200 }, { "epoch": 2.946565243118112, "grad_norm": 4.369938373565674, "learning_rate": 0.0029329856590897424, "loss": 7.6953, "step": 723300 }, { "epoch": 2.9469726211414935, "grad_norm": 4.498866081237793, "learning_rate": 0.0029324969106456996, "loss": 7.7072, "step": 723400 }, { "epoch": 2.947379999164875, "grad_norm": 14.836862564086914, "learning_rate": 0.0029320081453004187, "loss": 7.664, "step": 723500 }, { "epoch": 2.9477873771882566, "grad_norm": 5.324207782745361, "learning_rate": 0.0029315193630732213, "loss": 7.6951, "step": 723600 }, { "epoch": 2.948194755211638, "grad_norm": 5.6641716957092285, "learning_rate": 0.0029310305639834336, "loss": 7.7168, "step": 723700 }, { "epoch": 2.9486021332350196, "grad_norm": 5.0012102127075195, "learning_rate": 0.002930541748050377, "loss": 7.7001, "step": 723800 }, { "epoch": 2.949009511258401, "grad_norm": 9.163387298583984, "learning_rate": 0.0029300529152933813, "loss": 7.6949, "step": 723900 }, { "epoch": 2.9494168892817823, "grad_norm": 4.676816463470459, "learning_rate": 0.0029295640657317663, "loss": 7.6794, "step": 724000 }, { "epoch": 2.9494168892817823, "eval_MaskedAccuracy": 0.5046621195508941, "eval_loss": 1.620750069618225, "eval_runtime": 148.7728, "eval_samples_per_second": 426.664, "eval_steps_per_second": 1.667, "step": 724000 }, { "epoch": 2.949824267305164, "grad_norm": 3.069194793701172, "learning_rate": 0.002929075199384863, "loss": 7.7058, "step": 724100 }, { "epoch": 2.9502316453285453, "grad_norm": 4.935457229614258, "learning_rate": 0.0029285863162720025, "loss": 7.7179, "step": 724200 }, { "epoch": 2.950639023351927, "grad_norm": 5.500514984130859, "learning_rate": 0.00292809741641251, "loss": 7.714, "step": 724300 }, { "epoch": 2.951046401375308, "grad_norm": 3.5026183128356934, "learning_rate": 0.0029276084998257155, "loss": 7.703, "step": 724400 }, { "epoch": 2.9514537793986895, "grad_norm": 4.186327934265137, "learning_rate": 0.002927119566530943, "loss": 7.689, "step": 724500 }, { "epoch": 2.951861157422071, "grad_norm": 8.100483894348145, "learning_rate": 0.002926630616547526, "loss": 7.7152, "step": 724600 }, { "epoch": 2.9522685354454525, "grad_norm": 2.308488607406616, "learning_rate": 0.002926141649894795, "loss": 7.6922, "step": 724700 }, { "epoch": 2.952675913468834, "grad_norm": 8.073177337646484, "learning_rate": 0.002925652666592082, "loss": 7.7232, "step": 724800 }, { "epoch": 2.9530832914922156, "grad_norm": 5.914341926574707, "learning_rate": 0.0029251636666587227, "loss": 7.6996, "step": 724900 }, { "epoch": 2.953490669515597, "grad_norm": 2.3362417221069336, "learning_rate": 0.002924674650114046, "loss": 7.7011, "step": 725000 }, { "epoch": 2.953490669515597, "eval_MaskedAccuracy": 0.5049072191625641, "eval_loss": 1.63347589969635, "eval_runtime": 149.5143, "eval_samples_per_second": 424.548, "eval_steps_per_second": 1.659, "step": 725000 }, { "epoch": 2.9538980475389787, "grad_norm": 3.9333150386810303, "learning_rate": 0.0029241856169773817, "loss": 7.697, "step": 725100 }, { "epoch": 2.9543054255623598, "grad_norm": 4.176628589630127, "learning_rate": 0.0029236965672680723, "loss": 7.712, "step": 725200 }, { "epoch": 2.9547128035857413, "grad_norm": 4.2781596183776855, "learning_rate": 0.002923207501005447, "loss": 7.7064, "step": 725300 }, { "epoch": 2.955120181609123, "grad_norm": 7.252349853515625, "learning_rate": 0.002922718418208844, "loss": 7.6847, "step": 725400 }, { "epoch": 2.9555275596325044, "grad_norm": 3.614680528640747, "learning_rate": 0.0029222293188975934, "loss": 7.721, "step": 725500 }, { "epoch": 2.955934937655886, "grad_norm": 5.9780378341674805, "learning_rate": 0.0029217402030910404, "loss": 7.7173, "step": 725600 }, { "epoch": 2.956342315679267, "grad_norm": 3.651726722717285, "learning_rate": 0.0029212510708085198, "loss": 7.6847, "step": 725700 }, { "epoch": 2.9567496937026485, "grad_norm": 2.0712854862213135, "learning_rate": 0.0029207619220693666, "loss": 7.6922, "step": 725800 }, { "epoch": 2.95715707172603, "grad_norm": 7.380396366119385, "learning_rate": 0.0029202727568929197, "loss": 7.7136, "step": 725900 }, { "epoch": 2.9575644497494116, "grad_norm": 5.288581371307373, "learning_rate": 0.002919783575298519, "loss": 7.6962, "step": 726000 }, { "epoch": 2.9575644497494116, "eval_MaskedAccuracy": 0.5056228732656415, "eval_loss": 1.612118124961853, "eval_runtime": 149.9322, "eval_samples_per_second": 423.365, "eval_steps_per_second": 1.654, "step": 726000 }, { "epoch": 2.957971827772793, "grad_norm": 6.535576343536377, "learning_rate": 0.00291929437730551, "loss": 7.7225, "step": 726100 }, { "epoch": 2.9583792057961746, "grad_norm": 4.8185343742370605, "learning_rate": 0.0029188051629332273, "loss": 7.6954, "step": 726200 }, { "epoch": 2.958786583819556, "grad_norm": 5.420071601867676, "learning_rate": 0.0029183159322010107, "loss": 7.6909, "step": 726300 }, { "epoch": 2.9591939618429377, "grad_norm": 5.946316719055176, "learning_rate": 0.002917826685128204, "loss": 7.6928, "step": 726400 }, { "epoch": 2.959601339866319, "grad_norm": 3.5505311489105225, "learning_rate": 0.0029173374217341497, "loss": 7.6879, "step": 726500 }, { "epoch": 2.9600087178897003, "grad_norm": 5.27262544631958, "learning_rate": 0.002916848142038198, "loss": 7.702, "step": 726600 }, { "epoch": 2.960416095913082, "grad_norm": 4.948713302612305, "learning_rate": 0.0029163588460596837, "loss": 7.6924, "step": 726700 }, { "epoch": 2.9608234739364634, "grad_norm": 4.281991004943848, "learning_rate": 0.0029158695338179523, "loss": 7.6952, "step": 726800 }, { "epoch": 2.9612308519598445, "grad_norm": 2.798652410507202, "learning_rate": 0.0029153802053323514, "loss": 7.6928, "step": 726900 }, { "epoch": 2.961638229983226, "grad_norm": 4.923476219177246, "learning_rate": 0.002914890860622227, "loss": 7.6702, "step": 727000 }, { "epoch": 2.961638229983226, "eval_MaskedAccuracy": 0.5045890129922889, "eval_loss": 1.6305315494537354, "eval_runtime": 149.5062, "eval_samples_per_second": 424.571, "eval_steps_per_second": 1.659, "step": 727000 }, { "epoch": 2.9620456080066075, "grad_norm": 5.6135101318359375, "learning_rate": 0.002914401499706924, "loss": 7.6997, "step": 727100 }, { "epoch": 2.962452986029989, "grad_norm": 4.787189483642578, "learning_rate": 0.002913912122605788, "loss": 7.7305, "step": 727200 }, { "epoch": 2.9628603640533706, "grad_norm": 4.7185282707214355, "learning_rate": 0.002913422729338171, "loss": 7.6821, "step": 727300 }, { "epoch": 2.963267742076752, "grad_norm": 5.087222099304199, "learning_rate": 0.002912933319923419, "loss": 7.7056, "step": 727400 }, { "epoch": 2.9636751201001337, "grad_norm": 5.190913200378418, "learning_rate": 0.0029124438943808804, "loss": 7.7075, "step": 727500 }, { "epoch": 2.964082498123515, "grad_norm": 4.4287495613098145, "learning_rate": 0.002911954452729903, "loss": 7.7054, "step": 727600 }, { "epoch": 2.9644898761468963, "grad_norm": 6.311236381530762, "learning_rate": 0.0029114649949898395, "loss": 7.6822, "step": 727700 }, { "epoch": 2.964897254170278, "grad_norm": 3.371539831161499, "learning_rate": 0.0029109755211800415, "loss": 7.6709, "step": 727800 }, { "epoch": 2.9653046321936594, "grad_norm": 3.391474723815918, "learning_rate": 0.0029104860313198594, "loss": 7.6815, "step": 727900 }, { "epoch": 2.965712010217041, "grad_norm": 5.109682083129883, "learning_rate": 0.002909996525428648, "loss": 7.6944, "step": 728000 }, { "epoch": 2.965712010217041, "eval_MaskedAccuracy": 0.5048527927644045, "eval_loss": 1.6240174770355225, "eval_runtime": 151.7464, "eval_samples_per_second": 418.303, "eval_steps_per_second": 1.634, "step": 728000 }, { "epoch": 2.9661193882404224, "grad_norm": 8.26157283782959, "learning_rate": 0.0029095070035257565, "loss": 7.6758, "step": 728100 }, { "epoch": 2.9665267662638035, "grad_norm": 4.0670552253723145, "learning_rate": 0.0029090174656305414, "loss": 7.7192, "step": 728200 }, { "epoch": 2.966934144287185, "grad_norm": 6.780729293823242, "learning_rate": 0.0029085279117623533, "loss": 7.7318, "step": 728300 }, { "epoch": 2.9673415223105666, "grad_norm": 7.467378616333008, "learning_rate": 0.002908038341940552, "loss": 7.6909, "step": 728400 }, { "epoch": 2.967748900333948, "grad_norm": 3.635274648666382, "learning_rate": 0.002907548756184487, "loss": 7.6848, "step": 728500 }, { "epoch": 2.9681562783573296, "grad_norm": 11.50245475769043, "learning_rate": 0.002907059154513517, "loss": 7.7112, "step": 728600 }, { "epoch": 2.968563656380711, "grad_norm": 4.017874240875244, "learning_rate": 0.0029065695369469994, "loss": 7.7199, "step": 728700 }, { "epoch": 2.9689710344040927, "grad_norm": 2.9645016193389893, "learning_rate": 0.002906079903504291, "loss": 7.6968, "step": 728800 }, { "epoch": 2.9693784124274742, "grad_norm": 5.300781726837158, "learning_rate": 0.0029055902542047423, "loss": 7.7147, "step": 728900 }, { "epoch": 2.9697857904508553, "grad_norm": 6.1493754386901855, "learning_rate": 0.002905100589067723, "loss": 7.7191, "step": 729000 }, { "epoch": 2.9697857904508553, "eval_MaskedAccuracy": 0.5048661914267644, "eval_loss": 1.6320323944091797, "eval_runtime": 149.8621, "eval_samples_per_second": 423.563, "eval_steps_per_second": 1.655, "step": 729000 }, { "epoch": 2.970193168474237, "grad_norm": 3.4888110160827637, "learning_rate": 0.002904610908112584, "loss": 7.687, "step": 729100 }, { "epoch": 2.9706005464976184, "grad_norm": 7.061175346374512, "learning_rate": 0.0029041212113586914, "loss": 7.7043, "step": 729200 }, { "epoch": 2.971007924521, "grad_norm": 6.764214515686035, "learning_rate": 0.002903631498825401, "loss": 7.6733, "step": 729300 }, { "epoch": 2.971415302544381, "grad_norm": 6.659168243408203, "learning_rate": 0.0029031417705320784, "loss": 7.699, "step": 729400 }, { "epoch": 2.9718226805677626, "grad_norm": 4.185555458068848, "learning_rate": 0.002902652026498082, "loss": 7.694, "step": 729500 }, { "epoch": 2.972230058591144, "grad_norm": 3.2082176208496094, "learning_rate": 0.0029021622667427735, "loss": 7.7067, "step": 729600 }, { "epoch": 2.9726374366145256, "grad_norm": 3.5333786010742188, "learning_rate": 0.002901672491285517, "loss": 7.6632, "step": 729700 }, { "epoch": 2.973044814637907, "grad_norm": 4.613351821899414, "learning_rate": 0.002901182700145677, "loss": 7.6723, "step": 729800 }, { "epoch": 2.9734521926612887, "grad_norm": 8.803238868713379, "learning_rate": 0.002900692893342618, "loss": 7.6971, "step": 729900 }, { "epoch": 2.97385957068467, "grad_norm": 7.382742881774902, "learning_rate": 0.0029002030708957, "loss": 7.6764, "step": 730000 }, { "epoch": 2.97385957068467, "eval_MaskedAccuracy": 0.5047144547284493, "eval_loss": 1.6255278587341309, "eval_runtime": 150.1483, "eval_samples_per_second": 422.755, "eval_steps_per_second": 1.652, "step": 730000 }, { "epoch": 2.9742669487080517, "grad_norm": 6.7785725593566895, "learning_rate": 0.002899713232824287, "loss": 7.6895, "step": 730100 }, { "epoch": 2.974674326731433, "grad_norm": 5.104455471038818, "learning_rate": 0.002899223379147751, "loss": 7.6893, "step": 730200 }, { "epoch": 2.9750817047548144, "grad_norm": 9.909504890441895, "learning_rate": 0.002898733509885456, "loss": 7.7152, "step": 730300 }, { "epoch": 2.975489082778196, "grad_norm": 4.498833179473877, "learning_rate": 0.0028982436250567715, "loss": 7.6746, "step": 730400 }, { "epoch": 2.9758964608015774, "grad_norm": 4.285370826721191, "learning_rate": 0.0028977537246810637, "loss": 7.6886, "step": 730500 }, { "epoch": 2.976303838824959, "grad_norm": 4.445438861846924, "learning_rate": 0.002897263808777697, "loss": 7.6967, "step": 730600 }, { "epoch": 2.97671121684834, "grad_norm": 3.821533679962158, "learning_rate": 0.0028967738773660453, "loss": 7.7112, "step": 730700 }, { "epoch": 2.9771185948717216, "grad_norm": 7.10809850692749, "learning_rate": 0.0028962839304654763, "loss": 7.6848, "step": 730800 }, { "epoch": 2.977525972895103, "grad_norm": 3.383739709854126, "learning_rate": 0.0028957939680953606, "loss": 7.7111, "step": 730900 }, { "epoch": 2.9779333509184847, "grad_norm": 2.9453628063201904, "learning_rate": 0.00289530399027507, "loss": 7.6914, "step": 731000 }, { "epoch": 2.9779333509184847, "eval_MaskedAccuracy": 0.5047596690559836, "eval_loss": 1.6265870332717896, "eval_runtime": 149.4462, "eval_samples_per_second": 424.741, "eval_steps_per_second": 1.659, "step": 731000 }, { "epoch": 2.978340728941866, "grad_norm": 2.5462045669555664, "learning_rate": 0.002894813997023976, "loss": 7.7145, "step": 731100 }, { "epoch": 2.9787481069652477, "grad_norm": 13.579780578613281, "learning_rate": 0.002894323988361453, "loss": 7.7328, "step": 731200 }, { "epoch": 2.9791554849886293, "grad_norm": 4.660255432128906, "learning_rate": 0.002893833964306867, "loss": 7.7004, "step": 731300 }, { "epoch": 2.979562863012011, "grad_norm": 6.878735065460205, "learning_rate": 0.0028933439248795953, "loss": 7.7008, "step": 731400 }, { "epoch": 2.979970241035392, "grad_norm": 9.91526985168457, "learning_rate": 0.002892853870099012, "loss": 7.6857, "step": 731500 }, { "epoch": 2.9803776190587734, "grad_norm": 8.592841148376465, "learning_rate": 0.002892363799984492, "loss": 7.6917, "step": 731600 }, { "epoch": 2.980784997082155, "grad_norm": 8.438247680664062, "learning_rate": 0.0028918737145554067, "loss": 7.6953, "step": 731700 }, { "epoch": 2.9811923751055365, "grad_norm": 5.771642208099365, "learning_rate": 0.002891383613831136, "loss": 7.7254, "step": 731800 }, { "epoch": 2.9815997531289176, "grad_norm": 9.10280990600586, "learning_rate": 0.0028908934978310514, "loss": 7.6905, "step": 731900 }, { "epoch": 2.982007131152299, "grad_norm": 5.338988780975342, "learning_rate": 0.0028904033665745313, "loss": 7.6966, "step": 732000 }, { "epoch": 2.982007131152299, "eval_MaskedAccuracy": 0.5050782308411959, "eval_loss": 1.6257840394973755, "eval_runtime": 148.6556, "eval_samples_per_second": 427.001, "eval_steps_per_second": 1.668, "step": 732000 }, { "epoch": 2.9824145091756806, "grad_norm": 5.40072774887085, "learning_rate": 0.0028899132200809587, "loss": 7.6848, "step": 732100 }, { "epoch": 2.982821887199062, "grad_norm": 5.238689422607422, "learning_rate": 0.0028894230583697057, "loss": 7.7052, "step": 732200 }, { "epoch": 2.9832292652224437, "grad_norm": 8.466293334960938, "learning_rate": 0.002888932881460157, "loss": 7.6786, "step": 732300 }, { "epoch": 2.9836366432458252, "grad_norm": 6.270270347595215, "learning_rate": 0.002888442689371685, "loss": 7.6877, "step": 732400 }, { "epoch": 2.9840440212692068, "grad_norm": 6.5726189613342285, "learning_rate": 0.002887952482123674, "loss": 7.6897, "step": 732500 }, { "epoch": 2.9844513992925883, "grad_norm": 4.553796768188477, "learning_rate": 0.002887462259735502, "loss": 7.7027, "step": 732600 }, { "epoch": 2.9848587773159694, "grad_norm": 6.129177093505859, "learning_rate": 0.0028869720222265516, "loss": 7.707, "step": 732700 }, { "epoch": 2.985266155339351, "grad_norm": 4.546405792236328, "learning_rate": 0.002886481769616207, "loss": 7.6833, "step": 732800 }, { "epoch": 2.9856735333627324, "grad_norm": 3.305152177810669, "learning_rate": 0.002885991501923847, "loss": 7.7115, "step": 732900 }, { "epoch": 2.986080911386114, "grad_norm": 2.6064453125, "learning_rate": 0.002885501219168852, "loss": 7.6836, "step": 733000 }, { "epoch": 2.986080911386114, "eval_MaskedAccuracy": 0.5055143008516121, "eval_loss": 1.6125576496124268, "eval_runtime": 149.4368, "eval_samples_per_second": 424.768, "eval_steps_per_second": 1.66, "step": 733000 }, { "epoch": 2.9864882894094955, "grad_norm": 8.053740501403809, "learning_rate": 0.0028850109213706134, "loss": 7.7005, "step": 733100 }, { "epoch": 2.9868956674328766, "grad_norm": 5.1984686851501465, "learning_rate": 0.0028845206085485065, "loss": 7.6784, "step": 733200 }, { "epoch": 2.987303045456258, "grad_norm": 10.709946632385254, "learning_rate": 0.0028840302807219185, "loss": 7.7334, "step": 733300 }, { "epoch": 2.9877104234796397, "grad_norm": 4.176344871520996, "learning_rate": 0.0028835399379102383, "loss": 7.6755, "step": 733400 }, { "epoch": 2.988117801503021, "grad_norm": 2.9539191722869873, "learning_rate": 0.002883049580132852, "loss": 7.7084, "step": 733500 }, { "epoch": 2.9885251795264027, "grad_norm": 6.382058143615723, "learning_rate": 0.002882559207409141, "loss": 7.6969, "step": 733600 }, { "epoch": 2.9889325575497843, "grad_norm": 9.542335510253906, "learning_rate": 0.002882068819758494, "loss": 7.7055, "step": 733700 }, { "epoch": 2.989339935573166, "grad_norm": 8.789894104003906, "learning_rate": 0.002881578417200301, "loss": 7.6694, "step": 733800 }, { "epoch": 2.9897473135965473, "grad_norm": 4.893813610076904, "learning_rate": 0.0028810879997539477, "loss": 7.6971, "step": 733900 }, { "epoch": 2.9901546916199284, "grad_norm": 5.839018821716309, "learning_rate": 0.0028805975674388276, "loss": 7.72, "step": 734000 }, { "epoch": 2.9901546916199284, "eval_MaskedAccuracy": 0.5056054177429948, "eval_loss": 1.6218526363372803, "eval_runtime": 168.423, "eval_samples_per_second": 376.884, "eval_steps_per_second": 1.472, "step": 734000 }, { "epoch": 2.99056206964331, "grad_norm": 4.137758731842041, "learning_rate": 0.002880107120274327, "loss": 7.6845, "step": 734100 }, { "epoch": 2.9909694476666915, "grad_norm": 5.157028675079346, "learning_rate": 0.0028796166582798362, "loss": 7.7, "step": 734200 }, { "epoch": 2.991376825690073, "grad_norm": 5.364386558532715, "learning_rate": 0.0028791261814747424, "loss": 7.7323, "step": 734300 }, { "epoch": 2.991784203713454, "grad_norm": 3.042241334915161, "learning_rate": 0.002878635689878441, "loss": 7.6873, "step": 734400 }, { "epoch": 2.9921915817368356, "grad_norm": 4.499843597412109, "learning_rate": 0.002878145183510324, "loss": 7.6892, "step": 734500 }, { "epoch": 2.992598959760217, "grad_norm": 3.1978838443756104, "learning_rate": 0.0028776546623897827, "loss": 7.6834, "step": 734600 }, { "epoch": 2.9930063377835987, "grad_norm": 4.563478469848633, "learning_rate": 0.002877164126536209, "loss": 7.6761, "step": 734700 }, { "epoch": 2.9934137158069802, "grad_norm": 3.513648271560669, "learning_rate": 0.0028766735759689966, "loss": 7.6972, "step": 734800 }, { "epoch": 2.9938210938303618, "grad_norm": 5.518065452575684, "learning_rate": 0.0028761830107075386, "loss": 7.707, "step": 734900 }, { "epoch": 2.9942284718537433, "grad_norm": 5.362910747528076, "learning_rate": 0.00287569243077123, "loss": 7.7155, "step": 735000 }, { "epoch": 2.9942284718537433, "eval_MaskedAccuracy": 0.505939511371849, "eval_loss": 1.618703007698059, "eval_runtime": 150.1441, "eval_samples_per_second": 422.767, "eval_steps_per_second": 1.652, "step": 735000 }, { "epoch": 2.994635849877125, "grad_norm": 3.5753090381622314, "learning_rate": 0.002875201836179469, "loss": 7.7045, "step": 735100 }, { "epoch": 2.995043227900506, "grad_norm": 8.037628173828125, "learning_rate": 0.0028747112269516483, "loss": 7.6737, "step": 735200 }, { "epoch": 2.9954506059238875, "grad_norm": 6.65151309967041, "learning_rate": 0.002874220603107168, "loss": 7.6828, "step": 735300 }, { "epoch": 2.995857983947269, "grad_norm": 3.9218015670776367, "learning_rate": 0.00287372996466542, "loss": 7.6996, "step": 735400 }, { "epoch": 2.9962653619706505, "grad_norm": 6.109340667724609, "learning_rate": 0.0028732393116458056, "loss": 7.7094, "step": 735500 }, { "epoch": 2.996672739994032, "grad_norm": 4.062647342681885, "learning_rate": 0.0028727486440677207, "loss": 7.7047, "step": 735600 }, { "epoch": 2.997080118017413, "grad_norm": 3.483187437057495, "learning_rate": 0.002872257961950565, "loss": 7.6908, "step": 735700 }, { "epoch": 2.9974874960407947, "grad_norm": 2.671337366104126, "learning_rate": 0.0028717672653137375, "loss": 7.6875, "step": 735800 }, { "epoch": 2.997894874064176, "grad_norm": 5.355167388916016, "learning_rate": 0.0028712765541766424, "loss": 7.6624, "step": 735900 }, { "epoch": 2.9983022520875577, "grad_norm": 6.177648544311523, "learning_rate": 0.0028707858285586725, "loss": 7.691, "step": 736000 }, { "epoch": 2.9983022520875577, "eval_MaskedAccuracy": 0.5055798926796584, "eval_loss": 1.6123625040054321, "eval_runtime": 149.3746, "eval_samples_per_second": 424.945, "eval_steps_per_second": 1.66, "step": 736000 }, { "epoch": 2.9987096301109393, "grad_norm": 5.381433486938477, "learning_rate": 0.0028702950884792336, "loss": 7.693, "step": 736100 }, { "epoch": 2.999117008134321, "grad_norm": 3.4278206825256348, "learning_rate": 0.0028698043339577257, "loss": 7.6852, "step": 736200 }, { "epoch": 2.9995243861577023, "grad_norm": 5.128978252410889, "learning_rate": 0.0028693135650135534, "loss": 7.6878, "step": 736300 }, { "epoch": 2.999931764181084, "grad_norm": 5.375679016113281, "learning_rate": 0.0028688227816661164, "loss": 7.6914, "step": 736400 }, { "epoch": 3.000339142204465, "grad_norm": 4.731819152832031, "learning_rate": 0.002868331983934817, "loss": 7.7202, "step": 736500 }, { "epoch": 3.0007465202278465, "grad_norm": 3.3854033946990967, "learning_rate": 0.00286784117183906, "loss": 7.7265, "step": 736600 }, { "epoch": 3.001153898251228, "grad_norm": 5.151326656341553, "learning_rate": 0.0028673503453982515, "loss": 7.6926, "step": 736700 }, { "epoch": 3.0015612762746096, "grad_norm": 3.308462619781494, "learning_rate": 0.002866859504631797, "loss": 7.6979, "step": 736800 }, { "epoch": 3.001968654297991, "grad_norm": 4.403634548187256, "learning_rate": 0.002866368649559097, "loss": 7.7056, "step": 736900 }, { "epoch": 3.002376032321372, "grad_norm": 9.887831687927246, "learning_rate": 0.0028658777801995627, "loss": 7.7114, "step": 737000 }, { "epoch": 3.002376032321372, "eval_MaskedAccuracy": 0.505611609193833, "eval_loss": 1.6212120056152344, "eval_runtime": 149.3676, "eval_samples_per_second": 424.965, "eval_steps_per_second": 1.66, "step": 737000 }, { "epoch": 3.0027834103447537, "grad_norm": 4.206628799438477, "learning_rate": 0.0028653868965725997, "loss": 7.6915, "step": 737100 }, { "epoch": 3.0031907883681352, "grad_norm": 4.031418323516846, "learning_rate": 0.0028648959986976195, "loss": 7.7405, "step": 737200 }, { "epoch": 3.0035981663915168, "grad_norm": 4.3671746253967285, "learning_rate": 0.002864405086594023, "loss": 7.7004, "step": 737300 }, { "epoch": 3.0040055444148983, "grad_norm": 4.248547077178955, "learning_rate": 0.0028639141602812205, "loss": 7.6832, "step": 737400 }, { "epoch": 3.00441292243828, "grad_norm": 4.220926761627197, "learning_rate": 0.002863423219778623, "loss": 7.6934, "step": 737500 }, { "epoch": 3.004820300461661, "grad_norm": 10.080137252807617, "learning_rate": 0.0028629322651056413, "loss": 7.7249, "step": 737600 }, { "epoch": 3.0052276784850425, "grad_norm": 5.022657871246338, "learning_rate": 0.002862441296281681, "loss": 7.6981, "step": 737700 }, { "epoch": 3.005635056508424, "grad_norm": 3.362440347671509, "learning_rate": 0.0028619503133261543, "loss": 7.724, "step": 737800 }, { "epoch": 3.0060424345318055, "grad_norm": 6.001651287078857, "learning_rate": 0.00286145931625847, "loss": 7.6872, "step": 737900 }, { "epoch": 3.006449812555187, "grad_norm": 3.4034435749053955, "learning_rate": 0.0028609683050980477, "loss": 7.7149, "step": 738000 }, { "epoch": 3.006449812555187, "eval_MaskedAccuracy": 0.5058157231836666, "eval_loss": 1.6164454221725464, "eval_runtime": 148.4915, "eval_samples_per_second": 427.472, "eval_steps_per_second": 1.67, "step": 738000 }, { "epoch": 3.0068571905785686, "grad_norm": 6.026236534118652, "learning_rate": 0.00286047727986429, "loss": 7.6771, "step": 738100 }, { "epoch": 3.00726456860195, "grad_norm": 4.825756072998047, "learning_rate": 0.0028599862405766143, "loss": 7.7141, "step": 738200 }, { "epoch": 3.007671946625331, "grad_norm": 3.3585920333862305, "learning_rate": 0.0028594951872544343, "loss": 7.6962, "step": 738300 }, { "epoch": 3.0080793246487127, "grad_norm": 4.888416290283203, "learning_rate": 0.0028590041199171635, "loss": 7.6803, "step": 738400 }, { "epoch": 3.0084867026720943, "grad_norm": 4.891050815582275, "learning_rate": 0.002858513038584217, "loss": 7.7202, "step": 738500 }, { "epoch": 3.008894080695476, "grad_norm": 3.14123797416687, "learning_rate": 0.0028580219432750094, "loss": 7.7039, "step": 738600 }, { "epoch": 3.0093014587188573, "grad_norm": 3.5715818405151367, "learning_rate": 0.002857530834008957, "loss": 7.7042, "step": 738700 }, { "epoch": 3.009708836742239, "grad_norm": 5.466462135314941, "learning_rate": 0.002857039710805476, "loss": 7.7214, "step": 738800 }, { "epoch": 3.01011621476562, "grad_norm": 5.804788112640381, "learning_rate": 0.002856548573683981, "loss": 7.7363, "step": 738900 }, { "epoch": 3.0105235927890015, "grad_norm": 8.0160493850708, "learning_rate": 0.0028560574226638917, "loss": 7.7206, "step": 739000 }, { "epoch": 3.0105235927890015, "eval_MaskedAccuracy": 0.5056501302084144, "eval_loss": 1.6164095401763916, "eval_runtime": 149.2281, "eval_samples_per_second": 425.362, "eval_steps_per_second": 1.662, "step": 739000 }, { "epoch": 3.010930970812383, "grad_norm": 3.4310660362243652, "learning_rate": 0.002855566257764624, "loss": 7.7148, "step": 739100 }, { "epoch": 3.0113383488357646, "grad_norm": 5.341206073760986, "learning_rate": 0.0028550750790055975, "loss": 7.6797, "step": 739200 }, { "epoch": 3.011745726859146, "grad_norm": 4.394791603088379, "learning_rate": 0.0028545838864062318, "loss": 7.6749, "step": 739300 }, { "epoch": 3.0121531048825276, "grad_norm": 4.237307548522949, "learning_rate": 0.002854092679985943, "loss": 7.7006, "step": 739400 }, { "epoch": 3.0125604829059087, "grad_norm": 2.5844573974609375, "learning_rate": 0.002853601459764154, "loss": 7.6835, "step": 739500 }, { "epoch": 3.0129678609292903, "grad_norm": 5.240605354309082, "learning_rate": 0.002853110225760285, "loss": 7.6913, "step": 739600 }, { "epoch": 3.013375238952672, "grad_norm": 8.880341529846191, "learning_rate": 0.002852618977993756, "loss": 7.6997, "step": 739700 }, { "epoch": 3.0137826169760533, "grad_norm": 5.894056797027588, "learning_rate": 0.0028521277164839922, "loss": 7.6919, "step": 739800 }, { "epoch": 3.014189994999435, "grad_norm": 2.679236650466919, "learning_rate": 0.0028516364412504132, "loss": 7.6828, "step": 739900 }, { "epoch": 3.0145973730228164, "grad_norm": 3.015913724899292, "learning_rate": 0.002851145152312444, "loss": 7.7108, "step": 740000 }, { "epoch": 3.0145973730228164, "eval_MaskedAccuracy": 0.5061662086205887, "eval_loss": 1.6264451742172241, "eval_runtime": 149.7792, "eval_samples_per_second": 423.797, "eval_steps_per_second": 1.656, "step": 740000 }, { "epoch": 3.0150047510461975, "grad_norm": 4.6664228439331055, "learning_rate": 0.0028506538496895, "loss": 7.6873, "step": 740100 }, { "epoch": 3.015412129069579, "grad_norm": 3.0084333419799805, "learning_rate": 0.0028501625334010137, "loss": 7.7015, "step": 740200 }, { "epoch": 3.0158195070929605, "grad_norm": 6.42466926574707, "learning_rate": 0.002849671203466406, "loss": 7.6992, "step": 740300 }, { "epoch": 3.016226885116342, "grad_norm": 5.211976528167725, "learning_rate": 0.002849179859905102, "loss": 7.6955, "step": 740400 }, { "epoch": 3.0166342631397236, "grad_norm": 8.872745513916016, "learning_rate": 0.002848688502736526, "loss": 7.7122, "step": 740500 }, { "epoch": 3.017041641163105, "grad_norm": 5.2570695877075195, "learning_rate": 0.0028481971319801076, "loss": 7.7019, "step": 740600 }, { "epoch": 3.0174490191864867, "grad_norm": 6.204245090484619, "learning_rate": 0.00284770574765527, "loss": 7.7023, "step": 740700 }, { "epoch": 3.0178563972098678, "grad_norm": 4.359434604644775, "learning_rate": 0.00284721434978144, "loss": 7.6523, "step": 740800 }, { "epoch": 3.0182637752332493, "grad_norm": 8.607190132141113, "learning_rate": 0.002846722938378047, "loss": 7.7324, "step": 740900 }, { "epoch": 3.018671153256631, "grad_norm": 5.038397789001465, "learning_rate": 0.0028462315134645177, "loss": 7.6877, "step": 741000 }, { "epoch": 3.018671153256631, "eval_MaskedAccuracy": 0.5055186695738466, "eval_loss": 1.620163917541504, "eval_runtime": 150.7644, "eval_samples_per_second": 421.028, "eval_steps_per_second": 1.645, "step": 741000 }, { "epoch": 3.0190785312800124, "grad_norm": 8.162409782409668, "learning_rate": 0.002845740075060283, "loss": 7.6638, "step": 741100 }, { "epoch": 3.019485909303394, "grad_norm": 5.22674036026001, "learning_rate": 0.002845248623184771, "loss": 7.6913, "step": 741200 }, { "epoch": 3.0198932873267754, "grad_norm": 7.2996907234191895, "learning_rate": 0.00284475715785741, "loss": 7.6909, "step": 741300 }, { "epoch": 3.0203006653501565, "grad_norm": 4.533921718597412, "learning_rate": 0.0028442656790976305, "loss": 7.6845, "step": 741400 }, { "epoch": 3.020708043373538, "grad_norm": 7.809845447540283, "learning_rate": 0.0028437741869248656, "loss": 7.6783, "step": 741500 }, { "epoch": 3.0211154213969196, "grad_norm": 3.4706003665924072, "learning_rate": 0.002843282681358549, "loss": 7.6951, "step": 741600 }, { "epoch": 3.021522799420301, "grad_norm": 2.5168254375457764, "learning_rate": 0.0028427911624181066, "loss": 7.6831, "step": 741700 }, { "epoch": 3.0219301774436826, "grad_norm": 3.419708728790283, "learning_rate": 0.00284229963012297, "loss": 7.6986, "step": 741800 }, { "epoch": 3.022337555467064, "grad_norm": 5.101961135864258, "learning_rate": 0.002841808084492575, "loss": 7.7074, "step": 741900 }, { "epoch": 3.0227449334904453, "grad_norm": 3.0421252250671387, "learning_rate": 0.0028413165255463564, "loss": 7.6789, "step": 742000 }, { "epoch": 3.0227449334904453, "eval_MaskedAccuracy": 0.5056500423693558, "eval_loss": 1.6271016597747803, "eval_runtime": 151.2846, "eval_samples_per_second": 419.58, "eval_steps_per_second": 1.639, "step": 742000 }, { "epoch": 3.023152311513827, "grad_norm": 3.4731786251068115, "learning_rate": 0.002840824953303746, "loss": 7.6852, "step": 742100 }, { "epoch": 3.0235596895372083, "grad_norm": 3.8895857334136963, "learning_rate": 0.002840333367784181, "loss": 7.7051, "step": 742200 }, { "epoch": 3.02396706756059, "grad_norm": 5.0788493156433105, "learning_rate": 0.0028398417690070977, "loss": 7.7234, "step": 742300 }, { "epoch": 3.0243744455839714, "grad_norm": 4.581942558288574, "learning_rate": 0.002839350156991924, "loss": 7.693, "step": 742400 }, { "epoch": 3.024781823607353, "grad_norm": 2.6827476024627686, "learning_rate": 0.002838858531758098, "loss": 7.7087, "step": 742500 }, { "epoch": 3.025189201630734, "grad_norm": 3.306877613067627, "learning_rate": 0.002838366893325062, "loss": 7.7229, "step": 742600 }, { "epoch": 3.0255965796541155, "grad_norm": 4.539236545562744, "learning_rate": 0.0028378752417122493, "loss": 7.6909, "step": 742700 }, { "epoch": 3.026003957677497, "grad_norm": 3.317619562149048, "learning_rate": 0.0028373835769391, "loss": 7.6756, "step": 742800 }, { "epoch": 3.0264113357008786, "grad_norm": 6.079135417938232, "learning_rate": 0.002836891899025048, "loss": 7.7073, "step": 742900 }, { "epoch": 3.02681871372426, "grad_norm": 8.416139602661133, "learning_rate": 0.002836400207989534, "loss": 7.693, "step": 743000 }, { "epoch": 3.02681871372426, "eval_MaskedAccuracy": 0.5055632710561418, "eval_loss": 1.6125744581222534, "eval_runtime": 152.4589, "eval_samples_per_second": 416.348, "eval_steps_per_second": 1.627, "step": 743000 }, { "epoch": 3.0272260917476417, "grad_norm": 7.413698673248291, "learning_rate": 0.0028359085038519977, "loss": 7.6778, "step": 743100 }, { "epoch": 3.027633469771023, "grad_norm": 6.549065113067627, "learning_rate": 0.0028354167866318776, "loss": 7.6899, "step": 743200 }, { "epoch": 3.0280408477944043, "grad_norm": 5.39340353012085, "learning_rate": 0.0028349250563486157, "loss": 7.7047, "step": 743300 }, { "epoch": 3.028448225817786, "grad_norm": 5.896380424499512, "learning_rate": 0.0028344333130216506, "loss": 7.7133, "step": 743400 }, { "epoch": 3.0288556038411674, "grad_norm": 6.32306432723999, "learning_rate": 0.0028339415566704246, "loss": 7.7131, "step": 743500 }, { "epoch": 3.029262981864549, "grad_norm": 3.8068430423736572, "learning_rate": 0.0028334497873143795, "loss": 7.7373, "step": 743600 }, { "epoch": 3.0296703598879304, "grad_norm": 10.210831642150879, "learning_rate": 0.0028329580049729595, "loss": 7.6972, "step": 743700 }, { "epoch": 3.030077737911312, "grad_norm": 10.17298412322998, "learning_rate": 0.0028324662096656044, "loss": 7.7149, "step": 743800 }, { "epoch": 3.030485115934693, "grad_norm": 4.248089790344238, "learning_rate": 0.002831974401411758, "loss": 7.6869, "step": 743900 }, { "epoch": 3.0308924939580746, "grad_norm": 3.900387763977051, "learning_rate": 0.0028314825802308657, "loss": 7.7162, "step": 744000 }, { "epoch": 3.0308924939580746, "eval_MaskedAccuracy": 0.5062941364730348, "eval_loss": 1.6123100519180298, "eval_runtime": 154.1937, "eval_samples_per_second": 411.664, "eval_steps_per_second": 1.608, "step": 744000 }, { "epoch": 3.031299871981456, "grad_norm": 3.7343649864196777, "learning_rate": 0.0028309907461423704, "loss": 7.7208, "step": 744100 }, { "epoch": 3.0317072500048377, "grad_norm": 3.5612926483154297, "learning_rate": 0.0028304988991657153, "loss": 7.6932, "step": 744200 }, { "epoch": 3.032114628028219, "grad_norm": 6.879009246826172, "learning_rate": 0.0028300070393203476, "loss": 7.7268, "step": 744300 }, { "epoch": 3.0325220060516007, "grad_norm": 4.398885726928711, "learning_rate": 0.002829515166625715, "loss": 7.7104, "step": 744400 }, { "epoch": 3.032929384074982, "grad_norm": 4.473330974578857, "learning_rate": 0.002829023281101265, "loss": 7.7114, "step": 744500 }, { "epoch": 3.0333367620983633, "grad_norm": 4.854863166809082, "learning_rate": 0.0028285313827664384, "loss": 7.7086, "step": 744600 }, { "epoch": 3.033744140121745, "grad_norm": 6.374593257904053, "learning_rate": 0.0028280394716406884, "loss": 7.7386, "step": 744700 }, { "epoch": 3.0341515181451264, "grad_norm": 3.94598126411438, "learning_rate": 0.0028275475477434627, "loss": 7.7082, "step": 744800 }, { "epoch": 3.034558896168508, "grad_norm": 4.2861409187316895, "learning_rate": 0.0028270556110942045, "loss": 7.659, "step": 744900 }, { "epoch": 3.0349662741918895, "grad_norm": 4.4543023109436035, "learning_rate": 0.0028265636617123653, "loss": 7.7032, "step": 745000 }, { "epoch": 3.0349662741918895, "eval_MaskedAccuracy": 0.5057628963797117, "eval_loss": 1.6194565296173096, "eval_runtime": 151.1422, "eval_samples_per_second": 419.975, "eval_steps_per_second": 1.641, "step": 745000 }, { "epoch": 3.0353736522152706, "grad_norm": 5.848895072937012, "learning_rate": 0.002826071699617396, "loss": 7.6823, "step": 745100 }, { "epoch": 3.035781030238652, "grad_norm": 5.476065635681152, "learning_rate": 0.002825579724828742, "loss": 7.7301, "step": 745200 }, { "epoch": 3.0361884082620336, "grad_norm": 9.30887222290039, "learning_rate": 0.0028250877373658563, "loss": 7.7132, "step": 745300 }, { "epoch": 3.036595786285415, "grad_norm": 5.337265968322754, "learning_rate": 0.0028245957372481927, "loss": 7.6611, "step": 745400 }, { "epoch": 3.0370031643087967, "grad_norm": 3.821563243865967, "learning_rate": 0.0028241037244951996, "loss": 7.7002, "step": 745500 }, { "epoch": 3.037410542332178, "grad_norm": 7.907029151916504, "learning_rate": 0.0028236116991263295, "loss": 7.6754, "step": 745600 }, { "epoch": 3.0378179203555598, "grad_norm": 9.829130172729492, "learning_rate": 0.0028231196611610363, "loss": 7.7387, "step": 745700 }, { "epoch": 3.038225298378941, "grad_norm": 4.472766399383545, "learning_rate": 0.0028226276106187685, "loss": 7.6909, "step": 745800 }, { "epoch": 3.0386326764023224, "grad_norm": 8.586014747619629, "learning_rate": 0.0028221355475189845, "loss": 7.6959, "step": 745900 }, { "epoch": 3.039040054425704, "grad_norm": 3.3005423545837402, "learning_rate": 0.002821643471881136, "loss": 7.7071, "step": 746000 }, { "epoch": 3.039040054425704, "eval_MaskedAccuracy": 0.5054460142927345, "eval_loss": 1.6223331689834595, "eval_runtime": 151.3643, "eval_samples_per_second": 419.359, "eval_steps_per_second": 1.638, "step": 746000 }, { "epoch": 3.0394474324490854, "grad_norm": 7.866025924682617, "learning_rate": 0.0028211513837246766, "loss": 7.7215, "step": 746100 }, { "epoch": 3.039854810472467, "grad_norm": 12.360164642333984, "learning_rate": 0.0028206592830690626, "loss": 7.6792, "step": 746200 }, { "epoch": 3.0402621884958485, "grad_norm": 5.763864040374756, "learning_rate": 0.0028201671699337504, "loss": 7.7254, "step": 746300 }, { "epoch": 3.0406695665192296, "grad_norm": 4.453338623046875, "learning_rate": 0.002819675044338191, "loss": 7.6742, "step": 746400 }, { "epoch": 3.041076944542611, "grad_norm": 10.988475799560547, "learning_rate": 0.002819182906301844, "loss": 7.7044, "step": 746500 }, { "epoch": 3.0414843225659927, "grad_norm": 3.7080917358398438, "learning_rate": 0.002818690755844169, "loss": 7.71, "step": 746600 }, { "epoch": 3.041891700589374, "grad_norm": 7.433635234832764, "learning_rate": 0.0028181985929846215, "loss": 7.6761, "step": 746700 }, { "epoch": 3.0422990786127557, "grad_norm": 6.572902202606201, "learning_rate": 0.002817706417742658, "loss": 7.6797, "step": 746800 }, { "epoch": 3.0427064566361373, "grad_norm": 3.7825119495391846, "learning_rate": 0.0028172142301377398, "loss": 7.6901, "step": 746900 }, { "epoch": 3.0431138346595183, "grad_norm": 7.348376274108887, "learning_rate": 0.0028167220301893196, "loss": 7.6862, "step": 747000 }, { "epoch": 3.0431138346595183, "eval_MaskedAccuracy": 0.5059655368405178, "eval_loss": 1.6155526638031006, "eval_runtime": 163.344, "eval_samples_per_second": 388.603, "eval_steps_per_second": 1.518, "step": 747000 }, { "epoch": 3.0435212126829, "grad_norm": 4.178409576416016, "learning_rate": 0.0028162298179168588, "loss": 7.7009, "step": 747100 }, { "epoch": 3.0439285907062814, "grad_norm": 9.232433319091797, "learning_rate": 0.002815737593339823, "loss": 7.7132, "step": 747200 }, { "epoch": 3.044335968729663, "grad_norm": 5.816783905029297, "learning_rate": 0.002815245356477669, "loss": 7.6965, "step": 747300 }, { "epoch": 3.0447433467530445, "grad_norm": 4.057966232299805, "learning_rate": 0.0028147531073498535, "loss": 7.7127, "step": 747400 }, { "epoch": 3.045150724776426, "grad_norm": 4.916996479034424, "learning_rate": 0.002814260845975841, "loss": 7.6831, "step": 747500 }, { "epoch": 3.045558102799807, "grad_norm": 2.2980215549468994, "learning_rate": 0.0028137685723750943, "loss": 7.7005, "step": 747600 }, { "epoch": 3.0459654808231886, "grad_norm": 4.099763870239258, "learning_rate": 0.0028132762865670734, "loss": 7.7293, "step": 747700 }, { "epoch": 3.04637285884657, "grad_norm": 3.3498318195343018, "learning_rate": 0.002812783988571241, "loss": 7.6964, "step": 747800 }, { "epoch": 3.0467802368699517, "grad_norm": 4.06113862991333, "learning_rate": 0.0028122916784070607, "loss": 7.6875, "step": 747900 }, { "epoch": 3.0471876148933332, "grad_norm": 4.0826239585876465, "learning_rate": 0.002811799356093999, "loss": 7.7025, "step": 748000 }, { "epoch": 3.0471876148933332, "eval_MaskedAccuracy": 0.5053813683576247, "eval_loss": 1.6258487701416016, "eval_runtime": 152.3978, "eval_samples_per_second": 416.515, "eval_steps_per_second": 1.627, "step": 748000 }, { "epoch": 3.0475949929167148, "grad_norm": 4.380719184875488, "learning_rate": 0.0028113070216515173, "loss": 7.7252, "step": 748100 }, { "epoch": 3.0480023709400963, "grad_norm": 17.466968536376953, "learning_rate": 0.002810814675099083, "loss": 7.7, "step": 748200 }, { "epoch": 3.0484097489634774, "grad_norm": 3.0531888008117676, "learning_rate": 0.0028103223164561574, "loss": 7.7112, "step": 748300 }, { "epoch": 3.048817126986859, "grad_norm": 5.355355262756348, "learning_rate": 0.0028098299457422074, "loss": 7.7003, "step": 748400 }, { "epoch": 3.0492245050102404, "grad_norm": 7.948986530303955, "learning_rate": 0.0028093375629766994, "loss": 7.7229, "step": 748500 }, { "epoch": 3.049631883033622, "grad_norm": 4.31436824798584, "learning_rate": 0.0028088451681790986, "loss": 7.6893, "step": 748600 }, { "epoch": 3.0500392610570035, "grad_norm": 6.25513219833374, "learning_rate": 0.0028083527613688743, "loss": 7.7159, "step": 748700 }, { "epoch": 3.050446639080385, "grad_norm": 5.565917491912842, "learning_rate": 0.0028078603425654904, "loss": 7.6689, "step": 748800 }, { "epoch": 3.050854017103766, "grad_norm": 3.3782989978790283, "learning_rate": 0.0028073679117884202, "loss": 7.7223, "step": 748900 }, { "epoch": 3.0512613951271477, "grad_norm": 3.9914166927337646, "learning_rate": 0.002806875469057129, "loss": 7.6964, "step": 749000 }, { "epoch": 3.0512613951271477, "eval_MaskedAccuracy": 0.506501001429651, "eval_loss": 1.6175804138183594, "eval_runtime": 150.5201, "eval_samples_per_second": 421.711, "eval_steps_per_second": 1.648, "step": 749000 }, { "epoch": 3.051668773150529, "grad_norm": 2.5959930419921875, "learning_rate": 0.0028063830143910864, "loss": 7.7105, "step": 749100 }, { "epoch": 3.0520761511739107, "grad_norm": 5.61108922958374, "learning_rate": 0.0028058905478097595, "loss": 7.6973, "step": 749200 }, { "epoch": 3.0524835291972923, "grad_norm": 3.186306953430176, "learning_rate": 0.0028053980693326223, "loss": 7.7117, "step": 749300 }, { "epoch": 3.052890907220674, "grad_norm": 6.981227397918701, "learning_rate": 0.0028049055789791427, "loss": 7.7288, "step": 749400 }, { "epoch": 3.053298285244055, "grad_norm": 5.489975929260254, "learning_rate": 0.0028044130767687914, "loss": 7.7296, "step": 749500 }, { "epoch": 3.0537056632674364, "grad_norm": 5.899916172027588, "learning_rate": 0.002803920562721041, "loss": 7.7182, "step": 749600 }, { "epoch": 3.054113041290818, "grad_norm": 2.500195026397705, "learning_rate": 0.0028034280368553583, "loss": 7.7013, "step": 749700 }, { "epoch": 3.0545204193141995, "grad_norm": 6.336365699768066, "learning_rate": 0.0028029354991912204, "loss": 7.6773, "step": 749800 }, { "epoch": 3.054927797337581, "grad_norm": 8.86232852935791, "learning_rate": 0.0028024429497481023, "loss": 7.697, "step": 749900 }, { "epoch": 3.0553351753609626, "grad_norm": 8.056133270263672, "learning_rate": 0.002801950388545474, "loss": 7.7169, "step": 750000 }, { "epoch": 3.0553351753609626, "eval_MaskedAccuracy": 0.5053425370105782, "eval_loss": 1.6197924613952637, "eval_runtime": 151.6657, "eval_samples_per_second": 418.526, "eval_steps_per_second": 1.635, "step": 750000 }, { "epoch": 3.0557425533843436, "grad_norm": 5.945562362670898, "learning_rate": 0.0028014578156028043, "loss": 7.7112, "step": 750100 }, { "epoch": 3.056149931407725, "grad_norm": 4.3121657371521, "learning_rate": 0.0028009652309395716, "loss": 7.6802, "step": 750200 }, { "epoch": 3.0565573094311067, "grad_norm": 6.809252738952637, "learning_rate": 0.0028004726345752513, "loss": 7.7059, "step": 750300 }, { "epoch": 3.0569646874544882, "grad_norm": 6.637779712677002, "learning_rate": 0.002799980026529318, "loss": 7.6848, "step": 750400 }, { "epoch": 3.0573720654778698, "grad_norm": 5.115314483642578, "learning_rate": 0.0027994874068212436, "loss": 7.6937, "step": 750500 }, { "epoch": 3.0577794435012513, "grad_norm": 3.4024839401245117, "learning_rate": 0.0027989947754705094, "loss": 7.6794, "step": 750600 }, { "epoch": 3.058186821524633, "grad_norm": 7.3663482666015625, "learning_rate": 0.002798502132496588, "loss": 7.6701, "step": 750700 }, { "epoch": 3.058594199548014, "grad_norm": 4.044422626495361, "learning_rate": 0.0027980094779189585, "loss": 7.6958, "step": 750800 }, { "epoch": 3.0590015775713955, "grad_norm": 2.6503303050994873, "learning_rate": 0.002797516811757099, "loss": 7.7023, "step": 750900 }, { "epoch": 3.059408955594777, "grad_norm": 8.92728042602539, "learning_rate": 0.002797024134030482, "loss": 7.6873, "step": 751000 }, { "epoch": 3.059408955594777, "eval_MaskedAccuracy": 0.5059805377342768, "eval_loss": 1.624422550201416, "eval_runtime": 151.4307, "eval_samples_per_second": 419.175, "eval_steps_per_second": 1.638, "step": 751000 }, { "epoch": 3.0598163336181585, "grad_norm": 6.3868818283081055, "learning_rate": 0.0027965314447585925, "loss": 7.711, "step": 751100 }, { "epoch": 3.06022371164154, "grad_norm": 7.902709484100342, "learning_rate": 0.0027960387439609036, "loss": 7.7238, "step": 751200 }, { "epoch": 3.0606310896649216, "grad_norm": 5.930760383605957, "learning_rate": 0.002795546031656899, "loss": 7.6997, "step": 751300 }, { "epoch": 3.0610384676883027, "grad_norm": 3.6389553546905518, "learning_rate": 0.0027950533078660534, "loss": 7.7025, "step": 751400 }, { "epoch": 3.061445845711684, "grad_norm": 2.4691882133483887, "learning_rate": 0.00279456057260785, "loss": 7.6794, "step": 751500 }, { "epoch": 3.0618532237350657, "grad_norm": 5.3170366287231445, "learning_rate": 0.0027940678259017705, "loss": 7.7213, "step": 751600 }, { "epoch": 3.0622606017584473, "grad_norm": 4.524278163909912, "learning_rate": 0.00279357506776729, "loss": 7.7152, "step": 751700 }, { "epoch": 3.062667979781829, "grad_norm": 5.752650260925293, "learning_rate": 0.002793082298223896, "loss": 7.6986, "step": 751800 }, { "epoch": 3.0630753578052103, "grad_norm": 4.981680870056152, "learning_rate": 0.002792589517291067, "loss": 7.7098, "step": 751900 }, { "epoch": 3.0634827358285914, "grad_norm": 3.780088186264038, "learning_rate": 0.0027920967249882873, "loss": 7.7109, "step": 752000 }, { "epoch": 3.0634827358285914, "eval_MaskedAccuracy": 0.5055275134090221, "eval_loss": 1.6219958066940308, "eval_runtime": 159.1803, "eval_samples_per_second": 398.768, "eval_steps_per_second": 1.558, "step": 752000 }, { "epoch": 3.063890113851973, "grad_norm": 5.787014007568359, "learning_rate": 0.0027916039213350397, "loss": 7.6746, "step": 752100 }, { "epoch": 3.0642974918753545, "grad_norm": 7.504022121429443, "learning_rate": 0.0027911111063508025, "loss": 7.6924, "step": 752200 }, { "epoch": 3.064704869898736, "grad_norm": 5.298858642578125, "learning_rate": 0.0027906182800550637, "loss": 7.6964, "step": 752300 }, { "epoch": 3.0651122479221176, "grad_norm": 2.5735104084014893, "learning_rate": 0.002790125442467312, "loss": 7.7175, "step": 752400 }, { "epoch": 3.065519625945499, "grad_norm": 5.3041558265686035, "learning_rate": 0.0027896325936070214, "loss": 7.7047, "step": 752500 }, { "epoch": 3.06592700396888, "grad_norm": 6.335773944854736, "learning_rate": 0.0027891397334936833, "loss": 7.7084, "step": 752600 }, { "epoch": 3.0663343819922617, "grad_norm": 3.198148012161255, "learning_rate": 0.002788646862146781, "loss": 7.701, "step": 752700 }, { "epoch": 3.0667417600156432, "grad_norm": 9.311052322387695, "learning_rate": 0.002788153979585801, "loss": 7.72, "step": 752800 }, { "epoch": 3.067149138039025, "grad_norm": 4.54910945892334, "learning_rate": 0.0027876610858302304, "loss": 7.6623, "step": 752900 }, { "epoch": 3.0675565160624063, "grad_norm": 5.994970321655273, "learning_rate": 0.0027871681808995566, "loss": 7.7136, "step": 753000 }, { "epoch": 3.0675565160624063, "eval_MaskedAccuracy": 0.5054040414939257, "eval_loss": 1.6254481077194214, "eval_runtime": 183.8547, "eval_samples_per_second": 345.251, "eval_steps_per_second": 1.349, "step": 753000 }, { "epoch": 3.067963894085788, "grad_norm": 7.91969633102417, "learning_rate": 0.0027866752648132657, "loss": 7.7479, "step": 753100 }, { "epoch": 3.068371272109169, "grad_norm": 6.5430731773376465, "learning_rate": 0.0027861823375908445, "loss": 7.7175, "step": 753200 }, { "epoch": 3.0687786501325505, "grad_norm": 3.526317596435547, "learning_rate": 0.002785689399251782, "loss": 7.7156, "step": 753300 }, { "epoch": 3.069186028155932, "grad_norm": 5.19420862197876, "learning_rate": 0.0027851964498155656, "loss": 7.6909, "step": 753400 }, { "epoch": 3.0695934061793135, "grad_norm": 8.49240493774414, "learning_rate": 0.002784703489301686, "loss": 7.7331, "step": 753500 }, { "epoch": 3.070000784202695, "grad_norm": 3.6290364265441895, "learning_rate": 0.0027842105177296314, "loss": 7.7022, "step": 753600 }, { "epoch": 3.0704081622260766, "grad_norm": 9.527637481689453, "learning_rate": 0.002783717535118892, "loss": 7.7132, "step": 753700 }, { "epoch": 3.070815540249458, "grad_norm": 11.094533920288086, "learning_rate": 0.0027832245414889578, "loss": 7.7039, "step": 753800 }, { "epoch": 3.071222918272839, "grad_norm": 5.062388896942139, "learning_rate": 0.0027827315368593158, "loss": 7.7081, "step": 753900 }, { "epoch": 3.0716302962962208, "grad_norm": 4.398228645324707, "learning_rate": 0.002782238521249465, "loss": 7.7018, "step": 754000 }, { "epoch": 3.0716302962962208, "eval_MaskedAccuracy": 0.5057051822216122, "eval_loss": 1.6179014444351196, "eval_runtime": 150.941, "eval_samples_per_second": 420.535, "eval_steps_per_second": 1.643, "step": 754000 }, { "epoch": 3.0720376743196023, "grad_norm": 4.103709697723389, "learning_rate": 0.0027817454946788894, "loss": 7.7045, "step": 754100 }, { "epoch": 3.072445052342984, "grad_norm": 7.122503757476807, "learning_rate": 0.0027812524571670847, "loss": 7.7191, "step": 754200 }, { "epoch": 3.0728524303663654, "grad_norm": 2.7399961948394775, "learning_rate": 0.0027807594087335425, "loss": 7.7057, "step": 754300 }, { "epoch": 3.073259808389747, "grad_norm": 3.4263041019439697, "learning_rate": 0.0027802663493977546, "loss": 7.7328, "step": 754400 }, { "epoch": 3.073667186413128, "grad_norm": 2.858315944671631, "learning_rate": 0.0027797732791792202, "loss": 7.7174, "step": 754500 }, { "epoch": 3.0740745644365095, "grad_norm": 10.019489288330078, "learning_rate": 0.002779280198097426, "loss": 7.7021, "step": 754600 }, { "epoch": 3.074481942459891, "grad_norm": 6.572674751281738, "learning_rate": 0.002778787106171868, "loss": 7.743, "step": 754700 }, { "epoch": 3.0748893204832726, "grad_norm": 3.3612263202667236, "learning_rate": 0.002778294003422041, "loss": 7.7244, "step": 754800 }, { "epoch": 3.075296698506654, "grad_norm": 2.9108200073242188, "learning_rate": 0.002777800889867441, "loss": 7.7064, "step": 754900 }, { "epoch": 3.0757040765300356, "grad_norm": 7.32778787612915, "learning_rate": 0.0027773077655275646, "loss": 7.7206, "step": 755000 }, { "epoch": 3.0757040765300356, "eval_MaskedAccuracy": 0.50512931439093, "eval_loss": 1.6299889087677002, "eval_runtime": 150.4053, "eval_samples_per_second": 422.033, "eval_steps_per_second": 1.649, "step": 755000 }, { "epoch": 3.0761114545534167, "grad_norm": 4.767580509185791, "learning_rate": 0.0027768146304219046, "loss": 7.6982, "step": 755100 }, { "epoch": 3.0765188325767983, "grad_norm": 3.6486380100250244, "learning_rate": 0.002776321484569961, "loss": 7.706, "step": 755200 }, { "epoch": 3.07692621060018, "grad_norm": 4.942773342132568, "learning_rate": 0.0027758283279912255, "loss": 7.6904, "step": 755300 }, { "epoch": 3.0773335886235613, "grad_norm": 3.760831832885742, "learning_rate": 0.0027753351607051977, "loss": 7.7028, "step": 755400 }, { "epoch": 3.077740966646943, "grad_norm": 7.592773914337158, "learning_rate": 0.0027748419827313734, "loss": 7.7309, "step": 755500 }, { "epoch": 3.0781483446703244, "grad_norm": 3.427999258041382, "learning_rate": 0.0027743487940892543, "loss": 7.7191, "step": 755600 }, { "epoch": 3.0785557226937055, "grad_norm": 4.776392459869385, "learning_rate": 0.002773855594798336, "loss": 7.7126, "step": 755700 }, { "epoch": 3.078963100717087, "grad_norm": 5.972330093383789, "learning_rate": 0.00277336238487812, "loss": 7.7021, "step": 755800 }, { "epoch": 3.0793704787404685, "grad_norm": 6.197175025939941, "learning_rate": 0.0027728691643481036, "loss": 7.7335, "step": 755900 }, { "epoch": 3.07977785676385, "grad_norm": 3.0436811447143555, "learning_rate": 0.0027723759332277846, "loss": 7.731, "step": 756000 }, { "epoch": 3.07977785676385, "eval_MaskedAccuracy": 0.505950147114793, "eval_loss": 1.6236408948898315, "eval_runtime": 155.3747, "eval_samples_per_second": 408.535, "eval_steps_per_second": 1.596, "step": 756000 }, { "epoch": 3.0801852347872316, "grad_norm": 6.094714641571045, "learning_rate": 0.0027718826915366663, "loss": 7.6971, "step": 756100 }, { "epoch": 3.080592612810613, "grad_norm": 4.8946146965026855, "learning_rate": 0.0027713894392942477, "loss": 7.7232, "step": 756200 }, { "epoch": 3.0809999908339947, "grad_norm": 4.392921447753906, "learning_rate": 0.002770896176520028, "loss": 7.7185, "step": 756300 }, { "epoch": 3.0814073688573758, "grad_norm": 8.929402351379395, "learning_rate": 0.0027704029032335088, "loss": 7.7144, "step": 756400 }, { "epoch": 3.0818147468807573, "grad_norm": 6.839146137237549, "learning_rate": 0.0027699096194541977, "loss": 7.7279, "step": 756500 }, { "epoch": 3.082222124904139, "grad_norm": 8.299478530883789, "learning_rate": 0.002769416325201593, "loss": 7.7313, "step": 756600 }, { "epoch": 3.0826295029275204, "grad_norm": 5.999608516693115, "learning_rate": 0.002768923020495192, "loss": 7.7132, "step": 756700 }, { "epoch": 3.083036880950902, "grad_norm": 2.693417549133301, "learning_rate": 0.002768429705354503, "loss": 7.7067, "step": 756800 }, { "epoch": 3.0834442589742834, "grad_norm": 4.526899814605713, "learning_rate": 0.0027679363797990285, "loss": 7.688, "step": 756900 }, { "epoch": 3.0838516369976645, "grad_norm": 4.774698257446289, "learning_rate": 0.002767443043848276, "loss": 7.7004, "step": 757000 }, { "epoch": 3.0838516369976645, "eval_MaskedAccuracy": 0.5057957761377743, "eval_loss": 1.6153463125228882, "eval_runtime": 154.1542, "eval_samples_per_second": 411.769, "eval_steps_per_second": 1.609, "step": 757000 }, { "epoch": 3.084259015021046, "grad_norm": 3.5766282081604004, "learning_rate": 0.002766949697521747, "loss": 7.7101, "step": 757100 }, { "epoch": 3.0846663930444276, "grad_norm": 6.593492031097412, "learning_rate": 0.0027664563408389474, "loss": 7.722, "step": 757200 }, { "epoch": 3.085073771067809, "grad_norm": 3.907104015350342, "learning_rate": 0.002765962973819378, "loss": 7.696, "step": 757300 }, { "epoch": 3.0854811490911906, "grad_norm": 2.727769136428833, "learning_rate": 0.002765469596482544, "loss": 7.6926, "step": 757400 }, { "epoch": 3.085888527114572, "grad_norm": 5.050406455993652, "learning_rate": 0.0027649762088479564, "loss": 7.6885, "step": 757500 }, { "epoch": 3.0862959051379533, "grad_norm": 3.762338876724243, "learning_rate": 0.0027644828109351194, "loss": 7.6738, "step": 757600 }, { "epoch": 3.086703283161335, "grad_norm": 2.7442774772644043, "learning_rate": 0.0027639894027635357, "loss": 7.654, "step": 757700 }, { "epoch": 3.0871106611847163, "grad_norm": 4.210478782653809, "learning_rate": 0.002763495984352718, "loss": 7.6858, "step": 757800 }, { "epoch": 3.087518039208098, "grad_norm": 2.4720959663391113, "learning_rate": 0.002763002555722172, "loss": 7.713, "step": 757900 }, { "epoch": 3.0879254172314794, "grad_norm": 7.692282199859619, "learning_rate": 0.002762509116891405, "loss": 7.7458, "step": 758000 }, { "epoch": 3.0879254172314794, "eval_MaskedAccuracy": 0.5056275767802583, "eval_loss": 1.6217570304870605, "eval_runtime": 151.3128, "eval_samples_per_second": 419.502, "eval_steps_per_second": 1.639, "step": 758000 }, { "epoch": 3.088332795254861, "grad_norm": 6.1380205154418945, "learning_rate": 0.002762015667879925, "loss": 7.6846, "step": 758100 }, { "epoch": 3.088740173278242, "grad_norm": 4.44442081451416, "learning_rate": 0.002761522208707242, "loss": 7.7066, "step": 758200 }, { "epoch": 3.0891475513016236, "grad_norm": 3.9880988597869873, "learning_rate": 0.002761028739392862, "loss": 7.6778, "step": 758300 }, { "epoch": 3.089554929325005, "grad_norm": 4.732944965362549, "learning_rate": 0.002760535259956298, "loss": 7.6766, "step": 758400 }, { "epoch": 3.0899623073483866, "grad_norm": 2.3220407962799072, "learning_rate": 0.0027600417704170587, "loss": 7.6579, "step": 758500 }, { "epoch": 3.090369685371768, "grad_norm": 4.528346538543701, "learning_rate": 0.0027595482707946546, "loss": 7.7114, "step": 758600 }, { "epoch": 3.0907770633951497, "grad_norm": 5.072365760803223, "learning_rate": 0.002759054761108593, "loss": 7.6787, "step": 758700 }, { "epoch": 3.091184441418531, "grad_norm": 2.202188014984131, "learning_rate": 0.0027585612413783887, "loss": 7.7212, "step": 758800 }, { "epoch": 3.0915918194419123, "grad_norm": 4.132675647735596, "learning_rate": 0.0027580677116235534, "loss": 7.6815, "step": 758900 }, { "epoch": 3.091999197465294, "grad_norm": 4.466198921203613, "learning_rate": 0.0027575741718635963, "loss": 7.706, "step": 759000 }, { "epoch": 3.091999197465294, "eval_MaskedAccuracy": 0.5057517451551324, "eval_loss": 1.625289797782898, "eval_runtime": 150.9548, "eval_samples_per_second": 420.497, "eval_steps_per_second": 1.643, "step": 759000 }, { "epoch": 3.0924065754886754, "grad_norm": 8.609400749206543, "learning_rate": 0.002757080622118032, "loss": 7.7104, "step": 759100 }, { "epoch": 3.092813953512057, "grad_norm": 3.13914155960083, "learning_rate": 0.002756587062406375, "loss": 7.7002, "step": 759200 }, { "epoch": 3.0932213315354384, "grad_norm": 5.4335432052612305, "learning_rate": 0.002756093492748131, "loss": 7.7047, "step": 759300 }, { "epoch": 3.09362870955882, "grad_norm": 8.296807289123535, "learning_rate": 0.002755599913162822, "loss": 7.7104, "step": 759400 }, { "epoch": 3.094036087582201, "grad_norm": 6.492187976837158, "learning_rate": 0.002755106323669959, "loss": 7.707, "step": 759500 }, { "epoch": 3.0944434656055826, "grad_norm": 3.22277569770813, "learning_rate": 0.0027546127242890554, "loss": 7.6715, "step": 759600 }, { "epoch": 3.094850843628964, "grad_norm": 7.480574131011963, "learning_rate": 0.002754119115039624, "loss": 7.6811, "step": 759700 }, { "epoch": 3.0952582216523457, "grad_norm": 3.214850664138794, "learning_rate": 0.0027536254959411826, "loss": 7.7066, "step": 759800 }, { "epoch": 3.095665599675727, "grad_norm": 4.112349510192871, "learning_rate": 0.002753131867013247, "loss": 7.699, "step": 759900 }, { "epoch": 3.0960729776991087, "grad_norm": 7.638454437255859, "learning_rate": 0.0027526382282753274, "loss": 7.7022, "step": 760000 }, { "epoch": 3.0960729776991087, "eval_MaskedAccuracy": 0.5060889775801105, "eval_loss": 1.6206738948822021, "eval_runtime": 159.5708, "eval_samples_per_second": 397.792, "eval_steps_per_second": 1.554, "step": 760000 }, { "epoch": 3.09648035572249, "grad_norm": 4.234745502471924, "learning_rate": 0.002752144579746949, "loss": 7.6631, "step": 760100 }, { "epoch": 3.0968877337458713, "grad_norm": 4.058956623077393, "learning_rate": 0.002751650921447622, "loss": 7.6858, "step": 760200 }, { "epoch": 3.097295111769253, "grad_norm": 2.646798610687256, "learning_rate": 0.0027511572533968623, "loss": 7.6926, "step": 760300 }, { "epoch": 3.0977024897926344, "grad_norm": 6.737486839294434, "learning_rate": 0.002750663575614189, "loss": 7.7034, "step": 760400 }, { "epoch": 3.098109867816016, "grad_norm": 5.023601531982422, "learning_rate": 0.0027501698881191227, "loss": 7.6799, "step": 760500 }, { "epoch": 3.0985172458393975, "grad_norm": 5.718828201293945, "learning_rate": 0.0027496761909311792, "loss": 7.6983, "step": 760600 }, { "epoch": 3.0989246238627786, "grad_norm": 7.084863185882568, "learning_rate": 0.0027491824840698794, "loss": 7.654, "step": 760700 }, { "epoch": 3.09933200188616, "grad_norm": 7.82147741317749, "learning_rate": 0.002748688767554738, "loss": 7.6854, "step": 760800 }, { "epoch": 3.0997393799095416, "grad_norm": 6.596715927124023, "learning_rate": 0.0027481950414052743, "loss": 7.6855, "step": 760900 }, { "epoch": 3.100146757932923, "grad_norm": 8.626282691955566, "learning_rate": 0.002747701305641013, "loss": 7.726, "step": 761000 }, { "epoch": 3.100146757932923, "eval_MaskedAccuracy": 0.5060363108688771, "eval_loss": 1.615813136100769, "eval_runtime": 157.5171, "eval_samples_per_second": 402.978, "eval_steps_per_second": 1.574, "step": 761000 }, { "epoch": 3.1005541359563047, "grad_norm": 3.1481432914733887, "learning_rate": 0.002747207560281467, "loss": 7.6813, "step": 761100 }, { "epoch": 3.1009615139796862, "grad_norm": 2.801621675491333, "learning_rate": 0.002746713805346164, "loss": 7.7187, "step": 761200 }, { "epoch": 3.1013688920030678, "grad_norm": 10.537528038024902, "learning_rate": 0.0027462200408546185, "loss": 7.729, "step": 761300 }, { "epoch": 3.101776270026449, "grad_norm": 3.829329013824463, "learning_rate": 0.002745726266826354, "loss": 7.6956, "step": 761400 }, { "epoch": 3.1021836480498304, "grad_norm": 5.185670375823975, "learning_rate": 0.002745232483280894, "loss": 7.6825, "step": 761500 }, { "epoch": 3.102591026073212, "grad_norm": 3.894953727722168, "learning_rate": 0.0027447386902377615, "loss": 7.6839, "step": 761600 }, { "epoch": 3.1029984040965934, "grad_norm": 4.861565589904785, "learning_rate": 0.002744244887716477, "loss": 7.6802, "step": 761700 }, { "epoch": 3.103405782119975, "grad_norm": 7.542404651641846, "learning_rate": 0.002743751075736563, "loss": 7.6918, "step": 761800 }, { "epoch": 3.1038131601433565, "grad_norm": 5.748804092407227, "learning_rate": 0.0027432572543175425, "loss": 7.6957, "step": 761900 }, { "epoch": 3.1042205381667376, "grad_norm": 5.699087142944336, "learning_rate": 0.0027427634234789377, "loss": 7.7039, "step": 762000 }, { "epoch": 3.1042205381667376, "eval_MaskedAccuracy": 0.5066195956401219, "eval_loss": 1.618896484375, "eval_runtime": 151.8099, "eval_samples_per_second": 418.128, "eval_steps_per_second": 1.634, "step": 762000 }, { "epoch": 3.104627916190119, "grad_norm": 4.970003604888916, "learning_rate": 0.002742269583240274, "loss": 7.6822, "step": 762100 }, { "epoch": 3.1050352942135007, "grad_norm": 10.687162399291992, "learning_rate": 0.0027417757336210733, "loss": 7.6709, "step": 762200 }, { "epoch": 3.105442672236882, "grad_norm": 6.89406681060791, "learning_rate": 0.0027412818746408592, "loss": 7.6976, "step": 762300 }, { "epoch": 3.1058500502602637, "grad_norm": 5.004439353942871, "learning_rate": 0.002740788006319161, "loss": 7.6877, "step": 762400 }, { "epoch": 3.1062574282836453, "grad_norm": 7.998891830444336, "learning_rate": 0.002740294128675499, "loss": 7.647, "step": 762500 }, { "epoch": 3.1066648063070263, "grad_norm": 6.7538161277771, "learning_rate": 0.002739800241729404, "loss": 7.6801, "step": 762600 }, { "epoch": 3.107072184330408, "grad_norm": 8.487037658691406, "learning_rate": 0.002739306345500401, "loss": 7.7004, "step": 762700 }, { "epoch": 3.1074795623537894, "grad_norm": 3.9993841648101807, "learning_rate": 0.0027388124400080137, "loss": 7.6975, "step": 762800 }, { "epoch": 3.107886940377171, "grad_norm": 4.43471097946167, "learning_rate": 0.0027383185252717714, "loss": 7.711, "step": 762900 }, { "epoch": 3.1082943184005525, "grad_norm": 7.731508731842041, "learning_rate": 0.0027378246013111964, "loss": 7.6932, "step": 763000 }, { "epoch": 3.1082943184005525, "eval_MaskedAccuracy": 0.5056539368572125, "eval_loss": 1.6123262643814087, "eval_runtime": 149.35, "eval_samples_per_second": 425.015, "eval_steps_per_second": 1.661, "step": 763000 }, { "epoch": 3.108701696423934, "grad_norm": 9.151464462280273, "learning_rate": 0.00273733066814582, "loss": 7.6852, "step": 763100 }, { "epoch": 3.109109074447315, "grad_norm": 3.820744752883911, "learning_rate": 0.0027368367257951686, "loss": 7.7124, "step": 763200 }, { "epoch": 3.1095164524706966, "grad_norm": 4.586390018463135, "learning_rate": 0.0027363427742787745, "loss": 7.6838, "step": 763300 }, { "epoch": 3.109923830494078, "grad_norm": 6.306432247161865, "learning_rate": 0.002735848813616161, "loss": 7.7275, "step": 763400 }, { "epoch": 3.1103312085174597, "grad_norm": 2.7275564670562744, "learning_rate": 0.0027353548438268597, "loss": 7.7084, "step": 763500 }, { "epoch": 3.1107385865408412, "grad_norm": 4.9357991218566895, "learning_rate": 0.002734860864930402, "loss": 7.7172, "step": 763600 }, { "epoch": 3.1111459645642228, "grad_norm": 12.564998626708984, "learning_rate": 0.002734366876946315, "loss": 7.7076, "step": 763700 }, { "epoch": 3.1115533425876043, "grad_norm": 5.375897407531738, "learning_rate": 0.002733872879894126, "loss": 7.6727, "step": 763800 }, { "epoch": 3.1119607206109854, "grad_norm": 3.797677755355835, "learning_rate": 0.0027333788737933682, "loss": 7.6872, "step": 763900 }, { "epoch": 3.112368098634367, "grad_norm": 5.234559059143066, "learning_rate": 0.0027328848586635744, "loss": 7.6786, "step": 764000 }, { "epoch": 3.112368098634367, "eval_MaskedAccuracy": 0.5064838590018905, "eval_loss": 1.6195224523544312, "eval_runtime": 148.4772, "eval_samples_per_second": 427.514, "eval_steps_per_second": 1.67, "step": 764000 }, { "epoch": 3.1127754766577485, "grad_norm": 5.013713359832764, "learning_rate": 0.002732390834524275, "loss": 7.7024, "step": 764100 }, { "epoch": 3.11318285468113, "grad_norm": 6.045117378234863, "learning_rate": 0.002731896801394997, "loss": 7.7021, "step": 764200 }, { "epoch": 3.1135902327045115, "grad_norm": 3.4436819553375244, "learning_rate": 0.0027314027592952733, "loss": 7.7039, "step": 764300 }, { "epoch": 3.113997610727893, "grad_norm": 6.984184741973877, "learning_rate": 0.00273090870824464, "loss": 7.6958, "step": 764400 }, { "epoch": 3.114404988751274, "grad_norm": 2.6970698833465576, "learning_rate": 0.0027304146482626263, "loss": 7.6822, "step": 764500 }, { "epoch": 3.1148123667746557, "grad_norm": 8.229776382446289, "learning_rate": 0.002729920579368766, "loss": 7.7039, "step": 764600 }, { "epoch": 3.115219744798037, "grad_norm": 5.920711994171143, "learning_rate": 0.0027294265015825932, "loss": 7.698, "step": 764700 }, { "epoch": 3.1156271228214187, "grad_norm": 3.3548264503479004, "learning_rate": 0.0027289324149236405, "loss": 7.6951, "step": 764800 }, { "epoch": 3.1160345008448003, "grad_norm": 2.312197685241699, "learning_rate": 0.0027284383194114427, "loss": 7.6999, "step": 764900 }, { "epoch": 3.116441878868182, "grad_norm": 3.858964681625366, "learning_rate": 0.0027279442150655296, "loss": 7.6669, "step": 765000 }, { "epoch": 3.116441878868182, "eval_MaskedAccuracy": 0.5051968226329966, "eval_loss": 1.6254647970199585, "eval_runtime": 150.8261, "eval_samples_per_second": 420.856, "eval_steps_per_second": 1.644, "step": 765000 }, { "epoch": 3.116849256891563, "grad_norm": 11.507035255432129, "learning_rate": 0.0027274501019054413, "loss": 7.7111, "step": 765100 }, { "epoch": 3.1172566349149444, "grad_norm": 3.928722858428955, "learning_rate": 0.00272695597995071, "loss": 7.6865, "step": 765200 }, { "epoch": 3.117664012938326, "grad_norm": 4.361135005950928, "learning_rate": 0.002726461849220871, "loss": 7.6899, "step": 765300 }, { "epoch": 3.1180713909617075, "grad_norm": 7.6984357833862305, "learning_rate": 0.002725967709735463, "loss": 7.7138, "step": 765400 }, { "epoch": 3.118478768985089, "grad_norm": 4.049012660980225, "learning_rate": 0.002725473561514019, "loss": 7.6653, "step": 765500 }, { "epoch": 3.1188861470084706, "grad_norm": 5.440489292144775, "learning_rate": 0.0027249794045760764, "loss": 7.6729, "step": 765600 }, { "epoch": 3.1192935250318516, "grad_norm": 6.268097400665283, "learning_rate": 0.0027244852389411707, "loss": 7.7095, "step": 765700 }, { "epoch": 3.119700903055233, "grad_norm": 5.593329429626465, "learning_rate": 0.0027239910646288373, "loss": 7.7053, "step": 765800 }, { "epoch": 3.1201082810786147, "grad_norm": 5.340170383453369, "learning_rate": 0.0027234968816586174, "loss": 7.6662, "step": 765900 }, { "epoch": 3.1205156591019962, "grad_norm": 4.175409317016602, "learning_rate": 0.002723002690050049, "loss": 7.7263, "step": 766000 }, { "epoch": 3.1205156591019962, "eval_MaskedAccuracy": 0.5056226058133293, "eval_loss": 1.6213122606277466, "eval_runtime": 151.8872, "eval_samples_per_second": 417.915, "eval_steps_per_second": 1.633, "step": 766000 }, { "epoch": 3.1209230371253778, "grad_norm": 10.50373363494873, "learning_rate": 0.0027225084898226683, "loss": 7.7059, "step": 766100 }, { "epoch": 3.1213304151487593, "grad_norm": 3.3283989429473877, "learning_rate": 0.0027220142809960125, "loss": 7.6871, "step": 766200 }, { "epoch": 3.121737793172141, "grad_norm": 6.762508392333984, "learning_rate": 0.002721520063589619, "loss": 7.6919, "step": 766300 }, { "epoch": 3.122145171195522, "grad_norm": 4.057222843170166, "learning_rate": 0.0027210258376230314, "loss": 7.7004, "step": 766400 }, { "epoch": 3.1225525492189035, "grad_norm": 5.02310848236084, "learning_rate": 0.0027205316031157895, "loss": 7.6696, "step": 766500 }, { "epoch": 3.122959927242285, "grad_norm": 4.003963947296143, "learning_rate": 0.0027200373600874284, "loss": 7.7151, "step": 766600 }, { "epoch": 3.1233673052656665, "grad_norm": 4.325287818908691, "learning_rate": 0.00271954310855749, "loss": 7.6745, "step": 766700 }, { "epoch": 3.123774683289048, "grad_norm": 3.2648262977600098, "learning_rate": 0.0027190488485455146, "loss": 7.714, "step": 766800 }, { "epoch": 3.1241820613124296, "grad_norm": 7.260473728179932, "learning_rate": 0.0027185545800710443, "loss": 7.7021, "step": 766900 }, { "epoch": 3.1245894393358107, "grad_norm": 3.1286821365356445, "learning_rate": 0.002718060303153616, "loss": 7.6875, "step": 767000 }, { "epoch": 3.1245894393358107, "eval_MaskedAccuracy": 0.5059939406712075, "eval_loss": 1.6196115016937256, "eval_runtime": 155.225, "eval_samples_per_second": 408.929, "eval_steps_per_second": 1.598, "step": 767000 }, { "epoch": 3.124996817359192, "grad_norm": 9.294156074523926, "learning_rate": 0.0027175660178127767, "loss": 7.7038, "step": 767100 }, { "epoch": 3.1254041953825737, "grad_norm": 6.32208776473999, "learning_rate": 0.002717071724068067, "loss": 7.6961, "step": 767200 }, { "epoch": 3.1258115734059553, "grad_norm": 3.045856237411499, "learning_rate": 0.002716577421939028, "loss": 7.6998, "step": 767300 }, { "epoch": 3.126218951429337, "grad_norm": 3.8133158683776855, "learning_rate": 0.0027160831114452, "loss": 7.6928, "step": 767400 }, { "epoch": 3.1266263294527183, "grad_norm": 3.6581594944000244, "learning_rate": 0.002715588792606129, "loss": 7.6863, "step": 767500 }, { "epoch": 3.1270337074760994, "grad_norm": 5.351816654205322, "learning_rate": 0.0027150944654413573, "loss": 7.6866, "step": 767600 }, { "epoch": 3.127441085499481, "grad_norm": 6.225848197937012, "learning_rate": 0.0027146001299704265, "loss": 7.7126, "step": 767700 }, { "epoch": 3.1278484635228625, "grad_norm": 3.376725912094116, "learning_rate": 0.002714105786212884, "loss": 7.7209, "step": 767800 }, { "epoch": 3.128255841546244, "grad_norm": 2.856078863143921, "learning_rate": 0.002713611434188269, "loss": 7.7319, "step": 767900 }, { "epoch": 3.1286632195696256, "grad_norm": 3.4593234062194824, "learning_rate": 0.0027131170739161264, "loss": 7.6623, "step": 768000 }, { "epoch": 3.1286632195696256, "eval_MaskedAccuracy": 0.5066317843810505, "eval_loss": 1.6134639978408813, "eval_runtime": 353.3584, "eval_samples_per_second": 179.636, "eval_steps_per_second": 0.702, "step": 768000 }, { "epoch": 3.129070597593007, "grad_norm": 4.538331031799316, "learning_rate": 0.0027126227054160056, "loss": 7.7282, "step": 768100 }, { "epoch": 3.129477975616388, "grad_norm": 3.409496784210205, "learning_rate": 0.00271212832870745, "loss": 7.7104, "step": 768200 }, { "epoch": 3.1298853536397697, "grad_norm": 2.624006509780884, "learning_rate": 0.0027116339438100017, "loss": 7.6978, "step": 768300 }, { "epoch": 3.1302927316631513, "grad_norm": 5.233662128448486, "learning_rate": 0.002711139550743207, "loss": 7.6748, "step": 768400 }, { "epoch": 3.130700109686533, "grad_norm": 4.042091369628906, "learning_rate": 0.002710645149526616, "loss": 7.6943, "step": 768500 }, { "epoch": 3.1311074877099143, "grad_norm": 4.5074143409729, "learning_rate": 0.002710150740179772, "loss": 7.6773, "step": 768600 }, { "epoch": 3.131514865733296, "grad_norm": 3.0886244773864746, "learning_rate": 0.00270965632272222, "loss": 7.7103, "step": 768700 }, { "epoch": 3.1319222437566774, "grad_norm": 8.190417289733887, "learning_rate": 0.0027091618971735124, "loss": 7.6885, "step": 768800 }, { "epoch": 3.1323296217800585, "grad_norm": 5.727116584777832, "learning_rate": 0.0027086674635531933, "loss": 7.6754, "step": 768900 }, { "epoch": 3.13273699980344, "grad_norm": 7.181107521057129, "learning_rate": 0.0027081730218808096, "loss": 7.6879, "step": 769000 }, { "epoch": 3.13273699980344, "eval_MaskedAccuracy": 0.5056934334928747, "eval_loss": 1.6126961708068848, "eval_runtime": 157.6959, "eval_samples_per_second": 402.521, "eval_steps_per_second": 1.573, "step": 769000 }, { "epoch": 3.1331443778268215, "grad_norm": 4.570808410644531, "learning_rate": 0.0027076785721759067, "loss": 7.6934, "step": 769100 }, { "epoch": 3.133551755850203, "grad_norm": 3.8933639526367188, "learning_rate": 0.0027071841144580377, "loss": 7.6887, "step": 769200 }, { "epoch": 3.1339591338735846, "grad_norm": 6.667581081390381, "learning_rate": 0.0027066896487467486, "loss": 7.6811, "step": 769300 }, { "epoch": 3.134366511896966, "grad_norm": 3.5448014736175537, "learning_rate": 0.00270619517506159, "loss": 7.7026, "step": 769400 }, { "epoch": 3.1347738899203472, "grad_norm": 4.684604644775391, "learning_rate": 0.002705700693422108, "loss": 7.6957, "step": 769500 }, { "epoch": 3.1351812679437288, "grad_norm": 4.270644187927246, "learning_rate": 0.002705206203847856, "loss": 7.7039, "step": 769600 }, { "epoch": 3.1355886459671103, "grad_norm": 5.729297161102295, "learning_rate": 0.002704711706358382, "loss": 7.7222, "step": 769700 }, { "epoch": 3.135996023990492, "grad_norm": 4.039698123931885, "learning_rate": 0.0027042172009732347, "loss": 7.7137, "step": 769800 }, { "epoch": 3.1364034020138734, "grad_norm": 4.518555641174316, "learning_rate": 0.002703722687711965, "loss": 7.7264, "step": 769900 }, { "epoch": 3.136810780037255, "grad_norm": 4.014138698577881, "learning_rate": 0.002703228166594126, "loss": 7.682, "step": 770000 }, { "epoch": 3.136810780037255, "eval_MaskedAccuracy": 0.5060256769241583, "eval_loss": 1.619013786315918, "eval_runtime": 154.8899, "eval_samples_per_second": 409.814, "eval_steps_per_second": 1.601, "step": 770000 }, { "epoch": 3.137218158060636, "grad_norm": 5.116852283477783, "learning_rate": 0.0027027336376392696, "loss": 7.6818, "step": 770100 }, { "epoch": 3.1376255360840175, "grad_norm": 3.505202293395996, "learning_rate": 0.002702239100866942, "loss": 7.682, "step": 770200 }, { "epoch": 3.138032914107399, "grad_norm": 3.6708834171295166, "learning_rate": 0.002701744556296698, "loss": 7.6966, "step": 770300 }, { "epoch": 3.1384402921307806, "grad_norm": 4.903164386749268, "learning_rate": 0.002701250003948092, "loss": 7.7207, "step": 770400 }, { "epoch": 3.138847670154162, "grad_norm": 3.7474172115325928, "learning_rate": 0.0027007554438406704, "loss": 7.6756, "step": 770500 }, { "epoch": 3.1392550481775436, "grad_norm": 3.6070735454559326, "learning_rate": 0.0027002608759939912, "loss": 7.6811, "step": 770600 }, { "epoch": 3.1396624262009247, "grad_norm": 2.534743547439575, "learning_rate": 0.002699766300427606, "loss": 7.6946, "step": 770700 }, { "epoch": 3.1400698042243063, "grad_norm": 6.393507480621338, "learning_rate": 0.002699271717161066, "loss": 7.7245, "step": 770800 }, { "epoch": 3.140477182247688, "grad_norm": 6.294739723205566, "learning_rate": 0.0026987771262139255, "loss": 7.6895, "step": 770900 }, { "epoch": 3.1408845602710693, "grad_norm": 3.527824640274048, "learning_rate": 0.0026982825276057397, "loss": 7.68, "step": 771000 }, { "epoch": 3.1408845602710693, "eval_MaskedAccuracy": 0.5066774540976102, "eval_loss": 1.6144894361495972, "eval_runtime": 152.2406, "eval_samples_per_second": 416.945, "eval_steps_per_second": 1.629, "step": 771000 }, { "epoch": 3.141291938294451, "grad_norm": 6.735122203826904, "learning_rate": 0.0026977879213560636, "loss": 7.7008, "step": 771100 }, { "epoch": 3.1416993163178324, "grad_norm": 4.2013115882873535, "learning_rate": 0.00269729330748445, "loss": 7.7151, "step": 771200 }, { "epoch": 3.142106694341214, "grad_norm": 9.52634334564209, "learning_rate": 0.0026967986860104514, "loss": 7.6884, "step": 771300 }, { "epoch": 3.142514072364595, "grad_norm": 7.722685813903809, "learning_rate": 0.0026963040569536246, "loss": 7.6874, "step": 771400 }, { "epoch": 3.1429214503879765, "grad_norm": 3.3890602588653564, "learning_rate": 0.002695809420333526, "loss": 7.6941, "step": 771500 }, { "epoch": 3.143328828411358, "grad_norm": 6.16286563873291, "learning_rate": 0.0026953147761697113, "loss": 7.7346, "step": 771600 }, { "epoch": 3.1437362064347396, "grad_norm": 4.1274590492248535, "learning_rate": 0.002694820124481733, "loss": 7.7027, "step": 771700 }, { "epoch": 3.144143584458121, "grad_norm": 7.944971561431885, "learning_rate": 0.002694325465289152, "loss": 7.7287, "step": 771800 }, { "epoch": 3.1445509624815027, "grad_norm": 4.1289873123168945, "learning_rate": 0.00269383079861152, "loss": 7.7105, "step": 771900 }, { "epoch": 3.1449583405048838, "grad_norm": 4.869955539703369, "learning_rate": 0.0026933361244684028, "loss": 7.6976, "step": 772000 }, { "epoch": 3.1449583405048838, "eval_MaskedAccuracy": 0.5060203453393523, "eval_loss": 1.620253324508667, "eval_runtime": 149.7289, "eval_samples_per_second": 423.939, "eval_steps_per_second": 1.656, "step": 772000 }, { "epoch": 3.1453657185282653, "grad_norm": 4.264239311218262, "learning_rate": 0.0026928414428793474, "loss": 7.6762, "step": 772100 }, { "epoch": 3.145773096551647, "grad_norm": 7.852622985839844, "learning_rate": 0.002692346753863915, "loss": 7.6972, "step": 772200 }, { "epoch": 3.1461804745750284, "grad_norm": 10.748968124389648, "learning_rate": 0.0026918520574416615, "loss": 7.6776, "step": 772300 }, { "epoch": 3.14658785259841, "grad_norm": 7.000277042388916, "learning_rate": 0.002691357353632149, "loss": 7.7146, "step": 772400 }, { "epoch": 3.1469952306217914, "grad_norm": 8.364676475524902, "learning_rate": 0.0026908626424549365, "loss": 7.7151, "step": 772500 }, { "epoch": 3.1474026086451725, "grad_norm": 9.26862621307373, "learning_rate": 0.0026903679239295773, "loss": 7.6791, "step": 772600 }, { "epoch": 3.147809986668554, "grad_norm": 10.48083782196045, "learning_rate": 0.0026898731980756316, "loss": 7.6928, "step": 772700 }, { "epoch": 3.1482173646919356, "grad_norm": 4.311800479888916, "learning_rate": 0.002689378464912659, "loss": 7.7025, "step": 772800 }, { "epoch": 3.148624742715317, "grad_norm": 12.309150695800781, "learning_rate": 0.002688883724460219, "loss": 7.6861, "step": 772900 }, { "epoch": 3.1490321207386986, "grad_norm": 7.959762096405029, "learning_rate": 0.0026883889767378728, "loss": 7.6916, "step": 773000 }, { "epoch": 3.1490321207386986, "eval_MaskedAccuracy": 0.5059848410346622, "eval_loss": 1.614486575126648, "eval_runtime": 153.1919, "eval_samples_per_second": 414.356, "eval_steps_per_second": 1.619, "step": 773000 }, { "epoch": 3.14943949876208, "grad_norm": 8.189927101135254, "learning_rate": 0.0026878942217651793, "loss": 7.6946, "step": 773100 }, { "epoch": 3.1498468767854613, "grad_norm": 3.512836456298828, "learning_rate": 0.002687399459561704, "loss": 7.6772, "step": 773200 }, { "epoch": 3.150254254808843, "grad_norm": 4.427940845489502, "learning_rate": 0.0026869046901469977, "loss": 7.7165, "step": 773300 }, { "epoch": 3.1506616328322243, "grad_norm": 13.9074125289917, "learning_rate": 0.002686409913540624, "loss": 7.6802, "step": 773400 }, { "epoch": 3.151069010855606, "grad_norm": 4.601502895355225, "learning_rate": 0.0026859151297621477, "loss": 7.6944, "step": 773500 }, { "epoch": 3.1514763888789874, "grad_norm": 4.3032732009887695, "learning_rate": 0.00268542033883113, "loss": 7.7127, "step": 773600 }, { "epoch": 3.151883766902369, "grad_norm": 11.035786628723145, "learning_rate": 0.0026849255407671332, "loss": 7.7172, "step": 773700 }, { "epoch": 3.1522911449257505, "grad_norm": 5.978248596191406, "learning_rate": 0.0026844307355897152, "loss": 7.6879, "step": 773800 }, { "epoch": 3.1526985229491316, "grad_norm": 3.032963514328003, "learning_rate": 0.0026839359233184405, "loss": 7.6867, "step": 773900 }, { "epoch": 3.153105900972513, "grad_norm": 11.734149932861328, "learning_rate": 0.002683441103972874, "loss": 7.7394, "step": 774000 }, { "epoch": 3.153105900972513, "eval_MaskedAccuracy": 0.5058565238362183, "eval_loss": 1.614915132522583, "eval_runtime": 148.9732, "eval_samples_per_second": 426.09, "eval_steps_per_second": 1.665, "step": 774000 }, { "epoch": 3.1535132789958946, "grad_norm": 3.2387919425964355, "learning_rate": 0.0026829462775725737, "loss": 7.6518, "step": 774100 }, { "epoch": 3.153920657019276, "grad_norm": 11.412351608276367, "learning_rate": 0.0026824514441371084, "loss": 7.6718, "step": 774200 }, { "epoch": 3.1543280350426577, "grad_norm": 2.723578929901123, "learning_rate": 0.0026819566036860366, "loss": 7.698, "step": 774300 }, { "epoch": 3.154735413066039, "grad_norm": 4.925586700439453, "learning_rate": 0.002681461756238926, "loss": 7.6967, "step": 774400 }, { "epoch": 3.1551427910894203, "grad_norm": 3.7491700649261475, "learning_rate": 0.0026809669018153385, "loss": 7.6531, "step": 774500 }, { "epoch": 3.155550169112802, "grad_norm": 2.9871318340301514, "learning_rate": 0.002680472040434838, "loss": 7.704, "step": 774600 }, { "epoch": 3.1559575471361834, "grad_norm": 2.868842840194702, "learning_rate": 0.002679977172116985, "loss": 7.7028, "step": 774700 }, { "epoch": 3.156364925159565, "grad_norm": 4.306030750274658, "learning_rate": 0.002679482296881351, "loss": 7.6876, "step": 774800 }, { "epoch": 3.1567723031829464, "grad_norm": 9.307113647460938, "learning_rate": 0.002678987414747501, "loss": 7.6928, "step": 774900 }, { "epoch": 3.157179681206328, "grad_norm": 4.642914772033691, "learning_rate": 0.0026784925257349944, "loss": 7.7102, "step": 775000 }, { "epoch": 3.157179681206328, "eval_MaskedAccuracy": 0.5066560003822872, "eval_loss": 1.6202186346054077, "eval_runtime": 149.5397, "eval_samples_per_second": 424.476, "eval_steps_per_second": 1.658, "step": 775000 }, { "epoch": 3.157587059229709, "grad_norm": 8.615238189697266, "learning_rate": 0.0026779976298634024, "loss": 7.6996, "step": 775100 }, { "epoch": 3.1579944372530906, "grad_norm": 4.410147190093994, "learning_rate": 0.0026775027271522906, "loss": 7.7021, "step": 775200 }, { "epoch": 3.158401815276472, "grad_norm": 8.252384185791016, "learning_rate": 0.0026770078176212193, "loss": 7.6904, "step": 775300 }, { "epoch": 3.1588091932998537, "grad_norm": 3.639887571334839, "learning_rate": 0.0026765129012897592, "loss": 7.6941, "step": 775400 }, { "epoch": 3.159216571323235, "grad_norm": 3.2815327644348145, "learning_rate": 0.0026760179781774736, "loss": 7.7047, "step": 775500 }, { "epoch": 3.1596239493466167, "grad_norm": 7.7857818603515625, "learning_rate": 0.002675523048303935, "loss": 7.6927, "step": 775600 }, { "epoch": 3.160031327369998, "grad_norm": 5.169656753540039, "learning_rate": 0.0026750281116887085, "loss": 7.6722, "step": 775700 }, { "epoch": 3.1604387053933793, "grad_norm": 3.543030023574829, "learning_rate": 0.0026745331683513626, "loss": 7.7149, "step": 775800 }, { "epoch": 3.160846083416761, "grad_norm": 4.759836673736572, "learning_rate": 0.0026740382183114615, "loss": 7.7005, "step": 775900 }, { "epoch": 3.1612534614401424, "grad_norm": 4.95238733291626, "learning_rate": 0.0026735432615885776, "loss": 7.6842, "step": 776000 }, { "epoch": 3.1612534614401424, "eval_MaskedAccuracy": 0.5044781518966327, "eval_loss": 1.6273850202560425, "eval_runtime": 149.6541, "eval_samples_per_second": 424.151, "eval_steps_per_second": 1.657, "step": 776000 }, { "epoch": 3.161660839463524, "grad_norm": 7.366076946258545, "learning_rate": 0.002673048298202278, "loss": 7.6985, "step": 776100 }, { "epoch": 3.1620682174869055, "grad_norm": 3.573741912841797, "learning_rate": 0.002672553328172129, "loss": 7.692, "step": 776200 }, { "epoch": 3.162475595510287, "grad_norm": 6.183897972106934, "learning_rate": 0.0026720583515177014, "loss": 7.6938, "step": 776300 }, { "epoch": 3.162882973533668, "grad_norm": 5.623685359954834, "learning_rate": 0.002671563368258564, "loss": 7.6706, "step": 776400 }, { "epoch": 3.1632903515570496, "grad_norm": 3.4732744693756104, "learning_rate": 0.002671068378414285, "loss": 7.6934, "step": 776500 }, { "epoch": 3.163697729580431, "grad_norm": 3.870776891708374, "learning_rate": 0.002670573382004436, "loss": 7.6801, "step": 776600 }, { "epoch": 3.1641051076038127, "grad_norm": 9.050191879272461, "learning_rate": 0.002670078379048585, "loss": 7.7044, "step": 776700 }, { "epoch": 3.1645124856271942, "grad_norm": 5.239398956298828, "learning_rate": 0.0026695833695663043, "loss": 7.6659, "step": 776800 }, { "epoch": 3.1649198636505758, "grad_norm": 3.975712299346924, "learning_rate": 0.0026690883535771637, "loss": 7.7076, "step": 776900 }, { "epoch": 3.165327241673957, "grad_norm": 4.15570068359375, "learning_rate": 0.0026685933311007314, "loss": 7.6763, "step": 777000 }, { "epoch": 3.165327241673957, "eval_MaskedAccuracy": 0.5066619214389548, "eval_loss": 1.6140919923782349, "eval_runtime": 160.6162, "eval_samples_per_second": 395.203, "eval_steps_per_second": 1.544, "step": 777000 }, { "epoch": 3.1657346196973384, "grad_norm": 5.735141754150391, "learning_rate": 0.0026680983021565804, "loss": 7.7132, "step": 777100 }, { "epoch": 3.16614199772072, "grad_norm": 3.445868730545044, "learning_rate": 0.0026676032667642792, "loss": 7.6667, "step": 777200 }, { "epoch": 3.1665493757441014, "grad_norm": 3.7501468658447266, "learning_rate": 0.0026671082249434025, "loss": 7.6929, "step": 777300 }, { "epoch": 3.166956753767483, "grad_norm": 8.324304580688477, "learning_rate": 0.0026666131767135214, "loss": 7.6802, "step": 777400 }, { "epoch": 3.1673641317908645, "grad_norm": 5.689042568206787, "learning_rate": 0.0026661181220942053, "loss": 7.7014, "step": 777500 }, { "epoch": 3.1677715098142456, "grad_norm": 6.338901996612549, "learning_rate": 0.00266562306110503, "loss": 7.6994, "step": 777600 }, { "epoch": 3.168178887837627, "grad_norm": 7.056424617767334, "learning_rate": 0.0026651279937655644, "loss": 7.6646, "step": 777700 }, { "epoch": 3.1685862658610087, "grad_norm": 6.031956672668457, "learning_rate": 0.002664632920095385, "loss": 7.6769, "step": 777800 }, { "epoch": 3.16899364388439, "grad_norm": 3.548583984375, "learning_rate": 0.0026641378401140612, "loss": 7.6747, "step": 777900 }, { "epoch": 3.1694010219077717, "grad_norm": 3.889172315597534, "learning_rate": 0.0026636427538411643, "loss": 7.6888, "step": 778000 }, { "epoch": 3.1694010219077717, "eval_MaskedAccuracy": 0.5056879036485077, "eval_loss": 1.6238088607788086, "eval_runtime": 154.305, "eval_samples_per_second": 411.367, "eval_steps_per_second": 1.607, "step": 778000 }, { "epoch": 3.1698083999311533, "grad_norm": 6.30963134765625, "learning_rate": 0.0026631476612962745, "loss": 7.6877, "step": 778100 }, { "epoch": 3.1702157779545344, "grad_norm": 2.9132330417633057, "learning_rate": 0.0026626525624989616, "loss": 7.69, "step": 778200 }, { "epoch": 3.170623155977916, "grad_norm": 5.352282524108887, "learning_rate": 0.0026621574574687986, "loss": 7.6722, "step": 778300 }, { "epoch": 3.1710305340012974, "grad_norm": 5.92992639541626, "learning_rate": 0.0026616623462253626, "loss": 7.6795, "step": 778400 }, { "epoch": 3.171437912024679, "grad_norm": 5.393660545349121, "learning_rate": 0.0026611672287882235, "loss": 7.6734, "step": 778500 }, { "epoch": 3.1718452900480605, "grad_norm": 5.707131385803223, "learning_rate": 0.002660672105176957, "loss": 7.7077, "step": 778600 }, { "epoch": 3.172252668071442, "grad_norm": 3.5185000896453857, "learning_rate": 0.0026601769754111392, "loss": 7.6781, "step": 778700 }, { "epoch": 3.1726600460948235, "grad_norm": 2.8914966583251953, "learning_rate": 0.0026596818395103468, "loss": 7.7267, "step": 778800 }, { "epoch": 3.1730674241182046, "grad_norm": 2.5010547637939453, "learning_rate": 0.0026591866974941558, "loss": 7.6956, "step": 778900 }, { "epoch": 3.173474802141586, "grad_norm": 4.04721212387085, "learning_rate": 0.00265869154938214, "loss": 7.6819, "step": 779000 }, { "epoch": 3.173474802141586, "eval_MaskedAccuracy": 0.5064465607655882, "eval_loss": 1.6194339990615845, "eval_runtime": 150.4166, "eval_samples_per_second": 422.001, "eval_steps_per_second": 1.649, "step": 779000 }, { "epoch": 3.1738821801649677, "grad_norm": 8.249896049499512, "learning_rate": 0.002658196395193876, "loss": 7.6997, "step": 779100 }, { "epoch": 3.1742895581883492, "grad_norm": 10.266298294067383, "learning_rate": 0.002657701234948938, "loss": 7.7037, "step": 779200 }, { "epoch": 3.1746969362117308, "grad_norm": 4.910303592681885, "learning_rate": 0.0026572060686669047, "loss": 7.6826, "step": 779300 }, { "epoch": 3.1751043142351123, "grad_norm": 5.045969009399414, "learning_rate": 0.002656710896367351, "loss": 7.6876, "step": 779400 }, { "epoch": 3.1755116922584934, "grad_norm": 6.6033616065979, "learning_rate": 0.0026562157180698522, "loss": 7.6794, "step": 779500 }, { "epoch": 3.175919070281875, "grad_norm": 3.0145375728607178, "learning_rate": 0.0026557205337939854, "loss": 7.6678, "step": 779600 }, { "epoch": 3.1763264483052565, "grad_norm": 3.813138484954834, "learning_rate": 0.00265522534355933, "loss": 7.6944, "step": 779700 }, { "epoch": 3.176733826328638, "grad_norm": 6.576977252960205, "learning_rate": 0.0026547301473854638, "loss": 7.6928, "step": 779800 }, { "epoch": 3.1771412043520195, "grad_norm": 3.4925901889801025, "learning_rate": 0.0026542349452919694, "loss": 7.7008, "step": 779900 }, { "epoch": 3.177548582375401, "grad_norm": 4.209640979766846, "learning_rate": 0.002653739737298417, "loss": 7.6801, "step": 780000 }, { "epoch": 3.177548582375401, "eval_MaskedAccuracy": 0.5061531277212185, "eval_loss": 1.6210638284683228, "eval_runtime": 152.9389, "eval_samples_per_second": 415.042, "eval_steps_per_second": 1.622, "step": 780000 }, { "epoch": 3.177955960398782, "grad_norm": 4.764199256896973, "learning_rate": 0.0026532445234243858, "loss": 7.6904, "step": 780100 }, { "epoch": 3.1783633384221637, "grad_norm": 6.731513023376465, "learning_rate": 0.0026527493036894545, "loss": 7.7001, "step": 780200 }, { "epoch": 3.178770716445545, "grad_norm": 6.768033504486084, "learning_rate": 0.002652254078113203, "loss": 7.7091, "step": 780300 }, { "epoch": 3.1791780944689267, "grad_norm": 4.610159873962402, "learning_rate": 0.0026517588467152133, "loss": 7.6851, "step": 780400 }, { "epoch": 3.1795854724923083, "grad_norm": 3.216808319091797, "learning_rate": 0.002651263609515061, "loss": 7.7173, "step": 780500 }, { "epoch": 3.17999285051569, "grad_norm": 5.673268795013428, "learning_rate": 0.002650768366532327, "loss": 7.6733, "step": 780600 }, { "epoch": 3.180400228539071, "grad_norm": 3.343284845352173, "learning_rate": 0.002650273117786585, "loss": 7.6909, "step": 780700 }, { "epoch": 3.1808076065624524, "grad_norm": 6.623497009277344, "learning_rate": 0.0026497778632974243, "loss": 7.6662, "step": 780800 }, { "epoch": 3.181214984585834, "grad_norm": 3.500135660171509, "learning_rate": 0.002649282603084417, "loss": 7.6678, "step": 780900 }, { "epoch": 3.1816223626092155, "grad_norm": 7.12910795211792, "learning_rate": 0.002648787337167148, "loss": 7.691, "step": 781000 }, { "epoch": 3.1816223626092155, "eval_MaskedAccuracy": 0.5053428490295067, "eval_loss": 1.6209814548492432, "eval_runtime": 150.3438, "eval_samples_per_second": 422.206, "eval_steps_per_second": 1.65, "step": 781000 }, { "epoch": 3.182029740632597, "grad_norm": 4.574215412139893, "learning_rate": 0.002648292065565196, "loss": 7.6977, "step": 781100 }, { "epoch": 3.1824371186559786, "grad_norm": 6.274864673614502, "learning_rate": 0.0026477967882981464, "loss": 7.698, "step": 781200 }, { "epoch": 3.18284449667936, "grad_norm": 4.036904335021973, "learning_rate": 0.0026473015053855723, "loss": 7.6828, "step": 781300 }, { "epoch": 3.183251874702741, "grad_norm": 3.259281873703003, "learning_rate": 0.002646806216847059, "loss": 7.6906, "step": 781400 }, { "epoch": 3.1836592527261227, "grad_norm": 3.483506202697754, "learning_rate": 0.0026463109227021853, "loss": 7.6865, "step": 781500 }, { "epoch": 3.1840666307495042, "grad_norm": 3.4609780311584473, "learning_rate": 0.0026458156229705373, "loss": 7.679, "step": 781600 }, { "epoch": 3.184474008772886, "grad_norm": 10.09542465209961, "learning_rate": 0.002645320317671695, "loss": 7.689, "step": 781700 }, { "epoch": 3.1848813867962673, "grad_norm": 5.758722305297852, "learning_rate": 0.002644825006825238, "loss": 7.6808, "step": 781800 }, { "epoch": 3.185288764819649, "grad_norm": 5.515896797180176, "learning_rate": 0.002644329690450753, "loss": 7.6765, "step": 781900 }, { "epoch": 3.18569614284303, "grad_norm": 3.3453032970428467, "learning_rate": 0.0026438343685678193, "loss": 7.6858, "step": 782000 }, { "epoch": 3.18569614284303, "eval_MaskedAccuracy": 0.5066394594667833, "eval_loss": 1.617942214012146, "eval_runtime": 150.4516, "eval_samples_per_second": 421.903, "eval_steps_per_second": 1.648, "step": 782000 }, { "epoch": 3.1861035208664115, "grad_norm": 5.306074142456055, "learning_rate": 0.0026433390411960207, "loss": 7.6757, "step": 782100 }, { "epoch": 3.186510898889793, "grad_norm": 4.115302085876465, "learning_rate": 0.002642843708354939, "loss": 7.6585, "step": 782200 }, { "epoch": 3.1869182769131745, "grad_norm": 5.047887325286865, "learning_rate": 0.0026423483700641587, "loss": 7.7016, "step": 782300 }, { "epoch": 3.187325654936556, "grad_norm": 5.8126397132873535, "learning_rate": 0.0026418530263432644, "loss": 7.694, "step": 782400 }, { "epoch": 3.1877330329599376, "grad_norm": 6.155248165130615, "learning_rate": 0.002641357677211836, "loss": 7.7, "step": 782500 }, { "epoch": 3.1881404109833187, "grad_norm": 4.365386486053467, "learning_rate": 0.002640862322689461, "loss": 7.6771, "step": 782600 }, { "epoch": 3.1885477890067, "grad_norm": 3.0681233406066895, "learning_rate": 0.002640366962795722, "loss": 7.67, "step": 782700 }, { "epoch": 3.1889551670300817, "grad_norm": 7.4326300621032715, "learning_rate": 0.0026398715975502036, "loss": 7.6714, "step": 782800 }, { "epoch": 3.1893625450534633, "grad_norm": 3.5183217525482178, "learning_rate": 0.002639376226972487, "loss": 7.6961, "step": 782900 }, { "epoch": 3.189769923076845, "grad_norm": 6.676580905914307, "learning_rate": 0.0026388808510821593, "loss": 7.7003, "step": 783000 }, { "epoch": 3.189769923076845, "eval_MaskedAccuracy": 0.5067654794652767, "eval_loss": 1.6153063774108887, "eval_runtime": 150.0364, "eval_samples_per_second": 423.071, "eval_steps_per_second": 1.653, "step": 783000 }, { "epoch": 3.1901773011002263, "grad_norm": 6.243706703186035, "learning_rate": 0.0026383854698988074, "loss": 7.6941, "step": 783100 }, { "epoch": 3.1905846791236074, "grad_norm": 8.316851615905762, "learning_rate": 0.0026378900834420154, "loss": 7.6673, "step": 783200 }, { "epoch": 3.190992057146989, "grad_norm": 5.983206272125244, "learning_rate": 0.0026373946917313657, "loss": 7.6855, "step": 783300 }, { "epoch": 3.1913994351703705, "grad_norm": 5.882169246673584, "learning_rate": 0.0026368992947864447, "loss": 7.7042, "step": 783400 }, { "epoch": 3.191806813193752, "grad_norm": 7.513192653656006, "learning_rate": 0.002636403892626837, "loss": 7.7099, "step": 783500 }, { "epoch": 3.1922141912171336, "grad_norm": 6.345666885375977, "learning_rate": 0.0026359084852721365, "loss": 7.6897, "step": 783600 }, { "epoch": 3.192621569240515, "grad_norm": 7.76584005355835, "learning_rate": 0.002635413072741921, "loss": 7.6796, "step": 783700 }, { "epoch": 3.1930289472638966, "grad_norm": 3.749234914779663, "learning_rate": 0.0026349176550557797, "loss": 7.6882, "step": 783800 }, { "epoch": 3.1934363252872777, "grad_norm": 3.1690194606781006, "learning_rate": 0.002634422232233299, "loss": 7.6791, "step": 783900 }, { "epoch": 3.1938437033106593, "grad_norm": 4.605543613433838, "learning_rate": 0.002633926804294065, "loss": 7.701, "step": 784000 }, { "epoch": 3.1938437033106593, "eval_MaskedAccuracy": 0.5065523382952148, "eval_loss": 1.6239582300186157, "eval_runtime": 152.2555, "eval_samples_per_second": 416.905, "eval_steps_per_second": 1.629, "step": 784000 }, { "epoch": 3.194251081334041, "grad_norm": 6.241485118865967, "learning_rate": 0.002633431371257664, "loss": 7.7153, "step": 784100 }, { "epoch": 3.1946584593574223, "grad_norm": 2.8318705558776855, "learning_rate": 0.0026329359331436855, "loss": 7.6827, "step": 784200 }, { "epoch": 3.195065837380804, "grad_norm": 3.8722984790802, "learning_rate": 0.002632440489971713, "loss": 7.6778, "step": 784300 }, { "epoch": 3.1954732154041854, "grad_norm": 6.953472137451172, "learning_rate": 0.0026319450417613363, "loss": 7.6649, "step": 784400 }, { "epoch": 3.1958805934275665, "grad_norm": 4.383271217346191, "learning_rate": 0.0026314495885321415, "loss": 7.6842, "step": 784500 }, { "epoch": 3.196287971450948, "grad_norm": 6.655606746673584, "learning_rate": 0.002630954130303722, "loss": 7.6752, "step": 784600 }, { "epoch": 3.1966953494743295, "grad_norm": 5.002296447753906, "learning_rate": 0.0026304586670956617, "loss": 7.6938, "step": 784700 }, { "epoch": 3.197102727497711, "grad_norm": 8.698090553283691, "learning_rate": 0.0026299631989275493, "loss": 7.6592, "step": 784800 }, { "epoch": 3.1975101055210926, "grad_norm": 4.085105895996094, "learning_rate": 0.002629467725818974, "loss": 7.7082, "step": 784900 }, { "epoch": 3.197917483544474, "grad_norm": 5.493113994598389, "learning_rate": 0.0026289722477895227, "loss": 7.6974, "step": 785000 }, { "epoch": 3.197917483544474, "eval_MaskedAccuracy": 0.5063409749369749, "eval_loss": 1.6177254915237427, "eval_runtime": 154.8526, "eval_samples_per_second": 409.912, "eval_steps_per_second": 1.602, "step": 785000 }, { "epoch": 3.1983248615678552, "grad_norm": 5.456820487976074, "learning_rate": 0.0026284767648587846, "loss": 7.6985, "step": 785100 }, { "epoch": 3.1987322395912368, "grad_norm": 7.716866493225098, "learning_rate": 0.0026279812770463487, "loss": 7.7319, "step": 785200 }, { "epoch": 3.1991396176146183, "grad_norm": 5.643834590911865, "learning_rate": 0.0026274857843718077, "loss": 7.6988, "step": 785300 }, { "epoch": 3.199546995638, "grad_norm": 4.009466648101807, "learning_rate": 0.002626990286854748, "loss": 7.672, "step": 785400 }, { "epoch": 3.1999543736613814, "grad_norm": 6.094020843505859, "learning_rate": 0.0026264947845147605, "loss": 7.6919, "step": 785500 }, { "epoch": 3.200361751684763, "grad_norm": 5.463302135467529, "learning_rate": 0.002625999277371434, "loss": 7.6692, "step": 785600 }, { "epoch": 3.200769129708144, "grad_norm": 3.863553047180176, "learning_rate": 0.0026255037654443593, "loss": 7.6706, "step": 785700 }, { "epoch": 3.2011765077315255, "grad_norm": 5.52654504776001, "learning_rate": 0.002625008248753126, "loss": 7.6779, "step": 785800 }, { "epoch": 3.201583885754907, "grad_norm": 5.104406356811523, "learning_rate": 0.002624512727317326, "loss": 7.6878, "step": 785900 }, { "epoch": 3.2019912637782886, "grad_norm": 4.26261568069458, "learning_rate": 0.0026240172011565456, "loss": 7.7023, "step": 786000 }, { "epoch": 3.2019912637782886, "eval_MaskedAccuracy": 0.5069396359302315, "eval_loss": 1.6208125352859497, "eval_runtime": 150.9245, "eval_samples_per_second": 420.581, "eval_steps_per_second": 1.643, "step": 786000 }, { "epoch": 3.20239864180167, "grad_norm": 2.636861562728882, "learning_rate": 0.0026235216702903785, "loss": 7.7174, "step": 786100 }, { "epoch": 3.2028060198250516, "grad_norm": 5.463211536407471, "learning_rate": 0.002623026134738419, "loss": 7.7076, "step": 786200 }, { "epoch": 3.203213397848433, "grad_norm": 11.473145484924316, "learning_rate": 0.002622530594520253, "loss": 7.7009, "step": 786300 }, { "epoch": 3.2036207758718143, "grad_norm": 11.875235557556152, "learning_rate": 0.0026220350496554724, "loss": 7.71, "step": 786400 }, { "epoch": 3.204028153895196, "grad_norm": 3.7144064903259277, "learning_rate": 0.0026215395001636735, "loss": 7.7101, "step": 786500 }, { "epoch": 3.2044355319185773, "grad_norm": 4.499942779541016, "learning_rate": 0.002621043946064445, "loss": 7.6817, "step": 786600 }, { "epoch": 3.204842909941959, "grad_norm": 4.006444931030273, "learning_rate": 0.002620548387377378, "loss": 7.6984, "step": 786700 }, { "epoch": 3.2052502879653404, "grad_norm": 5.552825927734375, "learning_rate": 0.0026200528241220634, "loss": 7.6526, "step": 786800 }, { "epoch": 3.205657665988722, "grad_norm": 5.5530781745910645, "learning_rate": 0.002619557256318098, "loss": 7.6713, "step": 786900 }, { "epoch": 3.206065044012103, "grad_norm": 3.4495584964752197, "learning_rate": 0.0026190616839850706, "loss": 7.7045, "step": 787000 }, { "epoch": 3.206065044012103, "eval_MaskedAccuracy": 0.5069079826713474, "eval_loss": 1.6114245653152466, "eval_runtime": 152.2694, "eval_samples_per_second": 416.867, "eval_steps_per_second": 1.629, "step": 787000 }, { "epoch": 3.2064724220354845, "grad_norm": 6.009396553039551, "learning_rate": 0.0026185661071425736, "loss": 7.6799, "step": 787100 }, { "epoch": 3.206879800058866, "grad_norm": 6.559325695037842, "learning_rate": 0.0026180705258102006, "loss": 7.6822, "step": 787200 }, { "epoch": 3.2072871780822476, "grad_norm": 3.152937173843384, "learning_rate": 0.0026175749400075437, "loss": 7.689, "step": 787300 }, { "epoch": 3.207694556105629, "grad_norm": 7.587808609008789, "learning_rate": 0.0026170793497542013, "loss": 7.6745, "step": 787400 }, { "epoch": 3.2081019341290107, "grad_norm": 7.427546501159668, "learning_rate": 0.0026165837550697624, "loss": 7.6911, "step": 787500 }, { "epoch": 3.2085093121523918, "grad_norm": 6.7571563720703125, "learning_rate": 0.00261608815597382, "loss": 7.7035, "step": 787600 }, { "epoch": 3.2089166901757733, "grad_norm": 5.109492778778076, "learning_rate": 0.0026155925524859684, "loss": 7.7036, "step": 787700 }, { "epoch": 3.209324068199155, "grad_norm": 5.578980922698975, "learning_rate": 0.0026150969446258015, "loss": 7.6811, "step": 787800 }, { "epoch": 3.2097314462225364, "grad_norm": 8.288555145263672, "learning_rate": 0.0026146013324129156, "loss": 7.6812, "step": 787900 }, { "epoch": 3.210138824245918, "grad_norm": 3.1266098022460938, "learning_rate": 0.0026141057158668994, "loss": 7.6949, "step": 788000 }, { "epoch": 3.210138824245918, "eval_MaskedAccuracy": 0.5067794550282789, "eval_loss": 1.6136877536773682, "eval_runtime": 155.6805, "eval_samples_per_second": 407.733, "eval_steps_per_second": 1.593, "step": 788000 }, { "epoch": 3.2105462022692994, "grad_norm": 7.264710903167725, "learning_rate": 0.0026136100950073533, "loss": 7.6616, "step": 788100 }, { "epoch": 3.2109535802926805, "grad_norm": 3.103065013885498, "learning_rate": 0.0026131144698538656, "loss": 7.6963, "step": 788200 }, { "epoch": 3.211360958316062, "grad_norm": 3.8947644233703613, "learning_rate": 0.0026126188404260397, "loss": 7.6848, "step": 788300 }, { "epoch": 3.2117683363394436, "grad_norm": 5.128793239593506, "learning_rate": 0.002612123206743462, "loss": 7.6758, "step": 788400 }, { "epoch": 3.212175714362825, "grad_norm": 10.643733024597168, "learning_rate": 0.002611627568825732, "loss": 7.6847, "step": 788500 }, { "epoch": 3.2125830923862067, "grad_norm": 7.177453994750977, "learning_rate": 0.0026111319266924433, "loss": 7.706, "step": 788600 }, { "epoch": 3.212990470409588, "grad_norm": 6.658900260925293, "learning_rate": 0.002610636280363193, "loss": 7.7116, "step": 788700 }, { "epoch": 3.2133978484329697, "grad_norm": 3.2530908584594727, "learning_rate": 0.0026101406298575753, "loss": 7.6861, "step": 788800 }, { "epoch": 3.213805226456351, "grad_norm": 2.9466047286987305, "learning_rate": 0.0026096449751951863, "loss": 7.6951, "step": 788900 }, { "epoch": 3.2142126044797323, "grad_norm": 8.129348754882812, "learning_rate": 0.002609149316395622, "loss": 7.6805, "step": 789000 }, { "epoch": 3.2142126044797323, "eval_MaskedAccuracy": 0.5062638108644215, "eval_loss": 1.624267339706421, "eval_runtime": 173.9503, "eval_samples_per_second": 364.909, "eval_steps_per_second": 1.426, "step": 789000 }, { "epoch": 3.214619982503114, "grad_norm": 4.266531944274902, "learning_rate": 0.002608653653478478, "loss": 7.6982, "step": 789100 }, { "epoch": 3.2150273605264954, "grad_norm": 5.689974308013916, "learning_rate": 0.002608157986463352, "loss": 7.6479, "step": 789200 }, { "epoch": 3.215434738549877, "grad_norm": 5.786675930023193, "learning_rate": 0.0026076623153698363, "loss": 7.6495, "step": 789300 }, { "epoch": 3.2158421165732585, "grad_norm": 4.292298316955566, "learning_rate": 0.002607166640217529, "loss": 7.6508, "step": 789400 }, { "epoch": 3.2162494945966396, "grad_norm": 7.28933572769165, "learning_rate": 0.0026066709610260298, "loss": 7.6981, "step": 789500 }, { "epoch": 3.216656872620021, "grad_norm": 5.634224891662598, "learning_rate": 0.002606175277814936, "loss": 7.6803, "step": 789600 }, { "epoch": 3.2170642506434026, "grad_norm": 9.318557739257812, "learning_rate": 0.0026056795906038385, "loss": 7.6912, "step": 789700 }, { "epoch": 3.217471628666784, "grad_norm": 5.599298477172852, "learning_rate": 0.002605183899412342, "loss": 7.6777, "step": 789800 }, { "epoch": 3.2178790066901657, "grad_norm": 7.910877227783203, "learning_rate": 0.002604688204260037, "loss": 7.6685, "step": 789900 }, { "epoch": 3.218286384713547, "grad_norm": 9.06559944152832, "learning_rate": 0.002604192505166525, "loss": 7.6621, "step": 790000 }, { "epoch": 3.218286384713547, "eval_MaskedAccuracy": 0.5063175242791842, "eval_loss": 1.6187844276428223, "eval_runtime": 168.5787, "eval_samples_per_second": 376.536, "eval_steps_per_second": 1.471, "step": 790000 }, { "epoch": 3.2186937627369283, "grad_norm": 4.518896579742432, "learning_rate": 0.002603696802151403, "loss": 7.6735, "step": 790100 }, { "epoch": 3.21910114076031, "grad_norm": 6.1575188636779785, "learning_rate": 0.002603201095234266, "loss": 7.6557, "step": 790200 }, { "epoch": 3.2195085187836914, "grad_norm": 5.235797882080078, "learning_rate": 0.002602705384434718, "loss": 7.6597, "step": 790300 }, { "epoch": 3.219915896807073, "grad_norm": 7.212177276611328, "learning_rate": 0.0026022096697723538, "loss": 7.6782, "step": 790400 }, { "epoch": 3.2203232748304544, "grad_norm": 15.714856147766113, "learning_rate": 0.002601713951266769, "loss": 7.677, "step": 790500 }, { "epoch": 3.220730652853836, "grad_norm": 6.259230136871338, "learning_rate": 0.002601218228937566, "loss": 7.6816, "step": 790600 }, { "epoch": 3.221138030877217, "grad_norm": 4.9577178955078125, "learning_rate": 0.002600722502804343, "loss": 7.6754, "step": 790700 }, { "epoch": 3.2215454089005986, "grad_norm": 6.513192653656006, "learning_rate": 0.0026002267728866962, "loss": 7.6776, "step": 790800 }, { "epoch": 3.22195278692398, "grad_norm": 4.687143325805664, "learning_rate": 0.0025997310392042233, "loss": 7.6789, "step": 790900 }, { "epoch": 3.2223601649473617, "grad_norm": 6.113856315612793, "learning_rate": 0.002599235301776526, "loss": 7.6858, "step": 791000 }, { "epoch": 3.2223601649473617, "eval_MaskedAccuracy": 0.5065886525316977, "eval_loss": 1.6032770872116089, "eval_runtime": 157.6746, "eval_samples_per_second": 402.576, "eval_steps_per_second": 1.573, "step": 791000 }, { "epoch": 3.222767542970743, "grad_norm": 6.812439441680908, "learning_rate": 0.002598739560623205, "loss": 7.6818, "step": 791100 }, { "epoch": 3.2231749209941247, "grad_norm": 6.572329044342041, "learning_rate": 0.002598243815763861, "loss": 7.6804, "step": 791200 }, { "epoch": 3.2235822990175063, "grad_norm": 10.578540802001953, "learning_rate": 0.0025977480672180852, "loss": 7.6719, "step": 791300 }, { "epoch": 3.2239896770408873, "grad_norm": 5.3097381591796875, "learning_rate": 0.0025972523150054817, "loss": 7.6551, "step": 791400 }, { "epoch": 3.224397055064269, "grad_norm": 6.691742420196533, "learning_rate": 0.002596756559145654, "loss": 7.7099, "step": 791500 }, { "epoch": 3.2248044330876504, "grad_norm": 7.910216808319092, "learning_rate": 0.0025962607996581947, "loss": 7.6928, "step": 791600 }, { "epoch": 3.225211811111032, "grad_norm": 3.708089828491211, "learning_rate": 0.002595765036562709, "loss": 7.6641, "step": 791700 }, { "epoch": 3.2256191891344135, "grad_norm": 5.971957683563232, "learning_rate": 0.0025952692698788, "loss": 7.6519, "step": 791800 }, { "epoch": 3.226026567157795, "grad_norm": 6.111754894256592, "learning_rate": 0.0025947734996260657, "loss": 7.6577, "step": 791900 }, { "epoch": 3.226433945181176, "grad_norm": 9.812180519104004, "learning_rate": 0.0025942777258241047, "loss": 7.6786, "step": 792000 }, { "epoch": 3.226433945181176, "eval_MaskedAccuracy": 0.5064100908461951, "eval_loss": 1.6158021688461304, "eval_runtime": 152.3994, "eval_samples_per_second": 416.511, "eval_steps_per_second": 1.627, "step": 792000 }, { "epoch": 3.2268413232045576, "grad_norm": 3.814610481262207, "learning_rate": 0.0025937819484925166, "loss": 7.6836, "step": 792100 }, { "epoch": 3.227248701227939, "grad_norm": 7.241386890411377, "learning_rate": 0.0025932861676508994, "loss": 7.6843, "step": 792200 }, { "epoch": 3.2276560792513207, "grad_norm": 9.676058769226074, "learning_rate": 0.0025927903833188573, "loss": 7.6763, "step": 792300 }, { "epoch": 3.2280634572747022, "grad_norm": 9.320780754089355, "learning_rate": 0.002592294595515992, "loss": 7.682, "step": 792400 }, { "epoch": 3.2284708352980838, "grad_norm": 7.394233703613281, "learning_rate": 0.002591798804261906, "loss": 7.6691, "step": 792500 }, { "epoch": 3.228878213321465, "grad_norm": 3.47312331199646, "learning_rate": 0.0025913030095761993, "loss": 7.6674, "step": 792600 }, { "epoch": 3.2292855913448464, "grad_norm": 3.3234751224517822, "learning_rate": 0.002590807211478471, "loss": 7.6895, "step": 792700 }, { "epoch": 3.229692969368228, "grad_norm": 4.033292770385742, "learning_rate": 0.0025903114099883264, "loss": 7.6742, "step": 792800 }, { "epoch": 3.2301003473916094, "grad_norm": 9.050682067871094, "learning_rate": 0.002589815605125367, "loss": 7.6805, "step": 792900 }, { "epoch": 3.230507725414991, "grad_norm": 3.7985522747039795, "learning_rate": 0.0025893197969091937, "loss": 7.6909, "step": 793000 }, { "epoch": 3.230507725414991, "eval_MaskedAccuracy": 0.5060678756069075, "eval_loss": 1.6170668601989746, "eval_runtime": 157.5492, "eval_samples_per_second": 402.896, "eval_steps_per_second": 1.574, "step": 793000 }, { "epoch": 3.2309151034383725, "grad_norm": 11.379815101623535, "learning_rate": 0.0025888239853594087, "loss": 7.6679, "step": 793100 }, { "epoch": 3.2313224814617536, "grad_norm": 4.251285552978516, "learning_rate": 0.0025883281704956094, "loss": 7.6722, "step": 793200 }, { "epoch": 3.231729859485135, "grad_norm": 3.687894582748413, "learning_rate": 0.0025878323523374014, "loss": 7.6736, "step": 793300 }, { "epoch": 3.2321372375085167, "grad_norm": 3.4797394275665283, "learning_rate": 0.002587336530904386, "loss": 7.6828, "step": 793400 }, { "epoch": 3.232544615531898, "grad_norm": 7.058933258056641, "learning_rate": 0.0025868407062161723, "loss": 7.6808, "step": 793500 }, { "epoch": 3.2329519935552797, "grad_norm": 4.875361919403076, "learning_rate": 0.002586344878292359, "loss": 7.6649, "step": 793600 }, { "epoch": 3.2333593715786613, "grad_norm": 5.618971824645996, "learning_rate": 0.002585849047152545, "loss": 7.6777, "step": 793700 }, { "epoch": 3.233766749602043, "grad_norm": 6.132779598236084, "learning_rate": 0.002585353212816335, "loss": 7.6948, "step": 793800 }, { "epoch": 3.234174127625424, "grad_norm": 3.53932785987854, "learning_rate": 0.0025848573753033334, "loss": 7.6862, "step": 793900 }, { "epoch": 3.2345815056488054, "grad_norm": 5.543338298797607, "learning_rate": 0.002584361534633141, "loss": 7.6988, "step": 794000 }, { "epoch": 3.2345815056488054, "eval_MaskedAccuracy": 0.50695645326961, "eval_loss": 1.6186436414718628, "eval_runtime": 161.857, "eval_samples_per_second": 392.173, "eval_steps_per_second": 1.532, "step": 794000 }, { "epoch": 3.234988883672187, "grad_norm": 17.440731048583984, "learning_rate": 0.0025838656908253664, "loss": 7.6754, "step": 794100 }, { "epoch": 3.2353962616955685, "grad_norm": 3.4629099369049072, "learning_rate": 0.00258336984389961, "loss": 7.6435, "step": 794200 }, { "epoch": 3.23580363971895, "grad_norm": 9.417774200439453, "learning_rate": 0.0025828739938754688, "loss": 7.6781, "step": 794300 }, { "epoch": 3.2362110177423316, "grad_norm": 6.153289794921875, "learning_rate": 0.0025823781407725553, "loss": 7.6774, "step": 794400 }, { "epoch": 3.2366183957657126, "grad_norm": 12.250358581542969, "learning_rate": 0.002581882284610468, "loss": 7.6891, "step": 794500 }, { "epoch": 3.237025773789094, "grad_norm": 10.045564651489258, "learning_rate": 0.0025813864254088134, "loss": 7.6464, "step": 794600 }, { "epoch": 3.2374331518124757, "grad_norm": 4.859644889831543, "learning_rate": 0.002580890563187195, "loss": 7.6857, "step": 794700 }, { "epoch": 3.2378405298358572, "grad_norm": 3.5310721397399902, "learning_rate": 0.0025803946979652197, "loss": 7.6961, "step": 794800 }, { "epoch": 3.2382479078592388, "grad_norm": 5.769550323486328, "learning_rate": 0.0025798988297624876, "loss": 7.6683, "step": 794900 }, { "epoch": 3.2386552858826203, "grad_norm": 8.49172592163086, "learning_rate": 0.0025794029585986034, "loss": 7.6626, "step": 795000 }, { "epoch": 3.2386552858826203, "eval_MaskedAccuracy": 0.506165139490274, "eval_loss": 1.6166064739227295, "eval_runtime": 155.8636, "eval_samples_per_second": 407.253, "eval_steps_per_second": 1.591, "step": 795000 }, { "epoch": 3.2390626639060014, "grad_norm": 3.65195369720459, "learning_rate": 0.002578907084493171, "loss": 7.6849, "step": 795100 }, { "epoch": 3.239470041929383, "grad_norm": 7.123879909515381, "learning_rate": 0.0025784112074657994, "loss": 7.6886, "step": 795200 }, { "epoch": 3.2398774199527645, "grad_norm": 3.4789581298828125, "learning_rate": 0.0025779153275360886, "loss": 7.657, "step": 795300 }, { "epoch": 3.240284797976146, "grad_norm": 6.556728839874268, "learning_rate": 0.0025774194447236456, "loss": 7.6918, "step": 795400 }, { "epoch": 3.2406921759995275, "grad_norm": 4.326566219329834, "learning_rate": 0.0025769235590480735, "loss": 7.66, "step": 795500 }, { "epoch": 3.241099554022909, "grad_norm": 5.212819576263428, "learning_rate": 0.0025764276705289796, "loss": 7.6641, "step": 795600 }, { "epoch": 3.24150693204629, "grad_norm": 11.032466888427734, "learning_rate": 0.002575931779185968, "loss": 7.6738, "step": 795700 }, { "epoch": 3.2419143100696717, "grad_norm": 3.8308167457580566, "learning_rate": 0.0025754358850386423, "loss": 7.6656, "step": 795800 }, { "epoch": 3.242321688093053, "grad_norm": 5.099149227142334, "learning_rate": 0.002574939988106612, "loss": 7.6669, "step": 795900 }, { "epoch": 3.2427290661164347, "grad_norm": 6.100481986999512, "learning_rate": 0.002574444088409479, "loss": 7.7018, "step": 796000 }, { "epoch": 3.2427290661164347, "eval_MaskedAccuracy": 0.5066901859609768, "eval_loss": 1.6176397800445557, "eval_runtime": 154.6266, "eval_samples_per_second": 410.512, "eval_steps_per_second": 1.604, "step": 796000 }, { "epoch": 3.2431364441398163, "grad_norm": 3.527095079421997, "learning_rate": 0.0025739481859668495, "loss": 7.6949, "step": 796100 }, { "epoch": 3.243543822163198, "grad_norm": 6.980803966522217, "learning_rate": 0.0025734522807983303, "loss": 7.6947, "step": 796200 }, { "epoch": 3.2439512001865793, "grad_norm": 5.4837212562561035, "learning_rate": 0.002572956372923524, "loss": 7.6714, "step": 796300 }, { "epoch": 3.2443585782099604, "grad_norm": 4.07946252822876, "learning_rate": 0.002572460462362038, "loss": 7.6724, "step": 796400 }, { "epoch": 3.244765956233342, "grad_norm": 3.421415090560913, "learning_rate": 0.0025719645491334786, "loss": 7.6686, "step": 796500 }, { "epoch": 3.2451733342567235, "grad_norm": 6.321123123168945, "learning_rate": 0.0025714686332574534, "loss": 7.693, "step": 796600 }, { "epoch": 3.245580712280105, "grad_norm": 6.269840717315674, "learning_rate": 0.002570972714753567, "loss": 7.6786, "step": 796700 }, { "epoch": 3.2459880903034866, "grad_norm": 4.550161361694336, "learning_rate": 0.002570476793641428, "loss": 7.6605, "step": 796800 }, { "epoch": 3.246395468326868, "grad_norm": 8.534757614135742, "learning_rate": 0.002569980869940643, "loss": 7.6706, "step": 796900 }, { "epoch": 3.246802846350249, "grad_norm": 11.600919723510742, "learning_rate": 0.0025694849436708168, "loss": 7.6493, "step": 797000 }, { "epoch": 3.246802846350249, "eval_MaskedAccuracy": 0.5071739173757946, "eval_loss": 1.6162763833999634, "eval_runtime": 159.0367, "eval_samples_per_second": 399.128, "eval_steps_per_second": 1.559, "step": 797000 }, { "epoch": 3.2472102243736307, "grad_norm": 3.2398877143859863, "learning_rate": 0.002568989014851553, "loss": 7.6875, "step": 797100 }, { "epoch": 3.2476176023970122, "grad_norm": 4.973508358001709, "learning_rate": 0.0025684930835024604, "loss": 7.7187, "step": 797200 }, { "epoch": 3.248024980420394, "grad_norm": 2.883657693862915, "learning_rate": 0.0025679971496431497, "loss": 7.6984, "step": 797300 }, { "epoch": 3.2484323584437753, "grad_norm": 6.177520275115967, "learning_rate": 0.0025675012132932214, "loss": 7.6635, "step": 797400 }, { "epoch": 3.248839736467157, "grad_norm": 5.778260707855225, "learning_rate": 0.0025670052744722867, "loss": 7.6964, "step": 797500 }, { "epoch": 3.249247114490538, "grad_norm": 8.930655479431152, "learning_rate": 0.002566509333199952, "loss": 7.7054, "step": 797600 }, { "epoch": 3.2496544925139195, "grad_norm": 9.496335983276367, "learning_rate": 0.002566013389495825, "loss": 7.6869, "step": 797700 }, { "epoch": 3.250061870537301, "grad_norm": 5.685776710510254, "learning_rate": 0.0025655174433795104, "loss": 7.6672, "step": 797800 }, { "epoch": 3.2504692485606825, "grad_norm": 4.112448692321777, "learning_rate": 0.002565021494870619, "loss": 7.6707, "step": 797900 }, { "epoch": 3.250876626584064, "grad_norm": 9.60294246673584, "learning_rate": 0.0025645255439887553, "loss": 7.6897, "step": 798000 }, { "epoch": 3.250876626584064, "eval_MaskedAccuracy": 0.5067806243928795, "eval_loss": 1.6066287755966187, "eval_runtime": 151.4015, "eval_samples_per_second": 419.256, "eval_steps_per_second": 1.638, "step": 798000 }, { "epoch": 3.2512840046074456, "grad_norm": 11.241584777832031, "learning_rate": 0.0025640295907535323, "loss": 7.6509, "step": 798100 }, { "epoch": 3.2516913826308267, "grad_norm": 5.0842461585998535, "learning_rate": 0.0025635336351845506, "loss": 7.6853, "step": 798200 }, { "epoch": 3.252098760654208, "grad_norm": 3.7420406341552734, "learning_rate": 0.0025630376773014226, "loss": 7.6807, "step": 798300 }, { "epoch": 3.2525061386775898, "grad_norm": 4.594023704528809, "learning_rate": 0.002562541717123752, "loss": 7.6355, "step": 798400 }, { "epoch": 3.2529135167009713, "grad_norm": 3.3977713584899902, "learning_rate": 0.0025620457546711517, "loss": 7.7051, "step": 798500 }, { "epoch": 3.253320894724353, "grad_norm": 3.1453049182891846, "learning_rate": 0.0025615497899632296, "loss": 7.7023, "step": 798600 }, { "epoch": 3.2537282727477344, "grad_norm": 2.8873097896575928, "learning_rate": 0.0025610538230195906, "loss": 7.6959, "step": 798700 }, { "epoch": 3.254135650771116, "grad_norm": 4.299108982086182, "learning_rate": 0.002560557853859846, "loss": 7.6753, "step": 798800 }, { "epoch": 3.254543028794497, "grad_norm": 5.521244049072266, "learning_rate": 0.0025600618825036025, "loss": 7.7243, "step": 798900 }, { "epoch": 3.2549504068178785, "grad_norm": 2.167487144470215, "learning_rate": 0.002559565908970463, "loss": 7.6554, "step": 799000 }, { "epoch": 3.2549504068178785, "eval_MaskedAccuracy": 0.5067806013398966, "eval_loss": 1.620605230331421, "eval_runtime": 153.664, "eval_samples_per_second": 413.083, "eval_steps_per_second": 1.614, "step": 799000 }, { "epoch": 3.25535778484126, "grad_norm": 4.254641532897949, "learning_rate": 0.002559069933280044, "loss": 7.6711, "step": 799100 }, { "epoch": 3.2557651628646416, "grad_norm": 8.08716869354248, "learning_rate": 0.002558573955451954, "loss": 7.692, "step": 799200 }, { "epoch": 3.256172540888023, "grad_norm": 6.920125484466553, "learning_rate": 0.0025580779755057944, "loss": 7.6689, "step": 799300 }, { "epoch": 3.2565799189114046, "grad_norm": 12.316925048828125, "learning_rate": 0.002557581993461181, "loss": 7.6866, "step": 799400 }, { "epoch": 3.2569872969347857, "grad_norm": 4.028006553649902, "learning_rate": 0.0025570860093377203, "loss": 7.7037, "step": 799500 }, { "epoch": 3.2573946749581673, "grad_norm": 4.205632209777832, "learning_rate": 0.0025565900231550196, "loss": 7.6766, "step": 799600 }, { "epoch": 3.257802052981549, "grad_norm": 3.163806200027466, "learning_rate": 0.0025560940349326935, "loss": 7.6651, "step": 799700 }, { "epoch": 3.2582094310049303, "grad_norm": 4.043802261352539, "learning_rate": 0.0025555980446903444, "loss": 7.6903, "step": 799800 }, { "epoch": 3.258616809028312, "grad_norm": 5.942582130432129, "learning_rate": 0.0025551020524475824, "loss": 7.714, "step": 799900 }, { "epoch": 3.2590241870516934, "grad_norm": 10.770370483398438, "learning_rate": 0.002554606058224021, "loss": 7.7052, "step": 800000 }, { "epoch": 3.2590241870516934, "eval_MaskedAccuracy": 0.5065970476477271, "eval_loss": 1.621268630027771, "eval_runtime": 151.7535, "eval_samples_per_second": 418.284, "eval_steps_per_second": 1.634, "step": 800000 }, { "epoch": 3.259431565075075, "grad_norm": 6.294545650482178, "learning_rate": 0.002554110062039264, "loss": 7.6744, "step": 800100 }, { "epoch": 3.259838943098456, "grad_norm": 14.498682022094727, "learning_rate": 0.0025536140639129245, "loss": 7.6987, "step": 800200 }, { "epoch": 3.2602463211218375, "grad_norm": 3.168377637863159, "learning_rate": 0.0025531180638646103, "loss": 7.6485, "step": 800300 }, { "epoch": 3.260653699145219, "grad_norm": 12.571187973022461, "learning_rate": 0.0025526220619139315, "loss": 7.722, "step": 800400 }, { "epoch": 3.2610610771686006, "grad_norm": 9.39305591583252, "learning_rate": 0.0025521260580804992, "loss": 7.6599, "step": 800500 }, { "epoch": 3.261468455191982, "grad_norm": 4.574954032897949, "learning_rate": 0.0025516300523839228, "loss": 7.6529, "step": 800600 }, { "epoch": 3.2618758332153632, "grad_norm": 7.393759250640869, "learning_rate": 0.002551134044843811, "loss": 7.6694, "step": 800700 }, { "epoch": 3.2622832112387448, "grad_norm": 10.334128379821777, "learning_rate": 0.002550638035479774, "loss": 7.6941, "step": 800800 }, { "epoch": 3.2626905892621263, "grad_norm": 3.6924545764923096, "learning_rate": 0.0025501420243114205, "loss": 7.663, "step": 800900 }, { "epoch": 3.263097967285508, "grad_norm": 5.345485687255859, "learning_rate": 0.002549646011358361, "loss": 7.645, "step": 801000 }, { "epoch": 3.263097967285508, "eval_MaskedAccuracy": 0.5065235927475761, "eval_loss": 1.615440845489502, "eval_runtime": 151.0628, "eval_samples_per_second": 420.196, "eval_steps_per_second": 1.642, "step": 801000 }, { "epoch": 3.2635053453088894, "grad_norm": 6.097316741943359, "learning_rate": 0.002549149996640208, "loss": 7.6828, "step": 801100 }, { "epoch": 3.263912723332271, "grad_norm": 5.069605827331543, "learning_rate": 0.0025486539801765685, "loss": 7.7135, "step": 801200 }, { "epoch": 3.2643201013556524, "grad_norm": 3.518381118774414, "learning_rate": 0.0025481579619870518, "loss": 7.6617, "step": 801300 }, { "epoch": 3.2647274793790335, "grad_norm": 3.3528285026550293, "learning_rate": 0.0025476619420912736, "loss": 7.6659, "step": 801400 }, { "epoch": 3.265134857402415, "grad_norm": 6.6037421226501465, "learning_rate": 0.0025471659205088355, "loss": 7.7063, "step": 801500 }, { "epoch": 3.2655422354257966, "grad_norm": 5.243923664093018, "learning_rate": 0.0025466698972593563, "loss": 7.6827, "step": 801600 }, { "epoch": 3.265949613449178, "grad_norm": 6.573047637939453, "learning_rate": 0.0025461738723624432, "loss": 7.6895, "step": 801700 }, { "epoch": 3.2663569914725596, "grad_norm": 5.855910778045654, "learning_rate": 0.0025456778458377055, "loss": 7.673, "step": 801800 }, { "epoch": 3.266764369495941, "grad_norm": 5.195866584777832, "learning_rate": 0.0025451818177047547, "loss": 7.6784, "step": 801900 }, { "epoch": 3.2671717475193223, "grad_norm": 4.261105060577393, "learning_rate": 0.002544685787983205, "loss": 7.6847, "step": 802000 }, { "epoch": 3.2671717475193223, "eval_MaskedAccuracy": 0.5070783132797549, "eval_loss": 1.6130510568618774, "eval_runtime": 178.8347, "eval_samples_per_second": 354.942, "eval_steps_per_second": 1.387, "step": 802000 }, { "epoch": 3.267579125542704, "grad_norm": 9.53943157196045, "learning_rate": 0.002544189756692662, "loss": 7.7014, "step": 802100 }, { "epoch": 3.2679865035660853, "grad_norm": 4.8277130126953125, "learning_rate": 0.002543693723852737, "loss": 7.6575, "step": 802200 }, { "epoch": 3.268393881589467, "grad_norm": 2.8353869915008545, "learning_rate": 0.0025431976894830424, "loss": 7.7079, "step": 802300 }, { "epoch": 3.2688012596128484, "grad_norm": 4.182636260986328, "learning_rate": 0.0025427016536031886, "loss": 7.6707, "step": 802400 }, { "epoch": 3.26920863763623, "grad_norm": 4.57123327255249, "learning_rate": 0.0025422056162327854, "loss": 7.6783, "step": 802500 }, { "epoch": 3.2696160156596115, "grad_norm": 4.688640594482422, "learning_rate": 0.002541709577391448, "loss": 7.6814, "step": 802600 }, { "epoch": 3.2700233936829926, "grad_norm": 6.715690612792969, "learning_rate": 0.002541213537098782, "loss": 7.6787, "step": 802700 }, { "epoch": 3.270430771706374, "grad_norm": 12.053544044494629, "learning_rate": 0.0025407174953744035, "loss": 7.669, "step": 802800 }, { "epoch": 3.2708381497297556, "grad_norm": 4.029909133911133, "learning_rate": 0.0025402214522379195, "loss": 7.6633, "step": 802900 }, { "epoch": 3.271245527753137, "grad_norm": 3.22373366355896, "learning_rate": 0.0025397254077089435, "loss": 7.6628, "step": 803000 }, { "epoch": 3.271245527753137, "eval_MaskedAccuracy": 0.5067959192123417, "eval_loss": 1.615372657775879, "eval_runtime": 152.4193, "eval_samples_per_second": 416.456, "eval_steps_per_second": 1.627, "step": 803000 }, { "epoch": 3.2716529057765187, "grad_norm": 4.651641368865967, "learning_rate": 0.002539229361807089, "loss": 7.6845, "step": 803100 }, { "epoch": 3.2720602837998998, "grad_norm": 5.103263854980469, "learning_rate": 0.002538733314551964, "loss": 7.6793, "step": 803200 }, { "epoch": 3.2724676618232813, "grad_norm": 5.449814319610596, "learning_rate": 0.002538237265963181, "loss": 7.666, "step": 803300 }, { "epoch": 3.272875039846663, "grad_norm": 8.951945304870605, "learning_rate": 0.002537741216060347, "loss": 7.6829, "step": 803400 }, { "epoch": 3.2732824178700444, "grad_norm": 10.639776229858398, "learning_rate": 0.002537245164863083, "loss": 7.6379, "step": 803500 }, { "epoch": 3.273689795893426, "grad_norm": 4.770523548126221, "learning_rate": 0.002536749112390995, "loss": 7.6708, "step": 803600 }, { "epoch": 3.2740971739168074, "grad_norm": 6.401170253753662, "learning_rate": 0.0025362530586636903, "loss": 7.6789, "step": 803700 }, { "epoch": 3.274504551940189, "grad_norm": 10.611383438110352, "learning_rate": 0.0025357570037007884, "loss": 7.6818, "step": 803800 }, { "epoch": 3.27491192996357, "grad_norm": 4.672033786773682, "learning_rate": 0.002535260947521895, "loss": 7.6442, "step": 803900 }, { "epoch": 3.2753193079869516, "grad_norm": 4.015482425689697, "learning_rate": 0.002534764890146625, "loss": 7.6568, "step": 804000 }, { "epoch": 3.2753193079869516, "eval_MaskedAccuracy": 0.5067048275041367, "eval_loss": 1.618336796760559, "eval_runtime": 150.6131, "eval_samples_per_second": 421.451, "eval_steps_per_second": 1.647, "step": 804000 }, { "epoch": 3.275726686010333, "grad_norm": 4.1908769607543945, "learning_rate": 0.0025342688315945897, "loss": 7.6986, "step": 804100 }, { "epoch": 3.2761340640337147, "grad_norm": 8.904797554016113, "learning_rate": 0.002533772771885403, "loss": 7.6702, "step": 804200 }, { "epoch": 3.276541442057096, "grad_norm": 3.950209856033325, "learning_rate": 0.0025332767110386716, "loss": 7.679, "step": 804300 }, { "epoch": 3.2769488200804777, "grad_norm": 5.024580955505371, "learning_rate": 0.002532780649074014, "loss": 7.7098, "step": 804400 }, { "epoch": 3.277356198103859, "grad_norm": 10.476592063903809, "learning_rate": 0.002532284586011038, "loss": 7.6975, "step": 804500 }, { "epoch": 3.2777635761272403, "grad_norm": 5.804891586303711, "learning_rate": 0.0025317885218693564, "loss": 7.6933, "step": 804600 }, { "epoch": 3.278170954150622, "grad_norm": 4.553762912750244, "learning_rate": 0.0025312924566685824, "loss": 7.6826, "step": 804700 }, { "epoch": 3.2785783321740034, "grad_norm": 4.334897041320801, "learning_rate": 0.002530796390428322, "loss": 7.6761, "step": 804800 }, { "epoch": 3.278985710197385, "grad_norm": 3.2402517795562744, "learning_rate": 0.002530300323168194, "loss": 7.698, "step": 804900 }, { "epoch": 3.2793930882207665, "grad_norm": 5.841893672943115, "learning_rate": 0.002529804254907811, "loss": 7.6657, "step": 805000 }, { "epoch": 3.2793930882207665, "eval_MaskedAccuracy": 0.5069957634032773, "eval_loss": 1.621890664100647, "eval_runtime": 158.4293, "eval_samples_per_second": 400.658, "eval_steps_per_second": 1.565, "step": 805000 }, { "epoch": 3.279800466244148, "grad_norm": 3.275071382522583, "learning_rate": 0.002529308185666784, "loss": 7.6573, "step": 805100 }, { "epoch": 3.280207844267529, "grad_norm": 8.552112579345703, "learning_rate": 0.0025288121154647216, "loss": 7.672, "step": 805200 }, { "epoch": 3.2806152222909106, "grad_norm": 6.308941841125488, "learning_rate": 0.0025283160443212403, "loss": 7.6898, "step": 805300 }, { "epoch": 3.281022600314292, "grad_norm": 5.6279215812683105, "learning_rate": 0.0025278199722559526, "loss": 7.6964, "step": 805400 }, { "epoch": 3.2814299783376737, "grad_norm": 9.12334156036377, "learning_rate": 0.0025273238992884694, "loss": 7.6976, "step": 805500 }, { "epoch": 3.2818373563610552, "grad_norm": 4.141939640045166, "learning_rate": 0.002526827825438402, "loss": 7.6785, "step": 805600 }, { "epoch": 3.2822447343844363, "grad_norm": 4.386415481567383, "learning_rate": 0.0025263317507253627, "loss": 7.6561, "step": 805700 }, { "epoch": 3.282652112407818, "grad_norm": 12.528108596801758, "learning_rate": 0.0025258356751689673, "loss": 7.6917, "step": 805800 }, { "epoch": 3.2830594904311994, "grad_norm": 4.788679599761963, "learning_rate": 0.002525339598788825, "loss": 7.6804, "step": 805900 }, { "epoch": 3.283466868454581, "grad_norm": 4.137840747833252, "learning_rate": 0.0025248435216045483, "loss": 7.6733, "step": 806000 }, { "epoch": 3.283466868454581, "eval_MaskedAccuracy": 0.5072434898924681, "eval_loss": 1.6060513257980347, "eval_runtime": 168.0009, "eval_samples_per_second": 377.831, "eval_steps_per_second": 1.476, "step": 806000 }, { "epoch": 3.2838742464779624, "grad_norm": 4.594651699066162, "learning_rate": 0.0025243474436357507, "loss": 7.6882, "step": 806100 }, { "epoch": 3.284281624501344, "grad_norm": 3.3456647396087646, "learning_rate": 0.0025238513649020475, "loss": 7.6801, "step": 806200 }, { "epoch": 3.2846890025247255, "grad_norm": 7.094583034515381, "learning_rate": 0.002523355285423046, "loss": 7.6871, "step": 806300 }, { "epoch": 3.2850963805481066, "grad_norm": 15.75999927520752, "learning_rate": 0.0025228592052183654, "loss": 7.6768, "step": 806400 }, { "epoch": 3.285503758571488, "grad_norm": 6.918394565582275, "learning_rate": 0.002522363124307616, "loss": 7.6739, "step": 806500 }, { "epoch": 3.2859111365948697, "grad_norm": 11.002787590026855, "learning_rate": 0.002521867042710408, "loss": 7.6724, "step": 806600 }, { "epoch": 3.286318514618251, "grad_norm": 4.709118366241455, "learning_rate": 0.0025213709604463516, "loss": 7.6999, "step": 806700 }, { "epoch": 3.2867258926416327, "grad_norm": 5.836373329162598, "learning_rate": 0.0025208748775350665, "loss": 7.6598, "step": 806800 }, { "epoch": 3.2871332706650143, "grad_norm": 3.9379138946533203, "learning_rate": 0.0025203787939961633, "loss": 7.7014, "step": 806900 }, { "epoch": 3.2875406486883954, "grad_norm": 7.974735736846924, "learning_rate": 0.002519882709849256, "loss": 7.6726, "step": 807000 }, { "epoch": 3.2875406486883954, "eval_MaskedAccuracy": 0.5066687168272626, "eval_loss": 1.6104329824447632, "eval_runtime": 156.0427, "eval_samples_per_second": 406.786, "eval_steps_per_second": 1.589, "step": 807000 }, { "epoch": 3.287948026711777, "grad_norm": 9.561921119689941, "learning_rate": 0.002519386625113955, "loss": 7.6568, "step": 807100 }, { "epoch": 3.2883554047351584, "grad_norm": 2.4830827713012695, "learning_rate": 0.0025188905398098745, "loss": 7.6945, "step": 807200 }, { "epoch": 3.28876278275854, "grad_norm": 8.926530838012695, "learning_rate": 0.002518394453956622, "loss": 7.6425, "step": 807300 }, { "epoch": 3.2891701607819215, "grad_norm": 6.503739833831787, "learning_rate": 0.0025178983675738145, "loss": 7.6919, "step": 807400 }, { "epoch": 3.289577538805303, "grad_norm": 5.133355617523193, "learning_rate": 0.0025174022806810662, "loss": 7.6513, "step": 807500 }, { "epoch": 3.2899849168286845, "grad_norm": 3.24489164352417, "learning_rate": 0.002516906193297992, "loss": 7.6742, "step": 807600 }, { "epoch": 3.2903922948520656, "grad_norm": 4.072437286376953, "learning_rate": 0.0025164101054442035, "loss": 7.6676, "step": 807700 }, { "epoch": 3.290799672875447, "grad_norm": 4.990175724029541, "learning_rate": 0.0025159140171393114, "loss": 7.6899, "step": 807800 }, { "epoch": 3.2912070508988287, "grad_norm": 4.706521034240723, "learning_rate": 0.0025154179284029334, "loss": 7.6741, "step": 807900 }, { "epoch": 3.2916144289222102, "grad_norm": 3.965909719467163, "learning_rate": 0.002514921839254675, "loss": 7.6574, "step": 808000 }, { "epoch": 3.2916144289222102, "eval_MaskedAccuracy": 0.5066463820582249, "eval_loss": 1.6170014142990112, "eval_runtime": 155.9467, "eval_samples_per_second": 407.037, "eval_steps_per_second": 1.59, "step": 808000 }, { "epoch": 3.2920218069455918, "grad_norm": 4.205779075622559, "learning_rate": 0.002514425749714154, "loss": 7.6996, "step": 808100 }, { "epoch": 3.292429184968973, "grad_norm": 3.5748183727264404, "learning_rate": 0.002513929659800981, "loss": 7.6637, "step": 808200 }, { "epoch": 3.2928365629923544, "grad_norm": 3.411655902862549, "learning_rate": 0.002513433569534771, "loss": 7.667, "step": 808300 }, { "epoch": 3.293243941015736, "grad_norm": 5.77153205871582, "learning_rate": 0.002512937478935137, "loss": 7.6738, "step": 808400 }, { "epoch": 3.2936513190391175, "grad_norm": 8.544811248779297, "learning_rate": 0.0025124413880216936, "loss": 7.6931, "step": 808500 }, { "epoch": 3.294058697062499, "grad_norm": 9.314810752868652, "learning_rate": 0.0025119452968140493, "loss": 7.6602, "step": 808600 }, { "epoch": 3.2944660750858805, "grad_norm": 4.716907978057861, "learning_rate": 0.002511449205331821, "loss": 7.6472, "step": 808700 }, { "epoch": 3.294873453109262, "grad_norm": 6.9617743492126465, "learning_rate": 0.0025109531135946235, "loss": 7.6572, "step": 808800 }, { "epoch": 3.295280831132643, "grad_norm": 4.736232280731201, "learning_rate": 0.002510457021622067, "loss": 7.6661, "step": 808900 }, { "epoch": 3.2956882091560247, "grad_norm": 3.1326653957366943, "learning_rate": 0.002509960929433766, "loss": 7.6567, "step": 809000 }, { "epoch": 3.2956882091560247, "eval_MaskedAccuracy": 0.5064585278057439, "eval_loss": 1.6185801029205322, "eval_runtime": 154.763, "eval_samples_per_second": 410.15, "eval_steps_per_second": 1.602, "step": 809000 }, { "epoch": 3.296095587179406, "grad_norm": 4.165744304656982, "learning_rate": 0.002509464837049331, "loss": 7.7108, "step": 809100 }, { "epoch": 3.2965029652027877, "grad_norm": 5.649426460266113, "learning_rate": 0.0025089687444883745, "loss": 7.6691, "step": 809200 }, { "epoch": 3.2969103432261693, "grad_norm": 4.543972969055176, "learning_rate": 0.0025084726517705137, "loss": 7.6689, "step": 809300 }, { "epoch": 3.297317721249551, "grad_norm": 2.8531885147094727, "learning_rate": 0.002507976558915362, "loss": 7.6529, "step": 809400 }, { "epoch": 3.297725099272932, "grad_norm": 6.886300086975098, "learning_rate": 0.0025074804659425314, "loss": 7.6462, "step": 809500 }, { "epoch": 3.2981324772963134, "grad_norm": 3.5927135944366455, "learning_rate": 0.0025069843728716353, "loss": 7.666, "step": 809600 }, { "epoch": 3.298539855319695, "grad_norm": 9.71517276763916, "learning_rate": 0.0025064882797222895, "loss": 7.6824, "step": 809700 }, { "epoch": 3.2989472333430765, "grad_norm": 8.23390007019043, "learning_rate": 0.0025059921865141018, "loss": 7.677, "step": 809800 }, { "epoch": 3.299354611366458, "grad_norm": 2.9719061851501465, "learning_rate": 0.002505496093266686, "loss": 7.6958, "step": 809900 }, { "epoch": 3.2997619893898396, "grad_norm": 6.405104160308838, "learning_rate": 0.002504999999999657, "loss": 7.6959, "step": 810000 }, { "epoch": 3.2997619893898396, "eval_MaskedAccuracy": 0.5065891772807434, "eval_loss": 1.6186394691467285, "eval_runtime": 159.5458, "eval_samples_per_second": 397.854, "eval_steps_per_second": 1.554, "step": 810000 }, { "epoch": 3.300169367413221, "grad_norm": 5.008472442626953, "learning_rate": 0.0025045039067326273, "loss": 7.7108, "step": 810100 }, { "epoch": 3.300576745436602, "grad_norm": 6.079156875610352, "learning_rate": 0.002504007813485209, "loss": 7.6702, "step": 810200 }, { "epoch": 3.3009841234599837, "grad_norm": 3.707399845123291, "learning_rate": 0.0025035117202770163, "loss": 7.6839, "step": 810300 }, { "epoch": 3.3013915014833652, "grad_norm": 4.559576988220215, "learning_rate": 0.002503015627127667, "loss": 7.6854, "step": 810400 }, { "epoch": 3.3017988795067468, "grad_norm": 6.961183071136475, "learning_rate": 0.0025025195340567714, "loss": 7.681, "step": 810500 }, { "epoch": 3.3022062575301283, "grad_norm": 4.960519313812256, "learning_rate": 0.002502023441083942, "loss": 7.6743, "step": 810600 }, { "epoch": 3.3026136355535094, "grad_norm": 4.832784175872803, "learning_rate": 0.0025015273482287895, "loss": 7.6567, "step": 810700 }, { "epoch": 3.303021013576891, "grad_norm": 3.261326313018799, "learning_rate": 0.0025010312555109305, "loss": 7.7116, "step": 810800 }, { "epoch": 3.3034283916002725, "grad_norm": 8.754937171936035, "learning_rate": 0.002500535162949978, "loss": 7.6876, "step": 810900 }, { "epoch": 3.303835769623654, "grad_norm": 4.600485324859619, "learning_rate": 0.0025000390705655438, "loss": 7.674, "step": 811000 }, { "epoch": 3.303835769623654, "eval_MaskedAccuracy": 0.5071697466006893, "eval_loss": 1.604413628578186, "eval_runtime": 156.5897, "eval_samples_per_second": 405.365, "eval_steps_per_second": 1.584, "step": 811000 }, { "epoch": 3.3042431476470355, "grad_norm": 10.910696983337402, "learning_rate": 0.002499542978377241, "loss": 7.6794, "step": 811100 }, { "epoch": 3.304650525670417, "grad_norm": 5.202807426452637, "learning_rate": 0.0024990468864046853, "loss": 7.6865, "step": 811200 }, { "epoch": 3.3050579036937986, "grad_norm": 2.4385552406311035, "learning_rate": 0.002498550794667481, "loss": 7.6832, "step": 811300 }, { "epoch": 3.3054652817171797, "grad_norm": 3.8294498920440674, "learning_rate": 0.002498054703185255, "loss": 7.6796, "step": 811400 }, { "epoch": 3.305872659740561, "grad_norm": 4.572168350219727, "learning_rate": 0.0024975586119776122, "loss": 7.6867, "step": 811500 }, { "epoch": 3.3062800377639427, "grad_norm": 4.635795593261719, "learning_rate": 0.0024970625210641666, "loss": 7.6873, "step": 811600 }, { "epoch": 3.3066874157873243, "grad_norm": 4.829721450805664, "learning_rate": 0.0024965664304645342, "loss": 7.6672, "step": 811700 }, { "epoch": 3.307094793810706, "grad_norm": 5.453036308288574, "learning_rate": 0.0024960703401983248, "loss": 7.6808, "step": 811800 }, { "epoch": 3.3075021718340873, "grad_norm": 10.133064270019531, "learning_rate": 0.0024955742502851524, "loss": 7.6761, "step": 811900 }, { "epoch": 3.3079095498574684, "grad_norm": 4.497378826141357, "learning_rate": 0.0024950781607446302, "loss": 7.6589, "step": 812000 }, { "epoch": 3.3079095498574684, "eval_MaskedAccuracy": 0.5067306555130758, "eval_loss": 1.6124553680419922, "eval_runtime": 157.0336, "eval_samples_per_second": 404.219, "eval_steps_per_second": 1.579, "step": 812000 }, { "epoch": 3.30831692788085, "grad_norm": 4.651226997375488, "learning_rate": 0.0024945820715963694, "loss": 7.6896, "step": 812100 }, { "epoch": 3.3087243059042315, "grad_norm": 5.181884765625, "learning_rate": 0.002494085982859984, "loss": 7.6753, "step": 812200 }, { "epoch": 3.309131683927613, "grad_norm": 6.102396011352539, "learning_rate": 0.002493589894555091, "loss": 7.6916, "step": 812300 }, { "epoch": 3.3095390619509946, "grad_norm": 5.432448387145996, "learning_rate": 0.0024930938067013004, "loss": 7.7008, "step": 812400 }, { "epoch": 3.309946439974376, "grad_norm": 5.259257793426514, "learning_rate": 0.0024925977193182235, "loss": 7.6815, "step": 812500 }, { "epoch": 3.3103538179977576, "grad_norm": 4.742974758148193, "learning_rate": 0.0024921016324254788, "loss": 7.6563, "step": 812600 }, { "epoch": 3.3107611960211387, "grad_norm": 3.7323997020721436, "learning_rate": 0.002491605546042675, "loss": 7.6616, "step": 812700 }, { "epoch": 3.3111685740445203, "grad_norm": 4.109499454498291, "learning_rate": 0.0024911094601894235, "loss": 7.6636, "step": 812800 }, { "epoch": 3.311575952067902, "grad_norm": 4.057988166809082, "learning_rate": 0.0024906133748853407, "loss": 7.6849, "step": 812900 }, { "epoch": 3.3119833300912833, "grad_norm": 5.340011119842529, "learning_rate": 0.0024901172901500407, "loss": 7.6987, "step": 813000 }, { "epoch": 3.3119833300912833, "eval_MaskedAccuracy": 0.5066784227780029, "eval_loss": 1.6174936294555664, "eval_runtime": 157.179, "eval_samples_per_second": 403.845, "eval_steps_per_second": 1.578, "step": 813000 }, { "epoch": 3.312390708114665, "grad_norm": 7.912527084350586, "learning_rate": 0.0024896212060031325, "loss": 7.6837, "step": 813100 }, { "epoch": 3.312798086138046, "grad_norm": 6.485486030578613, "learning_rate": 0.0024891251224642284, "loss": 7.6961, "step": 813200 }, { "epoch": 3.3132054641614275, "grad_norm": 5.897899627685547, "learning_rate": 0.002488629039552944, "loss": 7.6642, "step": 813300 }, { "epoch": 3.313612842184809, "grad_norm": 3.0667595863342285, "learning_rate": 0.002488132957288891, "loss": 7.6893, "step": 813400 }, { "epoch": 3.3140202202081905, "grad_norm": 3.066885232925415, "learning_rate": 0.002487636875691679, "loss": 7.6936, "step": 813500 }, { "epoch": 3.314427598231572, "grad_norm": 4.320520877838135, "learning_rate": 0.00248714079478093, "loss": 7.6324, "step": 813600 }, { "epoch": 3.3148349762549536, "grad_norm": 9.33206844329834, "learning_rate": 0.0024866447145762472, "loss": 7.6475, "step": 813700 }, { "epoch": 3.315242354278335, "grad_norm": 5.835254192352295, "learning_rate": 0.002486148635097248, "loss": 7.6736, "step": 813800 }, { "epoch": 3.3156497323017162, "grad_norm": 5.379861354827881, "learning_rate": 0.0024856525563635465, "loss": 7.6713, "step": 813900 }, { "epoch": 3.3160571103250978, "grad_norm": 4.814055919647217, "learning_rate": 0.0024851564783947497, "loss": 7.6467, "step": 814000 }, { "epoch": 3.3160571103250978, "eval_MaskedAccuracy": 0.5070498232020509, "eval_loss": 1.6167371273040771, "eval_runtime": 156.0631, "eval_samples_per_second": 406.733, "eval_steps_per_second": 1.589, "step": 814000 }, { "epoch": 3.3164644883484793, "grad_norm": 7.058439254760742, "learning_rate": 0.0024846604012104746, "loss": 7.6669, "step": 814100 }, { "epoch": 3.316871866371861, "grad_norm": 9.72732925415039, "learning_rate": 0.002484164324830333, "loss": 7.6409, "step": 814200 }, { "epoch": 3.3172792443952424, "grad_norm": 7.237351417541504, "learning_rate": 0.0024836682492739353, "loss": 7.6698, "step": 814300 }, { "epoch": 3.317686622418624, "grad_norm": 4.3387064933776855, "learning_rate": 0.002483172174560898, "loss": 7.6407, "step": 814400 }, { "epoch": 3.318094000442005, "grad_norm": 5.568198204040527, "learning_rate": 0.0024826761007108295, "loss": 7.6651, "step": 814500 }, { "epoch": 3.3185013784653865, "grad_norm": 11.06213665008545, "learning_rate": 0.002482180027743345, "loss": 7.6739, "step": 814600 }, { "epoch": 3.318908756488768, "grad_norm": 3.0172247886657715, "learning_rate": 0.0024816839556780546, "loss": 7.67, "step": 814700 }, { "epoch": 3.3193161345121496, "grad_norm": 5.155795574188232, "learning_rate": 0.002481187884534573, "loss": 7.6611, "step": 814800 }, { "epoch": 3.319723512535531, "grad_norm": 8.746994018554688, "learning_rate": 0.0024806918143325143, "loss": 7.6505, "step": 814900 }, { "epoch": 3.3201308905589126, "grad_norm": 8.797126770019531, "learning_rate": 0.0024801957450914895, "loss": 7.6684, "step": 815000 }, { "epoch": 3.3201308905589126, "eval_MaskedAccuracy": 0.506765295166268, "eval_loss": 1.6302165985107422, "eval_runtime": 158.1441, "eval_samples_per_second": 401.381, "eval_steps_per_second": 1.568, "step": 815000 }, { "epoch": 3.320538268582294, "grad_norm": 4.325685024261475, "learning_rate": 0.0024796996768311048, "loss": 7.6537, "step": 815100 }, { "epoch": 3.3209456466056753, "grad_norm": 3.9383885860443115, "learning_rate": 0.0024792036095709776, "loss": 7.6676, "step": 815200 }, { "epoch": 3.321353024629057, "grad_norm": 4.64644193649292, "learning_rate": 0.002478707543330718, "loss": 7.6647, "step": 815300 }, { "epoch": 3.3217604026524383, "grad_norm": 4.420475482940674, "learning_rate": 0.002478211478129945, "loss": 7.6666, "step": 815400 }, { "epoch": 3.32216778067582, "grad_norm": 16.778709411621094, "learning_rate": 0.0024777154139882635, "loss": 7.6451, "step": 815500 }, { "epoch": 3.3225751586992014, "grad_norm": 3.701763153076172, "learning_rate": 0.0024772193509252837, "loss": 7.6381, "step": 815600 }, { "epoch": 3.3229825367225825, "grad_norm": 8.478386878967285, "learning_rate": 0.002476723288960625, "loss": 7.6689, "step": 815700 }, { "epoch": 3.323389914745964, "grad_norm": 8.00002670288086, "learning_rate": 0.002476227228113897, "loss": 7.6785, "step": 815800 }, { "epoch": 3.3237972927693455, "grad_norm": 6.67539119720459, "learning_rate": 0.0024757311684047104, "loss": 7.6529, "step": 815900 }, { "epoch": 3.324204670792727, "grad_norm": 9.989371299743652, "learning_rate": 0.002475235109852678, "loss": 7.6829, "step": 816000 }, { "epoch": 3.324204670792727, "eval_MaskedAccuracy": 0.506909330141037, "eval_loss": 1.6146438121795654, "eval_runtime": 160.3621, "eval_samples_per_second": 395.829, "eval_steps_per_second": 1.547, "step": 816000 }, { "epoch": 3.3246120488161086, "grad_norm": 3.530759811401367, "learning_rate": 0.0024747390524774084, "loss": 7.6373, "step": 816100 }, { "epoch": 3.32501942683949, "grad_norm": 4.5824198722839355, "learning_rate": 0.002474242996298517, "loss": 7.6693, "step": 816200 }, { "epoch": 3.3254268048628717, "grad_norm": 5.958522319793701, "learning_rate": 0.0024737469413356147, "loss": 7.6494, "step": 816300 }, { "epoch": 3.3258341828862528, "grad_norm": 13.90539836883545, "learning_rate": 0.002473250887608314, "loss": 7.645, "step": 816400 }, { "epoch": 3.3262415609096343, "grad_norm": 3.1944069862365723, "learning_rate": 0.0024727548351362225, "loss": 7.6785, "step": 816500 }, { "epoch": 3.326648938933016, "grad_norm": 7.595718860626221, "learning_rate": 0.00247225878393896, "loss": 7.6689, "step": 816600 }, { "epoch": 3.3270563169563974, "grad_norm": 8.94078540802002, "learning_rate": 0.002471762734036131, "loss": 7.6623, "step": 816700 }, { "epoch": 3.327463694979779, "grad_norm": 4.060114860534668, "learning_rate": 0.0024712666854473475, "loss": 7.6815, "step": 816800 }, { "epoch": 3.3278710730031604, "grad_norm": 5.923506259918213, "learning_rate": 0.0024707706381922228, "loss": 7.6493, "step": 816900 }, { "epoch": 3.3282784510265415, "grad_norm": 10.673074722290039, "learning_rate": 0.0024702745922903693, "loss": 7.6818, "step": 817000 }, { "epoch": 3.3282784510265415, "eval_MaskedAccuracy": 0.506995028131734, "eval_loss": 1.6104918718338013, "eval_runtime": 161.0248, "eval_samples_per_second": 394.2, "eval_steps_per_second": 1.54, "step": 817000 }, { "epoch": 3.328685829049923, "grad_norm": 5.100268363952637, "learning_rate": 0.002469778547761395, "loss": 7.6891, "step": 817100 }, { "epoch": 3.3290932070733046, "grad_norm": 5.348010540008545, "learning_rate": 0.002469282504624912, "loss": 7.6533, "step": 817200 }, { "epoch": 3.329500585096686, "grad_norm": 8.126382827758789, "learning_rate": 0.002468786462900533, "loss": 7.6506, "step": 817300 }, { "epoch": 3.3299079631200676, "grad_norm": 5.103185653686523, "learning_rate": 0.002468290422607871, "loss": 7.6558, "step": 817400 }, { "epoch": 3.330315341143449, "grad_norm": 9.422660827636719, "learning_rate": 0.0024677943837665356, "loss": 7.6619, "step": 817500 }, { "epoch": 3.3307227191668307, "grad_norm": 6.416852951049805, "learning_rate": 0.0024672983463961346, "loss": 7.6366, "step": 817600 }, { "epoch": 3.331130097190212, "grad_norm": 3.414886951446533, "learning_rate": 0.0024668023105162812, "loss": 7.6791, "step": 817700 }, { "epoch": 3.3315374752135933, "grad_norm": 5.236874580383301, "learning_rate": 0.0024663062761465884, "loss": 7.6712, "step": 817800 }, { "epoch": 3.331944853236975, "grad_norm": 4.077887058258057, "learning_rate": 0.002465810243306666, "loss": 7.6438, "step": 817900 }, { "epoch": 3.3323522312603564, "grad_norm": 8.175515174865723, "learning_rate": 0.0024653142120161235, "loss": 7.6625, "step": 818000 }, { "epoch": 3.3323522312603564, "eval_MaskedAccuracy": 0.5075873129937887, "eval_loss": 1.612989902496338, "eval_runtime": 160.8331, "eval_samples_per_second": 394.67, "eval_steps_per_second": 1.542, "step": 818000 }, { "epoch": 3.332759609283738, "grad_norm": 9.09115219116211, "learning_rate": 0.002464818182294573, "loss": 7.6806, "step": 818100 }, { "epoch": 3.333166987307119, "grad_norm": 9.988933563232422, "learning_rate": 0.0024643221541616215, "loss": 7.6469, "step": 818200 }, { "epoch": 3.3335743653305006, "grad_norm": 7.367795467376709, "learning_rate": 0.002463826127636883, "loss": 7.6759, "step": 818300 }, { "epoch": 3.333981743353882, "grad_norm": 7.297520637512207, "learning_rate": 0.0024633301027399703, "loss": 7.6325, "step": 818400 }, { "epoch": 3.3343891213772636, "grad_norm": 4.209428787231445, "learning_rate": 0.0024628340794904894, "loss": 7.6408, "step": 818500 }, { "epoch": 3.334796499400645, "grad_norm": 7.046091556549072, "learning_rate": 0.0024623380579080535, "loss": 7.6797, "step": 818600 }, { "epoch": 3.3352038774240267, "grad_norm": 7.12550163269043, "learning_rate": 0.0024618420380122736, "loss": 7.6509, "step": 818700 }, { "epoch": 3.335611255447408, "grad_norm": 4.62762975692749, "learning_rate": 0.002461346019822758, "loss": 7.6946, "step": 818800 }, { "epoch": 3.3360186334707893, "grad_norm": 7.47962760925293, "learning_rate": 0.002460850003359117, "loss": 7.6563, "step": 818900 }, { "epoch": 3.336426011494171, "grad_norm": 9.683699607849121, "learning_rate": 0.0024603539886409614, "loss": 7.6782, "step": 819000 }, { "epoch": 3.336426011494171, "eval_MaskedAccuracy": 0.5057671322914739, "eval_loss": 1.620146632194519, "eval_runtime": 161.8696, "eval_samples_per_second": 392.143, "eval_steps_per_second": 1.532, "step": 819000 }, { "epoch": 3.3368333895175524, "grad_norm": 3.1025774478912354, "learning_rate": 0.0024598579756879035, "loss": 7.6678, "step": 819100 }, { "epoch": 3.337240767540934, "grad_norm": 5.008169174194336, "learning_rate": 0.002459361964519551, "loss": 7.6418, "step": 819200 }, { "epoch": 3.3376481455643154, "grad_norm": 4.815037727355957, "learning_rate": 0.0024588659551555126, "loss": 7.6788, "step": 819300 }, { "epoch": 3.3380555235876965, "grad_norm": 5.289239406585693, "learning_rate": 0.002458369947615401, "loss": 7.6675, "step": 819400 }, { "epoch": 3.338462901611078, "grad_norm": 9.67807674407959, "learning_rate": 0.0024578739419188252, "loss": 7.6561, "step": 819500 }, { "epoch": 3.3388702796344596, "grad_norm": 6.424745082855225, "learning_rate": 0.0024573779380853934, "loss": 7.6818, "step": 819600 }, { "epoch": 3.339277657657841, "grad_norm": 10.432646751403809, "learning_rate": 0.002456881936134717, "loss": 7.6853, "step": 819700 }, { "epoch": 3.3396850356812227, "grad_norm": 3.7950234413146973, "learning_rate": 0.0024563859360864047, "loss": 7.6897, "step": 819800 }, { "epoch": 3.340092413704604, "grad_norm": 8.507339477539062, "learning_rate": 0.0024558899379600657, "loss": 7.6618, "step": 819900 }, { "epoch": 3.3404997917279857, "grad_norm": 5.6887617111206055, "learning_rate": 0.0024553939417753094, "loss": 7.6526, "step": 820000 }, { "epoch": 3.3404997917279857, "eval_MaskedAccuracy": 0.5076526077855775, "eval_loss": 1.6125506162643433, "eval_runtime": 157.1165, "eval_samples_per_second": 404.006, "eval_steps_per_second": 1.578, "step": 820000 }, { "epoch": 3.3409071697513673, "grad_norm": 4.954686164855957, "learning_rate": 0.0024548979475517456, "loss": 7.6473, "step": 820100 }, { "epoch": 3.3413145477747483, "grad_norm": 3.6541824340820312, "learning_rate": 0.0024544019553089853, "loss": 7.6493, "step": 820200 }, { "epoch": 3.34172192579813, "grad_norm": 8.365074157714844, "learning_rate": 0.0024539059650666413, "loss": 7.671, "step": 820300 }, { "epoch": 3.3421293038215114, "grad_norm": 4.78241491317749, "learning_rate": 0.0024534099768443157, "loss": 7.6516, "step": 820400 }, { "epoch": 3.342536681844893, "grad_norm": 5.602843284606934, "learning_rate": 0.0024529139906616167, "loss": 7.667, "step": 820500 }, { "epoch": 3.3429440598682745, "grad_norm": 9.948620796203613, "learning_rate": 0.0024524180065381576, "loss": 7.6627, "step": 820600 }, { "epoch": 3.3433514378916556, "grad_norm": 13.063933372497559, "learning_rate": 0.0024519220244935444, "loss": 7.669, "step": 820700 }, { "epoch": 3.343758815915037, "grad_norm": 9.975808143615723, "learning_rate": 0.0024514260445473867, "loss": 7.6707, "step": 820800 }, { "epoch": 3.3441661939384186, "grad_norm": 3.7139270305633545, "learning_rate": 0.0024509300667192977, "loss": 7.6485, "step": 820900 }, { "epoch": 3.3445735719618, "grad_norm": 7.64076566696167, "learning_rate": 0.0024504340910288832, "loss": 7.6765, "step": 821000 }, { "epoch": 3.3445735719618, "eval_MaskedAccuracy": 0.5068409945564115, "eval_loss": 1.608394980430603, "eval_runtime": 154.527, "eval_samples_per_second": 410.776, "eval_steps_per_second": 1.605, "step": 821000 }, { "epoch": 3.3449809499851817, "grad_norm": 4.352261066436768, "learning_rate": 0.0024499381174957462, "loss": 7.6776, "step": 821100 }, { "epoch": 3.3453883280085632, "grad_norm": 3.314389228820801, "learning_rate": 0.0024494421461395024, "loss": 7.6637, "step": 821200 }, { "epoch": 3.3457957060319448, "grad_norm": 4.812958240509033, "learning_rate": 0.0024489461769797573, "loss": 7.7008, "step": 821300 }, { "epoch": 3.346203084055326, "grad_norm": 9.614579200744629, "learning_rate": 0.0024484502100361166, "loss": 7.6563, "step": 821400 }, { "epoch": 3.3466104620787074, "grad_norm": 9.866796493530273, "learning_rate": 0.002447954245328194, "loss": 7.6485, "step": 821500 }, { "epoch": 3.347017840102089, "grad_norm": 6.312223434448242, "learning_rate": 0.0024474582828755984, "loss": 7.6668, "step": 821600 }, { "epoch": 3.3474252181254704, "grad_norm": 3.8034420013427734, "learning_rate": 0.002446962322697934, "loss": 7.6654, "step": 821700 }, { "epoch": 3.347832596148852, "grad_norm": 5.688786506652832, "learning_rate": 0.002446466364814806, "loss": 7.6635, "step": 821800 }, { "epoch": 3.348239974172233, "grad_norm": 4.1226677894592285, "learning_rate": 0.0024459704092458265, "loss": 7.6707, "step": 821900 }, { "epoch": 3.3486473521956146, "grad_norm": 5.9946417808532715, "learning_rate": 0.002445474456010603, "loss": 7.679, "step": 822000 }, { "epoch": 3.3486473521956146, "eval_MaskedAccuracy": 0.5075059830427925, "eval_loss": 1.6148271560668945, "eval_runtime": 156.096, "eval_samples_per_second": 406.647, "eval_steps_per_second": 1.589, "step": 822000 }, { "epoch": 3.349054730218996, "grad_norm": 5.2468390464782715, "learning_rate": 0.00244497850512874, "loss": 7.6851, "step": 822100 }, { "epoch": 3.3494621082423777, "grad_norm": 10.354859352111816, "learning_rate": 0.0024444825566198487, "loss": 7.6781, "step": 822200 }, { "epoch": 3.349869486265759, "grad_norm": 5.5955424308776855, "learning_rate": 0.002443986610503536, "loss": 7.6694, "step": 822300 }, { "epoch": 3.3502768642891407, "grad_norm": 6.757050514221191, "learning_rate": 0.0024434906667994095, "loss": 7.6669, "step": 822400 }, { "epoch": 3.3506842423125223, "grad_norm": 4.649733066558838, "learning_rate": 0.0024429947255270763, "loss": 7.6732, "step": 822500 }, { "epoch": 3.351091620335904, "grad_norm": 5.6255998611450195, "learning_rate": 0.00244249878670614, "loss": 7.6712, "step": 822600 }, { "epoch": 3.351498998359285, "grad_norm": 3.62913179397583, "learning_rate": 0.002442002850356214, "loss": 7.6585, "step": 822700 }, { "epoch": 3.3519063763826664, "grad_norm": 4.7510833740234375, "learning_rate": 0.0024415069164969005, "loss": 7.6666, "step": 822800 }, { "epoch": 3.352313754406048, "grad_norm": 8.73514175415039, "learning_rate": 0.002441010985147808, "loss": 7.6782, "step": 822900 }, { "epoch": 3.3527211324294295, "grad_norm": 8.673792839050293, "learning_rate": 0.002440515056328547, "loss": 7.6822, "step": 823000 }, { "epoch": 3.3527211324294295, "eval_MaskedAccuracy": 0.5070311418367376, "eval_loss": 1.6214524507522583, "eval_runtime": 167.6911, "eval_samples_per_second": 378.529, "eval_steps_per_second": 1.479, "step": 823000 }, { "epoch": 3.353128510452811, "grad_norm": 9.47994327545166, "learning_rate": 0.0024400191300587167, "loss": 7.6653, "step": 823100 }, { "epoch": 3.353535888476192, "grad_norm": 3.8950607776641846, "learning_rate": 0.002439523206357927, "loss": 7.6919, "step": 823200 }, { "epoch": 3.3539432664995736, "grad_norm": 8.29250717163086, "learning_rate": 0.0024390272852457874, "loss": 7.6693, "step": 823300 }, { "epoch": 3.354350644522955, "grad_norm": 3.7033770084381104, "learning_rate": 0.002438531366741902, "loss": 7.6653, "step": 823400 }, { "epoch": 3.3547580225463367, "grad_norm": 8.865307807922363, "learning_rate": 0.0024380354508658783, "loss": 7.6741, "step": 823500 }, { "epoch": 3.3551654005697182, "grad_norm": 2.245955467224121, "learning_rate": 0.0024375395376373195, "loss": 7.6803, "step": 823600 }, { "epoch": 3.3555727785930998, "grad_norm": 3.0160439014434814, "learning_rate": 0.002437043627075836, "loss": 7.6555, "step": 823700 }, { "epoch": 3.3559801566164813, "grad_norm": 6.213773727416992, "learning_rate": 0.00243654771920103, "loss": 7.6513, "step": 823800 }, { "epoch": 3.3563875346398624, "grad_norm": 5.2603983879089355, "learning_rate": 0.0024360518140325083, "loss": 7.6741, "step": 823900 }, { "epoch": 3.356794912663244, "grad_norm": 5.670897483825684, "learning_rate": 0.0024355559115898763, "loss": 7.6521, "step": 824000 }, { "epoch": 3.356794912663244, "eval_MaskedAccuracy": 0.5072930216411269, "eval_loss": 1.6101815700531006, "eval_runtime": 151.6168, "eval_samples_per_second": 418.661, "eval_steps_per_second": 1.636, "step": 824000 }, { "epoch": 3.3572022906866255, "grad_norm": 7.838451862335205, "learning_rate": 0.0024350600118927417, "loss": 7.6388, "step": 824100 }, { "epoch": 3.357609668710007, "grad_norm": 8.521012306213379, "learning_rate": 0.0024345641149607095, "loss": 7.6269, "step": 824200 }, { "epoch": 3.3580170467333885, "grad_norm": 4.4888081550598145, "learning_rate": 0.0024340682208133847, "loss": 7.6575, "step": 824300 }, { "epoch": 3.3584244247567696, "grad_norm": 12.87949275970459, "learning_rate": 0.0024335723294703735, "loss": 7.6461, "step": 824400 }, { "epoch": 3.358831802780151, "grad_norm": 8.240680694580078, "learning_rate": 0.0024330764409512783, "loss": 7.6768, "step": 824500 }, { "epoch": 3.3592391808035327, "grad_norm": 6.6971917152404785, "learning_rate": 0.0024325805552757053, "loss": 7.6795, "step": 824600 }, { "epoch": 3.359646558826914, "grad_norm": 5.107844829559326, "learning_rate": 0.0024320846724632627, "loss": 7.6604, "step": 824700 }, { "epoch": 3.3600539368502957, "grad_norm": 3.7403807640075684, "learning_rate": 0.0024315887925335537, "loss": 7.6643, "step": 824800 }, { "epoch": 3.3604613148736773, "grad_norm": 3.1838297843933105, "learning_rate": 0.0024310929155061793, "loss": 7.6528, "step": 824900 }, { "epoch": 3.360868692897059, "grad_norm": 4.040480613708496, "learning_rate": 0.002430597041400746, "loss": 7.636, "step": 825000 }, { "epoch": 3.360868692897059, "eval_MaskedAccuracy": 0.5077692151325165, "eval_loss": 1.606154441833496, "eval_runtime": 167.0439, "eval_samples_per_second": 379.996, "eval_steps_per_second": 1.485, "step": 825000 }, { "epoch": 3.3612760709204403, "grad_norm": 5.605712413787842, "learning_rate": 0.0024301011702368607, "loss": 7.6593, "step": 825100 }, { "epoch": 3.3616834489438214, "grad_norm": 6.239404678344727, "learning_rate": 0.002429605302034129, "loss": 7.6526, "step": 825200 }, { "epoch": 3.362090826967203, "grad_norm": 4.008897304534912, "learning_rate": 0.002429109436812153, "loss": 7.6671, "step": 825300 }, { "epoch": 3.3624982049905845, "grad_norm": 5.22152853012085, "learning_rate": 0.0024286135745905397, "loss": 7.6774, "step": 825400 }, { "epoch": 3.362905583013966, "grad_norm": 5.3362555503845215, "learning_rate": 0.0024281177153888866, "loss": 7.671, "step": 825500 }, { "epoch": 3.3633129610373476, "grad_norm": 3.5259885787963867, "learning_rate": 0.0024276218592268, "loss": 7.6432, "step": 825600 }, { "epoch": 3.3637203390607286, "grad_norm": 4.645276069641113, "learning_rate": 0.0024271260061238874, "loss": 7.6706, "step": 825700 }, { "epoch": 3.36412771708411, "grad_norm": 5.165298938751221, "learning_rate": 0.002426630156099749, "loss": 7.6634, "step": 825800 }, { "epoch": 3.3645350951074917, "grad_norm": 3.0694339275360107, "learning_rate": 0.0024261343091739906, "loss": 7.6427, "step": 825900 }, { "epoch": 3.3649424731308732, "grad_norm": 6.479582786560059, "learning_rate": 0.002425638465366213, "loss": 7.6484, "step": 826000 }, { "epoch": 3.3649424731308732, "eval_MaskedAccuracy": 0.507014539951461, "eval_loss": 1.6130748987197876, "eval_runtime": 162.9599, "eval_samples_per_second": 389.519, "eval_steps_per_second": 1.522, "step": 826000 }, { "epoch": 3.365349851154255, "grad_norm": 6.702629089355469, "learning_rate": 0.002425142624696022, "loss": 7.657, "step": 826100 }, { "epoch": 3.3657572291776363, "grad_norm": 4.354644775390625, "learning_rate": 0.002424646787183022, "loss": 7.6614, "step": 826200 }, { "epoch": 3.366164607201018, "grad_norm": 6.56011962890625, "learning_rate": 0.0024241509528468127, "loss": 7.6674, "step": 826300 }, { "epoch": 3.366571985224399, "grad_norm": 7.575295448303223, "learning_rate": 0.002423655121707001, "loss": 7.6662, "step": 826400 }, { "epoch": 3.3669793632477805, "grad_norm": 5.027916431427002, "learning_rate": 0.0024231592937831847, "loss": 7.6646, "step": 826500 }, { "epoch": 3.367386741271162, "grad_norm": 2.8268332481384277, "learning_rate": 0.00242266346909497, "loss": 7.6551, "step": 826600 }, { "epoch": 3.3677941192945435, "grad_norm": 12.005270957946777, "learning_rate": 0.0024221676476619616, "loss": 7.6723, "step": 826700 }, { "epoch": 3.368201497317925, "grad_norm": 4.354057312011719, "learning_rate": 0.0024216718295037546, "loss": 7.6652, "step": 826800 }, { "epoch": 3.368608875341306, "grad_norm": 4.566364288330078, "learning_rate": 0.0024211760146399574, "loss": 7.657, "step": 826900 }, { "epoch": 3.3690162533646877, "grad_norm": 3.726116180419922, "learning_rate": 0.0024206802030901707, "loss": 7.6665, "step": 827000 }, { "epoch": 3.3690162533646877, "eval_MaskedAccuracy": 0.5072880942238474, "eval_loss": 1.610743522644043, "eval_runtime": 156.0728, "eval_samples_per_second": 406.708, "eval_steps_per_second": 1.589, "step": 827000 }, { "epoch": 3.369423631388069, "grad_norm": 3.6981687545776367, "learning_rate": 0.0024201843948739974, "loss": 7.6521, "step": 827100 }, { "epoch": 3.3698310094114508, "grad_norm": 2.9989585876464844, "learning_rate": 0.0024196885900110397, "loss": 7.6803, "step": 827200 }, { "epoch": 3.3702383874348323, "grad_norm": 10.081335067749023, "learning_rate": 0.0024191927885208993, "loss": 7.6505, "step": 827300 }, { "epoch": 3.370645765458214, "grad_norm": 10.75118637084961, "learning_rate": 0.0024186969904231755, "loss": 7.6719, "step": 827400 }, { "epoch": 3.3710531434815953, "grad_norm": 4.597393035888672, "learning_rate": 0.002418201195737471, "loss": 7.6763, "step": 827500 }, { "epoch": 3.3714605215049764, "grad_norm": 7.283711910247803, "learning_rate": 0.0024177054044833856, "loss": 7.6566, "step": 827600 }, { "epoch": 3.371867899528358, "grad_norm": 4.523886203765869, "learning_rate": 0.002417209616680525, "loss": 7.6649, "step": 827700 }, { "epoch": 3.3722752775517395, "grad_norm": 4.122554779052734, "learning_rate": 0.002416713832348482, "loss": 7.6693, "step": 827800 }, { "epoch": 3.372682655575121, "grad_norm": 5.343982219696045, "learning_rate": 0.0024162180515068655, "loss": 7.6575, "step": 827900 }, { "epoch": 3.3730900335985026, "grad_norm": 6.65548849105835, "learning_rate": 0.0024157222741752757, "loss": 7.6535, "step": 828000 }, { "epoch": 3.3730900335985026, "eval_MaskedAccuracy": 0.5065662796216804, "eval_loss": 1.6128860712051392, "eval_runtime": 156.5143, "eval_samples_per_second": 405.561, "eval_steps_per_second": 1.585, "step": 828000 }, { "epoch": 3.373497411621884, "grad_norm": 3.8162009716033936, "learning_rate": 0.002415226500373313, "loss": 7.6711, "step": 828100 }, { "epoch": 3.373904789645265, "grad_norm": 12.830845832824707, "learning_rate": 0.0024147307301205755, "loss": 7.6657, "step": 828200 }, { "epoch": 3.3743121676686467, "grad_norm": 8.961577415466309, "learning_rate": 0.002414234963436664, "loss": 7.6956, "step": 828300 }, { "epoch": 3.3747195456920283, "grad_norm": 6.989332675933838, "learning_rate": 0.0024137392003411814, "loss": 7.6912, "step": 828400 }, { "epoch": 3.37512692371541, "grad_norm": 4.718576431274414, "learning_rate": 0.002413243440853725, "loss": 7.6849, "step": 828500 }, { "epoch": 3.3755343017387913, "grad_norm": 5.740972518920898, "learning_rate": 0.002412747684993897, "loss": 7.6422, "step": 828600 }, { "epoch": 3.375941679762173, "grad_norm": 8.048383712768555, "learning_rate": 0.0024122519327812956, "loss": 7.6806, "step": 828700 }, { "epoch": 3.3763490577855544, "grad_norm": 7.133415699005127, "learning_rate": 0.0024117561842355215, "loss": 7.6453, "step": 828800 }, { "epoch": 3.3767564358089355, "grad_norm": 4.826624393463135, "learning_rate": 0.002411260439376174, "loss": 7.6548, "step": 828900 }, { "epoch": 3.377163813832317, "grad_norm": 10.053572654724121, "learning_rate": 0.002410764698222855, "loss": 7.6543, "step": 829000 }, { "epoch": 3.377163813832317, "eval_MaskedAccuracy": 0.5070204764525635, "eval_loss": 1.6115893125534058, "eval_runtime": 154.8877, "eval_samples_per_second": 409.819, "eval_steps_per_second": 1.601, "step": 829000 }, { "epoch": 3.3775711918556985, "grad_norm": 4.761382579803467, "learning_rate": 0.0024102689607951597, "loss": 7.644, "step": 829100 }, { "epoch": 3.37797856987908, "grad_norm": 9.084874153137207, "learning_rate": 0.00240977322711269, "loss": 7.6406, "step": 829200 }, { "epoch": 3.3783859479024616, "grad_norm": 3.393242835998535, "learning_rate": 0.0024092774971950473, "loss": 7.6674, "step": 829300 }, { "epoch": 3.3787933259258427, "grad_norm": 4.38031530380249, "learning_rate": 0.0024087817710618242, "loss": 7.663, "step": 829400 }, { "epoch": 3.3792007039492242, "grad_norm": 4.793169975280762, "learning_rate": 0.0024082860487326216, "loss": 7.6583, "step": 829500 }, { "epoch": 3.3796080819726058, "grad_norm": 6.607907772064209, "learning_rate": 0.0024077903302270366, "loss": 7.6713, "step": 829600 }, { "epoch": 3.3800154599959873, "grad_norm": 3.7557523250579834, "learning_rate": 0.002407294615564673, "loss": 7.657, "step": 829700 }, { "epoch": 3.380422838019369, "grad_norm": 6.895918369293213, "learning_rate": 0.002406798904765123, "loss": 7.6658, "step": 829800 }, { "epoch": 3.3808302160427504, "grad_norm": 14.377764701843262, "learning_rate": 0.0024063031978479887, "loss": 7.6702, "step": 829900 }, { "epoch": 3.381237594066132, "grad_norm": 7.503129005432129, "learning_rate": 0.0024058074948328676, "loss": 7.6571, "step": 830000 }, { "epoch": 3.381237594066132, "eval_MaskedAccuracy": 0.5073381341409644, "eval_loss": 1.6124346256256104, "eval_runtime": 189.1157, "eval_samples_per_second": 335.646, "eval_steps_per_second": 1.311, "step": 830000 }, { "epoch": 3.381644972089513, "grad_norm": 6.230504989624023, "learning_rate": 0.002405311795739352, "loss": 7.6695, "step": 830100 }, { "epoch": 3.3820523501128945, "grad_norm": 5.507158279418945, "learning_rate": 0.0024048161005870506, "loss": 7.7014, "step": 830200 }, { "epoch": 3.382459728136276, "grad_norm": 3.3367087841033936, "learning_rate": 0.0024043204093955536, "loss": 7.6919, "step": 830300 }, { "epoch": 3.3828671061596576, "grad_norm": 4.103170871734619, "learning_rate": 0.0024038247221844565, "loss": 7.646, "step": 830400 }, { "epoch": 3.383274484183039, "grad_norm": 2.787203550338745, "learning_rate": 0.002403329038973361, "loss": 7.6722, "step": 830500 }, { "epoch": 3.3836818622064206, "grad_norm": 7.393489837646484, "learning_rate": 0.002402833359781865, "loss": 7.6633, "step": 830600 }, { "epoch": 3.3840892402298017, "grad_norm": 7.408133506774902, "learning_rate": 0.0024023376846295597, "loss": 7.7092, "step": 830700 }, { "epoch": 3.3844966182531833, "grad_norm": 6.954722881317139, "learning_rate": 0.002401842013536045, "loss": 7.6782, "step": 830800 }, { "epoch": 3.384903996276565, "grad_norm": 7.104769229888916, "learning_rate": 0.002401346346520915, "loss": 7.6704, "step": 830900 }, { "epoch": 3.3853113742999463, "grad_norm": 6.4443535804748535, "learning_rate": 0.0024008506836037725, "loss": 7.6697, "step": 831000 }, { "epoch": 3.3853113742999463, "eval_MaskedAccuracy": 0.5072718987680596, "eval_loss": 1.6166130304336548, "eval_runtime": 152.8159, "eval_samples_per_second": 415.376, "eval_steps_per_second": 1.623, "step": 831000 }, { "epoch": 3.385718752323328, "grad_norm": 7.106754779815674, "learning_rate": 0.0024003550248042063, "loss": 7.6626, "step": 831100 }, { "epoch": 3.3861261303467094, "grad_norm": 4.369802474975586, "learning_rate": 0.0023998593701418185, "loss": 7.6737, "step": 831200 }, { "epoch": 3.386533508370091, "grad_norm": 3.6945748329162598, "learning_rate": 0.0023993637196362003, "loss": 7.6837, "step": 831300 }, { "epoch": 3.386940886393472, "grad_norm": 4.411739826202393, "learning_rate": 0.002398868073306949, "loss": 7.6952, "step": 831400 }, { "epoch": 3.3873482644168535, "grad_norm": 3.0309081077575684, "learning_rate": 0.0023983724311736634, "loss": 7.674, "step": 831500 }, { "epoch": 3.387755642440235, "grad_norm": 4.751334190368652, "learning_rate": 0.002397876793255933, "loss": 7.6716, "step": 831600 }, { "epoch": 3.3881630204636166, "grad_norm": 5.458463668823242, "learning_rate": 0.0023973811595733555, "loss": 7.699, "step": 831700 }, { "epoch": 3.388570398486998, "grad_norm": 3.3413469791412354, "learning_rate": 0.0023968855301455282, "loss": 7.7036, "step": 831800 }, { "epoch": 3.3889777765103792, "grad_norm": 3.039830207824707, "learning_rate": 0.0023963899049920384, "loss": 7.674, "step": 831900 }, { "epoch": 3.3893851545337608, "grad_norm": 3.8685097694396973, "learning_rate": 0.002395894284132492, "loss": 7.6793, "step": 832000 }, { "epoch": 3.3893851545337608, "eval_MaskedAccuracy": 0.5073406821303406, "eval_loss": 1.6137722730636597, "eval_runtime": 161.1481, "eval_samples_per_second": 393.899, "eval_steps_per_second": 1.539, "step": 832000 }, { "epoch": 3.3897925325571423, "grad_norm": 3.665583848953247, "learning_rate": 0.002395398667586476, "loss": 7.6717, "step": 832100 }, { "epoch": 3.390199910580524, "grad_norm": 4.173010349273682, "learning_rate": 0.002394903055373587, "loss": 7.659, "step": 832200 }, { "epoch": 3.3906072886039054, "grad_norm": 3.1028964519500732, "learning_rate": 0.0023944074475134203, "loss": 7.6563, "step": 832300 }, { "epoch": 3.391014666627287, "grad_norm": 5.7776288986206055, "learning_rate": 0.0023939118440255694, "loss": 7.667, "step": 832400 }, { "epoch": 3.3914220446506684, "grad_norm": 10.680218696594238, "learning_rate": 0.0023934162449296297, "loss": 7.6756, "step": 832500 }, { "epoch": 3.3918294226740495, "grad_norm": 4.816702842712402, "learning_rate": 0.002392920650245192, "loss": 7.6148, "step": 832600 }, { "epoch": 3.392236800697431, "grad_norm": 4.031116962432861, "learning_rate": 0.0023924250599918484, "loss": 7.6459, "step": 832700 }, { "epoch": 3.3926441787208126, "grad_norm": 4.229380130767822, "learning_rate": 0.0023919294741891945, "loss": 7.6725, "step": 832800 }, { "epoch": 3.393051556744194, "grad_norm": 5.243058681488037, "learning_rate": 0.0023914338928568237, "loss": 7.6495, "step": 832900 }, { "epoch": 3.3934589347675757, "grad_norm": 7.159852981567383, "learning_rate": 0.0023909383160143284, "loss": 7.6504, "step": 833000 }, { "epoch": 3.3934589347675757, "eval_MaskedAccuracy": 0.50725899062841, "eval_loss": 1.6201882362365723, "eval_runtime": 163.8534, "eval_samples_per_second": 387.395, "eval_steps_per_second": 1.514, "step": 833000 }, { "epoch": 3.393866312790957, "grad_norm": 5.065793991088867, "learning_rate": 0.002390442743681302, "loss": 7.6309, "step": 833100 }, { "epoch": 3.3942736908143383, "grad_norm": 6.484888553619385, "learning_rate": 0.0023899471758773356, "loss": 7.6971, "step": 833200 }, { "epoch": 3.39468106883772, "grad_norm": 3.7068583965301514, "learning_rate": 0.002389451612622021, "loss": 7.661, "step": 833300 }, { "epoch": 3.3950884468611013, "grad_norm": 9.850645065307617, "learning_rate": 0.0023889560539349544, "loss": 7.6384, "step": 833400 }, { "epoch": 3.395495824884483, "grad_norm": 8.616034507751465, "learning_rate": 0.002388460499835725, "loss": 7.6634, "step": 833500 }, { "epoch": 3.3959032029078644, "grad_norm": 5.236738204956055, "learning_rate": 0.002387964950343926, "loss": 7.6906, "step": 833600 }, { "epoch": 3.396310580931246, "grad_norm": 4.034549713134766, "learning_rate": 0.0023874694054791482, "loss": 7.6662, "step": 833700 }, { "epoch": 3.3967179589546275, "grad_norm": 3.472949981689453, "learning_rate": 0.0023869738652609832, "loss": 7.6667, "step": 833800 }, { "epoch": 3.3971253369780086, "grad_norm": 3.4258108139038086, "learning_rate": 0.0023864783297090197, "loss": 7.6713, "step": 833900 }, { "epoch": 3.39753271500139, "grad_norm": 5.242177486419678, "learning_rate": 0.002385982798842854, "loss": 7.6666, "step": 834000 }, { "epoch": 3.39753271500139, "eval_MaskedAccuracy": 0.5080079469272007, "eval_loss": 1.6206566095352173, "eval_runtime": 159.6813, "eval_samples_per_second": 397.517, "eval_steps_per_second": 1.553, "step": 834000 }, { "epoch": 3.3979400930247716, "grad_norm": 8.542133331298828, "learning_rate": 0.0023854872726820782, "loss": 7.6469, "step": 834100 }, { "epoch": 3.398347471048153, "grad_norm": 8.990893363952637, "learning_rate": 0.002384991751246276, "loss": 7.6571, "step": 834200 }, { "epoch": 3.3987548490715347, "grad_norm": 6.003313064575195, "learning_rate": 0.0023844962345550428, "loss": 7.6614, "step": 834300 }, { "epoch": 3.399162227094916, "grad_norm": 7.175158500671387, "learning_rate": 0.0023840007226279664, "loss": 7.6823, "step": 834400 }, { "epoch": 3.3995696051182973, "grad_norm": 7.723865985870361, "learning_rate": 0.0023835052154846407, "loss": 7.6675, "step": 834500 }, { "epoch": 3.399976983141679, "grad_norm": 8.568977355957031, "learning_rate": 0.002383009713144654, "loss": 7.6294, "step": 834600 }, { "epoch": 3.4003843611650604, "grad_norm": 4.583963394165039, "learning_rate": 0.0023825142156275957, "loss": 7.6542, "step": 834700 }, { "epoch": 3.400791739188442, "grad_norm": 9.71217155456543, "learning_rate": 0.002382018722953053, "loss": 7.6478, "step": 834800 }, { "epoch": 3.4011991172118234, "grad_norm": 6.550137996673584, "learning_rate": 0.002381523235140619, "loss": 7.6678, "step": 834900 }, { "epoch": 3.401606495235205, "grad_norm": 2.844496488571167, "learning_rate": 0.0023810277522098825, "loss": 7.6329, "step": 835000 }, { "epoch": 3.401606495235205, "eval_MaskedAccuracy": 0.5079399934763913, "eval_loss": 1.6060205698013306, "eval_runtime": 156.158, "eval_samples_per_second": 406.486, "eval_steps_per_second": 1.588, "step": 835000 }, { "epoch": 3.402013873258586, "grad_norm": 5.937469005584717, "learning_rate": 0.002380532274180431, "loss": 7.676, "step": 835100 }, { "epoch": 3.4024212512819676, "grad_norm": 5.086205005645752, "learning_rate": 0.0023800368010718533, "loss": 7.644, "step": 835200 }, { "epoch": 3.402828629305349, "grad_norm": 3.5045113563537598, "learning_rate": 0.002379541332903739, "loss": 7.6603, "step": 835300 }, { "epoch": 3.4032360073287307, "grad_norm": 7.176479339599609, "learning_rate": 0.0023790458696956772, "loss": 7.6436, "step": 835400 }, { "epoch": 3.403643385352112, "grad_norm": 4.983194351196289, "learning_rate": 0.0023785504114672566, "loss": 7.6698, "step": 835500 }, { "epoch": 3.4040507633754937, "grad_norm": 3.059422731399536, "learning_rate": 0.002378054958238063, "loss": 7.6595, "step": 835600 }, { "epoch": 3.404458141398875, "grad_norm": 8.528168678283691, "learning_rate": 0.0023775595100276863, "loss": 7.6538, "step": 835700 }, { "epoch": 3.4048655194222563, "grad_norm": 5.731561183929443, "learning_rate": 0.002377064066855718, "loss": 7.6842, "step": 835800 }, { "epoch": 3.405272897445638, "grad_norm": 10.47243881225586, "learning_rate": 0.002376568628741738, "loss": 7.6407, "step": 835900 }, { "epoch": 3.4056802754690194, "grad_norm": 5.699708461761475, "learning_rate": 0.0023760731957053367, "loss": 7.6225, "step": 836000 }, { "epoch": 3.4056802754690194, "eval_MaskedAccuracy": 0.5075000617329241, "eval_loss": 1.609775185585022, "eval_runtime": 155.485, "eval_samples_per_second": 408.245, "eval_steps_per_second": 1.595, "step": 836000 }, { "epoch": 3.406087653492401, "grad_norm": 4.290980815887451, "learning_rate": 0.002375577767766104, "loss": 7.6752, "step": 836100 }, { "epoch": 3.4064950315157825, "grad_norm": 5.101619720458984, "learning_rate": 0.002375082344943621, "loss": 7.6888, "step": 836200 }, { "epoch": 3.406902409539164, "grad_norm": 4.72704553604126, "learning_rate": 0.002374586927257481, "loss": 7.6807, "step": 836300 }, { "epoch": 3.407309787562545, "grad_norm": 5.734152793884277, "learning_rate": 0.0023740915147272687, "loss": 7.6606, "step": 836400 }, { "epoch": 3.4077171655859266, "grad_norm": 12.271175384521484, "learning_rate": 0.002373596107372566, "loss": 7.6721, "step": 836500 }, { "epoch": 3.408124543609308, "grad_norm": 9.951292991638184, "learning_rate": 0.0023731007052129613, "loss": 7.6859, "step": 836600 }, { "epoch": 3.4085319216326897, "grad_norm": 4.7487030029296875, "learning_rate": 0.0023726053082680433, "loss": 7.6775, "step": 836700 }, { "epoch": 3.4089392996560712, "grad_norm": 7.660290241241455, "learning_rate": 0.0023721099165573962, "loss": 7.6731, "step": 836800 }, { "epoch": 3.4093466776794523, "grad_norm": 3.7929229736328125, "learning_rate": 0.002371614530100602, "loss": 7.6713, "step": 836900 }, { "epoch": 3.409754055702834, "grad_norm": 4.572562217712402, "learning_rate": 0.0023711191489172484, "loss": 7.6696, "step": 837000 }, { "epoch": 3.409754055702834, "eval_MaskedAccuracy": 0.507034410269781, "eval_loss": 1.612809181213379, "eval_runtime": 164.4911, "eval_samples_per_second": 385.893, "eval_steps_per_second": 1.508, "step": 837000 }, { "epoch": 3.4101614337262154, "grad_norm": 11.561025619506836, "learning_rate": 0.0023706237730269255, "loss": 7.652, "step": 837100 }, { "epoch": 3.410568811749597, "grad_norm": 5.122629165649414, "learning_rate": 0.002370128402449212, "loss": 7.6614, "step": 837200 }, { "epoch": 3.4109761897729785, "grad_norm": 6.234327793121338, "learning_rate": 0.0023696330372036936, "loss": 7.6058, "step": 837300 }, { "epoch": 3.41138356779636, "grad_norm": 10.977176666259766, "learning_rate": 0.002369137677309954, "loss": 7.6961, "step": 837400 }, { "epoch": 3.4117909458197415, "grad_norm": 4.57222843170166, "learning_rate": 0.0023686423227875794, "loss": 7.6631, "step": 837500 }, { "epoch": 3.4121983238431226, "grad_norm": 4.3072991371154785, "learning_rate": 0.0023681469736561507, "loss": 7.6266, "step": 837600 }, { "epoch": 3.412605701866504, "grad_norm": 4.29278564453125, "learning_rate": 0.0023676516299352547, "loss": 7.6838, "step": 837700 }, { "epoch": 3.4130130798898857, "grad_norm": 5.988570213317871, "learning_rate": 0.0023671562916444764, "loss": 7.6315, "step": 837800 }, { "epoch": 3.413420457913267, "grad_norm": 3.544550895690918, "learning_rate": 0.0023666609588033935, "loss": 7.6627, "step": 837900 }, { "epoch": 3.4138278359366487, "grad_norm": 5.6219329833984375, "learning_rate": 0.0023661656314315953, "loss": 7.6553, "step": 838000 }, { "epoch": 3.4138278359366487, "eval_MaskedAccuracy": 0.5070233921709371, "eval_loss": 1.610743522644043, "eval_runtime": 173.3502, "eval_samples_per_second": 366.172, "eval_steps_per_second": 1.431, "step": 838000 }, { "epoch": 3.4142352139600303, "grad_norm": 9.236532211303711, "learning_rate": 0.002365670309548661, "loss": 7.6769, "step": 838100 }, { "epoch": 3.4146425919834114, "grad_norm": 5.578697681427002, "learning_rate": 0.0023651749931741742, "loss": 7.66, "step": 838200 }, { "epoch": 3.415049970006793, "grad_norm": 9.232965469360352, "learning_rate": 0.002364679682327721, "loss": 7.6472, "step": 838300 }, { "epoch": 3.4154573480301744, "grad_norm": 4.654336929321289, "learning_rate": 0.0023641843770288774, "loss": 7.6592, "step": 838400 }, { "epoch": 3.415864726053556, "grad_norm": 4.323183059692383, "learning_rate": 0.0023636890772972315, "loss": 7.654, "step": 838500 }, { "epoch": 3.4162721040769375, "grad_norm": 4.7549848556518555, "learning_rate": 0.002363193783152361, "loss": 7.6212, "step": 838600 }, { "epoch": 3.416679482100319, "grad_norm": 8.74051284790039, "learning_rate": 0.0023626984946138463, "loss": 7.6539, "step": 838700 }, { "epoch": 3.4170868601237006, "grad_norm": 7.818248271942139, "learning_rate": 0.0023622032117012735, "loss": 7.6563, "step": 838800 }, { "epoch": 3.4174942381470816, "grad_norm": 4.790457248687744, "learning_rate": 0.00236170793443422, "loss": 7.6859, "step": 838900 }, { "epoch": 3.417901616170463, "grad_norm": 4.331161022186279, "learning_rate": 0.0023612126628322714, "loss": 7.6578, "step": 839000 }, { "epoch": 3.417901616170463, "eval_MaskedAccuracy": 0.5078824194169587, "eval_loss": 1.6002832651138306, "eval_runtime": 161.4853, "eval_samples_per_second": 393.076, "eval_steps_per_second": 1.536, "step": 839000 }, { "epoch": 3.4183089941938447, "grad_norm": 5.092179775238037, "learning_rate": 0.0023607173969150035, "loss": 7.6521, "step": 839100 }, { "epoch": 3.4187163722172262, "grad_norm": 10.025911331176758, "learning_rate": 0.0023602221367020003, "loss": 7.6779, "step": 839200 }, { "epoch": 3.4191237502406078, "grad_norm": 5.616159439086914, "learning_rate": 0.00235972688221284, "loss": 7.6613, "step": 839300 }, { "epoch": 3.419531128263989, "grad_norm": 3.7099969387054443, "learning_rate": 0.0023592316334671002, "loss": 7.666, "step": 839400 }, { "epoch": 3.4199385062873704, "grad_norm": 9.61128044128418, "learning_rate": 0.0023587363904843633, "loss": 7.6497, "step": 839500 }, { "epoch": 3.420345884310752, "grad_norm": 3.531198024749756, "learning_rate": 0.002358241153284211, "loss": 7.6365, "step": 839600 }, { "epoch": 3.4207532623341335, "grad_norm": 3.091862440109253, "learning_rate": 0.002357745921886218, "loss": 7.6708, "step": 839700 }, { "epoch": 3.421160640357515, "grad_norm": 9.81743049621582, "learning_rate": 0.0023572506963099706, "loss": 7.6823, "step": 839800 }, { "epoch": 3.4215680183808965, "grad_norm": 7.8127055168151855, "learning_rate": 0.0023567554765750423, "loss": 7.6884, "step": 839900 }, { "epoch": 3.421975396404278, "grad_norm": 12.666727066040039, "learning_rate": 0.002356260262701012, "loss": 7.6404, "step": 840000 }, { "epoch": 3.421975396404278, "eval_MaskedAccuracy": 0.5075237537295956, "eval_loss": 1.6168293952941895, "eval_runtime": 160.0079, "eval_samples_per_second": 396.705, "eval_steps_per_second": 1.55, "step": 840000 }, { "epoch": 3.422382774427659, "grad_norm": 5.36226749420166, "learning_rate": 0.0023557650547074605, "loss": 7.6518, "step": 840100 }, { "epoch": 3.4227901524510407, "grad_norm": 3.917013645172119, "learning_rate": 0.0023552698526139635, "loss": 7.6621, "step": 840200 }, { "epoch": 3.423197530474422, "grad_norm": 6.045384883880615, "learning_rate": 0.0023547746564400976, "loss": 7.6472, "step": 840300 }, { "epoch": 3.4236049084978037, "grad_norm": 3.1460657119750977, "learning_rate": 0.002354279466205448, "loss": 7.6519, "step": 840400 }, { "epoch": 3.4240122865211853, "grad_norm": 6.753716945648193, "learning_rate": 0.0023537842819295837, "loss": 7.6562, "step": 840500 }, { "epoch": 3.424419664544567, "grad_norm": 6.56367301940918, "learning_rate": 0.0023532891036320857, "loss": 7.6255, "step": 840600 }, { "epoch": 3.424827042567948, "grad_norm": 3.510305881500244, "learning_rate": 0.0023527939313325304, "loss": 7.6579, "step": 840700 }, { "epoch": 3.4252344205913294, "grad_norm": 4.690621376037598, "learning_rate": 0.0023522987650504935, "loss": 7.6697, "step": 840800 }, { "epoch": 3.425641798614711, "grad_norm": 2.993744134902954, "learning_rate": 0.0023518036048055563, "loss": 7.6767, "step": 840900 }, { "epoch": 3.4260491766380925, "grad_norm": 7.15242862701416, "learning_rate": 0.00235130845061729, "loss": 7.6736, "step": 841000 }, { "epoch": 3.4260491766380925, "eval_MaskedAccuracy": 0.5068737466172768, "eval_loss": 1.6180052757263184, "eval_runtime": 176.1771, "eval_samples_per_second": 360.297, "eval_steps_per_second": 1.408, "step": 841000 }, { "epoch": 3.426456554661474, "grad_norm": 4.285866737365723, "learning_rate": 0.0023508133025052736, "loss": 7.6334, "step": 841100 }, { "epoch": 3.4268639326848556, "grad_norm": 3.8874728679656982, "learning_rate": 0.0023503181604890835, "loss": 7.6595, "step": 841200 }, { "epoch": 3.427271310708237, "grad_norm": 8.724327087402344, "learning_rate": 0.00234982302458829, "loss": 7.7037, "step": 841300 }, { "epoch": 3.427678688731618, "grad_norm": 3.802838087081909, "learning_rate": 0.0023493278948224744, "loss": 7.6617, "step": 841400 }, { "epoch": 3.4280860667549997, "grad_norm": 5.62872838973999, "learning_rate": 0.002348832771211211, "loss": 7.6423, "step": 841500 }, { "epoch": 3.4284934447783812, "grad_norm": 4.328555107116699, "learning_rate": 0.0023483376537740703, "loss": 7.6562, "step": 841600 }, { "epoch": 3.428900822801763, "grad_norm": 3.9430062770843506, "learning_rate": 0.0023478425425306326, "loss": 7.6444, "step": 841700 }, { "epoch": 3.4293082008251443, "grad_norm": 10.714035987854004, "learning_rate": 0.002347347437500468, "loss": 7.6648, "step": 841800 }, { "epoch": 3.4297155788485254, "grad_norm": 4.730955600738525, "learning_rate": 0.0023468523387031527, "loss": 7.644, "step": 841900 }, { "epoch": 3.430122956871907, "grad_norm": 3.557774066925049, "learning_rate": 0.0023463572461582603, "loss": 7.6728, "step": 842000 }, { "epoch": 3.430122956871907, "eval_MaskedAccuracy": 0.5075078193670638, "eval_loss": 1.6078718900680542, "eval_runtime": 162.7347, "eval_samples_per_second": 390.058, "eval_steps_per_second": 1.524, "step": 842000 }, { "epoch": 3.4305303348952885, "grad_norm": 6.733087539672852, "learning_rate": 0.0023458621598853656, "loss": 7.6328, "step": 842100 }, { "epoch": 3.43093771291867, "grad_norm": 7.9630303382873535, "learning_rate": 0.0023453670799040427, "loss": 7.6389, "step": 842200 }, { "epoch": 3.4313450909420515, "grad_norm": 6.708206653594971, "learning_rate": 0.0023448720062338624, "loss": 7.6758, "step": 842300 }, { "epoch": 3.431752468965433, "grad_norm": 8.816062927246094, "learning_rate": 0.0023443769388943984, "loss": 7.6319, "step": 842400 }, { "epoch": 3.4321598469888146, "grad_norm": 4.445310592651367, "learning_rate": 0.0023438818779052233, "loss": 7.661, "step": 842500 }, { "epoch": 3.4325672250121957, "grad_norm": 7.010202884674072, "learning_rate": 0.0023433868232859076, "loss": 7.6547, "step": 842600 }, { "epoch": 3.432974603035577, "grad_norm": 4.143481254577637, "learning_rate": 0.002342891775056026, "loss": 7.699, "step": 842700 }, { "epoch": 3.4333819810589588, "grad_norm": 7.141845703125, "learning_rate": 0.002342396733235149, "loss": 7.6717, "step": 842800 }, { "epoch": 3.4337893590823403, "grad_norm": 4.089468479156494, "learning_rate": 0.002341901697842847, "loss": 7.661, "step": 842900 }, { "epoch": 3.434196737105722, "grad_norm": 5.759546756744385, "learning_rate": 0.0023414066688986954, "loss": 7.6137, "step": 843000 }, { "epoch": 3.434196737105722, "eval_MaskedAccuracy": 0.5077780087292555, "eval_loss": 1.6099501848220825, "eval_runtime": 163.3517, "eval_samples_per_second": 388.585, "eval_steps_per_second": 1.518, "step": 843000 }, { "epoch": 3.4346041151291034, "grad_norm": 4.585111141204834, "learning_rate": 0.0023409116464222653, "loss": 7.6545, "step": 843100 }, { "epoch": 3.4350114931524844, "grad_norm": 10.037331581115723, "learning_rate": 0.002340416630433123, "loss": 7.6688, "step": 843200 }, { "epoch": 3.435418871175866, "grad_norm": 6.103098392486572, "learning_rate": 0.0023399216209508424, "loss": 7.6411, "step": 843300 }, { "epoch": 3.4358262491992475, "grad_norm": 6.380743026733398, "learning_rate": 0.0023394266179949936, "loss": 7.6444, "step": 843400 }, { "epoch": 3.436233627222629, "grad_norm": 12.773555755615234, "learning_rate": 0.0023389316215851434, "loss": 7.6456, "step": 843500 }, { "epoch": 3.4366410052460106, "grad_norm": 5.444146156311035, "learning_rate": 0.0023384366317408677, "loss": 7.6484, "step": 843600 }, { "epoch": 3.437048383269392, "grad_norm": 4.911456108093262, "learning_rate": 0.0023379416484817298, "loss": 7.6505, "step": 843700 }, { "epoch": 3.4374557612927736, "grad_norm": 10.463674545288086, "learning_rate": 0.002337446671827304, "loss": 7.6468, "step": 843800 }, { "epoch": 3.4378631393161547, "grad_norm": 4.498503684997559, "learning_rate": 0.002336951701797155, "loss": 7.6749, "step": 843900 }, { "epoch": 3.4382705173395363, "grad_norm": 5.607254981994629, "learning_rate": 0.0023364567384108557, "loss": 7.6593, "step": 844000 }, { "epoch": 3.4382705173395363, "eval_MaskedAccuracy": 0.5075430470323052, "eval_loss": 1.6166020631790161, "eval_runtime": 161.8542, "eval_samples_per_second": 392.18, "eval_steps_per_second": 1.532, "step": 844000 }, { "epoch": 3.438677895362918, "grad_norm": 4.140470027923584, "learning_rate": 0.002335961781687971, "loss": 7.6626, "step": 844100 }, { "epoch": 3.4390852733862993, "grad_norm": 3.1368746757507324, "learning_rate": 0.002335466831648073, "loss": 7.6365, "step": 844200 }, { "epoch": 3.439492651409681, "grad_norm": 6.783154010772705, "learning_rate": 0.002334971888310729, "loss": 7.63, "step": 844300 }, { "epoch": 3.439900029433062, "grad_norm": 2.900268077850342, "learning_rate": 0.0023344769516955043, "loss": 7.6636, "step": 844400 }, { "epoch": 3.4403074074564435, "grad_norm": 5.9655442237854, "learning_rate": 0.002333982021821967, "loss": 7.6346, "step": 844500 }, { "epoch": 3.440714785479825, "grad_norm": 5.214728832244873, "learning_rate": 0.0023334870987096836, "loss": 7.6563, "step": 844600 }, { "epoch": 3.4411221635032065, "grad_norm": 5.482659816741943, "learning_rate": 0.002332992182378224, "loss": 7.6706, "step": 844700 }, { "epoch": 3.441529541526588, "grad_norm": 6.027997016906738, "learning_rate": 0.0023324972728471513, "loss": 7.6664, "step": 844800 }, { "epoch": 3.4419369195499696, "grad_norm": 5.681214809417725, "learning_rate": 0.002332002370136035, "loss": 7.6654, "step": 844900 }, { "epoch": 3.442344297573351, "grad_norm": 6.694900035858154, "learning_rate": 0.002331507474264443, "loss": 7.6208, "step": 845000 }, { "epoch": 3.442344297573351, "eval_MaskedAccuracy": 0.5075787665132473, "eval_loss": 1.6100114583969116, "eval_runtime": 166.2451, "eval_samples_per_second": 381.822, "eval_steps_per_second": 1.492, "step": 845000 }, { "epoch": 3.4427516755967322, "grad_norm": 4.156238555908203, "learning_rate": 0.002331012585251938, "loss": 7.669, "step": 845100 }, { "epoch": 3.4431590536201138, "grad_norm": 5.38286018371582, "learning_rate": 0.0023305177031180836, "loss": 7.6487, "step": 845200 }, { "epoch": 3.4435664316434953, "grad_norm": 3.2154221534729004, "learning_rate": 0.0023300228278824485, "loss": 7.6623, "step": 845300 }, { "epoch": 3.443973809666877, "grad_norm": 7.410704612731934, "learning_rate": 0.0023295279595645967, "loss": 7.6404, "step": 845400 }, { "epoch": 3.4443811876902584, "grad_norm": 3.97457218170166, "learning_rate": 0.0023290330981840936, "loss": 7.675, "step": 845500 }, { "epoch": 3.44478856571364, "grad_norm": 3.8987467288970947, "learning_rate": 0.0023285382437605038, "loss": 7.654, "step": 845600 }, { "epoch": 3.445195943737021, "grad_norm": 4.117824554443359, "learning_rate": 0.0023280433963133894, "loss": 7.6211, "step": 845700 }, { "epoch": 3.4456033217604025, "grad_norm": 3.741800546646118, "learning_rate": 0.002327548555862319, "loss": 7.6609, "step": 845800 }, { "epoch": 3.446010699783784, "grad_norm": 5.278165817260742, "learning_rate": 0.0023270537224268505, "loss": 7.6582, "step": 845900 }, { "epoch": 3.4464180778071656, "grad_norm": 5.979162693023682, "learning_rate": 0.00232655889602655, "loss": 7.6605, "step": 846000 }, { "epoch": 3.4464180778071656, "eval_MaskedAccuracy": 0.5075461690063723, "eval_loss": 1.609590768814087, "eval_runtime": 164.4847, "eval_samples_per_second": 385.908, "eval_steps_per_second": 1.508, "step": 846000 }, { "epoch": 3.446825455830547, "grad_norm": 3.0768141746520996, "learning_rate": 0.0023260640766809815, "loss": 7.6496, "step": 846100 }, { "epoch": 3.4472328338539286, "grad_norm": 4.9974284172058105, "learning_rate": 0.0023255692644097085, "loss": 7.6899, "step": 846200 }, { "epoch": 3.44764021187731, "grad_norm": 5.592404842376709, "learning_rate": 0.0023250744592322914, "loss": 7.6687, "step": 846300 }, { "epoch": 3.4480475899006913, "grad_norm": 6.906416416168213, "learning_rate": 0.002324579661168292, "loss": 7.6625, "step": 846400 }, { "epoch": 3.448454967924073, "grad_norm": 11.000245094299316, "learning_rate": 0.0023240848702372717, "loss": 7.6962, "step": 846500 }, { "epoch": 3.4488623459474543, "grad_norm": 4.422186374664307, "learning_rate": 0.0023235900864587975, "loss": 7.6695, "step": 846600 }, { "epoch": 3.449269723970836, "grad_norm": 3.7603700160980225, "learning_rate": 0.002323095309852429, "loss": 7.6794, "step": 846700 }, { "epoch": 3.4496771019942174, "grad_norm": 4.572369575500488, "learning_rate": 0.002322600540437724, "loss": 7.6602, "step": 846800 }, { "epoch": 3.4500844800175985, "grad_norm": 8.38378620147705, "learning_rate": 0.0023221057782342436, "loss": 7.65, "step": 846900 }, { "epoch": 3.45049185804098, "grad_norm": 4.574950695037842, "learning_rate": 0.002321611023261551, "loss": 7.6548, "step": 847000 }, { "epoch": 3.45049185804098, "eval_MaskedAccuracy": 0.5073147574991682, "eval_loss": 1.611008644104004, "eval_runtime": 155.2146, "eval_samples_per_second": 408.956, "eval_steps_per_second": 1.598, "step": 847000 }, { "epoch": 3.4508992360643616, "grad_norm": 11.892849922180176, "learning_rate": 0.002321116275539205, "loss": 7.6373, "step": 847100 }, { "epoch": 3.451306614087743, "grad_norm": 7.027309417724609, "learning_rate": 0.002320621535086765, "loss": 7.6589, "step": 847200 }, { "epoch": 3.4517139921111246, "grad_norm": 6.043745994567871, "learning_rate": 0.002320126801923793, "loss": 7.6598, "step": 847300 }, { "epoch": 3.452121370134506, "grad_norm": 8.969237327575684, "learning_rate": 0.002319632076069847, "loss": 7.6371, "step": 847400 }, { "epoch": 3.4525287481578877, "grad_norm": 3.8428471088409424, "learning_rate": 0.002319137357544488, "loss": 7.6735, "step": 847500 }, { "epoch": 3.4529361261812688, "grad_norm": 5.334589958190918, "learning_rate": 0.0023186426463672736, "loss": 7.6582, "step": 847600 }, { "epoch": 3.4533435042046503, "grad_norm": 8.992326736450195, "learning_rate": 0.0023181479425577606, "loss": 7.6213, "step": 847700 }, { "epoch": 3.453750882228032, "grad_norm": 4.794075012207031, "learning_rate": 0.0023176532461355063, "loss": 7.643, "step": 847800 }, { "epoch": 3.4541582602514134, "grad_norm": 4.427097797393799, "learning_rate": 0.002317158557120074, "loss": 7.6314, "step": 847900 }, { "epoch": 3.454565638274795, "grad_norm": 7.120694160461426, "learning_rate": 0.002316663875531021, "loss": 7.6218, "step": 848000 }, { "epoch": 3.454565638274795, "eval_MaskedAccuracy": 0.5076196713802682, "eval_loss": 1.6033300161361694, "eval_runtime": 153.0214, "eval_samples_per_second": 414.818, "eval_steps_per_second": 1.621, "step": 848000 }, { "epoch": 3.4549730162981764, "grad_norm": 8.416783332824707, "learning_rate": 0.0023161692013878997, "loss": 7.6466, "step": 848100 }, { "epoch": 3.4553803943215575, "grad_norm": 6.999683380126953, "learning_rate": 0.0023156745347102685, "loss": 7.6664, "step": 848200 }, { "epoch": 3.455787772344939, "grad_norm": 2.674804449081421, "learning_rate": 0.002315179875517685, "loss": 7.6796, "step": 848300 }, { "epoch": 3.4561951503683206, "grad_norm": 11.891407012939453, "learning_rate": 0.00231468522382971, "loss": 7.6673, "step": 848400 }, { "epoch": 3.456602528391702, "grad_norm": 11.699591636657715, "learning_rate": 0.002314190579665894, "loss": 7.6564, "step": 848500 }, { "epoch": 3.4570099064150837, "grad_norm": 2.579500436782837, "learning_rate": 0.002313695943045797, "loss": 7.651, "step": 848600 }, { "epoch": 3.457417284438465, "grad_norm": 4.95082426071167, "learning_rate": 0.002313201313988971, "loss": 7.6678, "step": 848700 }, { "epoch": 3.4578246624618467, "grad_norm": 5.20321798324585, "learning_rate": 0.002312706692514975, "loss": 7.6261, "step": 848800 }, { "epoch": 3.458232040485228, "grad_norm": 3.087709426879883, "learning_rate": 0.0023122120786433593, "loss": 7.6493, "step": 848900 }, { "epoch": 3.4586394185086093, "grad_norm": 6.596898555755615, "learning_rate": 0.0023117174723936815, "loss": 7.671, "step": 849000 }, { "epoch": 3.4586394185086093, "eval_MaskedAccuracy": 0.5074166381801796, "eval_loss": 1.610337257385254, "eval_runtime": 169.3394, "eval_samples_per_second": 374.845, "eval_steps_per_second": 1.465, "step": 849000 }, { "epoch": 3.459046796531991, "grad_norm": 4.9657368659973145, "learning_rate": 0.0023112228737854944, "loss": 7.6404, "step": 849100 }, { "epoch": 3.4594541745553724, "grad_norm": 9.766314506530762, "learning_rate": 0.0023107282828383553, "loss": 7.674, "step": 849200 }, { "epoch": 3.459861552578754, "grad_norm": 9.312552452087402, "learning_rate": 0.0023102336995718164, "loss": 7.6506, "step": 849300 }, { "epoch": 3.460268930602135, "grad_norm": 5.444794178009033, "learning_rate": 0.0023097391240054312, "loss": 7.6577, "step": 849400 }, { "epoch": 3.4606763086255166, "grad_norm": 5.837581634521484, "learning_rate": 0.002309244556158751, "loss": 7.6578, "step": 849500 }, { "epoch": 3.461083686648898, "grad_norm": 3.6645426750183105, "learning_rate": 0.002308749996051329, "loss": 7.6411, "step": 849600 }, { "epoch": 3.4614910646722796, "grad_norm": 3.197244644165039, "learning_rate": 0.0023082554437027234, "loss": 7.6716, "step": 849700 }, { "epoch": 3.461898442695661, "grad_norm": 5.347982406616211, "learning_rate": 0.002307760899132483, "loss": 7.6504, "step": 849800 }, { "epoch": 3.4623058207190427, "grad_norm": 4.38731050491333, "learning_rate": 0.0023072663623601567, "loss": 7.6677, "step": 849900 }, { "epoch": 3.4627131987424242, "grad_norm": 4.275853633880615, "learning_rate": 0.0023067718334052988, "loss": 7.6603, "step": 850000 }, { "epoch": 3.4627131987424242, "eval_MaskedAccuracy": 0.5076754871832684, "eval_loss": 1.6124376058578491, "eval_runtime": 162.2568, "eval_samples_per_second": 391.207, "eval_steps_per_second": 1.528, "step": 850000 }, { "epoch": 3.4631205767658053, "grad_norm": 3.056399345397949, "learning_rate": 0.0023062773122874613, "loss": 7.6485, "step": 850100 }, { "epoch": 3.463527954789187, "grad_norm": 7.190247535705566, "learning_rate": 0.0023057827990261934, "loss": 7.6689, "step": 850200 }, { "epoch": 3.4639353328125684, "grad_norm": 5.2235612869262695, "learning_rate": 0.00230528829364105, "loss": 7.6253, "step": 850300 }, { "epoch": 3.46434271083595, "grad_norm": 6.351413726806641, "learning_rate": 0.0023047937961515746, "loss": 7.6658, "step": 850400 }, { "epoch": 3.4647500888593314, "grad_norm": 6.8891191482543945, "learning_rate": 0.0023042993065773214, "loss": 7.6694, "step": 850500 }, { "epoch": 3.465157466882713, "grad_norm": 3.4621753692626953, "learning_rate": 0.0023038048249378407, "loss": 7.6528, "step": 850600 }, { "epoch": 3.465564844906094, "grad_norm": 9.703103065490723, "learning_rate": 0.0023033103512526826, "loss": 7.6309, "step": 850700 }, { "epoch": 3.4659722229294756, "grad_norm": 3.3330769538879395, "learning_rate": 0.002302815885541394, "loss": 7.6724, "step": 850800 }, { "epoch": 3.466379600952857, "grad_norm": 6.550228595733643, "learning_rate": 0.0023023214278235244, "loss": 7.6785, "step": 850900 }, { "epoch": 3.4667869789762387, "grad_norm": 9.059885025024414, "learning_rate": 0.002301826978118625, "loss": 7.6422, "step": 851000 }, { "epoch": 3.4667869789762387, "eval_MaskedAccuracy": 0.5075013760835131, "eval_loss": 1.6093096733093262, "eval_runtime": 162.2557, "eval_samples_per_second": 391.21, "eval_steps_per_second": 1.528, "step": 851000 }, { "epoch": 3.46719435699962, "grad_norm": 11.803396224975586, "learning_rate": 0.002301332536446239, "loss": 7.6661, "step": 851100 }, { "epoch": 3.4676017350230017, "grad_norm": 7.435437202453613, "learning_rate": 0.0023008381028259196, "loss": 7.6549, "step": 851200 }, { "epoch": 3.4680091130463833, "grad_norm": 3.796633005142212, "learning_rate": 0.0023003436772772116, "loss": 7.6632, "step": 851300 }, { "epoch": 3.4684164910697644, "grad_norm": 8.798277854919434, "learning_rate": 0.0022998492598196585, "loss": 7.6673, "step": 851400 }, { "epoch": 3.468823869093146, "grad_norm": 6.033581733703613, "learning_rate": 0.0022993548504728132, "loss": 7.6103, "step": 851500 }, { "epoch": 3.4692312471165274, "grad_norm": 3.235607147216797, "learning_rate": 0.0022988604492562223, "loss": 7.656, "step": 851600 }, { "epoch": 3.469638625139909, "grad_norm": 8.098994255065918, "learning_rate": 0.00229836605618943, "loss": 7.6414, "step": 851700 }, { "epoch": 3.4700460031632905, "grad_norm": 3.687758445739746, "learning_rate": 0.0022978716712919807, "loss": 7.6537, "step": 851800 }, { "epoch": 3.4704533811866716, "grad_norm": 3.4156992435455322, "learning_rate": 0.0022973772945834208, "loss": 7.6415, "step": 851900 }, { "epoch": 3.470860759210053, "grad_norm": 6.617143630981445, "learning_rate": 0.0022968829260833026, "loss": 7.6185, "step": 852000 }, { "epoch": 3.470860759210053, "eval_MaskedAccuracy": 0.5079360642413758, "eval_loss": 1.6091926097869873, "eval_runtime": 157.0559, "eval_samples_per_second": 404.162, "eval_steps_per_second": 1.579, "step": 852000 }, { "epoch": 3.4712681372334346, "grad_norm": 10.953904151916504, "learning_rate": 0.002296388565811161, "loss": 7.6753, "step": 852100 }, { "epoch": 3.471675515256816, "grad_norm": 4.454187870025635, "learning_rate": 0.0022958942137865468, "loss": 7.7009, "step": 852200 }, { "epoch": 3.4720828932801977, "grad_norm": 13.108904838562012, "learning_rate": 0.0022953998700290046, "loss": 7.6521, "step": 852300 }, { "epoch": 3.4724902713035792, "grad_norm": 7.302590370178223, "learning_rate": 0.0022949055345580774, "loss": 7.6702, "step": 852400 }, { "epoch": 3.4728976493269608, "grad_norm": 8.430106163024902, "learning_rate": 0.0022944112073933066, "loss": 7.6754, "step": 852500 }, { "epoch": 3.473305027350342, "grad_norm": 3.9765546321868896, "learning_rate": 0.002293916888554235, "loss": 7.6396, "step": 852600 }, { "epoch": 3.4737124053737234, "grad_norm": 7.331282615661621, "learning_rate": 0.0022934225780604106, "loss": 7.6808, "step": 852700 }, { "epoch": 3.474119783397105, "grad_norm": 3.967650890350342, "learning_rate": 0.0022929282759313707, "loss": 7.6518, "step": 852800 }, { "epoch": 3.4745271614204865, "grad_norm": 3.893359899520874, "learning_rate": 0.002292433982186661, "loss": 7.6548, "step": 852900 }, { "epoch": 3.474934539443868, "grad_norm": 5.097898483276367, "learning_rate": 0.0022919396968458217, "loss": 7.6644, "step": 853000 }, { "epoch": 3.474934539443868, "eval_MaskedAccuracy": 0.5079424612083103, "eval_loss": 1.614734411239624, "eval_runtime": 163.4256, "eval_samples_per_second": 388.409, "eval_steps_per_second": 1.518, "step": 853000 }, { "epoch": 3.4753419174672495, "grad_norm": 11.478796005249023, "learning_rate": 0.0022914454199283985, "loss": 7.6821, "step": 853100 }, { "epoch": 3.4757492954906306, "grad_norm": 6.6236395835876465, "learning_rate": 0.0022909511514539295, "loss": 7.6885, "step": 853200 }, { "epoch": 3.476156673514012, "grad_norm": 7.352938175201416, "learning_rate": 0.0022904568914419547, "loss": 7.6725, "step": 853300 }, { "epoch": 3.4765640515373937, "grad_norm": 14.414018630981445, "learning_rate": 0.0022899626399120175, "loss": 7.6521, "step": 853400 }, { "epoch": 3.476971429560775, "grad_norm": 5.734878063201904, "learning_rate": 0.002289468396883654, "loss": 7.6881, "step": 853500 }, { "epoch": 3.4773788075841567, "grad_norm": 4.813170909881592, "learning_rate": 0.0022889741623764123, "loss": 7.6756, "step": 853600 }, { "epoch": 3.4777861856075383, "grad_norm": 7.029141902923584, "learning_rate": 0.0022884799364098255, "loss": 7.6432, "step": 853700 }, { "epoch": 3.47819356363092, "grad_norm": 3.3507001399993896, "learning_rate": 0.0022879857190034342, "loss": 7.6259, "step": 853800 }, { "epoch": 3.478600941654301, "grad_norm": 4.767160892486572, "learning_rate": 0.002287491510176778, "loss": 7.6345, "step": 853900 }, { "epoch": 3.4790083196776824, "grad_norm": 3.7685225009918213, "learning_rate": 0.0022869973099493984, "loss": 7.6477, "step": 854000 }, { "epoch": 3.4790083196776824, "eval_MaskedAccuracy": 0.507706179515219, "eval_loss": 1.5977897644042969, "eval_runtime": 173.9633, "eval_samples_per_second": 364.882, "eval_steps_per_second": 1.426, "step": 854000 }, { "epoch": 3.479415697701064, "grad_norm": 6.97108793258667, "learning_rate": 0.0022865031183408268, "loss": 7.6404, "step": 854100 }, { "epoch": 3.4798230757244455, "grad_norm": 9.563529014587402, "learning_rate": 0.0022860089353706043, "loss": 7.65, "step": 854200 }, { "epoch": 3.480230453747827, "grad_norm": 4.536949634552002, "learning_rate": 0.0022855147610582726, "loss": 7.6417, "step": 854300 }, { "epoch": 3.480637831771208, "grad_norm": 10.869682312011719, "learning_rate": 0.0022850205954233664, "loss": 7.6466, "step": 854400 }, { "epoch": 3.4810452097945896, "grad_norm": 7.783188819885254, "learning_rate": 0.002284526438485423, "loss": 7.657, "step": 854500 }, { "epoch": 3.481452587817971, "grad_norm": 5.100917339324951, "learning_rate": 0.0022840322902639818, "loss": 7.6232, "step": 854600 }, { "epoch": 3.4818599658413527, "grad_norm": 17.816843032836914, "learning_rate": 0.0022835381507785735, "loss": 7.6513, "step": 854700 }, { "epoch": 3.4822673438647342, "grad_norm": 7.975155830383301, "learning_rate": 0.002283044020048735, "loss": 7.6461, "step": 854800 }, { "epoch": 3.4826747218881158, "grad_norm": 4.69168758392334, "learning_rate": 0.002282549898094005, "loss": 7.6544, "step": 854900 }, { "epoch": 3.4830820999114973, "grad_norm": 4.364626884460449, "learning_rate": 0.002282055784933918, "loss": 7.6323, "step": 855000 }, { "epoch": 3.4830820999114973, "eval_MaskedAccuracy": 0.5072453696647916, "eval_loss": 1.6087863445281982, "eval_runtime": 157.7357, "eval_samples_per_second": 402.42, "eval_steps_per_second": 1.572, "step": 855000 }, { "epoch": 3.4834894779348784, "grad_norm": 3.039024829864502, "learning_rate": 0.00228156168058801, "loss": 7.638, "step": 855100 }, { "epoch": 3.48389685595826, "grad_norm": 3.5653367042541504, "learning_rate": 0.002281067585075812, "loss": 7.6424, "step": 855200 }, { "epoch": 3.4843042339816415, "grad_norm": 6.967785358428955, "learning_rate": 0.0022805734984168587, "loss": 7.669, "step": 855300 }, { "epoch": 3.484711612005023, "grad_norm": 4.891968250274658, "learning_rate": 0.002280079420630686, "loss": 7.6531, "step": 855400 }, { "epoch": 3.4851189900284045, "grad_norm": 4.905412673950195, "learning_rate": 0.0022795853517368248, "loss": 7.6608, "step": 855500 }, { "epoch": 3.485526368051786, "grad_norm": 3.145766019821167, "learning_rate": 0.0022790912917548117, "loss": 7.6707, "step": 855600 }, { "epoch": 3.485933746075167, "grad_norm": 9.804694175720215, "learning_rate": 0.0022785972407041795, "loss": 7.6604, "step": 855700 }, { "epoch": 3.4863411240985487, "grad_norm": 5.194327354431152, "learning_rate": 0.0022781031986044554, "loss": 7.6586, "step": 855800 }, { "epoch": 3.48674850212193, "grad_norm": 11.771062850952148, "learning_rate": 0.0022776091654751777, "loss": 7.6579, "step": 855900 }, { "epoch": 3.4871558801453117, "grad_norm": 10.029210090637207, "learning_rate": 0.002277115141335875, "loss": 7.6355, "step": 856000 }, { "epoch": 3.4871558801453117, "eval_MaskedAccuracy": 0.5075777652786438, "eval_loss": 1.608164668083191, "eval_runtime": 167.0189, "eval_samples_per_second": 380.053, "eval_steps_per_second": 1.485, "step": 856000 }, { "epoch": 3.4875632581686933, "grad_norm": 4.199847221374512, "learning_rate": 0.002276621126206079, "loss": 7.6806, "step": 856100 }, { "epoch": 3.487970636192075, "grad_norm": 4.938039779663086, "learning_rate": 0.002276127120105323, "loss": 7.6347, "step": 856200 }, { "epoch": 3.4883780142154563, "grad_norm": 4.0086350440979, "learning_rate": 0.002275633123053136, "loss": 7.662, "step": 856300 }, { "epoch": 3.4887853922388374, "grad_norm": 11.219549179077148, "learning_rate": 0.0022751391350690486, "loss": 7.6609, "step": 856400 }, { "epoch": 3.489192770262219, "grad_norm": 2.854553461074829, "learning_rate": 0.002274645156172589, "loss": 7.6425, "step": 856500 }, { "epoch": 3.4896001482856005, "grad_norm": 11.257872581481934, "learning_rate": 0.0022741511863832876, "loss": 7.6496, "step": 856600 }, { "epoch": 3.490007526308982, "grad_norm": 6.427947044372559, "learning_rate": 0.002273657225720677, "loss": 7.6464, "step": 856700 }, { "epoch": 3.4904149043323636, "grad_norm": 5.208683013916016, "learning_rate": 0.0022731632742042838, "loss": 7.6603, "step": 856800 }, { "epoch": 3.4908222823557447, "grad_norm": 10.774505615234375, "learning_rate": 0.0022726693318536346, "loss": 7.649, "step": 856900 }, { "epoch": 3.491229660379126, "grad_norm": 6.411166191101074, "learning_rate": 0.00227217539868826, "loss": 7.6436, "step": 857000 }, { "epoch": 3.491229660379126, "eval_MaskedAccuracy": 0.5076728692143582, "eval_loss": 1.6141939163208008, "eval_runtime": 199.506, "eval_samples_per_second": 318.166, "eval_steps_per_second": 1.243, "step": 857000 }, { "epoch": 3.4916370384025077, "grad_norm": 7.696373462677002, "learning_rate": 0.002271681474727689, "loss": 7.6009, "step": 857100 }, { "epoch": 3.4920444164258893, "grad_norm": 3.2003326416015625, "learning_rate": 0.0022711875599914447, "loss": 7.6482, "step": 857200 }, { "epoch": 3.492451794449271, "grad_norm": 3.957310914993286, "learning_rate": 0.0022706936544990572, "loss": 7.66, "step": 857300 }, { "epoch": 3.4928591724726523, "grad_norm": 19.03056526184082, "learning_rate": 0.0022701997582700523, "loss": 7.6458, "step": 857400 }, { "epoch": 3.493266550496034, "grad_norm": 2.931204080581665, "learning_rate": 0.0022697058713239578, "loss": 7.6854, "step": 857500 }, { "epoch": 3.493673928519415, "grad_norm": 3.9756855964660645, "learning_rate": 0.0022692119936802975, "loss": 7.657, "step": 857600 }, { "epoch": 3.4940813065427965, "grad_norm": 5.919060230255127, "learning_rate": 0.0022687181253586, "loss": 7.6684, "step": 857700 }, { "epoch": 3.494488684566178, "grad_norm": 7.748708248138428, "learning_rate": 0.0022682242663783865, "loss": 7.6627, "step": 857800 }, { "epoch": 3.4948960625895595, "grad_norm": 7.620152473449707, "learning_rate": 0.0022677304167591857, "loss": 7.6573, "step": 857900 }, { "epoch": 3.495303440612941, "grad_norm": 10.640727996826172, "learning_rate": 0.0022672365765205194, "loss": 7.658, "step": 858000 }, { "epoch": 3.495303440612941, "eval_MaskedAccuracy": 0.5079203208738784, "eval_loss": 1.6093806028366089, "eval_runtime": 155.7792, "eval_samples_per_second": 407.474, "eval_steps_per_second": 1.592, "step": 858000 }, { "epoch": 3.4957108186363226, "grad_norm": 5.147162914276123, "learning_rate": 0.002266742745681915, "loss": 7.6528, "step": 858100 }, { "epoch": 3.4961181966597037, "grad_norm": 5.744800567626953, "learning_rate": 0.0022662489242628913, "loss": 7.6804, "step": 858200 }, { "epoch": 3.4965255746830852, "grad_norm": 4.321141242980957, "learning_rate": 0.002265755112282977, "loss": 7.6806, "step": 858300 }, { "epoch": 3.4969329527064668, "grad_norm": 8.63270378112793, "learning_rate": 0.002265261309761691, "loss": 7.6707, "step": 858400 }, { "epoch": 3.4973403307298483, "grad_norm": 5.3142900466918945, "learning_rate": 0.0022647675167185577, "loss": 7.6382, "step": 858500 }, { "epoch": 3.49774770875323, "grad_norm": 5.011209011077881, "learning_rate": 0.0022642737331730993, "loss": 7.6731, "step": 858600 }, { "epoch": 3.4981550867766114, "grad_norm": 9.25820255279541, "learning_rate": 0.0022637799591448383, "loss": 7.6675, "step": 858700 }, { "epoch": 3.498562464799993, "grad_norm": 4.86635684967041, "learning_rate": 0.002263286194653295, "loss": 7.6337, "step": 858800 }, { "epoch": 3.498969842823374, "grad_norm": 5.551971912384033, "learning_rate": 0.002262792439717993, "loss": 7.6365, "step": 858900 }, { "epoch": 3.4993772208467555, "grad_norm": 16.170394897460938, "learning_rate": 0.0022622986943584534, "loss": 7.668, "step": 859000 }, { "epoch": 3.4993772208467555, "eval_MaskedAccuracy": 0.507519777077441, "eval_loss": 1.6039706468582153, "eval_runtime": 156.5045, "eval_samples_per_second": 405.586, "eval_steps_per_second": 1.585, "step": 859000 }, { "epoch": 3.499784598870137, "grad_norm": 17.170879364013672, "learning_rate": 0.0022618049585941906, "loss": 7.635, "step": 859100 }, { "epoch": 3.5001919768935186, "grad_norm": 6.251809597015381, "learning_rate": 0.002261311232444728, "loss": 7.6465, "step": 859200 }, { "epoch": 3.5005993549169, "grad_norm": 6.715796947479248, "learning_rate": 0.002260817515929586, "loss": 7.6451, "step": 859300 }, { "epoch": 3.501006732940281, "grad_norm": 4.212535381317139, "learning_rate": 0.002260323809068286, "loss": 7.6352, "step": 859400 }, { "epoch": 3.5014141109636627, "grad_norm": 5.789183139801025, "learning_rate": 0.0022598301118803435, "loss": 7.6419, "step": 859500 }, { "epoch": 3.5018214889870443, "grad_norm": 7.619360446929932, "learning_rate": 0.0022593364243852767, "loss": 7.6433, "step": 859600 }, { "epoch": 3.502228867010426, "grad_norm": 10.62928581237793, "learning_rate": 0.0022588427466026036, "loss": 7.6414, "step": 859700 }, { "epoch": 3.5026362450338073, "grad_norm": 6.69666862487793, "learning_rate": 0.002258349078551843, "loss": 7.6481, "step": 859800 }, { "epoch": 3.503043623057189, "grad_norm": 10.35774040222168, "learning_rate": 0.002257855420252513, "loss": 7.6396, "step": 859900 }, { "epoch": 3.5034510010805704, "grad_norm": 9.715471267700195, "learning_rate": 0.002257361771724131, "loss": 7.6518, "step": 860000 }, { "epoch": 3.5034510010805704, "eval_MaskedAccuracy": 0.5082721296773883, "eval_loss": 1.6053332090377808, "eval_runtime": 156.5663, "eval_samples_per_second": 405.426, "eval_steps_per_second": 1.584, "step": 860000 }, { "epoch": 3.503858379103952, "grad_norm": 6.737760066986084, "learning_rate": 0.0022568681329862136, "loss": 7.6393, "step": 860100 }, { "epoch": 3.504265757127333, "grad_norm": 12.537190437316895, "learning_rate": 0.002256374504058277, "loss": 7.6236, "step": 860200 }, { "epoch": 3.5046731351507145, "grad_norm": 6.036975860595703, "learning_rate": 0.0022558808849598324, "loss": 7.6448, "step": 860300 }, { "epoch": 3.505080513174096, "grad_norm": 7.80363130569458, "learning_rate": 0.0022553872757104, "loss": 7.6271, "step": 860400 }, { "epoch": 3.5054878911974776, "grad_norm": 4.068698406219482, "learning_rate": 0.002254893676329494, "loss": 7.6543, "step": 860500 }, { "epoch": 3.5058952692208587, "grad_norm": 10.014059066772461, "learning_rate": 0.002254400086836629, "loss": 7.6429, "step": 860600 }, { "epoch": 3.5063026472442402, "grad_norm": 8.289796829223633, "learning_rate": 0.0022539065072513173, "loss": 7.6515, "step": 860700 }, { "epoch": 3.5067100252676218, "grad_norm": 5.379649639129639, "learning_rate": 0.002253412937593076, "loss": 7.6373, "step": 860800 }, { "epoch": 3.5071174032910033, "grad_norm": 11.413267135620117, "learning_rate": 0.002252919377881416, "loss": 7.6663, "step": 860900 }, { "epoch": 3.507524781314385, "grad_norm": 8.432605743408203, "learning_rate": 0.0022524258281358556, "loss": 7.6316, "step": 861000 }, { "epoch": 3.507524781314385, "eval_MaskedAccuracy": 0.5077200990247405, "eval_loss": 1.6118197441101074, "eval_runtime": 153.7307, "eval_samples_per_second": 412.904, "eval_steps_per_second": 1.613, "step": 861000 }, { "epoch": 3.5079321593377664, "grad_norm": 6.178477764129639, "learning_rate": 0.0022519322883759003, "loss": 7.6285, "step": 861100 }, { "epoch": 3.508339537361148, "grad_norm": 6.329875469207764, "learning_rate": 0.0022514387586210637, "loss": 7.6426, "step": 861200 }, { "epoch": 3.5087469153845294, "grad_norm": 7.15531587600708, "learning_rate": 0.002250945238890861, "loss": 7.6393, "step": 861300 }, { "epoch": 3.5091542934079105, "grad_norm": 4.224843502044678, "learning_rate": 0.0022504517292048025, "loss": 7.6266, "step": 861400 }, { "epoch": 3.509561671431292, "grad_norm": 6.41402006149292, "learning_rate": 0.002249958229582398, "loss": 7.6397, "step": 861500 }, { "epoch": 3.5099690494546736, "grad_norm": 8.511131286621094, "learning_rate": 0.002249464740043158, "loss": 7.665, "step": 861600 }, { "epoch": 3.510376427478055, "grad_norm": 9.402780532836914, "learning_rate": 0.0022489712606065916, "loss": 7.6536, "step": 861700 }, { "epoch": 3.5107838055014366, "grad_norm": 6.63273811340332, "learning_rate": 0.0022484777912922106, "loss": 7.6683, "step": 861800 }, { "epoch": 3.5111911835248177, "grad_norm": 12.446701049804688, "learning_rate": 0.0022479843321195265, "loss": 7.6191, "step": 861900 }, { "epoch": 3.5115985615481993, "grad_norm": 4.364391803741455, "learning_rate": 0.002247490883108045, "loss": 7.6422, "step": 862000 }, { "epoch": 3.5115985615481993, "eval_MaskedAccuracy": 0.5081787838428641, "eval_loss": 1.6055957078933716, "eval_runtime": 160.9716, "eval_samples_per_second": 394.33, "eval_steps_per_second": 1.541, "step": 862000 }, { "epoch": 3.512005939571581, "grad_norm": 4.078834056854248, "learning_rate": 0.0022469974442772753, "loss": 7.6737, "step": 862100 }, { "epoch": 3.5124133175949623, "grad_norm": 2.7827649116516113, "learning_rate": 0.0022465040156467273, "loss": 7.6371, "step": 862200 }, { "epoch": 3.512820695618344, "grad_norm": 9.272007942199707, "learning_rate": 0.0022460105972359093, "loss": 7.6791, "step": 862300 }, { "epoch": 3.5132280736417254, "grad_norm": 11.089544296264648, "learning_rate": 0.0022455171890643252, "loss": 7.6573, "step": 862400 }, { "epoch": 3.513635451665107, "grad_norm": 4.901332855224609, "learning_rate": 0.0022450237911514847, "loss": 7.6553, "step": 862500 }, { "epoch": 3.5140428296884885, "grad_norm": 6.1682891845703125, "learning_rate": 0.0022445304035168976, "loss": 7.637, "step": 862600 }, { "epoch": 3.5144502077118696, "grad_norm": 7.853944778442383, "learning_rate": 0.0022440370261800685, "loss": 7.6601, "step": 862700 }, { "epoch": 3.514857585735251, "grad_norm": 11.519634246826172, "learning_rate": 0.002243543659160498, "loss": 7.6357, "step": 862800 }, { "epoch": 3.5152649637586326, "grad_norm": 2.628192186355591, "learning_rate": 0.0022430503024776956, "loss": 7.6635, "step": 862900 }, { "epoch": 3.515672341782014, "grad_norm": 10.383161544799805, "learning_rate": 0.002242556956151167, "loss": 7.6626, "step": 863000 }, { "epoch": 3.515672341782014, "eval_MaskedAccuracy": 0.5071665735026636, "eval_loss": 1.6087853908538818, "eval_runtime": 160.8476, "eval_samples_per_second": 394.634, "eval_steps_per_second": 1.542, "step": 863000 }, { "epoch": 3.5160797198053952, "grad_norm": 7.387240886688232, "learning_rate": 0.0022420636202004148, "loss": 7.6364, "step": 863100 }, { "epoch": 3.5164870978287768, "grad_norm": 13.875762939453125, "learning_rate": 0.0022415702946449443, "loss": 7.6802, "step": 863200 }, { "epoch": 3.5168944758521583, "grad_norm": 2.7607388496398926, "learning_rate": 0.0022410769795042592, "loss": 7.6344, "step": 863300 }, { "epoch": 3.51730185387554, "grad_norm": 9.745854377746582, "learning_rate": 0.002240583674797862, "loss": 7.6235, "step": 863400 }, { "epoch": 3.5177092318989214, "grad_norm": 3.2563323974609375, "learning_rate": 0.002240090380545254, "loss": 7.6139, "step": 863500 }, { "epoch": 3.518116609922303, "grad_norm": 3.583744764328003, "learning_rate": 0.002239597096765943, "loss": 7.6296, "step": 863600 }, { "epoch": 3.5185239879456844, "grad_norm": 7.513589859008789, "learning_rate": 0.002239103823479425, "loss": 7.6522, "step": 863700 }, { "epoch": 3.518931365969066, "grad_norm": 7.896854877471924, "learning_rate": 0.0022386105607052078, "loss": 7.6664, "step": 863800 }, { "epoch": 3.519338743992447, "grad_norm": 6.768326282501221, "learning_rate": 0.0022381173084627883, "loss": 7.6193, "step": 863900 }, { "epoch": 3.5197461220158286, "grad_norm": 7.823085784912109, "learning_rate": 0.0022376240667716673, "loss": 7.6272, "step": 864000 }, { "epoch": 3.5197461220158286, "eval_MaskedAccuracy": 0.5073980986652228, "eval_loss": 1.6157567501068115, "eval_runtime": 156.696, "eval_samples_per_second": 405.09, "eval_steps_per_second": 1.583, "step": 864000 }, { "epoch": 3.52015350003921, "grad_norm": 10.738860130310059, "learning_rate": 0.0022371308356513527, "loss": 7.6474, "step": 864100 }, { "epoch": 3.5205608780625917, "grad_norm": 5.488829612731934, "learning_rate": 0.0022366376151213346, "loss": 7.6362, "step": 864200 }, { "epoch": 3.520968256085973, "grad_norm": 9.830921173095703, "learning_rate": 0.0022361444052011186, "loss": 7.6252, "step": 864300 }, { "epoch": 3.5213756341093543, "grad_norm": 4.068374156951904, "learning_rate": 0.0022356512059102025, "loss": 7.6508, "step": 864400 }, { "epoch": 3.521783012132736, "grad_norm": 8.725272178649902, "learning_rate": 0.002235158017268082, "loss": 7.6446, "step": 864500 }, { "epoch": 3.5221903901561173, "grad_norm": 5.265230178833008, "learning_rate": 0.0022346648392942603, "loss": 7.6805, "step": 864600 }, { "epoch": 3.522597768179499, "grad_norm": 10.594454765319824, "learning_rate": 0.0022341716720082334, "loss": 7.6503, "step": 864700 }, { "epoch": 3.5230051462028804, "grad_norm": 4.110137462615967, "learning_rate": 0.0022336785154294953, "loss": 7.5997, "step": 864800 }, { "epoch": 3.523412524226262, "grad_norm": 7.475287437438965, "learning_rate": 0.0022331853695775504, "loss": 7.6551, "step": 864900 }, { "epoch": 3.5238199022496435, "grad_norm": 10.34378433227539, "learning_rate": 0.00223269223447189, "loss": 7.6492, "step": 865000 }, { "epoch": 3.5238199022496435, "eval_MaskedAccuracy": 0.5075955693325948, "eval_loss": 1.6088305711746216, "eval_runtime": 156.3533, "eval_samples_per_second": 405.978, "eval_steps_per_second": 1.586, "step": 865000 }, { "epoch": 3.524227280273025, "grad_norm": 4.775081634521484, "learning_rate": 0.0022321991101320166, "loss": 7.6302, "step": 865100 }, { "epoch": 3.524634658296406, "grad_norm": 3.5852935314178467, "learning_rate": 0.002231705996577416, "loss": 7.6434, "step": 865200 }, { "epoch": 3.5250420363197876, "grad_norm": 7.204189777374268, "learning_rate": 0.0022312128938275907, "loss": 7.6231, "step": 865300 }, { "epoch": 3.525449414343169, "grad_norm": 3.662602424621582, "learning_rate": 0.0022307198019020313, "loss": 7.6321, "step": 865400 }, { "epoch": 3.5258567923665507, "grad_norm": 12.092575073242188, "learning_rate": 0.002230226720820236, "loss": 7.6488, "step": 865500 }, { "epoch": 3.526264170389932, "grad_norm": 5.39505672454834, "learning_rate": 0.0022297336506016986, "loss": 7.6587, "step": 865600 }, { "epoch": 3.5266715484133133, "grad_norm": 3.886173725128174, "learning_rate": 0.0022292405912659116, "loss": 7.6305, "step": 865700 }, { "epoch": 3.527078926436695, "grad_norm": 7.400486469268799, "learning_rate": 0.002228747542832371, "loss": 7.6279, "step": 865800 }, { "epoch": 3.5274863044600764, "grad_norm": 3.2266361713409424, "learning_rate": 0.0022282545053205647, "loss": 7.6445, "step": 865900 }, { "epoch": 3.527893682483458, "grad_norm": 6.4646406173706055, "learning_rate": 0.0022277614787499906, "loss": 7.6614, "step": 866000 }, { "epoch": 3.527893682483458, "eval_MaskedAccuracy": 0.508327164619293, "eval_loss": 1.6072479486465454, "eval_runtime": 178.343, "eval_samples_per_second": 355.921, "eval_steps_per_second": 1.391, "step": 866000 }, { "epoch": 3.5283010605068394, "grad_norm": 5.047513008117676, "learning_rate": 0.0022272684631401383, "loss": 7.6268, "step": 866100 }, { "epoch": 3.528708438530221, "grad_norm": 3.4761483669281006, "learning_rate": 0.002226775458510497, "loss": 7.6494, "step": 866200 }, { "epoch": 3.5291158165536025, "grad_norm": 9.057031631469727, "learning_rate": 0.0022262824648805607, "loss": 7.6314, "step": 866300 }, { "epoch": 3.5295231945769836, "grad_norm": 5.275428771972656, "learning_rate": 0.0022257894822698206, "loss": 7.6535, "step": 866400 }, { "epoch": 3.529930572600365, "grad_norm": 8.792091369628906, "learning_rate": 0.002225296510697767, "loss": 7.6516, "step": 866500 }, { "epoch": 3.5303379506237467, "grad_norm": 10.201578140258789, "learning_rate": 0.0022248035501838837, "loss": 7.6221, "step": 866600 }, { "epoch": 3.530745328647128, "grad_norm": 10.079939842224121, "learning_rate": 0.0022243106007476658, "loss": 7.6352, "step": 866700 }, { "epoch": 3.5311527066705097, "grad_norm": 3.5065391063690186, "learning_rate": 0.0022238176624086023, "loss": 7.637, "step": 866800 }, { "epoch": 3.531560084693891, "grad_norm": 3.66342830657959, "learning_rate": 0.002223324735186181, "loss": 7.6384, "step": 866900 }, { "epoch": 3.5319674627172724, "grad_norm": 4.13564395904541, "learning_rate": 0.0022228318190998897, "loss": 7.6674, "step": 867000 }, { "epoch": 3.5319674627172724, "eval_MaskedAccuracy": 0.5078339605061698, "eval_loss": 1.6016442775726318, "eval_runtime": 167.2502, "eval_samples_per_second": 379.527, "eval_steps_per_second": 1.483, "step": 867000 }, { "epoch": 3.532374840740654, "grad_norm": 9.793412208557129, "learning_rate": 0.002222338914169216, "loss": 7.6937, "step": 867100 }, { "epoch": 3.5327822187640354, "grad_norm": 6.829715728759766, "learning_rate": 0.002221846020413645, "loss": 7.6395, "step": 867200 }, { "epoch": 3.533189596787417, "grad_norm": 4.2996506690979, "learning_rate": 0.0022213531378526656, "loss": 7.6595, "step": 867300 }, { "epoch": 3.5335969748107985, "grad_norm": 4.399179458618164, "learning_rate": 0.0022208602665057652, "loss": 7.6601, "step": 867400 }, { "epoch": 3.53400435283418, "grad_norm": 9.048361778259277, "learning_rate": 0.0022203674063924284, "loss": 7.6578, "step": 867500 }, { "epoch": 3.5344117308575616, "grad_norm": 5.1573567390441895, "learning_rate": 0.0022198745575321395, "loss": 7.646, "step": 867600 }, { "epoch": 3.5348191088809426, "grad_norm": 10.374008178710938, "learning_rate": 0.002219381719944383, "loss": 7.6632, "step": 867700 }, { "epoch": 3.535226486904324, "grad_norm": 4.753935813903809, "learning_rate": 0.002218888893648646, "loss": 7.6326, "step": 867800 }, { "epoch": 3.5356338649277057, "grad_norm": 3.602890968322754, "learning_rate": 0.002218396078664412, "loss": 7.655, "step": 867900 }, { "epoch": 3.5360412429510872, "grad_norm": 4.311542510986328, "learning_rate": 0.002217903275011164, "loss": 7.6387, "step": 868000 }, { "epoch": 3.5360412429510872, "eval_MaskedAccuracy": 0.5078075448271019, "eval_loss": 1.6154001951217651, "eval_runtime": 193.8666, "eval_samples_per_second": 327.421, "eval_steps_per_second": 1.279, "step": 868000 }, { "epoch": 3.5364486209744683, "grad_norm": 9.806257247924805, "learning_rate": 0.002217410482708384, "loss": 7.6441, "step": 868100 }, { "epoch": 3.53685599899785, "grad_norm": 7.351921081542969, "learning_rate": 0.002216917701775556, "loss": 7.6496, "step": 868200 }, { "epoch": 3.5372633770212314, "grad_norm": 3.7131845951080322, "learning_rate": 0.00221642493223216, "loss": 7.6651, "step": 868300 }, { "epoch": 3.537670755044613, "grad_norm": 8.0667724609375, "learning_rate": 0.0022159321740976828, "loss": 7.6045, "step": 868400 }, { "epoch": 3.5380781330679945, "grad_norm": 6.803450584411621, "learning_rate": 0.0022154394273916004, "loss": 7.6298, "step": 868500 }, { "epoch": 3.538485511091376, "grad_norm": 6.105456829071045, "learning_rate": 0.002214946692133398, "loss": 7.6561, "step": 868600 }, { "epoch": 3.5388928891147575, "grad_norm": 5.054286956787109, "learning_rate": 0.002214453968342553, "loss": 7.6309, "step": 868700 }, { "epoch": 3.539300267138139, "grad_norm": 5.653079032897949, "learning_rate": 0.0022139612560385484, "loss": 7.6118, "step": 868800 }, { "epoch": 3.53970764516152, "grad_norm": 8.30525016784668, "learning_rate": 0.0022134685552408612, "loss": 7.6569, "step": 868900 }, { "epoch": 3.5401150231849017, "grad_norm": 7.925002098083496, "learning_rate": 0.002212975865968968, "loss": 7.6269, "step": 869000 }, { "epoch": 3.5401150231849017, "eval_MaskedAccuracy": 0.5070655657190831, "eval_loss": 1.6203315258026123, "eval_runtime": 407.0913, "eval_samples_per_second": 155.926, "eval_steps_per_second": 0.609, "step": 869000 }, { "epoch": 3.540522401208283, "grad_norm": 5.564241886138916, "learning_rate": 0.0022124831882423535, "loss": 7.6465, "step": 869100 }, { "epoch": 3.5409297792316647, "grad_norm": 8.257840156555176, "learning_rate": 0.0022119905220804935, "loss": 7.6475, "step": 869200 }, { "epoch": 3.5413371572550463, "grad_norm": 6.186727046966553, "learning_rate": 0.0022114978675028636, "loss": 7.6533, "step": 869300 }, { "epoch": 3.5417445352784274, "grad_norm": 8.564448356628418, "learning_rate": 0.0022110052245289434, "loss": 7.6514, "step": 869400 }, { "epoch": 3.542151913301809, "grad_norm": 4.951117992401123, "learning_rate": 0.002210512593178211, "loss": 7.6313, "step": 869500 }, { "epoch": 3.5425592913251904, "grad_norm": 6.333032131195068, "learning_rate": 0.00221001997347014, "loss": 7.6275, "step": 869600 }, { "epoch": 3.542966669348572, "grad_norm": 5.469081878662109, "learning_rate": 0.0022095273654242063, "loss": 7.6638, "step": 869700 }, { "epoch": 3.5433740473719535, "grad_norm": 2.768857717514038, "learning_rate": 0.0022090347690598856, "loss": 7.647, "step": 869800 }, { "epoch": 3.543781425395335, "grad_norm": 11.990005493164062, "learning_rate": 0.0022085421843966533, "loss": 7.6299, "step": 869900 }, { "epoch": 3.5441888034187166, "grad_norm": 5.885626316070557, "learning_rate": 0.0022080496114539865, "loss": 7.6763, "step": 870000 }, { "epoch": 3.5441888034187166, "eval_MaskedAccuracy": 0.5077186503853933, "eval_loss": 1.6030515432357788, "eval_runtime": 186.6037, "eval_samples_per_second": 340.165, "eval_steps_per_second": 1.329, "step": 870000 }, { "epoch": 3.544596181442098, "grad_norm": 4.5661115646362305, "learning_rate": 0.0022075570502513553, "loss": 7.6813, "step": 870100 }, { "epoch": 3.545003559465479, "grad_norm": 9.719332695007324, "learning_rate": 0.002207064500808236, "loss": 7.6504, "step": 870200 }, { "epoch": 3.5454109374888607, "grad_norm": 2.993973731994629, "learning_rate": 0.0022065719631440975, "loss": 7.6145, "step": 870300 }, { "epoch": 3.5458183155122422, "grad_norm": 8.430723190307617, "learning_rate": 0.0022060794372784145, "loss": 7.6143, "step": 870400 }, { "epoch": 3.546225693535624, "grad_norm": 11.184769630432129, "learning_rate": 0.0022055869232306628, "loss": 7.6144, "step": 870500 }, { "epoch": 3.546633071559005, "grad_norm": 2.9587326049804688, "learning_rate": 0.002205094421020311, "loss": 7.6254, "step": 870600 }, { "epoch": 3.5470404495823864, "grad_norm": 8.092852592468262, "learning_rate": 0.002204601930666831, "loss": 7.6361, "step": 870700 }, { "epoch": 3.547447827605768, "grad_norm": 9.773558616638184, "learning_rate": 0.0022041094521896933, "loss": 7.6525, "step": 870800 }, { "epoch": 3.5478552056291495, "grad_norm": 5.076406478881836, "learning_rate": 0.0022036169856083695, "loss": 7.6596, "step": 870900 }, { "epoch": 3.548262583652531, "grad_norm": 5.9360785484313965, "learning_rate": 0.002203124530942327, "loss": 7.6561, "step": 871000 }, { "epoch": 3.548262583652531, "eval_MaskedAccuracy": 0.5077936875617609, "eval_loss": 1.6093236207962036, "eval_runtime": 213.3314, "eval_samples_per_second": 297.547, "eval_steps_per_second": 1.163, "step": 871000 }, { "epoch": 3.5486699616759125, "grad_norm": 5.750765323638916, "learning_rate": 0.0022026320882110348, "loss": 7.6483, "step": 871100 }, { "epoch": 3.549077339699294, "grad_norm": 6.0726470947265625, "learning_rate": 0.0022021396574339646, "loss": 7.6499, "step": 871200 }, { "epoch": 3.5494847177226756, "grad_norm": 5.044371128082275, "learning_rate": 0.0022016472386305825, "loss": 7.6338, "step": 871300 }, { "epoch": 3.5498920957460567, "grad_norm": 7.558262825012207, "learning_rate": 0.0022011548318203616, "loss": 7.6243, "step": 871400 }, { "epoch": 3.550299473769438, "grad_norm": 4.824985980987549, "learning_rate": 0.0022006624370227612, "loss": 7.6515, "step": 871500 }, { "epoch": 3.5507068517928198, "grad_norm": 3.010861396789551, "learning_rate": 0.002200170054257255, "loss": 7.6246, "step": 871600 }, { "epoch": 3.5511142298162013, "grad_norm": 4.284688949584961, "learning_rate": 0.002199677683543306, "loss": 7.6511, "step": 871700 }, { "epoch": 3.551521607839583, "grad_norm": 2.767360210418701, "learning_rate": 0.0021991853249003807, "loss": 7.6695, "step": 871800 }, { "epoch": 3.551928985862964, "grad_norm": 7.54105806350708, "learning_rate": 0.0021986929783479455, "loss": 7.6632, "step": 871900 }, { "epoch": 3.5523363638863454, "grad_norm": 6.911463737487793, "learning_rate": 0.002198200643905462, "loss": 7.654, "step": 872000 }, { "epoch": 3.5523363638863454, "eval_MaskedAccuracy": 0.5081919099463476, "eval_loss": 1.605948805809021, "eval_runtime": 322.012, "eval_samples_per_second": 197.123, "eval_steps_per_second": 0.77, "step": 872000 }, { "epoch": 3.552743741909727, "grad_norm": 4.053170204162598, "learning_rate": 0.0021977083215924008, "loss": 7.657, "step": 872100 }, { "epoch": 3.5531511199331085, "grad_norm": 11.83296012878418, "learning_rate": 0.002197216011428222, "loss": 7.6329, "step": 872200 }, { "epoch": 3.55355849795649, "grad_norm": 4.449741840362549, "learning_rate": 0.002196723713432391, "loss": 7.6774, "step": 872300 }, { "epoch": 3.5539658759798716, "grad_norm": 6.159327507019043, "learning_rate": 0.0021962314276243694, "loss": 7.6271, "step": 872400 }, { "epoch": 3.554373254003253, "grad_norm": 9.53366470336914, "learning_rate": 0.0021957391540236232, "loss": 7.6257, "step": 872500 }, { "epoch": 3.5547806320266346, "grad_norm": 4.169156551361084, "learning_rate": 0.002195246892649613, "loss": 7.6404, "step": 872600 }, { "epoch": 3.5551880100500157, "grad_norm": 6.204394340515137, "learning_rate": 0.0021947546435217968, "loss": 7.65, "step": 872700 }, { "epoch": 3.5555953880733973, "grad_norm": 4.08635950088501, "learning_rate": 0.002194262406659641, "loss": 7.6369, "step": 872800 }, { "epoch": 3.556002766096779, "grad_norm": 6.075716018676758, "learning_rate": 0.002193770182082606, "loss": 7.636, "step": 872900 }, { "epoch": 3.5564101441201603, "grad_norm": 8.030529975891113, "learning_rate": 0.002193277969810146, "loss": 7.6425, "step": 873000 }, { "epoch": 3.5564101441201603, "eval_MaskedAccuracy": 0.507682783853077, "eval_loss": 1.6131370067596436, "eval_runtime": 265.9635, "eval_samples_per_second": 238.664, "eval_steps_per_second": 0.932, "step": 873000 }, { "epoch": 3.5568175221435414, "grad_norm": 5.904312610626221, "learning_rate": 0.0021927857698617265, "loss": 7.6403, "step": 873100 }, { "epoch": 3.557224900166923, "grad_norm": 5.031678199768066, "learning_rate": 0.002192293582256804, "loss": 7.6359, "step": 873200 }, { "epoch": 3.5576322781903045, "grad_norm": 5.363577842712402, "learning_rate": 0.0021918014070148413, "loss": 7.6331, "step": 873300 }, { "epoch": 3.558039656213686, "grad_norm": 7.076882362365723, "learning_rate": 0.002191309244155291, "loss": 7.6238, "step": 873400 }, { "epoch": 3.5584470342370675, "grad_norm": 7.54783821105957, "learning_rate": 0.0021908170936976164, "loss": 7.6256, "step": 873500 }, { "epoch": 3.558854412260449, "grad_norm": 4.612677574157715, "learning_rate": 0.0021903249556612706, "loss": 7.6591, "step": 873600 }, { "epoch": 3.5592617902838306, "grad_norm": 2.7689554691314697, "learning_rate": 0.0021898328300657102, "loss": 7.6651, "step": 873700 }, { "epoch": 3.559669168307212, "grad_norm": 8.302124977111816, "learning_rate": 0.002189340716930396, "loss": 7.6497, "step": 873800 }, { "epoch": 3.5600765463305932, "grad_norm": 4.4748125076293945, "learning_rate": 0.0021888486162747836, "loss": 7.6327, "step": 873900 }, { "epoch": 3.5604839243539748, "grad_norm": 7.404576778411865, "learning_rate": 0.0021883565281183236, "loss": 7.6806, "step": 874000 }, { "epoch": 3.5604839243539748, "eval_MaskedAccuracy": 0.5083195085567322, "eval_loss": 1.6122350692749023, "eval_runtime": 171.3922, "eval_samples_per_second": 370.355, "eval_steps_per_second": 1.447, "step": 874000 }, { "epoch": 3.5608913023773563, "grad_norm": 4.407498359680176, "learning_rate": 0.0021878644524804727, "loss": 7.6571, "step": 874100 }, { "epoch": 3.561298680400738, "grad_norm": 14.454693794250488, "learning_rate": 0.0021873723893806878, "loss": 7.6546, "step": 874200 }, { "epoch": 3.5617060584241194, "grad_norm": 3.9932785034179688, "learning_rate": 0.002186880338838423, "loss": 7.6362, "step": 874300 }, { "epoch": 3.5621134364475004, "grad_norm": 7.53162145614624, "learning_rate": 0.002186388300873129, "loss": 7.6297, "step": 874400 }, { "epoch": 3.562520814470882, "grad_norm": 4.9030890464782715, "learning_rate": 0.0021858962755042614, "loss": 7.645, "step": 874500 }, { "epoch": 3.5629281924942635, "grad_norm": 4.759460926055908, "learning_rate": 0.0021854042627512697, "loss": 7.6694, "step": 874600 }, { "epoch": 3.563335570517645, "grad_norm": 5.22857141494751, "learning_rate": 0.0021849122626336043, "loss": 7.6052, "step": 874700 }, { "epoch": 3.5637429485410266, "grad_norm": 4.2129807472229, "learning_rate": 0.0021844202751707207, "loss": 7.6232, "step": 874800 }, { "epoch": 3.564150326564408, "grad_norm": 4.902995586395264, "learning_rate": 0.002183928300382066, "loss": 7.6013, "step": 874900 }, { "epoch": 3.5645577045877896, "grad_norm": 11.46718692779541, "learning_rate": 0.0021834363382870945, "loss": 7.6285, "step": 875000 }, { "epoch": 3.5645577045877896, "eval_MaskedAccuracy": 0.5080486656609293, "eval_loss": 1.608930230140686, "eval_runtime": 181.7519, "eval_samples_per_second": 349.245, "eval_steps_per_second": 1.364, "step": 875000 }, { "epoch": 3.564965082611171, "grad_norm": 5.476961612701416, "learning_rate": 0.0021829443889052535, "loss": 7.6636, "step": 875100 }, { "epoch": 3.5653724606345523, "grad_norm": 5.607533931732178, "learning_rate": 0.002182452452255995, "loss": 7.6408, "step": 875200 }, { "epoch": 3.565779838657934, "grad_norm": 4.789079666137695, "learning_rate": 0.0021819605283587653, "loss": 7.6229, "step": 875300 }, { "epoch": 3.5661872166813153, "grad_norm": 8.760357856750488, "learning_rate": 0.0021814686172330136, "loss": 7.6536, "step": 875400 }, { "epoch": 3.566594594704697, "grad_norm": 8.119916915893555, "learning_rate": 0.0021809767188981865, "loss": 7.6421, "step": 875500 }, { "epoch": 3.567001972728078, "grad_norm": 7.076836585998535, "learning_rate": 0.0021804848333737367, "loss": 7.6652, "step": 875600 }, { "epoch": 3.5674093507514595, "grad_norm": 6.827029705047607, "learning_rate": 0.0021799929606791044, "loss": 7.6398, "step": 875700 }, { "epoch": 3.567816728774841, "grad_norm": 5.244140625, "learning_rate": 0.0021795011008337393, "loss": 7.6074, "step": 875800 }, { "epoch": 3.5682241067982226, "grad_norm": 4.503063201904297, "learning_rate": 0.0021790092538570864, "loss": 7.6649, "step": 875900 }, { "epoch": 3.568631484821604, "grad_norm": 4.182742118835449, "learning_rate": 0.002178517419768591, "loss": 7.6101, "step": 876000 }, { "epoch": 3.568631484821604, "eval_MaskedAccuracy": 0.5082813732347208, "eval_loss": 1.6111879348754883, "eval_runtime": 170.1251, "eval_samples_per_second": 373.114, "eval_steps_per_second": 1.458, "step": 876000 }, { "epoch": 3.5690388628449856, "grad_norm": 8.675579071044922, "learning_rate": 0.0021780255985876995, "loss": 7.6186, "step": 876100 }, { "epoch": 3.569446240868367, "grad_norm": 2.9475040435791016, "learning_rate": 0.0021775337903338545, "loss": 7.6093, "step": 876200 }, { "epoch": 3.5698536188917487, "grad_norm": 10.103228569030762, "learning_rate": 0.0021770419950265, "loss": 7.6757, "step": 876300 }, { "epoch": 3.5702609969151298, "grad_norm": 6.754882335662842, "learning_rate": 0.002176550212685081, "loss": 7.6479, "step": 876400 }, { "epoch": 3.5706683749385113, "grad_norm": 5.378737926483154, "learning_rate": 0.002176058443329037, "loss": 7.6296, "step": 876500 }, { "epoch": 3.571075752961893, "grad_norm": 4.21753454208374, "learning_rate": 0.0021755666869778113, "loss": 7.6436, "step": 876600 }, { "epoch": 3.5714831309852744, "grad_norm": 7.464591979980469, "learning_rate": 0.0021750749436508453, "loss": 7.6198, "step": 876700 }, { "epoch": 3.571890509008656, "grad_norm": 3.5655879974365234, "learning_rate": 0.0021745832133675843, "loss": 7.6177, "step": 876800 }, { "epoch": 3.572297887032037, "grad_norm": 3.804499864578247, "learning_rate": 0.002174091496147467, "loss": 7.6452, "step": 876900 }, { "epoch": 3.5727052650554185, "grad_norm": 3.8965039253234863, "learning_rate": 0.002173599792009929, "loss": 7.6433, "step": 877000 }, { "epoch": 3.5727052650554185, "eval_MaskedAccuracy": 0.5077628099448271, "eval_loss": 1.615176796913147, "eval_runtime": 171.6107, "eval_samples_per_second": 369.884, "eval_steps_per_second": 1.445, "step": 877000 }, { "epoch": 3.5731126430788, "grad_norm": 7.956528663635254, "learning_rate": 0.002173108100974414, "loss": 7.6361, "step": 877100 }, { "epoch": 3.5735200211021816, "grad_norm": 7.382539749145508, "learning_rate": 0.002172616423060361, "loss": 7.604, "step": 877200 }, { "epoch": 3.573927399125563, "grad_norm": 12.62801742553711, "learning_rate": 0.0021721247582872096, "loss": 7.6364, "step": 877300 }, { "epoch": 3.5743347771489447, "grad_norm": 6.42581033706665, "learning_rate": 0.0021716331066743964, "loss": 7.6594, "step": 877400 }, { "epoch": 3.574742155172326, "grad_norm": 10.753722190856934, "learning_rate": 0.002171141468241361, "loss": 7.6234, "step": 877500 }, { "epoch": 3.5751495331957077, "grad_norm": 8.056370735168457, "learning_rate": 0.0021706498430075346, "loss": 7.6749, "step": 877600 }, { "epoch": 3.575556911219089, "grad_norm": 13.071642875671387, "learning_rate": 0.0021701582309923604, "loss": 7.6608, "step": 877700 }, { "epoch": 3.5759642892424703, "grad_norm": 7.352053165435791, "learning_rate": 0.002169666632215273, "loss": 7.6386, "step": 877800 }, { "epoch": 3.576371667265852, "grad_norm": 7.554142951965332, "learning_rate": 0.0021691750466957056, "loss": 7.6175, "step": 877900 }, { "epoch": 3.5767790452892334, "grad_norm": 6.251834392547607, "learning_rate": 0.0021686834744530925, "loss": 7.6542, "step": 878000 }, { "epoch": 3.5767790452892334, "eval_MaskedAccuracy": 0.5077756948399319, "eval_loss": 1.6095921993255615, "eval_runtime": 439.2866, "eval_samples_per_second": 144.498, "eval_steps_per_second": 0.565, "step": 878000 }, { "epoch": 3.5771864233126145, "grad_norm": 6.588141441345215, "learning_rate": 0.0021681919155068733, "loss": 7.647, "step": 878100 }, { "epoch": 3.577593801335996, "grad_norm": 6.322723388671875, "learning_rate": 0.002167700369876481, "loss": 7.6746, "step": 878200 }, { "epoch": 3.5780011793593776, "grad_norm": 6.316498756408691, "learning_rate": 0.002167208837581346, "loss": 7.6256, "step": 878300 }, { "epoch": 3.578408557382759, "grad_norm": 5.972914695739746, "learning_rate": 0.002166717318640904, "loss": 7.6369, "step": 878400 }, { "epoch": 3.5788159354061406, "grad_norm": 8.659676551818848, "learning_rate": 0.0021662258130745854, "loss": 7.5971, "step": 878500 }, { "epoch": 3.579223313429522, "grad_norm": 10.309601783752441, "learning_rate": 0.002165734320901821, "loss": 7.6087, "step": 878600 }, { "epoch": 3.5796306914529037, "grad_norm": 8.766733169555664, "learning_rate": 0.002165242842142042, "loss": 7.628, "step": 878700 }, { "epoch": 3.5800380694762852, "grad_norm": 6.971269607543945, "learning_rate": 0.002164751376814684, "loss": 7.6364, "step": 878800 }, { "epoch": 3.5804454474996663, "grad_norm": 17.327312469482422, "learning_rate": 0.002164259924939174, "loss": 7.6125, "step": 878900 }, { "epoch": 3.580852825523048, "grad_norm": 3.636362075805664, "learning_rate": 0.0021637684865349394, "loss": 7.6401, "step": 879000 }, { "epoch": 3.580852825523048, "eval_MaskedAccuracy": 0.5087040819822556, "eval_loss": 1.605638861656189, "eval_runtime": 171.5638, "eval_samples_per_second": 369.985, "eval_steps_per_second": 1.446, "step": 879000 }, { "epoch": 3.5812602035464294, "grad_norm": 3.375300168991089, "learning_rate": 0.002163277061621411, "loss": 7.644, "step": 879100 }, { "epoch": 3.581667581569811, "grad_norm": 7.568451881408691, "learning_rate": 0.0021627856502180174, "loss": 7.6559, "step": 879200 }, { "epoch": 3.5820749595931924, "grad_norm": 6.051039218902588, "learning_rate": 0.00216229425234419, "loss": 7.6583, "step": 879300 }, { "epoch": 3.5824823376165735, "grad_norm": 3.77250599861145, "learning_rate": 0.002161802868019353, "loss": 7.6349, "step": 879400 }, { "epoch": 3.582889715639955, "grad_norm": 7.995368003845215, "learning_rate": 0.0021613114972629343, "loss": 7.643, "step": 879500 }, { "epoch": 3.5832970936633366, "grad_norm": 9.187032699584961, "learning_rate": 0.0021608201400943597, "loss": 7.6183, "step": 879600 }, { "epoch": 3.583704471686718, "grad_norm": 4.510272026062012, "learning_rate": 0.002160328796533055, "loss": 7.6557, "step": 879700 }, { "epoch": 3.5841118497100997, "grad_norm": 5.87296724319458, "learning_rate": 0.0021598374665984465, "loss": 7.6431, "step": 879800 }, { "epoch": 3.584519227733481, "grad_norm": 15.391018867492676, "learning_rate": 0.002159346150309961, "loss": 7.6412, "step": 879900 }, { "epoch": 3.5849266057568627, "grad_norm": 5.5155110359191895, "learning_rate": 0.0021588548476870182, "loss": 7.6294, "step": 880000 }, { "epoch": 3.5849266057568627, "eval_MaskedAccuracy": 0.5079862422568046, "eval_loss": 1.6146249771118164, "eval_runtime": 170.5121, "eval_samples_per_second": 372.267, "eval_steps_per_second": 1.454, "step": 880000 }, { "epoch": 3.5853339837802443, "grad_norm": 16.883499145507812, "learning_rate": 0.002158363558749045, "loss": 7.6364, "step": 880100 }, { "epoch": 3.5857413618036253, "grad_norm": 6.272616386413574, "learning_rate": 0.0021578722835154654, "loss": 7.6152, "step": 880200 }, { "epoch": 3.586148739827007, "grad_norm": 10.381025314331055, "learning_rate": 0.002157381022005699, "loss": 7.6318, "step": 880300 }, { "epoch": 3.5865561178503884, "grad_norm": 11.820782661437988, "learning_rate": 0.002156889774239174, "loss": 7.6454, "step": 880400 }, { "epoch": 3.58696349587377, "grad_norm": 6.760218143463135, "learning_rate": 0.002156398540235307, "loss": 7.6618, "step": 880500 }, { "epoch": 3.587370873897151, "grad_norm": 4.550642967224121, "learning_rate": 0.0021559073200135195, "loss": 7.6232, "step": 880600 }, { "epoch": 3.5877782519205326, "grad_norm": 4.033792495727539, "learning_rate": 0.00215541611359323, "loss": 7.6453, "step": 880700 }, { "epoch": 3.588185629943914, "grad_norm": 4.415927410125732, "learning_rate": 0.002154924920993864, "loss": 7.6513, "step": 880800 }, { "epoch": 3.5885930079672956, "grad_norm": 8.303081512451172, "learning_rate": 0.0021544337422348373, "loss": 7.6258, "step": 880900 }, { "epoch": 3.589000385990677, "grad_norm": 7.362762451171875, "learning_rate": 0.0021539425773355676, "loss": 7.643, "step": 881000 }, { "epoch": 3.589000385990677, "eval_MaskedAccuracy": 0.5081290455190871, "eval_loss": 1.6080726385116577, "eval_runtime": 179.4765, "eval_samples_per_second": 353.673, "eval_steps_per_second": 1.382, "step": 881000 }, { "epoch": 3.5894077640140587, "grad_norm": 12.051417350769043, "learning_rate": 0.0021534514263154766, "loss": 7.6196, "step": 881100 }, { "epoch": 3.5898151420374402, "grad_norm": 8.640392303466797, "learning_rate": 0.002152960289193983, "loss": 7.6462, "step": 881200 }, { "epoch": 3.5902225200608218, "grad_norm": 5.608726501464844, "learning_rate": 0.0021524691659905, "loss": 7.6224, "step": 881300 }, { "epoch": 3.590629898084203, "grad_norm": 5.6090497970581055, "learning_rate": 0.0021519780567244465, "loss": 7.6475, "step": 881400 }, { "epoch": 3.5910372761075844, "grad_norm": 7.759212493896484, "learning_rate": 0.002151486961415241, "loss": 7.6184, "step": 881500 }, { "epoch": 3.591444654130966, "grad_norm": 10.882990837097168, "learning_rate": 0.0021509958800822945, "loss": 7.6673, "step": 881600 }, { "epoch": 3.5918520321543475, "grad_norm": 6.636781215667725, "learning_rate": 0.002150504812745023, "loss": 7.6569, "step": 881700 }, { "epoch": 3.592259410177729, "grad_norm": 6.783194065093994, "learning_rate": 0.0021500137594228446, "loss": 7.6366, "step": 881800 }, { "epoch": 3.59266678820111, "grad_norm": 6.8873515129089355, "learning_rate": 0.002149522720135168, "loss": 7.6504, "step": 881900 }, { "epoch": 3.5930741662244916, "grad_norm": 9.64719295501709, "learning_rate": 0.0021490316949014108, "loss": 7.6463, "step": 882000 }, { "epoch": 3.5930741662244916, "eval_MaskedAccuracy": 0.507845655953523, "eval_loss": 1.6132745742797852, "eval_runtime": 195.9464, "eval_samples_per_second": 323.946, "eval_steps_per_second": 1.266, "step": 882000 }, { "epoch": 3.593481544247873, "grad_norm": 3.836606979370117, "learning_rate": 0.0021485406837409827, "loss": 7.6709, "step": 882100 }, { "epoch": 3.5938889222712547, "grad_norm": 8.204010963439941, "learning_rate": 0.0021480496866733, "loss": 7.6298, "step": 882200 }, { "epoch": 3.594296300294636, "grad_norm": 3.425236701965332, "learning_rate": 0.0021475587037177732, "loss": 7.6375, "step": 882300 }, { "epoch": 3.5947036783180177, "grad_norm": 5.571497440338135, "learning_rate": 0.002147067734893812, "loss": 7.6211, "step": 882400 }, { "epoch": 3.5951110563413993, "grad_norm": 8.662858963012695, "learning_rate": 0.002146576780220827, "loss": 7.644, "step": 882500 }, { "epoch": 3.595518434364781, "grad_norm": 12.79269027709961, "learning_rate": 0.002146085839718229, "loss": 7.6186, "step": 882600 }, { "epoch": 3.595925812388162, "grad_norm": 8.456405639648438, "learning_rate": 0.002145594913405426, "loss": 7.6344, "step": 882700 }, { "epoch": 3.5963331904115434, "grad_norm": 2.7036311626434326, "learning_rate": 0.0021451040013018307, "loss": 7.6566, "step": 882800 }, { "epoch": 3.596740568434925, "grad_norm": 10.803698539733887, "learning_rate": 0.0021446131034268496, "loss": 7.6465, "step": 882900 }, { "epoch": 3.5971479464583065, "grad_norm": 13.076610565185547, "learning_rate": 0.002144122219799888, "loss": 7.6376, "step": 883000 }, { "epoch": 3.5971479464583065, "eval_MaskedAccuracy": 0.5076123440642883, "eval_loss": 1.6148388385772705, "eval_runtime": 252.0209, "eval_samples_per_second": 251.868, "eval_steps_per_second": 0.984, "step": 883000 }, { "epoch": 3.5975553244816876, "grad_norm": 10.464448928833008, "learning_rate": 0.0021436313504403525, "loss": 7.612, "step": 883100 }, { "epoch": 3.597962702505069, "grad_norm": 4.088959217071533, "learning_rate": 0.002143140495367657, "loss": 7.6582, "step": 883200 }, { "epoch": 3.5983700805284506, "grad_norm": 6.57127046585083, "learning_rate": 0.0021426496546012, "loss": 7.6182, "step": 883300 }, { "epoch": 3.598777458551832, "grad_norm": 3.783757448196411, "learning_rate": 0.0021421588281603913, "loss": 7.6258, "step": 883400 }, { "epoch": 3.5991848365752137, "grad_norm": 6.03716516494751, "learning_rate": 0.0021416680160646355, "loss": 7.6571, "step": 883500 }, { "epoch": 3.5995922145985952, "grad_norm": 10.619534492492676, "learning_rate": 0.0021411772183333347, "loss": 7.6549, "step": 883600 }, { "epoch": 3.5999995926219768, "grad_norm": 6.190208911895752, "learning_rate": 0.002140686434985898, "loss": 7.6275, "step": 883700 }, { "epoch": 3.6004069706453583, "grad_norm": 4.570251941680908, "learning_rate": 0.002140195666041722, "loss": 7.6603, "step": 883800 }, { "epoch": 3.6008143486687394, "grad_norm": 9.952339172363281, "learning_rate": 0.0021397049115202136, "loss": 7.6427, "step": 883900 }, { "epoch": 3.601221726692121, "grad_norm": 4.950263977050781, "learning_rate": 0.002139214171440775, "loss": 7.6463, "step": 884000 }, { "epoch": 3.601221726692121, "eval_MaskedAccuracy": 0.5078028810447769, "eval_loss": 1.6191045045852661, "eval_runtime": 180.2412, "eval_samples_per_second": 352.173, "eval_steps_per_second": 1.376, "step": 884000 }, { "epoch": 3.6016291047155025, "grad_norm": 6.89466667175293, "learning_rate": 0.002138723445822806, "loss": 7.6315, "step": 884100 }, { "epoch": 3.602036482738884, "grad_norm": 4.768282413482666, "learning_rate": 0.0021382327346857084, "loss": 7.6457, "step": 884200 }, { "epoch": 3.6024438607622655, "grad_norm": 11.934220314025879, "learning_rate": 0.0021377420380488807, "loss": 7.6237, "step": 884300 }, { "epoch": 3.6028512387856466, "grad_norm": 5.59706974029541, "learning_rate": 0.002137251355931724, "loss": 7.6372, "step": 884400 }, { "epoch": 3.603258616809028, "grad_norm": 6.063025951385498, "learning_rate": 0.002136760688353638, "loss": 7.6692, "step": 884500 }, { "epoch": 3.6036659948324097, "grad_norm": 5.742177486419678, "learning_rate": 0.002136270035334024, "loss": 7.6454, "step": 884600 }, { "epoch": 3.604073372855791, "grad_norm": 3.289231538772583, "learning_rate": 0.002135779396892275, "loss": 7.6529, "step": 884700 }, { "epoch": 3.6044807508791727, "grad_norm": 4.300974369049072, "learning_rate": 0.002135288773047792, "loss": 7.6624, "step": 884800 }, { "epoch": 3.6048881289025543, "grad_norm": 7.5341997146606445, "learning_rate": 0.002134798163819972, "loss": 7.6501, "step": 884900 }, { "epoch": 3.605295506925936, "grad_norm": 12.429261207580566, "learning_rate": 0.002134307569228211, "loss": 7.6669, "step": 885000 }, { "epoch": 3.605295506925936, "eval_MaskedAccuracy": 0.5080789949313501, "eval_loss": 1.614434003829956, "eval_runtime": 208.2465, "eval_samples_per_second": 304.812, "eval_steps_per_second": 1.191, "step": 885000 }, { "epoch": 3.6057028849493173, "grad_norm": 7.60191011428833, "learning_rate": 0.0021338169892919064, "loss": 7.6408, "step": 885100 }, { "epoch": 3.6061102629726984, "grad_norm": 2.9621386528015137, "learning_rate": 0.002133326424030451, "loss": 7.6484, "step": 885200 }, { "epoch": 3.60651764099608, "grad_norm": 11.868209838867188, "learning_rate": 0.002132835873463239, "loss": 7.6349, "step": 885300 }, { "epoch": 3.6069250190194615, "grad_norm": 10.728883743286133, "learning_rate": 0.0021323453376096636, "loss": 7.6487, "step": 885400 }, { "epoch": 3.607332397042843, "grad_norm": 5.587571620941162, "learning_rate": 0.002131854816489121, "loss": 7.6376, "step": 885500 }, { "epoch": 3.607739775066224, "grad_norm": 6.433836460113525, "learning_rate": 0.0021313643101210015, "loss": 7.6492, "step": 885600 }, { "epoch": 3.6081471530896057, "grad_norm": 10.285565376281738, "learning_rate": 0.0021308738185247017, "loss": 7.6476, "step": 885700 }, { "epoch": 3.608554531112987, "grad_norm": 3.460230588912964, "learning_rate": 0.00213038334171961, "loss": 7.6237, "step": 885800 }, { "epoch": 3.6089619091363687, "grad_norm": 4.975857734680176, "learning_rate": 0.002129892879725118, "loss": 7.646, "step": 885900 }, { "epoch": 3.6093692871597503, "grad_norm": 8.449679374694824, "learning_rate": 0.0021294024325606144, "loss": 7.6341, "step": 886000 }, { "epoch": 3.6093692871597503, "eval_MaskedAccuracy": 0.5083332423401736, "eval_loss": 1.607765555381775, "eval_runtime": 186.2194, "eval_samples_per_second": 340.867, "eval_steps_per_second": 1.332, "step": 886000 }, { "epoch": 3.609776665183132, "grad_norm": 3.533583641052246, "learning_rate": 0.0021289120002454948, "loss": 7.6316, "step": 886100 }, { "epoch": 3.6101840432065133, "grad_norm": 7.7914838790893555, "learning_rate": 0.002128421582799146, "loss": 7.628, "step": 886200 }, { "epoch": 3.610591421229895, "grad_norm": 3.6636064052581787, "learning_rate": 0.0021279311802409567, "loss": 7.6392, "step": 886300 }, { "epoch": 3.610998799253276, "grad_norm": 7.064916610717773, "learning_rate": 0.0021274407925903124, "loss": 7.6339, "step": 886400 }, { "epoch": 3.6114061772766575, "grad_norm": 6.368926525115967, "learning_rate": 0.0021269504198666014, "loss": 7.6003, "step": 886500 }, { "epoch": 3.611813555300039, "grad_norm": 9.411510467529297, "learning_rate": 0.0021264600620892115, "loss": 7.6517, "step": 886600 }, { "epoch": 3.6122209333234205, "grad_norm": 2.905181407928467, "learning_rate": 0.002125969719277534, "loss": 7.6408, "step": 886700 }, { "epoch": 3.612628311346802, "grad_norm": 2.163015604019165, "learning_rate": 0.002125479391450947, "loss": 7.6243, "step": 886800 }, { "epoch": 3.613035689370183, "grad_norm": 13.100394248962402, "learning_rate": 0.00212498907862884, "loss": 7.649, "step": 886900 }, { "epoch": 3.6134430673935647, "grad_norm": 8.736556053161621, "learning_rate": 0.002124498780830599, "loss": 7.6183, "step": 887000 }, { "epoch": 3.6134430673935647, "eval_MaskedAccuracy": 0.5080628830843521, "eval_loss": 1.6166939735412598, "eval_runtime": 172.8935, "eval_samples_per_second": 367.139, "eval_steps_per_second": 1.434, "step": 887000 }, { "epoch": 3.6138504454169462, "grad_norm": 3.8619651794433594, "learning_rate": 0.002124008498075608, "loss": 7.6219, "step": 887100 }, { "epoch": 3.6142578234403278, "grad_norm": 3.6366708278656006, "learning_rate": 0.002123518230383244, "loss": 7.6517, "step": 887200 }, { "epoch": 3.6146652014637093, "grad_norm": 7.061435699462891, "learning_rate": 0.0021230279777728974, "loss": 7.6591, "step": 887300 }, { "epoch": 3.615072579487091, "grad_norm": 3.008315086364746, "learning_rate": 0.002122537740263949, "loss": 7.6186, "step": 887400 }, { "epoch": 3.6154799575104724, "grad_norm": 5.90070104598999, "learning_rate": 0.002122047517875774, "loss": 7.6343, "step": 887500 }, { "epoch": 3.615887335533854, "grad_norm": 4.847671031951904, "learning_rate": 0.002121557310627762, "loss": 7.6518, "step": 887600 }, { "epoch": 3.616294713557235, "grad_norm": 3.052391767501831, "learning_rate": 0.002121067118539291, "loss": 7.602, "step": 887700 }, { "epoch": 3.6167020915806165, "grad_norm": 6.374086856842041, "learning_rate": 0.0021205769416297393, "loss": 7.6295, "step": 887800 }, { "epoch": 3.617109469603998, "grad_norm": 10.769808769226074, "learning_rate": 0.002120086779918488, "loss": 7.6628, "step": 887900 }, { "epoch": 3.6175168476273796, "grad_norm": 6.5341620445251465, "learning_rate": 0.0021195966334249163, "loss": 7.6452, "step": 888000 }, { "epoch": 3.6175168476273796, "eval_MaskedAccuracy": 0.5081228197925867, "eval_loss": 1.6121056079864502, "eval_runtime": 182.3327, "eval_samples_per_second": 348.133, "eval_steps_per_second": 1.36, "step": 888000 }, { "epoch": 3.6179242256507607, "grad_norm": 10.292481422424316, "learning_rate": 0.0021191065021683984, "loss": 7.657, "step": 888100 }, { "epoch": 3.618331603674142, "grad_norm": 7.091249465942383, "learning_rate": 0.0021186163861683155, "loss": 7.6772, "step": 888200 }, { "epoch": 3.6187389816975237, "grad_norm": 3.879603385925293, "learning_rate": 0.0021181262854440453, "loss": 7.6197, "step": 888300 }, { "epoch": 3.6191463597209053, "grad_norm": 2.8298897743225098, "learning_rate": 0.0021176362000149577, "loss": 7.6453, "step": 888400 }, { "epoch": 3.619553737744287, "grad_norm": 6.767982006072998, "learning_rate": 0.0021171461299004353, "loss": 7.6315, "step": 888500 }, { "epoch": 3.6199611157676683, "grad_norm": 9.25776195526123, "learning_rate": 0.0021166560751198524, "loss": 7.6741, "step": 888600 }, { "epoch": 3.62036849379105, "grad_norm": 5.514349460601807, "learning_rate": 0.002116166035692577, "loss": 7.6259, "step": 888700 }, { "epoch": 3.6207758718144314, "grad_norm": 5.164654731750488, "learning_rate": 0.0021156760116379926, "loss": 7.6129, "step": 888800 }, { "epoch": 3.6211832498378125, "grad_norm": 11.57204818725586, "learning_rate": 0.0021151860029754647, "loss": 7.6765, "step": 888900 }, { "epoch": 3.621590627861194, "grad_norm": 4.421428680419922, "learning_rate": 0.002114696009724373, "loss": 7.651, "step": 889000 }, { "epoch": 3.621590627861194, "eval_MaskedAccuracy": 0.5081642887177764, "eval_loss": 1.6025340557098389, "eval_runtime": 175.4212, "eval_samples_per_second": 361.849, "eval_steps_per_second": 1.414, "step": 889000 }, { "epoch": 3.6219980058845755, "grad_norm": 11.134932518005371, "learning_rate": 0.0021142060319040814, "loss": 7.6201, "step": 889100 }, { "epoch": 3.622405383907957, "grad_norm": 10.667376518249512, "learning_rate": 0.002113716069533967, "loss": 7.6492, "step": 889200 }, { "epoch": 3.6228127619313386, "grad_norm": 12.43516731262207, "learning_rate": 0.002113226122633397, "loss": 7.6487, "step": 889300 }, { "epoch": 3.6232201399547197, "grad_norm": 5.9131646156311035, "learning_rate": 0.0021127361912217425, "loss": 7.6384, "step": 889400 }, { "epoch": 3.6236275179781012, "grad_norm": 6.579653739929199, "learning_rate": 0.0021122462753183796, "loss": 7.6262, "step": 889500 }, { "epoch": 3.6240348960014828, "grad_norm": 4.793057918548584, "learning_rate": 0.0021117563749426726, "loss": 7.6143, "step": 889600 }, { "epoch": 3.6244422740248643, "grad_norm": 5.972531795501709, "learning_rate": 0.0021112664901139864, "loss": 7.6014, "step": 889700 }, { "epoch": 3.624849652048246, "grad_norm": 5.3787522315979, "learning_rate": 0.0021107766208516903, "loss": 7.6263, "step": 889800 }, { "epoch": 3.6252570300716274, "grad_norm": 8.72839069366455, "learning_rate": 0.0021102867671751527, "loss": 7.6563, "step": 889900 }, { "epoch": 3.625664408095009, "grad_norm": 5.1460723876953125, "learning_rate": 0.002109796929103743, "loss": 7.6243, "step": 890000 }, { "epoch": 3.625664408095009, "eval_MaskedAccuracy": 0.5083265363894516, "eval_loss": 1.6125589609146118, "eval_runtime": 157.2142, "eval_samples_per_second": 403.755, "eval_steps_per_second": 1.577, "step": 890000 }, { "epoch": 3.6260717861183904, "grad_norm": 4.11286735534668, "learning_rate": 0.002109307106656823, "loss": 7.6554, "step": 890100 }, { "epoch": 3.6264791641417715, "grad_norm": 5.135982513427734, "learning_rate": 0.0021088172998537597, "loss": 7.6006, "step": 890200 }, { "epoch": 3.626886542165153, "grad_norm": 9.693115234375, "learning_rate": 0.0021083275087139175, "loss": 7.6465, "step": 890300 }, { "epoch": 3.6272939201885346, "grad_norm": 8.499496459960938, "learning_rate": 0.002107837733256662, "loss": 7.6339, "step": 890400 }, { "epoch": 3.627701298211916, "grad_norm": 5.727963924407959, "learning_rate": 0.0021073479735013523, "loss": 7.6518, "step": 890500 }, { "epoch": 3.628108676235297, "grad_norm": 3.8457725048065186, "learning_rate": 0.002106858229467355, "loss": 7.6473, "step": 890600 }, { "epoch": 3.6285160542586787, "grad_norm": 8.028230667114258, "learning_rate": 0.002106368501174033, "loss": 7.6095, "step": 890700 }, { "epoch": 3.6289234322820603, "grad_norm": 7.135133266448975, "learning_rate": 0.002105878788640746, "loss": 7.6446, "step": 890800 }, { "epoch": 3.629330810305442, "grad_norm": 7.200752258300781, "learning_rate": 0.0021053890918868562, "loss": 7.6468, "step": 890900 }, { "epoch": 3.6297381883288233, "grad_norm": 7.582151889801025, "learning_rate": 0.0021048994109317196, "loss": 7.6434, "step": 891000 }, { "epoch": 3.6297381883288233, "eval_MaskedAccuracy": 0.5084321665863265, "eval_loss": 1.6051677465438843, "eval_runtime": 157.0626, "eval_samples_per_second": 404.145, "eval_steps_per_second": 1.579, "step": 891000 }, { "epoch": 3.630145566352205, "grad_norm": 7.139379024505615, "learning_rate": 0.0021044097457946997, "loss": 7.6365, "step": 891100 }, { "epoch": 3.6305529443755864, "grad_norm": 2.8056225776672363, "learning_rate": 0.002103920096495156, "loss": 7.6527, "step": 891200 }, { "epoch": 3.630960322398968, "grad_norm": 3.0227692127227783, "learning_rate": 0.002103430463052445, "loss": 7.6285, "step": 891300 }, { "epoch": 3.631367700422349, "grad_norm": 5.516915798187256, "learning_rate": 0.002102940845485924, "loss": 7.6399, "step": 891400 }, { "epoch": 3.6317750784457306, "grad_norm": 6.106595993041992, "learning_rate": 0.0021024512438149543, "loss": 7.6298, "step": 891500 }, { "epoch": 3.632182456469112, "grad_norm": 5.04689884185791, "learning_rate": 0.0021019616580588898, "loss": 7.6452, "step": 891600 }, { "epoch": 3.6325898344924936, "grad_norm": 5.227834224700928, "learning_rate": 0.0021014720882370857, "loss": 7.6337, "step": 891700 }, { "epoch": 3.632997212515875, "grad_norm": 3.651257276535034, "learning_rate": 0.0021009825343688985, "loss": 7.6431, "step": 891800 }, { "epoch": 3.6334045905392562, "grad_norm": 4.403896331787109, "learning_rate": 0.0021004929964736833, "loss": 7.6151, "step": 891900 }, { "epoch": 3.6338119685626378, "grad_norm": 6.865539073944092, "learning_rate": 0.0021000034745707963, "loss": 7.6336, "step": 892000 }, { "epoch": 3.6338119685626378, "eval_MaskedAccuracy": 0.5085419425763867, "eval_loss": 1.6030364036560059, "eval_runtime": 164.8446, "eval_samples_per_second": 385.066, "eval_steps_per_second": 1.504, "step": 892000 }, { "epoch": 3.6342193465860193, "grad_norm": 6.314415454864502, "learning_rate": 0.002099513968679585, "loss": 7.6495, "step": 892100 }, { "epoch": 3.634626724609401, "grad_norm": 5.714036464691162, "learning_rate": 0.002099024478819405, "loss": 7.6556, "step": 892200 }, { "epoch": 3.6350341026327824, "grad_norm": 3.957120418548584, "learning_rate": 0.0020985350050096076, "loss": 7.6168, "step": 892300 }, { "epoch": 3.635441480656164, "grad_norm": 5.000965118408203, "learning_rate": 0.0020980455472695446, "loss": 7.5973, "step": 892400 }, { "epoch": 3.6358488586795454, "grad_norm": 4.517456531524658, "learning_rate": 0.0020975561056185686, "loss": 7.6422, "step": 892500 }, { "epoch": 3.636256236702927, "grad_norm": 12.821765899658203, "learning_rate": 0.0020970666800760288, "loss": 7.6485, "step": 892600 }, { "epoch": 3.636663614726308, "grad_norm": 4.305283069610596, "learning_rate": 0.002096577270661276, "loss": 7.6234, "step": 892700 }, { "epoch": 3.6370709927496896, "grad_norm": 10.463276863098145, "learning_rate": 0.0020960878773936572, "loss": 7.6539, "step": 892800 }, { "epoch": 3.637478370773071, "grad_norm": 6.03818941116333, "learning_rate": 0.0020955985002925216, "loss": 7.6635, "step": 892900 }, { "epoch": 3.6378857487964527, "grad_norm": 7.45972204208374, "learning_rate": 0.002095109139377217, "loss": 7.5967, "step": 893000 }, { "epoch": 3.6378857487964527, "eval_MaskedAccuracy": 0.508194595348992, "eval_loss": 1.6029776334762573, "eval_runtime": 193.1379, "eval_samples_per_second": 328.656, "eval_steps_per_second": 1.284, "step": 893000 }, { "epoch": 3.6382931268198337, "grad_norm": 9.505080223083496, "learning_rate": 0.0020946197946670898, "loss": 7.6727, "step": 893100 }, { "epoch": 3.6387005048432153, "grad_norm": 14.210636138916016, "learning_rate": 0.0020941304661814862, "loss": 7.6625, "step": 893200 }, { "epoch": 3.639107882866597, "grad_norm": 5.206877708435059, "learning_rate": 0.002093641153939754, "loss": 7.6222, "step": 893300 }, { "epoch": 3.6395152608899783, "grad_norm": 7.7267913818359375, "learning_rate": 0.002093151857961241, "loss": 7.6441, "step": 893400 }, { "epoch": 3.63992263891336, "grad_norm": 3.843454599380493, "learning_rate": 0.002092662578265287, "loss": 7.6285, "step": 893500 }, { "epoch": 3.6403300169367414, "grad_norm": 6.689214706420898, "learning_rate": 0.0020921733148712355, "loss": 7.6272, "step": 893600 }, { "epoch": 3.640737394960123, "grad_norm": 7.946575164794922, "learning_rate": 0.0020916840677984304, "loss": 7.6374, "step": 893700 }, { "epoch": 3.6411447729835045, "grad_norm": 3.621073007583618, "learning_rate": 0.0020911948370662173, "loss": 7.6289, "step": 893800 }, { "epoch": 3.6415521510068856, "grad_norm": 10.197724342346191, "learning_rate": 0.0020907056226939316, "loss": 7.638, "step": 893900 }, { "epoch": 3.641959529030267, "grad_norm": 9.06722640991211, "learning_rate": 0.0020902164247009196, "loss": 7.6382, "step": 894000 }, { "epoch": 3.641959529030267, "eval_MaskedAccuracy": 0.507824996847747, "eval_loss": 1.6094565391540527, "eval_runtime": 225.8666, "eval_samples_per_second": 281.033, "eval_steps_per_second": 1.098, "step": 894000 }, { "epoch": 3.6423669070536486, "grad_norm": 6.535556793212891, "learning_rate": 0.00208972724310652, "loss": 7.6525, "step": 894100 }, { "epoch": 3.64277428507703, "grad_norm": 11.369897842407227, "learning_rate": 0.0020892380779300715, "loss": 7.6301, "step": 894200 }, { "epoch": 3.6431816631004117, "grad_norm": 11.29706859588623, "learning_rate": 0.002088748929190916, "loss": 7.6612, "step": 894300 }, { "epoch": 3.643589041123793, "grad_norm": 3.8788299560546875, "learning_rate": 0.002088259796908394, "loss": 7.6186, "step": 894400 }, { "epoch": 3.6439964191471743, "grad_norm": 3.9642136096954346, "learning_rate": 0.002087770681101839, "loss": 7.6408, "step": 894500 }, { "epoch": 3.644403797170556, "grad_norm": 11.986189842224121, "learning_rate": 0.002087281581790592, "loss": 7.6151, "step": 894600 }, { "epoch": 3.6448111751939374, "grad_norm": 4.163311958312988, "learning_rate": 0.0020867924989939886, "loss": 7.6424, "step": 894700 }, { "epoch": 3.645218553217319, "grad_norm": 7.488025188446045, "learning_rate": 0.002086303432731363, "loss": 7.627, "step": 894800 }, { "epoch": 3.6456259312407004, "grad_norm": 6.97831392288208, "learning_rate": 0.002085814383022053, "loss": 7.6482, "step": 894900 }, { "epoch": 3.646033309264082, "grad_norm": 7.675668239593506, "learning_rate": 0.002085325349885391, "loss": 7.6348, "step": 895000 }, { "epoch": 3.646033309264082, "eval_MaskedAccuracy": 0.5079352136965306, "eval_loss": 1.6075338125228882, "eval_runtime": 184.7545, "eval_samples_per_second": 343.569, "eval_steps_per_second": 1.342, "step": 895000 }, { "epoch": 3.6464406872874635, "grad_norm": 5.817262649536133, "learning_rate": 0.002084836333340711, "loss": 7.6289, "step": 895100 }, { "epoch": 3.6468480653108446, "grad_norm": 4.482575416564941, "learning_rate": 0.002084347333407352, "loss": 7.6222, "step": 895200 }, { "epoch": 3.647255443334226, "grad_norm": 8.638537406921387, "learning_rate": 0.00208385835010464, "loss": 7.6217, "step": 895300 }, { "epoch": 3.6476628213576077, "grad_norm": 4.3328375816345215, "learning_rate": 0.0020833693834519107, "loss": 7.6416, "step": 895400 }, { "epoch": 3.648070199380989, "grad_norm": 6.512923240661621, "learning_rate": 0.0020828804334684903, "loss": 7.6581, "step": 895500 }, { "epoch": 3.6484775774043703, "grad_norm": 9.623422622680664, "learning_rate": 0.00208239150017372, "loss": 7.6086, "step": 895600 }, { "epoch": 3.648884955427752, "grad_norm": 9.830459594726562, "learning_rate": 0.0020819025835869197, "loss": 7.6891, "step": 895700 }, { "epoch": 3.6492923334511334, "grad_norm": 4.8809099197387695, "learning_rate": 0.0020814136837274265, "loss": 7.6553, "step": 895800 }, { "epoch": 3.649699711474515, "grad_norm": 4.494979381561279, "learning_rate": 0.0020809248006145656, "loss": 7.6237, "step": 895900 }, { "epoch": 3.6501070894978964, "grad_norm": 11.90749454498291, "learning_rate": 0.0020804359342676665, "loss": 7.627, "step": 896000 }, { "epoch": 3.6501070894978964, "eval_MaskedAccuracy": 0.5086819885680661, "eval_loss": 1.601949691772461, "eval_runtime": 156.8831, "eval_samples_per_second": 404.607, "eval_steps_per_second": 1.581, "step": 896000 }, { "epoch": 3.650514467521278, "grad_norm": 9.674173355102539, "learning_rate": 0.0020799470847060572, "loss": 7.626, "step": 896100 }, { "epoch": 3.6509218455446595, "grad_norm": 3.3969898223876953, "learning_rate": 0.0020794582519490594, "loss": 7.6072, "step": 896200 }, { "epoch": 3.651329223568041, "grad_norm": 10.800909042358398, "learning_rate": 0.0020789694360160045, "loss": 7.5998, "step": 896300 }, { "epoch": 3.651736601591422, "grad_norm": 4.3343353271484375, "learning_rate": 0.0020784806369262138, "loss": 7.6265, "step": 896400 }, { "epoch": 3.6521439796148036, "grad_norm": 6.152104377746582, "learning_rate": 0.0020779918546990203, "loss": 7.6163, "step": 896500 }, { "epoch": 3.652551357638185, "grad_norm": 4.405981540679932, "learning_rate": 0.0020775030893537367, "loss": 7.6275, "step": 896600 }, { "epoch": 3.6529587356615667, "grad_norm": 9.420879364013672, "learning_rate": 0.0020770143409096957, "loss": 7.6232, "step": 896700 }, { "epoch": 3.6533661136849482, "grad_norm": 7.606795310974121, "learning_rate": 0.0020765256093862147, "loss": 7.6283, "step": 896800 }, { "epoch": 3.6537734917083293, "grad_norm": 4.768980979919434, "learning_rate": 0.002076036894802622, "loss": 7.6383, "step": 896900 }, { "epoch": 3.654180869731711, "grad_norm": 4.798346996307373, "learning_rate": 0.0020755481971782334, "loss": 7.622, "step": 897000 }, { "epoch": 3.654180869731711, "eval_MaskedAccuracy": 0.5083818313553284, "eval_loss": 1.6076077222824097, "eval_runtime": 161.0187, "eval_samples_per_second": 394.215, "eval_steps_per_second": 1.54, "step": 897000 }, { "epoch": 3.6545882477550924, "grad_norm": 11.385787010192871, "learning_rate": 0.002075059516532371, "loss": 7.6523, "step": 897100 }, { "epoch": 3.654995625778474, "grad_norm": 7.296698570251465, "learning_rate": 0.0020745708528843525, "loss": 7.6199, "step": 897200 }, { "epoch": 3.6554030038018555, "grad_norm": 10.062533378601074, "learning_rate": 0.0020740822062535014, "loss": 7.6328, "step": 897300 }, { "epoch": 3.655810381825237, "grad_norm": 4.156154632568359, "learning_rate": 0.0020735935766591353, "loss": 7.6397, "step": 897400 }, { "epoch": 3.6562177598486185, "grad_norm": 5.237282752990723, "learning_rate": 0.0020731049641205773, "loss": 7.6019, "step": 897500 }, { "epoch": 3.656625137872, "grad_norm": 11.06998062133789, "learning_rate": 0.0020726163686571385, "loss": 7.6242, "step": 897600 }, { "epoch": 3.657032515895381, "grad_norm": 5.797039985656738, "learning_rate": 0.002072127790288137, "loss": 7.6159, "step": 897700 }, { "epoch": 3.6574398939187627, "grad_norm": 5.237504482269287, "learning_rate": 0.0020716392290328883, "loss": 7.6375, "step": 897800 }, { "epoch": 3.657847271942144, "grad_norm": 7.14192533493042, "learning_rate": 0.002071150684910709, "loss": 7.6388, "step": 897900 }, { "epoch": 3.6582546499655257, "grad_norm": 4.7378106117248535, "learning_rate": 0.0020706621579409112, "loss": 7.5909, "step": 898000 }, { "epoch": 3.6582546499655257, "eval_MaskedAccuracy": 0.5089759042056345, "eval_loss": 1.5967822074890137, "eval_runtime": 171.3986, "eval_samples_per_second": 370.341, "eval_steps_per_second": 1.447, "step": 898000 }, { "epoch": 3.658662027988907, "grad_norm": 3.3009800910949707, "learning_rate": 0.002070173648142811, "loss": 7.6107, "step": 898100 }, { "epoch": 3.6590694060122884, "grad_norm": 3.3634684085845947, "learning_rate": 0.002069685155535723, "loss": 7.6128, "step": 898200 }, { "epoch": 3.65947678403567, "grad_norm": 3.283738136291504, "learning_rate": 0.00206919668013896, "loss": 7.6455, "step": 898300 }, { "epoch": 3.6598841620590514, "grad_norm": 5.325892925262451, "learning_rate": 0.002068708221971831, "loss": 7.6271, "step": 898400 }, { "epoch": 3.660291540082433, "grad_norm": 3.7771248817443848, "learning_rate": 0.002068219781053649, "loss": 7.6476, "step": 898500 }, { "epoch": 3.6606989181058145, "grad_norm": 11.584171295166016, "learning_rate": 0.0020677313574037274, "loss": 7.6508, "step": 898600 }, { "epoch": 3.661106296129196, "grad_norm": 5.788618564605713, "learning_rate": 0.0020672429510413723, "loss": 7.641, "step": 898700 }, { "epoch": 3.6615136741525776, "grad_norm": 5.707223892211914, "learning_rate": 0.0020667545619858924, "loss": 7.623, "step": 898800 }, { "epoch": 3.6619210521759586, "grad_norm": 5.797310829162598, "learning_rate": 0.0020662661902566028, "loss": 7.6452, "step": 898900 }, { "epoch": 3.66232843019934, "grad_norm": 8.31090259552002, "learning_rate": 0.0020657778358728067, "loss": 7.6483, "step": 899000 }, { "epoch": 3.66232843019934, "eval_MaskedAccuracy": 0.5084633935316983, "eval_loss": 1.61117684841156, "eval_runtime": 166.658, "eval_samples_per_second": 380.876, "eval_steps_per_second": 1.488, "step": 899000 }, { "epoch": 3.6627358082227217, "grad_norm": 9.342093467712402, "learning_rate": 0.002065289498853814, "loss": 7.6256, "step": 899100 }, { "epoch": 3.6631431862461032, "grad_norm": 4.870360374450684, "learning_rate": 0.0020648011792189272, "loss": 7.6325, "step": 899200 }, { "epoch": 3.6635505642694848, "grad_norm": 4.592978477478027, "learning_rate": 0.002064312876987453, "loss": 7.6417, "step": 899300 }, { "epoch": 3.663957942292866, "grad_norm": 6.682163715362549, "learning_rate": 0.002063824592178697, "loss": 7.6283, "step": 899400 }, { "epoch": 3.6643653203162474, "grad_norm": 11.536015510559082, "learning_rate": 0.002063336324811965, "loss": 7.6433, "step": 899500 }, { "epoch": 3.664772698339629, "grad_norm": 6.760446071624756, "learning_rate": 0.00206284807490656, "loss": 7.6333, "step": 899600 }, { "epoch": 3.6651800763630105, "grad_norm": 2.6141417026519775, "learning_rate": 0.0020623598424817863, "loss": 7.6164, "step": 899700 }, { "epoch": 3.665587454386392, "grad_norm": 4.794512748718262, "learning_rate": 0.0020618716275569455, "loss": 7.6386, "step": 899800 }, { "epoch": 3.6659948324097735, "grad_norm": 11.944540023803711, "learning_rate": 0.00206138343015134, "loss": 7.6386, "step": 899900 }, { "epoch": 3.666402210433155, "grad_norm": 2.631260871887207, "learning_rate": 0.0020608952502842713, "loss": 7.6714, "step": 900000 }, { "epoch": 3.666402210433155, "eval_MaskedAccuracy": 0.5085446445840142, "eval_loss": 1.612879991531372, "eval_runtime": 178.0649, "eval_samples_per_second": 356.477, "eval_steps_per_second": 1.393, "step": 900000 }, { "epoch": 3.6668095884565366, "grad_norm": 4.964359283447266, "learning_rate": 0.0020604070879750368, "loss": 7.6399, "step": 900100 }, { "epoch": 3.6672169664799177, "grad_norm": 11.734545707702637, "learning_rate": 0.0020599189432429364, "loss": 7.6603, "step": 900200 }, { "epoch": 3.667624344503299, "grad_norm": 4.237592697143555, "learning_rate": 0.002059430816107272, "loss": 7.6365, "step": 900300 }, { "epoch": 3.6680317225266807, "grad_norm": 5.3469743728637695, "learning_rate": 0.0020589427065873405, "loss": 7.6163, "step": 900400 }, { "epoch": 3.6684391005500623, "grad_norm": 10.180450439453125, "learning_rate": 0.002058454614702438, "loss": 7.6176, "step": 900500 }, { "epoch": 3.6688464785734434, "grad_norm": 5.476974010467529, "learning_rate": 0.0020579665404718635, "loss": 7.642, "step": 900600 }, { "epoch": 3.669253856596825, "grad_norm": 3.240492105484009, "learning_rate": 0.002057478483914913, "loss": 7.6244, "step": 900700 }, { "epoch": 3.6696612346202064, "grad_norm": 4.910428047180176, "learning_rate": 0.0020569904450508836, "loss": 7.6391, "step": 900800 }, { "epoch": 3.670068612643588, "grad_norm": 8.302295684814453, "learning_rate": 0.002056502423899065, "loss": 7.6495, "step": 900900 }, { "epoch": 3.6704759906669695, "grad_norm": 3.451448917388916, "learning_rate": 0.0020560144204787552, "loss": 7.6515, "step": 901000 }, { "epoch": 3.6704759906669695, "eval_MaskedAccuracy": 0.5089203770788138, "eval_loss": 1.603334665298462, "eval_runtime": 180.2357, "eval_samples_per_second": 352.183, "eval_steps_per_second": 1.376, "step": 901000 }, { "epoch": 3.670883368690351, "grad_norm": 3.943767547607422, "learning_rate": 0.002055526434809248, "loss": 7.6194, "step": 901100 }, { "epoch": 3.6712907467137326, "grad_norm": 9.712082862854004, "learning_rate": 0.0020550384669098336, "loss": 7.643, "step": 901200 }, { "epoch": 3.671698124737114, "grad_norm": 8.442924499511719, "learning_rate": 0.002054550516799806, "loss": 7.6393, "step": 901300 }, { "epoch": 3.672105502760495, "grad_norm": 8.293615341186523, "learning_rate": 0.0020540625844984546, "loss": 7.6541, "step": 901400 }, { "epoch": 3.6725128807838767, "grad_norm": 10.332056999206543, "learning_rate": 0.002053574670025072, "loss": 7.6333, "step": 901500 }, { "epoch": 3.6729202588072583, "grad_norm": 3.7203426361083984, "learning_rate": 0.0020530867733989456, "loss": 7.6398, "step": 901600 }, { "epoch": 3.67332763683064, "grad_norm": 3.9798550605773926, "learning_rate": 0.002052598894639367, "loss": 7.6438, "step": 901700 }, { "epoch": 3.6737350148540213, "grad_norm": 3.9761064052581787, "learning_rate": 0.0020521110337656237, "loss": 7.596, "step": 901800 }, { "epoch": 3.6741423928774024, "grad_norm": 2.844905138015747, "learning_rate": 0.0020516231907970026, "loss": 7.6495, "step": 901900 }, { "epoch": 3.674549770900784, "grad_norm": 3.3876631259918213, "learning_rate": 0.0020511353657527926, "loss": 7.6413, "step": 902000 }, { "epoch": 3.674549770900784, "eval_MaskedAccuracy": 0.5079336132759298, "eval_loss": 1.6143743991851807, "eval_runtime": 191.2568, "eval_samples_per_second": 331.889, "eval_steps_per_second": 1.297, "step": 902000 }, { "epoch": 3.6749571489241655, "grad_norm": 6.008819580078125, "learning_rate": 0.002050647558652281, "loss": 7.6347, "step": 902100 }, { "epoch": 3.675364526947547, "grad_norm": 3.743192672729492, "learning_rate": 0.0020501597695147472, "loss": 7.6377, "step": 902200 }, { "epoch": 3.6757719049709285, "grad_norm": 4.820095062255859, "learning_rate": 0.002049671998359482, "loss": 7.6569, "step": 902300 }, { "epoch": 3.67617928299431, "grad_norm": 3.514479875564575, "learning_rate": 0.0020491842452057644, "loss": 7.627, "step": 902400 }, { "epoch": 3.6765866610176916, "grad_norm": 5.559212684631348, "learning_rate": 0.002048696510072885, "loss": 7.6581, "step": 902500 }, { "epoch": 3.676994039041073, "grad_norm": 3.9598398208618164, "learning_rate": 0.0020482087929801253, "loss": 7.6323, "step": 902600 }, { "epoch": 3.6774014170644542, "grad_norm": 4.086262226104736, "learning_rate": 0.0020477210939467633, "loss": 7.6391, "step": 902700 }, { "epoch": 3.6778087950878358, "grad_norm": 9.071392059326172, "learning_rate": 0.002047233412992082, "loss": 7.6273, "step": 902800 }, { "epoch": 3.6782161731112173, "grad_norm": 7.524474143981934, "learning_rate": 0.0020467457501353622, "loss": 7.6524, "step": 902900 }, { "epoch": 3.678623551134599, "grad_norm": 3.225539445877075, "learning_rate": 0.0020462581053958865, "loss": 7.6391, "step": 903000 }, { "epoch": 3.678623551134599, "eval_MaskedAccuracy": 0.5081074179945364, "eval_loss": 1.6174676418304443, "eval_runtime": 170.825, "eval_samples_per_second": 371.585, "eval_steps_per_second": 1.452, "step": 903000 }, { "epoch": 3.67903092915798, "grad_norm": 4.473433017730713, "learning_rate": 0.002045770478792931, "loss": 7.6309, "step": 903100 }, { "epoch": 3.6794383071813614, "grad_norm": 8.447395324707031, "learning_rate": 0.002045282870345772, "loss": 7.5943, "step": 903200 }, { "epoch": 3.679845685204743, "grad_norm": 10.553620338439941, "learning_rate": 0.00204479528007369, "loss": 7.6199, "step": 903300 }, { "epoch": 3.6802530632281245, "grad_norm": 7.491590976715088, "learning_rate": 0.002044307707995961, "loss": 7.6273, "step": 903400 }, { "epoch": 3.680660441251506, "grad_norm": 6.983431339263916, "learning_rate": 0.002043820154131864, "loss": 7.5983, "step": 903500 }, { "epoch": 3.6810678192748876, "grad_norm": 7.161558628082275, "learning_rate": 0.002043332618500672, "loss": 7.6203, "step": 903600 }, { "epoch": 3.681475197298269, "grad_norm": 6.19473934173584, "learning_rate": 0.0020428451011216607, "loss": 7.6518, "step": 903700 }, { "epoch": 3.6818825753216506, "grad_norm": 9.952789306640625, "learning_rate": 0.0020423576020141055, "loss": 7.6285, "step": 903800 }, { "epoch": 3.6822899533450317, "grad_norm": 6.193504810333252, "learning_rate": 0.0020418701211972804, "loss": 7.6272, "step": 903900 }, { "epoch": 3.6826973313684133, "grad_norm": 4.269301891326904, "learning_rate": 0.0020413826586904533, "loss": 7.6176, "step": 904000 }, { "epoch": 3.6826973313684133, "eval_MaskedAccuracy": 0.5083830301982519, "eval_loss": 1.61100172996521, "eval_runtime": 193.3793, "eval_samples_per_second": 328.246, "eval_steps_per_second": 1.282, "step": 904000 }, { "epoch": 3.683104709391795, "grad_norm": 5.101426601409912, "learning_rate": 0.0020408952145129, "loss": 7.6138, "step": 904100 }, { "epoch": 3.6835120874151763, "grad_norm": 4.785317420959473, "learning_rate": 0.002040407788683889, "loss": 7.6337, "step": 904200 }, { "epoch": 3.683919465438558, "grad_norm": 4.20262336730957, "learning_rate": 0.002039920381222697, "loss": 7.6427, "step": 904300 }, { "epoch": 3.684326843461939, "grad_norm": 3.919015407562256, "learning_rate": 0.0020394329921485854, "loss": 7.6244, "step": 904400 }, { "epoch": 3.6847342214853205, "grad_norm": 7.638157844543457, "learning_rate": 0.00203894562148083, "loss": 7.6385, "step": 904500 }, { "epoch": 3.685141599508702, "grad_norm": 7.017099380493164, "learning_rate": 0.002038458269238695, "loss": 7.6287, "step": 904600 }, { "epoch": 3.6855489775320835, "grad_norm": 4.562326431274414, "learning_rate": 0.0020379709354414514, "loss": 7.6273, "step": 904700 }, { "epoch": 3.685956355555465, "grad_norm": 5.913040637969971, "learning_rate": 0.0020374836201083623, "loss": 7.6478, "step": 904800 }, { "epoch": 3.6863637335788466, "grad_norm": 6.3806891441345215, "learning_rate": 0.0020369963232587007, "loss": 7.6429, "step": 904900 }, { "epoch": 3.686771111602228, "grad_norm": 6.356684684753418, "learning_rate": 0.002036509044911725, "loss": 7.6437, "step": 905000 }, { "epoch": 3.686771111602228, "eval_MaskedAccuracy": 0.5082058590471668, "eval_loss": 1.6076563596725464, "eval_runtime": 163.38, "eval_samples_per_second": 388.518, "eval_steps_per_second": 1.518, "step": 905000 }, { "epoch": 3.6871784896256097, "grad_norm": 6.108409881591797, "learning_rate": 0.002036021785086704, "loss": 7.6024, "step": 905100 }, { "epoch": 3.6875858676489908, "grad_norm": 4.241765975952148, "learning_rate": 0.002035534543802898, "loss": 7.6181, "step": 905200 }, { "epoch": 3.6879932456723723, "grad_norm": 7.218456268310547, "learning_rate": 0.0020350473210795747, "loss": 7.6158, "step": 905300 }, { "epoch": 3.688400623695754, "grad_norm": 2.6576573848724365, "learning_rate": 0.002034560116935994, "loss": 7.6536, "step": 905400 }, { "epoch": 3.6888080017191354, "grad_norm": 4.5723700523376465, "learning_rate": 0.002034072931391417, "loss": 7.6368, "step": 905500 }, { "epoch": 3.6892153797425165, "grad_norm": 6.250267028808594, "learning_rate": 0.002033585764465104, "loss": 7.6665, "step": 905600 }, { "epoch": 3.689622757765898, "grad_norm": 7.656291484832764, "learning_rate": 0.0020330986161763206, "loss": 7.639, "step": 905700 }, { "epoch": 3.6900301357892795, "grad_norm": 4.797623634338379, "learning_rate": 0.0020326114865443206, "loss": 7.6211, "step": 905800 }, { "epoch": 3.690437513812661, "grad_norm": 9.249707221984863, "learning_rate": 0.0020321243755883674, "loss": 7.6396, "step": 905900 }, { "epoch": 3.6908448918360426, "grad_norm": 6.6858439445495605, "learning_rate": 0.0020316372833277144, "loss": 7.6265, "step": 906000 }, { "epoch": 3.6908448918360426, "eval_MaskedAccuracy": 0.5085401463231789, "eval_loss": 1.6020891666412354, "eval_runtime": 182.5086, "eval_samples_per_second": 347.797, "eval_steps_per_second": 1.359, "step": 906000 }, { "epoch": 3.691252269859424, "grad_norm": 6.077550888061523, "learning_rate": 0.002031150209781623, "loss": 7.6321, "step": 906100 }, { "epoch": 3.6916596478828057, "grad_norm": 3.146226644515991, "learning_rate": 0.0020306631549693453, "loss": 7.6213, "step": 906200 }, { "epoch": 3.692067025906187, "grad_norm": 7.1037211418151855, "learning_rate": 0.002030176118910142, "loss": 7.6714, "step": 906300 }, { "epoch": 3.6924744039295683, "grad_norm": 5.205930709838867, "learning_rate": 0.0020296891016232657, "loss": 7.6276, "step": 906400 }, { "epoch": 3.69288178195295, "grad_norm": 3.720398187637329, "learning_rate": 0.0020292021031279696, "loss": 7.6167, "step": 906500 }, { "epoch": 3.6932891599763313, "grad_norm": 6.338487148284912, "learning_rate": 0.0020287151234435093, "loss": 7.5683, "step": 906600 }, { "epoch": 3.693696537999713, "grad_norm": 9.687365531921387, "learning_rate": 0.00202822816258914, "loss": 7.6301, "step": 906700 }, { "epoch": 3.6941039160230944, "grad_norm": 4.659130096435547, "learning_rate": 0.00202774122058411, "loss": 7.6488, "step": 906800 }, { "epoch": 3.6945112940464755, "grad_norm": 6.577041149139404, "learning_rate": 0.0020272542974476737, "loss": 7.6651, "step": 906900 }, { "epoch": 3.694918672069857, "grad_norm": 4.362553596496582, "learning_rate": 0.002026767393199078, "loss": 7.645, "step": 907000 }, { "epoch": 3.694918672069857, "eval_MaskedAccuracy": 0.5082458465978787, "eval_loss": 1.6149495840072632, "eval_runtime": 176.4897, "eval_samples_per_second": 359.658, "eval_steps_per_second": 1.405, "step": 907000 }, { "epoch": 3.6953260500932386, "grad_norm": 9.533172607421875, "learning_rate": 0.0020262805078575764, "loss": 7.6072, "step": 907100 }, { "epoch": 3.69573342811662, "grad_norm": 8.81629753112793, "learning_rate": 0.0020257936414424136, "loss": 7.6248, "step": 907200 }, { "epoch": 3.6961408061400016, "grad_norm": 4.230309009552002, "learning_rate": 0.0020253067939728455, "loss": 7.6214, "step": 907300 }, { "epoch": 3.696548184163383, "grad_norm": 7.531736373901367, "learning_rate": 0.002024819965468115, "loss": 7.5999, "step": 907400 }, { "epoch": 3.6969555621867647, "grad_norm": 7.610778331756592, "learning_rate": 0.0020243331559474686, "loss": 7.6341, "step": 907500 }, { "epoch": 3.697362940210146, "grad_norm": 3.27508807182312, "learning_rate": 0.0020238463654301555, "loss": 7.6104, "step": 907600 }, { "epoch": 3.6977703182335273, "grad_norm": 10.008770942687988, "learning_rate": 0.0020233595939354145, "loss": 7.6273, "step": 907700 }, { "epoch": 3.698177696256909, "grad_norm": 5.908839702606201, "learning_rate": 0.002022872841482497, "loss": 7.6468, "step": 907800 }, { "epoch": 3.6985850742802904, "grad_norm": 4.560897350311279, "learning_rate": 0.0020223861080906442, "loss": 7.6604, "step": 907900 }, { "epoch": 3.698992452303672, "grad_norm": 6.814455032348633, "learning_rate": 0.0020218993937791033, "loss": 7.6636, "step": 908000 }, { "epoch": 3.698992452303672, "eval_MaskedAccuracy": 0.5087985280176558, "eval_loss": 1.6035388708114624, "eval_runtime": 170.1775, "eval_samples_per_second": 372.999, "eval_steps_per_second": 1.457, "step": 908000 }, { "epoch": 3.699399830327053, "grad_norm": 4.749573707580566, "learning_rate": 0.0020214126985671106, "loss": 7.6349, "step": 908100 }, { "epoch": 3.6998072083504345, "grad_norm": 8.210793495178223, "learning_rate": 0.0020209260224739096, "loss": 7.6234, "step": 908200 }, { "epoch": 3.700214586373816, "grad_norm": 4.4578657150268555, "learning_rate": 0.0020204393655187405, "loss": 7.6381, "step": 908300 }, { "epoch": 3.7006219643971976, "grad_norm": 4.6508612632751465, "learning_rate": 0.0020199527277208473, "loss": 7.626, "step": 908400 }, { "epoch": 3.701029342420579, "grad_norm": 5.958534240722656, "learning_rate": 0.0020194661090994683, "loss": 7.6338, "step": 908500 }, { "epoch": 3.7014367204439607, "grad_norm": 4.354109764099121, "learning_rate": 0.0020189795096738376, "loss": 7.618, "step": 908600 }, { "epoch": 3.701844098467342, "grad_norm": 4.007742404937744, "learning_rate": 0.002018492929463198, "loss": 7.6354, "step": 908700 }, { "epoch": 3.7022514764907237, "grad_norm": 4.210424423217773, "learning_rate": 0.0020180063684867855, "loss": 7.6146, "step": 908800 }, { "epoch": 3.702658854514105, "grad_norm": 7.69437837600708, "learning_rate": 0.002017519826763835, "loss": 7.624, "step": 908900 }, { "epoch": 3.7030662325374863, "grad_norm": 7.199598789215088, "learning_rate": 0.0020170333043135853, "loss": 7.615, "step": 909000 }, { "epoch": 3.7030662325374863, "eval_MaskedAccuracy": 0.5089749605240288, "eval_loss": 1.6038508415222168, "eval_runtime": 166.3254, "eval_samples_per_second": 381.638, "eval_steps_per_second": 1.491, "step": 909000 }, { "epoch": 3.703473610560868, "grad_norm": 8.40762710571289, "learning_rate": 0.0020165468011552647, "loss": 7.6256, "step": 909100 }, { "epoch": 3.7038809885842494, "grad_norm": 3.7668421268463135, "learning_rate": 0.0020160603173081135, "loss": 7.6101, "step": 909200 }, { "epoch": 3.704288366607631, "grad_norm": 10.608351707458496, "learning_rate": 0.0020155738527913624, "loss": 7.6247, "step": 909300 }, { "epoch": 3.704695744631012, "grad_norm": 5.143110752105713, "learning_rate": 0.002015087407624243, "loss": 7.631, "step": 909400 }, { "epoch": 3.7051031226543936, "grad_norm": 12.672056198120117, "learning_rate": 0.0020146009818259904, "loss": 7.624, "step": 909500 }, { "epoch": 3.705510500677775, "grad_norm": 8.395350456237793, "learning_rate": 0.0020141145754158348, "loss": 7.6635, "step": 909600 }, { "epoch": 3.7059178787011566, "grad_norm": 13.078659057617188, "learning_rate": 0.002013628188413004, "loss": 7.6088, "step": 909700 }, { "epoch": 3.706325256724538, "grad_norm": 5.733210563659668, "learning_rate": 0.0020131418208367306, "loss": 7.6195, "step": 909800 }, { "epoch": 3.7067326347479197, "grad_norm": 5.2784857749938965, "learning_rate": 0.0020126554727062434, "loss": 7.6087, "step": 909900 }, { "epoch": 3.7071400127713012, "grad_norm": 4.753120422363281, "learning_rate": 0.002012169144040761, "loss": 7.6253, "step": 910000 }, { "epoch": 3.7071400127713012, "eval_MaskedAccuracy": 0.5084015905511936, "eval_loss": 1.610862135887146, "eval_runtime": 168.2194, "eval_samples_per_second": 377.34, "eval_steps_per_second": 1.474, "step": 910000 }, { "epoch": 3.7075473907946828, "grad_norm": 6.685218334197998, "learning_rate": 0.0020116828348595204, "loss": 7.6265, "step": 910100 }, { "epoch": 3.707954768818064, "grad_norm": 8.341331481933594, "learning_rate": 0.0020111965451817494, "loss": 7.629, "step": 910200 }, { "epoch": 3.7083621468414454, "grad_norm": 7.240558624267578, "learning_rate": 0.002010710275026669, "loss": 7.6323, "step": 910300 }, { "epoch": 3.708769524864827, "grad_norm": 4.858267307281494, "learning_rate": 0.0020102240244135082, "loss": 7.6126, "step": 910400 }, { "epoch": 3.7091769028882084, "grad_norm": 7.6505560874938965, "learning_rate": 0.0020097377933614856, "loss": 7.6153, "step": 910500 }, { "epoch": 3.7095842809115895, "grad_norm": 7.287432670593262, "learning_rate": 0.0020092515818898276, "loss": 7.6285, "step": 910600 }, { "epoch": 3.709991658934971, "grad_norm": 5.595905303955078, "learning_rate": 0.0020087653900177542, "loss": 7.66, "step": 910700 }, { "epoch": 3.7103990369583526, "grad_norm": 5.411009788513184, "learning_rate": 0.0020082792177644865, "loss": 7.635, "step": 910800 }, { "epoch": 3.710806414981734, "grad_norm": 12.026354789733887, "learning_rate": 0.002007793065149252, "loss": 7.6275, "step": 910900 }, { "epoch": 3.7112137930051157, "grad_norm": 3.368460178375244, "learning_rate": 0.002007306932191267, "loss": 7.6035, "step": 911000 }, { "epoch": 3.7112137930051157, "eval_MaskedAccuracy": 0.5090320116095475, "eval_loss": 1.6031590700149536, "eval_runtime": 170.3081, "eval_samples_per_second": 372.713, "eval_steps_per_second": 1.456, "step": 911000 }, { "epoch": 3.711621171028497, "grad_norm": 7.255367755889893, "learning_rate": 0.002006820818909748, "loss": 7.6008, "step": 911100 }, { "epoch": 3.7120285490518787, "grad_norm": 4.47907829284668, "learning_rate": 0.0020063347253239158, "loss": 7.6551, "step": 911200 }, { "epoch": 3.7124359270752603, "grad_norm": 6.953063011169434, "learning_rate": 0.002005848651452989, "loss": 7.6289, "step": 911300 }, { "epoch": 3.7128433050986414, "grad_norm": 5.317480564117432, "learning_rate": 0.0020053625973161875, "loss": 7.6479, "step": 911400 }, { "epoch": 3.713250683122023, "grad_norm": 3.5525434017181396, "learning_rate": 0.0020048765629327213, "loss": 7.6311, "step": 911500 }, { "epoch": 3.7136580611454044, "grad_norm": 5.438886642456055, "learning_rate": 0.002004390548321811, "loss": 7.6205, "step": 911600 }, { "epoch": 3.714065439168786, "grad_norm": 3.7494962215423584, "learning_rate": 0.0020039045535026737, "loss": 7.6396, "step": 911700 }, { "epoch": 3.7144728171921675, "grad_norm": 5.122317314147949, "learning_rate": 0.0020034185784945178, "loss": 7.6494, "step": 911800 }, { "epoch": 3.7148801952155486, "grad_norm": 5.092377662658691, "learning_rate": 0.002002932623316553, "loss": 7.6101, "step": 911900 }, { "epoch": 3.71528757323893, "grad_norm": 3.388908863067627, "learning_rate": 0.0020024466879880014, "loss": 7.6088, "step": 912000 }, { "epoch": 3.71528757323893, "eval_MaskedAccuracy": 0.5082819329059137, "eval_loss": 1.6137927770614624, "eval_runtime": 168.6591, "eval_samples_per_second": 376.357, "eval_steps_per_second": 1.47, "step": 912000 }, { "epoch": 3.7156949512623116, "grad_norm": 9.515368461608887, "learning_rate": 0.0020019607725280707, "loss": 7.6392, "step": 912100 }, { "epoch": 3.716102329285693, "grad_norm": 5.843565940856934, "learning_rate": 0.0020014748769559693, "loss": 7.629, "step": 912200 }, { "epoch": 3.7165097073090747, "grad_norm": 5.855798244476318, "learning_rate": 0.0020009890012909087, "loss": 7.6139, "step": 912300 }, { "epoch": 3.7169170853324562, "grad_norm": 8.065990447998047, "learning_rate": 0.0020005031455521015, "loss": 7.6508, "step": 912400 }, { "epoch": 3.7173244633558378, "grad_norm": 6.999570369720459, "learning_rate": 0.0020000173097587505, "loss": 7.6084, "step": 912500 }, { "epoch": 3.7177318413792193, "grad_norm": 6.957723140716553, "learning_rate": 0.0019995314939300645, "loss": 7.6511, "step": 912600 }, { "epoch": 3.7181392194026004, "grad_norm": 5.770177364349365, "learning_rate": 0.0019990456980852546, "loss": 7.6507, "step": 912700 }, { "epoch": 3.718546597425982, "grad_norm": 11.67274284362793, "learning_rate": 0.001998559922243523, "loss": 7.6312, "step": 912800 }, { "epoch": 3.7189539754493635, "grad_norm": 8.449727058410645, "learning_rate": 0.0019980741664240784, "loss": 7.5975, "step": 912900 }, { "epoch": 3.719361353472745, "grad_norm": 5.722390651702881, "learning_rate": 0.001997588430646121, "loss": 7.6141, "step": 913000 }, { "epoch": 3.719361353472745, "eval_MaskedAccuracy": 0.5086555214553905, "eval_loss": 1.6092941761016846, "eval_runtime": 211.3674, "eval_samples_per_second": 300.311, "eval_steps_per_second": 1.173, "step": 913000 }, { "epoch": 3.719768731496126, "grad_norm": 7.868215560913086, "learning_rate": 0.0019971027149288576, "loss": 7.6129, "step": 913100 }, { "epoch": 3.7201761095195076, "grad_norm": 8.220807075500488, "learning_rate": 0.0019966170192914907, "loss": 7.635, "step": 913200 }, { "epoch": 3.720583487542889, "grad_norm": 4.928590297698975, "learning_rate": 0.0019961313437532203, "loss": 7.6159, "step": 913300 }, { "epoch": 3.7209908655662707, "grad_norm": 8.91843318939209, "learning_rate": 0.00199564568833325, "loss": 7.5896, "step": 913400 }, { "epoch": 3.721398243589652, "grad_norm": 7.5331501960754395, "learning_rate": 0.00199516005305078, "loss": 7.6472, "step": 913500 }, { "epoch": 3.7218056216130337, "grad_norm": 5.197667598724365, "learning_rate": 0.0019946744379250115, "loss": 7.6312, "step": 913600 }, { "epoch": 3.7222129996364153, "grad_norm": 5.760436058044434, "learning_rate": 0.001994188842975141, "loss": 7.6081, "step": 913700 }, { "epoch": 3.722620377659797, "grad_norm": 4.419961452484131, "learning_rate": 0.0019937032682203664, "loss": 7.5963, "step": 913800 }, { "epoch": 3.723027755683178, "grad_norm": 4.65272331237793, "learning_rate": 0.001993217713679888, "loss": 7.6263, "step": 913900 }, { "epoch": 3.7234351337065594, "grad_norm": 9.407983779907227, "learning_rate": 0.0019927321793728973, "loss": 7.6246, "step": 914000 }, { "epoch": 3.7234351337065594, "eval_MaskedAccuracy": 0.5081790232648262, "eval_loss": 1.6192362308502197, "eval_runtime": 164.5577, "eval_samples_per_second": 385.737, "eval_steps_per_second": 1.507, "step": 914000 }, { "epoch": 3.723842511729941, "grad_norm": 8.16222858428955, "learning_rate": 0.0019922466653185945, "loss": 7.6237, "step": 914100 }, { "epoch": 3.7242498897533225, "grad_norm": 4.593684673309326, "learning_rate": 0.0019917611715361745, "loss": 7.59, "step": 914200 }, { "epoch": 3.724657267776704, "grad_norm": 14.930048942565918, "learning_rate": 0.0019912756980448297, "loss": 7.586, "step": 914300 }, { "epoch": 3.725064645800085, "grad_norm": 4.083841323852539, "learning_rate": 0.0019907902448637557, "loss": 7.6586, "step": 914400 }, { "epoch": 3.7254720238234666, "grad_norm": 4.637545108795166, "learning_rate": 0.001990304812012143, "loss": 7.6282, "step": 914500 }, { "epoch": 3.725879401846848, "grad_norm": 4.426086902618408, "learning_rate": 0.001989819399509184, "loss": 7.6296, "step": 914600 }, { "epoch": 3.7262867798702297, "grad_norm": 4.19540548324585, "learning_rate": 0.0019893340073740703, "loss": 7.6132, "step": 914700 }, { "epoch": 3.7266941578936112, "grad_norm": 4.008831977844238, "learning_rate": 0.001988848635625989, "loss": 7.6167, "step": 914800 }, { "epoch": 3.727101535916993, "grad_norm": 4.061708927154541, "learning_rate": 0.001988363284284133, "loss": 7.6498, "step": 914900 }, { "epoch": 3.7275089139403743, "grad_norm": 3.444599151611328, "learning_rate": 0.0019878779533676894, "loss": 7.6172, "step": 915000 }, { "epoch": 3.7275089139403743, "eval_MaskedAccuracy": 0.508393301990947, "eval_loss": 1.6040995121002197, "eval_runtime": 211.9712, "eval_samples_per_second": 299.456, "eval_steps_per_second": 1.17, "step": 915000 }, { "epoch": 3.727916291963756, "grad_norm": 5.284646034240723, "learning_rate": 0.001987392642895846, "loss": 7.6335, "step": 915100 }, { "epoch": 3.728323669987137, "grad_norm": 6.965787410736084, "learning_rate": 0.0019869073528877915, "loss": 7.6545, "step": 915200 }, { "epoch": 3.7287310480105185, "grad_norm": 10.287710189819336, "learning_rate": 0.0019864220833627085, "loss": 7.6184, "step": 915300 }, { "epoch": 3.7291384260339, "grad_norm": 10.985379219055176, "learning_rate": 0.001985936834339787, "loss": 7.6207, "step": 915400 }, { "epoch": 3.7295458040572815, "grad_norm": 6.770969390869141, "learning_rate": 0.0019854516058382078, "loss": 7.6202, "step": 915500 }, { "epoch": 3.7299531820806626, "grad_norm": 5.606983184814453, "learning_rate": 0.001984966397877154, "loss": 7.6091, "step": 915600 }, { "epoch": 3.730360560104044, "grad_norm": 7.817966938018799, "learning_rate": 0.0019844812104758123, "loss": 7.6162, "step": 915700 }, { "epoch": 3.7307679381274257, "grad_norm": 5.647502899169922, "learning_rate": 0.0019839960436533587, "loss": 7.637, "step": 915800 }, { "epoch": 3.731175316150807, "grad_norm": 7.4646525382995605, "learning_rate": 0.001983510897428981, "loss": 7.609, "step": 915900 }, { "epoch": 3.7315826941741888, "grad_norm": 6.90963888168335, "learning_rate": 0.0019830257718218528, "loss": 7.6275, "step": 916000 }, { "epoch": 3.7315826941741888, "eval_MaskedAccuracy": 0.5085301177683921, "eval_loss": 1.616068720817566, "eval_runtime": 184.7032, "eval_samples_per_second": 343.665, "eval_steps_per_second": 1.343, "step": 916000 }, { "epoch": 3.7319900721975703, "grad_norm": 4.30002498626709, "learning_rate": 0.001982540666851161, "loss": 7.6591, "step": 916100 }, { "epoch": 3.732397450220952, "grad_norm": 6.250236988067627, "learning_rate": 0.001982055582536079, "loss": 7.633, "step": 916200 }, { "epoch": 3.7328048282443334, "grad_norm": 8.453675270080566, "learning_rate": 0.001981570518895786, "loss": 7.6295, "step": 916300 }, { "epoch": 3.7332122062677144, "grad_norm": 4.36338996887207, "learning_rate": 0.0019810854759494596, "loss": 7.6035, "step": 916400 }, { "epoch": 3.733619584291096, "grad_norm": 10.802000999450684, "learning_rate": 0.0019806004537162767, "loss": 7.6431, "step": 916500 }, { "epoch": 3.7340269623144775, "grad_norm": 5.39288330078125, "learning_rate": 0.001980115452215415, "loss": 7.6356, "step": 916600 }, { "epoch": 3.734434340337859, "grad_norm": 13.934906005859375, "learning_rate": 0.001979630471466044, "loss": 7.6191, "step": 916700 }, { "epoch": 3.7348417183612406, "grad_norm": 4.2907233238220215, "learning_rate": 0.0019791455114873398, "loss": 7.6207, "step": 916800 }, { "epoch": 3.7352490963846217, "grad_norm": 3.705734968185425, "learning_rate": 0.001978660572298477, "loss": 7.6012, "step": 916900 }, { "epoch": 3.735656474408003, "grad_norm": 3.0304861068725586, "learning_rate": 0.001978175653918627, "loss": 7.6497, "step": 917000 }, { "epoch": 3.735656474408003, "eval_MaskedAccuracy": 0.5084901686969557, "eval_loss": 1.614431619644165, "eval_runtime": 174.333, "eval_samples_per_second": 364.108, "eval_steps_per_second": 1.423, "step": 917000 }, { "epoch": 3.7360638524313847, "grad_norm": 6.816333293914795, "learning_rate": 0.0019776907563669624, "loss": 7.6248, "step": 917100 }, { "epoch": 3.7364712304547663, "grad_norm": 8.567267417907715, "learning_rate": 0.001977205879662651, "loss": 7.6239, "step": 917200 }, { "epoch": 3.736878608478148, "grad_norm": 4.037398815155029, "learning_rate": 0.001976721023824865, "loss": 7.6, "step": 917300 }, { "epoch": 3.7372859865015293, "grad_norm": 3.024914264678955, "learning_rate": 0.00197623618887277, "loss": 7.6173, "step": 917400 }, { "epoch": 3.737693364524911, "grad_norm": 5.302032947540283, "learning_rate": 0.001975751374825533, "loss": 7.6221, "step": 917500 }, { "epoch": 3.7381007425482924, "grad_norm": 5.088874816894531, "learning_rate": 0.0019752665817023307, "loss": 7.6246, "step": 917600 }, { "epoch": 3.7385081205716735, "grad_norm": 2.968637704849243, "learning_rate": 0.001974781809522325, "loss": 7.6405, "step": 917700 }, { "epoch": 3.738915498595055, "grad_norm": 13.242080688476562, "learning_rate": 0.0019742970583046765, "loss": 7.6194, "step": 917800 }, { "epoch": 3.7393228766184365, "grad_norm": 5.979050636291504, "learning_rate": 0.0019738123280685564, "loss": 7.6502, "step": 917900 }, { "epoch": 3.739730254641818, "grad_norm": 2.8038132190704346, "learning_rate": 0.0019733276188331253, "loss": 7.6651, "step": 918000 }, { "epoch": 3.739730254641818, "eval_MaskedAccuracy": 0.5081496086599453, "eval_loss": 1.61286199092865, "eval_runtime": 174.8365, "eval_samples_per_second": 363.059, "eval_steps_per_second": 1.418, "step": 918000 }, { "epoch": 3.740137632665199, "grad_norm": 12.330081939697266, "learning_rate": 0.001972842930617548, "loss": 7.6246, "step": 918100 }, { "epoch": 3.7405450106885807, "grad_norm": 6.4346537590026855, "learning_rate": 0.0019723582634409826, "loss": 7.6453, "step": 918200 }, { "epoch": 3.7409523887119622, "grad_norm": 12.415046691894531, "learning_rate": 0.0019718736173225957, "loss": 7.6195, "step": 918300 }, { "epoch": 3.7413597667353438, "grad_norm": 4.888115882873535, "learning_rate": 0.001971388992281549, "loss": 7.637, "step": 918400 }, { "epoch": 3.7417671447587253, "grad_norm": 7.780838966369629, "learning_rate": 0.001970904388336996, "loss": 7.6309, "step": 918500 }, { "epoch": 3.742174522782107, "grad_norm": 11.20776653289795, "learning_rate": 0.0019704198055080988, "loss": 7.6622, "step": 918600 }, { "epoch": 3.7425819008054884, "grad_norm": 9.625692367553711, "learning_rate": 0.001969935243814019, "loss": 7.6533, "step": 918700 }, { "epoch": 3.74298927882887, "grad_norm": 7.409010410308838, "learning_rate": 0.001969450703273912, "loss": 7.5809, "step": 918800 }, { "epoch": 3.743396656852251, "grad_norm": 6.391397476196289, "learning_rate": 0.0019689661839069296, "loss": 7.6396, "step": 918900 }, { "epoch": 3.7438040348756325, "grad_norm": 5.268857479095459, "learning_rate": 0.0019684816857322295, "loss": 7.6487, "step": 919000 }, { "epoch": 3.7438040348756325, "eval_MaskedAccuracy": 0.5092185881805066, "eval_loss": 1.6022710800170898, "eval_runtime": 167.9693, "eval_samples_per_second": 377.902, "eval_steps_per_second": 1.476, "step": 919000 }, { "epoch": 3.744211412899014, "grad_norm": 5.5644659996032715, "learning_rate": 0.00196799720876897, "loss": 7.6263, "step": 919100 }, { "epoch": 3.7446187909223956, "grad_norm": 5.995559215545654, "learning_rate": 0.001967512753036302, "loss": 7.649, "step": 919200 }, { "epoch": 3.745026168945777, "grad_norm": 8.925673484802246, "learning_rate": 0.0019670283185533794, "loss": 7.6201, "step": 919300 }, { "epoch": 3.745433546969158, "grad_norm": 4.775676250457764, "learning_rate": 0.0019665439053393567, "loss": 7.6222, "step": 919400 }, { "epoch": 3.7458409249925397, "grad_norm": 9.755720138549805, "learning_rate": 0.0019660595134133813, "loss": 7.6237, "step": 919500 }, { "epoch": 3.7462483030159213, "grad_norm": 3.6146299839019775, "learning_rate": 0.0019655751427946062, "loss": 7.5949, "step": 919600 }, { "epoch": 3.746655681039303, "grad_norm": 6.875545978546143, "learning_rate": 0.0019650907935021803, "loss": 7.6283, "step": 919700 }, { "epoch": 3.7470630590626843, "grad_norm": 5.4195356369018555, "learning_rate": 0.0019646064655552543, "loss": 7.6282, "step": 919800 }, { "epoch": 3.747470437086066, "grad_norm": 16.771759033203125, "learning_rate": 0.001964122158972976, "loss": 7.6528, "step": 919900 }, { "epoch": 3.7478778151094474, "grad_norm": 11.797569274902344, "learning_rate": 0.00196363787377449, "loss": 7.6391, "step": 920000 }, { "epoch": 3.7478778151094474, "eval_MaskedAccuracy": 0.5082259376828405, "eval_loss": 1.6109910011291504, "eval_runtime": 176.7264, "eval_samples_per_second": 359.177, "eval_steps_per_second": 1.403, "step": 920000 }, { "epoch": 3.748285193132829, "grad_norm": 4.627211570739746, "learning_rate": 0.001963153609978944, "loss": 7.5959, "step": 920100 }, { "epoch": 3.74869257115621, "grad_norm": 4.405156135559082, "learning_rate": 0.001962669367605476, "loss": 7.6312, "step": 920200 }, { "epoch": 3.7490999491795916, "grad_norm": 6.155109882354736, "learning_rate": 0.00196218514667323, "loss": 7.6155, "step": 920300 }, { "epoch": 3.749507327202973, "grad_norm": 8.350790023803711, "learning_rate": 0.001961700947201364, "loss": 7.6211, "step": 920400 }, { "epoch": 3.7499147052263546, "grad_norm": 4.750356197357178, "learning_rate": 0.0019612167692090164, "loss": 7.6295, "step": 920500 }, { "epoch": 3.7503220832497357, "grad_norm": 4.070127487182617, "learning_rate": 0.0019607326127153266, "loss": 7.59, "step": 920600 }, { "epoch": 3.7507294612731172, "grad_norm": 6.613310813903809, "learning_rate": 0.0019602484777394382, "loss": 7.6204, "step": 920700 }, { "epoch": 3.7511368392964988, "grad_norm": 4.784940242767334, "learning_rate": 0.001959764364300491, "loss": 7.6265, "step": 920800 }, { "epoch": 3.7515442173198803, "grad_norm": 4.124524116516113, "learning_rate": 0.001959280272417624, "loss": 7.5986, "step": 920900 }, { "epoch": 3.751951595343262, "grad_norm": 6.608432292938232, "learning_rate": 0.0019587962021099764, "loss": 7.6198, "step": 921000 }, { "epoch": 3.751951595343262, "eval_MaskedAccuracy": 0.5098709177786279, "eval_loss": 1.603236436843872, "eval_runtime": 207.7534, "eval_samples_per_second": 305.535, "eval_steps_per_second": 1.194, "step": 921000 }, { "epoch": 3.7523589733666434, "grad_norm": 15.3954439163208, "learning_rate": 0.001958312153396688, "loss": 7.6055, "step": 921100 }, { "epoch": 3.752766351390025, "grad_norm": 8.494688987731934, "learning_rate": 0.001957828126296891, "loss": 7.6394, "step": 921200 }, { "epoch": 3.7531737294134064, "grad_norm": 6.292644023895264, "learning_rate": 0.001957344120829729, "loss": 7.6064, "step": 921300 }, { "epoch": 3.7535811074367875, "grad_norm": 7.271989345550537, "learning_rate": 0.001956860137014331, "loss": 7.5715, "step": 921400 }, { "epoch": 3.753988485460169, "grad_norm": 5.526138782501221, "learning_rate": 0.0019563761748698323, "loss": 7.6033, "step": 921500 }, { "epoch": 3.7543958634835506, "grad_norm": 4.571513652801514, "learning_rate": 0.001955892234415367, "loss": 7.6333, "step": 921600 }, { "epoch": 3.754803241506932, "grad_norm": 4.379602432250977, "learning_rate": 0.00195540831567007, "loss": 7.6146, "step": 921700 }, { "epoch": 3.7552106195303137, "grad_norm": 5.573449611663818, "learning_rate": 0.0019549244186530712, "loss": 7.6089, "step": 921800 }, { "epoch": 3.7556179975536947, "grad_norm": 4.742440223693848, "learning_rate": 0.0019544405433835014, "loss": 7.6135, "step": 921900 }, { "epoch": 3.7560253755770763, "grad_norm": 7.930488109588623, "learning_rate": 0.001953956689880494, "loss": 7.6125, "step": 922000 }, { "epoch": 3.7560253755770763, "eval_MaskedAccuracy": 0.508604559251422, "eval_loss": 1.6128079891204834, "eval_runtime": 178.4683, "eval_samples_per_second": 355.671, "eval_steps_per_second": 1.39, "step": 922000 }, { "epoch": 3.756432753600458, "grad_norm": 5.972048759460449, "learning_rate": 0.001953472858163176, "loss": 7.6117, "step": 922100 }, { "epoch": 3.7568401316238393, "grad_norm": 12.296833038330078, "learning_rate": 0.0019529890482506767, "loss": 7.614, "step": 922200 }, { "epoch": 3.757247509647221, "grad_norm": 14.029672622680664, "learning_rate": 0.001952505260162131, "loss": 7.5928, "step": 922300 }, { "epoch": 3.7576548876706024, "grad_norm": 5.605931758880615, "learning_rate": 0.0019520214939166565, "loss": 7.6218, "step": 922400 }, { "epoch": 3.758062265693984, "grad_norm": 3.330333709716797, "learning_rate": 0.0019515377495333808, "loss": 7.5931, "step": 922500 }, { "epoch": 3.7584696437173655, "grad_norm": 13.505358695983887, "learning_rate": 0.0019510540270314277, "loss": 7.6312, "step": 922600 }, { "epoch": 3.7588770217407466, "grad_norm": 6.706369876861572, "learning_rate": 0.0019505703264299232, "loss": 7.6074, "step": 922700 }, { "epoch": 3.759284399764128, "grad_norm": 5.232789039611816, "learning_rate": 0.0019500866477479882, "loss": 7.5991, "step": 922800 }, { "epoch": 3.7596917777875096, "grad_norm": 7.2715044021606445, "learning_rate": 0.0019496029910047465, "loss": 7.6205, "step": 922900 }, { "epoch": 3.760099155810891, "grad_norm": 12.410377502441406, "learning_rate": 0.001949119356219318, "loss": 7.6358, "step": 923000 }, { "epoch": 3.760099155810891, "eval_MaskedAccuracy": 0.5088255505899758, "eval_loss": 1.6051766872406006, "eval_runtime": 167.2217, "eval_samples_per_second": 379.592, "eval_steps_per_second": 1.483, "step": 923000 }, { "epoch": 3.7605065338342722, "grad_norm": 3.711641788482666, "learning_rate": 0.0019486357434108253, "loss": 7.6025, "step": 923100 }, { "epoch": 3.760913911857654, "grad_norm": 8.183929443359375, "learning_rate": 0.001948152152598386, "loss": 7.6324, "step": 923200 }, { "epoch": 3.7613212898810353, "grad_norm": 7.197895526885986, "learning_rate": 0.0019476685838011201, "loss": 7.6022, "step": 923300 }, { "epoch": 3.761728667904417, "grad_norm": 10.465385437011719, "learning_rate": 0.001947185037038145, "loss": 7.6294, "step": 923400 }, { "epoch": 3.7621360459277984, "grad_norm": 3.7687559127807617, "learning_rate": 0.0019467015123285816, "loss": 7.6236, "step": 923500 }, { "epoch": 3.76254342395118, "grad_norm": 3.5964677333831787, "learning_rate": 0.0019462180096915425, "loss": 7.6261, "step": 923600 }, { "epoch": 3.7629508019745614, "grad_norm": 7.2088117599487305, "learning_rate": 0.0019457345291461424, "loss": 7.6201, "step": 923700 }, { "epoch": 3.763358179997943, "grad_norm": 2.728327512741089, "learning_rate": 0.0019452510707114983, "loss": 7.6378, "step": 923800 }, { "epoch": 3.763765558021324, "grad_norm": 4.735380172729492, "learning_rate": 0.0019447676344067238, "loss": 7.6563, "step": 923900 }, { "epoch": 3.7641729360447056, "grad_norm": 8.870068550109863, "learning_rate": 0.0019442842202509302, "loss": 7.6321, "step": 924000 }, { "epoch": 3.7641729360447056, "eval_MaskedAccuracy": 0.5082377429610162, "eval_loss": 1.6108441352844238, "eval_runtime": 169.2683, "eval_samples_per_second": 375.002, "eval_steps_per_second": 1.465, "step": 924000 }, { "epoch": 3.764580314068087, "grad_norm": 6.801137924194336, "learning_rate": 0.0019438008282632287, "loss": 7.6365, "step": 924100 }, { "epoch": 3.7649876920914687, "grad_norm": 11.852954864501953, "learning_rate": 0.0019433174584627332, "loss": 7.6249, "step": 924200 }, { "epoch": 3.76539507011485, "grad_norm": 4.879085063934326, "learning_rate": 0.0019428341108685529, "loss": 7.6219, "step": 924300 }, { "epoch": 3.7658024481382313, "grad_norm": 8.953462600708008, "learning_rate": 0.0019423507854997965, "loss": 7.6171, "step": 924400 }, { "epoch": 3.766209826161613, "grad_norm": 6.114043712615967, "learning_rate": 0.0019418674823755747, "loss": 7.6225, "step": 924500 }, { "epoch": 3.7666172041849943, "grad_norm": 4.322595119476318, "learning_rate": 0.001941384201514992, "loss": 7.6156, "step": 924600 }, { "epoch": 3.767024582208376, "grad_norm": 9.810853958129883, "learning_rate": 0.0019409009429371554, "loss": 7.6142, "step": 924700 }, { "epoch": 3.7674319602317574, "grad_norm": 4.716188907623291, "learning_rate": 0.0019404177066611708, "loss": 7.6106, "step": 924800 }, { "epoch": 3.767839338255139, "grad_norm": 5.210825443267822, "learning_rate": 0.001939934492706145, "loss": 7.6208, "step": 924900 }, { "epoch": 3.7682467162785205, "grad_norm": 6.435670375823975, "learning_rate": 0.0019394513010911813, "loss": 7.6332, "step": 925000 }, { "epoch": 3.7682467162785205, "eval_MaskedAccuracy": 0.5089074662516333, "eval_loss": 1.6012661457061768, "eval_runtime": 172.1616, "eval_samples_per_second": 368.7, "eval_steps_per_second": 1.441, "step": 925000 }, { "epoch": 3.768654094301902, "grad_norm": 3.518427848815918, "learning_rate": 0.0019389681318353832, "loss": 7.6217, "step": 925100 }, { "epoch": 3.769061472325283, "grad_norm": 4.321313858032227, "learning_rate": 0.0019384849849578495, "loss": 7.6026, "step": 925200 }, { "epoch": 3.7694688503486646, "grad_norm": 5.672753810882568, "learning_rate": 0.0019380018604776858, "loss": 7.6097, "step": 925300 }, { "epoch": 3.769876228372046, "grad_norm": 7.020403861999512, "learning_rate": 0.001937518758413992, "loss": 7.6288, "step": 925400 }, { "epoch": 3.7702836063954277, "grad_norm": 14.594400405883789, "learning_rate": 0.0019370356787858662, "loss": 7.6228, "step": 925500 }, { "epoch": 3.770690984418809, "grad_norm": 5.069800853729248, "learning_rate": 0.0019365526216124085, "loss": 7.6339, "step": 925600 }, { "epoch": 3.7710983624421903, "grad_norm": 3.8607585430145264, "learning_rate": 0.0019360695869127139, "loss": 7.5989, "step": 925700 }, { "epoch": 3.771505740465572, "grad_norm": 4.882491588592529, "learning_rate": 0.0019355865747058842, "loss": 7.6535, "step": 925800 }, { "epoch": 3.7719131184889534, "grad_norm": 5.538855075836182, "learning_rate": 0.0019351035850110126, "loss": 7.6216, "step": 925900 }, { "epoch": 3.772320496512335, "grad_norm": 18.590662002563477, "learning_rate": 0.001934620617847193, "loss": 7.6136, "step": 926000 }, { "epoch": 3.772320496512335, "eval_MaskedAccuracy": 0.5085267091249369, "eval_loss": 1.6080429553985596, "eval_runtime": 173.6734, "eval_samples_per_second": 365.491, "eval_steps_per_second": 1.428, "step": 926000 }, { "epoch": 3.7727278745357165, "grad_norm": 4.088473796844482, "learning_rate": 0.0019341376732335217, "loss": 7.614, "step": 926100 }, { "epoch": 3.773135252559098, "grad_norm": 8.415093421936035, "learning_rate": 0.0019336547511890896, "loss": 7.6151, "step": 926200 }, { "epoch": 3.7735426305824795, "grad_norm": 5.7778120040893555, "learning_rate": 0.001933171851732991, "loss": 7.5784, "step": 926300 }, { "epoch": 3.7739500086058606, "grad_norm": 10.873912811279297, "learning_rate": 0.0019326889748843164, "loss": 7.6529, "step": 926400 }, { "epoch": 3.774357386629242, "grad_norm": 10.82746696472168, "learning_rate": 0.00193220612066216, "loss": 7.6368, "step": 926500 }, { "epoch": 3.7747647646526237, "grad_norm": 6.487828731536865, "learning_rate": 0.0019317232890856083, "loss": 7.6463, "step": 926600 }, { "epoch": 3.775172142676005, "grad_norm": 3.0424578189849854, "learning_rate": 0.0019312404801737518, "loss": 7.6184, "step": 926700 }, { "epoch": 3.7755795206993867, "grad_norm": 3.9002697467803955, "learning_rate": 0.001930757693945678, "loss": 7.6292, "step": 926800 }, { "epoch": 3.775986898722768, "grad_norm": 7.217895984649658, "learning_rate": 0.0019302749304204731, "loss": 7.6211, "step": 926900 }, { "epoch": 3.7763942767461494, "grad_norm": 9.165586471557617, "learning_rate": 0.0019297921896172245, "loss": 7.6304, "step": 927000 }, { "epoch": 3.7763942767461494, "eval_MaskedAccuracy": 0.5088129069204577, "eval_loss": 1.6102675199508667, "eval_runtime": 184.8197, "eval_samples_per_second": 343.448, "eval_steps_per_second": 1.342, "step": 927000 }, { "epoch": 3.776801654769531, "grad_norm": 7.244925022125244, "learning_rate": 0.0019293094715550159, "loss": 7.6093, "step": 927100 }, { "epoch": 3.7772090327929124, "grad_norm": 10.581029891967773, "learning_rate": 0.0019288267762529328, "loss": 7.612, "step": 927200 }, { "epoch": 3.777616410816294, "grad_norm": 3.60493540763855, "learning_rate": 0.0019283441037300602, "loss": 7.6197, "step": 927300 }, { "epoch": 3.7780237888396755, "grad_norm": 10.525854110717773, "learning_rate": 0.001927861454005478, "loss": 7.6008, "step": 927400 }, { "epoch": 3.778431166863057, "grad_norm": 11.118393898010254, "learning_rate": 0.0019273788270982703, "loss": 7.631, "step": 927500 }, { "epoch": 3.7788385448864386, "grad_norm": 4.733307361602783, "learning_rate": 0.001926896223027518, "loss": 7.6265, "step": 927600 }, { "epoch": 3.7792459229098196, "grad_norm": 3.1409990787506104, "learning_rate": 0.0019264136418122996, "loss": 7.6193, "step": 927700 }, { "epoch": 3.779653300933201, "grad_norm": 7.09230899810791, "learning_rate": 0.001925931083471695, "loss": 7.6131, "step": 927800 }, { "epoch": 3.7800606789565827, "grad_norm": 3.0488314628601074, "learning_rate": 0.0019254485480247794, "loss": 7.6366, "step": 927900 }, { "epoch": 3.7804680569799642, "grad_norm": 5.295105457305908, "learning_rate": 0.0019249660354906334, "loss": 7.6328, "step": 928000 }, { "epoch": 3.7804680569799642, "eval_MaskedAccuracy": 0.5088230856977412, "eval_loss": 1.6120420694351196, "eval_runtime": 177.4072, "eval_samples_per_second": 357.798, "eval_steps_per_second": 1.398, "step": 928000 }, { "epoch": 3.7808754350033453, "grad_norm": 4.067548751831055, "learning_rate": 0.0019244835458883335, "loss": 7.5919, "step": 928100 }, { "epoch": 3.781282813026727, "grad_norm": 6.178030014038086, "learning_rate": 0.001924001079236952, "loss": 7.6033, "step": 928200 }, { "epoch": 3.7816901910501084, "grad_norm": 4.132225036621094, "learning_rate": 0.0019235186355555647, "loss": 7.6217, "step": 928300 }, { "epoch": 3.78209756907349, "grad_norm": 3.7145018577575684, "learning_rate": 0.0019230362148632463, "loss": 7.6341, "step": 928400 }, { "epoch": 3.7825049470968715, "grad_norm": 6.242832660675049, "learning_rate": 0.001922553817179067, "loss": 7.6216, "step": 928500 }, { "epoch": 3.782912325120253, "grad_norm": 10.651412963867188, "learning_rate": 0.0019220714425221036, "loss": 7.6105, "step": 928600 }, { "epoch": 3.7833197031436345, "grad_norm": 3.809922695159912, "learning_rate": 0.0019215890909114223, "loss": 7.6333, "step": 928700 }, { "epoch": 3.783727081167016, "grad_norm": 4.2344255447387695, "learning_rate": 0.001921106762366095, "loss": 7.6358, "step": 928800 }, { "epoch": 3.784134459190397, "grad_norm": 5.805576324462891, "learning_rate": 0.00192062445690519, "loss": 7.6337, "step": 928900 }, { "epoch": 3.7845418372137787, "grad_norm": 3.8189687728881836, "learning_rate": 0.0019201421745477757, "loss": 7.63, "step": 929000 }, { "epoch": 3.7845418372137787, "eval_MaskedAccuracy": 0.5085324869720714, "eval_loss": 1.6022095680236816, "eval_runtime": 218.4841, "eval_samples_per_second": 290.529, "eval_steps_per_second": 1.135, "step": 929000 }, { "epoch": 3.78494921523716, "grad_norm": 4.850886344909668, "learning_rate": 0.0019196599153129196, "loss": 7.6288, "step": 929100 }, { "epoch": 3.7853565932605417, "grad_norm": 4.773021697998047, "learning_rate": 0.0019191776792196885, "loss": 7.6495, "step": 929200 }, { "epoch": 3.7857639712839233, "grad_norm": 9.408202171325684, "learning_rate": 0.0019186954662871458, "loss": 7.6224, "step": 929300 }, { "epoch": 3.7861713493073044, "grad_norm": 9.539839744567871, "learning_rate": 0.0019182132765343577, "loss": 7.6402, "step": 929400 }, { "epoch": 3.786578727330686, "grad_norm": 7.151216983795166, "learning_rate": 0.0019177311099803872, "loss": 7.601, "step": 929500 }, { "epoch": 3.7869861053540674, "grad_norm": 5.781540870666504, "learning_rate": 0.0019172489666442968, "loss": 7.6295, "step": 929600 }, { "epoch": 3.787393483377449, "grad_norm": 8.596144676208496, "learning_rate": 0.0019167668465451477, "loss": 7.6083, "step": 929700 }, { "epoch": 3.7878008614008305, "grad_norm": 3.7301230430603027, "learning_rate": 0.0019162847497020026, "loss": 7.6516, "step": 929800 }, { "epoch": 3.788208239424212, "grad_norm": 10.695462226867676, "learning_rate": 0.0019158026761339204, "loss": 7.5908, "step": 929900 }, { "epoch": 3.7886156174475936, "grad_norm": 5.438191890716553, "learning_rate": 0.001915320625859957, "loss": 7.6035, "step": 930000 }, { "epoch": 3.7886156174475936, "eval_MaskedAccuracy": 0.5088793106898304, "eval_loss": 1.6012581586837769, "eval_runtime": 202.1123, "eval_samples_per_second": 314.063, "eval_steps_per_second": 1.227, "step": 930000 }, { "epoch": 3.789022995470975, "grad_norm": 8.394279479980469, "learning_rate": 0.0019148385988991758, "loss": 7.6457, "step": 930100 }, { "epoch": 3.789430373494356, "grad_norm": 3.487502336502075, "learning_rate": 0.001914356595270628, "loss": 7.6192, "step": 930200 }, { "epoch": 3.7898377515177377, "grad_norm": 8.857083320617676, "learning_rate": 0.0019138746149933753, "loss": 7.601, "step": 930300 }, { "epoch": 3.7902451295411193, "grad_norm": 6.209765911102295, "learning_rate": 0.0019133926580864706, "loss": 7.6184, "step": 930400 }, { "epoch": 3.790652507564501, "grad_norm": 12.463996887207031, "learning_rate": 0.0019129107245689683, "loss": 7.6209, "step": 930500 }, { "epoch": 3.791059885587882, "grad_norm": 9.588804244995117, "learning_rate": 0.001912428814459921, "loss": 7.5902, "step": 930600 }, { "epoch": 3.7914672636112634, "grad_norm": 9.467856407165527, "learning_rate": 0.001911946927778382, "loss": 7.6354, "step": 930700 }, { "epoch": 3.791874641634645, "grad_norm": 9.193414688110352, "learning_rate": 0.0019114650645434032, "loss": 7.6607, "step": 930800 }, { "epoch": 3.7922820196580265, "grad_norm": 10.822127342224121, "learning_rate": 0.001910983224774035, "loss": 7.6242, "step": 930900 }, { "epoch": 3.792689397681408, "grad_norm": 5.29072380065918, "learning_rate": 0.001910501408489326, "loss": 7.6329, "step": 931000 }, { "epoch": 3.792689397681408, "eval_MaskedAccuracy": 0.5088497738630215, "eval_loss": 1.6049249172210693, "eval_runtime": 173.2161, "eval_samples_per_second": 366.456, "eval_steps_per_second": 1.432, "step": 931000 }, { "epoch": 3.7930967757047895, "grad_norm": 6.270878314971924, "learning_rate": 0.0019100196157083252, "loss": 7.6315, "step": 931100 }, { "epoch": 3.793504153728171, "grad_norm": 8.242561340332031, "learning_rate": 0.0019095378464500808, "loss": 7.6053, "step": 931200 }, { "epoch": 3.7939115317515526, "grad_norm": 3.960211992263794, "learning_rate": 0.001909056100733643, "loss": 7.6133, "step": 931300 }, { "epoch": 3.7943189097749337, "grad_norm": 6.9959235191345215, "learning_rate": 0.0019085743785780516, "loss": 7.6454, "step": 931400 }, { "epoch": 3.7947262877983152, "grad_norm": 4.7930192947387695, "learning_rate": 0.0019080926800023575, "loss": 7.6258, "step": 931500 }, { "epoch": 3.7951336658216968, "grad_norm": 5.788893699645996, "learning_rate": 0.0019076110050256006, "loss": 7.6056, "step": 931600 }, { "epoch": 3.7955410438450783, "grad_norm": 10.426204681396484, "learning_rate": 0.0019071293536668263, "loss": 7.6422, "step": 931700 }, { "epoch": 3.79594842186846, "grad_norm": 7.6468682289123535, "learning_rate": 0.001906647725945077, "loss": 7.6086, "step": 931800 }, { "epoch": 3.796355799891841, "grad_norm": 4.9427266120910645, "learning_rate": 0.0019061661218793935, "loss": 7.6002, "step": 931900 }, { "epoch": 3.7967631779152224, "grad_norm": 9.223947525024414, "learning_rate": 0.0019056845414888162, "loss": 7.63, "step": 932000 }, { "epoch": 3.7967631779152224, "eval_MaskedAccuracy": 0.5096634473074941, "eval_loss": 1.6023601293563843, "eval_runtime": 183.5559, "eval_samples_per_second": 345.813, "eval_steps_per_second": 1.351, "step": 932000 }, { "epoch": 3.797170555938604, "grad_norm": 6.308687210083008, "learning_rate": 0.0019052029847923845, "loss": 7.5935, "step": 932100 }, { "epoch": 3.7975779339619855, "grad_norm": 8.173504829406738, "learning_rate": 0.001904721451809135, "loss": 7.6145, "step": 932200 }, { "epoch": 3.797985311985367, "grad_norm": 13.783915519714355, "learning_rate": 0.0019042399425581089, "loss": 7.5895, "step": 932300 }, { "epoch": 3.7983926900087486, "grad_norm": 6.4456987380981445, "learning_rate": 0.0019037584570583406, "loss": 7.6284, "step": 932400 }, { "epoch": 3.79880006803213, "grad_norm": 3.5840206146240234, "learning_rate": 0.0019032769953288655, "loss": 7.6604, "step": 932500 }, { "epoch": 3.7992074460555116, "grad_norm": 11.0537109375, "learning_rate": 0.00190279555738872, "loss": 7.6327, "step": 932600 }, { "epoch": 3.7996148240788927, "grad_norm": 4.1343913078308105, "learning_rate": 0.0019023141432569365, "loss": 7.6355, "step": 932700 }, { "epoch": 3.8000222021022743, "grad_norm": 8.910087585449219, "learning_rate": 0.0019018327529525486, "loss": 7.5976, "step": 932800 }, { "epoch": 3.800429580125656, "grad_norm": 4.534204959869385, "learning_rate": 0.0019013513864945871, "loss": 7.6299, "step": 932900 }, { "epoch": 3.8008369581490373, "grad_norm": 3.7702343463897705, "learning_rate": 0.001900870043902081, "loss": 7.6218, "step": 933000 }, { "epoch": 3.8008369581490373, "eval_MaskedAccuracy": 0.5087608229897715, "eval_loss": 1.6011755466461182, "eval_runtime": 168.4595, "eval_samples_per_second": 376.803, "eval_steps_per_second": 1.472, "step": 933000 }, { "epoch": 3.8012443361724184, "grad_norm": 20.06410789489746, "learning_rate": 0.0019003887251940656, "loss": 7.6295, "step": 933100 }, { "epoch": 3.8016517141958, "grad_norm": 7.299319267272949, "learning_rate": 0.001899907430389567, "loss": 7.6214, "step": 933200 }, { "epoch": 3.8020590922191815, "grad_norm": 9.940072059631348, "learning_rate": 0.0018994261595076155, "loss": 7.6014, "step": 933300 }, { "epoch": 3.802466470242563, "grad_norm": 9.4749755859375, "learning_rate": 0.001898944912567237, "loss": 7.6262, "step": 933400 }, { "epoch": 3.8028738482659445, "grad_norm": 8.810791969299316, "learning_rate": 0.0018984636895874555, "loss": 7.6003, "step": 933500 }, { "epoch": 3.803281226289326, "grad_norm": 5.451295852661133, "learning_rate": 0.0018979824905872994, "loss": 7.6203, "step": 933600 }, { "epoch": 3.8036886043127076, "grad_norm": 9.41309642791748, "learning_rate": 0.0018975013155857938, "loss": 7.628, "step": 933700 }, { "epoch": 3.804095982336089, "grad_norm": 3.6500532627105713, "learning_rate": 0.0018970201646019583, "loss": 7.6062, "step": 933800 }, { "epoch": 3.8045033603594702, "grad_norm": 3.9726932048797607, "learning_rate": 0.0018965390376548175, "loss": 7.6384, "step": 933900 }, { "epoch": 3.8049107383828518, "grad_norm": 18.39219856262207, "learning_rate": 0.0018960579347633922, "loss": 7.6166, "step": 934000 }, { "epoch": 3.8049107383828518, "eval_MaskedAccuracy": 0.5078392971486618, "eval_loss": 1.615269660949707, "eval_runtime": 165.284, "eval_samples_per_second": 384.042, "eval_steps_per_second": 1.5, "step": 934000 }, { "epoch": 3.8053181164062333, "grad_norm": 7.437594890594482, "learning_rate": 0.001895576855946705, "loss": 7.6228, "step": 934100 }, { "epoch": 3.805725494429615, "grad_norm": 5.421675205230713, "learning_rate": 0.001895095801223774, "loss": 7.6216, "step": 934200 }, { "epoch": 3.8061328724529964, "grad_norm": 4.602786540985107, "learning_rate": 0.001894614770613619, "loss": 7.6374, "step": 934300 }, { "epoch": 3.8065402504763775, "grad_norm": 3.846876621246338, "learning_rate": 0.001894133764135255, "loss": 7.5964, "step": 934400 }, { "epoch": 3.806947628499759, "grad_norm": 3.219874143600464, "learning_rate": 0.0018936527818077004, "loss": 7.5776, "step": 934500 }, { "epoch": 3.8073550065231405, "grad_norm": 6.049060344696045, "learning_rate": 0.0018931718236499724, "loss": 7.6029, "step": 934600 }, { "epoch": 3.807762384546522, "grad_norm": 8.037505149841309, "learning_rate": 0.0018926908896810847, "loss": 7.5972, "step": 934700 }, { "epoch": 3.8081697625699036, "grad_norm": 6.002288341522217, "learning_rate": 0.0018922099799200507, "loss": 7.6172, "step": 934800 }, { "epoch": 3.808577140593285, "grad_norm": 8.529500961303711, "learning_rate": 0.0018917290943858815, "loss": 7.648, "step": 934900 }, { "epoch": 3.8089845186166666, "grad_norm": 4.888249397277832, "learning_rate": 0.0018912482330975921, "loss": 7.6155, "step": 935000 }, { "epoch": 3.8089845186166666, "eval_MaskedAccuracy": 0.5092839951477184, "eval_loss": 1.6106300354003906, "eval_runtime": 175.5796, "eval_samples_per_second": 361.523, "eval_steps_per_second": 1.412, "step": 935000 }, { "epoch": 3.809391896640048, "grad_norm": 12.686517715454102, "learning_rate": 0.0018907673960741923, "loss": 7.6283, "step": 935100 }, { "epoch": 3.8097992746634293, "grad_norm": 5.5246734619140625, "learning_rate": 0.001890286583334692, "loss": 7.6263, "step": 935200 }, { "epoch": 3.810206652686811, "grad_norm": 11.302197456359863, "learning_rate": 0.0018898057948981016, "loss": 7.626, "step": 935300 }, { "epoch": 3.8106140307101923, "grad_norm": 10.458709716796875, "learning_rate": 0.001889325030783428, "loss": 7.6231, "step": 935400 }, { "epoch": 3.811021408733574, "grad_norm": 5.100559711456299, "learning_rate": 0.0018888442910096797, "loss": 7.5966, "step": 935500 }, { "epoch": 3.811428786756955, "grad_norm": 5.678092956542969, "learning_rate": 0.0018883635755958617, "loss": 7.5932, "step": 935600 }, { "epoch": 3.8118361647803365, "grad_norm": 10.808277130126953, "learning_rate": 0.0018878828845609794, "loss": 7.5963, "step": 935700 }, { "epoch": 3.812243542803718, "grad_norm": 6.407227993011475, "learning_rate": 0.0018874022179240372, "loss": 7.5891, "step": 935800 }, { "epoch": 3.8126509208270996, "grad_norm": 5.734281063079834, "learning_rate": 0.0018869215757040378, "loss": 7.6034, "step": 935900 }, { "epoch": 3.813058298850481, "grad_norm": 4.35515022277832, "learning_rate": 0.001886440957919985, "loss": 7.613, "step": 936000 }, { "epoch": 3.813058298850481, "eval_MaskedAccuracy": 0.5086288339335939, "eval_loss": 1.6063237190246582, "eval_runtime": 168.5456, "eval_samples_per_second": 376.61, "eval_steps_per_second": 1.471, "step": 936000 }, { "epoch": 3.8134656768738626, "grad_norm": 3.6834418773651123, "learning_rate": 0.0018859603645908793, "loss": 7.6224, "step": 936100 }, { "epoch": 3.813873054897244, "grad_norm": 12.3207368850708, "learning_rate": 0.001885479795735722, "loss": 7.634, "step": 936200 }, { "epoch": 3.8142804329206257, "grad_norm": 10.722724914550781, "learning_rate": 0.0018849992513735108, "loss": 7.6177, "step": 936300 }, { "epoch": 3.8146878109440068, "grad_norm": 3.2028284072875977, "learning_rate": 0.001884518731523246, "loss": 7.6264, "step": 936400 }, { "epoch": 3.8150951889673883, "grad_norm": 7.487059116363525, "learning_rate": 0.0018840382362039253, "loss": 7.6002, "step": 936500 }, { "epoch": 3.81550256699077, "grad_norm": 6.927460193634033, "learning_rate": 0.0018835577654345444, "loss": 7.6204, "step": 936600 }, { "epoch": 3.8159099450141514, "grad_norm": 5.701291561126709, "learning_rate": 0.0018830773192340979, "loss": 7.613, "step": 936700 }, { "epoch": 3.816317323037533, "grad_norm": 4.743677139282227, "learning_rate": 0.00188259689762158, "loss": 7.6305, "step": 936800 }, { "epoch": 3.816724701060914, "grad_norm": 6.083077430725098, "learning_rate": 0.0018821165006159862, "loss": 7.6235, "step": 936900 }, { "epoch": 3.8171320790842955, "grad_norm": 7.612229347229004, "learning_rate": 0.0018816361282363078, "loss": 7.6209, "step": 937000 }, { "epoch": 3.8171320790842955, "eval_MaskedAccuracy": 0.5091577492180929, "eval_loss": 1.593906283378601, "eval_runtime": 185.9105, "eval_samples_per_second": 341.433, "eval_steps_per_second": 1.334, "step": 937000 }, { "epoch": 3.817539457107677, "grad_norm": 6.0063796043396, "learning_rate": 0.001881155780501538, "loss": 7.5972, "step": 937100 }, { "epoch": 3.8179468351310586, "grad_norm": 3.575380802154541, "learning_rate": 0.0018806754574306674, "loss": 7.6217, "step": 937200 }, { "epoch": 3.81835421315444, "grad_norm": 3.4223945140838623, "learning_rate": 0.0018801951590426843, "loss": 7.6167, "step": 937300 }, { "epoch": 3.8187615911778217, "grad_norm": 8.924440383911133, "learning_rate": 0.0018797148853565793, "loss": 7.6246, "step": 937400 }, { "epoch": 3.819168969201203, "grad_norm": 6.881430625915527, "learning_rate": 0.0018792346363913372, "loss": 7.6216, "step": 937500 }, { "epoch": 3.8195763472245847, "grad_norm": 5.718722343444824, "learning_rate": 0.001878754412165949, "loss": 7.5987, "step": 937600 }, { "epoch": 3.819983725247966, "grad_norm": 14.568428993225098, "learning_rate": 0.0018782742126993966, "loss": 7.6669, "step": 937700 }, { "epoch": 3.8203911032713473, "grad_norm": 8.684883117675781, "learning_rate": 0.0018777940380106677, "loss": 7.6132, "step": 937800 }, { "epoch": 3.820798481294729, "grad_norm": 6.8858561515808105, "learning_rate": 0.0018773138881187448, "loss": 7.6092, "step": 937900 }, { "epoch": 3.8212058593181104, "grad_norm": 8.121291160583496, "learning_rate": 0.0018768337630426127, "loss": 7.5881, "step": 938000 }, { "epoch": 3.8212058593181104, "eval_MaskedAccuracy": 0.5095008639566333, "eval_loss": 1.6043816804885864, "eval_runtime": 176.2299, "eval_samples_per_second": 360.189, "eval_steps_per_second": 1.407, "step": 938000 }, { "epoch": 3.8216132373414915, "grad_norm": 5.040372371673584, "learning_rate": 0.0018763536628012478, "loss": 7.5973, "step": 938100 }, { "epoch": 3.822020615364873, "grad_norm": 4.880376815795898, "learning_rate": 0.0018758735874136376, "loss": 7.5983, "step": 938200 }, { "epoch": 3.8224279933882546, "grad_norm": 7.10554838180542, "learning_rate": 0.0018753935368987585, "loss": 7.617, "step": 938300 }, { "epoch": 3.822835371411636, "grad_norm": 4.163276672363281, "learning_rate": 0.001874913511275591, "loss": 7.6003, "step": 938400 }, { "epoch": 3.8232427494350176, "grad_norm": 10.246548652648926, "learning_rate": 0.0018744335105631128, "loss": 7.5941, "step": 938500 }, { "epoch": 3.823650127458399, "grad_norm": 5.069756984710693, "learning_rate": 0.0018739535347802989, "loss": 7.6244, "step": 938600 }, { "epoch": 3.8240575054817807, "grad_norm": 5.792740345001221, "learning_rate": 0.001873473583946127, "loss": 7.624, "step": 938700 }, { "epoch": 3.8244648835051622, "grad_norm": 3.303527355194092, "learning_rate": 0.0018729936580795734, "loss": 7.5887, "step": 938800 }, { "epoch": 3.8248722615285433, "grad_norm": 4.251362323760986, "learning_rate": 0.0018725137571996093, "loss": 7.6391, "step": 938900 }, { "epoch": 3.825279639551925, "grad_norm": 8.193767547607422, "learning_rate": 0.0018720338813252085, "loss": 7.6252, "step": 939000 }, { "epoch": 3.825279639551925, "eval_MaskedAccuracy": 0.509100322426978, "eval_loss": 1.6003364324569702, "eval_runtime": 176.6721, "eval_samples_per_second": 359.287, "eval_steps_per_second": 1.404, "step": 939000 }, { "epoch": 3.8256870175753064, "grad_norm": 4.346183776855469, "learning_rate": 0.0018715540304753435, "loss": 7.6372, "step": 939100 }, { "epoch": 3.826094395598688, "grad_norm": 5.751767635345459, "learning_rate": 0.0018710742046689859, "loss": 7.6037, "step": 939200 }, { "epoch": 3.8265017736220694, "grad_norm": 6.1118669509887695, "learning_rate": 0.0018705944039251064, "loss": 7.5879, "step": 939300 }, { "epoch": 3.8269091516454505, "grad_norm": 4.788643836975098, "learning_rate": 0.0018701146282626734, "loss": 7.6114, "step": 939400 }, { "epoch": 3.827316529668832, "grad_norm": 8.829459190368652, "learning_rate": 0.0018696348777006541, "loss": 7.6234, "step": 939500 }, { "epoch": 3.8277239076922136, "grad_norm": 4.282959938049316, "learning_rate": 0.0018691551522580156, "loss": 7.6345, "step": 939600 }, { "epoch": 3.828131285715595, "grad_norm": 5.3322367668151855, "learning_rate": 0.001868675451953727, "loss": 7.5969, "step": 939700 }, { "epoch": 3.8285386637389767, "grad_norm": 10.194397926330566, "learning_rate": 0.0018681957768067491, "loss": 7.6247, "step": 939800 }, { "epoch": 3.828946041762358, "grad_norm": 7.6213698387146, "learning_rate": 0.0018677161268360488, "loss": 7.6347, "step": 939900 }, { "epoch": 3.8293534197857397, "grad_norm": 3.7974536418914795, "learning_rate": 0.0018672365020605875, "loss": 7.6137, "step": 940000 }, { "epoch": 3.8293534197857397, "eval_MaskedAccuracy": 0.5102094498043321, "eval_loss": 1.6037489175796509, "eval_runtime": 232.3575, "eval_samples_per_second": 273.183, "eval_steps_per_second": 1.067, "step": 940000 }, { "epoch": 3.8297607978091213, "grad_norm": 9.21261978149414, "learning_rate": 0.0018667569024993279, "loss": 7.6084, "step": 940100 }, { "epoch": 3.8301681758325024, "grad_norm": 3.3972275257110596, "learning_rate": 0.0018662773281712317, "loss": 7.6157, "step": 940200 }, { "epoch": 3.830575553855884, "grad_norm": 9.21441650390625, "learning_rate": 0.0018657977790952607, "loss": 7.6034, "step": 940300 }, { "epoch": 3.8309829318792654, "grad_norm": 4.649506092071533, "learning_rate": 0.0018653182552903706, "loss": 7.6163, "step": 940400 }, { "epoch": 3.831390309902647, "grad_norm": 6.620223045349121, "learning_rate": 0.001864838756775523, "loss": 7.6052, "step": 940500 }, { "epoch": 3.831797687926028, "grad_norm": 11.1571683883667, "learning_rate": 0.0018643592835696726, "loss": 7.6362, "step": 940600 }, { "epoch": 3.8322050659494096, "grad_norm": 4.182370185852051, "learning_rate": 0.0018638798356917763, "loss": 7.6197, "step": 940700 }, { "epoch": 3.832612443972791, "grad_norm": 3.2624902725219727, "learning_rate": 0.0018634004131607887, "loss": 7.6173, "step": 940800 }, { "epoch": 3.8330198219961726, "grad_norm": 6.115269660949707, "learning_rate": 0.0018629210159956644, "loss": 7.609, "step": 940900 }, { "epoch": 3.833427200019554, "grad_norm": 6.403346061706543, "learning_rate": 0.0018624416442153534, "loss": 7.6072, "step": 941000 }, { "epoch": 3.833427200019554, "eval_MaskedAccuracy": 0.5094567683345336, "eval_loss": 1.5979567766189575, "eval_runtime": 185.0722, "eval_samples_per_second": 342.98, "eval_steps_per_second": 1.34, "step": 941000 }, { "epoch": 3.8338345780429357, "grad_norm": 7.553493022918701, "learning_rate": 0.0018619622978388137, "loss": 7.6242, "step": 941100 }, { "epoch": 3.8342419560663172, "grad_norm": 6.307436943054199, "learning_rate": 0.0018614829768849937, "loss": 7.6189, "step": 941200 }, { "epoch": 3.8346493340896988, "grad_norm": 7.843674659729004, "learning_rate": 0.001861003681372844, "loss": 7.5906, "step": 941300 }, { "epoch": 3.83505671211308, "grad_norm": 5.269289016723633, "learning_rate": 0.0018605244113213118, "loss": 7.6151, "step": 941400 }, { "epoch": 3.8354640901364614, "grad_norm": 8.50742244720459, "learning_rate": 0.0018600451667493468, "loss": 7.5925, "step": 941500 }, { "epoch": 3.835871468159843, "grad_norm": 2.3496034145355225, "learning_rate": 0.0018595659476758944, "loss": 7.605, "step": 941600 }, { "epoch": 3.8362788461832245, "grad_norm": 7.684658050537109, "learning_rate": 0.0018590867541199022, "loss": 7.6516, "step": 941700 }, { "epoch": 3.836686224206606, "grad_norm": 9.886786460876465, "learning_rate": 0.001858607586100315, "loss": 7.6137, "step": 941800 }, { "epoch": 3.837093602229987, "grad_norm": 3.5811352729797363, "learning_rate": 0.0018581284436360777, "loss": 7.5893, "step": 941900 }, { "epoch": 3.8375009802533686, "grad_norm": 5.257310390472412, "learning_rate": 0.0018576493267461313, "loss": 7.5956, "step": 942000 }, { "epoch": 3.8375009802533686, "eval_MaskedAccuracy": 0.5094653795671995, "eval_loss": 1.5970203876495361, "eval_runtime": 192.4947, "eval_samples_per_second": 329.755, "eval_steps_per_second": 1.288, "step": 942000 }, { "epoch": 3.83790835827675, "grad_norm": 3.2904205322265625, "learning_rate": 0.0018571702354494217, "loss": 7.6574, "step": 942100 }, { "epoch": 3.8383157363001317, "grad_norm": 4.732083797454834, "learning_rate": 0.0018566911697648872, "loss": 7.6345, "step": 942200 }, { "epoch": 3.838723114323513, "grad_norm": 3.3520989418029785, "learning_rate": 0.0018562121297114693, "loss": 7.6397, "step": 942300 }, { "epoch": 3.8391304923468947, "grad_norm": 8.889657020568848, "learning_rate": 0.001855733115308104, "loss": 7.615, "step": 942400 }, { "epoch": 3.8395378703702763, "grad_norm": 13.190738677978516, "learning_rate": 0.0018552541265737315, "loss": 7.616, "step": 942500 }, { "epoch": 3.839945248393658, "grad_norm": 13.407934188842773, "learning_rate": 0.0018547751635272886, "loss": 7.6206, "step": 942600 }, { "epoch": 3.840352626417039, "grad_norm": 11.850515365600586, "learning_rate": 0.0018542962261877106, "loss": 7.6271, "step": 942700 }, { "epoch": 3.8407600044404204, "grad_norm": 10.754595756530762, "learning_rate": 0.0018538173145739353, "loss": 7.605, "step": 942800 }, { "epoch": 3.841167382463802, "grad_norm": 6.437925338745117, "learning_rate": 0.001853338428704893, "loss": 7.6474, "step": 942900 }, { "epoch": 3.8415747604871835, "grad_norm": 14.888747215270996, "learning_rate": 0.0018528595685995176, "loss": 7.6063, "step": 943000 }, { "epoch": 3.8415747604871835, "eval_MaskedAccuracy": 0.5082340440738872, "eval_loss": 1.611876368522644, "eval_runtime": 187.8381, "eval_samples_per_second": 337.929, "eval_steps_per_second": 1.32, "step": 943000 }, { "epoch": 3.8419821385105646, "grad_norm": 4.881290435791016, "learning_rate": 0.0018523807342767406, "loss": 7.5982, "step": 943100 }, { "epoch": 3.842389516533946, "grad_norm": 5.884538173675537, "learning_rate": 0.0018519019257554944, "loss": 7.6097, "step": 943200 }, { "epoch": 3.8427968945573276, "grad_norm": 9.731324195861816, "learning_rate": 0.0018514231430547066, "loss": 7.6141, "step": 943300 }, { "epoch": 3.843204272580709, "grad_norm": 6.106199264526367, "learning_rate": 0.0018509443861933084, "loss": 7.6104, "step": 943400 }, { "epoch": 3.8436116506040907, "grad_norm": 5.420655250549316, "learning_rate": 0.0018504656551902273, "loss": 7.6209, "step": 943500 }, { "epoch": 3.8440190286274722, "grad_norm": 12.647811889648438, "learning_rate": 0.0018499869500643897, "loss": 7.6155, "step": 943600 }, { "epoch": 3.844426406650854, "grad_norm": 8.811073303222656, "learning_rate": 0.001849508270834721, "loss": 7.6172, "step": 943700 }, { "epoch": 3.8448337846742353, "grad_norm": 7.697712421417236, "learning_rate": 0.0018490296175201456, "loss": 7.615, "step": 943800 }, { "epoch": 3.8452411626976164, "grad_norm": 4.827374458312988, "learning_rate": 0.0018485509901395893, "loss": 7.5803, "step": 943900 }, { "epoch": 3.845648540720998, "grad_norm": 5.738444805145264, "learning_rate": 0.0018480723887119724, "loss": 7.6146, "step": 944000 }, { "epoch": 3.845648540720998, "eval_MaskedAccuracy": 0.5092260782136705, "eval_loss": 1.6010421514511108, "eval_runtime": 243.1795, "eval_samples_per_second": 261.025, "eval_steps_per_second": 1.02, "step": 944000 }, { "epoch": 3.8460559187443795, "grad_norm": 4.160953998565674, "learning_rate": 0.0018475938132562178, "loss": 7.5895, "step": 944100 }, { "epoch": 3.846463296767761, "grad_norm": 3.4514048099517822, "learning_rate": 0.001847115263791246, "loss": 7.6234, "step": 944200 }, { "epoch": 3.8468706747911425, "grad_norm": 3.569908618927002, "learning_rate": 0.001846636740335977, "loss": 7.602, "step": 944300 }, { "epoch": 3.8472780528145236, "grad_norm": 6.771123886108398, "learning_rate": 0.0018461582429093262, "loss": 7.6438, "step": 944400 }, { "epoch": 3.847685430837905, "grad_norm": 9.543065071105957, "learning_rate": 0.0018456797715302152, "loss": 7.5958, "step": 944500 }, { "epoch": 3.8480928088612867, "grad_norm": 5.088683605194092, "learning_rate": 0.001845201326217559, "loss": 7.6053, "step": 944600 }, { "epoch": 3.848500186884668, "grad_norm": 10.062211036682129, "learning_rate": 0.0018447229069902728, "loss": 7.5971, "step": 944700 }, { "epoch": 3.8489075649080497, "grad_norm": 7.096020221710205, "learning_rate": 0.0018442445138672728, "loss": 7.5848, "step": 944800 }, { "epoch": 3.8493149429314313, "grad_norm": 7.388985633850098, "learning_rate": 0.0018437661468674706, "loss": 7.6223, "step": 944900 }, { "epoch": 3.849722320954813, "grad_norm": 4.046175956726074, "learning_rate": 0.001843287806009778, "loss": 7.6378, "step": 945000 }, { "epoch": 3.849722320954813, "eval_MaskedAccuracy": 0.5084699091181181, "eval_loss": 1.6078386306762695, "eval_runtime": 164.7419, "eval_samples_per_second": 385.306, "eval_steps_per_second": 1.505, "step": 945000 }, { "epoch": 3.8501296989781943, "grad_norm": 9.305213928222656, "learning_rate": 0.0018428094913131078, "loss": 7.6307, "step": 945100 }, { "epoch": 3.8505370770015754, "grad_norm": 2.908653497695923, "learning_rate": 0.0018423312027963708, "loss": 7.5821, "step": 945200 }, { "epoch": 3.850944455024957, "grad_norm": 5.208691596984863, "learning_rate": 0.0018418529404784748, "loss": 7.6321, "step": 945300 }, { "epoch": 3.8513518330483385, "grad_norm": 4.687861919403076, "learning_rate": 0.0018413747043783295, "loss": 7.6145, "step": 945400 }, { "epoch": 3.85175921107172, "grad_norm": 14.521341323852539, "learning_rate": 0.001840896494514841, "loss": 7.6239, "step": 945500 }, { "epoch": 3.852166589095101, "grad_norm": 6.697803497314453, "learning_rate": 0.001840418310906916, "loss": 7.6066, "step": 945600 }, { "epoch": 3.8525739671184827, "grad_norm": 6.27771520614624, "learning_rate": 0.0018399401535734604, "loss": 7.5982, "step": 945700 }, { "epoch": 3.852981345141864, "grad_norm": 6.302993297576904, "learning_rate": 0.0018394620225333782, "loss": 7.6163, "step": 945800 }, { "epoch": 3.8533887231652457, "grad_norm": 3.5316030979156494, "learning_rate": 0.0018389839178055702, "loss": 7.6322, "step": 945900 }, { "epoch": 3.8537961011886273, "grad_norm": 6.291867733001709, "learning_rate": 0.0018385058394089402, "loss": 7.6303, "step": 946000 }, { "epoch": 3.8537961011886273, "eval_MaskedAccuracy": 0.5088446917423474, "eval_loss": 1.6043281555175781, "eval_runtime": 246.4298, "eval_samples_per_second": 257.582, "eval_steps_per_second": 1.006, "step": 946000 }, { "epoch": 3.854203479212009, "grad_norm": 13.41540813446045, "learning_rate": 0.001838027787362389, "loss": 7.6073, "step": 946100 }, { "epoch": 3.8546108572353903, "grad_norm": 3.8155667781829834, "learning_rate": 0.001837549761684816, "loss": 7.6011, "step": 946200 }, { "epoch": 3.855018235258772, "grad_norm": 5.04111385345459, "learning_rate": 0.0018370717623951212, "loss": 7.5932, "step": 946300 }, { "epoch": 3.855425613282153, "grad_norm": 9.936125755310059, "learning_rate": 0.001836593789512203, "loss": 7.6087, "step": 946400 }, { "epoch": 3.8558329913055345, "grad_norm": 9.701949119567871, "learning_rate": 0.0018361158430549548, "loss": 7.6082, "step": 946500 }, { "epoch": 3.856240369328916, "grad_norm": 6.736760139465332, "learning_rate": 0.0018356379230422763, "loss": 7.6148, "step": 946600 }, { "epoch": 3.8566477473522975, "grad_norm": 4.913790225982666, "learning_rate": 0.0018351600294930627, "loss": 7.5924, "step": 946700 }, { "epoch": 3.857055125375679, "grad_norm": 4.850794792175293, "learning_rate": 0.001834682162426204, "loss": 7.5985, "step": 946800 }, { "epoch": 3.85746250339906, "grad_norm": 3.34824800491333, "learning_rate": 0.0018342043218605947, "loss": 7.6191, "step": 946900 }, { "epoch": 3.8578698814224417, "grad_norm": 6.440149307250977, "learning_rate": 0.0018337265078151254, "loss": 7.6166, "step": 947000 }, { "epoch": 3.8578698814224417, "eval_MaskedAccuracy": 0.5092632863592347, "eval_loss": 1.6087101697921753, "eval_runtime": 231.7468, "eval_samples_per_second": 273.902, "eval_steps_per_second": 1.07, "step": 947000 }, { "epoch": 3.8582772594458232, "grad_norm": 4.203551769256592, "learning_rate": 0.0018332487203086883, "loss": 7.6093, "step": 947100 }, { "epoch": 3.8586846374692048, "grad_norm": 10.298797607421875, "learning_rate": 0.0018327709593601706, "loss": 7.5878, "step": 947200 }, { "epoch": 3.8590920154925863, "grad_norm": 9.265373229980469, "learning_rate": 0.0018322932249884632, "loss": 7.6051, "step": 947300 }, { "epoch": 3.859499393515968, "grad_norm": 5.258175849914551, "learning_rate": 0.0018318155172124529, "loss": 7.5989, "step": 947400 }, { "epoch": 3.8599067715393494, "grad_norm": 4.714539527893066, "learning_rate": 0.0018313378360510244, "loss": 7.6084, "step": 947500 }, { "epoch": 3.860314149562731, "grad_norm": 5.320790767669678, "learning_rate": 0.0018308601815230647, "loss": 7.5995, "step": 947600 }, { "epoch": 3.860721527586112, "grad_norm": 2.9611928462982178, "learning_rate": 0.001830382553647459, "loss": 7.6135, "step": 947700 }, { "epoch": 3.8611289056094935, "grad_norm": 6.9255828857421875, "learning_rate": 0.0018299049524430893, "loss": 7.6082, "step": 947800 }, { "epoch": 3.861536283632875, "grad_norm": 3.260333776473999, "learning_rate": 0.001829427377928837, "loss": 7.5901, "step": 947900 }, { "epoch": 3.8619436616562566, "grad_norm": 3.394360065460205, "learning_rate": 0.0018289498301235828, "loss": 7.6032, "step": 948000 }, { "epoch": 3.8619436616562566, "eval_MaskedAccuracy": 0.5090682639138114, "eval_loss": 1.602862000465393, "eval_runtime": 208.2255, "eval_samples_per_second": 304.843, "eval_steps_per_second": 1.191, "step": 948000 }, { "epoch": 3.8623510396796377, "grad_norm": 8.246703147888184, "learning_rate": 0.001828472309046208, "loss": 7.6268, "step": 948100 }, { "epoch": 3.862758417703019, "grad_norm": 6.750222206115723, "learning_rate": 0.0018279948147155913, "loss": 7.5911, "step": 948200 }, { "epoch": 3.8631657957264007, "grad_norm": 11.483348846435547, "learning_rate": 0.0018275173471506106, "loss": 7.6327, "step": 948300 }, { "epoch": 3.8635731737497823, "grad_norm": 8.000398635864258, "learning_rate": 0.0018270399063701423, "loss": 7.6494, "step": 948400 }, { "epoch": 3.863980551773164, "grad_norm": 4.537847995758057, "learning_rate": 0.0018265624923930628, "loss": 7.6297, "step": 948500 }, { "epoch": 3.8643879297965453, "grad_norm": 3.8453502655029297, "learning_rate": 0.0018260851052382477, "loss": 7.5948, "step": 948600 }, { "epoch": 3.864795307819927, "grad_norm": 4.933693885803223, "learning_rate": 0.0018256077449245681, "loss": 7.5849, "step": 948700 }, { "epoch": 3.8652026858433084, "grad_norm": 7.7320780754089355, "learning_rate": 0.0018251304114708976, "loss": 7.6149, "step": 948800 }, { "epoch": 3.8656100638666895, "grad_norm": 11.603869438171387, "learning_rate": 0.0018246531048961095, "loss": 7.5858, "step": 948900 }, { "epoch": 3.866017441890071, "grad_norm": 4.933516025543213, "learning_rate": 0.0018241758252190703, "loss": 7.6174, "step": 949000 }, { "epoch": 3.866017441890071, "eval_MaskedAccuracy": 0.5089156285212543, "eval_loss": 1.6060400009155273, "eval_runtime": 167.3093, "eval_samples_per_second": 379.393, "eval_steps_per_second": 1.482, "step": 949000 }, { "epoch": 3.8664248199134525, "grad_norm": 11.33521842956543, "learning_rate": 0.0018236985724586544, "loss": 7.6226, "step": 949100 }, { "epoch": 3.866832197936834, "grad_norm": 4.204806804656982, "learning_rate": 0.001823221346633727, "loss": 7.627, "step": 949200 }, { "epoch": 3.8672395759602156, "grad_norm": 4.625539302825928, "learning_rate": 0.0018227441477631562, "loss": 7.6229, "step": 949300 }, { "epoch": 3.8676469539835967, "grad_norm": 4.905984878540039, "learning_rate": 0.0018222669758658076, "loss": 7.6129, "step": 949400 }, { "epoch": 3.8680543320069782, "grad_norm": 13.30921459197998, "learning_rate": 0.0018217898309605485, "loss": 7.6132, "step": 949500 }, { "epoch": 3.8684617100303598, "grad_norm": 4.7313666343688965, "learning_rate": 0.0018213127130662413, "loss": 7.6394, "step": 949600 }, { "epoch": 3.8688690880537413, "grad_norm": 5.9433088302612305, "learning_rate": 0.0018208356222017474, "loss": 7.6041, "step": 949700 }, { "epoch": 3.869276466077123, "grad_norm": 8.721516609191895, "learning_rate": 0.0018203585583859312, "loss": 7.6002, "step": 949800 }, { "epoch": 3.8696838441005044, "grad_norm": 14.288046836853027, "learning_rate": 0.0018198815216376514, "loss": 7.6247, "step": 949900 }, { "epoch": 3.870091222123886, "grad_norm": 4.396093368530273, "learning_rate": 0.0018194045119757716, "loss": 7.6175, "step": 950000 }, { "epoch": 3.870091222123886, "eval_MaskedAccuracy": 0.5089442366642627, "eval_loss": 1.6116803884506226, "eval_runtime": 178.9291, "eval_samples_per_second": 354.755, "eval_steps_per_second": 1.386, "step": 950000 }, { "epoch": 3.8704986001472674, "grad_norm": 5.026941776275635, "learning_rate": 0.001818927529419148, "loss": 7.6204, "step": 950100 }, { "epoch": 3.8709059781706485, "grad_norm": 10.700847625732422, "learning_rate": 0.0018184505739866378, "loss": 7.6121, "step": 950200 }, { "epoch": 3.87131335619403, "grad_norm": 3.735995292663574, "learning_rate": 0.001817973645697099, "loss": 7.6079, "step": 950300 }, { "epoch": 3.8717207342174116, "grad_norm": 5.800289154052734, "learning_rate": 0.001817496744569386, "loss": 7.6376, "step": 950400 }, { "epoch": 3.872128112240793, "grad_norm": 5.358453750610352, "learning_rate": 0.0018170198706223535, "loss": 7.6084, "step": 950500 }, { "epoch": 3.872535490264174, "grad_norm": 3.3876280784606934, "learning_rate": 0.0018165430238748564, "loss": 7.5943, "step": 950600 }, { "epoch": 3.8729428682875557, "grad_norm": 5.961701393127441, "learning_rate": 0.0018160662043457461, "loss": 7.5772, "step": 950700 }, { "epoch": 3.8733502463109373, "grad_norm": 4.473485469818115, "learning_rate": 0.001815589412053871, "loss": 7.6057, "step": 950800 }, { "epoch": 3.873757624334319, "grad_norm": 4.792442798614502, "learning_rate": 0.0018151126470180847, "loss": 7.6274, "step": 950900 }, { "epoch": 3.8741650023577003, "grad_norm": 3.3934457302093506, "learning_rate": 0.0018146359092572346, "loss": 7.6034, "step": 951000 }, { "epoch": 3.8741650023577003, "eval_MaskedAccuracy": 0.5088081165592627, "eval_loss": 1.6002564430236816, "eval_runtime": 177.1429, "eval_samples_per_second": 358.332, "eval_steps_per_second": 1.4, "step": 951000 }, { "epoch": 3.874572380381082, "grad_norm": 7.944741249084473, "learning_rate": 0.0018141591987901704, "loss": 7.5975, "step": 951100 }, { "epoch": 3.8749797584044634, "grad_norm": 4.314601421356201, "learning_rate": 0.001813682515635739, "loss": 7.5904, "step": 951200 }, { "epoch": 3.875387136427845, "grad_norm": 5.166213035583496, "learning_rate": 0.0018132058598127844, "loss": 7.6441, "step": 951300 }, { "epoch": 3.875794514451226, "grad_norm": 13.966766357421875, "learning_rate": 0.0018127292313401516, "loss": 7.6071, "step": 951400 }, { "epoch": 3.8762018924746076, "grad_norm": 6.3037285804748535, "learning_rate": 0.0018122526302366841, "loss": 7.6163, "step": 951500 }, { "epoch": 3.876609270497989, "grad_norm": 8.044075012207031, "learning_rate": 0.0018117760565212254, "loss": 7.6163, "step": 951600 }, { "epoch": 3.8770166485213706, "grad_norm": 5.0798773765563965, "learning_rate": 0.0018112995102126184, "loss": 7.6001, "step": 951700 }, { "epoch": 3.877424026544752, "grad_norm": 4.548254489898682, "learning_rate": 0.0018108229913297, "loss": 7.6032, "step": 951800 }, { "epoch": 3.8778314045681332, "grad_norm": 6.899433612823486, "learning_rate": 0.0018103464998913126, "loss": 7.6263, "step": 951900 }, { "epoch": 3.8782387825915148, "grad_norm": 5.320175647735596, "learning_rate": 0.0018098700359162933, "loss": 7.6256, "step": 952000 }, { "epoch": 3.8782387825915148, "eval_MaskedAccuracy": 0.5090562569753042, "eval_loss": 1.6056095361709595, "eval_runtime": 163.5442, "eval_samples_per_second": 388.127, "eval_steps_per_second": 1.516, "step": 952000 }, { "epoch": 3.8786461606148963, "grad_norm": 4.4534592628479, "learning_rate": 0.0018093935994234792, "loss": 7.6215, "step": 952100 }, { "epoch": 3.879053538638278, "grad_norm": 3.465465545654297, "learning_rate": 0.0018089171904317083, "loss": 7.6213, "step": 952200 }, { "epoch": 3.8794609166616594, "grad_norm": 5.833391189575195, "learning_rate": 0.0018084408089598103, "loss": 7.6305, "step": 952300 }, { "epoch": 3.879868294685041, "grad_norm": 11.478015899658203, "learning_rate": 0.0018079644550266227, "loss": 7.6171, "step": 952400 }, { "epoch": 3.8802756727084224, "grad_norm": 8.241008758544922, "learning_rate": 0.00180748812865098, "loss": 7.6236, "step": 952500 }, { "epoch": 3.880683050731804, "grad_norm": 4.015230655670166, "learning_rate": 0.0018070118298517106, "loss": 7.6139, "step": 952600 }, { "epoch": 3.881090428755185, "grad_norm": 9.021292686462402, "learning_rate": 0.0018065355586476476, "loss": 7.6198, "step": 952700 }, { "epoch": 3.8814978067785666, "grad_norm": 4.2154107093811035, "learning_rate": 0.0018060593150576196, "loss": 7.6215, "step": 952800 }, { "epoch": 3.881905184801948, "grad_norm": 9.103936195373535, "learning_rate": 0.0018055830991004554, "loss": 7.5944, "step": 952900 }, { "epoch": 3.8823125628253297, "grad_norm": 3.8361706733703613, "learning_rate": 0.0018051069107949814, "loss": 7.6266, "step": 953000 }, { "epoch": 3.8823125628253297, "eval_MaskedAccuracy": 0.5090985913748143, "eval_loss": 1.6088173389434814, "eval_runtime": 171.0992, "eval_samples_per_second": 370.989, "eval_steps_per_second": 1.449, "step": 953000 }, { "epoch": 3.8827199408487107, "grad_norm": 5.715306758880615, "learning_rate": 0.0018046307501600268, "loss": 7.6275, "step": 953100 }, { "epoch": 3.8831273188720923, "grad_norm": 9.456602096557617, "learning_rate": 0.0018041546172144127, "loss": 7.6222, "step": 953200 }, { "epoch": 3.883534696895474, "grad_norm": 5.366934776306152, "learning_rate": 0.0018036785119769636, "loss": 7.6055, "step": 953300 }, { "epoch": 3.8839420749188553, "grad_norm": 8.507433891296387, "learning_rate": 0.0018032024344665062, "loss": 7.619, "step": 953400 }, { "epoch": 3.884349452942237, "grad_norm": 4.057890892028809, "learning_rate": 0.0018027263847018601, "loss": 7.6024, "step": 953500 }, { "epoch": 3.8847568309656184, "grad_norm": 4.30263614654541, "learning_rate": 0.0018022503627018458, "loss": 7.596, "step": 953600 }, { "epoch": 3.885164208989, "grad_norm": 6.657095909118652, "learning_rate": 0.001801774368485282, "loss": 7.6062, "step": 953700 }, { "epoch": 3.8855715870123815, "grad_norm": 7.9841718673706055, "learning_rate": 0.0018012984020709894, "loss": 7.6091, "step": 953800 }, { "epoch": 3.8859789650357626, "grad_norm": 6.192052841186523, "learning_rate": 0.0018008224634777819, "loss": 7.5954, "step": 953900 }, { "epoch": 3.886386343059144, "grad_norm": 4.977832317352295, "learning_rate": 0.0018003465527244812, "loss": 7.6245, "step": 954000 }, { "epoch": 3.886386343059144, "eval_MaskedAccuracy": 0.5091509910645146, "eval_loss": 1.6058160066604614, "eval_runtime": 242.3496, "eval_samples_per_second": 261.919, "eval_steps_per_second": 1.023, "step": 954000 }, { "epoch": 3.8867937210825256, "grad_norm": 4.110084056854248, "learning_rate": 0.0017998706698299, "loss": 7.631, "step": 954100 }, { "epoch": 3.887201099105907, "grad_norm": 8.557034492492676, "learning_rate": 0.0017993948148128517, "loss": 7.6177, "step": 954200 }, { "epoch": 3.8876084771292887, "grad_norm": 4.660292625427246, "learning_rate": 0.0017989189876921508, "loss": 7.6417, "step": 954300 }, { "epoch": 3.88801585515267, "grad_norm": 3.670198917388916, "learning_rate": 0.0017984431884866085, "loss": 7.6058, "step": 954400 }, { "epoch": 3.8884232331760513, "grad_norm": 6.274688243865967, "learning_rate": 0.0017979674172150357, "loss": 7.5926, "step": 954500 }, { "epoch": 3.888830611199433, "grad_norm": 4.719417572021484, "learning_rate": 0.0017974916738962435, "loss": 7.6303, "step": 954600 }, { "epoch": 3.8892379892228144, "grad_norm": 6.074313640594482, "learning_rate": 0.0017970159585490387, "loss": 7.5992, "step": 954700 }, { "epoch": 3.889645367246196, "grad_norm": 8.574957847595215, "learning_rate": 0.0017965402711922296, "loss": 7.6452, "step": 954800 }, { "epoch": 3.8900527452695774, "grad_norm": 6.722024440765381, "learning_rate": 0.0017960646118446234, "loss": 7.5882, "step": 954900 }, { "epoch": 3.890460123292959, "grad_norm": 6.899868488311768, "learning_rate": 0.0017955889805250235, "loss": 7.6136, "step": 955000 }, { "epoch": 3.890460123292959, "eval_MaskedAccuracy": 0.509182417232623, "eval_loss": 1.6039791107177734, "eval_runtime": 172.1183, "eval_samples_per_second": 368.793, "eval_steps_per_second": 1.441, "step": 955000 }, { "epoch": 3.8908675013163405, "grad_norm": 15.284417152404785, "learning_rate": 0.0017951133772522363, "loss": 7.6104, "step": 955100 }, { "epoch": 3.8912748793397216, "grad_norm": 6.407052993774414, "learning_rate": 0.0017946378020450644, "loss": 7.6112, "step": 955200 }, { "epoch": 3.891682257363103, "grad_norm": 9.482291221618652, "learning_rate": 0.0017941622549223083, "loss": 7.6071, "step": 955300 }, { "epoch": 3.8920896353864847, "grad_norm": 8.136209487915039, "learning_rate": 0.00179368673590277, "loss": 7.5947, "step": 955400 }, { "epoch": 3.892497013409866, "grad_norm": 4.7726593017578125, "learning_rate": 0.0017932112450052492, "loss": 7.5978, "step": 955500 }, { "epoch": 3.8929043914332473, "grad_norm": 8.94983196258545, "learning_rate": 0.0017927357822485454, "loss": 7.595, "step": 955600 }, { "epoch": 3.893311769456629, "grad_norm": 3.8978352546691895, "learning_rate": 0.0017922603476514564, "loss": 7.6174, "step": 955700 }, { "epoch": 3.8937191474800104, "grad_norm": 3.3412771224975586, "learning_rate": 0.001791784941232777, "loss": 7.5995, "step": 955800 }, { "epoch": 3.894126525503392, "grad_norm": 7.2563276290893555, "learning_rate": 0.001791309563011302, "loss": 7.6281, "step": 955900 }, { "epoch": 3.8945339035267734, "grad_norm": 5.581161975860596, "learning_rate": 0.0017908342130058292, "loss": 7.6131, "step": 956000 }, { "epoch": 3.8945339035267734, "eval_MaskedAccuracy": 0.5084608431798913, "eval_loss": 1.6025763750076294, "eval_runtime": 175.0895, "eval_samples_per_second": 362.535, "eval_steps_per_second": 1.416, "step": 956000 }, { "epoch": 3.894941281550155, "grad_norm": 13.122600555419922, "learning_rate": 0.0017903588912351505, "loss": 7.6163, "step": 956100 }, { "epoch": 3.8953486595735365, "grad_norm": 6.420519828796387, "learning_rate": 0.0017898835977180553, "loss": 7.6012, "step": 956200 }, { "epoch": 3.895756037596918, "grad_norm": 4.720472812652588, "learning_rate": 0.0017894083324733357, "loss": 7.5975, "step": 956300 }, { "epoch": 3.896163415620299, "grad_norm": 3.968144655227661, "learning_rate": 0.001788933095519783, "loss": 7.6479, "step": 956400 }, { "epoch": 3.8965707936436806, "grad_norm": 7.333112716674805, "learning_rate": 0.0017884578868761845, "loss": 7.617, "step": 956500 }, { "epoch": 3.896978171667062, "grad_norm": 6.139729976654053, "learning_rate": 0.0017879827065613268, "loss": 7.616, "step": 956600 }, { "epoch": 3.8973855496904437, "grad_norm": 5.550684452056885, "learning_rate": 0.0017875075545939997, "loss": 7.614, "step": 956700 }, { "epoch": 3.8977929277138252, "grad_norm": 5.037663459777832, "learning_rate": 0.0017870324309929861, "loss": 7.6058, "step": 956800 }, { "epoch": 3.8982003057372063, "grad_norm": 5.413069725036621, "learning_rate": 0.0017865573357770685, "loss": 7.6169, "step": 956900 }, { "epoch": 3.898607683760588, "grad_norm": 7.729787826538086, "learning_rate": 0.0017860822689650326, "loss": 7.6048, "step": 957000 }, { "epoch": 3.898607683760588, "eval_MaskedAccuracy": 0.5091071710516075, "eval_loss": 1.5998338460922241, "eval_runtime": 188.7024, "eval_samples_per_second": 336.382, "eval_steps_per_second": 1.314, "step": 957000 }, { "epoch": 3.8990150617839694, "grad_norm": 6.152901649475098, "learning_rate": 0.0017856072305756593, "loss": 7.582, "step": 957100 }, { "epoch": 3.899422439807351, "grad_norm": 3.584260940551758, "learning_rate": 0.001785132220627732, "loss": 7.6105, "step": 957200 }, { "epoch": 3.8998298178307325, "grad_norm": 9.127362251281738, "learning_rate": 0.001784657239140025, "loss": 7.6376, "step": 957300 }, { "epoch": 3.900237195854114, "grad_norm": 6.2226176261901855, "learning_rate": 0.0017841822861313214, "loss": 7.633, "step": 957400 }, { "epoch": 3.9006445738774955, "grad_norm": 10.394842147827148, "learning_rate": 0.0017837073616203969, "loss": 7.6098, "step": 957500 }, { "epoch": 3.9010519519008766, "grad_norm": 4.542253494262695, "learning_rate": 0.001783232465626029, "loss": 7.6327, "step": 957600 }, { "epoch": 3.901459329924258, "grad_norm": 3.889946937561035, "learning_rate": 0.001782757598166993, "loss": 7.6256, "step": 957700 }, { "epoch": 3.9018667079476397, "grad_norm": 4.671126365661621, "learning_rate": 0.0017822827592620604, "loss": 7.5918, "step": 957800 }, { "epoch": 3.902274085971021, "grad_norm": 10.369836807250977, "learning_rate": 0.001781807948930006, "loss": 7.6108, "step": 957900 }, { "epoch": 3.9026814639944027, "grad_norm": 5.292967319488525, "learning_rate": 0.0017813331671896024, "loss": 7.632, "step": 958000 }, { "epoch": 3.9026814639944027, "eval_MaskedAccuracy": 0.5089657313638909, "eval_loss": 1.600001573562622, "eval_runtime": 191.4429, "eval_samples_per_second": 331.566, "eval_steps_per_second": 1.295, "step": 958000 }, { "epoch": 3.903088842017784, "grad_norm": 12.011616706848145, "learning_rate": 0.0017808584140596186, "loss": 7.6033, "step": 958100 }, { "epoch": 3.9034962200411654, "grad_norm": 6.7674560546875, "learning_rate": 0.0017803836895588245, "loss": 7.608, "step": 958200 }, { "epoch": 3.903903598064547, "grad_norm": 3.2566893100738525, "learning_rate": 0.0017799089937059888, "loss": 7.5775, "step": 958300 }, { "epoch": 3.9043109760879284, "grad_norm": 8.980989456176758, "learning_rate": 0.0017794343265198785, "loss": 7.628, "step": 958400 }, { "epoch": 3.90471835411131, "grad_norm": 5.547982692718506, "learning_rate": 0.001778959688019261, "loss": 7.6237, "step": 958500 }, { "epoch": 3.9051257321346915, "grad_norm": 5.356801509857178, "learning_rate": 0.0017784850782228997, "loss": 7.6141, "step": 958600 }, { "epoch": 3.905533110158073, "grad_norm": 5.821535110473633, "learning_rate": 0.0017780104971495583, "loss": 7.6203, "step": 958700 }, { "epoch": 3.9059404881814546, "grad_norm": 3.865333080291748, "learning_rate": 0.001777535944818001, "loss": 7.5888, "step": 958800 }, { "epoch": 3.9063478662048357, "grad_norm": 4.598865985870361, "learning_rate": 0.001777061421246987, "loss": 7.6083, "step": 958900 }, { "epoch": 3.906755244228217, "grad_norm": 2.5107874870300293, "learning_rate": 0.0017765869264552787, "loss": 7.6133, "step": 959000 }, { "epoch": 3.906755244228217, "eval_MaskedAccuracy": 0.5089750421296063, "eval_loss": 1.6102250814437866, "eval_runtime": 180.9229, "eval_samples_per_second": 350.846, "eval_steps_per_second": 1.371, "step": 959000 }, { "epoch": 3.9071626222515987, "grad_norm": 6.061223030090332, "learning_rate": 0.0017761124604616337, "loss": 7.6094, "step": 959100 }, { "epoch": 3.9075700002749802, "grad_norm": 6.3825507164001465, "learning_rate": 0.001775638023284814, "loss": 7.617, "step": 959200 }, { "epoch": 3.907977378298362, "grad_norm": 4.168699741363525, "learning_rate": 0.0017751636149435756, "loss": 7.5893, "step": 959300 }, { "epoch": 3.908384756321743, "grad_norm": 6.144078731536865, "learning_rate": 0.001774689235456671, "loss": 7.614, "step": 959400 }, { "epoch": 3.9087921343451244, "grad_norm": 4.921387195587158, "learning_rate": 0.0017742148848428558, "loss": 7.5867, "step": 959500 }, { "epoch": 3.909199512368506, "grad_norm": 4.409626007080078, "learning_rate": 0.0017737405631208852, "loss": 7.6084, "step": 959600 }, { "epoch": 3.9096068903918875, "grad_norm": 11.31052303314209, "learning_rate": 0.0017732662703095119, "loss": 7.6115, "step": 959700 }, { "epoch": 3.910014268415269, "grad_norm": 7.96267032623291, "learning_rate": 0.0017727920064274866, "loss": 7.5897, "step": 959800 }, { "epoch": 3.9104216464386505, "grad_norm": 7.232409477233887, "learning_rate": 0.0017723177714935585, "loss": 7.615, "step": 959900 }, { "epoch": 3.910829024462032, "grad_norm": 8.711453437805176, "learning_rate": 0.0017718435655264782, "loss": 7.6072, "step": 960000 }, { "epoch": 3.910829024462032, "eval_MaskedAccuracy": 0.5096968113028026, "eval_loss": 1.5956754684448242, "eval_runtime": 186.8973, "eval_samples_per_second": 339.63, "eval_steps_per_second": 1.327, "step": 960000 }, { "epoch": 3.911236402485413, "grad_norm": 5.794650554656982, "learning_rate": 0.0017713693885449923, "loss": 7.6014, "step": 960100 }, { "epoch": 3.9116437805087947, "grad_norm": 6.808177947998047, "learning_rate": 0.0017708952405678495, "loss": 7.6244, "step": 960200 }, { "epoch": 3.912051158532176, "grad_norm": 6.023766994476318, "learning_rate": 0.0017704211216137936, "loss": 7.6023, "step": 960300 }, { "epoch": 3.9124585365555578, "grad_norm": 15.710735321044922, "learning_rate": 0.001769947031701568, "loss": 7.6043, "step": 960400 }, { "epoch": 3.9128659145789393, "grad_norm": 9.586337089538574, "learning_rate": 0.0017694729708499183, "loss": 7.6036, "step": 960500 }, { "epoch": 3.9132732926023204, "grad_norm": 6.30381441116333, "learning_rate": 0.0017689989390775856, "loss": 7.5745, "step": 960600 }, { "epoch": 3.913680670625702, "grad_norm": 6.227634906768799, "learning_rate": 0.001768524936403311, "loss": 7.6371, "step": 960700 }, { "epoch": 3.9140880486490834, "grad_norm": 8.263335227966309, "learning_rate": 0.001768050962845835, "loss": 7.5995, "step": 960800 }, { "epoch": 3.914495426672465, "grad_norm": 9.07831859588623, "learning_rate": 0.0017675770184238972, "loss": 7.6242, "step": 960900 }, { "epoch": 3.9149028046958465, "grad_norm": 7.0234293937683105, "learning_rate": 0.0017671031031562317, "loss": 7.6544, "step": 961000 }, { "epoch": 3.9149028046958465, "eval_MaskedAccuracy": 0.5089093694398151, "eval_loss": 1.6114834547042847, "eval_runtime": 184.1641, "eval_samples_per_second": 344.671, "eval_steps_per_second": 1.347, "step": 961000 }, { "epoch": 3.915310182719228, "grad_norm": 9.08312702178955, "learning_rate": 0.0017666292170615795, "loss": 7.6247, "step": 961100 }, { "epoch": 3.9157175607426096, "grad_norm": 4.557876110076904, "learning_rate": 0.0017661553601586717, "loss": 7.5911, "step": 961200 }, { "epoch": 3.916124938765991, "grad_norm": 7.523808479309082, "learning_rate": 0.0017656815324662454, "loss": 7.6428, "step": 961300 }, { "epoch": 3.916532316789372, "grad_norm": 6.667905330657959, "learning_rate": 0.001765207734003032, "loss": 7.6174, "step": 961400 }, { "epoch": 3.9169396948127537, "grad_norm": 8.559423446655273, "learning_rate": 0.0017647339647877628, "loss": 7.6077, "step": 961500 }, { "epoch": 3.9173470728361353, "grad_norm": 6.447027683258057, "learning_rate": 0.0017642602248391698, "loss": 7.6136, "step": 961600 }, { "epoch": 3.917754450859517, "grad_norm": 7.463019847869873, "learning_rate": 0.001763786514175982, "loss": 7.6081, "step": 961700 }, { "epoch": 3.9181618288828983, "grad_norm": 3.941450595855713, "learning_rate": 0.0017633128328169271, "loss": 7.5928, "step": 961800 }, { "epoch": 3.9185692069062794, "grad_norm": 10.279614448547363, "learning_rate": 0.0017628391807807317, "loss": 7.6013, "step": 961900 }, { "epoch": 3.918976584929661, "grad_norm": 5.762928009033203, "learning_rate": 0.0017623655580861245, "loss": 7.5891, "step": 962000 }, { "epoch": 3.918976584929661, "eval_MaskedAccuracy": 0.509734933997005, "eval_loss": 1.6062231063842773, "eval_runtime": 253.4031, "eval_samples_per_second": 250.494, "eval_steps_per_second": 0.979, "step": 962000 }, { "epoch": 3.9193839629530425, "grad_norm": 7.73829984664917, "learning_rate": 0.0017618919647518288, "loss": 7.6409, "step": 962100 }, { "epoch": 3.919791340976424, "grad_norm": 3.7185044288635254, "learning_rate": 0.001761418400796568, "loss": 7.6207, "step": 962200 }, { "epoch": 3.9201987189998055, "grad_norm": 9.785548210144043, "learning_rate": 0.0017609448662390655, "loss": 7.5845, "step": 962300 }, { "epoch": 3.920606097023187, "grad_norm": 6.399051189422607, "learning_rate": 0.0017604713610980382, "loss": 7.6083, "step": 962400 }, { "epoch": 3.9210134750465686, "grad_norm": 8.724244117736816, "learning_rate": 0.0017599978853922115, "loss": 7.6192, "step": 962500 }, { "epoch": 3.9214208530699497, "grad_norm": 7.021622180938721, "learning_rate": 0.0017595244391403057, "loss": 7.5972, "step": 962600 }, { "epoch": 3.9218282310933312, "grad_norm": 11.517878532409668, "learning_rate": 0.0017590510223610327, "loss": 7.5885, "step": 962700 }, { "epoch": 3.9222356091167128, "grad_norm": 8.714223861694336, "learning_rate": 0.0017585776350731155, "loss": 7.6014, "step": 962800 }, { "epoch": 3.9226429871400943, "grad_norm": 6.0792741775512695, "learning_rate": 0.0017581042772952648, "loss": 7.585, "step": 962900 }, { "epoch": 3.923050365163476, "grad_norm": 13.479080200195312, "learning_rate": 0.0017576309490461974, "loss": 7.606, "step": 963000 }, { "epoch": 3.923050365163476, "eval_MaskedAccuracy": 0.50965225417915, "eval_loss": 1.6007351875305176, "eval_runtime": 174.2466, "eval_samples_per_second": 364.288, "eval_steps_per_second": 1.423, "step": 963000 }, { "epoch": 3.923457743186857, "grad_norm": 10.573036193847656, "learning_rate": 0.0017571576503446235, "loss": 7.6, "step": 963100 }, { "epoch": 3.9238651212102384, "grad_norm": 3.684870719909668, "learning_rate": 0.001756684381209259, "loss": 7.6255, "step": 963200 }, { "epoch": 3.92427249923362, "grad_norm": 4.756135940551758, "learning_rate": 0.001756211141658813, "loss": 7.6018, "step": 963300 }, { "epoch": 3.9246798772570015, "grad_norm": 3.6143898963928223, "learning_rate": 0.0017557379317119962, "loss": 7.6029, "step": 963400 }, { "epoch": 3.925087255280383, "grad_norm": 19.82906723022461, "learning_rate": 0.0017552647513875167, "loss": 7.6159, "step": 963500 }, { "epoch": 3.9254946333037646, "grad_norm": 3.3934342861175537, "learning_rate": 0.0017547916007040806, "loss": 7.6193, "step": 963600 }, { "epoch": 3.925902011327146, "grad_norm": 6.459085941314697, "learning_rate": 0.001754318479680394, "loss": 7.604, "step": 963700 }, { "epoch": 3.9263093893505276, "grad_norm": 2.7584681510925293, "learning_rate": 0.0017538453883351635, "loss": 7.6087, "step": 963800 }, { "epoch": 3.9267167673739087, "grad_norm": 5.214748859405518, "learning_rate": 0.0017533723266870934, "loss": 7.6072, "step": 963900 }, { "epoch": 3.9271241453972903, "grad_norm": 9.482467651367188, "learning_rate": 0.0017528992947548842, "loss": 7.5804, "step": 964000 }, { "epoch": 3.9271241453972903, "eval_MaskedAccuracy": 0.5089183610110668, "eval_loss": 1.5995738506317139, "eval_runtime": 179.4685, "eval_samples_per_second": 353.689, "eval_steps_per_second": 1.382, "step": 964000 }, { "epoch": 3.927531523420672, "grad_norm": 8.670701026916504, "learning_rate": 0.001752426292557239, "loss": 7.6204, "step": 964100 }, { "epoch": 3.9279389014440533, "grad_norm": 4.509506702423096, "learning_rate": 0.001751953320112857, "loss": 7.5954, "step": 964200 }, { "epoch": 3.928346279467435, "grad_norm": 5.005879878997803, "learning_rate": 0.0017514803774404375, "loss": 7.6298, "step": 964300 }, { "epoch": 3.928753657490816, "grad_norm": 3.8375208377838135, "learning_rate": 0.001751007464558679, "loss": 7.6267, "step": 964400 }, { "epoch": 3.9291610355141975, "grad_norm": 5.750095844268799, "learning_rate": 0.0017505345814862777, "loss": 7.6051, "step": 964500 }, { "epoch": 3.929568413537579, "grad_norm": 14.052850723266602, "learning_rate": 0.00175006172824193, "loss": 7.5784, "step": 964600 }, { "epoch": 3.9299757915609606, "grad_norm": 4.827563762664795, "learning_rate": 0.0017495889048443297, "loss": 7.6119, "step": 964700 }, { "epoch": 3.930383169584342, "grad_norm": 4.114831447601318, "learning_rate": 0.0017491161113121691, "loss": 7.6101, "step": 964800 }, { "epoch": 3.9307905476077236, "grad_norm": 7.301882266998291, "learning_rate": 0.0017486433476641423, "loss": 7.6026, "step": 964900 }, { "epoch": 3.931197925631105, "grad_norm": 6.055305004119873, "learning_rate": 0.00174817061391894, "loss": 7.605, "step": 965000 }, { "epoch": 3.931197925631105, "eval_MaskedAccuracy": 0.5094075744219674, "eval_loss": 1.6030793190002441, "eval_runtime": 170.5433, "eval_samples_per_second": 372.199, "eval_steps_per_second": 1.454, "step": 965000 }, { "epoch": 3.9316053036544862, "grad_norm": 7.509965896606445, "learning_rate": 0.0017476979100952516, "loss": 7.5782, "step": 965100 }, { "epoch": 3.9320126816778678, "grad_norm": 11.511940956115723, "learning_rate": 0.0017472252362117642, "loss": 7.5963, "step": 965200 }, { "epoch": 3.9324200597012493, "grad_norm": 8.02153205871582, "learning_rate": 0.0017467525922871662, "loss": 7.5949, "step": 965300 }, { "epoch": 3.932827437724631, "grad_norm": 10.607674598693848, "learning_rate": 0.0017462799783401447, "loss": 7.6329, "step": 965400 }, { "epoch": 3.9332348157480124, "grad_norm": 7.6859331130981445, "learning_rate": 0.0017458073943893825, "loss": 7.6219, "step": 965500 }, { "epoch": 3.9336421937713935, "grad_norm": 6.8652520179748535, "learning_rate": 0.001745334840453566, "loss": 7.6236, "step": 965600 }, { "epoch": 3.934049571794775, "grad_norm": 6.446781158447266, "learning_rate": 0.0017448623165513748, "loss": 7.6278, "step": 965700 }, { "epoch": 3.9344569498181565, "grad_norm": 9.547246932983398, "learning_rate": 0.0017443898227014916, "loss": 7.6108, "step": 965800 }, { "epoch": 3.934864327841538, "grad_norm": 12.40895938873291, "learning_rate": 0.0017439173589225963, "loss": 7.5992, "step": 965900 }, { "epoch": 3.9352717058649196, "grad_norm": 13.574623107910156, "learning_rate": 0.001743444925233369, "loss": 7.6124, "step": 966000 }, { "epoch": 3.9352717058649196, "eval_MaskedAccuracy": 0.5092441180934023, "eval_loss": 1.6030031442642212, "eval_runtime": 218.0381, "eval_samples_per_second": 291.123, "eval_steps_per_second": 1.137, "step": 966000 }, { "epoch": 3.935679083888301, "grad_norm": 10.19545841217041, "learning_rate": 0.0017429725216524861, "loss": 7.6029, "step": 966100 }, { "epoch": 3.9360864619116827, "grad_norm": 8.474671363830566, "learning_rate": 0.0017425001481986258, "loss": 7.6071, "step": 966200 }, { "epoch": 3.936493839935064, "grad_norm": 5.1744513511657715, "learning_rate": 0.0017420278048904632, "loss": 7.636, "step": 966300 }, { "epoch": 3.9369012179584453, "grad_norm": 4.764209747314453, "learning_rate": 0.0017415554917466717, "loss": 7.6004, "step": 966400 }, { "epoch": 3.937308595981827, "grad_norm": 3.4906716346740723, "learning_rate": 0.0017410832087859248, "loss": 7.6217, "step": 966500 }, { "epoch": 3.9377159740052083, "grad_norm": 7.8018622398376465, "learning_rate": 0.0017406109560268955, "loss": 7.6183, "step": 966600 }, { "epoch": 3.93812335202859, "grad_norm": 8.805583000183105, "learning_rate": 0.0017401387334882506, "loss": 7.6126, "step": 966700 }, { "epoch": 3.9385307300519714, "grad_norm": 4.326712131500244, "learning_rate": 0.001739666541188665, "loss": 7.5709, "step": 966800 }, { "epoch": 3.9389381080753525, "grad_norm": 10.123628616333008, "learning_rate": 0.0017391943791468026, "loss": 7.6283, "step": 966900 }, { "epoch": 3.939345486098734, "grad_norm": 7.375999450683594, "learning_rate": 0.0017387222473813327, "loss": 7.6254, "step": 967000 }, { "epoch": 3.939345486098734, "eval_MaskedAccuracy": 0.509036903561673, "eval_loss": 1.612851858139038, "eval_runtime": 172.39, "eval_samples_per_second": 368.212, "eval_steps_per_second": 1.439, "step": 967000 }, { "epoch": 3.9397528641221156, "grad_norm": 9.184760093688965, "learning_rate": 0.00173825014591092, "loss": 7.6214, "step": 967100 }, { "epoch": 3.940160242145497, "grad_norm": 3.240121603012085, "learning_rate": 0.0017377780747542307, "loss": 7.6016, "step": 967200 }, { "epoch": 3.9405676201688786, "grad_norm": 12.256264686584473, "learning_rate": 0.0017373060339299265, "loss": 7.6188, "step": 967300 }, { "epoch": 3.94097499819226, "grad_norm": 3.1830883026123047, "learning_rate": 0.001736834023456671, "loss": 7.5812, "step": 967400 }, { "epoch": 3.9413823762156417, "grad_norm": 5.6326775550842285, "learning_rate": 0.001736362043353126, "loss": 7.6057, "step": 967500 }, { "epoch": 3.941789754239023, "grad_norm": 7.11724328994751, "learning_rate": 0.0017358900936379502, "loss": 7.6092, "step": 967600 }, { "epoch": 3.9421971322624043, "grad_norm": 9.236283302307129, "learning_rate": 0.0017354181743298033, "loss": 7.6022, "step": 967700 }, { "epoch": 3.942604510285786, "grad_norm": 10.890929222106934, "learning_rate": 0.001734946285447342, "loss": 7.6339, "step": 967800 }, { "epoch": 3.9430118883091674, "grad_norm": 4.577576160430908, "learning_rate": 0.0017344744270092205, "loss": 7.5962, "step": 967900 }, { "epoch": 3.943419266332549, "grad_norm": 4.31857967376709, "learning_rate": 0.0017340025990340954, "loss": 7.5695, "step": 968000 }, { "epoch": 3.943419266332549, "eval_MaskedAccuracy": 0.5087030499207391, "eval_loss": 1.6019302606582642, "eval_runtime": 194.1768, "eval_samples_per_second": 326.898, "eval_steps_per_second": 1.277, "step": 968000 }, { "epoch": 3.94382664435593, "grad_norm": 7.419013977050781, "learning_rate": 0.0017335308015406228, "loss": 7.5877, "step": 968100 }, { "epoch": 3.9442340223793115, "grad_norm": 7.132618427276611, "learning_rate": 0.0017330590345474527, "loss": 7.6275, "step": 968200 }, { "epoch": 3.944641400402693, "grad_norm": 5.5635294914245605, "learning_rate": 0.0017325872980732374, "loss": 7.609, "step": 968300 }, { "epoch": 3.9450487784260746, "grad_norm": 5.992742538452148, "learning_rate": 0.0017321155921366273, "loss": 7.5958, "step": 968400 }, { "epoch": 3.945456156449456, "grad_norm": 3.753831386566162, "learning_rate": 0.0017316439167562722, "loss": 7.6208, "step": 968500 }, { "epoch": 3.9458635344728377, "grad_norm": 4.255530834197998, "learning_rate": 0.0017311722719508204, "loss": 7.599, "step": 968600 }, { "epoch": 3.946270912496219, "grad_norm": 5.118152618408203, "learning_rate": 0.0017307006577389158, "loss": 7.6313, "step": 968700 }, { "epoch": 3.9466782905196007, "grad_norm": 6.278252601623535, "learning_rate": 0.001730229074139205, "loss": 7.5869, "step": 968800 }, { "epoch": 3.947085668542982, "grad_norm": 3.765784740447998, "learning_rate": 0.001729757521170333, "loss": 7.6278, "step": 968900 }, { "epoch": 3.9474930465663634, "grad_norm": 6.677095413208008, "learning_rate": 0.0017292859988509428, "loss": 7.6083, "step": 969000 }, { "epoch": 3.9474930465663634, "eval_MaskedAccuracy": 0.5089457616204685, "eval_loss": 1.6062957048416138, "eval_runtime": 244.5024, "eval_samples_per_second": 259.613, "eval_steps_per_second": 1.014, "step": 969000 }, { "epoch": 3.947900424589745, "grad_norm": 5.654714584350586, "learning_rate": 0.0017288145071996778, "loss": 7.5821, "step": 969100 }, { "epoch": 3.9483078026131264, "grad_norm": 6.486881732940674, "learning_rate": 0.0017283430462351763, "loss": 7.609, "step": 969200 }, { "epoch": 3.948715180636508, "grad_norm": 6.111940860748291, "learning_rate": 0.0017278716159760778, "loss": 7.6086, "step": 969300 }, { "epoch": 3.949122558659889, "grad_norm": 10.307491302490234, "learning_rate": 0.0017274002164410205, "loss": 7.6249, "step": 969400 }, { "epoch": 3.9495299366832706, "grad_norm": 6.614736557006836, "learning_rate": 0.0017269288476486414, "loss": 7.5924, "step": 969500 }, { "epoch": 3.949937314706652, "grad_norm": 9.90733814239502, "learning_rate": 0.0017264575096175763, "loss": 7.6071, "step": 969600 }, { "epoch": 3.9503446927300336, "grad_norm": 7.4483561515808105, "learning_rate": 0.001725986202366459, "loss": 7.5843, "step": 969700 }, { "epoch": 3.950752070753415, "grad_norm": 8.133569717407227, "learning_rate": 0.0017255149259139243, "loss": 7.6001, "step": 969800 }, { "epoch": 3.9511594487767967, "grad_norm": 3.9263856410980225, "learning_rate": 0.001725043680278603, "loss": 7.5881, "step": 969900 }, { "epoch": 3.9515668268001782, "grad_norm": 8.540437698364258, "learning_rate": 0.001724572465479127, "loss": 7.6145, "step": 970000 }, { "epoch": 3.9515668268001782, "eval_MaskedAccuracy": 0.5089144960412382, "eval_loss": 1.612705111503601, "eval_runtime": 171.0321, "eval_samples_per_second": 371.135, "eval_steps_per_second": 1.45, "step": 970000 }, { "epoch": 3.9519742048235593, "grad_norm": 5.326159477233887, "learning_rate": 0.001724101281534126, "loss": 7.5963, "step": 970100 }, { "epoch": 3.952381582846941, "grad_norm": 10.774977684020996, "learning_rate": 0.0017236301284622268, "loss": 7.5999, "step": 970200 }, { "epoch": 3.9527889608703224, "grad_norm": 11.163222312927246, "learning_rate": 0.0017231590062820577, "loss": 7.5947, "step": 970300 }, { "epoch": 3.953196338893704, "grad_norm": 8.657665252685547, "learning_rate": 0.001722687915012246, "loss": 7.6081, "step": 970400 }, { "epoch": 3.9536037169170855, "grad_norm": 8.744972229003906, "learning_rate": 0.0017222168546714152, "loss": 7.6163, "step": 970500 }, { "epoch": 3.9540110949404665, "grad_norm": 7.80633544921875, "learning_rate": 0.001721745825278188, "loss": 7.5912, "step": 970600 }, { "epoch": 3.954418472963848, "grad_norm": 9.006880760192871, "learning_rate": 0.0017212748268511874, "loss": 7.6229, "step": 970700 }, { "epoch": 3.9548258509872296, "grad_norm": 6.297878265380859, "learning_rate": 0.0017208038594090347, "loss": 7.6206, "step": 970800 }, { "epoch": 3.955233229010611, "grad_norm": 4.641481876373291, "learning_rate": 0.0017203329229703505, "loss": 7.5948, "step": 970900 }, { "epoch": 3.9556406070339927, "grad_norm": 2.9865880012512207, "learning_rate": 0.0017198620175537522, "loss": 7.6016, "step": 971000 }, { "epoch": 3.9556406070339927, "eval_MaskedAccuracy": 0.5096631115317253, "eval_loss": 1.6074703931808472, "eval_runtime": 167.0376, "eval_samples_per_second": 380.01, "eval_steps_per_second": 1.485, "step": 971000 }, { "epoch": 3.956047985057374, "grad_norm": 3.9123666286468506, "learning_rate": 0.001719391143177859, "loss": 7.5958, "step": 971100 }, { "epoch": 3.9564553630807557, "grad_norm": 4.388565540313721, "learning_rate": 0.0017189202998612813, "loss": 7.6196, "step": 971200 }, { "epoch": 3.9568627411041373, "grad_norm": 3.3493175506591797, "learning_rate": 0.0017184494876226403, "loss": 7.5778, "step": 971300 }, { "epoch": 3.9572701191275184, "grad_norm": 10.546339988708496, "learning_rate": 0.001717978706480547, "loss": 7.6032, "step": 971400 }, { "epoch": 3.9576774971509, "grad_norm": 5.917593479156494, "learning_rate": 0.0017175079564536165, "loss": 7.6366, "step": 971500 }, { "epoch": 3.9580848751742814, "grad_norm": 4.218117713928223, "learning_rate": 0.0017170372375604577, "loss": 7.598, "step": 971600 }, { "epoch": 3.958492253197663, "grad_norm": 4.911594390869141, "learning_rate": 0.001716566549819682, "loss": 7.6187, "step": 971700 }, { "epoch": 3.9588996312210445, "grad_norm": 13.799729347229004, "learning_rate": 0.001716095893249896, "loss": 7.5556, "step": 971800 }, { "epoch": 3.9593070092444256, "grad_norm": 7.282037734985352, "learning_rate": 0.0017156252678697103, "loss": 7.581, "step": 971900 }, { "epoch": 3.959714387267807, "grad_norm": 10.808502197265625, "learning_rate": 0.0017151546736977295, "loss": 7.6168, "step": 972000 }, { "epoch": 3.959714387267807, "eval_MaskedAccuracy": 0.5091707189279271, "eval_loss": 1.6053344011306763, "eval_runtime": 169.707, "eval_samples_per_second": 374.033, "eval_steps_per_second": 1.461, "step": 972000 }, { "epoch": 3.9601217652911886, "grad_norm": 4.173104763031006, "learning_rate": 0.0017146841107525577, "loss": 7.6016, "step": 972100 }, { "epoch": 3.96052914331457, "grad_norm": 7.0381622314453125, "learning_rate": 0.0017142135790527997, "loss": 7.6365, "step": 972200 }, { "epoch": 3.9609365213379517, "grad_norm": 7.141263961791992, "learning_rate": 0.0017137430786170601, "loss": 7.5984, "step": 972300 }, { "epoch": 3.9613438993613332, "grad_norm": 8.423711776733398, "learning_rate": 0.0017132726094639373, "loss": 7.6243, "step": 972400 }, { "epoch": 3.9617512773847148, "grad_norm": 6.234470367431641, "learning_rate": 0.0017128021716120326, "loss": 7.5877, "step": 972500 }, { "epoch": 3.962158655408096, "grad_norm": 7.703559875488281, "learning_rate": 0.001712331765079945, "loss": 7.6258, "step": 972600 }, { "epoch": 3.9625660334314774, "grad_norm": 8.55420207977295, "learning_rate": 0.0017118613898862722, "loss": 7.6121, "step": 972700 }, { "epoch": 3.962973411454859, "grad_norm": 3.7489073276519775, "learning_rate": 0.0017113910460496123, "loss": 7.6163, "step": 972800 }, { "epoch": 3.9633807894782405, "grad_norm": 4.63359260559082, "learning_rate": 0.001710920733588557, "loss": 7.6142, "step": 972900 }, { "epoch": 3.963788167501622, "grad_norm": 4.0801591873168945, "learning_rate": 0.0017104504525217038, "loss": 7.6015, "step": 973000 }, { "epoch": 3.963788167501622, "eval_MaskedAccuracy": 0.5098385944356569, "eval_loss": 1.6042712926864624, "eval_runtime": 179.3414, "eval_samples_per_second": 353.939, "eval_steps_per_second": 1.383, "step": 973000 }, { "epoch": 3.964195545525003, "grad_norm": 4.366600036621094, "learning_rate": 0.001709980202867641, "loss": 7.6032, "step": 973100 }, { "epoch": 3.9646029235483846, "grad_norm": 5.428745269775391, "learning_rate": 0.001709509984644964, "loss": 7.6191, "step": 973200 }, { "epoch": 3.965010301571766, "grad_norm": 5.178256511688232, "learning_rate": 0.0017090397978722625, "loss": 7.6357, "step": 973300 }, { "epoch": 3.9654176795951477, "grad_norm": 7.077785491943359, "learning_rate": 0.0017085696425681242, "loss": 7.6124, "step": 973400 }, { "epoch": 3.965825057618529, "grad_norm": 13.619942665100098, "learning_rate": 0.0017080995187511376, "loss": 7.6049, "step": 973500 }, { "epoch": 3.9662324356419107, "grad_norm": 8.637674331665039, "learning_rate": 0.001707629426439889, "loss": 7.6215, "step": 973600 }, { "epoch": 3.9666398136652923, "grad_norm": 10.719552040100098, "learning_rate": 0.0017071593656529647, "loss": 7.6161, "step": 973700 }, { "epoch": 3.967047191688674, "grad_norm": 5.537758827209473, "learning_rate": 0.0017066893364089464, "loss": 7.6187, "step": 973800 }, { "epoch": 3.967454569712055, "grad_norm": 5.392650604248047, "learning_rate": 0.0017062193387264188, "loss": 7.6237, "step": 973900 }, { "epoch": 3.9678619477354364, "grad_norm": 3.8435933589935303, "learning_rate": 0.0017057493726239635, "loss": 7.6246, "step": 974000 }, { "epoch": 3.9678619477354364, "eval_MaskedAccuracy": 0.5092166668473508, "eval_loss": 1.6011650562286377, "eval_runtime": 175.5725, "eval_samples_per_second": 361.537, "eval_steps_per_second": 1.413, "step": 974000 }, { "epoch": 3.968269325758818, "grad_norm": 9.764803886413574, "learning_rate": 0.001705279438120159, "loss": 7.6261, "step": 974100 }, { "epoch": 3.9686767037821995, "grad_norm": 4.899872303009033, "learning_rate": 0.0017048095352335858, "loss": 7.612, "step": 974200 }, { "epoch": 3.969084081805581, "grad_norm": 10.642428398132324, "learning_rate": 0.0017043396639828234, "loss": 7.6152, "step": 974300 }, { "epoch": 3.969491459828962, "grad_norm": 4.387692451477051, "learning_rate": 0.0017038698243864439, "loss": 7.5949, "step": 974400 }, { "epoch": 3.9698988378523437, "grad_norm": 9.260013580322266, "learning_rate": 0.0017034000164630257, "loss": 7.5844, "step": 974500 }, { "epoch": 3.970306215875725, "grad_norm": 6.511177062988281, "learning_rate": 0.0017029302402311421, "loss": 7.6172, "step": 974600 }, { "epoch": 3.9707135938991067, "grad_norm": 10.507186889648438, "learning_rate": 0.0017024604957093668, "loss": 7.6099, "step": 974700 }, { "epoch": 3.9711209719224883, "grad_norm": 4.583454608917236, "learning_rate": 0.0017019907829162696, "loss": 7.6258, "step": 974800 }, { "epoch": 3.97152834994587, "grad_norm": 5.074526786804199, "learning_rate": 0.0017015211018704221, "loss": 7.586, "step": 974900 }, { "epoch": 3.9719357279692513, "grad_norm": 6.329677581787109, "learning_rate": 0.001701051452590394, "loss": 7.5945, "step": 975000 }, { "epoch": 3.9719357279692513, "eval_MaskedAccuracy": 0.5089593009373351, "eval_loss": 1.614335060119629, "eval_runtime": 181.824, "eval_samples_per_second": 349.107, "eval_steps_per_second": 1.364, "step": 975000 }, { "epoch": 3.9723431059926324, "grad_norm": 3.5868403911590576, "learning_rate": 0.0017005818350947512, "loss": 7.623, "step": 975100 }, { "epoch": 3.972750484016014, "grad_norm": 5.082174301147461, "learning_rate": 0.0017001122494020608, "loss": 7.5911, "step": 975200 }, { "epoch": 3.9731578620393955, "grad_norm": 9.512438774108887, "learning_rate": 0.0016996426955308878, "loss": 7.6167, "step": 975300 }, { "epoch": 3.973565240062777, "grad_norm": 7.000560283660889, "learning_rate": 0.0016991731734997963, "loss": 7.6084, "step": 975400 }, { "epoch": 3.9739726180861585, "grad_norm": 6.193122863769531, "learning_rate": 0.0016987036833273482, "loss": 7.6148, "step": 975500 }, { "epoch": 3.9743799961095396, "grad_norm": 8.048707008361816, "learning_rate": 0.001698234225032108, "loss": 7.6017, "step": 975600 }, { "epoch": 3.974787374132921, "grad_norm": 8.468701362609863, "learning_rate": 0.0016977647986326306, "loss": 7.6221, "step": 975700 }, { "epoch": 3.9751947521563027, "grad_norm": 4.622277736663818, "learning_rate": 0.0016972954041474797, "loss": 7.5971, "step": 975800 }, { "epoch": 3.9756021301796842, "grad_norm": 9.617279052734375, "learning_rate": 0.0016968260415952117, "loss": 7.6124, "step": 975900 }, { "epoch": 3.9760095082030658, "grad_norm": 4.895694732666016, "learning_rate": 0.001696356710994383, "loss": 7.6079, "step": 976000 }, { "epoch": 3.9760095082030658, "eval_MaskedAccuracy": 0.50983445511661, "eval_loss": 1.5986188650131226, "eval_runtime": 159.5097, "eval_samples_per_second": 397.944, "eval_steps_per_second": 1.555, "step": 976000 }, { "epoch": 3.9764168862264473, "grad_norm": 6.036518096923828, "learning_rate": 0.0016958874123635483, "loss": 7.585, "step": 976100 }, { "epoch": 3.976824264249829, "grad_norm": 5.62017297744751, "learning_rate": 0.0016954181457212634, "loss": 7.5829, "step": 976200 }, { "epoch": 3.9772316422732104, "grad_norm": 4.487478256225586, "learning_rate": 0.001694948911086078, "loss": 7.5964, "step": 976300 }, { "epoch": 3.9776390202965914, "grad_norm": 4.295134544372559, "learning_rate": 0.0016944797084765442, "loss": 7.6068, "step": 976400 }, { "epoch": 3.978046398319973, "grad_norm": 13.148701667785645, "learning_rate": 0.0016940105379112125, "loss": 7.6207, "step": 976500 }, { "epoch": 3.9784537763433545, "grad_norm": 5.530330657958984, "learning_rate": 0.0016935413994086294, "loss": 7.6016, "step": 976600 }, { "epoch": 3.978861154366736, "grad_norm": 3.2301185131073, "learning_rate": 0.0016930722929873465, "loss": 7.5981, "step": 976700 }, { "epoch": 3.9792685323901176, "grad_norm": 4.73488187789917, "learning_rate": 0.001692603218665906, "loss": 7.5991, "step": 976800 }, { "epoch": 3.9796759104134987, "grad_norm": 7.956972122192383, "learning_rate": 0.0016921341764628577, "loss": 7.5862, "step": 976900 }, { "epoch": 3.98008328843688, "grad_norm": 3.1814346313476562, "learning_rate": 0.0016916651663967416, "loss": 7.6019, "step": 977000 }, { "epoch": 3.98008328843688, "eval_MaskedAccuracy": 0.509535113683961, "eval_loss": 1.6061532497406006, "eval_runtime": 200.0001, "eval_samples_per_second": 317.38, "eval_steps_per_second": 1.24, "step": 977000 }, { "epoch": 3.9804906664602617, "grad_norm": 9.073088645935059, "learning_rate": 0.0016911961884861035, "loss": 7.5911, "step": 977100 }, { "epoch": 3.9808980444836433, "grad_norm": 11.336462020874023, "learning_rate": 0.0016907272427494826, "loss": 7.5966, "step": 977200 }, { "epoch": 3.981305422507025, "grad_norm": 5.6354146003723145, "learning_rate": 0.001690258329205419, "loss": 7.5939, "step": 977300 }, { "epoch": 3.9817128005304063, "grad_norm": 4.563399791717529, "learning_rate": 0.0016897894478724495, "loss": 7.5959, "step": 977400 }, { "epoch": 3.982120178553788, "grad_norm": 9.256034851074219, "learning_rate": 0.0016893205987691133, "loss": 7.598, "step": 977500 }, { "epoch": 3.982527556577169, "grad_norm": 4.321191310882568, "learning_rate": 0.0016888517819139454, "loss": 7.5936, "step": 977600 }, { "epoch": 3.9829349346005505, "grad_norm": 15.723010063171387, "learning_rate": 0.001688382997325483, "loss": 7.5967, "step": 977700 }, { "epoch": 3.983342312623932, "grad_norm": 3.105146646499634, "learning_rate": 0.0016879142450222577, "loss": 7.6134, "step": 977800 }, { "epoch": 3.9837496906473135, "grad_norm": 10.083918571472168, "learning_rate": 0.0016874455250228044, "loss": 7.5799, "step": 977900 }, { "epoch": 3.984157068670695, "grad_norm": 3.9866275787353516, "learning_rate": 0.00168697683734565, "loss": 7.5977, "step": 978000 }, { "epoch": 3.984157068670695, "eval_MaskedAccuracy": 0.5096758100810419, "eval_loss": 1.6063538789749146, "eval_runtime": 232.7136, "eval_samples_per_second": 272.765, "eval_steps_per_second": 1.066, "step": 978000 }, { "epoch": 3.984564446694076, "grad_norm": 4.538233757019043, "learning_rate": 0.0016865081820093273, "loss": 7.6205, "step": 978100 }, { "epoch": 3.9849718247174577, "grad_norm": 15.1337251663208, "learning_rate": 0.0016860395590323645, "loss": 7.5709, "step": 978200 }, { "epoch": 3.9853792027408392, "grad_norm": 3.3948187828063965, "learning_rate": 0.0016855709684332866, "loss": 7.5849, "step": 978300 }, { "epoch": 3.9857865807642208, "grad_norm": 5.072014331817627, "learning_rate": 0.0016851024102306207, "loss": 7.6037, "step": 978400 }, { "epoch": 3.9861939587876023, "grad_norm": 3.318345785140991, "learning_rate": 0.0016846338844428927, "loss": 7.6003, "step": 978500 }, { "epoch": 3.986601336810984, "grad_norm": 4.902764320373535, "learning_rate": 0.0016841653910886253, "loss": 7.6191, "step": 978600 }, { "epoch": 3.9870087148343654, "grad_norm": 17.06964111328125, "learning_rate": 0.0016836969301863385, "loss": 7.6105, "step": 978700 }, { "epoch": 3.987416092857747, "grad_norm": 6.311084270477295, "learning_rate": 0.0016832285017545558, "loss": 7.6117, "step": 978800 }, { "epoch": 3.987823470881128, "grad_norm": 7.435804843902588, "learning_rate": 0.0016827601058117963, "loss": 7.5868, "step": 978900 }, { "epoch": 3.9882308489045095, "grad_norm": 5.136353969573975, "learning_rate": 0.0016822917423765765, "loss": 7.5817, "step": 979000 }, { "epoch": 3.9882308489045095, "eval_MaskedAccuracy": 0.5093313363367059, "eval_loss": 1.6005762815475464, "eval_runtime": 185.8371, "eval_samples_per_second": 341.568, "eval_steps_per_second": 1.335, "step": 979000 }, { "epoch": 3.988638226927891, "grad_norm": 5.9919843673706055, "learning_rate": 0.0016818234114674131, "loss": 7.6002, "step": 979100 }, { "epoch": 3.9890456049512726, "grad_norm": 7.936841011047363, "learning_rate": 0.0016813551131028238, "loss": 7.5694, "step": 979200 }, { "epoch": 3.989452982974654, "grad_norm": 7.31936502456665, "learning_rate": 0.0016808868473013221, "loss": 7.6011, "step": 979300 }, { "epoch": 3.989860360998035, "grad_norm": 3.525270938873291, "learning_rate": 0.0016804186140814228, "loss": 7.5873, "step": 979400 }, { "epoch": 3.9902677390214167, "grad_norm": 6.6307573318481445, "learning_rate": 0.001679950413461635, "loss": 7.5841, "step": 979500 }, { "epoch": 3.9906751170447983, "grad_norm": 5.909872531890869, "learning_rate": 0.0016794822454604698, "loss": 7.6352, "step": 979600 }, { "epoch": 3.99108249506818, "grad_norm": 4.36875581741333, "learning_rate": 0.001679014110096436, "loss": 7.5939, "step": 979700 }, { "epoch": 3.9914898730915613, "grad_norm": 7.524434566497803, "learning_rate": 0.0016785460073880415, "loss": 7.5861, "step": 979800 }, { "epoch": 3.991897251114943, "grad_norm": 14.55718994140625, "learning_rate": 0.0016780779373537955, "loss": 7.5725, "step": 979900 }, { "epoch": 3.9923046291383244, "grad_norm": 4.0364603996276855, "learning_rate": 0.0016776099000121991, "loss": 7.5801, "step": 980000 }, { "epoch": 3.9923046291383244, "eval_MaskedAccuracy": 0.5095230067761933, "eval_loss": 1.6059062480926514, "eval_runtime": 183.7045, "eval_samples_per_second": 345.533, "eval_steps_per_second": 1.35, "step": 980000 }, { "epoch": 3.9927120071617055, "grad_norm": 7.353693008422852, "learning_rate": 0.0016771418953817595, "loss": 7.6074, "step": 980100 }, { "epoch": 3.993119385185087, "grad_norm": 9.953048706054688, "learning_rate": 0.0016766739234809792, "loss": 7.6262, "step": 980200 }, { "epoch": 3.9935267632084686, "grad_norm": 9.833102226257324, "learning_rate": 0.0016762059843283582, "loss": 7.613, "step": 980300 }, { "epoch": 3.99393414123185, "grad_norm": 3.764328956604004, "learning_rate": 0.0016757380779423984, "loss": 7.6269, "step": 980400 }, { "epoch": 3.9943415192552316, "grad_norm": 11.513262748718262, "learning_rate": 0.0016752702043415966, "loss": 7.5752, "step": 980500 }, { "epoch": 3.9947488972786127, "grad_norm": 6.5440850257873535, "learning_rate": 0.0016748023635444504, "loss": 7.6159, "step": 980600 }, { "epoch": 3.9951562753019942, "grad_norm": 8.95826530456543, "learning_rate": 0.0016743345555694574, "loss": 7.6012, "step": 980700 }, { "epoch": 3.9955636533253758, "grad_norm": 5.079588890075684, "learning_rate": 0.0016738667804351115, "loss": 7.5855, "step": 980800 }, { "epoch": 3.9959710313487573, "grad_norm": 2.878133535385132, "learning_rate": 0.0016733990381599072, "loss": 7.5802, "step": 980900 }, { "epoch": 3.996378409372139, "grad_norm": 3.4497833251953125, "learning_rate": 0.0016729313287623354, "loss": 7.6233, "step": 981000 }, { "epoch": 3.996378409372139, "eval_MaskedAccuracy": 0.5100188147294573, "eval_loss": 1.6072598695755005, "eval_runtime": 202.7131, "eval_samples_per_second": 313.132, "eval_steps_per_second": 1.223, "step": 981000 }, { "epoch": 3.9967857873955204, "grad_norm": 5.88665771484375, "learning_rate": 0.0016724636522608906, "loss": 7.621, "step": 981100 }, { "epoch": 3.997193165418902, "grad_norm": 5.950596332550049, "learning_rate": 0.0016719960086740602, "loss": 7.6084, "step": 981200 }, { "epoch": 3.9976005434422834, "grad_norm": 4.331485271453857, "learning_rate": 0.0016715283980203329, "loss": 7.6046, "step": 981300 }, { "epoch": 3.9980079214656645, "grad_norm": 2.948194742202759, "learning_rate": 0.0016710608203181947, "loss": 7.5938, "step": 981400 }, { "epoch": 3.998415299489046, "grad_norm": 14.660123825073242, "learning_rate": 0.0016705932755861326, "loss": 7.5907, "step": 981500 }, { "epoch": 3.9988226775124276, "grad_norm": 7.557115077972412, "learning_rate": 0.001670125763842631, "loss": 7.6077, "step": 981600 }, { "epoch": 3.999230055535809, "grad_norm": 6.886069297790527, "learning_rate": 0.0016696582851061742, "loss": 7.6244, "step": 981700 }, { "epoch": 3.9996374335591907, "grad_norm": 10.294854164123535, "learning_rate": 0.001669190839395242, "loss": 7.6077, "step": 981800 }, { "epoch": 4.000044811582572, "grad_norm": 3.667914390563965, "learning_rate": 0.0016687234267283172, "loss": 7.603, "step": 981900 }, { "epoch": 4.000452189605953, "grad_norm": 3.2173373699188232, "learning_rate": 0.001668256047123878, "loss": 7.6231, "step": 982000 }, { "epoch": 4.000452189605953, "eval_MaskedAccuracy": 0.5094959772927058, "eval_loss": 1.6051380634307861, "eval_runtime": 148.8675, "eval_samples_per_second": 426.393, "eval_steps_per_second": 1.666, "step": 982000 }, { "epoch": 4.000859567629335, "grad_norm": 5.491061687469482, "learning_rate": 0.0016677887006004016, "loss": 7.6002, "step": 982100 }, { "epoch": 4.001266945652716, "grad_norm": 12.87983226776123, "learning_rate": 0.0016673213871763659, "loss": 7.6504, "step": 982200 }, { "epoch": 4.001674323676098, "grad_norm": 7.052155017852783, "learning_rate": 0.0016668541068702472, "loss": 7.6467, "step": 982300 }, { "epoch": 4.002081701699479, "grad_norm": 2.9802677631378174, "learning_rate": 0.0016663868597005167, "loss": 7.6518, "step": 982400 }, { "epoch": 4.002489079722861, "grad_norm": 8.544371604919434, "learning_rate": 0.0016659196456856483, "loss": 7.6359, "step": 982500 }, { "epoch": 4.0028964577462425, "grad_norm": 9.191143035888672, "learning_rate": 0.0016654524648441146, "loss": 7.6313, "step": 982600 }, { "epoch": 4.003303835769624, "grad_norm": 3.2784583568573, "learning_rate": 0.0016649853171943865, "loss": 7.6256, "step": 982700 }, { "epoch": 4.0037112137930055, "grad_norm": 3.854095935821533, "learning_rate": 0.0016645182027549292, "loss": 7.5983, "step": 982800 }, { "epoch": 4.004118591816386, "grad_norm": 7.503828048706055, "learning_rate": 0.0016640511215442137, "loss": 7.6112, "step": 982900 }, { "epoch": 4.004525969839768, "grad_norm": 4.859063625335693, "learning_rate": 0.0016635840735807047, "loss": 7.5607, "step": 983000 }, { "epoch": 4.004525969839768, "eval_MaskedAccuracy": 0.509650864568538, "eval_loss": 1.601935863494873, "eval_runtime": 149.9558, "eval_samples_per_second": 423.298, "eval_steps_per_second": 1.654, "step": 983000 }, { "epoch": 4.004933347863149, "grad_norm": 7.183450698852539, "learning_rate": 0.0016631170588828686, "loss": 7.5983, "step": 983100 }, { "epoch": 4.005340725886531, "grad_norm": 4.626616477966309, "learning_rate": 0.0016626500774691677, "loss": 7.6083, "step": 983200 }, { "epoch": 4.005748103909912, "grad_norm": 7.315069198608398, "learning_rate": 0.001662183129358065, "loss": 7.6087, "step": 983300 }, { "epoch": 4.006155481933294, "grad_norm": 4.400745391845703, "learning_rate": 0.0016617162145680213, "loss": 7.6365, "step": 983400 }, { "epoch": 4.006562859956675, "grad_norm": 6.4837775230407715, "learning_rate": 0.001661249333117496, "loss": 7.6326, "step": 983500 }, { "epoch": 4.006970237980057, "grad_norm": 5.988894462585449, "learning_rate": 0.0016607824850249481, "loss": 7.5876, "step": 983600 }, { "epoch": 4.0073776160034384, "grad_norm": 4.420219421386719, "learning_rate": 0.0016603156703088318, "loss": 7.6013, "step": 983700 }, { "epoch": 4.00778499402682, "grad_norm": 8.457923889160156, "learning_rate": 0.0016598488889876047, "loss": 7.6053, "step": 983800 }, { "epoch": 4.0081923720502015, "grad_norm": 9.585550308227539, "learning_rate": 0.0016593821410797213, "loss": 7.6126, "step": 983900 }, { "epoch": 4.008599750073583, "grad_norm": 5.95546293258667, "learning_rate": 0.001658915426603634, "loss": 7.6169, "step": 984000 }, { "epoch": 4.008599750073583, "eval_MaskedAccuracy": 0.5094694198042954, "eval_loss": 1.61019766330719, "eval_runtime": 149.01, "eval_samples_per_second": 425.985, "eval_steps_per_second": 1.664, "step": 984000 }, { "epoch": 4.009007128096964, "grad_norm": 5.147625923156738, "learning_rate": 0.001658448745577795, "loss": 7.6507, "step": 984100 }, { "epoch": 4.009414506120345, "grad_norm": 3.7334346771240234, "learning_rate": 0.001657982098020656, "loss": 7.6011, "step": 984200 }, { "epoch": 4.009821884143727, "grad_norm": 2.3425774574279785, "learning_rate": 0.001657515483950663, "loss": 7.6154, "step": 984300 }, { "epoch": 4.010229262167108, "grad_norm": 7.669173717498779, "learning_rate": 0.0016570489033862663, "loss": 7.5999, "step": 984400 }, { "epoch": 4.01063664019049, "grad_norm": 5.005259037017822, "learning_rate": 0.001656582356345912, "loss": 7.6213, "step": 984500 }, { "epoch": 4.011044018213871, "grad_norm": 3.5709750652313232, "learning_rate": 0.0016561158428480445, "loss": 7.6224, "step": 984600 }, { "epoch": 4.011451396237253, "grad_norm": 3.1985106468200684, "learning_rate": 0.0016556493629111056, "loss": 7.6002, "step": 984700 }, { "epoch": 4.011858774260634, "grad_norm": 4.026301383972168, "learning_rate": 0.001655182916553543, "loss": 7.6022, "step": 984800 }, { "epoch": 4.012266152284016, "grad_norm": 7.075306415557861, "learning_rate": 0.0016547165037937945, "loss": 7.6124, "step": 984900 }, { "epoch": 4.0126735303073975, "grad_norm": 4.863493919372559, "learning_rate": 0.0016542501246503004, "loss": 7.5892, "step": 985000 }, { "epoch": 4.0126735303073975, "eval_MaskedAccuracy": 0.5098572916061271, "eval_loss": 1.6003751754760742, "eval_runtime": 149.1404, "eval_samples_per_second": 425.612, "eval_steps_per_second": 1.663, "step": 985000 }, { "epoch": 4.013080908330779, "grad_norm": 5.730146408081055, "learning_rate": 0.0016537837791414979, "loss": 7.5818, "step": 985100 }, { "epoch": 4.0134882863541606, "grad_norm": 8.001017570495605, "learning_rate": 0.0016533174672858262, "loss": 7.5501, "step": 985200 }, { "epoch": 4.013895664377542, "grad_norm": 5.380482196807861, "learning_rate": 0.0016528511891017196, "loss": 7.6155, "step": 985300 }, { "epoch": 4.014303042400923, "grad_norm": 7.937867641448975, "learning_rate": 0.001652384944607614, "loss": 7.597, "step": 985400 }, { "epoch": 4.014710420424304, "grad_norm": 6.902302265167236, "learning_rate": 0.0016519187338219412, "loss": 7.6014, "step": 985500 }, { "epoch": 4.015117798447686, "grad_norm": 4.475093364715576, "learning_rate": 0.0016514525567631324, "loss": 7.6247, "step": 985600 }, { "epoch": 4.015525176471067, "grad_norm": 7.708363056182861, "learning_rate": 0.0016509864134496201, "loss": 7.5872, "step": 985700 }, { "epoch": 4.015932554494449, "grad_norm": 9.547569274902344, "learning_rate": 0.0016505203038998325, "loss": 7.6135, "step": 985800 }, { "epoch": 4.01633993251783, "grad_norm": 6.526007175445557, "learning_rate": 0.0016500542281321974, "loss": 7.6144, "step": 985900 }, { "epoch": 4.016747310541212, "grad_norm": 3.0797715187072754, "learning_rate": 0.0016495881861651417, "loss": 7.618, "step": 986000 }, { "epoch": 4.016747310541212, "eval_MaskedAccuracy": 0.5102666310395091, "eval_loss": 1.6040263175964355, "eval_runtime": 150.5843, "eval_samples_per_second": 421.531, "eval_steps_per_second": 1.647, "step": 986000 }, { "epoch": 4.0171546885645935, "grad_norm": 4.383269786834717, "learning_rate": 0.0016491221780170894, "loss": 7.6177, "step": 986100 }, { "epoch": 4.017562066587975, "grad_norm": 4.634573459625244, "learning_rate": 0.0016486562037064652, "loss": 7.608, "step": 986200 }, { "epoch": 4.0179694446113565, "grad_norm": 6.890470027923584, "learning_rate": 0.0016481902632516927, "loss": 7.6174, "step": 986300 }, { "epoch": 4.018376822634738, "grad_norm": 4.450423240661621, "learning_rate": 0.001647724356671189, "loss": 7.6121, "step": 986400 }, { "epoch": 4.01878420065812, "grad_norm": 9.86596393585205, "learning_rate": 0.0016472584839833787, "loss": 7.5766, "step": 986500 }, { "epoch": 4.0191915786815, "grad_norm": 5.9871134757995605, "learning_rate": 0.0016467926452066763, "loss": 7.5972, "step": 986600 }, { "epoch": 4.019598956704882, "grad_norm": 5.728337287902832, "learning_rate": 0.0016463268403595012, "loss": 7.6381, "step": 986700 }, { "epoch": 4.020006334728263, "grad_norm": 5.415317058563232, "learning_rate": 0.0016458610694602702, "loss": 7.6249, "step": 986800 }, { "epoch": 4.020413712751645, "grad_norm": 9.332226753234863, "learning_rate": 0.001645395332527394, "loss": 7.6199, "step": 986900 }, { "epoch": 4.020821090775026, "grad_norm": 7.070971965789795, "learning_rate": 0.0016449296295792878, "loss": 7.6067, "step": 987000 }, { "epoch": 4.020821090775026, "eval_MaskedAccuracy": 0.5097597074861391, "eval_loss": 1.6057909727096558, "eval_runtime": 151.2127, "eval_samples_per_second": 419.78, "eval_steps_per_second": 1.64, "step": 987000 }, { "epoch": 4.021228468798408, "grad_norm": 9.921390533447266, "learning_rate": 0.0016444639606343634, "loss": 7.5979, "step": 987100 }, { "epoch": 4.021635846821789, "grad_norm": 6.192162990570068, "learning_rate": 0.0016439983257110322, "loss": 7.5848, "step": 987200 }, { "epoch": 4.022043224845171, "grad_norm": 7.434414386749268, "learning_rate": 0.001643532724827702, "loss": 7.6073, "step": 987300 }, { "epoch": 4.0224506028685525, "grad_norm": 4.407102584838867, "learning_rate": 0.0016430671580027806, "loss": 7.6069, "step": 987400 }, { "epoch": 4.022857980891934, "grad_norm": 5.222054958343506, "learning_rate": 0.0016426016252546745, "loss": 7.6095, "step": 987500 }, { "epoch": 4.023265358915316, "grad_norm": 10.153409004211426, "learning_rate": 0.0016421361266017879, "loss": 7.6373, "step": 987600 }, { "epoch": 4.023672736938697, "grad_norm": 3.4026260375976562, "learning_rate": 0.001641670662062526, "loss": 7.6217, "step": 987700 }, { "epoch": 4.024080114962079, "grad_norm": 4.457971572875977, "learning_rate": 0.0016412052316552898, "loss": 7.6221, "step": 987800 }, { "epoch": 4.024487492985459, "grad_norm": 3.766571521759033, "learning_rate": 0.0016407398353984804, "loss": 7.6067, "step": 987900 }, { "epoch": 4.024894871008841, "grad_norm": 5.549269676208496, "learning_rate": 0.0016402744733104982, "loss": 7.6168, "step": 988000 }, { "epoch": 4.024894871008841, "eval_MaskedAccuracy": 0.5099150393420648, "eval_loss": 1.5979849100112915, "eval_runtime": 149.1503, "eval_samples_per_second": 425.584, "eval_steps_per_second": 1.663, "step": 988000 }, { "epoch": 4.025302249032222, "grad_norm": 13.185425758361816, "learning_rate": 0.001639809145409741, "loss": 7.6263, "step": 988100 }, { "epoch": 4.025709627055604, "grad_norm": 6.519994258880615, "learning_rate": 0.0016393438517146067, "loss": 7.5787, "step": 988200 }, { "epoch": 4.026117005078985, "grad_norm": 3.994704008102417, "learning_rate": 0.0016388785922434874, "loss": 7.6051, "step": 988300 }, { "epoch": 4.026524383102367, "grad_norm": 9.871971130371094, "learning_rate": 0.0016384133670147814, "loss": 7.5802, "step": 988400 }, { "epoch": 4.0269317611257485, "grad_norm": 6.690221309661865, "learning_rate": 0.0016379481760468797, "loss": 7.6185, "step": 988500 }, { "epoch": 4.02733913914913, "grad_norm": 9.233526229858398, "learning_rate": 0.0016374830193581717, "loss": 7.5756, "step": 988600 }, { "epoch": 4.0277465171725115, "grad_norm": 4.646430492401123, "learning_rate": 0.0016370178969670529, "loss": 7.5945, "step": 988700 }, { "epoch": 4.028153895195893, "grad_norm": 9.425500869750977, "learning_rate": 0.0016365528088919086, "loss": 7.6269, "step": 988800 }, { "epoch": 4.028561273219275, "grad_norm": 6.34346866607666, "learning_rate": 0.0016360877551511253, "loss": 7.6043, "step": 988900 }, { "epoch": 4.028968651242656, "grad_norm": 3.5296967029571533, "learning_rate": 0.0016356227357630902, "loss": 7.6116, "step": 989000 }, { "epoch": 4.028968651242656, "eval_MaskedAccuracy": 0.5102494184687643, "eval_loss": 1.5941894054412842, "eval_runtime": 152.0137, "eval_samples_per_second": 417.568, "eval_steps_per_second": 1.631, "step": 989000 }, { "epoch": 4.029376029266037, "grad_norm": 8.836751937866211, "learning_rate": 0.0016351577507461887, "loss": 7.6095, "step": 989100 }, { "epoch": 4.029783407289418, "grad_norm": 10.11208438873291, "learning_rate": 0.0016346928001188044, "loss": 7.6392, "step": 989200 }, { "epoch": 4.0301907853128, "grad_norm": 6.892326354980469, "learning_rate": 0.001634227883899316, "loss": 7.6189, "step": 989300 }, { "epoch": 4.030598163336181, "grad_norm": 5.5347676277160645, "learning_rate": 0.0016337630021061084, "loss": 7.5896, "step": 989400 }, { "epoch": 4.031005541359563, "grad_norm": 3.7020421028137207, "learning_rate": 0.0016332981547575577, "loss": 7.6069, "step": 989500 }, { "epoch": 4.031412919382944, "grad_norm": 11.612074851989746, "learning_rate": 0.0016328333418720417, "loss": 7.61, "step": 989600 }, { "epoch": 4.031820297406326, "grad_norm": 10.339410781860352, "learning_rate": 0.0016323685634679404, "loss": 7.5975, "step": 989700 }, { "epoch": 4.0322276754297075, "grad_norm": 3.8707027435302734, "learning_rate": 0.0016319038195636267, "loss": 7.6044, "step": 989800 }, { "epoch": 4.032635053453089, "grad_norm": 9.976296424865723, "learning_rate": 0.0016314391101774752, "loss": 7.6275, "step": 989900 }, { "epoch": 4.033042431476471, "grad_norm": 4.477710723876953, "learning_rate": 0.0016309744353278568, "loss": 7.6148, "step": 990000 }, { "epoch": 4.033042431476471, "eval_MaskedAccuracy": 0.5096885277811427, "eval_loss": 1.6079641580581665, "eval_runtime": 152.6634, "eval_samples_per_second": 415.791, "eval_steps_per_second": 1.624, "step": 990000 }, { "epoch": 4.033449809499852, "grad_norm": 5.303618907928467, "learning_rate": 0.0016305097950331442, "loss": 7.6572, "step": 990100 }, { "epoch": 4.033857187523234, "grad_norm": 5.709973335266113, "learning_rate": 0.0016300451893117066, "loss": 7.5981, "step": 990200 }, { "epoch": 4.034264565546615, "grad_norm": 7.839023590087891, "learning_rate": 0.0016295806181819128, "loss": 7.6312, "step": 990300 }, { "epoch": 4.034671943569996, "grad_norm": 9.670504570007324, "learning_rate": 0.0016291160816621312, "loss": 7.6089, "step": 990400 }, { "epoch": 4.035079321593377, "grad_norm": 4.9290242195129395, "learning_rate": 0.0016286515797707215, "loss": 7.5979, "step": 990500 }, { "epoch": 4.035486699616759, "grad_norm": 6.394056797027588, "learning_rate": 0.0016281871125260526, "loss": 7.6154, "step": 990600 }, { "epoch": 4.03589407764014, "grad_norm": 3.729290723800659, "learning_rate": 0.0016277226799464872, "loss": 7.5927, "step": 990700 }, { "epoch": 4.036301455663522, "grad_norm": 3.4127912521362305, "learning_rate": 0.0016272582820503864, "loss": 7.6005, "step": 990800 }, { "epoch": 4.0367088336869035, "grad_norm": 5.75628137588501, "learning_rate": 0.00162679391885611, "loss": 7.6042, "step": 990900 }, { "epoch": 4.037116211710285, "grad_norm": 3.784318447113037, "learning_rate": 0.0016263295903820177, "loss": 7.6334, "step": 991000 }, { "epoch": 4.037116211710285, "eval_MaskedAccuracy": 0.5097704659043, "eval_loss": 1.6059882640838623, "eval_runtime": 151.5063, "eval_samples_per_second": 418.966, "eval_steps_per_second": 1.637, "step": 991000 }, { "epoch": 4.0375235897336665, "grad_norm": 5.234433650970459, "learning_rate": 0.001625865296646467, "loss": 7.5861, "step": 991100 }, { "epoch": 4.037930967757048, "grad_norm": 8.855758666992188, "learning_rate": 0.0016254010376678116, "loss": 7.6207, "step": 991200 }, { "epoch": 4.03833834578043, "grad_norm": 4.37705135345459, "learning_rate": 0.0016249368134644093, "loss": 7.6377, "step": 991300 }, { "epoch": 4.038745723803811, "grad_norm": 5.495611190795898, "learning_rate": 0.0016244726240546114, "loss": 7.6066, "step": 991400 }, { "epoch": 4.039153101827193, "grad_norm": 6.446033477783203, "learning_rate": 0.0016240084694567692, "loss": 7.6175, "step": 991500 }, { "epoch": 4.039560479850573, "grad_norm": 3.4947476387023926, "learning_rate": 0.0016235443496892326, "loss": 7.6115, "step": 991600 }, { "epoch": 4.039967857873955, "grad_norm": 6.529889106750488, "learning_rate": 0.001623080264770353, "loss": 7.618, "step": 991700 }, { "epoch": 4.040375235897336, "grad_norm": 4.808955192565918, "learning_rate": 0.0016226162147184753, "loss": 7.62, "step": 991800 }, { "epoch": 4.040782613920718, "grad_norm": 5.324098587036133, "learning_rate": 0.00162215219955195, "loss": 7.6277, "step": 991900 }, { "epoch": 4.0411899919440994, "grad_norm": 5.577237129211426, "learning_rate": 0.0016216882192891199, "loss": 7.6204, "step": 992000 }, { "epoch": 4.0411899919440994, "eval_MaskedAccuracy": 0.5092054552655934, "eval_loss": 1.6071096658706665, "eval_runtime": 154.4256, "eval_samples_per_second": 411.046, "eval_steps_per_second": 1.606, "step": 992000 }, { "epoch": 4.041597369967481, "grad_norm": 3.5991404056549072, "learning_rate": 0.0016212242739483292, "loss": 7.5895, "step": 992100 }, { "epoch": 4.0420047479908625, "grad_norm": 7.769160270690918, "learning_rate": 0.0016207603635479207, "loss": 7.6234, "step": 992200 }, { "epoch": 4.042412126014244, "grad_norm": 4.4813361167907715, "learning_rate": 0.0016202964881062338, "loss": 7.6024, "step": 992300 }, { "epoch": 4.042819504037626, "grad_norm": 4.528625011444092, "learning_rate": 0.001619832647641608, "loss": 7.6168, "step": 992400 }, { "epoch": 4.043226882061007, "grad_norm": 5.218510150909424, "learning_rate": 0.0016193688421723832, "loss": 7.5872, "step": 992500 }, { "epoch": 4.043634260084389, "grad_norm": 6.234872341156006, "learning_rate": 0.0016189050717168917, "loss": 7.6172, "step": 992600 }, { "epoch": 4.04404163810777, "grad_norm": 5.389354228973389, "learning_rate": 0.0016184413362934737, "loss": 7.6093, "step": 992700 }, { "epoch": 4.044449016131152, "grad_norm": 8.0420560836792, "learning_rate": 0.0016179776359204606, "loss": 7.626, "step": 992800 }, { "epoch": 4.044856394154532, "grad_norm": 4.45302152633667, "learning_rate": 0.0016175139706161878, "loss": 7.6349, "step": 992900 }, { "epoch": 4.045263772177914, "grad_norm": 5.611714839935303, "learning_rate": 0.0016170503403989834, "loss": 7.6196, "step": 993000 }, { "epoch": 4.045263772177914, "eval_MaskedAccuracy": 0.510505442108171, "eval_loss": 1.6032553911209106, "eval_runtime": 152.13, "eval_samples_per_second": 417.248, "eval_steps_per_second": 1.63, "step": 993000 }, { "epoch": 4.045671150201295, "grad_norm": 6.103917598724365, "learning_rate": 0.0016165867452871774, "loss": 7.6103, "step": 993100 }, { "epoch": 4.046078528224677, "grad_norm": 6.502655506134033, "learning_rate": 0.0016161231852990993, "loss": 7.5727, "step": 993200 }, { "epoch": 4.0464859062480585, "grad_norm": 3.6164493560791016, "learning_rate": 0.0016156596604530759, "loss": 7.6015, "step": 993300 }, { "epoch": 4.04689328427144, "grad_norm": 12.996712684631348, "learning_rate": 0.0016151961707674323, "loss": 7.6171, "step": 993400 }, { "epoch": 4.0473006622948215, "grad_norm": 9.091630935668945, "learning_rate": 0.0016147327162604945, "loss": 7.5743, "step": 993500 }, { "epoch": 4.047708040318203, "grad_norm": 5.92149543762207, "learning_rate": 0.0016142692969505815, "loss": 7.6215, "step": 993600 }, { "epoch": 4.048115418341585, "grad_norm": 2.944608211517334, "learning_rate": 0.001613805912856018, "loss": 7.6067, "step": 993700 }, { "epoch": 4.048522796364966, "grad_norm": 5.606834411621094, "learning_rate": 0.0016133425639951239, "loss": 7.6413, "step": 993800 }, { "epoch": 4.048930174388348, "grad_norm": 4.560391902923584, "learning_rate": 0.0016128792503862168, "loss": 7.619, "step": 993900 }, { "epoch": 4.049337552411729, "grad_norm": 8.38029670715332, "learning_rate": 0.0016124159720476146, "loss": 7.6077, "step": 994000 }, { "epoch": 4.049337552411729, "eval_MaskedAccuracy": 0.5097825120515572, "eval_loss": 1.6010316610336304, "eval_runtime": 154.5815, "eval_samples_per_second": 410.631, "eval_steps_per_second": 1.604, "step": 994000 }, { "epoch": 4.04974493043511, "grad_norm": 6.94528865814209, "learning_rate": 0.0016119527289976318, "loss": 7.6216, "step": 994100 }, { "epoch": 4.050152308458491, "grad_norm": 7.764667510986328, "learning_rate": 0.0016114895212545849, "loss": 7.6427, "step": 994200 }, { "epoch": 4.050559686481873, "grad_norm": 5.185911655426025, "learning_rate": 0.0016110263488367858, "loss": 7.6261, "step": 994300 }, { "epoch": 4.0509670645052545, "grad_norm": 10.981317520141602, "learning_rate": 0.001610563211762547, "loss": 7.6634, "step": 994400 }, { "epoch": 4.051374442528636, "grad_norm": 6.662348747253418, "learning_rate": 0.0016101001100501785, "loss": 7.6104, "step": 994500 }, { "epoch": 4.0517818205520175, "grad_norm": 8.90304183959961, "learning_rate": 0.0016096370437179881, "loss": 7.6094, "step": 994600 }, { "epoch": 4.052189198575399, "grad_norm": 3.5736660957336426, "learning_rate": 0.0016091740127842846, "loss": 7.62, "step": 994700 }, { "epoch": 4.052596576598781, "grad_norm": 6.343061447143555, "learning_rate": 0.001608711017267372, "loss": 7.62, "step": 994800 }, { "epoch": 4.053003954622162, "grad_norm": 3.335782051086426, "learning_rate": 0.0016082480571855582, "loss": 7.6294, "step": 994900 }, { "epoch": 4.053411332645544, "grad_norm": 8.102592468261719, "learning_rate": 0.001607785132557147, "loss": 7.5968, "step": 995000 }, { "epoch": 4.053411332645544, "eval_MaskedAccuracy": 0.5096118635349681, "eval_loss": 1.6051462888717651, "eval_runtime": 184.6131, "eval_samples_per_second": 343.833, "eval_steps_per_second": 1.343, "step": 995000 }, { "epoch": 4.053818710668925, "grad_norm": 4.340451240539551, "learning_rate": 0.0016073222434004341, "loss": 7.6314, "step": 995100 }, { "epoch": 4.054226088692307, "grad_norm": 6.552633285522461, "learning_rate": 0.001606859389733726, "loss": 7.6269, "step": 995200 }, { "epoch": 4.054633466715688, "grad_norm": 6.394277572631836, "learning_rate": 0.0016063965715753207, "loss": 7.629, "step": 995300 }, { "epoch": 4.055040844739069, "grad_norm": 3.224449634552002, "learning_rate": 0.0016059337889435129, "loss": 7.6144, "step": 995400 }, { "epoch": 4.05544822276245, "grad_norm": 6.690535545349121, "learning_rate": 0.0016054710418566002, "loss": 7.5875, "step": 995500 }, { "epoch": 4.055855600785832, "grad_norm": 10.263288497924805, "learning_rate": 0.0016050083303328787, "loss": 7.6265, "step": 995600 }, { "epoch": 4.0562629788092135, "grad_norm": 7.773268699645996, "learning_rate": 0.0016045456543906432, "loss": 7.6158, "step": 995700 }, { "epoch": 4.056670356832595, "grad_norm": 4.887666702270508, "learning_rate": 0.0016040830140481835, "loss": 7.594, "step": 995800 }, { "epoch": 4.057077734855977, "grad_norm": 4.525401592254639, "learning_rate": 0.0016036204093237904, "loss": 7.6172, "step": 995900 }, { "epoch": 4.057485112879358, "grad_norm": 4.199537754058838, "learning_rate": 0.0016031578402357536, "loss": 7.6219, "step": 996000 }, { "epoch": 4.057485112879358, "eval_MaskedAccuracy": 0.5098717340069447, "eval_loss": 1.6112149953842163, "eval_runtime": 152.5527, "eval_samples_per_second": 416.092, "eval_steps_per_second": 1.626, "step": 996000 }, { "epoch": 4.05789249090274, "grad_norm": 4.9359002113342285, "learning_rate": 0.001602695306802361, "loss": 7.6123, "step": 996100 }, { "epoch": 4.058299868926121, "grad_norm": 2.5359299182891846, "learning_rate": 0.0016022328090418978, "loss": 7.6179, "step": 996200 }, { "epoch": 4.058707246949503, "grad_norm": 4.1552581787109375, "learning_rate": 0.0016017703469726487, "loss": 7.6299, "step": 996300 }, { "epoch": 4.059114624972884, "grad_norm": 4.035418510437012, "learning_rate": 0.001601307920612902, "loss": 7.6203, "step": 996400 }, { "epoch": 4.059522002996266, "grad_norm": 6.833289623260498, "learning_rate": 0.0016008455299809361, "loss": 7.5886, "step": 996500 }, { "epoch": 4.059929381019646, "grad_norm": 4.115273952484131, "learning_rate": 0.0016003831750950293, "loss": 7.6369, "step": 996600 }, { "epoch": 4.060336759043028, "grad_norm": 5.999115943908691, "learning_rate": 0.0015999208559734658, "loss": 7.6249, "step": 996700 }, { "epoch": 4.0607441370664095, "grad_norm": 7.40685510635376, "learning_rate": 0.0015994585726345207, "loss": 7.6339, "step": 996800 }, { "epoch": 4.061151515089791, "grad_norm": 6.59105110168457, "learning_rate": 0.0015989963250964715, "loss": 7.6084, "step": 996900 }, { "epoch": 4.0615588931131725, "grad_norm": 4.839681148529053, "learning_rate": 0.001598534113377593, "loss": 7.6405, "step": 997000 }, { "epoch": 4.0615588931131725, "eval_MaskedAccuracy": 0.5093082663863335, "eval_loss": 1.6117303371429443, "eval_runtime": 156.9476, "eval_samples_per_second": 404.441, "eval_steps_per_second": 1.58, "step": 997000 }, { "epoch": 4.061966271136554, "grad_norm": 3.898164987564087, "learning_rate": 0.0015980719374961607, "loss": 7.6197, "step": 997100 }, { "epoch": 4.062373649159936, "grad_norm": 11.058553695678711, "learning_rate": 0.001597609797470446, "loss": 7.6005, "step": 997200 }, { "epoch": 4.062781027183317, "grad_norm": 3.065469741821289, "learning_rate": 0.0015971476933187194, "loss": 7.6122, "step": 997300 }, { "epoch": 4.063188405206699, "grad_norm": 9.281084060668945, "learning_rate": 0.0015966856250592492, "loss": 7.6556, "step": 997400 }, { "epoch": 4.06359578323008, "grad_norm": 7.05238676071167, "learning_rate": 0.0015962235927103024, "loss": 7.6558, "step": 997500 }, { "epoch": 4.064003161253462, "grad_norm": 4.852511882781982, "learning_rate": 0.0015957615962901494, "loss": 7.6202, "step": 997600 }, { "epoch": 4.064410539276843, "grad_norm": 6.081695556640625, "learning_rate": 0.0015952996358170536, "loss": 7.6344, "step": 997700 }, { "epoch": 4.064817917300225, "grad_norm": 12.288058280944824, "learning_rate": 0.001594837711309277, "loss": 7.6243, "step": 997800 }, { "epoch": 4.065225295323605, "grad_norm": 3.905149221420288, "learning_rate": 0.0015943758227850845, "loss": 7.5941, "step": 997900 }, { "epoch": 4.065632673346987, "grad_norm": 7.204836845397949, "learning_rate": 0.0015939139702627356, "loss": 7.6221, "step": 998000 }, { "epoch": 4.065632673346987, "eval_MaskedAccuracy": 0.5095109931374482, "eval_loss": 1.6079601049423218, "eval_runtime": 211.6343, "eval_samples_per_second": 299.932, "eval_steps_per_second": 1.172, "step": 998000 }, { "epoch": 4.0660400513703685, "grad_norm": 7.875951766967773, "learning_rate": 0.0015934521537604896, "loss": 7.6319, "step": 998100 }, { "epoch": 4.06644742939375, "grad_norm": 9.63471794128418, "learning_rate": 0.0015929903732966062, "loss": 7.6374, "step": 998200 }, { "epoch": 4.066854807417132, "grad_norm": 9.316731452941895, "learning_rate": 0.0015925286288893405, "loss": 7.6151, "step": 998300 }, { "epoch": 4.067262185440513, "grad_norm": 6.268803119659424, "learning_rate": 0.001592066920556949, "loss": 7.646, "step": 998400 }, { "epoch": 4.067669563463895, "grad_norm": 5.5083770751953125, "learning_rate": 0.0015916052483176837, "loss": 7.5851, "step": 998500 }, { "epoch": 4.068076941487276, "grad_norm": 4.724705219268799, "learning_rate": 0.001591143612189798, "loss": 7.6039, "step": 998600 }, { "epoch": 4.068484319510658, "grad_norm": 7.567147254943848, "learning_rate": 0.0015906820121915446, "loss": 7.6138, "step": 998700 }, { "epoch": 4.068891697534039, "grad_norm": 6.970294952392578, "learning_rate": 0.0015902204483411696, "loss": 7.5879, "step": 998800 }, { "epoch": 4.069299075557421, "grad_norm": 4.607194900512695, "learning_rate": 0.001589758920656924, "loss": 7.5753, "step": 998900 }, { "epoch": 4.069706453580802, "grad_norm": 16.934329986572266, "learning_rate": 0.0015892974291570534, "loss": 7.617, "step": 999000 }, { "epoch": 4.069706453580802, "eval_MaskedAccuracy": 0.5097562622975296, "eval_loss": 1.5989048480987549, "eval_runtime": 160.4169, "eval_samples_per_second": 395.694, "eval_steps_per_second": 1.546, "step": 999000 }, { "epoch": 4.070113831604183, "grad_norm": 5.921639442443848, "learning_rate": 0.0015888359738598025, "loss": 7.6129, "step": 999100 }, { "epoch": 4.0705212096275645, "grad_norm": 10.488616943359375, "learning_rate": 0.0015883745547834159, "loss": 7.5983, "step": 999200 }, { "epoch": 4.070928587650946, "grad_norm": 8.565415382385254, "learning_rate": 0.0015879131719461353, "loss": 7.6103, "step": 999300 }, { "epoch": 4.0713359656743275, "grad_norm": 5.272050857543945, "learning_rate": 0.0015874518253662024, "loss": 7.6472, "step": 999400 }, { "epoch": 4.071743343697709, "grad_norm": 6.496955871582031, "learning_rate": 0.0015869905150618555, "loss": 7.5782, "step": 999500 }, { "epoch": 4.072150721721091, "grad_norm": 10.269801139831543, "learning_rate": 0.0015865292410513344, "loss": 7.5903, "step": 999600 }, { "epoch": 4.072558099744472, "grad_norm": 7.985873222351074, "learning_rate": 0.001586068003352875, "loss": 7.6196, "step": 999700 }, { "epoch": 4.072965477767854, "grad_norm": 3.554377794265747, "learning_rate": 0.001585606801984711, "loss": 7.6271, "step": 999800 }, { "epoch": 4.073372855791235, "grad_norm": 6.6276140213012695, "learning_rate": 0.0015851456369650782, "loss": 7.6388, "step": 999900 }, { "epoch": 4.073780233814617, "grad_norm": 4.672713756561279, "learning_rate": 0.0015846845083122068, "loss": 7.6125, "step": 1000000 }, { "epoch": 4.073780233814617, "eval_MaskedAccuracy": 0.5093898608426418, "eval_loss": 1.595578908920288, "eval_runtime": 163.6962, "eval_samples_per_second": 387.767, "eval_steps_per_second": 1.515, "step": 1000000 }, { "epoch": 4.074187611837998, "grad_norm": 5.099730968475342, "learning_rate": 0.0015842234160443301, "loss": 7.635, "step": 1000100 }, { "epoch": 4.07459498986138, "grad_norm": 3.789768695831299, "learning_rate": 0.0015837623601796787, "loss": 7.6247, "step": 1000200 }, { "epoch": 4.075002367884761, "grad_norm": 4.410281181335449, "learning_rate": 0.001583301340736476, "loss": 7.6248, "step": 1000300 }, { "epoch": 4.075409745908142, "grad_norm": 11.91579532623291, "learning_rate": 0.001582840357732951, "loss": 7.6268, "step": 1000400 }, { "epoch": 4.0758171239315235, "grad_norm": 5.041029930114746, "learning_rate": 0.0015823794111873269, "loss": 7.6216, "step": 1000500 }, { "epoch": 4.076224501954905, "grad_norm": 3.9867160320281982, "learning_rate": 0.0015819185011178296, "loss": 7.6004, "step": 1000600 }, { "epoch": 4.076631879978287, "grad_norm": 6.8187150955200195, "learning_rate": 0.001581457627542683, "loss": 7.639, "step": 1000700 }, { "epoch": 4.077039258001668, "grad_norm": 13.169952392578125, "learning_rate": 0.0015809967904801055, "loss": 7.5955, "step": 1000800 }, { "epoch": 4.07744663602505, "grad_norm": 15.388030052185059, "learning_rate": 0.0015805359899483179, "loss": 7.6198, "step": 1000900 }, { "epoch": 4.077854014048431, "grad_norm": 9.966484069824219, "learning_rate": 0.0015800752259655358, "loss": 7.6413, "step": 1001000 }, { "epoch": 4.077854014048431, "eval_MaskedAccuracy": 0.5095671997960911, "eval_loss": 1.600243091583252, "eval_runtime": 159.2321, "eval_samples_per_second": 398.638, "eval_steps_per_second": 1.557, "step": 1001000 }, { "epoch": 4.078261392071813, "grad_norm": 3.6168150901794434, "learning_rate": 0.0015796144985499767, "loss": 7.6015, "step": 1001100 }, { "epoch": 4.078668770095194, "grad_norm": 3.234494209289551, "learning_rate": 0.0015791538077198546, "loss": 7.5954, "step": 1001200 }, { "epoch": 4.079076148118576, "grad_norm": 8.642640113830566, "learning_rate": 0.0015786931534933872, "loss": 7.6037, "step": 1001300 }, { "epoch": 4.079483526141957, "grad_norm": 4.173941135406494, "learning_rate": 0.001578232535888782, "loss": 7.6303, "step": 1001400 }, { "epoch": 4.079890904165339, "grad_norm": 4.0859785079956055, "learning_rate": 0.00157777195492425, "loss": 7.627, "step": 1001500 }, { "epoch": 4.0802982821887195, "grad_norm": 7.465732574462891, "learning_rate": 0.001577311410618002, "loss": 7.6308, "step": 1001600 }, { "epoch": 4.080705660212101, "grad_norm": 9.598640441894531, "learning_rate": 0.0015768509029882465, "loss": 7.5924, "step": 1001700 }, { "epoch": 4.0811130382354825, "grad_norm": 8.930276870727539, "learning_rate": 0.0015763904320531894, "loss": 7.6109, "step": 1001800 }, { "epoch": 4.081520416258864, "grad_norm": 5.911867618560791, "learning_rate": 0.0015759299978310352, "loss": 7.6128, "step": 1001900 }, { "epoch": 4.081927794282246, "grad_norm": 3.2866337299346924, "learning_rate": 0.001575469600339987, "loss": 7.6519, "step": 1002000 }, { "epoch": 4.081927794282246, "eval_MaskedAccuracy": 0.5103328025454996, "eval_loss": 1.590938687324524, "eval_runtime": 160.5887, "eval_samples_per_second": 395.271, "eval_steps_per_second": 1.544, "step": 1002000 }, { "epoch": 4.082335172305627, "grad_norm": 6.869104862213135, "learning_rate": 0.0015750092395982473, "loss": 7.6288, "step": 1002100 }, { "epoch": 4.082742550329009, "grad_norm": 7.4071831703186035, "learning_rate": 0.0015745489156240152, "loss": 7.5954, "step": 1002200 }, { "epoch": 4.08314992835239, "grad_norm": 5.9728899002075195, "learning_rate": 0.0015740886284354913, "loss": 7.6114, "step": 1002300 }, { "epoch": 4.083557306375772, "grad_norm": 9.210744857788086, "learning_rate": 0.001573628378050873, "loss": 7.643, "step": 1002400 }, { "epoch": 4.083964684399153, "grad_norm": 4.123696327209473, "learning_rate": 0.0015731681644883544, "loss": 7.6331, "step": 1002500 }, { "epoch": 4.084372062422535, "grad_norm": 4.110208034515381, "learning_rate": 0.0015727079877661328, "loss": 7.5809, "step": 1002600 }, { "epoch": 4.084779440445916, "grad_norm": 6.685558795928955, "learning_rate": 0.001572247847902402, "loss": 7.6414, "step": 1002700 }, { "epoch": 4.085186818469298, "grad_norm": 9.374321937561035, "learning_rate": 0.0015717877449153528, "loss": 7.5948, "step": 1002800 }, { "epoch": 4.0855941964926785, "grad_norm": 7.754372596740723, "learning_rate": 0.0015713276788231736, "loss": 7.5892, "step": 1002900 }, { "epoch": 4.08600157451606, "grad_norm": 3.866623878479004, "learning_rate": 0.0015708676496440563, "loss": 7.6056, "step": 1003000 }, { "epoch": 4.08600157451606, "eval_MaskedAccuracy": 0.510134365929565, "eval_loss": 1.6037026643753052, "eval_runtime": 168.95, "eval_samples_per_second": 375.709, "eval_steps_per_second": 1.468, "step": 1003000 }, { "epoch": 4.086408952539442, "grad_norm": 3.749837875366211, "learning_rate": 0.0015704076573961844, "loss": 7.5931, "step": 1003100 }, { "epoch": 4.086816330562823, "grad_norm": 4.7531256675720215, "learning_rate": 0.001569947702097749, "loss": 7.5845, "step": 1003200 }, { "epoch": 4.087223708586205, "grad_norm": 7.454687118530273, "learning_rate": 0.0015694877837669316, "loss": 7.6232, "step": 1003300 }, { "epoch": 4.087631086609586, "grad_norm": 4.9076433181762695, "learning_rate": 0.001569027902421916, "loss": 7.5976, "step": 1003400 }, { "epoch": 4.088038464632968, "grad_norm": 7.2205891609191895, "learning_rate": 0.0015685680580808828, "loss": 7.5995, "step": 1003500 }, { "epoch": 4.088445842656349, "grad_norm": 10.62397575378418, "learning_rate": 0.0015681082507620124, "loss": 7.62, "step": 1003600 }, { "epoch": 4.088853220679731, "grad_norm": 5.233545780181885, "learning_rate": 0.0015676484804834838, "loss": 7.6229, "step": 1003700 }, { "epoch": 4.089260598703112, "grad_norm": 3.8612184524536133, "learning_rate": 0.0015671887472634735, "loss": 7.613, "step": 1003800 }, { "epoch": 4.089667976726494, "grad_norm": 6.494972229003906, "learning_rate": 0.0015667290511201565, "loss": 7.6151, "step": 1003900 }, { "epoch": 4.090075354749875, "grad_norm": 7.1356892585754395, "learning_rate": 0.0015662693920717105, "loss": 7.5971, "step": 1004000 }, { "epoch": 4.090075354749875, "eval_MaskedAccuracy": 0.5094093150471627, "eval_loss": 1.6064566373825073, "eval_runtime": 154.5638, "eval_samples_per_second": 410.678, "eval_steps_per_second": 1.605, "step": 1004000 }, { "epoch": 4.090482732773256, "grad_norm": 6.189616680145264, "learning_rate": 0.0015658097701363061, "loss": 7.6011, "step": 1004100 }, { "epoch": 4.090890110796638, "grad_norm": 6.372153282165527, "learning_rate": 0.0015653501853321145, "loss": 7.577, "step": 1004200 }, { "epoch": 4.091297488820019, "grad_norm": 9.208247184753418, "learning_rate": 0.001564890637677305, "loss": 7.6186, "step": 1004300 }, { "epoch": 4.091704866843401, "grad_norm": 6.365452766418457, "learning_rate": 0.001564431127190048, "loss": 7.5769, "step": 1004400 }, { "epoch": 4.092112244866782, "grad_norm": 5.940836429595947, "learning_rate": 0.0015639716538885069, "loss": 7.5911, "step": 1004500 }, { "epoch": 4.092519622890164, "grad_norm": 10.46280288696289, "learning_rate": 0.001563512217790851, "loss": 7.6008, "step": 1004600 }, { "epoch": 4.092927000913545, "grad_norm": 3.738527297973633, "learning_rate": 0.0015630528189152428, "loss": 7.5919, "step": 1004700 }, { "epoch": 4.093334378936927, "grad_norm": 11.295626640319824, "learning_rate": 0.001562593457279842, "loss": 7.622, "step": 1004800 }, { "epoch": 4.093741756960308, "grad_norm": 7.543964862823486, "learning_rate": 0.0015621341329028135, "loss": 7.6281, "step": 1004900 }, { "epoch": 4.09414913498369, "grad_norm": 5.9705047607421875, "learning_rate": 0.001561674845802315, "loss": 7.629, "step": 1005000 }, { "epoch": 4.09414913498369, "eval_MaskedAccuracy": 0.5098191979939412, "eval_loss": 1.5994396209716797, "eval_runtime": 150.1648, "eval_samples_per_second": 422.709, "eval_steps_per_second": 1.652, "step": 1005000 }, { "epoch": 4.094556513007071, "grad_norm": 4.041198253631592, "learning_rate": 0.0015612155959965055, "loss": 7.5617, "step": 1005100 }, { "epoch": 4.094963891030453, "grad_norm": 12.734151840209961, "learning_rate": 0.001560756383503542, "loss": 7.5798, "step": 1005200 }, { "epoch": 4.095371269053834, "grad_norm": 10.603124618530273, "learning_rate": 0.0015602972083415788, "loss": 7.5945, "step": 1005300 }, { "epoch": 4.095778647077215, "grad_norm": 8.641983985900879, "learning_rate": 0.0015598380705287693, "loss": 7.5958, "step": 1005400 }, { "epoch": 4.096186025100597, "grad_norm": 3.963597536087036, "learning_rate": 0.0015593789700832634, "loss": 7.6038, "step": 1005500 }, { "epoch": 4.096593403123978, "grad_norm": 10.408990859985352, "learning_rate": 0.0015589199070232171, "loss": 7.5924, "step": 1005600 }, { "epoch": 4.09700078114736, "grad_norm": 9.836311340332031, "learning_rate": 0.0015584608813667773, "loss": 7.6157, "step": 1005700 }, { "epoch": 4.097408159170741, "grad_norm": 6.25810432434082, "learning_rate": 0.00155800189313209, "loss": 7.5978, "step": 1005800 }, { "epoch": 4.097815537194123, "grad_norm": 12.14841365814209, "learning_rate": 0.0015575429423373042, "loss": 7.5971, "step": 1005900 }, { "epoch": 4.098222915217504, "grad_norm": 9.926202774047852, "learning_rate": 0.0015570840290005628, "loss": 7.6078, "step": 1006000 }, { "epoch": 4.098222915217504, "eval_MaskedAccuracy": 0.5101178245561072, "eval_loss": 1.602278232574463, "eval_runtime": 151.9913, "eval_samples_per_second": 417.629, "eval_steps_per_second": 1.632, "step": 1006000 }, { "epoch": 4.098630293240886, "grad_norm": 5.3686442375183105, "learning_rate": 0.001556625153140009, "loss": 7.5883, "step": 1006100 }, { "epoch": 4.099037671264267, "grad_norm": 9.6718111038208, "learning_rate": 0.0015561663147737857, "loss": 7.5826, "step": 1006200 }, { "epoch": 4.099445049287649, "grad_norm": 4.49713134765625, "learning_rate": 0.001555707513920033, "loss": 7.619, "step": 1006300 }, { "epoch": 4.09985242731103, "grad_norm": 6.915175437927246, "learning_rate": 0.0015552487505968878, "loss": 7.6089, "step": 1006400 }, { "epoch": 4.100259805334412, "grad_norm": 12.97745132446289, "learning_rate": 0.0015547900248224895, "loss": 7.6195, "step": 1006500 }, { "epoch": 4.100667183357793, "grad_norm": 6.967992782592773, "learning_rate": 0.0015543313366149737, "loss": 7.5922, "step": 1006600 }, { "epoch": 4.101074561381174, "grad_norm": 3.911487340927124, "learning_rate": 0.001553872685992476, "loss": 7.5906, "step": 1006700 }, { "epoch": 4.101481939404556, "grad_norm": 8.793313026428223, "learning_rate": 0.0015534140729731266, "loss": 7.6096, "step": 1006800 }, { "epoch": 4.101889317427937, "grad_norm": 7.74241828918457, "learning_rate": 0.0015529554975750592, "loss": 7.5587, "step": 1006900 }, { "epoch": 4.102296695451319, "grad_norm": 11.245430946350098, "learning_rate": 0.0015524969598164034, "loss": 7.6231, "step": 1007000 }, { "epoch": 4.102296695451319, "eval_MaskedAccuracy": 0.509991830061726, "eval_loss": 1.5961247682571411, "eval_runtime": 157.9598, "eval_samples_per_second": 401.849, "eval_steps_per_second": 1.57, "step": 1007000 }, { "epoch": 4.1027040734747, "grad_norm": 5.222518444061279, "learning_rate": 0.0015520384597152866, "loss": 7.5836, "step": 1007100 }, { "epoch": 4.103111451498082, "grad_norm": 7.076982498168945, "learning_rate": 0.0015515799972898351, "loss": 7.5813, "step": 1007200 }, { "epoch": 4.103518829521463, "grad_norm": 4.8812947273254395, "learning_rate": 0.0015511215725581766, "loss": 7.5946, "step": 1007300 }, { "epoch": 4.103926207544845, "grad_norm": 5.624736309051514, "learning_rate": 0.0015506631855384337, "loss": 7.6165, "step": 1007400 }, { "epoch": 4.104333585568226, "grad_norm": 6.506237983703613, "learning_rate": 0.001550204836248731, "loss": 7.6067, "step": 1007500 }, { "epoch": 4.104740963591608, "grad_norm": 4.0211896896362305, "learning_rate": 0.0015497465247071868, "loss": 7.5996, "step": 1007600 }, { "epoch": 4.105148341614989, "grad_norm": 5.129552364349365, "learning_rate": 0.00154928825093192, "loss": 7.5939, "step": 1007700 }, { "epoch": 4.105555719638371, "grad_norm": 5.615102291107178, "learning_rate": 0.00154883001494105, "loss": 7.5863, "step": 1007800 }, { "epoch": 4.105963097661752, "grad_norm": 3.3849809169769287, "learning_rate": 0.0015483718167526948, "loss": 7.623, "step": 1007900 }, { "epoch": 4.106370475685133, "grad_norm": 4.826959609985352, "learning_rate": 0.001547913656384966, "loss": 7.6, "step": 1008000 }, { "epoch": 4.106370475685133, "eval_MaskedAccuracy": 0.5105324922947476, "eval_loss": 1.59247887134552, "eval_runtime": 169.9465, "eval_samples_per_second": 373.506, "eval_steps_per_second": 1.459, "step": 1008000 }, { "epoch": 4.106777853708515, "grad_norm": 7.931018352508545, "learning_rate": 0.0015474555338559819, "loss": 7.6056, "step": 1008100 }, { "epoch": 4.107185231731896, "grad_norm": 3.6159827709198, "learning_rate": 0.0015469974491838507, "loss": 7.6173, "step": 1008200 }, { "epoch": 4.107592609755278, "grad_norm": 4.925380229949951, "learning_rate": 0.001546539402386685, "loss": 7.6099, "step": 1008300 }, { "epoch": 4.107999987778659, "grad_norm": 6.228870868682861, "learning_rate": 0.001546081393482591, "loss": 7.6284, "step": 1008400 }, { "epoch": 4.108407365802041, "grad_norm": 4.068419933319092, "learning_rate": 0.0015456234224896779, "loss": 7.586, "step": 1008500 }, { "epoch": 4.108814743825422, "grad_norm": 5.585260391235352, "learning_rate": 0.0015451654894260538, "loss": 7.5905, "step": 1008600 }, { "epoch": 4.109222121848804, "grad_norm": 9.680155754089355, "learning_rate": 0.001544707594309821, "loss": 7.6205, "step": 1008700 }, { "epoch": 4.109629499872185, "grad_norm": 6.640585422515869, "learning_rate": 0.0015442497371590835, "loss": 7.597, "step": 1008800 }, { "epoch": 4.110036877895567, "grad_norm": 6.251063823699951, "learning_rate": 0.0015437919179919425, "loss": 7.625, "step": 1008900 }, { "epoch": 4.1104442559189485, "grad_norm": 6.639671802520752, "learning_rate": 0.0015433341368264976, "loss": 7.6054, "step": 1009000 }, { "epoch": 4.1104442559189485, "eval_MaskedAccuracy": 0.5102449508291053, "eval_loss": 1.5968902111053467, "eval_runtime": 151.4579, "eval_samples_per_second": 419.1, "eval_steps_per_second": 1.637, "step": 1009000 }, { "epoch": 4.110851633942329, "grad_norm": 5.833513259887695, "learning_rate": 0.0015428763936808465, "loss": 7.5765, "step": 1009100 }, { "epoch": 4.111259011965711, "grad_norm": 13.104619026184082, "learning_rate": 0.0015424186885730888, "loss": 7.6393, "step": 1009200 }, { "epoch": 4.111666389989092, "grad_norm": 8.669432640075684, "learning_rate": 0.0015419610215213183, "loss": 7.5906, "step": 1009300 }, { "epoch": 4.112073768012474, "grad_norm": 6.3840227127075195, "learning_rate": 0.001541503392543628, "loss": 7.6109, "step": 1009400 }, { "epoch": 4.112481146035855, "grad_norm": 9.165207862854004, "learning_rate": 0.001541045801658114, "loss": 7.6238, "step": 1009500 }, { "epoch": 4.112888524059237, "grad_norm": 10.769502639770508, "learning_rate": 0.001540588248882864, "loss": 7.5711, "step": 1009600 }, { "epoch": 4.113295902082618, "grad_norm": 4.284952640533447, "learning_rate": 0.0015401307342359696, "loss": 7.616, "step": 1009700 }, { "epoch": 4.113703280106, "grad_norm": 8.703279495239258, "learning_rate": 0.0015396732577355173, "loss": 7.616, "step": 1009800 }, { "epoch": 4.114110658129381, "grad_norm": 6.439189434051514, "learning_rate": 0.0015392158193995938, "loss": 7.6108, "step": 1009900 }, { "epoch": 4.114518036152763, "grad_norm": 3.0160834789276123, "learning_rate": 0.0015387584192462836, "loss": 7.5934, "step": 1010000 }, { "epoch": 4.114518036152763, "eval_MaskedAccuracy": 0.5099782953950945, "eval_loss": 1.5968457460403442, "eval_runtime": 152.8504, "eval_samples_per_second": 415.282, "eval_steps_per_second": 1.623, "step": 1010000 }, { "epoch": 4.114925414176144, "grad_norm": 4.228946208953857, "learning_rate": 0.001538301057293671, "loss": 7.6321, "step": 1010100 }, { "epoch": 4.115332792199526, "grad_norm": 4.3874616622924805, "learning_rate": 0.0015378437335598373, "loss": 7.616, "step": 1010200 }, { "epoch": 4.1157401702229075, "grad_norm": 4.825639724731445, "learning_rate": 0.0015373864480628648, "loss": 7.6088, "step": 1010300 }, { "epoch": 4.116147548246288, "grad_norm": 4.5141448974609375, "learning_rate": 0.00153692920082083, "loss": 7.6245, "step": 1010400 }, { "epoch": 4.11655492626967, "grad_norm": 8.228910446166992, "learning_rate": 0.0015364719918518114, "loss": 7.5834, "step": 1010500 }, { "epoch": 4.116962304293051, "grad_norm": 7.053731918334961, "learning_rate": 0.001536014821173885, "loss": 7.6027, "step": 1010600 }, { "epoch": 4.117369682316433, "grad_norm": 6.305589199066162, "learning_rate": 0.0015355576888051263, "loss": 7.5999, "step": 1010700 }, { "epoch": 4.117777060339814, "grad_norm": 6.593777656555176, "learning_rate": 0.0015351005947636073, "loss": 7.584, "step": 1010800 }, { "epoch": 4.118184438363196, "grad_norm": 7.992525100708008, "learning_rate": 0.0015346435390673988, "loss": 7.5915, "step": 1010900 }, { "epoch": 4.118591816386577, "grad_norm": 4.64131498336792, "learning_rate": 0.0015341865217345708, "loss": 7.596, "step": 1011000 }, { "epoch": 4.118591816386577, "eval_MaskedAccuracy": 0.5103032010326884, "eval_loss": 1.5907819271087646, "eval_runtime": 152.8091, "eval_samples_per_second": 415.394, "eval_steps_per_second": 1.623, "step": 1011000 }, { "epoch": 4.118999194409959, "grad_norm": 4.767360687255859, "learning_rate": 0.0015337295427831914, "loss": 7.6241, "step": 1011100 }, { "epoch": 4.11940657243334, "grad_norm": 12.26150894165039, "learning_rate": 0.0015332726022313271, "loss": 7.6232, "step": 1011200 }, { "epoch": 4.119813950456722, "grad_norm": 3.7885890007019043, "learning_rate": 0.0015328157000970447, "loss": 7.5975, "step": 1011300 }, { "epoch": 4.1202213284801035, "grad_norm": 9.345982551574707, "learning_rate": 0.0015323588363984075, "loss": 7.6507, "step": 1011400 }, { "epoch": 4.120628706503485, "grad_norm": 3.2871835231781006, "learning_rate": 0.0015319020111534778, "loss": 7.5993, "step": 1011500 }, { "epoch": 4.121036084526866, "grad_norm": 12.509317398071289, "learning_rate": 0.0015314452243803155, "loss": 7.5736, "step": 1011600 }, { "epoch": 4.121443462550247, "grad_norm": 10.545855522155762, "learning_rate": 0.001530988476096982, "loss": 7.6432, "step": 1011700 }, { "epoch": 4.121850840573629, "grad_norm": 12.254087448120117, "learning_rate": 0.001530531766321532, "loss": 7.6374, "step": 1011800 }, { "epoch": 4.12225821859701, "grad_norm": 7.642628192901611, "learning_rate": 0.0015300750950720237, "loss": 7.6145, "step": 1011900 }, { "epoch": 4.122665596620392, "grad_norm": 8.72175121307373, "learning_rate": 0.0015296184623665113, "loss": 7.5868, "step": 1012000 }, { "epoch": 4.122665596620392, "eval_MaskedAccuracy": 0.509399024333583, "eval_loss": 1.596692681312561, "eval_runtime": 160.3397, "eval_samples_per_second": 395.884, "eval_steps_per_second": 1.547, "step": 1012000 }, { "epoch": 4.123072974643773, "grad_norm": 3.646857500076294, "learning_rate": 0.0015291618682230473, "loss": 7.609, "step": 1012100 }, { "epoch": 4.123480352667155, "grad_norm": 8.230676651000977, "learning_rate": 0.0015287053126596844, "loss": 7.6032, "step": 1012200 }, { "epoch": 4.123887730690536, "grad_norm": 8.112112045288086, "learning_rate": 0.0015282487956944717, "loss": 7.6091, "step": 1012300 }, { "epoch": 4.124295108713918, "grad_norm": 8.373854637145996, "learning_rate": 0.001527792317345457, "loss": 7.5804, "step": 1012400 }, { "epoch": 4.1247024867372994, "grad_norm": 4.3560943603515625, "learning_rate": 0.0015273358776306905, "loss": 7.6364, "step": 1012500 }, { "epoch": 4.125109864760681, "grad_norm": 4.420809745788574, "learning_rate": 0.001526879476568214, "loss": 7.5761, "step": 1012600 }, { "epoch": 4.1255172427840625, "grad_norm": 9.671296119689941, "learning_rate": 0.0015264231141760753, "loss": 7.6387, "step": 1012700 }, { "epoch": 4.125924620807444, "grad_norm": 6.722683906555176, "learning_rate": 0.0015259667904723136, "loss": 7.6265, "step": 1012800 }, { "epoch": 4.126331998830825, "grad_norm": 7.6011834144592285, "learning_rate": 0.001525510505474971, "loss": 7.6063, "step": 1012900 }, { "epoch": 4.126739376854206, "grad_norm": 8.455955505371094, "learning_rate": 0.0015250542592020857, "loss": 7.6501, "step": 1013000 }, { "epoch": 4.126739376854206, "eval_MaskedAccuracy": 0.5094283299674538, "eval_loss": 1.5934301614761353, "eval_runtime": 155.974, "eval_samples_per_second": 406.965, "eval_steps_per_second": 1.59, "step": 1013000 }, { "epoch": 4.127146754877588, "grad_norm": 4.501232624053955, "learning_rate": 0.0015245980516716974, "loss": 7.6197, "step": 1013100 }, { "epoch": 4.127554132900969, "grad_norm": 4.817880630493164, "learning_rate": 0.0015241418829018414, "loss": 7.6162, "step": 1013200 }, { "epoch": 4.127961510924351, "grad_norm": 11.781376838684082, "learning_rate": 0.0015236857529105532, "loss": 7.6102, "step": 1013300 }, { "epoch": 4.128368888947732, "grad_norm": 11.634780883789062, "learning_rate": 0.001523229661715866, "loss": 7.6006, "step": 1013400 }, { "epoch": 4.128776266971114, "grad_norm": 4.63670539855957, "learning_rate": 0.001522773609335811, "loss": 7.5974, "step": 1013500 }, { "epoch": 4.129183644994495, "grad_norm": 6.439024925231934, "learning_rate": 0.0015223175957884171, "loss": 7.5839, "step": 1013600 }, { "epoch": 4.129591023017877, "grad_norm": 6.234420299530029, "learning_rate": 0.0015218616210917163, "loss": 7.6136, "step": 1013700 }, { "epoch": 4.1299984010412585, "grad_norm": 4.24252462387085, "learning_rate": 0.0015214056852637325, "loss": 7.6078, "step": 1013800 }, { "epoch": 4.13040577906464, "grad_norm": 12.226213455200195, "learning_rate": 0.0015209497883224919, "loss": 7.6127, "step": 1013900 }, { "epoch": 4.1308131570880215, "grad_norm": 12.963217735290527, "learning_rate": 0.0015204939302860202, "loss": 7.5984, "step": 1014000 }, { "epoch": 4.1308131570880215, "eval_MaskedAccuracy": 0.5100630276593748, "eval_loss": 1.5979079008102417, "eval_runtime": 151.547, "eval_samples_per_second": 418.854, "eval_steps_per_second": 1.636, "step": 1014000 }, { "epoch": 4.131220535111402, "grad_norm": 3.741776704788208, "learning_rate": 0.0015200381111723402, "loss": 7.6189, "step": 1014100 }, { "epoch": 4.131627913134784, "grad_norm": 5.306728363037109, "learning_rate": 0.0015195823309994698, "loss": 7.6562, "step": 1014200 }, { "epoch": 4.132035291158165, "grad_norm": 3.6316115856170654, "learning_rate": 0.0015191265897854314, "loss": 7.6176, "step": 1014300 }, { "epoch": 4.132442669181547, "grad_norm": 10.061392784118652, "learning_rate": 0.001518670887548241, "loss": 7.6344, "step": 1014400 }, { "epoch": 4.132850047204928, "grad_norm": 4.140845775604248, "learning_rate": 0.0015182152243059172, "loss": 7.6132, "step": 1014500 }, { "epoch": 4.13325742522831, "grad_norm": 7.131834983825684, "learning_rate": 0.001517759600076473, "loss": 7.6036, "step": 1014600 }, { "epoch": 4.133664803251691, "grad_norm": 3.7401998043060303, "learning_rate": 0.0015173040148779215, "loss": 7.6173, "step": 1014700 }, { "epoch": 4.134072181275073, "grad_norm": 6.527963638305664, "learning_rate": 0.0015168484687282742, "loss": 7.6034, "step": 1014800 }, { "epoch": 4.1344795592984545, "grad_norm": 12.643240928649902, "learning_rate": 0.0015163929616455426, "loss": 7.6096, "step": 1014900 }, { "epoch": 4.134886937321836, "grad_norm": 6.257790565490723, "learning_rate": 0.0015159374936477344, "loss": 7.6189, "step": 1015000 }, { "epoch": 4.134886937321836, "eval_MaskedAccuracy": 0.5102536230156649, "eval_loss": 1.6067798137664795, "eval_runtime": 156.6268, "eval_samples_per_second": 405.269, "eval_steps_per_second": 1.583, "step": 1015000 }, { "epoch": 4.1352943153452175, "grad_norm": 9.284547805786133, "learning_rate": 0.001515482064752858, "loss": 7.602, "step": 1015100 }, { "epoch": 4.135701693368599, "grad_norm": 6.334458827972412, "learning_rate": 0.0015150266749789161, "loss": 7.5702, "step": 1015200 }, { "epoch": 4.13610907139198, "grad_norm": 11.795574188232422, "learning_rate": 0.0015145713243439152, "loss": 7.5911, "step": 1015300 }, { "epoch": 4.136516449415361, "grad_norm": 6.572207450866699, "learning_rate": 0.0015141160128658575, "loss": 7.5995, "step": 1015400 }, { "epoch": 4.136923827438743, "grad_norm": 9.94412899017334, "learning_rate": 0.001513660740562742, "loss": 7.5867, "step": 1015500 }, { "epoch": 4.137331205462124, "grad_norm": 15.848379135131836, "learning_rate": 0.0015132055074525707, "loss": 7.5937, "step": 1015600 }, { "epoch": 4.137738583485506, "grad_norm": 8.111241340637207, "learning_rate": 0.0015127503135533393, "loss": 7.6214, "step": 1015700 }, { "epoch": 4.138145961508887, "grad_norm": 8.875723838806152, "learning_rate": 0.0015122951588830453, "loss": 7.612, "step": 1015800 }, { "epoch": 4.138553339532269, "grad_norm": 7.9057817459106445, "learning_rate": 0.001511840043459683, "loss": 7.6189, "step": 1015900 }, { "epoch": 4.13896071755565, "grad_norm": 3.75134015083313, "learning_rate": 0.001511384967301245, "loss": 7.6028, "step": 1016000 }, { "epoch": 4.13896071755565, "eval_MaskedAccuracy": 0.5100770322511757, "eval_loss": 1.59774649143219, "eval_runtime": 159.2981, "eval_samples_per_second": 398.473, "eval_steps_per_second": 1.557, "step": 1016000 }, { "epoch": 4.139368095579032, "grad_norm": 4.936377048492432, "learning_rate": 0.001510929930425725, "loss": 7.5967, "step": 1016100 }, { "epoch": 4.1397754736024135, "grad_norm": 6.553722858428955, "learning_rate": 0.0015104749328511118, "loss": 7.5972, "step": 1016200 }, { "epoch": 4.140182851625795, "grad_norm": 8.784997940063477, "learning_rate": 0.0015100199745953918, "loss": 7.6006, "step": 1016300 }, { "epoch": 4.140590229649177, "grad_norm": 4.709084987640381, "learning_rate": 0.001509565055676554, "loss": 7.5659, "step": 1016400 }, { "epoch": 4.140997607672558, "grad_norm": 9.501605987548828, "learning_rate": 0.0015091101761125843, "loss": 7.6151, "step": 1016500 }, { "epoch": 4.141404985695939, "grad_norm": 4.299465656280518, "learning_rate": 0.0015086553359214647, "loss": 7.6033, "step": 1016600 }, { "epoch": 4.14181236371932, "grad_norm": 19.92583465576172, "learning_rate": 0.0015082005351211793, "loss": 7.6114, "step": 1016700 }, { "epoch": 4.142219741742702, "grad_norm": 7.212285995483398, "learning_rate": 0.001507745773729707, "loss": 7.6489, "step": 1016800 }, { "epoch": 4.142627119766083, "grad_norm": 5.368351936340332, "learning_rate": 0.0015072910517650296, "loss": 7.607, "step": 1016900 }, { "epoch": 4.143034497789465, "grad_norm": 5.146826267242432, "learning_rate": 0.0015068363692451218, "loss": 7.6289, "step": 1017000 }, { "epoch": 4.143034497789465, "eval_MaskedAccuracy": 0.5098016886272805, "eval_loss": 1.6056424379348755, "eval_runtime": 166.4361, "eval_samples_per_second": 381.384, "eval_steps_per_second": 1.49, "step": 1017000 }, { "epoch": 4.143441875812846, "grad_norm": 4.130514144897461, "learning_rate": 0.0015063817261879615, "loss": 7.6085, "step": 1017100 }, { "epoch": 4.143849253836228, "grad_norm": 13.537511825561523, "learning_rate": 0.0015059271226115219, "loss": 7.6193, "step": 1017200 }, { "epoch": 4.1442566318596095, "grad_norm": 5.954915523529053, "learning_rate": 0.001505472558533777, "loss": 7.5905, "step": 1017300 }, { "epoch": 4.144664009882991, "grad_norm": 6.077690601348877, "learning_rate": 0.0015050180339726965, "loss": 7.6382, "step": 1017400 }, { "epoch": 4.1450713879063725, "grad_norm": 4.083377838134766, "learning_rate": 0.0015045635489462526, "loss": 7.626, "step": 1017500 }, { "epoch": 4.145478765929754, "grad_norm": 6.283422470092773, "learning_rate": 0.0015041091034724118, "loss": 7.6278, "step": 1017600 }, { "epoch": 4.145886143953136, "grad_norm": 3.2693064212799072, "learning_rate": 0.0015036546975691425, "loss": 7.6108, "step": 1017700 }, { "epoch": 4.146293521976517, "grad_norm": 5.898975372314453, "learning_rate": 0.0015032003312544081, "loss": 7.6138, "step": 1017800 }, { "epoch": 4.146700899999898, "grad_norm": 5.674659252166748, "learning_rate": 0.0015027460045461706, "loss": 7.6308, "step": 1017900 }, { "epoch": 4.147108278023279, "grad_norm": 9.477431297302246, "learning_rate": 0.0015022917174623966, "loss": 7.6202, "step": 1018000 }, { "epoch": 4.147108278023279, "eval_MaskedAccuracy": 0.5103109231301669, "eval_loss": 1.5969358682632446, "eval_runtime": 167.1107, "eval_samples_per_second": 379.844, "eval_steps_per_second": 1.484, "step": 1018000 }, { "epoch": 4.147515656046661, "grad_norm": 4.549951076507568, "learning_rate": 0.0015018374700210449, "loss": 7.6322, "step": 1018100 }, { "epoch": 4.147923034070042, "grad_norm": 2.744879961013794, "learning_rate": 0.00150138326224007, "loss": 7.6188, "step": 1018200 }, { "epoch": 4.148330412093424, "grad_norm": 10.048369407653809, "learning_rate": 0.001500929094137433, "loss": 7.6332, "step": 1018300 }, { "epoch": 4.148737790116805, "grad_norm": 5.693751811981201, "learning_rate": 0.0015004749657310898, "loss": 7.5786, "step": 1018400 }, { "epoch": 4.149145168140187, "grad_norm": 6.913130760192871, "learning_rate": 0.0015000208770389943, "loss": 7.5854, "step": 1018500 }, { "epoch": 4.1495525461635685, "grad_norm": 5.684288501739502, "learning_rate": 0.0014995668280790985, "loss": 7.5938, "step": 1018600 }, { "epoch": 4.14995992418695, "grad_norm": 4.903898239135742, "learning_rate": 0.0014991128188693535, "loss": 7.6359, "step": 1018700 }, { "epoch": 4.150367302210332, "grad_norm": 6.3219313621521, "learning_rate": 0.0014986588494277082, "loss": 7.5987, "step": 1018800 }, { "epoch": 4.150774680233713, "grad_norm": 5.960380554199219, "learning_rate": 0.001498204919772111, "loss": 7.5887, "step": 1018900 }, { "epoch": 4.151182058257095, "grad_norm": 4.532342433929443, "learning_rate": 0.0014977510299205093, "loss": 7.6135, "step": 1019000 }, { "epoch": 4.151182058257095, "eval_MaskedAccuracy": 0.5101085505391345, "eval_loss": 1.608246088027954, "eval_runtime": 156.2264, "eval_samples_per_second": 406.308, "eval_steps_per_second": 1.587, "step": 1019000 }, { "epoch": 4.151589436280475, "grad_norm": 6.990480422973633, "learning_rate": 0.0014972971798908447, "loss": 7.5806, "step": 1019100 }, { "epoch": 4.151996814303857, "grad_norm": 5.496598243713379, "learning_rate": 0.0014968433697010622, "loss": 7.6284, "step": 1019200 }, { "epoch": 4.152404192327238, "grad_norm": 8.429397583007812, "learning_rate": 0.0014963895993691026, "loss": 7.6237, "step": 1019300 }, { "epoch": 4.15281157035062, "grad_norm": 2.5694942474365234, "learning_rate": 0.0014959358689129074, "loss": 7.6191, "step": 1019400 }, { "epoch": 4.153218948374001, "grad_norm": 7.12736701965332, "learning_rate": 0.0014954821783504152, "loss": 7.6026, "step": 1019500 }, { "epoch": 4.153626326397383, "grad_norm": 10.624343872070312, "learning_rate": 0.0014950285276995616, "loss": 7.6264, "step": 1019600 }, { "epoch": 4.1540337044207645, "grad_norm": 3.3696537017822266, "learning_rate": 0.0014945749169782824, "loss": 7.613, "step": 1019700 }, { "epoch": 4.154441082444146, "grad_norm": 4.400647163391113, "learning_rate": 0.0014941213462045097, "loss": 7.618, "step": 1019800 }, { "epoch": 4.1548484604675275, "grad_norm": 4.743301868438721, "learning_rate": 0.0014936678153961778, "loss": 7.5831, "step": 1019900 }, { "epoch": 4.155255838490909, "grad_norm": 5.98439884185791, "learning_rate": 0.001493214324571215, "loss": 7.589, "step": 1020000 }, { "epoch": 4.155255838490909, "eval_MaskedAccuracy": 0.5094930484841899, "eval_loss": 1.588067889213562, "eval_runtime": 182.8127, "eval_samples_per_second": 347.219, "eval_steps_per_second": 1.357, "step": 1020000 }, { "epoch": 4.155663216514291, "grad_norm": 9.075858116149902, "learning_rate": 0.0014927608737475527, "loss": 7.6092, "step": 1020100 }, { "epoch": 4.156070594537672, "grad_norm": 4.691498756408691, "learning_rate": 0.0014923074629431177, "loss": 7.6096, "step": 1020200 }, { "epoch": 4.156477972561053, "grad_norm": 4.332222938537598, "learning_rate": 0.0014918540921758342, "loss": 7.6053, "step": 1020300 }, { "epoch": 4.156885350584434, "grad_norm": 6.397316932678223, "learning_rate": 0.001491400761463626, "loss": 7.6311, "step": 1020400 }, { "epoch": 4.157292728607816, "grad_norm": 11.58827018737793, "learning_rate": 0.0014909474708244147, "loss": 7.6123, "step": 1020500 }, { "epoch": 4.157700106631197, "grad_norm": 4.856008052825928, "learning_rate": 0.0014904942202761241, "loss": 7.6194, "step": 1020600 }, { "epoch": 4.158107484654579, "grad_norm": 3.3364298343658447, "learning_rate": 0.001490041009836675, "loss": 7.5989, "step": 1020700 }, { "epoch": 4.1585148626779604, "grad_norm": 4.057903289794922, "learning_rate": 0.001489587839523983, "loss": 7.5818, "step": 1020800 }, { "epoch": 4.158922240701342, "grad_norm": 5.017449855804443, "learning_rate": 0.0014891347093559642, "loss": 7.5748, "step": 1020900 }, { "epoch": 4.1593296187247235, "grad_norm": 2.404221296310425, "learning_rate": 0.0014886816193505337, "loss": 7.5745, "step": 1021000 }, { "epoch": 4.1593296187247235, "eval_MaskedAccuracy": 0.5101413743344813, "eval_loss": 1.6007146835327148, "eval_runtime": 155.7336, "eval_samples_per_second": 407.593, "eval_steps_per_second": 1.592, "step": 1021000 }, { "epoch": 4.159736996748105, "grad_norm": 4.9971795082092285, "learning_rate": 0.0014882285695256045, "loss": 7.6058, "step": 1021100 }, { "epoch": 4.160144374771487, "grad_norm": 4.318365573883057, "learning_rate": 0.0014877755598990865, "loss": 7.6422, "step": 1021200 }, { "epoch": 4.160551752794868, "grad_norm": 5.754800319671631, "learning_rate": 0.0014873225904888903, "loss": 7.5933, "step": 1021300 }, { "epoch": 4.16095913081825, "grad_norm": 3.9653968811035156, "learning_rate": 0.0014868696613129265, "loss": 7.6267, "step": 1021400 }, { "epoch": 4.161366508841631, "grad_norm": 5.344866752624512, "learning_rate": 0.0014864167723891012, "loss": 7.6064, "step": 1021500 }, { "epoch": 4.161773886865012, "grad_norm": 4.454051494598389, "learning_rate": 0.0014859639237353205, "loss": 7.5842, "step": 1021600 }, { "epoch": 4.162181264888393, "grad_norm": 14.928621292114258, "learning_rate": 0.001485511115369483, "loss": 7.6124, "step": 1021700 }, { "epoch": 4.162588642911775, "grad_norm": 3.4864470958709717, "learning_rate": 0.0014850583473094946, "loss": 7.6281, "step": 1021800 }, { "epoch": 4.162996020935156, "grad_norm": 5.536746501922607, "learning_rate": 0.0014846056195732538, "loss": 7.6176, "step": 1021900 }, { "epoch": 4.163403398958538, "grad_norm": 7.407474994659424, "learning_rate": 0.001484152932178661, "loss": 7.6222, "step": 1022000 }, { "epoch": 4.163403398958538, "eval_MaskedAccuracy": 0.5099852719236984, "eval_loss": 1.6082144975662231, "eval_runtime": 157.8049, "eval_samples_per_second": 402.244, "eval_steps_per_second": 1.572, "step": 1022000 }, { "epoch": 4.1638107769819195, "grad_norm": 6.846193790435791, "learning_rate": 0.0014837002851436137, "loss": 7.6007, "step": 1022100 }, { "epoch": 4.164218155005301, "grad_norm": 4.985725402832031, "learning_rate": 0.0014832476784860044, "loss": 7.5966, "step": 1022200 }, { "epoch": 4.1646255330286825, "grad_norm": 9.667717933654785, "learning_rate": 0.0014827951122237273, "loss": 7.6015, "step": 1022300 }, { "epoch": 4.165032911052064, "grad_norm": 5.9029130935668945, "learning_rate": 0.0014823425863746812, "loss": 7.6246, "step": 1022400 }, { "epoch": 4.165440289075446, "grad_norm": 7.814671039581299, "learning_rate": 0.0014818901009567508, "loss": 7.5965, "step": 1022500 }, { "epoch": 4.165847667098827, "grad_norm": 7.018571853637695, "learning_rate": 0.0014814376559878259, "loss": 7.605, "step": 1022600 }, { "epoch": 4.166255045122209, "grad_norm": 5.882373332977295, "learning_rate": 0.0014809852514857955, "loss": 7.5994, "step": 1022700 }, { "epoch": 4.16666242314559, "grad_norm": 7.16847038269043, "learning_rate": 0.0014805328874685439, "loss": 7.6254, "step": 1022800 }, { "epoch": 4.167069801168971, "grad_norm": 8.592397689819336, "learning_rate": 0.0014800805639539583, "loss": 7.5882, "step": 1022900 }, { "epoch": 4.167477179192352, "grad_norm": 3.2145416736602783, "learning_rate": 0.0014796282809599192, "loss": 7.5909, "step": 1023000 }, { "epoch": 4.167477179192352, "eval_MaskedAccuracy": 0.5102767782158548, "eval_loss": 1.5986086130142212, "eval_runtime": 159.2053, "eval_samples_per_second": 398.705, "eval_steps_per_second": 1.558, "step": 1023000 }, { "epoch": 4.167884557215734, "grad_norm": 4.349625110626221, "learning_rate": 0.0014791760385043076, "loss": 7.5913, "step": 1023100 }, { "epoch": 4.1682919352391155, "grad_norm": 8.549214363098145, "learning_rate": 0.0014787238366050037, "loss": 7.5978, "step": 1023200 }, { "epoch": 4.168699313262497, "grad_norm": 4.202352523803711, "learning_rate": 0.0014782716752798862, "loss": 7.614, "step": 1023300 }, { "epoch": 4.1691066912858785, "grad_norm": 7.24500036239624, "learning_rate": 0.0014778195545468314, "loss": 7.5903, "step": 1023400 }, { "epoch": 4.16951406930926, "grad_norm": 10.378110885620117, "learning_rate": 0.0014773674744237115, "loss": 7.5901, "step": 1023500 }, { "epoch": 4.169921447332642, "grad_norm": 3.6616709232330322, "learning_rate": 0.0014769154349284037, "loss": 7.6289, "step": 1023600 }, { "epoch": 4.170328825356023, "grad_norm": 7.426562309265137, "learning_rate": 0.0014764634360787782, "loss": 7.5777, "step": 1023700 }, { "epoch": 4.170736203379405, "grad_norm": 9.659289360046387, "learning_rate": 0.0014760114778927013, "loss": 7.5904, "step": 1023800 }, { "epoch": 4.171143581402786, "grad_norm": 9.703145027160645, "learning_rate": 0.001475559560388045, "loss": 7.6234, "step": 1023900 }, { "epoch": 4.171550959426168, "grad_norm": 4.7512006759643555, "learning_rate": 0.0014751076835826742, "loss": 7.5886, "step": 1024000 }, { "epoch": 4.171550959426168, "eval_MaskedAccuracy": 0.5098047120751074, "eval_loss": 1.5959850549697876, "eval_runtime": 159.3796, "eval_samples_per_second": 398.269, "eval_steps_per_second": 1.556, "step": 1024000 }, { "epoch": 4.171958337449548, "grad_norm": 11.228114128112793, "learning_rate": 0.0014746558474944566, "loss": 7.5962, "step": 1024100 }, { "epoch": 4.17236571547293, "grad_norm": 9.726095199584961, "learning_rate": 0.001474204052141253, "loss": 7.5837, "step": 1024200 }, { "epoch": 4.172773093496311, "grad_norm": 3.0137314796447754, "learning_rate": 0.001473752297540926, "loss": 7.5867, "step": 1024300 }, { "epoch": 4.173180471519693, "grad_norm": 9.817083358764648, "learning_rate": 0.0014733005837113374, "loss": 7.6123, "step": 1024400 }, { "epoch": 4.1735878495430745, "grad_norm": 4.301794528961182, "learning_rate": 0.0014728489106703444, "loss": 7.5733, "step": 1024500 }, { "epoch": 4.173995227566456, "grad_norm": 7.952913284301758, "learning_rate": 0.0014723972784358033, "loss": 7.5825, "step": 1024600 }, { "epoch": 4.174402605589838, "grad_norm": 3.9031291007995605, "learning_rate": 0.0014719456870255704, "loss": 7.6271, "step": 1024700 }, { "epoch": 4.174809983613219, "grad_norm": 5.106107711791992, "learning_rate": 0.0014714941364575005, "loss": 7.5446, "step": 1024800 }, { "epoch": 4.175217361636601, "grad_norm": 13.670040130615234, "learning_rate": 0.0014710426267494458, "loss": 7.6107, "step": 1024900 }, { "epoch": 4.175624739659982, "grad_norm": 10.635793685913086, "learning_rate": 0.0014705911579192554, "loss": 7.5953, "step": 1025000 }, { "epoch": 4.175624739659982, "eval_MaskedAccuracy": 0.5106687895399447, "eval_loss": 1.593947410583496, "eval_runtime": 160.2436, "eval_samples_per_second": 396.122, "eval_steps_per_second": 1.548, "step": 1025000 }, { "epoch": 4.176032117683364, "grad_norm": 8.559614181518555, "learning_rate": 0.0014701397299847787, "loss": 7.5839, "step": 1025100 }, { "epoch": 4.176439495706745, "grad_norm": 9.795751571655273, "learning_rate": 0.001469688342963864, "loss": 7.615, "step": 1025200 }, { "epoch": 4.176846873730126, "grad_norm": 4.664596080780029, "learning_rate": 0.001469236996874355, "loss": 7.6213, "step": 1025300 }, { "epoch": 4.177254251753507, "grad_norm": 3.550440788269043, "learning_rate": 0.0014687856917340986, "loss": 7.6127, "step": 1025400 }, { "epoch": 4.177661629776889, "grad_norm": 8.955238342285156, "learning_rate": 0.0014683344275609337, "loss": 7.5867, "step": 1025500 }, { "epoch": 4.1780690078002705, "grad_norm": 3.6599338054656982, "learning_rate": 0.0014678832043727042, "loss": 7.592, "step": 1025600 }, { "epoch": 4.178476385823652, "grad_norm": 5.006877422332764, "learning_rate": 0.0014674320221872489, "loss": 7.5848, "step": 1025700 }, { "epoch": 4.1788837638470335, "grad_norm": 8.671599388122559, "learning_rate": 0.0014669808810224034, "loss": 7.589, "step": 1025800 }, { "epoch": 4.179291141870415, "grad_norm": 5.992976665496826, "learning_rate": 0.0014665297808960065, "loss": 7.603, "step": 1025900 }, { "epoch": 4.179698519893797, "grad_norm": 10.025277137756348, "learning_rate": 0.0014660787218258914, "loss": 7.5922, "step": 1026000 }, { "epoch": 4.179698519893797, "eval_MaskedAccuracy": 0.5096075246703081, "eval_loss": 1.6091586351394653, "eval_runtime": 157.9571, "eval_samples_per_second": 401.856, "eval_steps_per_second": 1.57, "step": 1026000 }, { "epoch": 4.180105897917178, "grad_norm": 8.122149467468262, "learning_rate": 0.0014656277038298919, "loss": 7.5758, "step": 1026100 }, { "epoch": 4.18051327594056, "grad_norm": 3.658780813217163, "learning_rate": 0.0014651767269258384, "loss": 7.6023, "step": 1026200 }, { "epoch": 4.180920653963941, "grad_norm": 7.74526834487915, "learning_rate": 0.001464725791131558, "loss": 7.6064, "step": 1026300 }, { "epoch": 4.181328031987323, "grad_norm": 10.288622856140137, "learning_rate": 0.0014642748964648821, "loss": 7.6268, "step": 1026400 }, { "epoch": 4.181735410010704, "grad_norm": 11.330092430114746, "learning_rate": 0.001463824042943636, "loss": 7.5888, "step": 1026500 }, { "epoch": 4.182142788034085, "grad_norm": 6.9427080154418945, "learning_rate": 0.0014633732305856443, "loss": 7.6169, "step": 1026600 }, { "epoch": 4.182550166057466, "grad_norm": 10.429247856140137, "learning_rate": 0.00146292245940873, "loss": 7.5902, "step": 1026700 }, { "epoch": 4.182957544080848, "grad_norm": 14.039982795715332, "learning_rate": 0.001462471729430715, "loss": 7.6083, "step": 1026800 }, { "epoch": 4.1833649221042295, "grad_norm": 2.7318310737609863, "learning_rate": 0.001462021040669417, "loss": 7.6211, "step": 1026900 }, { "epoch": 4.183772300127611, "grad_norm": 4.351866245269775, "learning_rate": 0.0014615703931426555, "loss": 7.5958, "step": 1027000 }, { "epoch": 4.183772300127611, "eval_MaskedAccuracy": 0.5106928532701904, "eval_loss": 1.6009950637817383, "eval_runtime": 158.6672, "eval_samples_per_second": 400.057, "eval_steps_per_second": 1.563, "step": 1027000 }, { "epoch": 4.184179678150993, "grad_norm": 5.183883190155029, "learning_rate": 0.001461119786868247, "loss": 7.5966, "step": 1027100 }, { "epoch": 4.184587056174374, "grad_norm": 12.216350555419922, "learning_rate": 0.0014606692218640058, "loss": 7.6025, "step": 1027200 }, { "epoch": 4.184994434197756, "grad_norm": 3.1864845752716064, "learning_rate": 0.0014602186981477446, "loss": 7.6272, "step": 1027300 }, { "epoch": 4.185401812221137, "grad_norm": 8.0393705368042, "learning_rate": 0.0014597682157372787, "loss": 7.6049, "step": 1027400 }, { "epoch": 4.185809190244519, "grad_norm": 5.613422870635986, "learning_rate": 0.001459317774650415, "loss": 7.5624, "step": 1027500 }, { "epoch": 4.1862165682679, "grad_norm": 11.337660789489746, "learning_rate": 0.001458867374904963, "loss": 7.5756, "step": 1027600 }, { "epoch": 4.186623946291282, "grad_norm": 6.008993148803711, "learning_rate": 0.0014584170165187287, "loss": 7.5922, "step": 1027700 }, { "epoch": 4.187031324314663, "grad_norm": 14.749534606933594, "learning_rate": 0.0014579666995095167, "loss": 7.5996, "step": 1027800 }, { "epoch": 4.187438702338044, "grad_norm": 14.441166877746582, "learning_rate": 0.0014575164238951315, "loss": 7.6402, "step": 1027900 }, { "epoch": 4.1878460803614255, "grad_norm": 3.8267993927001953, "learning_rate": 0.0014570661896933725, "loss": 7.591, "step": 1028000 }, { "epoch": 4.1878460803614255, "eval_MaskedAccuracy": 0.5104208789487067, "eval_loss": 1.5986571311950684, "eval_runtime": 164.851, "eval_samples_per_second": 385.051, "eval_steps_per_second": 1.504, "step": 1028000 }, { "epoch": 4.188253458384807, "grad_norm": 12.592936515808105, "learning_rate": 0.001456615996922042, "loss": 7.5936, "step": 1028100 }, { "epoch": 4.1886608364081885, "grad_norm": 16.150226593017578, "learning_rate": 0.0014561658455989404, "loss": 7.57, "step": 1028200 }, { "epoch": 4.18906821443157, "grad_norm": 16.680103302001953, "learning_rate": 0.0014557157357418605, "loss": 7.5961, "step": 1028300 }, { "epoch": 4.189475592454952, "grad_norm": 4.220909118652344, "learning_rate": 0.0014552656673686006, "loss": 7.6108, "step": 1028400 }, { "epoch": 4.189882970478333, "grad_norm": 9.812032699584961, "learning_rate": 0.001454815640496952, "loss": 7.5964, "step": 1028500 }, { "epoch": 4.190290348501715, "grad_norm": 12.421441078186035, "learning_rate": 0.00145436565514471, "loss": 7.5877, "step": 1028600 }, { "epoch": 4.190697726525096, "grad_norm": 5.953688144683838, "learning_rate": 0.001453915711329661, "loss": 7.5787, "step": 1028700 }, { "epoch": 4.191105104548478, "grad_norm": 4.339315414428711, "learning_rate": 0.001453465809069596, "loss": 7.6027, "step": 1028800 }, { "epoch": 4.191512482571859, "grad_norm": 5.007308006286621, "learning_rate": 0.0014530159483823024, "loss": 7.5843, "step": 1028900 }, { "epoch": 4.191919860595241, "grad_norm": 2.6288020610809326, "learning_rate": 0.0014525661292855642, "loss": 7.6077, "step": 1029000 }, { "epoch": 4.191919860595241, "eval_MaskedAccuracy": 0.5101682141368029, "eval_loss": 1.6052380800247192, "eval_runtime": 155.8335, "eval_samples_per_second": 407.332, "eval_steps_per_second": 1.591, "step": 1029000 }, { "epoch": 4.192327238618621, "grad_norm": 7.667224407196045, "learning_rate": 0.0014521163517971663, "loss": 7.5938, "step": 1029100 }, { "epoch": 4.192734616642003, "grad_norm": 12.914789199829102, "learning_rate": 0.0014516666159348891, "loss": 7.5844, "step": 1029200 }, { "epoch": 4.1931419946653845, "grad_norm": 4.420145511627197, "learning_rate": 0.0014512169217165151, "loss": 7.5856, "step": 1029300 }, { "epoch": 4.193549372688766, "grad_norm": 6.624450206756592, "learning_rate": 0.0014507672691598215, "loss": 7.5676, "step": 1029400 }, { "epoch": 4.193956750712148, "grad_norm": 16.761884689331055, "learning_rate": 0.0014503176582825875, "loss": 7.5813, "step": 1029500 }, { "epoch": 4.194364128735529, "grad_norm": 3.6186683177948, "learning_rate": 0.0014498680891025866, "loss": 7.5833, "step": 1029600 }, { "epoch": 4.194771506758911, "grad_norm": 10.661664009094238, "learning_rate": 0.0014494185616375924, "loss": 7.599, "step": 1029700 }, { "epoch": 4.195178884782292, "grad_norm": 3.963146448135376, "learning_rate": 0.0014489690759053792, "loss": 7.5886, "step": 1029800 }, { "epoch": 4.195586262805674, "grad_norm": 13.324056625366211, "learning_rate": 0.0014485196319237175, "loss": 7.5916, "step": 1029900 }, { "epoch": 4.195993640829055, "grad_norm": 11.302262306213379, "learning_rate": 0.0014480702297103752, "loss": 7.5888, "step": 1030000 }, { "epoch": 4.195993640829055, "eval_MaskedAccuracy": 0.5096965536116576, "eval_loss": 1.5952415466308594, "eval_runtime": 168.002, "eval_samples_per_second": 377.829, "eval_steps_per_second": 1.476, "step": 1030000 }, { "epoch": 4.196401018852437, "grad_norm": 7.486935138702393, "learning_rate": 0.0014476208692831183, "loss": 7.6255, "step": 1030100 }, { "epoch": 4.196808396875818, "grad_norm": 8.094013214111328, "learning_rate": 0.0014471715506597115, "loss": 7.5757, "step": 1030200 }, { "epoch": 4.197215774899199, "grad_norm": 6.376736640930176, "learning_rate": 0.0014467222738579234, "loss": 7.5918, "step": 1030300 }, { "epoch": 4.1976231529225805, "grad_norm": 9.68858814239502, "learning_rate": 0.0014462730388955147, "loss": 7.6198, "step": 1030400 }, { "epoch": 4.198030530945962, "grad_norm": 5.73710823059082, "learning_rate": 0.001445823845790246, "loss": 7.6224, "step": 1030500 }, { "epoch": 4.1984379089693435, "grad_norm": 3.1832115650177, "learning_rate": 0.0014453746945598735, "loss": 7.6208, "step": 1030600 }, { "epoch": 4.198845286992725, "grad_norm": 9.9255952835083, "learning_rate": 0.0014449255852221573, "loss": 7.5886, "step": 1030700 }, { "epoch": 4.199252665016107, "grad_norm": 5.664330005645752, "learning_rate": 0.0014444765177948518, "loss": 7.6045, "step": 1030800 }, { "epoch": 4.199660043039488, "grad_norm": 6.452301979064941, "learning_rate": 0.0014440274922957123, "loss": 7.6006, "step": 1030900 }, { "epoch": 4.20006742106287, "grad_norm": 3.30295729637146, "learning_rate": 0.0014435785087424905, "loss": 7.6189, "step": 1031000 }, { "epoch": 4.20006742106287, "eval_MaskedAccuracy": 0.5100643277420648, "eval_loss": 1.6013303995132446, "eval_runtime": 172.5314, "eval_samples_per_second": 367.91, "eval_steps_per_second": 1.437, "step": 1031000 }, { "epoch": 4.200474799086251, "grad_norm": 5.853790760040283, "learning_rate": 0.0014431295671529364, "loss": 7.6163, "step": 1031100 }, { "epoch": 4.200882177109633, "grad_norm": 9.750107765197754, "learning_rate": 0.0014426806675447992, "loss": 7.5881, "step": 1031200 }, { "epoch": 4.201289555133014, "grad_norm": 11.970582008361816, "learning_rate": 0.0014422318099358263, "loss": 7.6326, "step": 1031300 }, { "epoch": 4.201696933156396, "grad_norm": 9.420632362365723, "learning_rate": 0.0014417829943437666, "loss": 7.6037, "step": 1031400 }, { "epoch": 4.202104311179777, "grad_norm": 13.569077491760254, "learning_rate": 0.001441334220786359, "loss": 7.6071, "step": 1031500 }, { "epoch": 4.202511689203158, "grad_norm": 4.433078289031982, "learning_rate": 0.0014408854892813492, "loss": 7.5812, "step": 1031600 }, { "epoch": 4.2029190672265395, "grad_norm": 5.293403148651123, "learning_rate": 0.0014404367998464768, "loss": 7.6147, "step": 1031700 }, { "epoch": 4.203326445249921, "grad_norm": 5.65088415145874, "learning_rate": 0.0014399881524994828, "loss": 7.6042, "step": 1031800 }, { "epoch": 4.203733823273303, "grad_norm": 12.346303939819336, "learning_rate": 0.001439539547258102, "loss": 7.6056, "step": 1031900 }, { "epoch": 4.204141201296684, "grad_norm": 4.291621208190918, "learning_rate": 0.0014390909841400735, "loss": 7.5951, "step": 1032000 }, { "epoch": 4.204141201296684, "eval_MaskedAccuracy": 0.5096514852280516, "eval_loss": 1.6052285432815552, "eval_runtime": 162.7154, "eval_samples_per_second": 390.105, "eval_steps_per_second": 1.524, "step": 1032000 }, { "epoch": 4.204548579320066, "grad_norm": 14.775259017944336, "learning_rate": 0.0014386424631631278, "loss": 7.5904, "step": 1032100 }, { "epoch": 4.204955957343447, "grad_norm": 9.05004596710205, "learning_rate": 0.0014381939843449991, "loss": 7.5775, "step": 1032200 }, { "epoch": 4.205363335366829, "grad_norm": 3.9531989097595215, "learning_rate": 0.0014377455477034187, "loss": 7.5917, "step": 1032300 }, { "epoch": 4.20577071339021, "grad_norm": 5.5266523361206055, "learning_rate": 0.001437297153256115, "loss": 7.5868, "step": 1032400 }, { "epoch": 4.206178091413592, "grad_norm": 10.092397689819336, "learning_rate": 0.0014368488010208142, "loss": 7.5904, "step": 1032500 }, { "epoch": 4.206585469436973, "grad_norm": 5.211591720581055, "learning_rate": 0.0014364004910152439, "loss": 7.607, "step": 1032600 }, { "epoch": 4.206992847460355, "grad_norm": 10.154476165771484, "learning_rate": 0.0014359522232571264, "loss": 7.5883, "step": 1032700 }, { "epoch": 4.207400225483736, "grad_norm": 8.700800895690918, "learning_rate": 0.0014355039977641858, "loss": 7.6247, "step": 1032800 }, { "epoch": 4.207807603507117, "grad_norm": 7.243443965911865, "learning_rate": 0.0014350558145541437, "loss": 7.6093, "step": 1032900 }, { "epoch": 4.2082149815304986, "grad_norm": 4.729804039001465, "learning_rate": 0.001434607673644716, "loss": 7.6221, "step": 1033000 }, { "epoch": 4.2082149815304986, "eval_MaskedAccuracy": 0.5105174837283005, "eval_loss": 1.6016790866851807, "eval_runtime": 163.8821, "eval_samples_per_second": 387.327, "eval_steps_per_second": 1.513, "step": 1033000 }, { "epoch": 4.20862235955388, "grad_norm": 13.310611724853516, "learning_rate": 0.001434159575053622, "loss": 7.613, "step": 1033100 }, { "epoch": 4.209029737577262, "grad_norm": 11.235798835754395, "learning_rate": 0.0014337115187985775, "loss": 7.5982, "step": 1033200 }, { "epoch": 4.209437115600643, "grad_norm": 7.849621295928955, "learning_rate": 0.0014332635048972964, "loss": 7.6235, "step": 1033300 }, { "epoch": 4.209844493624025, "grad_norm": 6.901366710662842, "learning_rate": 0.0014328155333674913, "loss": 7.6068, "step": 1033400 }, { "epoch": 4.210251871647406, "grad_norm": 4.541335105895996, "learning_rate": 0.0014323676042268712, "loss": 7.6033, "step": 1033500 }, { "epoch": 4.210659249670788, "grad_norm": 5.252453804016113, "learning_rate": 0.0014319197174931465, "loss": 7.5956, "step": 1033600 }, { "epoch": 4.211066627694169, "grad_norm": 6.729776859283447, "learning_rate": 0.0014314718731840248, "loss": 7.6165, "step": 1033700 }, { "epoch": 4.211474005717551, "grad_norm": 5.497008800506592, "learning_rate": 0.0014310240713172123, "loss": 7.597, "step": 1033800 }, { "epoch": 4.211881383740932, "grad_norm": 5.694904804229736, "learning_rate": 0.0014305763119104125, "loss": 7.6054, "step": 1033900 }, { "epoch": 4.212288761764314, "grad_norm": 7.6464385986328125, "learning_rate": 0.001430128594981329, "loss": 7.6043, "step": 1034000 }, { "epoch": 4.212288761764314, "eval_MaskedAccuracy": 0.509850729345662, "eval_loss": 1.5982760190963745, "eval_runtime": 152.0538, "eval_samples_per_second": 417.458, "eval_steps_per_second": 1.631, "step": 1034000 }, { "epoch": 4.2126961397876945, "grad_norm": 3.917483329772949, "learning_rate": 0.0014296809205476592, "loss": 7.5896, "step": 1034100 }, { "epoch": 4.213103517811076, "grad_norm": 5.192936897277832, "learning_rate": 0.0014292332886271031, "loss": 7.6009, "step": 1034200 }, { "epoch": 4.213510895834458, "grad_norm": 6.1599602699279785, "learning_rate": 0.0014287856992373602, "loss": 7.6249, "step": 1034300 }, { "epoch": 4.213918273857839, "grad_norm": 4.17832612991333, "learning_rate": 0.0014283381523961223, "loss": 7.6259, "step": 1034400 }, { "epoch": 4.214325651881221, "grad_norm": 14.357792854309082, "learning_rate": 0.001427890648121088, "loss": 7.5974, "step": 1034500 }, { "epoch": 4.214733029904602, "grad_norm": 7.662722110748291, "learning_rate": 0.0014274431864299462, "loss": 7.5946, "step": 1034600 }, { "epoch": 4.215140407927984, "grad_norm": 4.965076923370361, "learning_rate": 0.0014269957673403895, "loss": 7.6232, "step": 1034700 }, { "epoch": 4.215547785951365, "grad_norm": 4.649047374725342, "learning_rate": 0.0014265483908701044, "loss": 7.5993, "step": 1034800 }, { "epoch": 4.215955163974747, "grad_norm": 9.61099910736084, "learning_rate": 0.0014261010570367795, "loss": 7.5981, "step": 1034900 }, { "epoch": 4.216362541998128, "grad_norm": 5.705331802368164, "learning_rate": 0.001425653765858099, "loss": 7.5778, "step": 1035000 }, { "epoch": 4.216362541998128, "eval_MaskedAccuracy": 0.5102395071453286, "eval_loss": 1.589215874671936, "eval_runtime": 179.1055, "eval_samples_per_second": 354.406, "eval_steps_per_second": 1.385, "step": 1035000 }, { "epoch": 4.21676992002151, "grad_norm": 2.996056318283081, "learning_rate": 0.0014252065173517492, "loss": 7.5826, "step": 1035100 }, { "epoch": 4.217177298044891, "grad_norm": 4.911251068115234, "learning_rate": 0.0014247593115354097, "loss": 7.6332, "step": 1035200 }, { "epoch": 4.217584676068272, "grad_norm": 4.924323558807373, "learning_rate": 0.0014243121484267633, "loss": 7.5839, "step": 1035300 }, { "epoch": 4.217992054091654, "grad_norm": 7.712258815765381, "learning_rate": 0.0014238650280434877, "loss": 7.5621, "step": 1035400 }, { "epoch": 4.218399432115035, "grad_norm": 9.961227416992188, "learning_rate": 0.00142341795040326, "loss": 7.6159, "step": 1035500 }, { "epoch": 4.218806810138417, "grad_norm": 8.253767013549805, "learning_rate": 0.0014229709155237581, "loss": 7.622, "step": 1035600 }, { "epoch": 4.219214188161798, "grad_norm": 4.563155174255371, "learning_rate": 0.0014225239234226506, "loss": 7.5653, "step": 1035700 }, { "epoch": 4.21962156618518, "grad_norm": 10.279685974121094, "learning_rate": 0.0014220769741176116, "loss": 7.59, "step": 1035800 }, { "epoch": 4.220028944208561, "grad_norm": 8.487176895141602, "learning_rate": 0.0014216300676263118, "loss": 7.5725, "step": 1035900 }, { "epoch": 4.220436322231943, "grad_norm": 11.135157585144043, "learning_rate": 0.0014211832039664192, "loss": 7.5991, "step": 1036000 }, { "epoch": 4.220436322231943, "eval_MaskedAccuracy": 0.510516270683006, "eval_loss": 1.6004313230514526, "eval_runtime": 184.468, "eval_samples_per_second": 344.103, "eval_steps_per_second": 1.344, "step": 1036000 }, { "epoch": 4.220843700255324, "grad_norm": 14.776406288146973, "learning_rate": 0.0014207363831556011, "loss": 7.5955, "step": 1036100 }, { "epoch": 4.221251078278706, "grad_norm": 6.738020896911621, "learning_rate": 0.0014202896052115208, "loss": 7.5831, "step": 1036200 }, { "epoch": 4.221658456302087, "grad_norm": 10.824565887451172, "learning_rate": 0.001419842870151847, "loss": 7.5895, "step": 1036300 }, { "epoch": 4.222065834325469, "grad_norm": 3.366100549697876, "learning_rate": 0.001419396177994238, "loss": 7.6135, "step": 1036400 }, { "epoch": 4.22247321234885, "grad_norm": 3.5348172187805176, "learning_rate": 0.0014189495287563537, "loss": 7.5733, "step": 1036500 }, { "epoch": 4.222880590372231, "grad_norm": 5.367615699768066, "learning_rate": 0.0014185029224558546, "loss": 7.6033, "step": 1036600 }, { "epoch": 4.223287968395613, "grad_norm": 8.914054870605469, "learning_rate": 0.001418056359110395, "loss": 7.6153, "step": 1036700 }, { "epoch": 4.223695346418994, "grad_norm": 8.189297676086426, "learning_rate": 0.0014176098387376306, "loss": 7.5986, "step": 1036800 }, { "epoch": 4.224102724442376, "grad_norm": 8.711004257202148, "learning_rate": 0.001417163361355215, "loss": 7.5879, "step": 1036900 }, { "epoch": 4.224510102465757, "grad_norm": 7.876670837402344, "learning_rate": 0.001416716926980801, "loss": 7.5858, "step": 1037000 }, { "epoch": 4.224510102465757, "eval_MaskedAccuracy": 0.5104020098319255, "eval_loss": 1.5996097326278687, "eval_runtime": 181.2853, "eval_samples_per_second": 350.144, "eval_steps_per_second": 1.368, "step": 1037000 }, { "epoch": 4.224917480489139, "grad_norm": 4.606138706207275, "learning_rate": 0.0014162705356320368, "loss": 7.5649, "step": 1037100 }, { "epoch": 4.22532485851252, "grad_norm": 8.091697692871094, "learning_rate": 0.0014158241873265707, "loss": 7.6043, "step": 1037200 }, { "epoch": 4.225732236535902, "grad_norm": 3.9498138427734375, "learning_rate": 0.0014153778820820506, "loss": 7.6016, "step": 1037300 }, { "epoch": 4.226139614559283, "grad_norm": 12.753782272338867, "learning_rate": 0.0014149316199161205, "loss": 7.606, "step": 1037400 }, { "epoch": 4.226546992582665, "grad_norm": 4.535918235778809, "learning_rate": 0.0014144854008464216, "loss": 7.6137, "step": 1037500 }, { "epoch": 4.226954370606046, "grad_norm": 3.7183914184570312, "learning_rate": 0.0014140392248905985, "loss": 7.6012, "step": 1037600 }, { "epoch": 4.227361748629428, "grad_norm": 4.210314750671387, "learning_rate": 0.0014135930920662909, "loss": 7.5942, "step": 1037700 }, { "epoch": 4.2277691266528095, "grad_norm": 6.041977882385254, "learning_rate": 0.0014131470023911344, "loss": 7.5879, "step": 1037800 }, { "epoch": 4.22817650467619, "grad_norm": 8.699797630310059, "learning_rate": 0.0014127009558827677, "loss": 7.5819, "step": 1037900 }, { "epoch": 4.228583882699572, "grad_norm": 7.279827117919922, "learning_rate": 0.0014122549525588236, "loss": 7.5784, "step": 1038000 }, { "epoch": 4.228583882699572, "eval_MaskedAccuracy": 0.5103667723872006, "eval_loss": 1.5985842943191528, "eval_runtime": 160.8712, "eval_samples_per_second": 394.577, "eval_steps_per_second": 1.542, "step": 1038000 }, { "epoch": 4.228991260722953, "grad_norm": 4.683993339538574, "learning_rate": 0.0014118089924369357, "loss": 7.5954, "step": 1038100 }, { "epoch": 4.229398638746335, "grad_norm": 8.449949264526367, "learning_rate": 0.001411363075534735, "loss": 7.5867, "step": 1038200 }, { "epoch": 4.229806016769716, "grad_norm": 7.157567977905273, "learning_rate": 0.0014109172018698503, "loss": 7.6008, "step": 1038300 }, { "epoch": 4.230213394793098, "grad_norm": 8.54863166809082, "learning_rate": 0.0014104713714599103, "loss": 7.583, "step": 1038400 }, { "epoch": 4.230620772816479, "grad_norm": 9.954252243041992, "learning_rate": 0.0014100255843225405, "loss": 7.5946, "step": 1038500 }, { "epoch": 4.231028150839861, "grad_norm": 13.772327423095703, "learning_rate": 0.0014095798404753664, "loss": 7.6321, "step": 1038600 }, { "epoch": 4.231435528863242, "grad_norm": 7.2670135498046875, "learning_rate": 0.001409134139936011, "loss": 7.5994, "step": 1038700 }, { "epoch": 4.231842906886624, "grad_norm": 3.6352944374084473, "learning_rate": 0.001408688482722093, "loss": 7.5996, "step": 1038800 }, { "epoch": 4.232250284910005, "grad_norm": 10.875031471252441, "learning_rate": 0.0014082428688512328, "loss": 7.6008, "step": 1038900 }, { "epoch": 4.232657662933387, "grad_norm": 7.153680801391602, "learning_rate": 0.0014077972983410488, "loss": 7.6032, "step": 1039000 }, { "epoch": 4.232657662933387, "eval_MaskedAccuracy": 0.51030481513918, "eval_loss": 1.59966242313385, "eval_runtime": 202.6644, "eval_samples_per_second": 313.207, "eval_steps_per_second": 1.224, "step": 1039000 }, { "epoch": 4.233065040956768, "grad_norm": 4.558526039123535, "learning_rate": 0.0014073517712091558, "loss": 7.586, "step": 1039100 }, { "epoch": 4.233472418980149, "grad_norm": 6.087801456451416, "learning_rate": 0.001406906287473165, "loss": 7.5847, "step": 1039200 }, { "epoch": 4.233879797003531, "grad_norm": 6.982261657714844, "learning_rate": 0.0014064608471506957, "loss": 7.5899, "step": 1039300 }, { "epoch": 4.234287175026912, "grad_norm": 14.22995376586914, "learning_rate": 0.001406015450259353, "loss": 7.6126, "step": 1039400 }, { "epoch": 4.234694553050294, "grad_norm": 11.781092643737793, "learning_rate": 0.001405570096816747, "loss": 7.5737, "step": 1039500 }, { "epoch": 4.235101931073675, "grad_norm": 11.608367919921875, "learning_rate": 0.0014051247868404847, "loss": 7.584, "step": 1039600 }, { "epoch": 4.235509309097057, "grad_norm": 5.197314262390137, "learning_rate": 0.0014046795203481724, "loss": 7.5912, "step": 1039700 }, { "epoch": 4.235916687120438, "grad_norm": 13.955909729003906, "learning_rate": 0.0014042342973574148, "loss": 7.5847, "step": 1039800 }, { "epoch": 4.23632406514382, "grad_norm": 4.1139349937438965, "learning_rate": 0.0014037891178858126, "loss": 7.5895, "step": 1039900 }, { "epoch": 4.236731443167201, "grad_norm": 3.392300605773926, "learning_rate": 0.001403343981950965, "loss": 7.581, "step": 1040000 }, { "epoch": 4.236731443167201, "eval_MaskedAccuracy": 0.5107230557023007, "eval_loss": 1.5984443426132202, "eval_runtime": 160.6273, "eval_samples_per_second": 395.176, "eval_steps_per_second": 1.544, "step": 1040000 }, { "epoch": 4.237138821190583, "grad_norm": 4.306652069091797, "learning_rate": 0.0014028988895704718, "loss": 7.6129, "step": 1040100 }, { "epoch": 4.2375461992139645, "grad_norm": 6.447711944580078, "learning_rate": 0.00140245384076193, "loss": 7.5864, "step": 1040200 }, { "epoch": 4.237953577237345, "grad_norm": 7.224948406219482, "learning_rate": 0.0014020088355429353, "loss": 7.5909, "step": 1040300 }, { "epoch": 4.238360955260727, "grad_norm": 7.60656213760376, "learning_rate": 0.0014015638739310813, "loss": 7.5982, "step": 1040400 }, { "epoch": 4.238768333284108, "grad_norm": 5.7304205894470215, "learning_rate": 0.0014011189559439577, "loss": 7.5931, "step": 1040500 }, { "epoch": 4.23917571130749, "grad_norm": 3.2884790897369385, "learning_rate": 0.001400674081599157, "loss": 7.5858, "step": 1040600 }, { "epoch": 4.239583089330871, "grad_norm": 5.337141513824463, "learning_rate": 0.0014002292509142655, "loss": 7.5829, "step": 1040700 }, { "epoch": 4.239990467354253, "grad_norm": 13.813345909118652, "learning_rate": 0.0013997844639068696, "loss": 7.6143, "step": 1040800 }, { "epoch": 4.240397845377634, "grad_norm": 10.081512451171875, "learning_rate": 0.0013993397205945566, "loss": 7.5724, "step": 1040900 }, { "epoch": 4.240805223401016, "grad_norm": 5.806445121765137, "learning_rate": 0.0013988950209949072, "loss": 7.5967, "step": 1041000 }, { "epoch": 4.240805223401016, "eval_MaskedAccuracy": 0.5106415551820171, "eval_loss": 1.5973789691925049, "eval_runtime": 151.5209, "eval_samples_per_second": 418.926, "eval_steps_per_second": 1.637, "step": 1041000 }, { "epoch": 4.241212601424397, "grad_norm": 6.213423252105713, "learning_rate": 0.001398450365125503, "loss": 7.5996, "step": 1041100 }, { "epoch": 4.241619979447779, "grad_norm": 3.1893868446350098, "learning_rate": 0.0013980057530039243, "loss": 7.5974, "step": 1041200 }, { "epoch": 4.24202735747116, "grad_norm": 4.202600479125977, "learning_rate": 0.0013975611846477483, "loss": 7.5891, "step": 1041300 }, { "epoch": 4.242434735494542, "grad_norm": 3.976889133453369, "learning_rate": 0.0013971166600745522, "loss": 7.6055, "step": 1041400 }, { "epoch": 4.2428421135179235, "grad_norm": 4.6882429122924805, "learning_rate": 0.001396672179301909, "loss": 7.5942, "step": 1041500 }, { "epoch": 4.243249491541304, "grad_norm": 8.26848030090332, "learning_rate": 0.0013962277423473952, "loss": 7.6036, "step": 1041600 }, { "epoch": 4.243656869564686, "grad_norm": 7.022709846496582, "learning_rate": 0.0013957833492285775, "loss": 7.6177, "step": 1041700 }, { "epoch": 4.244064247588067, "grad_norm": 6.45651388168335, "learning_rate": 0.0013953389999630248, "loss": 7.6096, "step": 1041800 }, { "epoch": 4.244471625611449, "grad_norm": 10.376928329467773, "learning_rate": 0.001394894694568308, "loss": 7.6, "step": 1041900 }, { "epoch": 4.24487900363483, "grad_norm": 4.617563247680664, "learning_rate": 0.0013944504330619897, "loss": 7.5957, "step": 1042000 }, { "epoch": 4.24487900363483, "eval_MaskedAccuracy": 0.5098486364057767, "eval_loss": 1.6006395816802979, "eval_runtime": 166.303, "eval_samples_per_second": 381.689, "eval_steps_per_second": 1.491, "step": 1042000 }, { "epoch": 4.245286381658212, "grad_norm": 12.784701347351074, "learning_rate": 0.0013940062154616363, "loss": 7.5798, "step": 1042100 }, { "epoch": 4.245693759681593, "grad_norm": 5.599425792694092, "learning_rate": 0.00139356204178481, "loss": 7.5941, "step": 1042200 }, { "epoch": 4.246101137704975, "grad_norm": 4.726510524749756, "learning_rate": 0.00139311791204907, "loss": 7.6388, "step": 1042300 }, { "epoch": 4.246508515728356, "grad_norm": 2.8609824180603027, "learning_rate": 0.001392673826271974, "loss": 7.6139, "step": 1042400 }, { "epoch": 4.246915893751738, "grad_norm": 3.4965760707855225, "learning_rate": 0.0013922297844710836, "loss": 7.5804, "step": 1042500 }, { "epoch": 4.2473232717751195, "grad_norm": 6.381969928741455, "learning_rate": 0.0013917857866639495, "loss": 7.6251, "step": 1042600 }, { "epoch": 4.247730649798501, "grad_norm": 17.653200149536133, "learning_rate": 0.0013913418328681286, "loss": 7.5774, "step": 1042700 }, { "epoch": 4.2481380278218825, "grad_norm": 6.616718292236328, "learning_rate": 0.0013908979231011703, "loss": 7.6292, "step": 1042800 }, { "epoch": 4.248545405845263, "grad_norm": 8.231491088867188, "learning_rate": 0.0013904540573806265, "loss": 7.5937, "step": 1042900 }, { "epoch": 4.248952783868645, "grad_norm": 6.566020965576172, "learning_rate": 0.0013900102357240441, "loss": 7.5842, "step": 1043000 }, { "epoch": 4.248952783868645, "eval_MaskedAccuracy": 0.5103305871340506, "eval_loss": 1.597257375717163, "eval_runtime": 163.7009, "eval_samples_per_second": 387.756, "eval_steps_per_second": 1.515, "step": 1043000 }, { "epoch": 4.249360161892026, "grad_norm": 9.477441787719727, "learning_rate": 0.0013895664581489715, "loss": 7.6107, "step": 1043100 }, { "epoch": 4.249767539915408, "grad_norm": 7.334949970245361, "learning_rate": 0.001389122724672952, "loss": 7.6041, "step": 1043200 }, { "epoch": 4.250174917938789, "grad_norm": 6.031586647033691, "learning_rate": 0.0013886790353135295, "loss": 7.591, "step": 1043300 }, { "epoch": 4.250582295962171, "grad_norm": 9.586661338806152, "learning_rate": 0.0013882353900882443, "loss": 7.5904, "step": 1043400 }, { "epoch": 4.250989673985552, "grad_norm": 5.399155616760254, "learning_rate": 0.0013877917890146392, "loss": 7.6182, "step": 1043500 }, { "epoch": 4.251397052008934, "grad_norm": 9.838690757751465, "learning_rate": 0.0013873482321102477, "loss": 7.5695, "step": 1043600 }, { "epoch": 4.2518044300323155, "grad_norm": 4.266875267028809, "learning_rate": 0.0013869047193926082, "loss": 7.6018, "step": 1043700 }, { "epoch": 4.252211808055697, "grad_norm": 3.9155490398406982, "learning_rate": 0.001386461250879256, "loss": 7.6015, "step": 1043800 }, { "epoch": 4.2526191860790785, "grad_norm": 12.214604377746582, "learning_rate": 0.0013860178265877234, "loss": 7.5713, "step": 1043900 }, { "epoch": 4.25302656410246, "grad_norm": 7.104584693908691, "learning_rate": 0.0013855744465355415, "loss": 7.5933, "step": 1044000 }, { "epoch": 4.25302656410246, "eval_MaskedAccuracy": 0.5102716451615913, "eval_loss": 1.5962787866592407, "eval_runtime": 161.6995, "eval_samples_per_second": 392.555, "eval_steps_per_second": 1.534, "step": 1044000 }, { "epoch": 4.253433942125841, "grad_norm": 12.101835250854492, "learning_rate": 0.0013851311107402372, "loss": 7.6064, "step": 1044100 }, { "epoch": 4.253841320149222, "grad_norm": 2.8748786449432373, "learning_rate": 0.0013846878192193396, "loss": 7.5928, "step": 1044200 }, { "epoch": 4.254248698172604, "grad_norm": 3.9991018772125244, "learning_rate": 0.0013842445719903747, "loss": 7.611, "step": 1044300 }, { "epoch": 4.254656076195985, "grad_norm": 7.150573253631592, "learning_rate": 0.0013838013690708664, "loss": 7.6046, "step": 1044400 }, { "epoch": 4.255063454219367, "grad_norm": 18.1768798828125, "learning_rate": 0.001383358210478336, "loss": 7.5745, "step": 1044500 }, { "epoch": 4.255470832242748, "grad_norm": 6.408421039581299, "learning_rate": 0.001382915096230304, "loss": 7.6166, "step": 1044600 }, { "epoch": 4.25587821026613, "grad_norm": 4.033831596374512, "learning_rate": 0.0013824720263442894, "loss": 7.595, "step": 1044700 }, { "epoch": 4.256285588289511, "grad_norm": 18.95318031311035, "learning_rate": 0.0013820290008378102, "loss": 7.6036, "step": 1044800 }, { "epoch": 4.256692966312893, "grad_norm": 15.923903465270996, "learning_rate": 0.0013815860197283812, "loss": 7.6255, "step": 1044900 }, { "epoch": 4.2571003443362745, "grad_norm": 5.607954502105713, "learning_rate": 0.0013811430830335137, "loss": 7.6046, "step": 1045000 }, { "epoch": 4.2571003443362745, "eval_MaskedAccuracy": 0.5101724898143549, "eval_loss": 1.6080166101455688, "eval_runtime": 158.8932, "eval_samples_per_second": 399.489, "eval_steps_per_second": 1.561, "step": 1045000 }, { "epoch": 4.257507722359656, "grad_norm": 7.38274621963501, "learning_rate": 0.0013807001907707212, "loss": 7.5877, "step": 1045100 }, { "epoch": 4.2579151003830376, "grad_norm": 4.9756903648376465, "learning_rate": 0.0013802573429575144, "loss": 7.5956, "step": 1045200 }, { "epoch": 4.258322478406418, "grad_norm": 4.759303092956543, "learning_rate": 0.0013798145396114007, "loss": 7.5925, "step": 1045300 }, { "epoch": 4.2587298564298, "grad_norm": 6.458241939544678, "learning_rate": 0.001379371780749885, "loss": 7.5946, "step": 1045400 }, { "epoch": 4.259137234453181, "grad_norm": 5.849318504333496, "learning_rate": 0.0013789290663904726, "loss": 7.5974, "step": 1045500 }, { "epoch": 4.259544612476563, "grad_norm": 3.0111122131347656, "learning_rate": 0.0013784863965506677, "loss": 7.5756, "step": 1045600 }, { "epoch": 4.259951990499944, "grad_norm": 3.285658121109009, "learning_rate": 0.0013780437712479705, "loss": 7.6009, "step": 1045700 }, { "epoch": 4.260359368523326, "grad_norm": 3.3373794555664062, "learning_rate": 0.0013776011904998813, "loss": 7.5979, "step": 1045800 }, { "epoch": 4.260766746546707, "grad_norm": 6.0788679122924805, "learning_rate": 0.0013771586543238974, "loss": 7.608, "step": 1045900 }, { "epoch": 4.261174124570089, "grad_norm": 6.220348834991455, "learning_rate": 0.0013767161627375128, "loss": 7.6254, "step": 1046000 }, { "epoch": 4.261174124570089, "eval_MaskedAccuracy": 0.5105576711595019, "eval_loss": 1.605486512184143, "eval_runtime": 155.2058, "eval_samples_per_second": 408.98, "eval_steps_per_second": 1.598, "step": 1046000 }, { "epoch": 4.2615815025934705, "grad_norm": 8.3060302734375, "learning_rate": 0.0013762737157582215, "loss": 7.5906, "step": 1046100 }, { "epoch": 4.261988880616852, "grad_norm": 6.216963291168213, "learning_rate": 0.0013758313134035185, "loss": 7.5889, "step": 1046200 }, { "epoch": 4.2623962586402335, "grad_norm": 10.733205795288086, "learning_rate": 0.001375388955690892, "loss": 7.6103, "step": 1046300 }, { "epoch": 4.262803636663615, "grad_norm": 4.733798503875732, "learning_rate": 0.0013749466426378333, "loss": 7.5805, "step": 1046400 }, { "epoch": 4.263211014686997, "grad_norm": 9.358062744140625, "learning_rate": 0.0013745043742618272, "loss": 7.6144, "step": 1046500 }, { "epoch": 4.263618392710377, "grad_norm": 5.283135414123535, "learning_rate": 0.0013740621505803597, "loss": 7.5833, "step": 1046600 }, { "epoch": 4.264025770733759, "grad_norm": 13.439477920532227, "learning_rate": 0.001373619971610914, "loss": 7.5921, "step": 1046700 }, { "epoch": 4.26443314875714, "grad_norm": 10.138379096984863, "learning_rate": 0.001373177837370972, "loss": 7.5857, "step": 1046800 }, { "epoch": 4.264840526780522, "grad_norm": 10.753410339355469, "learning_rate": 0.001372735747878013, "loss": 7.6175, "step": 1046900 }, { "epoch": 4.265247904803903, "grad_norm": 8.986906051635742, "learning_rate": 0.0013722937031495144, "loss": 7.5749, "step": 1047000 }, { "epoch": 4.265247904803903, "eval_MaskedAccuracy": 0.510165483161183, "eval_loss": 1.5947030782699585, "eval_runtime": 175.7149, "eval_samples_per_second": 361.244, "eval_steps_per_second": 1.411, "step": 1047000 }, { "epoch": 4.265655282827285, "grad_norm": 5.9366912841796875, "learning_rate": 0.0013718517032029558, "loss": 7.611, "step": 1047100 }, { "epoch": 4.266062660850666, "grad_norm": 4.973095893859863, "learning_rate": 0.00137140974805581, "loss": 7.587, "step": 1047200 }, { "epoch": 4.266470038874048, "grad_norm": 9.203110694885254, "learning_rate": 0.001370967837725549, "loss": 7.5996, "step": 1047300 }, { "epoch": 4.2668774168974295, "grad_norm": 17.141218185424805, "learning_rate": 0.0013705259722296443, "loss": 7.6149, "step": 1047400 }, { "epoch": 4.267284794920811, "grad_norm": 3.210751533508301, "learning_rate": 0.001370084151585565, "loss": 7.5693, "step": 1047500 }, { "epoch": 4.267692172944193, "grad_norm": 11.829336166381836, "learning_rate": 0.0013696423758107795, "loss": 7.6087, "step": 1047600 }, { "epoch": 4.268099550967574, "grad_norm": 5.585712909698486, "learning_rate": 0.0013692006449227518, "loss": 7.5905, "step": 1047700 }, { "epoch": 4.268506928990956, "grad_norm": 3.361020088195801, "learning_rate": 0.0013687589589389479, "loss": 7.5976, "step": 1047800 }, { "epoch": 4.268914307014336, "grad_norm": 6.276754856109619, "learning_rate": 0.001368317317876831, "loss": 7.5851, "step": 1047900 }, { "epoch": 4.269321685037718, "grad_norm": 7.628347873687744, "learning_rate": 0.0013678757217538587, "loss": 7.6, "step": 1048000 }, { "epoch": 4.269321685037718, "eval_MaskedAccuracy": 0.510544796005558, "eval_loss": 1.601022720336914, "eval_runtime": 155.935, "eval_samples_per_second": 407.067, "eval_steps_per_second": 1.59, "step": 1048000 }, { "epoch": 4.269729063061099, "grad_norm": 4.361343860626221, "learning_rate": 0.0013674341705874912, "loss": 7.5912, "step": 1048100 }, { "epoch": 4.270136441084481, "grad_norm": 10.683218002319336, "learning_rate": 0.0013669926643951852, "loss": 7.5705, "step": 1048200 }, { "epoch": 4.270543819107862, "grad_norm": 5.449821472167969, "learning_rate": 0.0013665512031943957, "loss": 7.5813, "step": 1048300 }, { "epoch": 4.270951197131244, "grad_norm": 6.593095779418945, "learning_rate": 0.0013661097870025756, "loss": 7.6175, "step": 1048400 }, { "epoch": 4.2713585751546255, "grad_norm": 5.898374557495117, "learning_rate": 0.001365668415837177, "loss": 7.6092, "step": 1048500 }, { "epoch": 4.271765953178007, "grad_norm": 21.504728317260742, "learning_rate": 0.0013652270897156489, "loss": 7.5872, "step": 1048600 }, { "epoch": 4.2721733312013885, "grad_norm": 3.979430913925171, "learning_rate": 0.001364785808655441, "loss": 7.5815, "step": 1048700 }, { "epoch": 4.27258070922477, "grad_norm": 15.458041191101074, "learning_rate": 0.0013643445726739976, "loss": 7.5795, "step": 1048800 }, { "epoch": 4.272988087248152, "grad_norm": 6.766093730926514, "learning_rate": 0.001363903381788764, "loss": 7.5902, "step": 1048900 }, { "epoch": 4.273395465271533, "grad_norm": 11.906927108764648, "learning_rate": 0.0013634622360171836, "loss": 7.5898, "step": 1049000 }, { "epoch": 4.273395465271533, "eval_MaskedAccuracy": 0.5105699382233186, "eval_loss": 1.6041675806045532, "eval_runtime": 157.58, "eval_samples_per_second": 402.818, "eval_steps_per_second": 1.574, "step": 1049000 }, { "epoch": 4.273802843294914, "grad_norm": 5.8816375732421875, "learning_rate": 0.0013630211353766943, "loss": 7.5897, "step": 1049100 }, { "epoch": 4.274210221318295, "grad_norm": 7.39239501953125, "learning_rate": 0.0013625800798847394, "loss": 7.6126, "step": 1049200 }, { "epoch": 4.274617599341677, "grad_norm": 8.080638885498047, "learning_rate": 0.001362139069558753, "loss": 7.6085, "step": 1049300 }, { "epoch": 4.275024977365058, "grad_norm": 4.267389297485352, "learning_rate": 0.0013616981044161716, "loss": 7.6104, "step": 1049400 }, { "epoch": 4.27543235538844, "grad_norm": 10.538228988647461, "learning_rate": 0.0013612571844744306, "loss": 7.6089, "step": 1049500 }, { "epoch": 4.275839733411821, "grad_norm": 5.671195030212402, "learning_rate": 0.0013608163097509603, "loss": 7.5492, "step": 1049600 }, { "epoch": 4.276247111435203, "grad_norm": 8.128995895385742, "learning_rate": 0.0013603754802631908, "loss": 7.6204, "step": 1049700 }, { "epoch": 4.2766544894585845, "grad_norm": 15.104260444641113, "learning_rate": 0.0013599346960285505, "loss": 7.6049, "step": 1049800 }, { "epoch": 4.277061867481966, "grad_norm": 6.141758918762207, "learning_rate": 0.0013594939570644633, "loss": 7.5944, "step": 1049900 }, { "epoch": 4.277469245505348, "grad_norm": 6.137265682220459, "learning_rate": 0.001359053263388357, "loss": 7.5827, "step": 1050000 }, { "epoch": 4.277469245505348, "eval_MaskedAccuracy": 0.5104539762579546, "eval_loss": 1.604261040687561, "eval_runtime": 160.1805, "eval_samples_per_second": 396.278, "eval_steps_per_second": 1.548, "step": 1050000 }, { "epoch": 4.277876623528729, "grad_norm": 6.1768035888671875, "learning_rate": 0.001358612615017653, "loss": 7.5911, "step": 1050100 }, { "epoch": 4.278284001552111, "grad_norm": 2.9127068519592285, "learning_rate": 0.0013581720119697752, "loss": 7.6041, "step": 1050200 }, { "epoch": 4.278691379575491, "grad_norm": 5.4770426750183105, "learning_rate": 0.0013577314542621418, "loss": 7.573, "step": 1050300 }, { "epoch": 4.279098757598873, "grad_norm": 2.914484977722168, "learning_rate": 0.0013572909419121686, "loss": 7.5765, "step": 1050400 }, { "epoch": 4.279506135622254, "grad_norm": 4.840549945831299, "learning_rate": 0.001356850474937274, "loss": 7.6156, "step": 1050500 }, { "epoch": 4.279913513645636, "grad_norm": 10.579102516174316, "learning_rate": 0.0013564100533548715, "loss": 7.5796, "step": 1050600 }, { "epoch": 4.280320891669017, "grad_norm": 8.241044998168945, "learning_rate": 0.0013559696771823734, "loss": 7.5974, "step": 1050700 }, { "epoch": 4.280728269692399, "grad_norm": 17.360485076904297, "learning_rate": 0.001355529346437188, "loss": 7.6033, "step": 1050800 }, { "epoch": 4.2811356477157805, "grad_norm": 12.6903657913208, "learning_rate": 0.001355089061136727, "loss": 7.6016, "step": 1050900 }, { "epoch": 4.281543025739162, "grad_norm": 11.648893356323242, "learning_rate": 0.001354648821298394, "loss": 7.6138, "step": 1051000 }, { "epoch": 4.281543025739162, "eval_MaskedAccuracy": 0.5107127187312243, "eval_loss": 1.595258355140686, "eval_runtime": 158.3991, "eval_samples_per_second": 400.735, "eval_steps_per_second": 1.566, "step": 1051000 }, { "epoch": 4.2819504037625435, "grad_norm": 10.427696228027344, "learning_rate": 0.0013542086269395966, "loss": 7.6047, "step": 1051100 }, { "epoch": 4.282357781785925, "grad_norm": 4.498067378997803, "learning_rate": 0.0013537684780777367, "loss": 7.6153, "step": 1051200 }, { "epoch": 4.282765159809307, "grad_norm": 6.52211332321167, "learning_rate": 0.0013533283747302147, "loss": 7.6057, "step": 1051300 }, { "epoch": 4.283172537832688, "grad_norm": 3.8469252586364746, "learning_rate": 0.0013528883169144329, "loss": 7.6022, "step": 1051400 }, { "epoch": 4.28357991585607, "grad_norm": 15.737393379211426, "learning_rate": 0.0013524483046477884, "loss": 7.5877, "step": 1051500 }, { "epoch": 4.28398729387945, "grad_norm": 9.453332901000977, "learning_rate": 0.0013520083379476757, "loss": 7.6081, "step": 1051600 }, { "epoch": 4.284394671902832, "grad_norm": 4.673506736755371, "learning_rate": 0.0013515684168314904, "loss": 7.6194, "step": 1051700 }, { "epoch": 4.284802049926213, "grad_norm": 5.137613296508789, "learning_rate": 0.0013511285413166255, "loss": 7.6118, "step": 1051800 }, { "epoch": 4.285209427949595, "grad_norm": 4.768191814422607, "learning_rate": 0.001350688711420469, "loss": 7.5906, "step": 1051900 }, { "epoch": 4.2856168059729765, "grad_norm": 10.33692455291748, "learning_rate": 0.0013502489271604132, "loss": 7.573, "step": 1052000 }, { "epoch": 4.2856168059729765, "eval_MaskedAccuracy": 0.5105846314590503, "eval_loss": 1.6071923971176147, "eval_runtime": 175.5904, "eval_samples_per_second": 361.5, "eval_steps_per_second": 1.412, "step": 1052000 }, { "epoch": 4.286024183996358, "grad_norm": 4.2482171058654785, "learning_rate": 0.0013498091885538435, "loss": 7.6372, "step": 1052100 }, { "epoch": 4.2864315620197395, "grad_norm": 8.275416374206543, "learning_rate": 0.0013493694956181429, "loss": 7.5969, "step": 1052200 }, { "epoch": 4.286838940043121, "grad_norm": 8.151731491088867, "learning_rate": 0.0013489298483706984, "loss": 7.586, "step": 1052300 }, { "epoch": 4.287246318066503, "grad_norm": 7.892337799072266, "learning_rate": 0.0013484902468288904, "loss": 7.5684, "step": 1052400 }, { "epoch": 4.287653696089884, "grad_norm": 8.394124984741211, "learning_rate": 0.001348050691010097, "loss": 7.6165, "step": 1052500 }, { "epoch": 4.288061074113266, "grad_norm": 6.7744879722595215, "learning_rate": 0.0013476111809316964, "loss": 7.5881, "step": 1052600 }, { "epoch": 4.288468452136647, "grad_norm": 4.3457770347595215, "learning_rate": 0.0013471717166110664, "loss": 7.5829, "step": 1052700 }, { "epoch": 4.288875830160029, "grad_norm": 6.017189979553223, "learning_rate": 0.0013467322980655826, "loss": 7.5883, "step": 1052800 }, { "epoch": 4.289283208183409, "grad_norm": 8.758934020996094, "learning_rate": 0.0013462929253126165, "loss": 7.5988, "step": 1052900 }, { "epoch": 4.289690586206791, "grad_norm": 4.954868793487549, "learning_rate": 0.001345853598369537, "loss": 7.5871, "step": 1053000 }, { "epoch": 4.289690586206791, "eval_MaskedAccuracy": 0.5101855293138061, "eval_loss": 1.5933858156204224, "eval_runtime": 158.1265, "eval_samples_per_second": 401.425, "eval_steps_per_second": 1.568, "step": 1053000 }, { "epoch": 4.290097964230172, "grad_norm": 3.5732669830322266, "learning_rate": 0.0013454143172537143, "loss": 7.6088, "step": 1053100 }, { "epoch": 4.290505342253554, "grad_norm": 5.973149299621582, "learning_rate": 0.0013449750819825156, "loss": 7.5845, "step": 1053200 }, { "epoch": 4.2909127202769355, "grad_norm": 7.148528575897217, "learning_rate": 0.0013445358925733074, "loss": 7.5924, "step": 1053300 }, { "epoch": 4.291320098300317, "grad_norm": 4.60052490234375, "learning_rate": 0.0013440967490434504, "loss": 7.5947, "step": 1053400 }, { "epoch": 4.2917274763236986, "grad_norm": 4.36650276184082, "learning_rate": 0.0013436576514103096, "loss": 7.5904, "step": 1053500 }, { "epoch": 4.29213485434708, "grad_norm": 5.558254718780518, "learning_rate": 0.001343218599691242, "loss": 7.6059, "step": 1053600 }, { "epoch": 4.292542232370462, "grad_norm": 8.331778526306152, "learning_rate": 0.0013427795939036072, "loss": 7.6071, "step": 1053700 }, { "epoch": 4.292949610393843, "grad_norm": 4.540841102600098, "learning_rate": 0.0013423406340647614, "loss": 7.6168, "step": 1053800 }, { "epoch": 4.293356988417225, "grad_norm": 13.898836135864258, "learning_rate": 0.0013419017201920572, "loss": 7.5812, "step": 1053900 }, { "epoch": 4.293764366440606, "grad_norm": 4.2923736572265625, "learning_rate": 0.001341462852302848, "loss": 7.6087, "step": 1054000 }, { "epoch": 4.293764366440606, "eval_MaskedAccuracy": 0.5105729284503422, "eval_loss": 1.599921464920044, "eval_runtime": 158.362, "eval_samples_per_second": 400.828, "eval_steps_per_second": 1.566, "step": 1054000 }, { "epoch": 4.294171744463987, "grad_norm": 9.693986892700195, "learning_rate": 0.0013410240304144857, "loss": 7.6009, "step": 1054100 }, { "epoch": 4.294579122487368, "grad_norm": 5.510178565979004, "learning_rate": 0.0013405852545443201, "loss": 7.6049, "step": 1054200 }, { "epoch": 4.29498650051075, "grad_norm": 6.429800987243652, "learning_rate": 0.0013401465247096964, "loss": 7.598, "step": 1054300 }, { "epoch": 4.2953938785341315, "grad_norm": 9.711774826049805, "learning_rate": 0.0013397078409279613, "loss": 7.5863, "step": 1054400 }, { "epoch": 4.295801256557513, "grad_norm": 6.588146686553955, "learning_rate": 0.0013392692032164566, "loss": 7.5982, "step": 1054500 }, { "epoch": 4.2962086345808945, "grad_norm": 4.1650261878967285, "learning_rate": 0.0013388306115925262, "loss": 7.5887, "step": 1054600 }, { "epoch": 4.296616012604276, "grad_norm": 4.303426742553711, "learning_rate": 0.0013383920660735079, "loss": 7.598, "step": 1054700 }, { "epoch": 4.297023390627658, "grad_norm": 6.5245771408081055, "learning_rate": 0.0013379535666767429, "loss": 7.5787, "step": 1054800 }, { "epoch": 4.297430768651039, "grad_norm": 5.145477294921875, "learning_rate": 0.0013375151134195633, "loss": 7.5935, "step": 1054900 }, { "epoch": 4.297838146674421, "grad_norm": 3.6686227321624756, "learning_rate": 0.0013370767063193049, "loss": 7.5919, "step": 1055000 }, { "epoch": 4.297838146674421, "eval_MaskedAccuracy": 0.5110081258676045, "eval_loss": 1.5933496952056885, "eval_runtime": 155.7441, "eval_samples_per_second": 407.566, "eval_steps_per_second": 1.592, "step": 1055000 }, { "epoch": 4.298245524697802, "grad_norm": 10.672622680664062, "learning_rate": 0.0013366383453933017, "loss": 7.607, "step": 1055100 }, { "epoch": 4.298652902721184, "grad_norm": 4.603048801422119, "learning_rate": 0.0013362000306588828, "loss": 7.6094, "step": 1055200 }, { "epoch": 4.299060280744564, "grad_norm": 4.876296043395996, "learning_rate": 0.001335761762133378, "loss": 7.5943, "step": 1055300 }, { "epoch": 4.299467658767946, "grad_norm": 3.4054372310638428, "learning_rate": 0.0013353235398341147, "loss": 7.5754, "step": 1055400 }, { "epoch": 4.299875036791327, "grad_norm": 6.237635135650635, "learning_rate": 0.0013348853637784183, "loss": 7.6041, "step": 1055500 }, { "epoch": 4.300282414814709, "grad_norm": 6.513390064239502, "learning_rate": 0.0013344472339836128, "loss": 7.5821, "step": 1055600 }, { "epoch": 4.3006897928380905, "grad_norm": 4.691460609436035, "learning_rate": 0.0013340091504670183, "loss": 7.5944, "step": 1055700 }, { "epoch": 4.301097170861472, "grad_norm": 10.522651672363281, "learning_rate": 0.0013335711132459538, "loss": 7.6126, "step": 1055800 }, { "epoch": 4.301504548884854, "grad_norm": 4.642693042755127, "learning_rate": 0.001333133122337738, "loss": 7.6016, "step": 1055900 }, { "epoch": 4.301911926908235, "grad_norm": 6.95733118057251, "learning_rate": 0.0013326951777596892, "loss": 7.6056, "step": 1056000 }, { "epoch": 4.301911926908235, "eval_MaskedAccuracy": 0.5115172713143512, "eval_loss": 1.5941381454467773, "eval_runtime": 166.1302, "eval_samples_per_second": 382.086, "eval_steps_per_second": 1.493, "step": 1056000 }, { "epoch": 4.302319304931617, "grad_norm": 5.869209289550781, "learning_rate": 0.0013322572795291196, "loss": 7.5703, "step": 1056100 }, { "epoch": 4.302726682954998, "grad_norm": 5.532618999481201, "learning_rate": 0.0013318194276633426, "loss": 7.611, "step": 1056200 }, { "epoch": 4.30313406097838, "grad_norm": 11.812193870544434, "learning_rate": 0.0013313816221796673, "loss": 7.5935, "step": 1056300 }, { "epoch": 4.303541439001761, "grad_norm": 6.252383708953857, "learning_rate": 0.001330943863095403, "loss": 7.6089, "step": 1056400 }, { "epoch": 4.303948817025143, "grad_norm": 5.153746128082275, "learning_rate": 0.001330506150427856, "loss": 7.6089, "step": 1056500 }, { "epoch": 4.304356195048523, "grad_norm": 9.104736328125, "learning_rate": 0.0013300684841943335, "loss": 7.5613, "step": 1056600 }, { "epoch": 4.304763573071905, "grad_norm": 12.152100563049316, "learning_rate": 0.0013296308644121396, "loss": 7.616, "step": 1056700 }, { "epoch": 4.3051709510952865, "grad_norm": 4.863597393035889, "learning_rate": 0.0013291932910985728, "loss": 7.6433, "step": 1056800 }, { "epoch": 4.305578329118668, "grad_norm": 5.482358455657959, "learning_rate": 0.001328755764270934, "loss": 7.5724, "step": 1056900 }, { "epoch": 4.3059857071420495, "grad_norm": 15.2509183883667, "learning_rate": 0.0013283182839465178, "loss": 7.6107, "step": 1057000 }, { "epoch": 4.3059857071420495, "eval_MaskedAccuracy": 0.5105492409469372, "eval_loss": 1.58795166015625, "eval_runtime": 163.1464, "eval_samples_per_second": 389.074, "eval_steps_per_second": 1.52, "step": 1057000 }, { "epoch": 4.306393085165431, "grad_norm": 4.565687656402588, "learning_rate": 0.0013278808501426232, "loss": 7.5928, "step": 1057100 }, { "epoch": 4.306800463188813, "grad_norm": 7.921701431274414, "learning_rate": 0.0013274434628765457, "loss": 7.6182, "step": 1057200 }, { "epoch": 4.307207841212194, "grad_norm": 8.388190269470215, "learning_rate": 0.0013270061221655762, "loss": 7.6275, "step": 1057300 }, { "epoch": 4.307615219235576, "grad_norm": 6.327116012573242, "learning_rate": 0.0013265688280270057, "loss": 7.6206, "step": 1057400 }, { "epoch": 4.308022597258957, "grad_norm": 3.048905611038208, "learning_rate": 0.0013261315804781223, "loss": 7.6101, "step": 1057500 }, { "epoch": 4.308429975282339, "grad_norm": 2.9544801712036133, "learning_rate": 0.0013256943795362124, "loss": 7.5938, "step": 1057600 }, { "epoch": 4.30883735330572, "grad_norm": 4.046011447906494, "learning_rate": 0.0013252572252185609, "loss": 7.6113, "step": 1057700 }, { "epoch": 4.309244731329102, "grad_norm": 4.219917297363281, "learning_rate": 0.0013248201175424508, "loss": 7.5877, "step": 1057800 }, { "epoch": 4.309652109352482, "grad_norm": 6.405755996704102, "learning_rate": 0.0013243830565251619, "loss": 7.5735, "step": 1057900 }, { "epoch": 4.310059487375864, "grad_norm": 7.971467018127441, "learning_rate": 0.0013239460421839771, "loss": 7.6056, "step": 1058000 }, { "epoch": 4.310059487375864, "eval_MaskedAccuracy": 0.5106496008305519, "eval_loss": 1.5981202125549316, "eval_runtime": 168.8455, "eval_samples_per_second": 375.941, "eval_steps_per_second": 1.469, "step": 1058000 }, { "epoch": 4.3104668653992455, "grad_norm": 3.9189813137054443, "learning_rate": 0.001323509074536172, "loss": 7.6137, "step": 1058100 }, { "epoch": 4.310874243422627, "grad_norm": 4.11265754699707, "learning_rate": 0.001323072153599021, "loss": 7.5849, "step": 1058200 }, { "epoch": 4.311281621446009, "grad_norm": 5.129466533660889, "learning_rate": 0.0013226352793898007, "loss": 7.6074, "step": 1058300 }, { "epoch": 4.31168899946939, "grad_norm": 9.669137001037598, "learning_rate": 0.001322198451925782, "loss": 7.6006, "step": 1058400 }, { "epoch": 4.312096377492772, "grad_norm": 5.684308052062988, "learning_rate": 0.0013217616712242343, "loss": 7.5882, "step": 1058500 }, { "epoch": 4.312503755516153, "grad_norm": 10.73115348815918, "learning_rate": 0.0013213249373024265, "loss": 7.5805, "step": 1058600 }, { "epoch": 4.312911133539535, "grad_norm": 3.2344930171966553, "learning_rate": 0.0013208882501776244, "loss": 7.6101, "step": 1058700 }, { "epoch": 4.313318511562916, "grad_norm": 16.917919158935547, "learning_rate": 0.001320451609867093, "loss": 7.5866, "step": 1058800 }, { "epoch": 4.313725889586298, "grad_norm": 5.793231010437012, "learning_rate": 0.0013200150163880935, "loss": 7.6191, "step": 1058900 }, { "epoch": 4.314133267609679, "grad_norm": 10.23379898071289, "learning_rate": 0.0013195784697578887, "loss": 7.5969, "step": 1059000 }, { "epoch": 4.314133267609679, "eval_MaskedAccuracy": 0.5108911332159818, "eval_loss": 1.5956271886825562, "eval_runtime": 165.7102, "eval_samples_per_second": 383.054, "eval_steps_per_second": 1.497, "step": 1059000 }, { "epoch": 4.31454064563306, "grad_norm": 11.354375839233398, "learning_rate": 0.0013191419699937383, "loss": 7.5962, "step": 1059100 }, { "epoch": 4.3149480236564415, "grad_norm": 6.699918746948242, "learning_rate": 0.0013187055171128969, "loss": 7.5858, "step": 1059200 }, { "epoch": 4.315355401679823, "grad_norm": 8.553930282592773, "learning_rate": 0.0013182691111326198, "loss": 7.5681, "step": 1059300 }, { "epoch": 4.3157627797032045, "grad_norm": 4.089259147644043, "learning_rate": 0.001317832752070163, "loss": 7.6107, "step": 1059400 }, { "epoch": 4.316170157726586, "grad_norm": 7.99910831451416, "learning_rate": 0.0013173964399427762, "loss": 7.5744, "step": 1059500 }, { "epoch": 4.316577535749968, "grad_norm": 11.323901176452637, "learning_rate": 0.0013169601747677108, "loss": 7.5732, "step": 1059600 }, { "epoch": 4.316984913773349, "grad_norm": 5.703823566436768, "learning_rate": 0.001316523956562213, "loss": 7.5945, "step": 1059700 }, { "epoch": 4.317392291796731, "grad_norm": 14.853983879089355, "learning_rate": 0.0013160877853435307, "loss": 7.6008, "step": 1059800 }, { "epoch": 4.317799669820112, "grad_norm": 8.825891494750977, "learning_rate": 0.0013156516611289068, "loss": 7.605, "step": 1059900 }, { "epoch": 4.318207047843494, "grad_norm": 4.946917533874512, "learning_rate": 0.0013152155839355844, "loss": 7.5915, "step": 1060000 }, { "epoch": 4.318207047843494, "eval_MaskedAccuracy": 0.5103674138807739, "eval_loss": 1.5986543893814087, "eval_runtime": 161.7593, "eval_samples_per_second": 392.41, "eval_steps_per_second": 1.533, "step": 1060000 }, { "epoch": 4.318614425866875, "grad_norm": 4.960884094238281, "learning_rate": 0.0013147795537808021, "loss": 7.608, "step": 1060100 }, { "epoch": 4.319021803890257, "grad_norm": 15.583209991455078, "learning_rate": 0.0013143435706818, "loss": 7.5951, "step": 1060200 }, { "epoch": 4.3194291819136374, "grad_norm": 10.425910949707031, "learning_rate": 0.001313907634655815, "loss": 7.5948, "step": 1060300 }, { "epoch": 4.319836559937019, "grad_norm": 8.672053337097168, "learning_rate": 0.0013134717457200823, "loss": 7.5649, "step": 1060400 }, { "epoch": 4.3202439379604005, "grad_norm": 9.029831886291504, "learning_rate": 0.001313035903891833, "loss": 7.5745, "step": 1060500 }, { "epoch": 4.320651315983782, "grad_norm": 5.3896355628967285, "learning_rate": 0.0013126001091883005, "loss": 7.5787, "step": 1060600 }, { "epoch": 4.321058694007164, "grad_norm": 8.003243446350098, "learning_rate": 0.0013121643616267134, "loss": 7.5771, "step": 1060700 }, { "epoch": 4.321466072030545, "grad_norm": 3.931807279586792, "learning_rate": 0.001311728661224299, "loss": 7.6167, "step": 1060800 }, { "epoch": 4.321873450053927, "grad_norm": 2.4016737937927246, "learning_rate": 0.0013112930079982807, "loss": 7.5967, "step": 1060900 }, { "epoch": 4.322280828077308, "grad_norm": 7.975673675537109, "learning_rate": 0.001310857401965886, "loss": 7.6061, "step": 1061000 }, { "epoch": 4.322280828077308, "eval_MaskedAccuracy": 0.510862917167637, "eval_loss": 1.5919030904769897, "eval_runtime": 158.5265, "eval_samples_per_second": 400.413, "eval_steps_per_second": 1.564, "step": 1061000 }, { "epoch": 4.32268820610069, "grad_norm": 7.915943145751953, "learning_rate": 0.001310421843144334, "loss": 7.6006, "step": 1061100 }, { "epoch": 4.323095584124071, "grad_norm": 10.837762832641602, "learning_rate": 0.0013099863315508451, "loss": 7.5836, "step": 1061200 }, { "epoch": 4.323502962147453, "grad_norm": 10.817831993103027, "learning_rate": 0.0013095508672026379, "loss": 7.567, "step": 1061300 }, { "epoch": 4.323910340170834, "grad_norm": 5.905236721038818, "learning_rate": 0.001309115450116929, "loss": 7.5539, "step": 1061400 }, { "epoch": 4.324317718194216, "grad_norm": 10.229114532470703, "learning_rate": 0.0013086800803109317, "loss": 7.6162, "step": 1061500 }, { "epoch": 4.3247250962175965, "grad_norm": 8.440672874450684, "learning_rate": 0.0013082447578018604, "loss": 7.6045, "step": 1061600 }, { "epoch": 4.325132474240978, "grad_norm": 5.581210136413574, "learning_rate": 0.0013078094826069252, "loss": 7.6123, "step": 1061700 }, { "epoch": 4.3255398522643596, "grad_norm": 16.999338150024414, "learning_rate": 0.001307374254743332, "loss": 7.5482, "step": 1061800 }, { "epoch": 4.325947230287741, "grad_norm": 4.336997032165527, "learning_rate": 0.0013069390742282895, "loss": 7.5791, "step": 1061900 }, { "epoch": 4.326354608311123, "grad_norm": 9.085204124450684, "learning_rate": 0.0013065039410790047, "loss": 7.5664, "step": 1062000 }, { "epoch": 4.326354608311123, "eval_MaskedAccuracy": 0.5104186976277901, "eval_loss": 1.5993964672088623, "eval_runtime": 157.3186, "eval_samples_per_second": 403.487, "eval_steps_per_second": 1.576, "step": 1062000 }, { "epoch": 4.326761986334504, "grad_norm": 4.992656230926514, "learning_rate": 0.001306068855312679, "loss": 7.6227, "step": 1062100 }, { "epoch": 4.327169364357886, "grad_norm": 8.597797393798828, "learning_rate": 0.0013056338169465133, "loss": 7.5815, "step": 1062200 }, { "epoch": 4.327576742381267, "grad_norm": 9.000292778015137, "learning_rate": 0.0013051988259977084, "loss": 7.6169, "step": 1062300 }, { "epoch": 4.327984120404649, "grad_norm": 15.233333587646484, "learning_rate": 0.0013047638824834605, "loss": 7.5652, "step": 1062400 }, { "epoch": 4.32839149842803, "grad_norm": 9.209403991699219, "learning_rate": 0.001304328986420965, "loss": 7.5839, "step": 1062500 }, { "epoch": 4.328798876451412, "grad_norm": 11.246329307556152, "learning_rate": 0.0013038941378274167, "loss": 7.6156, "step": 1062600 }, { "epoch": 4.329206254474793, "grad_norm": 8.702144622802734, "learning_rate": 0.0013034593367200076, "loss": 7.5804, "step": 1062700 }, { "epoch": 4.329613632498175, "grad_norm": 6.944125652313232, "learning_rate": 0.0013030245831159271, "loss": 7.5906, "step": 1062800 }, { "epoch": 4.3300210105215555, "grad_norm": 10.930323600769043, "learning_rate": 0.0013025898770323624, "loss": 7.5963, "step": 1062900 }, { "epoch": 4.330428388544937, "grad_norm": 5.019044876098633, "learning_rate": 0.0013021552184865027, "loss": 7.5525, "step": 1063000 }, { "epoch": 4.330428388544937, "eval_MaskedAccuracy": 0.5101939467960676, "eval_loss": 1.5975756645202637, "eval_runtime": 158.1521, "eval_samples_per_second": 401.36, "eval_steps_per_second": 1.568, "step": 1063000 }, { "epoch": 4.330835766568319, "grad_norm": 8.213278770446777, "learning_rate": 0.001301720607495529, "loss": 7.6018, "step": 1063100 }, { "epoch": 4.3312431445917, "grad_norm": 6.062457084655762, "learning_rate": 0.0013012860440766243, "loss": 7.5964, "step": 1063200 }, { "epoch": 4.331650522615082, "grad_norm": 3.820427894592285, "learning_rate": 0.0013008515282469705, "loss": 7.5949, "step": 1063300 }, { "epoch": 4.332057900638463, "grad_norm": 6.628716945648193, "learning_rate": 0.0013004170600237471, "loss": 7.607, "step": 1063400 }, { "epoch": 4.332465278661845, "grad_norm": 3.9690921306610107, "learning_rate": 0.0012999826394241298, "loss": 7.596, "step": 1063500 }, { "epoch": 4.332872656685226, "grad_norm": 5.554694175720215, "learning_rate": 0.001299548266465293, "loss": 7.5683, "step": 1063600 }, { "epoch": 4.333280034708608, "grad_norm": 10.183398246765137, "learning_rate": 0.0012991139411644094, "loss": 7.6026, "step": 1063700 }, { "epoch": 4.333687412731989, "grad_norm": 5.579558372497559, "learning_rate": 0.001298679663538653, "loss": 7.5957, "step": 1063800 }, { "epoch": 4.334094790755371, "grad_norm": 5.486879348754883, "learning_rate": 0.0012982454336051906, "loss": 7.6042, "step": 1063900 }, { "epoch": 4.334502168778752, "grad_norm": 10.786160469055176, "learning_rate": 0.001297811251381193, "loss": 7.5803, "step": 1064000 }, { "epoch": 4.334502168778752, "eval_MaskedAccuracy": 0.51076194041149, "eval_loss": 1.5966955423355103, "eval_runtime": 173.4607, "eval_samples_per_second": 365.939, "eval_steps_per_second": 1.43, "step": 1064000 }, { "epoch": 4.334909546802133, "grad_norm": 8.863340377807617, "learning_rate": 0.0012973771168838214, "loss": 7.5892, "step": 1064100 }, { "epoch": 4.335316924825515, "grad_norm": 4.055553436279297, "learning_rate": 0.0012969430301302412, "loss": 7.6095, "step": 1064200 }, { "epoch": 4.335724302848896, "grad_norm": 9.943106651306152, "learning_rate": 0.0012965089911376128, "loss": 7.5997, "step": 1064300 }, { "epoch": 4.336131680872278, "grad_norm": 8.628669738769531, "learning_rate": 0.001296074999923099, "loss": 7.5851, "step": 1064400 }, { "epoch": 4.336539058895659, "grad_norm": 7.279753684997559, "learning_rate": 0.0012956410565038562, "loss": 7.5732, "step": 1064500 }, { "epoch": 4.336946436919041, "grad_norm": 5.713074684143066, "learning_rate": 0.001295207160897041, "loss": 7.6035, "step": 1064600 }, { "epoch": 4.337353814942422, "grad_norm": 6.424903392791748, "learning_rate": 0.001294773313119807, "loss": 7.5754, "step": 1064700 }, { "epoch": 4.337761192965804, "grad_norm": 7.36214017868042, "learning_rate": 0.0012943395131893049, "loss": 7.5841, "step": 1064800 }, { "epoch": 4.338168570989185, "grad_norm": 10.240035057067871, "learning_rate": 0.0012939057611226885, "loss": 7.5897, "step": 1064900 }, { "epoch": 4.338575949012567, "grad_norm": 5.936344623565674, "learning_rate": 0.001293472056937105, "loss": 7.5685, "step": 1065000 }, { "epoch": 4.338575949012567, "eval_MaskedAccuracy": 0.509844156468321, "eval_loss": 1.6008100509643555, "eval_runtime": 158.0993, "eval_samples_per_second": 401.495, "eval_steps_per_second": 1.569, "step": 1065000 }, { "epoch": 4.338983327035948, "grad_norm": 5.6455864906311035, "learning_rate": 0.0012930384006496994, "loss": 7.5927, "step": 1065100 }, { "epoch": 4.33939070505933, "grad_norm": 7.265775203704834, "learning_rate": 0.0012926047922776187, "loss": 7.6001, "step": 1065200 }, { "epoch": 4.3397980830827105, "grad_norm": 9.657461166381836, "learning_rate": 0.0012921712318380051, "loss": 7.5824, "step": 1065300 }, { "epoch": 4.340205461106092, "grad_norm": 10.5460786819458, "learning_rate": 0.0012917377193479994, "loss": 7.5894, "step": 1065400 }, { "epoch": 4.340612839129474, "grad_norm": 4.517817974090576, "learning_rate": 0.0012913042548247412, "loss": 7.593, "step": 1065500 }, { "epoch": 4.341020217152855, "grad_norm": 12.844802856445312, "learning_rate": 0.0012908708382853673, "loss": 7.6035, "step": 1065600 }, { "epoch": 4.341427595176237, "grad_norm": 4.840984344482422, "learning_rate": 0.0012904374697470123, "loss": 7.5903, "step": 1065700 }, { "epoch": 4.341834973199618, "grad_norm": 12.027416229248047, "learning_rate": 0.00129000414922681, "loss": 7.5852, "step": 1065800 }, { "epoch": 4.342242351223, "grad_norm": 5.543475151062012, "learning_rate": 0.0012895708767418918, "loss": 7.5684, "step": 1065900 }, { "epoch": 4.342649729246381, "grad_norm": 8.735795021057129, "learning_rate": 0.0012891376523093865, "loss": 7.5857, "step": 1066000 }, { "epoch": 4.342649729246381, "eval_MaskedAccuracy": 0.5101855763272692, "eval_loss": 1.5994043350219727, "eval_runtime": 154.8085, "eval_samples_per_second": 410.029, "eval_steps_per_second": 1.602, "step": 1066000 }, { "epoch": 4.343057107269763, "grad_norm": 4.258831977844238, "learning_rate": 0.0012887044759464234, "loss": 7.6141, "step": 1066100 }, { "epoch": 4.343464485293144, "grad_norm": 10.738426208496094, "learning_rate": 0.0012882713476701272, "loss": 7.589, "step": 1066200 }, { "epoch": 4.343871863316526, "grad_norm": 4.778125762939453, "learning_rate": 0.0012878382674976223, "loss": 7.6103, "step": 1066300 }, { "epoch": 4.344279241339907, "grad_norm": 6.218105792999268, "learning_rate": 0.0012874052354460307, "loss": 7.5739, "step": 1066400 }, { "epoch": 4.344686619363289, "grad_norm": 4.286153793334961, "learning_rate": 0.0012869722515324725, "loss": 7.5593, "step": 1066500 }, { "epoch": 4.34509399738667, "grad_norm": 3.8365478515625, "learning_rate": 0.0012865393157740653, "loss": 7.5927, "step": 1066600 }, { "epoch": 4.345501375410051, "grad_norm": 11.182649612426758, "learning_rate": 0.0012861064281879256, "loss": 7.5998, "step": 1066700 }, { "epoch": 4.345908753433433, "grad_norm": 5.147824287414551, "learning_rate": 0.0012856735887911684, "loss": 7.6066, "step": 1066800 }, { "epoch": 4.346316131456814, "grad_norm": 3.963681936264038, "learning_rate": 0.0012852407976009048, "loss": 7.5603, "step": 1066900 }, { "epoch": 4.346723509480196, "grad_norm": 12.69094181060791, "learning_rate": 0.0012848080546342458, "loss": 7.5934, "step": 1067000 }, { "epoch": 4.346723509480196, "eval_MaskedAccuracy": 0.5104228786458219, "eval_loss": 1.605615258216858, "eval_runtime": 162.6117, "eval_samples_per_second": 390.353, "eval_steps_per_second": 1.525, "step": 1067000 }, { "epoch": 4.347130887503577, "grad_norm": 4.976997375488281, "learning_rate": 0.0012843753599083004, "loss": 7.6242, "step": 1067100 }, { "epoch": 4.347538265526959, "grad_norm": 3.4787328243255615, "learning_rate": 0.0012839427134401769, "loss": 7.5837, "step": 1067200 }, { "epoch": 4.34794564355034, "grad_norm": 8.680774688720703, "learning_rate": 0.0012835101152469778, "loss": 7.5942, "step": 1067300 }, { "epoch": 4.348353021573722, "grad_norm": 8.437795639038086, "learning_rate": 0.0012830775653458076, "loss": 7.5711, "step": 1067400 }, { "epoch": 4.348760399597103, "grad_norm": 7.789732933044434, "learning_rate": 0.0012826450637537662, "loss": 7.5909, "step": 1067500 }, { "epoch": 4.349167777620485, "grad_norm": 7.138700008392334, "learning_rate": 0.0012822126104879519, "loss": 7.6016, "step": 1067600 }, { "epoch": 4.349575155643866, "grad_norm": 3.996811628341675, "learning_rate": 0.0012817802055654635, "loss": 7.5711, "step": 1067700 }, { "epoch": 4.349982533667248, "grad_norm": 8.46236515045166, "learning_rate": 0.001281347849003394, "loss": 7.5659, "step": 1067800 }, { "epoch": 4.350389911690629, "grad_norm": 15.083024024963379, "learning_rate": 0.0012809155408188399, "loss": 7.6039, "step": 1067900 }, { "epoch": 4.35079728971401, "grad_norm": 3.8241968154907227, "learning_rate": 0.00128048328102889, "loss": 7.5949, "step": 1068000 }, { "epoch": 4.35079728971401, "eval_MaskedAccuracy": 0.5108676400674896, "eval_loss": 1.596187710762024, "eval_runtime": 153.5642, "eval_samples_per_second": 413.351, "eval_steps_per_second": 1.615, "step": 1068000 }, { "epoch": 4.351204667737392, "grad_norm": 12.495206832885742, "learning_rate": 0.001280051069650638, "loss": 7.589, "step": 1068100 }, { "epoch": 4.351612045760773, "grad_norm": 9.804719924926758, "learning_rate": 0.0012796189067011679, "loss": 7.586, "step": 1068200 }, { "epoch": 4.352019423784155, "grad_norm": 4.911618709564209, "learning_rate": 0.0012791867921975667, "loss": 7.5939, "step": 1068300 }, { "epoch": 4.352426801807536, "grad_norm": 10.578044891357422, "learning_rate": 0.0012787547261569163, "loss": 7.5736, "step": 1068400 }, { "epoch": 4.352834179830918, "grad_norm": 4.151943683624268, "learning_rate": 0.001278322708596301, "loss": 7.6047, "step": 1068500 }, { "epoch": 4.353241557854299, "grad_norm": 11.867549896240234, "learning_rate": 0.0012778907395327996, "loss": 7.5943, "step": 1068600 }, { "epoch": 4.353648935877681, "grad_norm": 16.011520385742188, "learning_rate": 0.001277458818983491, "loss": 7.5673, "step": 1068700 }, { "epoch": 4.354056313901062, "grad_norm": 3.1570136547088623, "learning_rate": 0.0012770269469654513, "loss": 7.5858, "step": 1068800 }, { "epoch": 4.354463691924444, "grad_norm": 11.839057922363281, "learning_rate": 0.0012765951234957537, "loss": 7.6403, "step": 1068900 }, { "epoch": 4.3548710699478255, "grad_norm": 5.1624650955200195, "learning_rate": 0.00127616334859147, "loss": 7.5585, "step": 1069000 }, { "epoch": 4.3548710699478255, "eval_MaskedAccuracy": 0.5108027954903482, "eval_loss": 1.5931923389434814, "eval_runtime": 152.3651, "eval_samples_per_second": 416.605, "eval_steps_per_second": 1.628, "step": 1069000 }, { "epoch": 4.355278447971206, "grad_norm": 11.090703964233398, "learning_rate": 0.0012757316222696718, "loss": 7.5973, "step": 1069100 }, { "epoch": 4.355685825994588, "grad_norm": 16.196626663208008, "learning_rate": 0.0012752999445474263, "loss": 7.5732, "step": 1069200 }, { "epoch": 4.356093204017969, "grad_norm": 14.063507080078125, "learning_rate": 0.0012748683154418018, "loss": 7.6002, "step": 1069300 }, { "epoch": 4.356500582041351, "grad_norm": 4.258674621582031, "learning_rate": 0.0012744367349698624, "loss": 7.565, "step": 1069400 }, { "epoch": 4.356907960064732, "grad_norm": 7.568656921386719, "learning_rate": 0.0012740052031486708, "loss": 7.5596, "step": 1069500 }, { "epoch": 4.357315338088114, "grad_norm": 8.4083890914917, "learning_rate": 0.0012735737199952857, "loss": 7.5934, "step": 1069600 }, { "epoch": 4.357722716111495, "grad_norm": 9.756007194519043, "learning_rate": 0.0012731422855267699, "loss": 7.5897, "step": 1069700 }, { "epoch": 4.358130094134877, "grad_norm": 10.273256301879883, "learning_rate": 0.0012727108997601757, "loss": 7.5617, "step": 1069800 }, { "epoch": 4.358537472158258, "grad_norm": 12.686280250549316, "learning_rate": 0.0012722795627125609, "loss": 7.5649, "step": 1069900 }, { "epoch": 4.35894485018164, "grad_norm": 4.117603778839111, "learning_rate": 0.0012718482744009778, "loss": 7.5786, "step": 1070000 }, { "epoch": 4.35894485018164, "eval_MaskedAccuracy": 0.5102318150094306, "eval_loss": 1.5989423990249634, "eval_runtime": 168.4935, "eval_samples_per_second": 376.727, "eval_steps_per_second": 1.472, "step": 1070000 }, { "epoch": 4.359352228205021, "grad_norm": 6.047852993011475, "learning_rate": 0.0012714170348424791, "loss": 7.5837, "step": 1070100 }, { "epoch": 4.359759606228403, "grad_norm": 10.343796730041504, "learning_rate": 0.001270985844054111, "loss": 7.5884, "step": 1070200 }, { "epoch": 4.360166984251784, "grad_norm": 4.143160343170166, "learning_rate": 0.0012705547020529224, "loss": 7.5587, "step": 1070300 }, { "epoch": 4.360574362275165, "grad_norm": 3.8405749797821045, "learning_rate": 0.0012701236088559617, "loss": 7.6016, "step": 1070400 }, { "epoch": 4.360981740298547, "grad_norm": 15.33755111694336, "learning_rate": 0.0012696925644802672, "loss": 7.5771, "step": 1070500 }, { "epoch": 4.361389118321928, "grad_norm": 6.435998916625977, "learning_rate": 0.0012692615689428833, "loss": 7.5763, "step": 1070600 }, { "epoch": 4.36179649634531, "grad_norm": 7.149518013000488, "learning_rate": 0.001268830622260848, "loss": 7.5817, "step": 1070700 }, { "epoch": 4.362203874368691, "grad_norm": 5.455778121948242, "learning_rate": 0.0012683997244511992, "loss": 7.5637, "step": 1070800 }, { "epoch": 4.362611252392073, "grad_norm": 7.155077934265137, "learning_rate": 0.0012679688755309734, "loss": 7.5535, "step": 1070900 }, { "epoch": 4.363018630415454, "grad_norm": 4.293442726135254, "learning_rate": 0.0012675380755172044, "loss": 7.6067, "step": 1071000 }, { "epoch": 4.363018630415454, "eval_MaskedAccuracy": 0.5106091601963929, "eval_loss": 1.5963608026504517, "eval_runtime": 165.3609, "eval_samples_per_second": 383.863, "eval_steps_per_second": 1.5, "step": 1071000 }, { "epoch": 4.363426008438836, "grad_norm": 9.340729713439941, "learning_rate": 0.0012671073244269228, "loss": 7.6366, "step": 1071100 }, { "epoch": 4.363833386462217, "grad_norm": 9.988269805908203, "learning_rate": 0.0012666766222771578, "loss": 7.5923, "step": 1071200 }, { "epoch": 4.364240764485599, "grad_norm": 6.921746253967285, "learning_rate": 0.00126624596908494, "loss": 7.5636, "step": 1071300 }, { "epoch": 4.3646481425089805, "grad_norm": 11.134282112121582, "learning_rate": 0.0012658153648672959, "loss": 7.5798, "step": 1071400 }, { "epoch": 4.365055520532362, "grad_norm": 4.560843467712402, "learning_rate": 0.0012653848096412477, "loss": 7.5861, "step": 1071500 }, { "epoch": 4.365462898555743, "grad_norm": 9.250143051147461, "learning_rate": 0.0012649543034238177, "loss": 7.5835, "step": 1071600 }, { "epoch": 4.365870276579124, "grad_norm": 6.468830108642578, "learning_rate": 0.0012645238462320268, "loss": 7.5474, "step": 1071700 }, { "epoch": 4.366277654602506, "grad_norm": 10.883308410644531, "learning_rate": 0.0012640934380828924, "loss": 7.5726, "step": 1071800 }, { "epoch": 4.366685032625887, "grad_norm": 13.532042503356934, "learning_rate": 0.00126366307899343, "loss": 7.5847, "step": 1071900 }, { "epoch": 4.367092410649269, "grad_norm": 9.324649810791016, "learning_rate": 0.0012632327689806565, "loss": 7.562, "step": 1072000 }, { "epoch": 4.367092410649269, "eval_MaskedAccuracy": 0.5105239092277821, "eval_loss": 1.5967248678207397, "eval_runtime": 152.6265, "eval_samples_per_second": 415.891, "eval_steps_per_second": 1.625, "step": 1072000 }, { "epoch": 4.36749978867265, "grad_norm": 11.059331893920898, "learning_rate": 0.0012628025080615828, "loss": 7.598, "step": 1072100 }, { "epoch": 4.367907166696032, "grad_norm": 7.78730583190918, "learning_rate": 0.0012623722962532204, "loss": 7.5879, "step": 1072200 }, { "epoch": 4.368314544719413, "grad_norm": 8.362425804138184, "learning_rate": 0.0012619421335725762, "loss": 7.5957, "step": 1072300 }, { "epoch": 4.368721922742795, "grad_norm": 9.769828796386719, "learning_rate": 0.0012615120200366592, "loss": 7.6245, "step": 1072400 }, { "epoch": 4.3691293007661764, "grad_norm": 4.522059917449951, "learning_rate": 0.0012610819556624709, "loss": 7.6131, "step": 1072500 }, { "epoch": 4.369536678789558, "grad_norm": 11.34514331817627, "learning_rate": 0.0012606519404670161, "loss": 7.5927, "step": 1072600 }, { "epoch": 4.3699440568129395, "grad_norm": 3.706719160079956, "learning_rate": 0.0012602219744672959, "loss": 7.6163, "step": 1072700 }, { "epoch": 4.370351434836321, "grad_norm": 7.14650821685791, "learning_rate": 0.001259792057680308, "loss": 7.5554, "step": 1072800 }, { "epoch": 4.370758812859702, "grad_norm": 9.408866882324219, "learning_rate": 0.0012593621901230512, "loss": 7.5682, "step": 1072900 }, { "epoch": 4.371166190883083, "grad_norm": 6.3065690994262695, "learning_rate": 0.0012589323718125189, "loss": 7.5868, "step": 1073000 }, { "epoch": 4.371166190883083, "eval_MaskedAccuracy": 0.5104471424898137, "eval_loss": 1.5974905490875244, "eval_runtime": 163.3728, "eval_samples_per_second": 388.535, "eval_steps_per_second": 1.518, "step": 1073000 }, { "epoch": 4.371573568906465, "grad_norm": 10.61609935760498, "learning_rate": 0.0012585026027657027, "loss": 7.6045, "step": 1073100 }, { "epoch": 4.371980946929846, "grad_norm": 3.768588066101074, "learning_rate": 0.0012580728829995958, "loss": 7.5806, "step": 1073200 }, { "epoch": 4.372388324953228, "grad_norm": 10.519930839538574, "learning_rate": 0.0012576432125311887, "loss": 7.6019, "step": 1073300 }, { "epoch": 4.372795702976609, "grad_norm": 3.8004915714263916, "learning_rate": 0.001257213591377466, "loss": 7.6136, "step": 1073400 }, { "epoch": 4.373203080999991, "grad_norm": 7.739326477050781, "learning_rate": 0.001256784019555413, "loss": 7.5987, "step": 1073500 }, { "epoch": 4.373610459023372, "grad_norm": 6.079395771026611, "learning_rate": 0.0012563544970820137, "loss": 7.5881, "step": 1073600 }, { "epoch": 4.374017837046754, "grad_norm": 15.077213287353516, "learning_rate": 0.0012559250239742494, "loss": 7.5886, "step": 1073700 }, { "epoch": 4.3744252150701355, "grad_norm": 4.818278789520264, "learning_rate": 0.0012554956002491006, "loss": 7.6213, "step": 1073800 }, { "epoch": 4.374832593093517, "grad_norm": 15.234819412231445, "learning_rate": 0.0012550662259235427, "loss": 7.5854, "step": 1073900 }, { "epoch": 4.3752399711168986, "grad_norm": 11.360335350036621, "learning_rate": 0.0012546369010145533, "loss": 7.5524, "step": 1074000 }, { "epoch": 4.3752399711168986, "eval_MaskedAccuracy": 0.5106042494614875, "eval_loss": 1.59951651096344, "eval_runtime": 164.5784, "eval_samples_per_second": 385.689, "eval_steps_per_second": 1.507, "step": 1074000 }, { "epoch": 4.375647349140279, "grad_norm": 9.735004425048828, "learning_rate": 0.0012542076255391025, "loss": 7.5879, "step": 1074100 }, { "epoch": 4.376054727163661, "grad_norm": 18.597047805786133, "learning_rate": 0.0012537783995141651, "loss": 7.6059, "step": 1074200 }, { "epoch": 4.376462105187042, "grad_norm": 14.264388084411621, "learning_rate": 0.0012533492229567097, "loss": 7.5719, "step": 1074300 }, { "epoch": 4.376869483210424, "grad_norm": 3.560699224472046, "learning_rate": 0.0012529200958837038, "loss": 7.6011, "step": 1074400 }, { "epoch": 4.377276861233805, "grad_norm": 6.42880916595459, "learning_rate": 0.0012524910183121121, "loss": 7.5836, "step": 1074500 }, { "epoch": 4.377684239257187, "grad_norm": 4.812469959259033, "learning_rate": 0.0012520619902589001, "loss": 7.5817, "step": 1074600 }, { "epoch": 4.378091617280568, "grad_norm": 6.2986249923706055, "learning_rate": 0.0012516330117410296, "loss": 7.6005, "step": 1074700 }, { "epoch": 4.37849899530395, "grad_norm": 8.868752479553223, "learning_rate": 0.0012512040827754572, "loss": 7.6097, "step": 1074800 }, { "epoch": 4.3789063733273315, "grad_norm": 13.603480339050293, "learning_rate": 0.001250775203379144, "loss": 7.5842, "step": 1074900 }, { "epoch": 4.379313751350713, "grad_norm": 9.5311279296875, "learning_rate": 0.0012503463735690453, "loss": 7.5775, "step": 1075000 }, { "epoch": 4.379313751350713, "eval_MaskedAccuracy": 0.5112514387056566, "eval_loss": 1.5999317169189453, "eval_runtime": 165.7465, "eval_samples_per_second": 382.97, "eval_steps_per_second": 1.496, "step": 1075000 }, { "epoch": 4.3797211293740945, "grad_norm": 9.72091293334961, "learning_rate": 0.001249917593362115, "loss": 7.6107, "step": 1075100 }, { "epoch": 4.380128507397476, "grad_norm": 7.305984973907471, "learning_rate": 0.001249488862775306, "loss": 7.6228, "step": 1075200 }, { "epoch": 4.380535885420857, "grad_norm": 5.1745829582214355, "learning_rate": 0.0012490601818255665, "loss": 7.5918, "step": 1075300 }, { "epoch": 4.380943263444238, "grad_norm": 9.361069679260254, "learning_rate": 0.0012486315505298446, "loss": 7.578, "step": 1075400 }, { "epoch": 4.38135064146762, "grad_norm": 5.21118688583374, "learning_rate": 0.0012482029689050882, "loss": 7.5887, "step": 1075500 }, { "epoch": 4.381758019491001, "grad_norm": 2.8380625247955322, "learning_rate": 0.0012477744369682395, "loss": 7.5884, "step": 1075600 }, { "epoch": 4.382165397514383, "grad_norm": 13.930927276611328, "learning_rate": 0.0012473459547362419, "loss": 7.5569, "step": 1075700 }, { "epoch": 4.382572775537764, "grad_norm": 5.366053581237793, "learning_rate": 0.0012469175222260345, "loss": 7.5963, "step": 1075800 }, { "epoch": 4.382980153561146, "grad_norm": 3.2593436241149902, "learning_rate": 0.0012464891394545579, "loss": 7.6076, "step": 1075900 }, { "epoch": 4.383387531584527, "grad_norm": 8.94016170501709, "learning_rate": 0.0012460608064387459, "loss": 7.606, "step": 1076000 }, { "epoch": 4.383387531584527, "eval_MaskedAccuracy": 0.5099280734437895, "eval_loss": 1.6040574312210083, "eval_runtime": 163.1411, "eval_samples_per_second": 389.086, "eval_steps_per_second": 1.52, "step": 1076000 }, { "epoch": 4.383794909607909, "grad_norm": 4.465366363525391, "learning_rate": 0.0012456325231955325, "loss": 7.5911, "step": 1076100 }, { "epoch": 4.3842022876312905, "grad_norm": 11.04880142211914, "learning_rate": 0.001245204289741853, "loss": 7.5984, "step": 1076200 }, { "epoch": 4.384609665654672, "grad_norm": 9.210282325744629, "learning_rate": 0.0012447761060946346, "loss": 7.585, "step": 1076300 }, { "epoch": 4.385017043678054, "grad_norm": 5.569253444671631, "learning_rate": 0.0012443479722708069, "loss": 7.5811, "step": 1076400 }, { "epoch": 4.385424421701435, "grad_norm": 5.126228332519531, "learning_rate": 0.0012439198882872955, "loss": 7.5979, "step": 1076500 }, { "epoch": 4.385831799724816, "grad_norm": 8.947504997253418, "learning_rate": 0.0012434918541610264, "loss": 7.5892, "step": 1076600 }, { "epoch": 4.386239177748197, "grad_norm": 9.127795219421387, "learning_rate": 0.00124306386990892, "loss": 7.5509, "step": 1076700 }, { "epoch": 4.386646555771579, "grad_norm": 5.873353004455566, "learning_rate": 0.0012426359355478965, "loss": 7.6015, "step": 1076800 }, { "epoch": 4.38705393379496, "grad_norm": 10.857064247131348, "learning_rate": 0.0012422080510948795, "loss": 7.5986, "step": 1076900 }, { "epoch": 4.387461311818342, "grad_norm": 3.498786449432373, "learning_rate": 0.0012417802165667813, "loss": 7.6264, "step": 1077000 }, { "epoch": 4.387461311818342, "eval_MaskedAccuracy": 0.5107773997206252, "eval_loss": 1.602007508277893, "eval_runtime": 168.0554, "eval_samples_per_second": 377.709, "eval_steps_per_second": 1.476, "step": 1077000 }, { "epoch": 4.387868689841723, "grad_norm": 5.145960330963135, "learning_rate": 0.0012413524319805163, "loss": 7.6106, "step": 1077100 }, { "epoch": 4.388276067865105, "grad_norm": 7.251046657562256, "learning_rate": 0.0012409246973529992, "loss": 7.6026, "step": 1077200 }, { "epoch": 4.3886834458884865, "grad_norm": 2.875115394592285, "learning_rate": 0.001240497012701139, "loss": 7.5612, "step": 1077300 }, { "epoch": 4.389090823911868, "grad_norm": 8.643251419067383, "learning_rate": 0.0012400693780418447, "loss": 7.637, "step": 1077400 }, { "epoch": 4.3894982019352495, "grad_norm": 9.116021156311035, "learning_rate": 0.0012396417933920247, "loss": 7.5846, "step": 1077500 }, { "epoch": 4.389905579958631, "grad_norm": 7.570412635803223, "learning_rate": 0.0012392142587685816, "loss": 7.6097, "step": 1077600 }, { "epoch": 4.390312957982013, "grad_norm": 6.775341033935547, "learning_rate": 0.0012387867741884189, "loss": 7.589, "step": 1077700 }, { "epoch": 4.390720336005394, "grad_norm": 3.919856309890747, "learning_rate": 0.0012383593396684373, "loss": 7.5687, "step": 1077800 }, { "epoch": 4.391127714028775, "grad_norm": 5.87698221206665, "learning_rate": 0.0012379319552255357, "loss": 7.6009, "step": 1077900 }, { "epoch": 4.391535092052156, "grad_norm": 10.432027816772461, "learning_rate": 0.0012375046208766098, "loss": 7.5815, "step": 1078000 }, { "epoch": 4.391535092052156, "eval_MaskedAccuracy": 0.5110336118225574, "eval_loss": 1.5920871496200562, "eval_runtime": 166.6891, "eval_samples_per_second": 380.805, "eval_steps_per_second": 1.488, "step": 1078000 }, { "epoch": 4.391942470075538, "grad_norm": 8.6183443069458, "learning_rate": 0.0012370773366385535, "loss": 7.5998, "step": 1078100 }, { "epoch": 4.392349848098919, "grad_norm": 7.473204612731934, "learning_rate": 0.001236650102528263, "loss": 7.5326, "step": 1078200 }, { "epoch": 4.392757226122301, "grad_norm": 13.099045753479004, "learning_rate": 0.0012362229185626274, "loss": 7.585, "step": 1078300 }, { "epoch": 4.393164604145682, "grad_norm": 4.473751068115234, "learning_rate": 0.0012357957847585362, "loss": 7.577, "step": 1078400 }, { "epoch": 4.393571982169064, "grad_norm": 12.312962532043457, "learning_rate": 0.0012353687011328754, "loss": 7.6053, "step": 1078500 }, { "epoch": 4.3939793601924455, "grad_norm": 4.935724258422852, "learning_rate": 0.0012349416677025296, "loss": 7.5757, "step": 1078600 }, { "epoch": 4.394386738215827, "grad_norm": 4.584693431854248, "learning_rate": 0.0012345146844843834, "loss": 7.5887, "step": 1078700 }, { "epoch": 4.394794116239209, "grad_norm": 3.618016481399536, "learning_rate": 0.0012340877514953165, "loss": 7.6074, "step": 1078800 }, { "epoch": 4.39520149426259, "grad_norm": 13.403838157653809, "learning_rate": 0.0012336608687522094, "loss": 7.5633, "step": 1078900 }, { "epoch": 4.395608872285972, "grad_norm": 7.863653659820557, "learning_rate": 0.0012332340362719372, "loss": 7.5719, "step": 1079000 }, { "epoch": 4.395608872285972, "eval_MaskedAccuracy": 0.5112417628732163, "eval_loss": 1.5996752977371216, "eval_runtime": 152.9429, "eval_samples_per_second": 415.031, "eval_steps_per_second": 1.622, "step": 1079000 }, { "epoch": 4.396016250309352, "grad_norm": 6.779383182525635, "learning_rate": 0.001232807254071375, "loss": 7.5974, "step": 1079100 }, { "epoch": 4.396423628332734, "grad_norm": 7.017134189605713, "learning_rate": 0.0012323805221673952, "loss": 7.5547, "step": 1079200 }, { "epoch": 4.396831006356115, "grad_norm": 6.765597343444824, "learning_rate": 0.0012319538405768718, "loss": 7.5843, "step": 1079300 }, { "epoch": 4.397238384379497, "grad_norm": 11.13979434967041, "learning_rate": 0.0012315272093166708, "loss": 7.5937, "step": 1079400 }, { "epoch": 4.397645762402878, "grad_norm": 12.424825668334961, "learning_rate": 0.001231100628403661, "loss": 7.5907, "step": 1079500 }, { "epoch": 4.39805314042626, "grad_norm": 5.490167140960693, "learning_rate": 0.0012306740978547078, "loss": 7.5813, "step": 1079600 }, { "epoch": 4.3984605184496415, "grad_norm": 15.808433532714844, "learning_rate": 0.0012302476176866723, "loss": 7.6011, "step": 1079700 }, { "epoch": 4.398867896473023, "grad_norm": 3.578862190246582, "learning_rate": 0.0012298211879164132, "loss": 7.5841, "step": 1079800 }, { "epoch": 4.3992752744964045, "grad_norm": 6.766655921936035, "learning_rate": 0.001229394808560797, "loss": 7.5828, "step": 1079900 }, { "epoch": 4.399682652519786, "grad_norm": 6.24049711227417, "learning_rate": 0.001228968479636674, "loss": 7.5878, "step": 1080000 }, { "epoch": 4.399682652519786, "eval_MaskedAccuracy": 0.5112307424352367, "eval_loss": 1.5944077968597412, "eval_runtime": 158.2273, "eval_samples_per_second": 401.17, "eval_steps_per_second": 1.567, "step": 1080000 }, { "epoch": 4.400090030543168, "grad_norm": 15.223663330078125, "learning_rate": 0.0012285422011609024, "loss": 7.5534, "step": 1080100 }, { "epoch": 4.400497408566549, "grad_norm": 11.505247116088867, "learning_rate": 0.0012281159731503352, "loss": 7.5738, "step": 1080200 }, { "epoch": 4.40090478658993, "grad_norm": 3.8178861141204834, "learning_rate": 0.001227689795621821, "loss": 7.5777, "step": 1080300 }, { "epoch": 4.401312164613311, "grad_norm": 6.735739231109619, "learning_rate": 0.0012272636685922108, "loss": 7.5766, "step": 1080400 }, { "epoch": 4.401719542636693, "grad_norm": 17.2441463470459, "learning_rate": 0.0012268375920783524, "loss": 7.5491, "step": 1080500 }, { "epoch": 4.402126920660074, "grad_norm": 6.725284099578857, "learning_rate": 0.0012264115660970895, "loss": 7.5856, "step": 1080600 }, { "epoch": 4.402534298683456, "grad_norm": 10.950495719909668, "learning_rate": 0.0012259855906652647, "loss": 7.5661, "step": 1080700 }, { "epoch": 4.4029416767068374, "grad_norm": 3.5515589714050293, "learning_rate": 0.0012255596657997209, "loss": 7.5844, "step": 1080800 }, { "epoch": 4.403349054730219, "grad_norm": 8.926765441894531, "learning_rate": 0.0012251337915172979, "loss": 7.5928, "step": 1080900 }, { "epoch": 4.4037564327536005, "grad_norm": 8.883869171142578, "learning_rate": 0.0012247079678348296, "loss": 7.5739, "step": 1081000 }, { "epoch": 4.4037564327536005, "eval_MaskedAccuracy": 0.5111937848838357, "eval_loss": 1.5912305116653442, "eval_runtime": 160.2994, "eval_samples_per_second": 395.984, "eval_steps_per_second": 1.547, "step": 1081000 }, { "epoch": 4.404163810776982, "grad_norm": 7.667311191558838, "learning_rate": 0.0012242821947691547, "loss": 7.5878, "step": 1081100 }, { "epoch": 4.404571188800364, "grad_norm": 5.586745262145996, "learning_rate": 0.0012238564723371045, "loss": 7.567, "step": 1081200 }, { "epoch": 4.404978566823745, "grad_norm": 5.00696325302124, "learning_rate": 0.0012234308005555075, "loss": 7.5621, "step": 1081300 }, { "epoch": 4.405385944847127, "grad_norm": 3.9975998401641846, "learning_rate": 0.001223005179441196, "loss": 7.5989, "step": 1081400 }, { "epoch": 4.405793322870508, "grad_norm": 10.185312271118164, "learning_rate": 0.0012225796090109965, "loss": 7.5867, "step": 1081500 }, { "epoch": 4.406200700893889, "grad_norm": 12.204691886901855, "learning_rate": 0.001222154089281734, "loss": 7.6038, "step": 1081600 }, { "epoch": 4.40660807891727, "grad_norm": 6.501101016998291, "learning_rate": 0.0012217286202702315, "loss": 7.5944, "step": 1081700 }, { "epoch": 4.407015456940652, "grad_norm": 9.930747985839844, "learning_rate": 0.0012213032019933099, "loss": 7.578, "step": 1081800 }, { "epoch": 4.407422834964033, "grad_norm": 9.826106071472168, "learning_rate": 0.00122087783446779, "loss": 7.5861, "step": 1081900 }, { "epoch": 4.407830212987415, "grad_norm": 6.22445011138916, "learning_rate": 0.001220452517710486, "loss": 7.5844, "step": 1082000 }, { "epoch": 4.407830212987415, "eval_MaskedAccuracy": 0.5105746082038909, "eval_loss": 1.5927674770355225, "eval_runtime": 152.06, "eval_samples_per_second": 417.441, "eval_steps_per_second": 1.631, "step": 1082000 }, { "epoch": 4.4082375910107965, "grad_norm": 11.709757804870605, "learning_rate": 0.0012200272517382154, "loss": 7.6239, "step": 1082100 }, { "epoch": 4.408644969034178, "grad_norm": 5.423645973205566, "learning_rate": 0.0012196020365677902, "loss": 7.5756, "step": 1082200 }, { "epoch": 4.4090523470575596, "grad_norm": 8.007949829101562, "learning_rate": 0.00121917687221602, "loss": 7.5693, "step": 1082300 }, { "epoch": 4.409459725080941, "grad_norm": 16.422317504882812, "learning_rate": 0.0012187517586997152, "loss": 7.6059, "step": 1082400 }, { "epoch": 4.409867103104323, "grad_norm": 7.6850762367248535, "learning_rate": 0.001218326696035683, "loss": 7.5635, "step": 1082500 }, { "epoch": 4.410274481127704, "grad_norm": 4.924514293670654, "learning_rate": 0.0012179016842407268, "loss": 7.5817, "step": 1082600 }, { "epoch": 4.410681859151086, "grad_norm": 3.909653902053833, "learning_rate": 0.0012174767233316518, "loss": 7.551, "step": 1082700 }, { "epoch": 4.411089237174467, "grad_norm": 10.372475624084473, "learning_rate": 0.0012170518133252594, "loss": 7.6428, "step": 1082800 }, { "epoch": 4.411496615197848, "grad_norm": 5.996923446655273, "learning_rate": 0.0012166269542383458, "loss": 7.58, "step": 1082900 }, { "epoch": 4.411903993221229, "grad_norm": 4.6754913330078125, "learning_rate": 0.0012162021460877098, "loss": 7.5846, "step": 1083000 }, { "epoch": 4.411903993221229, "eval_MaskedAccuracy": 0.5109740300946131, "eval_loss": 1.5946656465530396, "eval_runtime": 154.5479, "eval_samples_per_second": 410.72, "eval_steps_per_second": 1.605, "step": 1083000 }, { "epoch": 4.412311371244611, "grad_norm": 5.254715919494629, "learning_rate": 0.0012157773888901468, "loss": 7.5685, "step": 1083100 }, { "epoch": 4.4127187492679925, "grad_norm": 5.549961090087891, "learning_rate": 0.0012153526826624488, "loss": 7.5656, "step": 1083200 }, { "epoch": 4.413126127291374, "grad_norm": 5.245545864105225, "learning_rate": 0.0012149280274214068, "loss": 7.5923, "step": 1083300 }, { "epoch": 4.4135335053147555, "grad_norm": 4.338502883911133, "learning_rate": 0.001214503423183809, "loss": 7.5961, "step": 1083400 }, { "epoch": 4.413940883338137, "grad_norm": 4.973672866821289, "learning_rate": 0.001214078869966443, "loss": 7.5644, "step": 1083500 }, { "epoch": 4.414348261361519, "grad_norm": 5.538344860076904, "learning_rate": 0.001213654367786093, "loss": 7.6126, "step": 1083600 }, { "epoch": 4.4147556393849, "grad_norm": 11.5521240234375, "learning_rate": 0.001213229916659542, "loss": 7.6021, "step": 1083700 }, { "epoch": 4.415163017408282, "grad_norm": 7.949106216430664, "learning_rate": 0.0012128055166035717, "loss": 7.6021, "step": 1083800 }, { "epoch": 4.415570395431663, "grad_norm": 6.9478302001953125, "learning_rate": 0.0012123811676349604, "loss": 7.5997, "step": 1083900 }, { "epoch": 4.415977773455045, "grad_norm": 7.187282562255859, "learning_rate": 0.0012119568697704858, "loss": 7.6215, "step": 1084000 }, { "epoch": 4.415977773455045, "eval_MaskedAccuracy": 0.5106979206601311, "eval_loss": 1.6136956214904785, "eval_runtime": 159.2963, "eval_samples_per_second": 398.478, "eval_steps_per_second": 1.557, "step": 1084000 }, { "epoch": 4.416385151478425, "grad_norm": 8.625130653381348, "learning_rate": 0.0012115326230269213, "loss": 7.5802, "step": 1084100 }, { "epoch": 4.416792529501807, "grad_norm": 4.124234199523926, "learning_rate": 0.001211108427421041, "loss": 7.5743, "step": 1084200 }, { "epoch": 4.417199907525188, "grad_norm": 16.35880470275879, "learning_rate": 0.0012106842829696142, "loss": 7.5847, "step": 1084300 }, { "epoch": 4.41760728554857, "grad_norm": 7.555070400238037, "learning_rate": 0.0012102601896894093, "loss": 7.6228, "step": 1084400 }, { "epoch": 4.4180146635719515, "grad_norm": 8.547143936157227, "learning_rate": 0.0012098361475971948, "loss": 7.5763, "step": 1084500 }, { "epoch": 4.418422041595333, "grad_norm": 10.39609146118164, "learning_rate": 0.0012094121567097348, "loss": 7.5958, "step": 1084600 }, { "epoch": 4.418829419618715, "grad_norm": 4.880538463592529, "learning_rate": 0.0012089882170437894, "loss": 7.6122, "step": 1084700 }, { "epoch": 4.419236797642096, "grad_norm": 5.7425665855407715, "learning_rate": 0.001208564328616122, "loss": 7.5848, "step": 1084800 }, { "epoch": 4.419644175665478, "grad_norm": 19.172582626342773, "learning_rate": 0.0012081404914434924, "loss": 7.5912, "step": 1084900 }, { "epoch": 4.420051553688859, "grad_norm": 9.863676071166992, "learning_rate": 0.0012077167055426544, "loss": 7.5921, "step": 1085000 }, { "epoch": 4.420051553688859, "eval_MaskedAccuracy": 0.5111017514881756, "eval_loss": 1.5961647033691406, "eval_runtime": 151.5831, "eval_samples_per_second": 418.754, "eval_steps_per_second": 1.636, "step": 1085000 }, { "epoch": 4.420458931712241, "grad_norm": 11.566855430603027, "learning_rate": 0.0012072929709303639, "loss": 7.5695, "step": 1085100 }, { "epoch": 4.420866309735622, "grad_norm": 16.011913299560547, "learning_rate": 0.0012068692876233723, "loss": 7.6089, "step": 1085200 }, { "epoch": 4.421273687759003, "grad_norm": 11.74954891204834, "learning_rate": 0.00120644565563843, "loss": 7.5623, "step": 1085300 }, { "epoch": 4.421681065782384, "grad_norm": 12.427276611328125, "learning_rate": 0.0012060220749922873, "loss": 7.565, "step": 1085400 }, { "epoch": 4.422088443805766, "grad_norm": 4.826947212219238, "learning_rate": 0.001205598545701689, "loss": 7.5716, "step": 1085500 }, { "epoch": 4.4224958218291475, "grad_norm": 5.644373893737793, "learning_rate": 0.001205175067783379, "loss": 7.5933, "step": 1085600 }, { "epoch": 4.422903199852529, "grad_norm": 6.411406517028809, "learning_rate": 0.0012047516412541015, "loss": 7.5879, "step": 1085700 }, { "epoch": 4.4233105778759105, "grad_norm": 6.444009304046631, "learning_rate": 0.0012043282661305956, "loss": 7.5915, "step": 1085800 }, { "epoch": 4.423717955899292, "grad_norm": 8.614859580993652, "learning_rate": 0.0012039049424295996, "loss": 7.5614, "step": 1085900 }, { "epoch": 4.424125333922674, "grad_norm": 4.692538261413574, "learning_rate": 0.001203481670167852, "loss": 7.5842, "step": 1086000 }, { "epoch": 4.424125333922674, "eval_MaskedAccuracy": 0.511321580160768, "eval_loss": 1.5966092348098755, "eval_runtime": 151.3759, "eval_samples_per_second": 419.327, "eval_steps_per_second": 1.638, "step": 1086000 }, { "epoch": 4.424532711946055, "grad_norm": 2.622321844100952, "learning_rate": 0.0012030584493620826, "loss": 7.6025, "step": 1086100 }, { "epoch": 4.424940089969437, "grad_norm": 14.408987998962402, "learning_rate": 0.0012026352800290267, "loss": 7.5859, "step": 1086200 }, { "epoch": 4.425347467992818, "grad_norm": 10.01328182220459, "learning_rate": 0.0012022121621854133, "loss": 7.5963, "step": 1086300 }, { "epoch": 4.4257548460162, "grad_norm": 3.0204148292541504, "learning_rate": 0.0012017890958479715, "loss": 7.5842, "step": 1086400 }, { "epoch": 4.426162224039581, "grad_norm": 4.547139644622803, "learning_rate": 0.0012013660810334264, "loss": 7.6014, "step": 1086500 }, { "epoch": 4.426569602062962, "grad_norm": 7.615049362182617, "learning_rate": 0.0012009431177585042, "loss": 7.5841, "step": 1086600 }, { "epoch": 4.426976980086343, "grad_norm": 15.627482414245605, "learning_rate": 0.0012005202060399226, "loss": 7.5759, "step": 1086700 }, { "epoch": 4.427384358109725, "grad_norm": 3.7921981811523438, "learning_rate": 0.0012000973458944047, "loss": 7.5868, "step": 1086800 }, { "epoch": 4.4277917361331065, "grad_norm": 9.941889762878418, "learning_rate": 0.0011996745373386677, "loss": 7.5671, "step": 1086900 }, { "epoch": 4.428199114156488, "grad_norm": 10.985698699951172, "learning_rate": 0.0011992517803894277, "loss": 7.5999, "step": 1087000 }, { "epoch": 4.428199114156488, "eval_MaskedAccuracy": 0.5106609983064447, "eval_loss": 1.6057345867156982, "eval_runtime": 153.6328, "eval_samples_per_second": 413.167, "eval_steps_per_second": 1.614, "step": 1087000 }, { "epoch": 4.42860649217987, "grad_norm": 3.85752272605896, "learning_rate": 0.001198829075063398, "loss": 7.5876, "step": 1087100 }, { "epoch": 4.429013870203251, "grad_norm": 3.826331377029419, "learning_rate": 0.001198406421377292, "loss": 7.5646, "step": 1087200 }, { "epoch": 4.429421248226633, "grad_norm": 5.111105442047119, "learning_rate": 0.0011979838193478171, "loss": 7.6104, "step": 1087300 }, { "epoch": 4.429828626250014, "grad_norm": 6.366291046142578, "learning_rate": 0.0011975612689916837, "loss": 7.5915, "step": 1087400 }, { "epoch": 4.430236004273396, "grad_norm": 8.502144813537598, "learning_rate": 0.0011971387703255951, "loss": 7.5837, "step": 1087500 }, { "epoch": 4.430643382296777, "grad_norm": 3.6586952209472656, "learning_rate": 0.0011967163233662554, "loss": 7.5898, "step": 1087600 }, { "epoch": 4.431050760320159, "grad_norm": 2.7903194427490234, "learning_rate": 0.0011962939281303674, "loss": 7.5713, "step": 1087700 }, { "epoch": 4.43145813834354, "grad_norm": 6.929922103881836, "learning_rate": 0.0011958715846346305, "loss": 7.592, "step": 1087800 }, { "epoch": 4.431865516366921, "grad_norm": 5.419848442077637, "learning_rate": 0.001195449292895742, "loss": 7.5619, "step": 1087900 }, { "epoch": 4.4322728943903025, "grad_norm": 5.424269676208496, "learning_rate": 0.0011950270529303968, "loss": 7.5614, "step": 1088000 }, { "epoch": 4.4322728943903025, "eval_MaskedAccuracy": 0.5111898012969128, "eval_loss": 1.5937535762786865, "eval_runtime": 153.6066, "eval_samples_per_second": 413.237, "eval_steps_per_second": 1.615, "step": 1088000 }, { "epoch": 4.432680272413684, "grad_norm": 3.4336557388305664, "learning_rate": 0.0011946048647552868, "loss": 7.5644, "step": 1088100 }, { "epoch": 4.4330876504370655, "grad_norm": 6.4227614402771, "learning_rate": 0.0011941827283871052, "loss": 7.5607, "step": 1088200 }, { "epoch": 4.433495028460447, "grad_norm": 3.375950813293457, "learning_rate": 0.0011937606438425423, "loss": 7.5485, "step": 1088300 }, { "epoch": 4.433902406483829, "grad_norm": 6.799926280975342, "learning_rate": 0.001193338611138283, "loss": 7.5628, "step": 1088400 }, { "epoch": 4.43430978450721, "grad_norm": 5.494699001312256, "learning_rate": 0.0011929166302910145, "loss": 7.5864, "step": 1088500 }, { "epoch": 4.434717162530592, "grad_norm": 6.4925456047058105, "learning_rate": 0.0011924947013174185, "loss": 7.545, "step": 1088600 }, { "epoch": 4.435124540553973, "grad_norm": 6.94785213470459, "learning_rate": 0.0011920728242341759, "loss": 7.5698, "step": 1088700 }, { "epoch": 4.435531918577355, "grad_norm": 12.0343599319458, "learning_rate": 0.0011916509990579683, "loss": 7.6076, "step": 1088800 }, { "epoch": 4.435939296600736, "grad_norm": 4.20711612701416, "learning_rate": 0.0011912292258054705, "loss": 7.5962, "step": 1088900 }, { "epoch": 4.436346674624118, "grad_norm": 4.755501747131348, "learning_rate": 0.0011908075044933582, "loss": 7.5634, "step": 1089000 }, { "epoch": 4.436346674624118, "eval_MaskedAccuracy": 0.5104738030339839, "eval_loss": 1.5926874876022339, "eval_runtime": 154.5136, "eval_samples_per_second": 410.812, "eval_steps_per_second": 1.605, "step": 1089000 }, { "epoch": 4.4367540526474984, "grad_norm": 8.358015060424805, "learning_rate": 0.001190385835138303, "loss": 7.5721, "step": 1089100 }, { "epoch": 4.43716143067088, "grad_norm": 6.558157920837402, "learning_rate": 0.0011899642177569758, "loss": 7.5767, "step": 1089200 }, { "epoch": 4.4375688086942615, "grad_norm": 4.260998249053955, "learning_rate": 0.0011895426523660475, "loss": 7.578, "step": 1089300 }, { "epoch": 4.437976186717643, "grad_norm": 6.9902119636535645, "learning_rate": 0.0011891211389821852, "loss": 7.5777, "step": 1089400 }, { "epoch": 4.438383564741025, "grad_norm": 5.035665512084961, "learning_rate": 0.0011886996776220512, "loss": 7.5873, "step": 1089500 }, { "epoch": 4.438790942764406, "grad_norm": 4.066616535186768, "learning_rate": 0.0011882782683023104, "loss": 7.5808, "step": 1089600 }, { "epoch": 4.439198320787788, "grad_norm": 5.067715644836426, "learning_rate": 0.0011878569110396193, "loss": 7.5963, "step": 1089700 }, { "epoch": 4.439605698811169, "grad_norm": 9.498620986938477, "learning_rate": 0.0011874356058506406, "loss": 7.5426, "step": 1089800 }, { "epoch": 4.440013076834551, "grad_norm": 4.712803363800049, "learning_rate": 0.001187014352752028, "loss": 7.5929, "step": 1089900 }, { "epoch": 4.440420454857932, "grad_norm": 11.693345069885254, "learning_rate": 0.0011865931517604379, "loss": 7.5669, "step": 1090000 }, { "epoch": 4.440420454857932, "eval_MaskedAccuracy": 0.5109665043950206, "eval_loss": 1.5994694232940674, "eval_runtime": 151.7163, "eval_samples_per_second": 418.386, "eval_steps_per_second": 1.635, "step": 1090000 }, { "epoch": 4.440827832881314, "grad_norm": 13.621916770935059, "learning_rate": 0.0011861720028925205, "loss": 7.6227, "step": 1090100 }, { "epoch": 4.441235210904695, "grad_norm": 11.77079963684082, "learning_rate": 0.0011857509061649278, "loss": 7.5836, "step": 1090200 }, { "epoch": 4.441642588928076, "grad_norm": 6.685378551483154, "learning_rate": 0.001185329861594309, "loss": 7.5556, "step": 1090300 }, { "epoch": 4.4420499669514575, "grad_norm": 4.267038822174072, "learning_rate": 0.0011849088691973082, "loss": 7.5452, "step": 1090400 }, { "epoch": 4.442457344974839, "grad_norm": 13.093615531921387, "learning_rate": 0.0011844879289905723, "loss": 7.6044, "step": 1090500 }, { "epoch": 4.4428647229982206, "grad_norm": 5.280068874359131, "learning_rate": 0.00118406704099074, "loss": 7.5898, "step": 1090600 }, { "epoch": 4.443272101021602, "grad_norm": 11.806068420410156, "learning_rate": 0.0011836462052144503, "loss": 7.5706, "step": 1090700 }, { "epoch": 4.443679479044984, "grad_norm": 6.73701286315918, "learning_rate": 0.0011832254216783457, "loss": 7.5986, "step": 1090800 }, { "epoch": 4.444086857068365, "grad_norm": 4.740633010864258, "learning_rate": 0.0011828046903990582, "loss": 7.6002, "step": 1090900 }, { "epoch": 4.444494235091747, "grad_norm": 3.2756428718566895, "learning_rate": 0.0011823840113932237, "loss": 7.6033, "step": 1091000 }, { "epoch": 4.444494235091747, "eval_MaskedAccuracy": 0.5105687841075114, "eval_loss": 1.608370304107666, "eval_runtime": 155.7681, "eval_samples_per_second": 407.503, "eval_steps_per_second": 1.592, "step": 1091000 }, { "epoch": 4.444901613115128, "grad_norm": 4.4155378341674805, "learning_rate": 0.0011819633846774718, "loss": 7.5621, "step": 1091100 }, { "epoch": 4.44530899113851, "grad_norm": 10.781631469726562, "learning_rate": 0.0011815428102684331, "loss": 7.5782, "step": 1091200 }, { "epoch": 4.445716369161891, "grad_norm": 8.093812942504883, "learning_rate": 0.0011811222881827363, "loss": 7.5588, "step": 1091300 }, { "epoch": 4.446123747185273, "grad_norm": 12.548389434814453, "learning_rate": 0.0011807018184370065, "loss": 7.5939, "step": 1091400 }, { "epoch": 4.446531125208654, "grad_norm": 7.770724773406982, "learning_rate": 0.0011802814010478662, "loss": 7.5897, "step": 1091500 }, { "epoch": 4.446938503232035, "grad_norm": 16.370756149291992, "learning_rate": 0.0011798610360319386, "loss": 7.5856, "step": 1091600 }, { "epoch": 4.4473458812554165, "grad_norm": 10.37841510772705, "learning_rate": 0.001179440723405841, "loss": 7.5576, "step": 1091700 }, { "epoch": 4.447753259278798, "grad_norm": 3.476844310760498, "learning_rate": 0.001179020463186191, "loss": 7.5757, "step": 1091800 }, { "epoch": 4.44816063730218, "grad_norm": 4.5110602378845215, "learning_rate": 0.0011786002553896033, "loss": 7.5616, "step": 1091900 }, { "epoch": 4.448568015325561, "grad_norm": 4.60313081741333, "learning_rate": 0.0011781801000326922, "loss": 7.5999, "step": 1092000 }, { "epoch": 4.448568015325561, "eval_MaskedAccuracy": 0.5106858458393869, "eval_loss": 1.5967841148376465, "eval_runtime": 154.5113, "eval_samples_per_second": 410.818, "eval_steps_per_second": 1.605, "step": 1092000 }, { "epoch": 4.448975393348943, "grad_norm": 4.803823947906494, "learning_rate": 0.0011777599971320677, "loss": 7.5799, "step": 1092100 }, { "epoch": 4.449382771372324, "grad_norm": 8.219694137573242, "learning_rate": 0.0011773399467043405, "loss": 7.6024, "step": 1092200 }, { "epoch": 4.449790149395706, "grad_norm": 5.870816707611084, "learning_rate": 0.0011769199487661158, "loss": 7.5963, "step": 1092300 }, { "epoch": 4.450197527419087, "grad_norm": 6.064818382263184, "learning_rate": 0.0011765000033339983, "loss": 7.585, "step": 1092400 }, { "epoch": 4.450604905442469, "grad_norm": 6.240481853485107, "learning_rate": 0.0011760801104245902, "loss": 7.5826, "step": 1092500 }, { "epoch": 4.45101228346585, "grad_norm": 6.479210376739502, "learning_rate": 0.0011756602700544927, "loss": 7.5915, "step": 1092600 }, { "epoch": 4.451419661489232, "grad_norm": 4.48755407333374, "learning_rate": 0.0011752404822403034, "loss": 7.5721, "step": 1092700 }, { "epoch": 4.451827039512613, "grad_norm": 6.282494068145752, "learning_rate": 0.001174820746998621, "loss": 7.5633, "step": 1092800 }, { "epoch": 4.452234417535994, "grad_norm": 7.3973798751831055, "learning_rate": 0.0011744010643460392, "loss": 7.5684, "step": 1092900 }, { "epoch": 4.452641795559376, "grad_norm": 10.869181632995605, "learning_rate": 0.0011739814342991495, "loss": 7.5983, "step": 1093000 }, { "epoch": 4.452641795559376, "eval_MaskedAccuracy": 0.5115278303939732, "eval_loss": 1.5952904224395752, "eval_runtime": 161.1135, "eval_samples_per_second": 393.983, "eval_steps_per_second": 1.539, "step": 1093000 }, { "epoch": 4.453049173582757, "grad_norm": 9.483210563659668, "learning_rate": 0.0011735618568745423, "loss": 7.5681, "step": 1093100 }, { "epoch": 4.453456551606139, "grad_norm": 22.26955223083496, "learning_rate": 0.0011731423320888049, "loss": 7.5787, "step": 1093200 }, { "epoch": 4.45386392962952, "grad_norm": 4.262212753295898, "learning_rate": 0.0011727228599585258, "loss": 7.5832, "step": 1093300 }, { "epoch": 4.454271307652902, "grad_norm": 12.767822265625, "learning_rate": 0.001172303440500288, "loss": 7.5825, "step": 1093400 }, { "epoch": 4.454678685676283, "grad_norm": 4.370329856872559, "learning_rate": 0.0011718840737306728, "loss": 7.5992, "step": 1093500 }, { "epoch": 4.455086063699665, "grad_norm": 10.459832191467285, "learning_rate": 0.0011714647596662608, "loss": 7.5409, "step": 1093600 }, { "epoch": 4.455493441723046, "grad_norm": 5.395259857177734, "learning_rate": 0.0011710454983236286, "loss": 7.5962, "step": 1093700 }, { "epoch": 4.455900819746428, "grad_norm": 13.553374290466309, "learning_rate": 0.0011706262897193522, "loss": 7.5825, "step": 1093800 }, { "epoch": 4.456308197769809, "grad_norm": 12.136026382446289, "learning_rate": 0.0011702071338700055, "loss": 7.6182, "step": 1093900 }, { "epoch": 4.456715575793191, "grad_norm": 8.024489402770996, "learning_rate": 0.0011697880307921598, "loss": 7.5562, "step": 1094000 }, { "epoch": 4.456715575793191, "eval_MaskedAccuracy": 0.5102432589090533, "eval_loss": 1.600201964378357, "eval_runtime": 152.8216, "eval_samples_per_second": 415.36, "eval_steps_per_second": 1.623, "step": 1094000 }, { "epoch": 4.4571229538165715, "grad_norm": 6.045042514801025, "learning_rate": 0.0011693689805023838, "loss": 7.5699, "step": 1094100 }, { "epoch": 4.457530331839953, "grad_norm": 8.82064437866211, "learning_rate": 0.0011689499830172458, "loss": 7.5747, "step": 1094200 }, { "epoch": 4.457937709863335, "grad_norm": 14.469527244567871, "learning_rate": 0.001168531038353309, "loss": 7.6117, "step": 1094300 }, { "epoch": 4.458345087886716, "grad_norm": 7.1125922203063965, "learning_rate": 0.0011681121465271403, "loss": 7.5571, "step": 1094400 }, { "epoch": 4.458752465910098, "grad_norm": 7.602787017822266, "learning_rate": 0.0011676933075552979, "loss": 7.6027, "step": 1094500 }, { "epoch": 4.459159843933479, "grad_norm": 6.530357360839844, "learning_rate": 0.001167274521454341, "loss": 7.5878, "step": 1094600 }, { "epoch": 4.459567221956861, "grad_norm": 4.430764675140381, "learning_rate": 0.0011668557882408286, "loss": 7.5595, "step": 1094700 }, { "epoch": 4.459974599980242, "grad_norm": 5.939826011657715, "learning_rate": 0.0011664371079313123, "loss": 7.5885, "step": 1094800 }, { "epoch": 4.460381978003624, "grad_norm": 9.3687162399292, "learning_rate": 0.0011660184805423464, "loss": 7.562, "step": 1094900 }, { "epoch": 4.460789356027005, "grad_norm": 10.468751907348633, "learning_rate": 0.0011655999060904807, "loss": 7.5865, "step": 1095000 }, { "epoch": 4.460789356027005, "eval_MaskedAccuracy": 0.5109743135864614, "eval_loss": 1.5934405326843262, "eval_runtime": 169.5334, "eval_samples_per_second": 374.416, "eval_steps_per_second": 1.463, "step": 1095000 }, { "epoch": 4.461196734050387, "grad_norm": 8.076781272888184, "learning_rate": 0.0011651813845922653, "loss": 7.5814, "step": 1095100 }, { "epoch": 4.461604112073768, "grad_norm": 6.898649215698242, "learning_rate": 0.0011647629160642461, "loss": 7.5781, "step": 1095200 }, { "epoch": 4.462011490097149, "grad_norm": 10.0347318649292, "learning_rate": 0.0011643445005229671, "loss": 7.5619, "step": 1095300 }, { "epoch": 4.462418868120531, "grad_norm": 3.8157432079315186, "learning_rate": 0.0011639261379849698, "loss": 7.5844, "step": 1095400 }, { "epoch": 4.462826246143912, "grad_norm": 8.751187324523926, "learning_rate": 0.0011635078284667946, "loss": 7.5724, "step": 1095500 }, { "epoch": 4.463233624167294, "grad_norm": 8.541107177734375, "learning_rate": 0.0011630895719849797, "loss": 7.56, "step": 1095600 }, { "epoch": 4.463641002190675, "grad_norm": 11.511309623718262, "learning_rate": 0.0011626713685560608, "loss": 7.6009, "step": 1095700 }, { "epoch": 4.464048380214057, "grad_norm": 4.9507551193237305, "learning_rate": 0.001162253218196572, "loss": 7.5872, "step": 1095800 }, { "epoch": 4.464455758237438, "grad_norm": 5.776829719543457, "learning_rate": 0.0011618351209230458, "loss": 7.5693, "step": 1095900 }, { "epoch": 4.46486313626082, "grad_norm": 4.541170597076416, "learning_rate": 0.00116141707675201, "loss": 7.5666, "step": 1096000 }, { "epoch": 4.46486313626082, "eval_MaskedAccuracy": 0.5106916249196752, "eval_loss": 1.5983508825302124, "eval_runtime": 158.9889, "eval_samples_per_second": 399.248, "eval_steps_per_second": 1.56, "step": 1096000 }, { "epoch": 4.465270514284201, "grad_norm": 6.547701835632324, "learning_rate": 0.0011609990856999933, "loss": 7.5948, "step": 1096100 }, { "epoch": 4.465677892307583, "grad_norm": 8.93804931640625, "learning_rate": 0.001160581147783522, "loss": 7.618, "step": 1096200 }, { "epoch": 4.466085270330964, "grad_norm": 7.38311767578125, "learning_rate": 0.0011601632630191178, "loss": 7.5622, "step": 1096300 }, { "epoch": 4.466492648354346, "grad_norm": 12.070695877075195, "learning_rate": 0.0011597454314233031, "loss": 7.5446, "step": 1096400 }, { "epoch": 4.466900026377727, "grad_norm": 5.291610240936279, "learning_rate": 0.001159327653012594, "loss": 7.5962, "step": 1096500 }, { "epoch": 4.467307404401108, "grad_norm": 8.69069766998291, "learning_rate": 0.0011589099278035118, "loss": 7.5772, "step": 1096600 }, { "epoch": 4.46771478242449, "grad_norm": 4.978950023651123, "learning_rate": 0.0011584922558125702, "loss": 7.5725, "step": 1096700 }, { "epoch": 4.468122160447871, "grad_norm": 9.29713249206543, "learning_rate": 0.0011580746370562825, "loss": 7.5776, "step": 1096800 }, { "epoch": 4.468529538471253, "grad_norm": 7.964879989624023, "learning_rate": 0.0011576570715511553, "loss": 7.5843, "step": 1096900 }, { "epoch": 4.468936916494634, "grad_norm": 5.121318340301514, "learning_rate": 0.0011572395593137006, "loss": 7.605, "step": 1097000 }, { "epoch": 4.468936916494634, "eval_MaskedAccuracy": 0.5107859315154086, "eval_loss": 1.599096655845642, "eval_runtime": 156.7402, "eval_samples_per_second": 404.976, "eval_steps_per_second": 1.582, "step": 1097000 }, { "epoch": 4.469344294518016, "grad_norm": 8.140433311462402, "learning_rate": 0.0011568221003604255, "loss": 7.5799, "step": 1097100 }, { "epoch": 4.469751672541397, "grad_norm": 6.903355121612549, "learning_rate": 0.001156404694707832, "loss": 7.5554, "step": 1097200 }, { "epoch": 4.470159050564779, "grad_norm": 6.56154203414917, "learning_rate": 0.0011559873423724251, "loss": 7.581, "step": 1097300 }, { "epoch": 4.47056642858816, "grad_norm": 7.409020900726318, "learning_rate": 0.0011555700433707032, "loss": 7.562, "step": 1097400 }, { "epoch": 4.470973806611542, "grad_norm": 4.009287357330322, "learning_rate": 0.0011551527977191648, "loss": 7.5889, "step": 1097500 }, { "epoch": 4.471381184634923, "grad_norm": 8.088984489440918, "learning_rate": 0.001154735605434305, "loss": 7.6211, "step": 1097600 }, { "epoch": 4.471788562658305, "grad_norm": 5.861762523651123, "learning_rate": 0.001154318466532618, "loss": 7.6004, "step": 1097700 }, { "epoch": 4.4721959406816865, "grad_norm": 14.661588668823242, "learning_rate": 0.0011539013810305957, "loss": 7.5999, "step": 1097800 }, { "epoch": 4.472603318705067, "grad_norm": 10.20081615447998, "learning_rate": 0.0011534843489447292, "loss": 7.5885, "step": 1097900 }, { "epoch": 4.473010696728449, "grad_norm": 11.643823623657227, "learning_rate": 0.0011530673702915043, "loss": 7.5972, "step": 1098000 }, { "epoch": 4.473010696728449, "eval_MaskedAccuracy": 0.5111975728571545, "eval_loss": 1.5915279388427734, "eval_runtime": 153.0242, "eval_samples_per_second": 414.81, "eval_steps_per_second": 1.621, "step": 1098000 }, { "epoch": 4.47341807475183, "grad_norm": 7.542632102966309, "learning_rate": 0.0011526504450874073, "loss": 7.6034, "step": 1098100 }, { "epoch": 4.473825452775212, "grad_norm": 4.5746049880981445, "learning_rate": 0.0011522335733489208, "loss": 7.6155, "step": 1098200 }, { "epoch": 4.474232830798593, "grad_norm": 11.481820106506348, "learning_rate": 0.001151816755092527, "loss": 7.5799, "step": 1098300 }, { "epoch": 4.474640208821975, "grad_norm": 6.03773832321167, "learning_rate": 0.0011513999903347044, "loss": 7.6035, "step": 1098400 }, { "epoch": 4.475047586845356, "grad_norm": 5.938760757446289, "learning_rate": 0.0011509832790919277, "loss": 7.5712, "step": 1098500 }, { "epoch": 4.475454964868738, "grad_norm": 8.458950996398926, "learning_rate": 0.0011505666213806741, "loss": 7.5992, "step": 1098600 }, { "epoch": 4.475862342892119, "grad_norm": 3.898512840270996, "learning_rate": 0.001150150017217418, "loss": 7.5664, "step": 1098700 }, { "epoch": 4.476269720915501, "grad_norm": 7.1401519775390625, "learning_rate": 0.0011497334666186272, "loss": 7.5925, "step": 1098800 }, { "epoch": 4.476677098938882, "grad_norm": 7.631970405578613, "learning_rate": 0.0011493169696007693, "loss": 7.5772, "step": 1098900 }, { "epoch": 4.477084476962264, "grad_norm": 5.312163829803467, "learning_rate": 0.0011489005261803142, "loss": 7.5905, "step": 1099000 }, { "epoch": 4.477084476962264, "eval_MaskedAccuracy": 0.5109278042613278, "eval_loss": 1.5994435548782349, "eval_runtime": 153.4923, "eval_samples_per_second": 413.545, "eval_steps_per_second": 1.616, "step": 1099000 }, { "epoch": 4.477491854985645, "grad_norm": 4.4299445152282715, "learning_rate": 0.0011484841363737235, "loss": 7.5938, "step": 1099100 }, { "epoch": 4.477899233009026, "grad_norm": 8.612445831298828, "learning_rate": 0.00114806780019746, "loss": 7.6022, "step": 1099200 }, { "epoch": 4.478306611032408, "grad_norm": 8.290127754211426, "learning_rate": 0.0011476515176679846, "loss": 7.5825, "step": 1099300 }, { "epoch": 4.478713989055789, "grad_norm": 8.893336296081543, "learning_rate": 0.0011472352888017546, "loss": 7.5821, "step": 1099400 }, { "epoch": 4.479121367079171, "grad_norm": 10.074488639831543, "learning_rate": 0.0011468191136152239, "loss": 7.607, "step": 1099500 }, { "epoch": 4.479528745102552, "grad_norm": 5.985406398773193, "learning_rate": 0.001146402992124848, "loss": 7.5869, "step": 1099600 }, { "epoch": 4.479936123125934, "grad_norm": 13.412552833557129, "learning_rate": 0.0011459869243470784, "loss": 7.5837, "step": 1099700 }, { "epoch": 4.480343501149315, "grad_norm": 7.82590913772583, "learning_rate": 0.0011455709102983645, "loss": 7.6066, "step": 1099800 }, { "epoch": 4.480750879172697, "grad_norm": 5.195190906524658, "learning_rate": 0.0011451549499951529, "loss": 7.5673, "step": 1099900 }, { "epoch": 4.481158257196078, "grad_norm": 5.314785957336426, "learning_rate": 0.0011447390434538903, "loss": 7.5783, "step": 1100000 }, { "epoch": 4.481158257196078, "eval_MaskedAccuracy": 0.5108102540832667, "eval_loss": 1.5934927463531494, "eval_runtime": 169.4816, "eval_samples_per_second": 374.53, "eval_steps_per_second": 1.463, "step": 1100000 }, { "epoch": 4.48156563521946, "grad_norm": 7.085400581359863, "learning_rate": 0.001144323190691018, "loss": 7.5407, "step": 1100100 }, { "epoch": 4.4819730132428415, "grad_norm": 11.708148002624512, "learning_rate": 0.001143907391722975, "loss": 7.5778, "step": 1100200 }, { "epoch": 4.482380391266222, "grad_norm": 4.760464191436768, "learning_rate": 0.0011434916465662048, "loss": 7.607, "step": 1100300 }, { "epoch": 4.482787769289604, "grad_norm": 4.3901567459106445, "learning_rate": 0.0011430759552371404, "loss": 7.5536, "step": 1100400 }, { "epoch": 4.483195147312985, "grad_norm": 8.597267150878906, "learning_rate": 0.0011426603177522182, "loss": 7.5832, "step": 1100500 }, { "epoch": 4.483602525336367, "grad_norm": 7.280422210693359, "learning_rate": 0.00114224473412787, "loss": 7.5808, "step": 1100600 }, { "epoch": 4.484009903359748, "grad_norm": 3.35546612739563, "learning_rate": 0.0011418292043805245, "loss": 7.5853, "step": 1100700 }, { "epoch": 4.48441728138313, "grad_norm": 9.373496055603027, "learning_rate": 0.0011414137285266136, "loss": 7.5487, "step": 1100800 }, { "epoch": 4.484824659406511, "grad_norm": 6.74008321762085, "learning_rate": 0.0011409983065825626, "loss": 7.5769, "step": 1100900 }, { "epoch": 4.485232037429893, "grad_norm": 5.951924800872803, "learning_rate": 0.001140582938564791, "loss": 7.555, "step": 1101000 }, { "epoch": 4.485232037429893, "eval_MaskedAccuracy": 0.5108742536780447, "eval_loss": 1.6083155870437622, "eval_runtime": 159.5517, "eval_samples_per_second": 397.84, "eval_steps_per_second": 1.554, "step": 1101000 }, { "epoch": 4.485639415453274, "grad_norm": 3.0179784297943115, "learning_rate": 0.0011401676244897241, "loss": 7.5667, "step": 1101100 }, { "epoch": 4.486046793476656, "grad_norm": 4.752579212188721, "learning_rate": 0.0011397523643737807, "loss": 7.5688, "step": 1101200 }, { "epoch": 4.4864541715000374, "grad_norm": 10.299113273620605, "learning_rate": 0.0011393371582333763, "loss": 7.583, "step": 1101300 }, { "epoch": 4.486861549523419, "grad_norm": 5.598707675933838, "learning_rate": 0.0011389220060849282, "loss": 7.6029, "step": 1101400 }, { "epoch": 4.4872689275468005, "grad_norm": 4.877350330352783, "learning_rate": 0.0011385069079448515, "loss": 7.5969, "step": 1101500 }, { "epoch": 4.487676305570181, "grad_norm": 11.625675201416016, "learning_rate": 0.0011380918638295519, "loss": 7.5557, "step": 1101600 }, { "epoch": 4.488083683593563, "grad_norm": 5.352262496948242, "learning_rate": 0.0011376768737554454, "loss": 7.5396, "step": 1101700 }, { "epoch": 4.488491061616944, "grad_norm": 4.063558101654053, "learning_rate": 0.0011372619377389335, "loss": 7.5678, "step": 1101800 }, { "epoch": 4.488898439640326, "grad_norm": 6.834637641906738, "learning_rate": 0.0011368470557964242, "loss": 7.5794, "step": 1101900 }, { "epoch": 4.489305817663707, "grad_norm": 13.786849975585938, "learning_rate": 0.0011364322279443155, "loss": 7.5751, "step": 1102000 }, { "epoch": 4.489305817663707, "eval_MaskedAccuracy": 0.5115187416144197, "eval_loss": 1.5978195667266846, "eval_runtime": 157.5184, "eval_samples_per_second": 402.975, "eval_steps_per_second": 1.574, "step": 1102000 }, { "epoch": 4.489713195687089, "grad_norm": 7.651937484741211, "learning_rate": 0.0011360174541990118, "loss": 7.5754, "step": 1102100 }, { "epoch": 4.49012057371047, "grad_norm": 5.232959270477295, "learning_rate": 0.00113560273457691, "loss": 7.5747, "step": 1102200 }, { "epoch": 4.490527951733852, "grad_norm": 8.001144409179688, "learning_rate": 0.0011351880690944067, "loss": 7.5895, "step": 1102300 }, { "epoch": 4.490935329757233, "grad_norm": 6.840724468231201, "learning_rate": 0.001134773457767896, "loss": 7.5791, "step": 1102400 }, { "epoch": 4.491342707780615, "grad_norm": 7.303138256072998, "learning_rate": 0.0011343589006137683, "loss": 7.5895, "step": 1102500 }, { "epoch": 4.4917500858039965, "grad_norm": 7.090054512023926, "learning_rate": 0.0011339443976484118, "loss": 7.5712, "step": 1102600 }, { "epoch": 4.492157463827378, "grad_norm": 6.0141472816467285, "learning_rate": 0.0011335299488882196, "loss": 7.5956, "step": 1102700 }, { "epoch": 4.4925648418507595, "grad_norm": 5.018124580383301, "learning_rate": 0.0011331155543495713, "loss": 7.5981, "step": 1102800 }, { "epoch": 4.49297221987414, "grad_norm": 7.8402099609375, "learning_rate": 0.0011327012140488523, "loss": 7.5694, "step": 1102900 }, { "epoch": 4.493379597897522, "grad_norm": 5.013631343841553, "learning_rate": 0.0011322869280024441, "loss": 7.5893, "step": 1103000 }, { "epoch": 4.493379597897522, "eval_MaskedAccuracy": 0.5105309000422119, "eval_loss": 1.5985708236694336, "eval_runtime": 153.7186, "eval_samples_per_second": 412.936, "eval_steps_per_second": 1.613, "step": 1103000 }, { "epoch": 4.493786975920903, "grad_norm": 7.964849472045898, "learning_rate": 0.0011318726962267253, "loss": 7.5889, "step": 1103100 }, { "epoch": 4.494194353944285, "grad_norm": 10.593032836914062, "learning_rate": 0.0011314585187380726, "loss": 7.5907, "step": 1103200 }, { "epoch": 4.494601731967666, "grad_norm": 6.0983734130859375, "learning_rate": 0.001131044395552862, "loss": 7.5453, "step": 1103300 }, { "epoch": 4.495009109991048, "grad_norm": 15.389650344848633, "learning_rate": 0.0011306303266874656, "loss": 7.5742, "step": 1103400 }, { "epoch": 4.495416488014429, "grad_norm": 11.013498306274414, "learning_rate": 0.0011302163121582514, "loss": 7.5711, "step": 1103500 }, { "epoch": 4.495823866037811, "grad_norm": 9.118451118469238, "learning_rate": 0.0011298023519815902, "loss": 7.5783, "step": 1103600 }, { "epoch": 4.4962312440611925, "grad_norm": 9.36022663116455, "learning_rate": 0.001129388446173847, "loss": 7.5725, "step": 1103700 }, { "epoch": 4.496638622084574, "grad_norm": 13.599825859069824, "learning_rate": 0.001128974594751386, "loss": 7.5771, "step": 1103800 }, { "epoch": 4.4970460001079555, "grad_norm": 4.665948867797852, "learning_rate": 0.0011285607977305674, "loss": 7.5897, "step": 1103900 }, { "epoch": 4.497453378131337, "grad_norm": 4.070620536804199, "learning_rate": 0.0011281470551277536, "loss": 7.5785, "step": 1104000 }, { "epoch": 4.497453378131337, "eval_MaskedAccuracy": 0.510516774561958, "eval_loss": 1.5918331146240234, "eval_runtime": 153.3538, "eval_samples_per_second": 413.919, "eval_steps_per_second": 1.617, "step": 1104000 }, { "epoch": 4.497860756154718, "grad_norm": 4.339487075805664, "learning_rate": 0.0011277333669593, "loss": 7.596, "step": 1104100 }, { "epoch": 4.498268134178099, "grad_norm": 6.340195655822754, "learning_rate": 0.001127319733241563, "loss": 7.5638, "step": 1104200 }, { "epoch": 4.498675512201481, "grad_norm": 8.324033737182617, "learning_rate": 0.0011269061539908956, "loss": 7.5983, "step": 1104300 }, { "epoch": 4.499082890224862, "grad_norm": 9.860664367675781, "learning_rate": 0.0011264926292236473, "loss": 7.5914, "step": 1104400 }, { "epoch": 4.499490268248244, "grad_norm": 9.858746528625488, "learning_rate": 0.0011260791589561695, "loss": 7.5984, "step": 1104500 }, { "epoch": 4.499897646271625, "grad_norm": 6.180097579956055, "learning_rate": 0.001125665743204807, "loss": 7.5839, "step": 1104600 }, { "epoch": 4.500305024295007, "grad_norm": 5.815901756286621, "learning_rate": 0.0011252523819859056, "loss": 7.5861, "step": 1104700 }, { "epoch": 4.500712402318388, "grad_norm": 6.733134746551514, "learning_rate": 0.0011248390753158055, "loss": 7.6005, "step": 1104800 }, { "epoch": 4.50111978034177, "grad_norm": 10.06464672088623, "learning_rate": 0.0011244258232108479, "loss": 7.5665, "step": 1104900 }, { "epoch": 4.5015271583651515, "grad_norm": 4.955789089202881, "learning_rate": 0.0011240126256873725, "loss": 7.6031, "step": 1105000 }, { "epoch": 4.5015271583651515, "eval_MaskedAccuracy": 0.5112919021049737, "eval_loss": 1.5959364175796509, "eval_runtime": 152.5655, "eval_samples_per_second": 416.057, "eval_steps_per_second": 1.626, "step": 1105000 }, { "epoch": 4.501934536388533, "grad_norm": 4.404020309448242, "learning_rate": 0.0011235994827617146, "loss": 7.5809, "step": 1105100 }, { "epoch": 4.502341914411915, "grad_norm": 5.39438009262085, "learning_rate": 0.0011231863944502062, "loss": 7.6003, "step": 1105200 }, { "epoch": 4.502749292435295, "grad_norm": 3.4735138416290283, "learning_rate": 0.0011227733607691805, "loss": 7.5682, "step": 1105300 }, { "epoch": 4.503156670458677, "grad_norm": 7.234744548797607, "learning_rate": 0.0011223603817349668, "loss": 7.6346, "step": 1105400 }, { "epoch": 4.503564048482058, "grad_norm": 7.141273498535156, "learning_rate": 0.0011219474573638916, "loss": 7.5507, "step": 1105500 }, { "epoch": 4.50397142650544, "grad_norm": 8.873198509216309, "learning_rate": 0.001121534587672281, "loss": 7.5842, "step": 1105600 }, { "epoch": 4.504378804528821, "grad_norm": 9.169697761535645, "learning_rate": 0.0011211217726764572, "loss": 7.5806, "step": 1105700 }, { "epoch": 4.504786182552203, "grad_norm": 7.4849467277526855, "learning_rate": 0.0011207090123927423, "loss": 7.5523, "step": 1105800 }, { "epoch": 4.505193560575584, "grad_norm": 6.278785705566406, "learning_rate": 0.0011202963068374532, "loss": 7.5676, "step": 1105900 }, { "epoch": 4.505600938598966, "grad_norm": 4.431141376495361, "learning_rate": 0.0011198836560269063, "loss": 7.5782, "step": 1106000 }, { "epoch": 4.505600938598966, "eval_MaskedAccuracy": 0.5106673869629107, "eval_loss": 1.5928922891616821, "eval_runtime": 153.6369, "eval_samples_per_second": 413.156, "eval_steps_per_second": 1.614, "step": 1106000 }, { "epoch": 4.5060083166223475, "grad_norm": 5.619106769561768, "learning_rate": 0.0011194710599774175, "loss": 7.5694, "step": 1106100 }, { "epoch": 4.506415694645729, "grad_norm": 7.33353853225708, "learning_rate": 0.001119058518705297, "loss": 7.5745, "step": 1106200 }, { "epoch": 4.5068230726691105, "grad_norm": 10.765326499938965, "learning_rate": 0.001118646032226857, "loss": 7.5548, "step": 1106300 }, { "epoch": 4.507230450692492, "grad_norm": 7.372228622436523, "learning_rate": 0.0011182336005584033, "loss": 7.5957, "step": 1106400 }, { "epoch": 4.507637828715874, "grad_norm": 12.633130073547363, "learning_rate": 0.001117821223716241, "loss": 7.5861, "step": 1106500 }, { "epoch": 4.508045206739254, "grad_norm": 11.810508728027344, "learning_rate": 0.0011174089017166762, "loss": 7.5435, "step": 1106600 }, { "epoch": 4.508452584762636, "grad_norm": 6.721157073974609, "learning_rate": 0.001116996634576008, "loss": 7.5774, "step": 1106700 }, { "epoch": 4.508859962786017, "grad_norm": 6.119353294372559, "learning_rate": 0.0011165844223105371, "loss": 7.5656, "step": 1106800 }, { "epoch": 4.509267340809399, "grad_norm": 14.55726146697998, "learning_rate": 0.0011161722649365609, "loss": 7.5757, "step": 1106900 }, { "epoch": 4.50967471883278, "grad_norm": 7.758322238922119, "learning_rate": 0.0011157601624703722, "loss": 7.6214, "step": 1107000 }, { "epoch": 4.50967471883278, "eval_MaskedAccuracy": 0.5110763911288638, "eval_loss": 1.5922865867614746, "eval_runtime": 153.6657, "eval_samples_per_second": 413.078, "eval_steps_per_second": 1.614, "step": 1107000 }, { "epoch": 4.510082096856162, "grad_norm": 12.179095268249512, "learning_rate": 0.0011153481149282652, "loss": 7.5898, "step": 1107100 }, { "epoch": 4.510489474879543, "grad_norm": 11.939443588256836, "learning_rate": 0.0011149361223265276, "loss": 7.5911, "step": 1107200 }, { "epoch": 4.510896852902925, "grad_norm": 2.733443021774292, "learning_rate": 0.0011145241846814507, "loss": 7.5726, "step": 1107300 }, { "epoch": 4.5113042309263065, "grad_norm": 4.503757953643799, "learning_rate": 0.0011141123020093197, "loss": 7.6024, "step": 1107400 }, { "epoch": 4.511711608949688, "grad_norm": 2.994215726852417, "learning_rate": 0.0011137004743264165, "loss": 7.5971, "step": 1107500 }, { "epoch": 4.51211898697307, "grad_norm": 6.558333396911621, "learning_rate": 0.0011132887016490267, "loss": 7.5714, "step": 1107600 }, { "epoch": 4.512526364996451, "grad_norm": 5.901927947998047, "learning_rate": 0.0011128769839934267, "loss": 7.5853, "step": 1107700 }, { "epoch": 4.512933743019833, "grad_norm": 3.7872025966644287, "learning_rate": 0.0011124653213758955, "loss": 7.5944, "step": 1107800 }, { "epoch": 4.513341121043213, "grad_norm": 3.8414390087127686, "learning_rate": 0.0011120537138127082, "loss": 7.5813, "step": 1107900 }, { "epoch": 4.513748499066595, "grad_norm": 3.5636935234069824, "learning_rate": 0.0011116421613201392, "loss": 7.5588, "step": 1108000 }, { "epoch": 4.513748499066595, "eval_MaskedAccuracy": 0.5110843816749177, "eval_loss": 1.5849589109420776, "eval_runtime": 154.5512, "eval_samples_per_second": 410.712, "eval_steps_per_second": 1.605, "step": 1108000 }, { "epoch": 4.514155877089976, "grad_norm": 6.344196319580078, "learning_rate": 0.0011112306639144561, "loss": 7.5914, "step": 1108100 }, { "epoch": 4.514563255113358, "grad_norm": 11.019896507263184, "learning_rate": 0.0011108192216119298, "loss": 7.5995, "step": 1108200 }, { "epoch": 4.514970633136739, "grad_norm": 11.762144088745117, "learning_rate": 0.001110407834428826, "loss": 7.5953, "step": 1108300 }, { "epoch": 4.515378011160121, "grad_norm": 7.87576961517334, "learning_rate": 0.0011099965023814106, "loss": 7.5839, "step": 1108400 }, { "epoch": 4.5157853891835025, "grad_norm": 8.868873596191406, "learning_rate": 0.0011095852254859415, "loss": 7.5734, "step": 1108500 }, { "epoch": 4.516192767206884, "grad_norm": 5.231029033660889, "learning_rate": 0.0011091740037586844, "loss": 7.5795, "step": 1108600 }, { "epoch": 4.5166001452302655, "grad_norm": 6.209876537322998, "learning_rate": 0.0011087628372158942, "loss": 7.5948, "step": 1108700 }, { "epoch": 4.517007523253647, "grad_norm": 11.849695205688477, "learning_rate": 0.0011083517258738272, "loss": 7.5773, "step": 1108800 }, { "epoch": 4.517414901277029, "grad_norm": 4.6103410720825195, "learning_rate": 0.0011079406697487354, "loss": 7.5971, "step": 1108900 }, { "epoch": 4.517822279300409, "grad_norm": 5.200582504272461, "learning_rate": 0.001107529668856871, "loss": 7.5659, "step": 1109000 }, { "epoch": 4.517822279300409, "eval_MaskedAccuracy": 0.5108471120669347, "eval_loss": 1.5960307121276855, "eval_runtime": 156.5972, "eval_samples_per_second": 405.346, "eval_steps_per_second": 1.584, "step": 1109000 }, { "epoch": 4.518229657323792, "grad_norm": 3.9880106449127197, "learning_rate": 0.0011071187232144823, "loss": 7.5598, "step": 1109100 }, { "epoch": 4.518637035347172, "grad_norm": 12.46435260772705, "learning_rate": 0.0011067078328378165, "loss": 7.5847, "step": 1109200 }, { "epoch": 4.519044413370554, "grad_norm": 4.924691200256348, "learning_rate": 0.0011062969977431183, "loss": 7.6011, "step": 1109300 }, { "epoch": 4.519451791393935, "grad_norm": 12.720778465270996, "learning_rate": 0.0011058862179466325, "loss": 7.608, "step": 1109400 }, { "epoch": 4.519859169417317, "grad_norm": 3.4815590381622314, "learning_rate": 0.001105475493464597, "loss": 7.5707, "step": 1109500 }, { "epoch": 4.5202665474406984, "grad_norm": 7.696225166320801, "learning_rate": 0.0011050648243132496, "loss": 7.5794, "step": 1109600 }, { "epoch": 4.52067392546408, "grad_norm": 8.115972518920898, "learning_rate": 0.0011046542105088277, "loss": 7.6093, "step": 1109700 }, { "epoch": 4.5210813034874615, "grad_norm": 9.846404075622559, "learning_rate": 0.0011042436520675644, "loss": 7.5725, "step": 1109800 }, { "epoch": 4.521488681510843, "grad_norm": 8.468535423278809, "learning_rate": 0.0011038331490056908, "loss": 7.5819, "step": 1109900 }, { "epoch": 4.521896059534225, "grad_norm": 3.5899410247802734, "learning_rate": 0.0011034227013394374, "loss": 7.5703, "step": 1110000 }, { "epoch": 4.521896059534225, "eval_MaskedAccuracy": 0.5103302548077059, "eval_loss": 1.6036618947982788, "eval_runtime": 165.0916, "eval_samples_per_second": 384.49, "eval_steps_per_second": 1.502, "step": 1110000 }, { "epoch": 4.522303437557606, "grad_norm": 15.857691764831543, "learning_rate": 0.00110301230908503, "loss": 7.5776, "step": 1110100 }, { "epoch": 4.522710815580988, "grad_norm": 10.199382781982422, "learning_rate": 0.0011026019722586951, "loss": 7.5996, "step": 1110200 }, { "epoch": 4.523118193604368, "grad_norm": 11.391201972961426, "learning_rate": 0.001102191690876654, "loss": 7.5761, "step": 1110300 }, { "epoch": 4.52352557162775, "grad_norm": 3.94089412689209, "learning_rate": 0.0011017814649551278, "loss": 7.6011, "step": 1110400 }, { "epoch": 4.523932949651131, "grad_norm": 12.507319450378418, "learning_rate": 0.0011013712945103344, "loss": 7.6186, "step": 1110500 }, { "epoch": 4.524340327674513, "grad_norm": 10.309355735778809, "learning_rate": 0.001100961179558493, "loss": 7.5874, "step": 1110600 }, { "epoch": 4.524747705697894, "grad_norm": 14.556020736694336, "learning_rate": 0.0011005511201158148, "loss": 7.5707, "step": 1110700 }, { "epoch": 4.525155083721276, "grad_norm": 7.470675468444824, "learning_rate": 0.0011001411161985122, "loss": 7.5498, "step": 1110800 }, { "epoch": 4.5255624617446575, "grad_norm": 3.487459659576416, "learning_rate": 0.0010997311678227957, "loss": 7.5726, "step": 1110900 }, { "epoch": 4.525969839768039, "grad_norm": 7.366673946380615, "learning_rate": 0.0010993212750048703, "loss": 7.5897, "step": 1111000 }, { "epoch": 4.525969839768039, "eval_MaskedAccuracy": 0.511501025603417, "eval_loss": 1.5909398794174194, "eval_runtime": 154.9582, "eval_samples_per_second": 409.633, "eval_steps_per_second": 1.6, "step": 1111000 }, { "epoch": 4.5263772177914205, "grad_norm": 6.191447734832764, "learning_rate": 0.0010989114377609453, "loss": 7.587, "step": 1111100 }, { "epoch": 4.526784595814802, "grad_norm": 4.839754104614258, "learning_rate": 0.001098501656107221, "loss": 7.5737, "step": 1111200 }, { "epoch": 4.527191973838184, "grad_norm": 10.126070976257324, "learning_rate": 0.0010980919300598976, "loss": 7.5499, "step": 1111300 }, { "epoch": 4.527599351861565, "grad_norm": 10.946996688842773, "learning_rate": 0.0010976822596351747, "loss": 7.5615, "step": 1111400 }, { "epoch": 4.528006729884947, "grad_norm": 9.202312469482422, "learning_rate": 0.0010972726448492491, "loss": 7.5734, "step": 1111500 }, { "epoch": 4.528414107908327, "grad_norm": 5.462584972381592, "learning_rate": 0.001096863085718315, "loss": 7.5489, "step": 1111600 }, { "epoch": 4.528821485931709, "grad_norm": 5.888820648193359, "learning_rate": 0.0010964535822585656, "loss": 7.5567, "step": 1111700 }, { "epoch": 4.52922886395509, "grad_norm": 3.6279215812683105, "learning_rate": 0.001096044134486188, "loss": 7.5847, "step": 1111800 }, { "epoch": 4.529636241978472, "grad_norm": 5.458526134490967, "learning_rate": 0.001095634742417373, "loss": 7.6019, "step": 1111900 }, { "epoch": 4.5300436200018535, "grad_norm": 6.370878219604492, "learning_rate": 0.001095225406068303, "loss": 7.5706, "step": 1112000 }, { "epoch": 4.5300436200018535, "eval_MaskedAccuracy": 0.5114930069502411, "eval_loss": 1.5956523418426514, "eval_runtime": 155.7762, "eval_samples_per_second": 407.482, "eval_steps_per_second": 1.592, "step": 1112000 }, { "epoch": 4.530450998025235, "grad_norm": 7.633890628814697, "learning_rate": 0.0010948161254551636, "loss": 7.5844, "step": 1112100 }, { "epoch": 4.5308583760486165, "grad_norm": 9.703800201416016, "learning_rate": 0.0010944069005941349, "loss": 7.5652, "step": 1112200 }, { "epoch": 4.531265754071998, "grad_norm": 10.638181686401367, "learning_rate": 0.0010939977315013961, "loss": 7.5798, "step": 1112300 }, { "epoch": 4.53167313209538, "grad_norm": 10.037205696105957, "learning_rate": 0.001093588618193123, "loss": 7.5789, "step": 1112400 }, { "epoch": 4.532080510118761, "grad_norm": 5.2120466232299805, "learning_rate": 0.0010931795606854897, "loss": 7.5769, "step": 1112500 }, { "epoch": 4.532487888142143, "grad_norm": 10.160205841064453, "learning_rate": 0.0010927705589946712, "loss": 7.5521, "step": 1112600 }, { "epoch": 4.532895266165524, "grad_norm": 3.2886061668395996, "learning_rate": 0.001092361613136835, "loss": 7.5638, "step": 1112700 }, { "epoch": 4.533302644188906, "grad_norm": 3.281219959259033, "learning_rate": 0.0010919527231281485, "loss": 7.5669, "step": 1112800 }, { "epoch": 4.533710022212286, "grad_norm": 10.00158977508545, "learning_rate": 0.0010915438889847792, "loss": 7.5775, "step": 1112900 }, { "epoch": 4.534117400235668, "grad_norm": 8.850213050842285, "learning_rate": 0.0010911351107228894, "loss": 7.5842, "step": 1113000 }, { "epoch": 4.534117400235668, "eval_MaskedAccuracy": 0.510891954563634, "eval_loss": 1.6031019687652588, "eval_runtime": 170.1512, "eval_samples_per_second": 373.056, "eval_steps_per_second": 1.458, "step": 1113000 }, { "epoch": 4.534524778259049, "grad_norm": 11.173861503601074, "learning_rate": 0.0010907263883586412, "loss": 7.5714, "step": 1113100 }, { "epoch": 4.534932156282431, "grad_norm": 4.537968158721924, "learning_rate": 0.001090317721908192, "loss": 7.5787, "step": 1113200 }, { "epoch": 4.5353395343058125, "grad_norm": 6.516021728515625, "learning_rate": 0.001089909111387701, "loss": 7.5813, "step": 1113300 }, { "epoch": 4.535746912329194, "grad_norm": 6.329670429229736, "learning_rate": 0.0010895005568133192, "loss": 7.6067, "step": 1113400 }, { "epoch": 4.536154290352576, "grad_norm": 4.165231227874756, "learning_rate": 0.0010890920582012013, "loss": 7.5996, "step": 1113500 }, { "epoch": 4.536561668375957, "grad_norm": 9.512068748474121, "learning_rate": 0.001088683615567497, "loss": 7.5564, "step": 1113600 }, { "epoch": 4.536969046399339, "grad_norm": 3.5417563915252686, "learning_rate": 0.0010882752289283555, "loss": 7.5982, "step": 1113700 }, { "epoch": 4.53737642442272, "grad_norm": 10.698873519897461, "learning_rate": 0.0010878668982999226, "loss": 7.5737, "step": 1113800 }, { "epoch": 4.537783802446102, "grad_norm": 6.200733184814453, "learning_rate": 0.001087458623698337, "loss": 7.5809, "step": 1113900 }, { "epoch": 4.538191180469482, "grad_norm": 6.130573272705078, "learning_rate": 0.0010870504051397453, "loss": 7.5746, "step": 1114000 }, { "epoch": 4.538191180469482, "eval_MaskedAccuracy": 0.5102800815169236, "eval_loss": 1.5996474027633667, "eval_runtime": 232.3978, "eval_samples_per_second": 273.135, "eval_steps_per_second": 1.067, "step": 1114000 }, { "epoch": 4.538598558492865, "grad_norm": 11.021953582763672, "learning_rate": 0.0010866422426402853, "loss": 7.5923, "step": 1114100 }, { "epoch": 4.539005936516245, "grad_norm": 5.681769847869873, "learning_rate": 0.001086234136216093, "loss": 7.5695, "step": 1114200 }, { "epoch": 4.539413314539627, "grad_norm": 10.913374900817871, "learning_rate": 0.001085826085883302, "loss": 7.5638, "step": 1114300 }, { "epoch": 4.5398206925630085, "grad_norm": 3.1211705207824707, "learning_rate": 0.0010854180916580484, "loss": 7.565, "step": 1114400 }, { "epoch": 4.54022807058639, "grad_norm": 8.329007148742676, "learning_rate": 0.0010850101535564596, "loss": 7.5831, "step": 1114500 }, { "epoch": 4.5406354486097715, "grad_norm": 4.618702411651611, "learning_rate": 0.001084602271594664, "loss": 7.5554, "step": 1114600 }, { "epoch": 4.541042826633153, "grad_norm": 4.494823455810547, "learning_rate": 0.0010841944457887867, "loss": 7.5761, "step": 1114700 }, { "epoch": 4.541450204656535, "grad_norm": 3.781698703765869, "learning_rate": 0.0010837866761549523, "loss": 7.5645, "step": 1114800 }, { "epoch": 4.541857582679916, "grad_norm": 3.5153980255126953, "learning_rate": 0.0010833789627092817, "loss": 7.5857, "step": 1114900 }, { "epoch": 4.542264960703298, "grad_norm": 4.925096035003662, "learning_rate": 0.0010829713054678943, "loss": 7.5668, "step": 1115000 }, { "epoch": 4.542264960703298, "eval_MaskedAccuracy": 0.5107099442867878, "eval_loss": 1.5973669290542603, "eval_runtime": 167.7262, "eval_samples_per_second": 378.45, "eval_steps_per_second": 1.479, "step": 1115000 }, { "epoch": 4.542672338726679, "grad_norm": 5.03632116317749, "learning_rate": 0.0010825637044469057, "loss": 7.5835, "step": 1115100 }, { "epoch": 4.543079716750061, "grad_norm": 7.940250873565674, "learning_rate": 0.001082156159662433, "loss": 7.5807, "step": 1115200 }, { "epoch": 4.543487094773441, "grad_norm": 4.016974449157715, "learning_rate": 0.0010817486711305864, "loss": 7.5783, "step": 1115300 }, { "epoch": 4.543894472796823, "grad_norm": 8.841370582580566, "learning_rate": 0.001081341238867477, "loss": 7.5237, "step": 1115400 }, { "epoch": 4.544301850820204, "grad_norm": 9.194918632507324, "learning_rate": 0.001080933862889213, "loss": 7.5915, "step": 1115500 }, { "epoch": 4.544709228843586, "grad_norm": 7.682662010192871, "learning_rate": 0.0010805265432118996, "loss": 7.5789, "step": 1115600 }, { "epoch": 4.5451166068669675, "grad_norm": 5.003277778625488, "learning_rate": 0.001080119279851642, "loss": 7.5844, "step": 1115700 }, { "epoch": 4.545523984890349, "grad_norm": 9.853026390075684, "learning_rate": 0.0010797120728245407, "loss": 7.5879, "step": 1115800 }, { "epoch": 4.545931362913731, "grad_norm": 6.268301486968994, "learning_rate": 0.001079304922146694, "loss": 7.5754, "step": 1115900 }, { "epoch": 4.546338740937112, "grad_norm": 4.028406620025635, "learning_rate": 0.0010788978278341976, "loss": 7.5678, "step": 1116000 }, { "epoch": 4.546338740937112, "eval_MaskedAccuracy": 0.5113442019094754, "eval_loss": 1.596558928489685, "eval_runtime": 171.9151, "eval_samples_per_second": 369.229, "eval_steps_per_second": 1.443, "step": 1116000 }, { "epoch": 4.546746118960494, "grad_norm": 3.8657755851745605, "learning_rate": 0.0010784907899031484, "loss": 7.5684, "step": 1116100 }, { "epoch": 4.547153496983875, "grad_norm": 8.76175308227539, "learning_rate": 0.0010780838083696374, "loss": 7.5521, "step": 1116200 }, { "epoch": 4.547560875007257, "grad_norm": 4.481683731079102, "learning_rate": 0.0010776768832497547, "loss": 7.5587, "step": 1116300 }, { "epoch": 4.547968253030638, "grad_norm": 7.46676778793335, "learning_rate": 0.0010772700145595872, "loss": 7.5549, "step": 1116400 }, { "epoch": 4.54837563105402, "grad_norm": 18.588315963745117, "learning_rate": 0.0010768632023152233, "loss": 7.587, "step": 1116500 }, { "epoch": 4.5487830090774, "grad_norm": 4.303117752075195, "learning_rate": 0.0010764564465327454, "loss": 7.5673, "step": 1116600 }, { "epoch": 4.549190387100782, "grad_norm": 4.791867256164551, "learning_rate": 0.0010760497472282326, "loss": 7.562, "step": 1116700 }, { "epoch": 4.5495977651241635, "grad_norm": 9.731392860412598, "learning_rate": 0.0010756431044177683, "loss": 7.555, "step": 1116800 }, { "epoch": 4.550005143147545, "grad_norm": 16.8034610748291, "learning_rate": 0.0010752365181174257, "loss": 7.5842, "step": 1116900 }, { "epoch": 4.5504125211709265, "grad_norm": 11.9800386428833, "learning_rate": 0.0010748299883432803, "loss": 7.5738, "step": 1117000 }, { "epoch": 4.5504125211709265, "eval_MaskedAccuracy": 0.5114025195152014, "eval_loss": 1.5812039375305176, "eval_runtime": 163.9994, "eval_samples_per_second": 387.05, "eval_steps_per_second": 1.512, "step": 1117000 }, { "epoch": 4.550819899194308, "grad_norm": 10.361425399780273, "learning_rate": 0.0010744235151114053, "loss": 7.5542, "step": 1117100 }, { "epoch": 4.55122727721769, "grad_norm": 8.61764144897461, "learning_rate": 0.0010740170984378688, "loss": 7.5817, "step": 1117200 }, { "epoch": 4.551634655241071, "grad_norm": 8.817815780639648, "learning_rate": 0.0010736107383387417, "loss": 7.6032, "step": 1117300 }, { "epoch": 4.552042033264453, "grad_norm": 2.282099723815918, "learning_rate": 0.0010732044348300853, "loss": 7.5587, "step": 1117400 }, { "epoch": 4.552449411287834, "grad_norm": 10.575343132019043, "learning_rate": 0.0010727981879279666, "loss": 7.5449, "step": 1117500 }, { "epoch": 4.552856789311216, "grad_norm": 13.9160795211792, "learning_rate": 0.0010723919976484465, "loss": 7.5514, "step": 1117600 }, { "epoch": 4.553264167334597, "grad_norm": 9.172242164611816, "learning_rate": 0.0010719858640075826, "loss": 7.5638, "step": 1117700 }, { "epoch": 4.553671545357979, "grad_norm": 7.916067600250244, "learning_rate": 0.001071579787021432, "loss": 7.5755, "step": 1117800 }, { "epoch": 4.5540789233813594, "grad_norm": 3.5791852474212646, "learning_rate": 0.0010711737667060484, "loss": 7.5668, "step": 1117900 }, { "epoch": 4.554486301404741, "grad_norm": 12.591864585876465, "learning_rate": 0.0010707678030774837, "loss": 7.586, "step": 1118000 }, { "epoch": 4.554486301404741, "eval_MaskedAccuracy": 0.5105842546790761, "eval_loss": 1.6017931699752808, "eval_runtime": 173.2789, "eval_samples_per_second": 366.323, "eval_steps_per_second": 1.431, "step": 1118000 }, { "epoch": 4.5548936794281225, "grad_norm": 6.084591388702393, "learning_rate": 0.001070361896151789, "loss": 7.5895, "step": 1118100 }, { "epoch": 4.555301057451504, "grad_norm": 13.230748176574707, "learning_rate": 0.0010699560459450108, "loss": 7.5819, "step": 1118200 }, { "epoch": 4.555708435474886, "grad_norm": 6.209146499633789, "learning_rate": 0.001069550252473196, "loss": 7.5958, "step": 1118300 }, { "epoch": 4.556115813498267, "grad_norm": 6.586806774139404, "learning_rate": 0.0010691445157523871, "loss": 7.5711, "step": 1118400 }, { "epoch": 4.556523191521649, "grad_norm": 4.204071998596191, "learning_rate": 0.0010687388357986261, "loss": 7.5655, "step": 1118500 }, { "epoch": 4.55693056954503, "grad_norm": 9.868650436401367, "learning_rate": 0.0010683332126279492, "loss": 7.5773, "step": 1118600 }, { "epoch": 4.557337947568412, "grad_norm": 5.84165620803833, "learning_rate": 0.0010679276462563947, "loss": 7.5928, "step": 1118700 }, { "epoch": 4.557745325591793, "grad_norm": 5.934895038604736, "learning_rate": 0.0010675221366999966, "loss": 7.5915, "step": 1118800 }, { "epoch": 4.558152703615175, "grad_norm": 8.131453514099121, "learning_rate": 0.001067116683974785, "loss": 7.5941, "step": 1118900 }, { "epoch": 4.558560081638555, "grad_norm": 12.419584274291992, "learning_rate": 0.0010667112880967918, "loss": 7.5414, "step": 1119000 }, { "epoch": 4.558560081638555, "eval_MaskedAccuracy": 0.5111769190311796, "eval_loss": 1.6003597974777222, "eval_runtime": 176.7664, "eval_samples_per_second": 359.095, "eval_steps_per_second": 1.403, "step": 1119000 }, { "epoch": 4.558967459661938, "grad_norm": 9.122279167175293, "learning_rate": 0.0010663059490820435, "loss": 7.5885, "step": 1119100 }, { "epoch": 4.5593748376853185, "grad_norm": 5.479848384857178, "learning_rate": 0.0010659006669465668, "loss": 7.5954, "step": 1119200 }, { "epoch": 4.5597822157087, "grad_norm": 9.282177925109863, "learning_rate": 0.0010654954417063827, "loss": 7.5906, "step": 1119300 }, { "epoch": 4.5601895937320815, "grad_norm": 4.657062530517578, "learning_rate": 0.0010650902733775117, "loss": 7.5873, "step": 1119400 }, { "epoch": 4.560596971755463, "grad_norm": 11.059744834899902, "learning_rate": 0.0010646851619759761, "loss": 7.5997, "step": 1119500 }, { "epoch": 4.561004349778845, "grad_norm": 13.795628547668457, "learning_rate": 0.0010642801075177871, "loss": 7.5797, "step": 1119600 }, { "epoch": 4.561411727802226, "grad_norm": 3.236295223236084, "learning_rate": 0.0010638751100189624, "loss": 7.573, "step": 1119700 }, { "epoch": 4.561819105825608, "grad_norm": 7.676112651824951, "learning_rate": 0.0010634701694955113, "loss": 7.5793, "step": 1119800 }, { "epoch": 4.562226483848989, "grad_norm": 7.574018955230713, "learning_rate": 0.0010630652859634452, "loss": 7.5777, "step": 1119900 }, { "epoch": 4.562633861872371, "grad_norm": 7.075233459472656, "learning_rate": 0.0010626604594387709, "loss": 7.6028, "step": 1120000 }, { "epoch": 4.562633861872371, "eval_MaskedAccuracy": 0.5112010266145673, "eval_loss": 1.581120491027832, "eval_runtime": 158.2883, "eval_samples_per_second": 401.015, "eval_steps_per_second": 1.567, "step": 1120000 }, { "epoch": 4.563041239895752, "grad_norm": 6.8384270668029785, "learning_rate": 0.0010622556899374917, "loss": 7.5601, "step": 1120100 }, { "epoch": 4.563448617919134, "grad_norm": 6.318542003631592, "learning_rate": 0.0010618509774756118, "loss": 7.5672, "step": 1120200 }, { "epoch": 4.5638559959425145, "grad_norm": 3.199270009994507, "learning_rate": 0.001061446322069132, "loss": 7.5738, "step": 1120300 }, { "epoch": 4.564263373965896, "grad_norm": 3.4965708255767822, "learning_rate": 0.0010610417237340488, "loss": 7.5535, "step": 1120400 }, { "epoch": 4.5646707519892775, "grad_norm": 8.460668563842773, "learning_rate": 0.0010606371824863588, "loss": 7.5691, "step": 1120500 }, { "epoch": 4.565078130012659, "grad_norm": 5.669637203216553, "learning_rate": 0.0010602326983420562, "loss": 7.5642, "step": 1120600 }, { "epoch": 4.565485508036041, "grad_norm": 10.30564022064209, "learning_rate": 0.0010598282713171327, "loss": 7.5627, "step": 1120700 }, { "epoch": 4.565892886059422, "grad_norm": 4.847383975982666, "learning_rate": 0.0010594239014275766, "loss": 7.5927, "step": 1120800 }, { "epoch": 4.566300264082804, "grad_norm": 4.886056900024414, "learning_rate": 0.0010590195886893757, "loss": 7.5778, "step": 1120900 }, { "epoch": 4.566707642106185, "grad_norm": 14.0728120803833, "learning_rate": 0.0010586153331185138, "loss": 7.5856, "step": 1121000 }, { "epoch": 4.566707642106185, "eval_MaskedAccuracy": 0.5101112160708272, "eval_loss": 1.6021958589553833, "eval_runtime": 170.5827, "eval_samples_per_second": 372.113, "eval_steps_per_second": 1.454, "step": 1121000 }, { "epoch": 4.567115020129567, "grad_norm": 16.848453521728516, "learning_rate": 0.0010582111347309732, "loss": 7.5735, "step": 1121100 }, { "epoch": 4.567522398152948, "grad_norm": 12.972195625305176, "learning_rate": 0.001057806993542734, "loss": 7.5976, "step": 1121200 }, { "epoch": 4.56792977617633, "grad_norm": 6.517111778259277, "learning_rate": 0.0010574029095697759, "loss": 7.5521, "step": 1121300 }, { "epoch": 4.568337154199711, "grad_norm": 9.902020454406738, "learning_rate": 0.001056998882828071, "loss": 7.5594, "step": 1121400 }, { "epoch": 4.568744532223093, "grad_norm": 7.743771076202393, "learning_rate": 0.0010565949133335974, "loss": 7.5574, "step": 1121500 }, { "epoch": 4.5691519102464735, "grad_norm": 5.402374744415283, "learning_rate": 0.0010561910011023217, "loss": 7.5652, "step": 1121600 }, { "epoch": 4.569559288269855, "grad_norm": 17.277074813842773, "learning_rate": 0.001055787146150215, "loss": 7.5835, "step": 1121700 }, { "epoch": 4.569966666293237, "grad_norm": 6.748904705047607, "learning_rate": 0.0010553833484932436, "loss": 7.5952, "step": 1121800 }, { "epoch": 4.570374044316618, "grad_norm": 4.527615070343018, "learning_rate": 0.0010549796081473705, "loss": 7.5704, "step": 1121900 }, { "epoch": 4.57078142234, "grad_norm": 4.347258567810059, "learning_rate": 0.0010545759251285596, "loss": 7.5871, "step": 1122000 }, { "epoch": 4.57078142234, "eval_MaskedAccuracy": 0.5109850990092902, "eval_loss": 1.5945192575454712, "eval_runtime": 162.1547, "eval_samples_per_second": 391.453, "eval_steps_per_second": 1.529, "step": 1122000 }, { "epoch": 4.571188800363381, "grad_norm": 9.114790916442871, "learning_rate": 0.0010541722994527698, "loss": 7.5719, "step": 1122100 }, { "epoch": 4.571596178386763, "grad_norm": 3.812300682067871, "learning_rate": 0.0010537687311359596, "loss": 7.556, "step": 1122200 }, { "epoch": 4.572003556410144, "grad_norm": 8.83957576751709, "learning_rate": 0.0010533652201940818, "loss": 7.5836, "step": 1122300 }, { "epoch": 4.572410934433526, "grad_norm": 4.638280391693115, "learning_rate": 0.0010529617666430922, "loss": 7.5522, "step": 1122400 }, { "epoch": 4.572818312456907, "grad_norm": 5.700409412384033, "learning_rate": 0.0010525583704989396, "loss": 7.5415, "step": 1122500 }, { "epoch": 4.573225690480289, "grad_norm": 13.936721801757812, "learning_rate": 0.001052155031777572, "loss": 7.5813, "step": 1122600 }, { "epoch": 4.57363306850367, "grad_norm": 7.647896766662598, "learning_rate": 0.0010517517504949356, "loss": 7.5843, "step": 1122700 }, { "epoch": 4.574040446527052, "grad_norm": 6.669071674346924, "learning_rate": 0.0010513485266669758, "loss": 7.5789, "step": 1122800 }, { "epoch": 4.5744478245504325, "grad_norm": 14.1397705078125, "learning_rate": 0.0010509453603096321, "loss": 7.6079, "step": 1122900 }, { "epoch": 4.574855202573814, "grad_norm": 7.113717555999756, "learning_rate": 0.0010505422514388472, "loss": 7.5777, "step": 1123000 }, { "epoch": 4.574855202573814, "eval_MaskedAccuracy": 0.5117404056359195, "eval_loss": 1.5946736335754395, "eval_runtime": 164.3125, "eval_samples_per_second": 386.313, "eval_steps_per_second": 1.509, "step": 1123000 }, { "epoch": 4.575262580597196, "grad_norm": 5.9611639976501465, "learning_rate": 0.0010501392000705555, "loss": 7.5581, "step": 1123100 }, { "epoch": 4.575669958620577, "grad_norm": 7.352646827697754, "learning_rate": 0.0010497362062206921, "loss": 7.5928, "step": 1123200 }, { "epoch": 4.576077336643959, "grad_norm": 17.109018325805664, "learning_rate": 0.0010493332699051878, "loss": 7.589, "step": 1123300 }, { "epoch": 4.57648471466734, "grad_norm": 12.888470649719238, "learning_rate": 0.0010489303911399763, "loss": 7.5735, "step": 1123400 }, { "epoch": 4.576892092690722, "grad_norm": 8.116667747497559, "learning_rate": 0.0010485275699409855, "loss": 7.5698, "step": 1123500 }, { "epoch": 4.577299470714103, "grad_norm": 8.50510025024414, "learning_rate": 0.0010481248063241384, "loss": 7.5857, "step": 1123600 }, { "epoch": 4.577706848737485, "grad_norm": 9.210661888122559, "learning_rate": 0.00104772210030536, "loss": 7.5789, "step": 1123700 }, { "epoch": 4.578114226760866, "grad_norm": 12.335138320922852, "learning_rate": 0.0010473194519005718, "loss": 7.5577, "step": 1123800 }, { "epoch": 4.578521604784248, "grad_norm": 14.52278995513916, "learning_rate": 0.0010469168611256918, "loss": 7.5606, "step": 1123900 }, { "epoch": 4.5789289828076285, "grad_norm": 4.502407073974609, "learning_rate": 0.0010465143279966365, "loss": 7.575, "step": 1124000 }, { "epoch": 4.5789289828076285, "eval_MaskedAccuracy": 0.5108547609190957, "eval_loss": 1.5947115421295166, "eval_runtime": 177.8627, "eval_samples_per_second": 356.882, "eval_steps_per_second": 1.394, "step": 1124000 }, { "epoch": 4.579336360831011, "grad_norm": 8.426820755004883, "learning_rate": 0.0010461118525293195, "loss": 7.5658, "step": 1124100 }, { "epoch": 4.579743738854392, "grad_norm": 8.536303520202637, "learning_rate": 0.001045709434739654, "loss": 7.5717, "step": 1124200 }, { "epoch": 4.580151116877773, "grad_norm": 12.823686599731445, "learning_rate": 0.0010453070746435482, "loss": 7.5661, "step": 1124300 }, { "epoch": 4.580558494901155, "grad_norm": 11.003029823303223, "learning_rate": 0.0010449047722569116, "loss": 7.5612, "step": 1124400 }, { "epoch": 4.580965872924536, "grad_norm": 13.4091796875, "learning_rate": 0.0010445025275956474, "loss": 7.5741, "step": 1124500 }, { "epoch": 4.581373250947918, "grad_norm": 8.795026779174805, "learning_rate": 0.0010441003406756603, "loss": 7.5822, "step": 1124600 }, { "epoch": 4.581780628971299, "grad_norm": 12.963661193847656, "learning_rate": 0.0010436982115128507, "loss": 7.5543, "step": 1124700 }, { "epoch": 4.582188006994681, "grad_norm": 8.706557273864746, "learning_rate": 0.0010432961401231166, "loss": 7.562, "step": 1124800 }, { "epoch": 4.582595385018062, "grad_norm": 3.7110719680786133, "learning_rate": 0.0010428941265223544, "loss": 7.5633, "step": 1124900 }, { "epoch": 4.583002763041444, "grad_norm": 15.754704475402832, "learning_rate": 0.0010424921707264572, "loss": 7.5555, "step": 1125000 }, { "epoch": 4.583002763041444, "eval_MaskedAccuracy": 0.5112558824422517, "eval_loss": 1.588430404663086, "eval_runtime": 178.381, "eval_samples_per_second": 355.845, "eval_steps_per_second": 1.39, "step": 1125000 }, { "epoch": 4.583410141064825, "grad_norm": 7.041046619415283, "learning_rate": 0.0010420902727513149, "loss": 7.5687, "step": 1125100 }, { "epoch": 4.583817519088207, "grad_norm": 4.503931999206543, "learning_rate": 0.0010416884326128202, "loss": 7.5734, "step": 1125200 }, { "epoch": 4.5842248971115875, "grad_norm": 5.288613319396973, "learning_rate": 0.0010412866503268581, "loss": 7.5729, "step": 1125300 }, { "epoch": 4.584632275134969, "grad_norm": 12.690825462341309, "learning_rate": 0.0010408849259093118, "loss": 7.5734, "step": 1125400 }, { "epoch": 4.585039653158351, "grad_norm": 14.83486270904541, "learning_rate": 0.001040483259376066, "loss": 7.5675, "step": 1125500 }, { "epoch": 4.585447031181732, "grad_norm": 5.501760959625244, "learning_rate": 0.0010400816507430003, "loss": 7.5875, "step": 1125600 }, { "epoch": 4.585854409205114, "grad_norm": 10.312743186950684, "learning_rate": 0.0010396801000259917, "loss": 7.568, "step": 1125700 }, { "epoch": 4.586261787228495, "grad_norm": 5.359569549560547, "learning_rate": 0.0010392786072409154, "loss": 7.5473, "step": 1125800 }, { "epoch": 4.586669165251877, "grad_norm": 7.668104648590088, "learning_rate": 0.001038877172403645, "loss": 7.5719, "step": 1125900 }, { "epoch": 4.587076543275258, "grad_norm": 9.105608940124512, "learning_rate": 0.0010384757955300514, "loss": 7.5493, "step": 1126000 }, { "epoch": 4.587076543275258, "eval_MaskedAccuracy": 0.5116670547969623, "eval_loss": 1.5944896936416626, "eval_runtime": 170.7368, "eval_samples_per_second": 371.777, "eval_steps_per_second": 1.453, "step": 1126000 }, { "epoch": 4.58748392129864, "grad_norm": 7.818266868591309, "learning_rate": 0.001038074476636002, "loss": 7.5657, "step": 1126100 }, { "epoch": 4.587891299322021, "grad_norm": 5.807822227478027, "learning_rate": 0.0010376732157373648, "loss": 7.5645, "step": 1126200 }, { "epoch": 4.588298677345403, "grad_norm": 5.894556999206543, "learning_rate": 0.0010372720128500035, "loss": 7.5616, "step": 1126300 }, { "epoch": 4.588706055368784, "grad_norm": 9.41707706451416, "learning_rate": 0.0010368708679897782, "loss": 7.5584, "step": 1126400 }, { "epoch": 4.589113433392166, "grad_norm": 10.203904151916504, "learning_rate": 0.0010364697811725506, "loss": 7.5567, "step": 1126500 }, { "epoch": 4.589520811415547, "grad_norm": 12.825005531311035, "learning_rate": 0.0010360687524141774, "loss": 7.5901, "step": 1126600 }, { "epoch": 4.589928189438928, "grad_norm": 7.494446277618408, "learning_rate": 0.0010356677817305125, "loss": 7.5719, "step": 1126700 }, { "epoch": 4.59033556746231, "grad_norm": 5.06825590133667, "learning_rate": 0.0010352668691374096, "loss": 7.5904, "step": 1126800 }, { "epoch": 4.590742945485691, "grad_norm": 7.149888515472412, "learning_rate": 0.0010348660146507153, "loss": 7.5604, "step": 1126900 }, { "epoch": 4.591150323509073, "grad_norm": 18.231800079345703, "learning_rate": 0.0010344652182862825, "loss": 7.5706, "step": 1127000 }, { "epoch": 4.591150323509073, "eval_MaskedAccuracy": 0.5105027030301975, "eval_loss": 1.5943135023117065, "eval_runtime": 175.4477, "eval_samples_per_second": 361.794, "eval_steps_per_second": 1.414, "step": 1127000 }, { "epoch": 4.591557701532454, "grad_norm": 15.378019332885742, "learning_rate": 0.001034064480059953, "loss": 7.5635, "step": 1127100 }, { "epoch": 4.591965079555836, "grad_norm": 8.497958183288574, "learning_rate": 0.001033663799987573, "loss": 7.5727, "step": 1127200 }, { "epoch": 4.592372457579217, "grad_norm": 5.854098796844482, "learning_rate": 0.0010332631780849823, "loss": 7.5606, "step": 1127300 }, { "epoch": 4.592779835602599, "grad_norm": 11.54566478729248, "learning_rate": 0.0010328626143680188, "loss": 7.5814, "step": 1127400 }, { "epoch": 4.59318721362598, "grad_norm": 4.415400505065918, "learning_rate": 0.00103246210885252, "loss": 7.5459, "step": 1127500 }, { "epoch": 4.593594591649362, "grad_norm": 9.316043853759766, "learning_rate": 0.00103206166155432, "loss": 7.5811, "step": 1127600 }, { "epoch": 4.594001969672743, "grad_norm": 8.586962699890137, "learning_rate": 0.0010316612724892505, "loss": 7.5944, "step": 1127700 }, { "epoch": 4.594409347696125, "grad_norm": 6.871153354644775, "learning_rate": 0.0010312609416731407, "loss": 7.5594, "step": 1127800 }, { "epoch": 4.594816725719506, "grad_norm": 3.2205865383148193, "learning_rate": 0.001030860669121818, "loss": 7.5644, "step": 1127900 }, { "epoch": 4.595224103742887, "grad_norm": 2.867100954055786, "learning_rate": 0.0010304604548511073, "loss": 7.5547, "step": 1128000 }, { "epoch": 4.595224103742887, "eval_MaskedAccuracy": 0.5108949392155295, "eval_loss": 1.5948376655578613, "eval_runtime": 163.237, "eval_samples_per_second": 388.858, "eval_steps_per_second": 1.519, "step": 1128000 }, { "epoch": 4.595631481766269, "grad_norm": 5.63533878326416, "learning_rate": 0.0010300602988768314, "loss": 7.5702, "step": 1128100 }, { "epoch": 4.59603885978965, "grad_norm": 4.0221943855285645, "learning_rate": 0.00102966020121481, "loss": 7.5674, "step": 1128200 }, { "epoch": 4.596446237813032, "grad_norm": 4.27165412902832, "learning_rate": 0.0010292601618808609, "loss": 7.5578, "step": 1128300 }, { "epoch": 4.596853615836413, "grad_norm": 3.5418479442596436, "learning_rate": 0.0010288601808908008, "loss": 7.5607, "step": 1128400 }, { "epoch": 4.597260993859795, "grad_norm": 4.3672871589660645, "learning_rate": 0.0010284602582604422, "loss": 7.5502, "step": 1128500 }, { "epoch": 4.597668371883176, "grad_norm": 19.08277130126953, "learning_rate": 0.0010280603940055976, "loss": 7.5674, "step": 1128600 }, { "epoch": 4.598075749906558, "grad_norm": 5.180901050567627, "learning_rate": 0.0010276605881420749, "loss": 7.5999, "step": 1128700 }, { "epoch": 4.598483127929939, "grad_norm": 7.836824893951416, "learning_rate": 0.00102726084068568, "loss": 7.5926, "step": 1128800 }, { "epoch": 4.598890505953321, "grad_norm": 4.071815013885498, "learning_rate": 0.0010268611516522181, "loss": 7.5707, "step": 1128900 }, { "epoch": 4.599297883976702, "grad_norm": 3.604161500930786, "learning_rate": 0.0010264615210574893, "loss": 7.5708, "step": 1129000 }, { "epoch": 4.599297883976702, "eval_MaskedAccuracy": 0.5113635999654071, "eval_loss": 1.5902968645095825, "eval_runtime": 179.3944, "eval_samples_per_second": 353.835, "eval_steps_per_second": 1.382, "step": 1129000 }, { "epoch": 4.599705262000084, "grad_norm": 3.2433528900146484, "learning_rate": 0.0010260619489172959, "loss": 7.5727, "step": 1129100 }, { "epoch": 4.600112640023465, "grad_norm": 7.34473180770874, "learning_rate": 0.001025662435247433, "loss": 7.5644, "step": 1129200 }, { "epoch": 4.600520018046846, "grad_norm": 7.951183795928955, "learning_rate": 0.0010252629800636965, "loss": 7.5735, "step": 1129300 }, { "epoch": 4.600927396070228, "grad_norm": 8.226068496704102, "learning_rate": 0.001024863583381878, "loss": 7.5836, "step": 1129400 }, { "epoch": 4.601334774093609, "grad_norm": 11.12123966217041, "learning_rate": 0.0010244642452177707, "loss": 7.5349, "step": 1129500 }, { "epoch": 4.601742152116991, "grad_norm": 4.654839992523193, "learning_rate": 0.0010240649655871583, "loss": 7.5896, "step": 1129600 }, { "epoch": 4.602149530140372, "grad_norm": 5.445435523986816, "learning_rate": 0.0010236657445058283, "loss": 7.6001, "step": 1129700 }, { "epoch": 4.602556908163754, "grad_norm": 9.480306625366211, "learning_rate": 0.0010232665819895639, "loss": 7.5434, "step": 1129800 }, { "epoch": 4.602964286187135, "grad_norm": 8.214174270629883, "learning_rate": 0.0010228674780541462, "loss": 7.5839, "step": 1129900 }, { "epoch": 4.603371664210517, "grad_norm": 10.587176322937012, "learning_rate": 0.001022468432715355, "loss": 7.5806, "step": 1130000 }, { "epoch": 4.603371664210517, "eval_MaskedAccuracy": 0.5111672492458016, "eval_loss": 1.5945781469345093, "eval_runtime": 171.1909, "eval_samples_per_second": 370.791, "eval_steps_per_second": 1.449, "step": 1130000 }, { "epoch": 4.6037790422338984, "grad_norm": 7.952520370483398, "learning_rate": 0.0010220694459889654, "loss": 7.5885, "step": 1130100 }, { "epoch": 4.60418642025728, "grad_norm": 4.537388801574707, "learning_rate": 0.001021670517890752, "loss": 7.5772, "step": 1130200 }, { "epoch": 4.604593798280661, "grad_norm": 25.86749267578125, "learning_rate": 0.0010212716484364855, "loss": 7.5998, "step": 1130300 }, { "epoch": 4.605001176304042, "grad_norm": 16.674840927124023, "learning_rate": 0.0010208728376419366, "loss": 7.5769, "step": 1130400 }, { "epoch": 4.605408554327424, "grad_norm": 5.859453201293945, "learning_rate": 0.0010204740855228724, "loss": 7.6014, "step": 1130500 }, { "epoch": 4.605815932350805, "grad_norm": 7.464219570159912, "learning_rate": 0.0010200753920950577, "loss": 7.5805, "step": 1130600 }, { "epoch": 4.606223310374187, "grad_norm": 7.1977057456970215, "learning_rate": 0.0010196767573742542, "loss": 7.5872, "step": 1130700 }, { "epoch": 4.606630688397568, "grad_norm": 5.396115779876709, "learning_rate": 0.0010192781813762233, "loss": 7.5671, "step": 1130800 }, { "epoch": 4.60703806642095, "grad_norm": 4.91425085067749, "learning_rate": 0.0010188796641167213, "loss": 7.5778, "step": 1130900 }, { "epoch": 4.607445444444331, "grad_norm": 6.383526802062988, "learning_rate": 0.0010184812056115043, "loss": 7.5679, "step": 1131000 }, { "epoch": 4.607445444444331, "eval_MaskedAccuracy": 0.5117166080500752, "eval_loss": 1.5865892171859741, "eval_runtime": 197.6892, "eval_samples_per_second": 321.09, "eval_steps_per_second": 1.254, "step": 1131000 }, { "epoch": 4.607852822467713, "grad_norm": 4.455655574798584, "learning_rate": 0.0010180828058763266, "loss": 7.5653, "step": 1131100 }, { "epoch": 4.608260200491094, "grad_norm": 4.2978081703186035, "learning_rate": 0.001017684464926938, "loss": 7.5493, "step": 1131200 }, { "epoch": 4.608667578514476, "grad_norm": 6.322062015533447, "learning_rate": 0.001017286182779087, "loss": 7.5542, "step": 1131300 }, { "epoch": 4.6090749565378575, "grad_norm": 10.278402328491211, "learning_rate": 0.0010168879594485202, "loss": 7.5369, "step": 1131400 }, { "epoch": 4.609482334561239, "grad_norm": 13.21437931060791, "learning_rate": 0.0010164897949509818, "loss": 7.5549, "step": 1131500 }, { "epoch": 4.60988971258462, "grad_norm": 5.219822883605957, "learning_rate": 0.0010160916893022125, "loss": 7.5572, "step": 1131600 }, { "epoch": 4.610297090608001, "grad_norm": 8.883370399475098, "learning_rate": 0.0010156936425179516, "loss": 7.5831, "step": 1131700 }, { "epoch": 4.610704468631383, "grad_norm": 5.648265361785889, "learning_rate": 0.0010152956546139373, "loss": 7.5822, "step": 1131800 }, { "epoch": 4.611111846654764, "grad_norm": 10.747003555297852, "learning_rate": 0.001014897725605902, "loss": 7.5424, "step": 1131900 }, { "epoch": 4.611519224678146, "grad_norm": 7.558552265167236, "learning_rate": 0.00101449985550958, "loss": 7.5522, "step": 1132000 }, { "epoch": 4.611519224678146, "eval_MaskedAccuracy": 0.5109786866953794, "eval_loss": 1.5969634056091309, "eval_runtime": 212.6225, "eval_samples_per_second": 298.538, "eval_steps_per_second": 1.166, "step": 1132000 }, { "epoch": 4.611926602701527, "grad_norm": 3.6435275077819824, "learning_rate": 0.0010141020443406989, "loss": 7.5524, "step": 1132100 }, { "epoch": 4.612333980724909, "grad_norm": 4.184134483337402, "learning_rate": 0.0010137042921149891, "loss": 7.5903, "step": 1132200 }, { "epoch": 4.61274135874829, "grad_norm": 7.438037395477295, "learning_rate": 0.0010133065988481749, "loss": 7.5521, "step": 1132300 }, { "epoch": 4.613148736771672, "grad_norm": 5.46174955368042, "learning_rate": 0.0010129089645559797, "loss": 7.5624, "step": 1132400 }, { "epoch": 4.6135561147950535, "grad_norm": 14.735342979431152, "learning_rate": 0.001012511389254122, "loss": 7.5784, "step": 1132500 }, { "epoch": 4.613963492818435, "grad_norm": 4.234184741973877, "learning_rate": 0.001012113872958322, "loss": 7.5548, "step": 1132600 }, { "epoch": 4.6143708708418165, "grad_norm": 4.062404632568359, "learning_rate": 0.0010117164156842973, "loss": 7.5537, "step": 1132700 }, { "epoch": 4.614778248865198, "grad_norm": 11.664778709411621, "learning_rate": 0.001011319017447758, "loss": 7.6005, "step": 1132800 }, { "epoch": 4.615185626888579, "grad_norm": 9.382136344909668, "learning_rate": 0.0010109216782644165, "loss": 7.5342, "step": 1132900 }, { "epoch": 4.61559300491196, "grad_norm": 10.988692283630371, "learning_rate": 0.0010105243981499816, "loss": 7.6027, "step": 1133000 }, { "epoch": 4.61559300491196, "eval_MaskedAccuracy": 0.511027137844796, "eval_loss": 1.6006269454956055, "eval_runtime": 169.4552, "eval_samples_per_second": 374.589, "eval_steps_per_second": 1.464, "step": 1133000 }, { "epoch": 4.616000382935342, "grad_norm": 6.641720771789551, "learning_rate": 0.0010101271771201614, "loss": 7.572, "step": 1133100 }, { "epoch": 4.616407760958723, "grad_norm": 4.952685356140137, "learning_rate": 0.0010097300151906575, "loss": 7.5532, "step": 1133200 }, { "epoch": 4.616815138982105, "grad_norm": 6.696595191955566, "learning_rate": 0.0010093329123771718, "loss": 7.5571, "step": 1133300 }, { "epoch": 4.617222517005486, "grad_norm": 5.600722789764404, "learning_rate": 0.001008935868695409, "loss": 7.5685, "step": 1133400 }, { "epoch": 4.617629895028868, "grad_norm": 13.680784225463867, "learning_rate": 0.0010085388841610601, "loss": 7.56, "step": 1133500 }, { "epoch": 4.618037273052249, "grad_norm": 5.2523417472839355, "learning_rate": 0.0010081419587898229, "loss": 7.5759, "step": 1133600 }, { "epoch": 4.618444651075631, "grad_norm": 3.5989181995391846, "learning_rate": 0.0010077450925973913, "loss": 7.5614, "step": 1133700 }, { "epoch": 4.6188520290990125, "grad_norm": 3.7085886001586914, "learning_rate": 0.0010073482855994537, "loss": 7.5725, "step": 1133800 }, { "epoch": 4.619259407122394, "grad_norm": 10.798259735107422, "learning_rate": 0.0010069515378116978, "loss": 7.5735, "step": 1133900 }, { "epoch": 4.619666785145775, "grad_norm": 5.2844390869140625, "learning_rate": 0.0010065548492498094, "loss": 7.5779, "step": 1134000 }, { "epoch": 4.619666785145775, "eval_MaskedAccuracy": 0.5113493641710556, "eval_loss": 1.5992459058761597, "eval_runtime": 176.7383, "eval_samples_per_second": 359.152, "eval_steps_per_second": 1.403, "step": 1134000 }, { "epoch": 4.620074163169157, "grad_norm": 5.834797382354736, "learning_rate": 0.0010061582199294736, "loss": 7.5809, "step": 1134100 }, { "epoch": 4.620481541192538, "grad_norm": 3.8468172550201416, "learning_rate": 0.0010057616498663682, "loss": 7.5913, "step": 1134200 }, { "epoch": 4.620888919215919, "grad_norm": 9.360038757324219, "learning_rate": 0.0010053651390761734, "loss": 7.5828, "step": 1134300 }, { "epoch": 4.621296297239301, "grad_norm": 5.326218128204346, "learning_rate": 0.0010049686875745656, "loss": 7.5461, "step": 1134400 }, { "epoch": 4.621703675262682, "grad_norm": 5.876611232757568, "learning_rate": 0.001004572295377219, "loss": 7.5581, "step": 1134500 }, { "epoch": 4.622111053286064, "grad_norm": 10.328592300415039, "learning_rate": 0.001004175962499803, "loss": 7.5571, "step": 1134600 }, { "epoch": 4.622518431309445, "grad_norm": 11.294349670410156, "learning_rate": 0.001003779688957991, "loss": 7.5911, "step": 1134700 }, { "epoch": 4.622925809332827, "grad_norm": 5.186825752258301, "learning_rate": 0.0010033834747674452, "loss": 7.5542, "step": 1134800 }, { "epoch": 4.6233331873562085, "grad_norm": 16.173288345336914, "learning_rate": 0.001002987319943831, "loss": 7.5833, "step": 1134900 }, { "epoch": 4.62374056537959, "grad_norm": 13.111103057861328, "learning_rate": 0.0010025912245028121, "loss": 7.554, "step": 1135000 }, { "epoch": 4.62374056537959, "eval_MaskedAccuracy": 0.5109919107297, "eval_loss": 1.588957667350769, "eval_runtime": 158.148, "eval_samples_per_second": 401.371, "eval_steps_per_second": 1.568, "step": 1135000 }, { "epoch": 4.6241479434029715, "grad_norm": 3.178123712539673, "learning_rate": 0.0010021951884600483, "loss": 7.5383, "step": 1135100 }, { "epoch": 4.624555321426353, "grad_norm": 8.649395942687988, "learning_rate": 0.0010017992118311956, "loss": 7.5498, "step": 1135200 }, { "epoch": 4.624962699449734, "grad_norm": 6.623432636260986, "learning_rate": 0.0010014032946319092, "loss": 7.5677, "step": 1135300 }, { "epoch": 4.625370077473115, "grad_norm": 7.139390468597412, "learning_rate": 0.0010010074368778433, "loss": 7.5651, "step": 1135400 }, { "epoch": 4.625777455496497, "grad_norm": 5.244434356689453, "learning_rate": 0.001000611638584647, "loss": 7.5451, "step": 1135500 }, { "epoch": 4.626184833519878, "grad_norm": 7.217027187347412, "learning_rate": 0.001000215899767969, "loss": 7.5459, "step": 1135600 }, { "epoch": 4.62659221154326, "grad_norm": 5.953061103820801, "learning_rate": 0.0009998202204434547, "loss": 7.5492, "step": 1135700 }, { "epoch": 4.626999589566641, "grad_norm": 5.7374467849731445, "learning_rate": 0.0009994246006267467, "loss": 7.5751, "step": 1135800 }, { "epoch": 4.627406967590023, "grad_norm": 5.704726219177246, "learning_rate": 0.0009990290403334864, "loss": 7.5706, "step": 1135900 }, { "epoch": 4.627814345613404, "grad_norm": 5.9133124351501465, "learning_rate": 0.0009986335395793123, "loss": 7.5748, "step": 1136000 }, { "epoch": 4.627814345613404, "eval_MaskedAccuracy": 0.5111270056741465, "eval_loss": 1.5983633995056152, "eval_runtime": 163.718, "eval_samples_per_second": 387.716, "eval_steps_per_second": 1.515, "step": 1136000 }, { "epoch": 4.628221723636786, "grad_norm": 5.866695880889893, "learning_rate": 0.0009982380983798614, "loss": 7.59, "step": 1136100 }, { "epoch": 4.6286291016601675, "grad_norm": 6.089090824127197, "learning_rate": 0.0009978427167507673, "loss": 7.5661, "step": 1136200 }, { "epoch": 4.629036479683549, "grad_norm": 5.132720947265625, "learning_rate": 0.000997447394707662, "loss": 7.5672, "step": 1136300 }, { "epoch": 4.629443857706931, "grad_norm": 10.10950756072998, "learning_rate": 0.0009970521322661746, "loss": 7.6056, "step": 1136400 }, { "epoch": 4.629851235730312, "grad_norm": 6.252572536468506, "learning_rate": 0.000996656929441931, "loss": 7.5389, "step": 1136500 }, { "epoch": 4.630258613753693, "grad_norm": 6.584892272949219, "learning_rate": 0.0009962617862505564, "loss": 7.5638, "step": 1136600 }, { "epoch": 4.630665991777074, "grad_norm": 7.151813983917236, "learning_rate": 0.0009958667027076735, "loss": 7.5606, "step": 1136700 }, { "epoch": 4.631073369800456, "grad_norm": 6.8990960121154785, "learning_rate": 0.0009954716788289008, "loss": 7.5572, "step": 1136800 }, { "epoch": 4.631480747823837, "grad_norm": 12.346681594848633, "learning_rate": 0.0009950767146298556, "loss": 7.5588, "step": 1136900 }, { "epoch": 4.631888125847219, "grad_norm": 4.093268394470215, "learning_rate": 0.0009946818101261529, "loss": 7.5845, "step": 1137000 }, { "epoch": 4.631888125847219, "eval_MaskedAccuracy": 0.5110830540974026, "eval_loss": 1.6059156656265259, "eval_runtime": 159.358, "eval_samples_per_second": 398.323, "eval_steps_per_second": 1.556, "step": 1137000 }, { "epoch": 4.6322955038706, "grad_norm": 7.124499797821045, "learning_rate": 0.0009942869653334066, "loss": 7.5619, "step": 1137100 }, { "epoch": 4.632702881893982, "grad_norm": 12.021746635437012, "learning_rate": 0.0009938921802672268, "loss": 7.5638, "step": 1137200 }, { "epoch": 4.6331102599173635, "grad_norm": 7.023179054260254, "learning_rate": 0.0009934974549432201, "loss": 7.5889, "step": 1137300 }, { "epoch": 4.633517637940745, "grad_norm": 5.897668838500977, "learning_rate": 0.0009931027893769918, "loss": 7.563, "step": 1137400 }, { "epoch": 4.6339250159641265, "grad_norm": 5.407822132110596, "learning_rate": 0.0009927081835841482, "loss": 7.5535, "step": 1137500 }, { "epoch": 4.634332393987508, "grad_norm": 4.564602375030518, "learning_rate": 0.0009923136375802888, "loss": 7.5615, "step": 1137600 }, { "epoch": 4.63473977201089, "grad_norm": 7.788735866546631, "learning_rate": 0.0009919191513810108, "loss": 7.5493, "step": 1137700 }, { "epoch": 4.635147150034271, "grad_norm": 7.4595441818237305, "learning_rate": 0.0009915247250019106, "loss": 7.5604, "step": 1137800 }, { "epoch": 4.635554528057652, "grad_norm": 12.7979097366333, "learning_rate": 0.0009911303584585838, "loss": 7.5602, "step": 1137900 }, { "epoch": 4.635961906081033, "grad_norm": 8.483467102050781, "learning_rate": 0.0009907360517666202, "loss": 7.5891, "step": 1138000 }, { "epoch": 4.635961906081033, "eval_MaskedAccuracy": 0.5113504076574895, "eval_loss": 1.5911567211151123, "eval_runtime": 182.7869, "eval_samples_per_second": 347.268, "eval_steps_per_second": 1.357, "step": 1138000 }, { "epoch": 4.636369284104415, "grad_norm": 13.062750816345215, "learning_rate": 0.0009903418049416095, "loss": 7.5754, "step": 1138100 }, { "epoch": 4.636776662127796, "grad_norm": 6.437638282775879, "learning_rate": 0.0009899476179991375, "loss": 7.5613, "step": 1138200 }, { "epoch": 4.637184040151178, "grad_norm": 7.485435962677002, "learning_rate": 0.0009895534909547905, "loss": 7.5303, "step": 1138300 }, { "epoch": 4.637591418174559, "grad_norm": 11.320119857788086, "learning_rate": 0.0009891594238241486, "loss": 7.5563, "step": 1138400 }, { "epoch": 4.637998796197941, "grad_norm": 3.7288763523101807, "learning_rate": 0.0009887654166227926, "loss": 7.5556, "step": 1138500 }, { "epoch": 4.6384061742213225, "grad_norm": 4.068279266357422, "learning_rate": 0.0009883714693662985, "loss": 7.5811, "step": 1138600 }, { "epoch": 4.638813552244704, "grad_norm": 22.849851608276367, "learning_rate": 0.0009879775820702428, "loss": 7.5548, "step": 1138700 }, { "epoch": 4.639220930268086, "grad_norm": 4.552586078643799, "learning_rate": 0.000987583754750195, "loss": 7.5686, "step": 1138800 }, { "epoch": 4.639628308291467, "grad_norm": 8.008922576904297, "learning_rate": 0.000987189987421728, "loss": 7.5439, "step": 1138900 }, { "epoch": 4.640035686314848, "grad_norm": 7.918119430541992, "learning_rate": 0.0009867962801004078, "loss": 7.5674, "step": 1139000 }, { "epoch": 4.640035686314848, "eval_MaskedAccuracy": 0.5108878226299906, "eval_loss": 1.5939793586730957, "eval_runtime": 186.1234, "eval_samples_per_second": 341.042, "eval_steps_per_second": 1.332, "step": 1139000 }, { "epoch": 4.64044306433823, "grad_norm": 4.122742652893066, "learning_rate": 0.0009864026328017983, "loss": 7.565, "step": 1139100 }, { "epoch": 4.640850442361611, "grad_norm": 5.125577926635742, "learning_rate": 0.0009860090455414616, "loss": 7.5802, "step": 1139200 }, { "epoch": 4.641257820384992, "grad_norm": 4.104841709136963, "learning_rate": 0.0009856155183349637, "loss": 7.5967, "step": 1139300 }, { "epoch": 4.641665198408374, "grad_norm": 12.787208557128906, "learning_rate": 0.0009852220511978591, "loss": 7.5735, "step": 1139400 }, { "epoch": 4.642072576431755, "grad_norm": 7.688920021057129, "learning_rate": 0.0009848286441457058, "loss": 7.5604, "step": 1139500 }, { "epoch": 4.642479954455137, "grad_norm": 4.265176296234131, "learning_rate": 0.0009844352971940557, "loss": 7.5465, "step": 1139600 }, { "epoch": 4.6428873324785185, "grad_norm": 7.598531723022461, "learning_rate": 0.0009840420103584616, "loss": 7.5898, "step": 1139700 }, { "epoch": 4.6432947105019, "grad_norm": 14.754538536071777, "learning_rate": 0.0009836487836544718, "loss": 7.5626, "step": 1139800 }, { "epoch": 4.6437020885252815, "grad_norm": 13.873635292053223, "learning_rate": 0.0009832556170976312, "loss": 7.5814, "step": 1139900 }, { "epoch": 4.644109466548663, "grad_norm": 5.928347110748291, "learning_rate": 0.0009828625107034849, "loss": 7.5458, "step": 1140000 }, { "epoch": 4.644109466548663, "eval_MaskedAccuracy": 0.5112632507850131, "eval_loss": 1.588282823562622, "eval_runtime": 167.4255, "eval_samples_per_second": 379.13, "eval_steps_per_second": 1.481, "step": 1140000 }, { "epoch": 4.644516844572045, "grad_norm": 11.163224220275879, "learning_rate": 0.0009824694644875755, "loss": 7.5592, "step": 1140100 }, { "epoch": 4.644924222595426, "grad_norm": 5.4645514488220215, "learning_rate": 0.0009820764784654412, "loss": 7.5757, "step": 1140200 }, { "epoch": 4.645331600618807, "grad_norm": 9.20749282836914, "learning_rate": 0.0009816835526526174, "loss": 7.5959, "step": 1140300 }, { "epoch": 4.645738978642188, "grad_norm": 5.6333489418029785, "learning_rate": 0.0009812906870646403, "loss": 7.5675, "step": 1140400 }, { "epoch": 4.64614635666557, "grad_norm": 10.956141471862793, "learning_rate": 0.0009808978817170428, "loss": 7.5533, "step": 1140500 }, { "epoch": 4.646553734688951, "grad_norm": 4.930355072021484, "learning_rate": 0.000980505136625352, "loss": 7.5678, "step": 1140600 }, { "epoch": 4.646961112712333, "grad_norm": 11.471359252929688, "learning_rate": 0.000980112451805098, "loss": 7.5696, "step": 1140700 }, { "epoch": 4.6473684907357145, "grad_norm": 11.46281623840332, "learning_rate": 0.0009797198272718053, "loss": 7.5496, "step": 1140800 }, { "epoch": 4.647775868759096, "grad_norm": 8.473072052001953, "learning_rate": 0.0009793272630409943, "loss": 7.5514, "step": 1140900 }, { "epoch": 4.6481832467824775, "grad_norm": 5.101851463317871, "learning_rate": 0.0009789347591281858, "loss": 7.5946, "step": 1141000 }, { "epoch": 4.6481832467824775, "eval_MaskedAccuracy": 0.5106380063732754, "eval_loss": 1.6024672985076904, "eval_runtime": 171.8719, "eval_samples_per_second": 369.322, "eval_steps_per_second": 1.443, "step": 1141000 }, { "epoch": 4.648590624805859, "grad_norm": 7.18214750289917, "learning_rate": 0.0009785423155488993, "loss": 7.5712, "step": 1141100 }, { "epoch": 4.648998002829241, "grad_norm": 74.3897705078125, "learning_rate": 0.000978149932318649, "loss": 7.5622, "step": 1141200 }, { "epoch": 4.649405380852622, "grad_norm": 5.3525872230529785, "learning_rate": 0.000977757609452949, "loss": 7.5846, "step": 1141300 }, { "epoch": 4.649812758876004, "grad_norm": 5.943066596984863, "learning_rate": 0.0009773653469673082, "loss": 7.5625, "step": 1141400 }, { "epoch": 4.650220136899385, "grad_norm": 17.435598373413086, "learning_rate": 0.0009769731448772353, "loss": 7.5962, "step": 1141500 }, { "epoch": 4.650627514922766, "grad_norm": 14.843661308288574, "learning_rate": 0.0009765810031982362, "loss": 7.5575, "step": 1141600 }, { "epoch": 4.651034892946147, "grad_norm": 2.8244335651397705, "learning_rate": 0.0009761889219458203, "loss": 7.5549, "step": 1141700 }, { "epoch": 4.651442270969529, "grad_norm": 5.403951168060303, "learning_rate": 0.0009757969011354824, "loss": 7.569, "step": 1141800 }, { "epoch": 4.65184964899291, "grad_norm": 10.951919555664062, "learning_rate": 0.0009754049407827225, "loss": 7.5864, "step": 1141900 }, { "epoch": 4.652257027016292, "grad_norm": 15.487794876098633, "learning_rate": 0.000975013040903036, "loss": 7.5902, "step": 1142000 }, { "epoch": 4.652257027016292, "eval_MaskedAccuracy": 0.5107587389265058, "eval_loss": 1.6009981632232666, "eval_runtime": 165.3809, "eval_samples_per_second": 383.817, "eval_steps_per_second": 1.5, "step": 1142000 }, { "epoch": 4.6526644050396735, "grad_norm": 3.882822275161743, "learning_rate": 0.0009746212015119169, "loss": 7.5798, "step": 1142100 }, { "epoch": 4.653071783063055, "grad_norm": 4.652502536773682, "learning_rate": 0.0009742294226248559, "loss": 7.5497, "step": 1142200 }, { "epoch": 4.6534791610864366, "grad_norm": 7.919519424438477, "learning_rate": 0.0009738377042573433, "loss": 7.5726, "step": 1142300 }, { "epoch": 4.653886539109818, "grad_norm": 9.157454490661621, "learning_rate": 0.0009734460464248658, "loss": 7.5391, "step": 1142400 }, { "epoch": 4.6542939171332, "grad_norm": 8.602246284484863, "learning_rate": 0.0009730544491429075, "loss": 7.5814, "step": 1142500 }, { "epoch": 4.654701295156581, "grad_norm": 17.3488712310791, "learning_rate": 0.0009726629124269508, "loss": 7.5472, "step": 1142600 }, { "epoch": 4.655108673179962, "grad_norm": 9.232991218566895, "learning_rate": 0.0009722714362924737, "loss": 7.5696, "step": 1142700 }, { "epoch": 4.655516051203344, "grad_norm": 4.006420612335205, "learning_rate": 0.0009718800207549548, "loss": 7.5809, "step": 1142800 }, { "epoch": 4.655923429226725, "grad_norm": 5.405423164367676, "learning_rate": 0.0009714886658298674, "loss": 7.5426, "step": 1142900 }, { "epoch": 4.656330807250106, "grad_norm": 5.900599479675293, "learning_rate": 0.000971097371532686, "loss": 7.5743, "step": 1143000 }, { "epoch": 4.656330807250106, "eval_MaskedAccuracy": 0.5108904182976085, "eval_loss": 1.5938276052474976, "eval_runtime": 203.4273, "eval_samples_per_second": 312.033, "eval_steps_per_second": 1.219, "step": 1143000 }, { "epoch": 4.656738185273488, "grad_norm": 3.059312105178833, "learning_rate": 0.0009707061378788781, "loss": 7.587, "step": 1143100 }, { "epoch": 4.6571455632968695, "grad_norm": 7.715422630310059, "learning_rate": 0.0009703149648839125, "loss": 7.566, "step": 1143200 }, { "epoch": 4.657552941320251, "grad_norm": 2.4538686275482178, "learning_rate": 0.0009699238525632545, "loss": 7.5648, "step": 1143300 }, { "epoch": 4.6579603193436325, "grad_norm": 6.942965030670166, "learning_rate": 0.0009695328009323676, "loss": 7.5429, "step": 1143400 }, { "epoch": 4.658367697367014, "grad_norm": 7.37758731842041, "learning_rate": 0.0009691418100067122, "loss": 7.5582, "step": 1143500 }, { "epoch": 4.658775075390396, "grad_norm": 4.007798194885254, "learning_rate": 0.0009687508798017454, "loss": 7.5776, "step": 1143600 }, { "epoch": 4.659182453413777, "grad_norm": 13.702225685119629, "learning_rate": 0.0009683600103329228, "loss": 7.5836, "step": 1143700 }, { "epoch": 4.659589831437159, "grad_norm": 4.413660049438477, "learning_rate": 0.0009679692016156966, "loss": 7.5519, "step": 1143800 }, { "epoch": 4.65999720946054, "grad_norm": 5.619871616363525, "learning_rate": 0.0009675784536655186, "loss": 7.5697, "step": 1143900 }, { "epoch": 4.660404587483921, "grad_norm": 10.72059440612793, "learning_rate": 0.0009671877664978373, "loss": 7.5397, "step": 1144000 }, { "epoch": 4.660404587483921, "eval_MaskedAccuracy": 0.5106447879565417, "eval_loss": 1.594344139099121, "eval_runtime": 168.6792, "eval_samples_per_second": 376.312, "eval_steps_per_second": 1.47, "step": 1144000 }, { "epoch": 4.660811965507303, "grad_norm": 9.93779468536377, "learning_rate": 0.0009667971401280979, "loss": 7.5475, "step": 1144100 }, { "epoch": 4.661219343530684, "grad_norm": 4.302090644836426, "learning_rate": 0.0009664065745717441, "loss": 7.5715, "step": 1144200 }, { "epoch": 4.661626721554065, "grad_norm": 13.303513526916504, "learning_rate": 0.0009660160698442181, "loss": 7.559, "step": 1144300 }, { "epoch": 4.662034099577447, "grad_norm": 3.0725815296173096, "learning_rate": 0.000965625625960958, "loss": 7.5932, "step": 1144400 }, { "epoch": 4.6624414776008285, "grad_norm": 11.4290132522583, "learning_rate": 0.0009652352429373998, "loss": 7.5794, "step": 1144500 }, { "epoch": 4.66284885562421, "grad_norm": 6.4777350425720215, "learning_rate": 0.0009648449207889775, "loss": 7.5705, "step": 1144600 }, { "epoch": 4.663256233647592, "grad_norm": 4.026091575622559, "learning_rate": 0.0009644546595311234, "loss": 7.5718, "step": 1144700 }, { "epoch": 4.663663611670973, "grad_norm": 11.589993476867676, "learning_rate": 0.0009640644591792654, "loss": 7.5862, "step": 1144800 }, { "epoch": 4.664070989694355, "grad_norm": 4.978834629058838, "learning_rate": 0.0009636743197488318, "loss": 7.5616, "step": 1144900 }, { "epoch": 4.664478367717736, "grad_norm": 11.424694061279297, "learning_rate": 0.0009632842412552456, "loss": 7.5634, "step": 1145000 }, { "epoch": 4.664478367717736, "eval_MaskedAccuracy": 0.5117445791108512, "eval_loss": 1.592308521270752, "eval_runtime": 185.3599, "eval_samples_per_second": 342.447, "eval_steps_per_second": 1.338, "step": 1145000 }, { "epoch": 4.664885745741118, "grad_norm": 13.615307807922363, "learning_rate": 0.0009628942237139287, "loss": 7.5524, "step": 1145100 }, { "epoch": 4.665293123764499, "grad_norm": 4.647578239440918, "learning_rate": 0.0009625042671403015, "loss": 7.5626, "step": 1145200 }, { "epoch": 4.66570050178788, "grad_norm": 6.316812515258789, "learning_rate": 0.0009621143715497798, "loss": 7.6045, "step": 1145300 }, { "epoch": 4.666107879811261, "grad_norm": 4.423739433288574, "learning_rate": 0.0009617245369577794, "loss": 7.5796, "step": 1145400 }, { "epoch": 4.666515257834643, "grad_norm": 6.143904209136963, "learning_rate": 0.0009613347633797118, "loss": 7.5647, "step": 1145500 }, { "epoch": 4.6669226358580245, "grad_norm": 5.285592555999756, "learning_rate": 0.000960945050830987, "loss": 7.5881, "step": 1145600 }, { "epoch": 4.667330013881406, "grad_norm": 9.723875999450684, "learning_rate": 0.0009605553993270131, "loss": 7.5708, "step": 1145700 }, { "epoch": 4.6677373919047875, "grad_norm": 6.489499568939209, "learning_rate": 0.0009601658088831943, "loss": 7.5466, "step": 1145800 }, { "epoch": 4.668144769928169, "grad_norm": 9.560879707336426, "learning_rate": 0.0009597762795149336, "loss": 7.5744, "step": 1145900 }, { "epoch": 4.668552147951551, "grad_norm": 4.059078693389893, "learning_rate": 0.0009593868112376308, "loss": 7.5617, "step": 1146000 }, { "epoch": 4.668552147951551, "eval_MaskedAccuracy": 0.5113877713197927, "eval_loss": 1.5960330963134766, "eval_runtime": 177.4826, "eval_samples_per_second": 357.646, "eval_steps_per_second": 1.397, "step": 1146000 }, { "epoch": 4.668959525974932, "grad_norm": 12.183576583862305, "learning_rate": 0.0009589974040666838, "loss": 7.5596, "step": 1146100 }, { "epoch": 4.669366903998314, "grad_norm": 7.5042619705200195, "learning_rate": 0.0009586080580174874, "loss": 7.575, "step": 1146200 }, { "epoch": 4.669774282021695, "grad_norm": 14.12924575805664, "learning_rate": 0.0009582187731054356, "loss": 7.5706, "step": 1146300 }, { "epoch": 4.670181660045077, "grad_norm": 4.307675838470459, "learning_rate": 0.0009578295493459185, "loss": 7.5781, "step": 1146400 }, { "epoch": 4.670589038068458, "grad_norm": 12.567525863647461, "learning_rate": 0.0009574403867543238, "loss": 7.5722, "step": 1146500 }, { "epoch": 4.670996416091839, "grad_norm": 7.56950044631958, "learning_rate": 0.000957051285346038, "loss": 7.5686, "step": 1146600 }, { "epoch": 4.67140379411522, "grad_norm": 12.223543167114258, "learning_rate": 0.0009566622451364445, "loss": 7.5831, "step": 1146700 }, { "epoch": 4.671811172138602, "grad_norm": 12.852731704711914, "learning_rate": 0.0009562732661409233, "loss": 7.5545, "step": 1146800 }, { "epoch": 4.6722185501619835, "grad_norm": 3.26822829246521, "learning_rate": 0.0009558843483748528, "loss": 7.5588, "step": 1146900 }, { "epoch": 4.672625928185365, "grad_norm": 9.585270881652832, "learning_rate": 0.000955495491853609, "loss": 7.561, "step": 1147000 }, { "epoch": 4.672625928185365, "eval_MaskedAccuracy": 0.511802373359456, "eval_loss": 1.58987557888031, "eval_runtime": 173.1924, "eval_samples_per_second": 366.506, "eval_steps_per_second": 1.432, "step": 1147000 }, { "epoch": 4.673033306208747, "grad_norm": 5.222931861877441, "learning_rate": 0.0009551066965925656, "loss": 7.5802, "step": 1147100 }, { "epoch": 4.673440684232128, "grad_norm": 14.629156112670898, "learning_rate": 0.0009547179626070933, "loss": 7.5598, "step": 1147200 }, { "epoch": 4.67384806225551, "grad_norm": 22.34994125366211, "learning_rate": 0.0009543292899125627, "loss": 7.5542, "step": 1147300 }, { "epoch": 4.674255440278891, "grad_norm": 10.149872779846191, "learning_rate": 0.000953940678524338, "loss": 7.5516, "step": 1147400 }, { "epoch": 4.674662818302273, "grad_norm": 15.584869384765625, "learning_rate": 0.0009535521284577845, "loss": 7.5781, "step": 1147500 }, { "epoch": 4.675070196325654, "grad_norm": 9.096953392028809, "learning_rate": 0.0009531636397282633, "loss": 7.5922, "step": 1147600 }, { "epoch": 4.675477574349035, "grad_norm": 5.695037841796875, "learning_rate": 0.0009527752123511332, "loss": 7.5604, "step": 1147700 }, { "epoch": 4.675884952372417, "grad_norm": 11.450030326843262, "learning_rate": 0.0009523868463417511, "loss": 7.5651, "step": 1147800 }, { "epoch": 4.676292330395798, "grad_norm": 15.076812744140625, "learning_rate": 0.0009519985417154695, "loss": 7.5982, "step": 1147900 }, { "epoch": 4.6766997084191795, "grad_norm": 5.078405380249023, "learning_rate": 0.0009516102984876421, "loss": 7.5564, "step": 1148000 }, { "epoch": 4.6766997084191795, "eval_MaskedAccuracy": 0.5112680448618053, "eval_loss": 1.6016606092453003, "eval_runtime": 159.3213, "eval_samples_per_second": 398.415, "eval_steps_per_second": 1.557, "step": 1148000 }, { "epoch": 4.677107086442561, "grad_norm": 9.016671180725098, "learning_rate": 0.0009512221166736176, "loss": 7.5621, "step": 1148100 }, { "epoch": 4.6775144644659425, "grad_norm": 7.202791213989258, "learning_rate": 0.0009508339962887438, "loss": 7.5468, "step": 1148200 }, { "epoch": 4.677921842489324, "grad_norm": 9.776394844055176, "learning_rate": 0.0009504459373483638, "loss": 7.5705, "step": 1148300 }, { "epoch": 4.678329220512706, "grad_norm": 13.585295677185059, "learning_rate": 0.0009500579398678206, "loss": 7.5861, "step": 1148400 }, { "epoch": 4.678736598536087, "grad_norm": 18.066389083862305, "learning_rate": 0.0009496700038624534, "loss": 7.5696, "step": 1148500 }, { "epoch": 4.679143976559469, "grad_norm": 4.05338716506958, "learning_rate": 0.0009492821293475995, "loss": 7.5646, "step": 1148600 }, { "epoch": 4.67955135458285, "grad_norm": 12.096548080444336, "learning_rate": 0.0009488943163385934, "loss": 7.5707, "step": 1148700 }, { "epoch": 4.679958732606232, "grad_norm": 11.088159561157227, "learning_rate": 0.0009485065648507686, "loss": 7.5828, "step": 1148800 }, { "epoch": 4.680366110629613, "grad_norm": 6.6414875984191895, "learning_rate": 0.000948118874899453, "loss": 7.58, "step": 1148900 }, { "epoch": 4.680773488652994, "grad_norm": 5.62465763092041, "learning_rate": 0.000947731246499975, "loss": 7.5294, "step": 1149000 }, { "epoch": 4.680773488652994, "eval_MaskedAccuracy": 0.5116329883426698, "eval_loss": 1.5883326530456543, "eval_runtime": 203.7743, "eval_samples_per_second": 311.502, "eval_steps_per_second": 1.217, "step": 1149000 }, { "epoch": 4.681180866676376, "grad_norm": 3.951120138168335, "learning_rate": 0.0009473436796676603, "loss": 7.5658, "step": 1149100 }, { "epoch": 4.681588244699757, "grad_norm": 8.422918319702148, "learning_rate": 0.0009469561744178301, "loss": 7.559, "step": 1149200 }, { "epoch": 4.6819956227231385, "grad_norm": 6.362574577331543, "learning_rate": 0.000946568730765806, "loss": 7.5737, "step": 1149300 }, { "epoch": 4.68240300074652, "grad_norm": 4.840132713317871, "learning_rate": 0.0009461813487269046, "loss": 7.5654, "step": 1149400 }, { "epoch": 4.682810378769902, "grad_norm": 8.584992408752441, "learning_rate": 0.0009457940283164404, "loss": 7.5624, "step": 1149500 }, { "epoch": 4.683217756793283, "grad_norm": 4.112203121185303, "learning_rate": 0.0009454067695497283, "loss": 7.566, "step": 1149600 }, { "epoch": 4.683625134816665, "grad_norm": 3.2341792583465576, "learning_rate": 0.0009450195724420789, "loss": 7.5753, "step": 1149700 }, { "epoch": 4.684032512840046, "grad_norm": 5.026054382324219, "learning_rate": 0.0009446324370087991, "loss": 7.5528, "step": 1149800 }, { "epoch": 4.684439890863428, "grad_norm": 8.230510711669922, "learning_rate": 0.0009442453632651933, "loss": 7.5804, "step": 1149900 }, { "epoch": 4.684847268886809, "grad_norm": 10.661288261413574, "learning_rate": 0.0009438583512265662, "loss": 7.5863, "step": 1150000 }, { "epoch": 4.684847268886809, "eval_MaskedAccuracy": 0.5112104708678854, "eval_loss": 1.59536612033844, "eval_runtime": 154.144, "eval_samples_per_second": 411.797, "eval_steps_per_second": 1.609, "step": 1150000 }, { "epoch": 4.685254646910191, "grad_norm": 4.505941867828369, "learning_rate": 0.0009434714009082187, "loss": 7.5788, "step": 1150100 }, { "epoch": 4.685662024933572, "grad_norm": 4.621151447296143, "learning_rate": 0.0009430845123254485, "loss": 7.5662, "step": 1150200 }, { "epoch": 4.686069402956953, "grad_norm": 13.552003860473633, "learning_rate": 0.0009426976854935505, "loss": 7.5705, "step": 1150300 }, { "epoch": 4.6864767809803345, "grad_norm": 10.98857307434082, "learning_rate": 0.0009423109204278197, "loss": 7.5407, "step": 1150400 }, { "epoch": 4.686884159003716, "grad_norm": 7.751584529876709, "learning_rate": 0.0009419242171435458, "loss": 7.5761, "step": 1150500 }, { "epoch": 4.6872915370270976, "grad_norm": 5.739630222320557, "learning_rate": 0.0009415375756560168, "loss": 7.5842, "step": 1150600 }, { "epoch": 4.687698915050479, "grad_norm": 4.1098408699035645, "learning_rate": 0.0009411509959805195, "loss": 7.5826, "step": 1150700 }, { "epoch": 4.688106293073861, "grad_norm": 3.4388153553009033, "learning_rate": 0.0009407644781323382, "loss": 7.5715, "step": 1150800 }, { "epoch": 4.688513671097242, "grad_norm": 3.127047061920166, "learning_rate": 0.0009403780221267528, "loss": 7.575, "step": 1150900 }, { "epoch": 4.688921049120624, "grad_norm": 10.35447883605957, "learning_rate": 0.0009399916279790423, "loss": 7.5464, "step": 1151000 }, { "epoch": 4.688921049120624, "eval_MaskedAccuracy": 0.5115344628783024, "eval_loss": 1.5853630304336548, "eval_runtime": 156.1249, "eval_samples_per_second": 406.572, "eval_steps_per_second": 1.588, "step": 1151000 }, { "epoch": 4.689328427144005, "grad_norm": 3.8142848014831543, "learning_rate": 0.0009396052957044819, "loss": 7.5292, "step": 1151100 }, { "epoch": 4.689735805167387, "grad_norm": 10.079751014709473, "learning_rate": 0.0009392190253183472, "loss": 7.5651, "step": 1151200 }, { "epoch": 4.690143183190768, "grad_norm": 4.882888317108154, "learning_rate": 0.000938832816835908, "loss": 7.5667, "step": 1151300 }, { "epoch": 4.69055056121415, "grad_norm": 4.196845054626465, "learning_rate": 0.0009384466702724349, "loss": 7.5767, "step": 1151400 }, { "epoch": 4.690957939237531, "grad_norm": 7.795995712280273, "learning_rate": 0.0009380605856431927, "loss": 7.5496, "step": 1151500 }, { "epoch": 4.691365317260912, "grad_norm": 4.899525165557861, "learning_rate": 0.0009376745629634473, "loss": 7.5885, "step": 1151600 }, { "epoch": 4.6917726952842935, "grad_norm": 7.629162311553955, "learning_rate": 0.0009372886022484589, "loss": 7.5856, "step": 1151700 }, { "epoch": 4.692180073307675, "grad_norm": 5.845236301422119, "learning_rate": 0.0009369027035134867, "loss": 7.5688, "step": 1151800 }, { "epoch": 4.692587451331057, "grad_norm": 5.813826560974121, "learning_rate": 0.0009365168667737876, "loss": 7.5521, "step": 1151900 }, { "epoch": 4.692994829354438, "grad_norm": 10.613510131835938, "learning_rate": 0.0009361310920446154, "loss": 7.5786, "step": 1152000 }, { "epoch": 4.692994829354438, "eval_MaskedAccuracy": 0.5118310677127698, "eval_loss": 1.5911270380020142, "eval_runtime": 155.8643, "eval_samples_per_second": 407.252, "eval_steps_per_second": 1.591, "step": 1152000 }, { "epoch": 4.69340220737782, "grad_norm": 4.008493900299072, "learning_rate": 0.0009357453793412213, "loss": 7.597, "step": 1152100 }, { "epoch": 4.693809585401201, "grad_norm": 4.623472213745117, "learning_rate": 0.0009353597286788566, "loss": 7.5364, "step": 1152200 }, { "epoch": 4.694216963424583, "grad_norm": 17.645742416381836, "learning_rate": 0.000934974140072766, "loss": 7.5699, "step": 1152300 }, { "epoch": 4.694624341447964, "grad_norm": 9.060762405395508, "learning_rate": 0.0009345886135381944, "loss": 7.5748, "step": 1152400 }, { "epoch": 4.695031719471346, "grad_norm": 6.78955602645874, "learning_rate": 0.0009342031490903841, "loss": 7.5464, "step": 1152500 }, { "epoch": 4.695439097494727, "grad_norm": 3.801654815673828, "learning_rate": 0.0009338177467445753, "loss": 7.5737, "step": 1152600 }, { "epoch": 4.695846475518108, "grad_norm": 8.123143196105957, "learning_rate": 0.0009334324065160029, "loss": 7.5737, "step": 1152700 }, { "epoch": 4.69625385354149, "grad_norm": 5.954250335693359, "learning_rate": 0.0009330471284199043, "loss": 7.5386, "step": 1152800 }, { "epoch": 4.696661231564871, "grad_norm": 16.418846130371094, "learning_rate": 0.0009326619124715094, "loss": 7.5939, "step": 1152900 }, { "epoch": 4.697068609588253, "grad_norm": 7.963831901550293, "learning_rate": 0.0009322767586860495, "loss": 7.5794, "step": 1153000 }, { "epoch": 4.697068609588253, "eval_MaskedAccuracy": 0.5114758930732738, "eval_loss": 1.5934879779815674, "eval_runtime": 180.4163, "eval_samples_per_second": 351.831, "eval_steps_per_second": 1.375, "step": 1153000 }, { "epoch": 4.697475987611634, "grad_norm": 4.721909046173096, "learning_rate": 0.0009318916670787493, "loss": 7.5947, "step": 1153100 }, { "epoch": 4.697883365635016, "grad_norm": 8.068775177001953, "learning_rate": 0.0009315066376648359, "loss": 7.5498, "step": 1153200 }, { "epoch": 4.698290743658397, "grad_norm": 5.424630165100098, "learning_rate": 0.0009311216704595312, "loss": 7.5658, "step": 1153300 }, { "epoch": 4.698698121681779, "grad_norm": 16.26315689086914, "learning_rate": 0.0009307367654780546, "loss": 7.5756, "step": 1153400 }, { "epoch": 4.69910549970516, "grad_norm": 4.9227495193481445, "learning_rate": 0.0009303519227356221, "loss": 7.568, "step": 1153500 }, { "epoch": 4.699512877728542, "grad_norm": 8.267171859741211, "learning_rate": 0.0009299671422474505, "loss": 7.562, "step": 1153600 }, { "epoch": 4.699920255751923, "grad_norm": 5.523322105407715, "learning_rate": 0.0009295824240287522, "loss": 7.551, "step": 1153700 }, { "epoch": 4.700327633775305, "grad_norm": 5.789429664611816, "learning_rate": 0.0009291977680947362, "loss": 7.5907, "step": 1153800 }, { "epoch": 4.700735011798686, "grad_norm": 11.09715461730957, "learning_rate": 0.0009288131744606112, "loss": 7.5964, "step": 1153900 }, { "epoch": 4.701142389822067, "grad_norm": 5.65620231628418, "learning_rate": 0.000928428643141581, "loss": 7.5622, "step": 1154000 }, { "epoch": 4.701142389822067, "eval_MaskedAccuracy": 0.5111806981310292, "eval_loss": 1.5976860523223877, "eval_runtime": 154.3055, "eval_samples_per_second": 411.366, "eval_steps_per_second": 1.607, "step": 1154000 }, { "epoch": 4.701549767845449, "grad_norm": 5.494576454162598, "learning_rate": 0.0009280441741528481, "loss": 7.5637, "step": 1154100 }, { "epoch": 4.70195714586883, "grad_norm": 12.01253604888916, "learning_rate": 0.0009276597675096129, "loss": 7.5816, "step": 1154200 }, { "epoch": 4.702364523892212, "grad_norm": 9.708366394042969, "learning_rate": 0.0009272754232270741, "loss": 7.5551, "step": 1154300 }, { "epoch": 4.702771901915593, "grad_norm": 15.648822784423828, "learning_rate": 0.0009268911413204264, "loss": 7.5346, "step": 1154400 }, { "epoch": 4.703179279938975, "grad_norm": 6.7168450355529785, "learning_rate": 0.0009265069218048618, "loss": 7.5566, "step": 1154500 }, { "epoch": 4.703586657962356, "grad_norm": 7.411017417907715, "learning_rate": 0.0009261227646955714, "loss": 7.5763, "step": 1154600 }, { "epoch": 4.703994035985738, "grad_norm": 6.240413188934326, "learning_rate": 0.0009257386700077413, "loss": 7.5579, "step": 1154700 }, { "epoch": 4.704401414009119, "grad_norm": 4.57463264465332, "learning_rate": 0.0009253546377565591, "loss": 7.5935, "step": 1154800 }, { "epoch": 4.704808792032501, "grad_norm": 3.5772671699523926, "learning_rate": 0.000924970667957206, "loss": 7.5578, "step": 1154900 }, { "epoch": 4.705216170055882, "grad_norm": 4.515414714813232, "learning_rate": 0.0009245867606248639, "loss": 7.5838, "step": 1155000 }, { "epoch": 4.705216170055882, "eval_MaskedAccuracy": 0.5115335460324503, "eval_loss": 1.6010119915008545, "eval_runtime": 153.3033, "eval_samples_per_second": 414.055, "eval_steps_per_second": 1.618, "step": 1155000 }, { "epoch": 4.705623548079264, "grad_norm": 18.80776023864746, "learning_rate": 0.0009242029157747076, "loss": 7.5651, "step": 1155100 }, { "epoch": 4.706030926102645, "grad_norm": 4.243402481079102, "learning_rate": 0.0009238191334219159, "loss": 7.5421, "step": 1155200 }, { "epoch": 4.706438304126026, "grad_norm": 10.762371063232422, "learning_rate": 0.000923435413581661, "loss": 7.5476, "step": 1155300 }, { "epoch": 4.706845682149408, "grad_norm": 8.20543384552002, "learning_rate": 0.0009230517562691131, "loss": 7.5626, "step": 1155400 }, { "epoch": 4.707253060172789, "grad_norm": 6.052391052246094, "learning_rate": 0.0009226681614994401, "loss": 7.573, "step": 1155500 }, { "epoch": 4.707660438196171, "grad_norm": 3.0920984745025635, "learning_rate": 0.0009222846292878069, "loss": 7.5171, "step": 1155600 }, { "epoch": 4.708067816219552, "grad_norm": 13.645121574401855, "learning_rate": 0.0009219011596493782, "loss": 7.5465, "step": 1155700 }, { "epoch": 4.708475194242934, "grad_norm": 5.219463348388672, "learning_rate": 0.0009215177525993128, "loss": 7.5827, "step": 1155800 }, { "epoch": 4.708882572266315, "grad_norm": 6.038544654846191, "learning_rate": 0.0009211344081527708, "loss": 7.5518, "step": 1155900 }, { "epoch": 4.709289950289697, "grad_norm": 11.335884094238281, "learning_rate": 0.0009207511263249059, "loss": 7.5398, "step": 1156000 }, { "epoch": 4.709289950289697, "eval_MaskedAccuracy": 0.5115045317851677, "eval_loss": 1.5966569185256958, "eval_runtime": 168.76, "eval_samples_per_second": 376.132, "eval_steps_per_second": 1.47, "step": 1156000 }, { "epoch": 4.709697328313078, "grad_norm": 11.417588233947754, "learning_rate": 0.0009203679071308713, "loss": 7.5605, "step": 1156100 }, { "epoch": 4.71010470633646, "grad_norm": 9.42202377319336, "learning_rate": 0.0009199847505858197, "loss": 7.5569, "step": 1156200 }, { "epoch": 4.710512084359841, "grad_norm": 13.6173677444458, "learning_rate": 0.0009196016567048987, "loss": 7.5665, "step": 1156300 }, { "epoch": 4.710919462383223, "grad_norm": 5.46528434753418, "learning_rate": 0.0009192186255032525, "loss": 7.5827, "step": 1156400 }, { "epoch": 4.711326840406604, "grad_norm": 9.322909355163574, "learning_rate": 0.0009188356569960253, "loss": 7.581, "step": 1156500 }, { "epoch": 4.711734218429985, "grad_norm": 11.99737548828125, "learning_rate": 0.000918452751198359, "loss": 7.5692, "step": 1156600 }, { "epoch": 4.712141596453367, "grad_norm": 13.795363426208496, "learning_rate": 0.0009180699081253903, "loss": 7.5569, "step": 1156700 }, { "epoch": 4.712548974476748, "grad_norm": 6.006650924682617, "learning_rate": 0.0009176871277922558, "loss": 7.5583, "step": 1156800 }, { "epoch": 4.71295635250013, "grad_norm": 5.746901512145996, "learning_rate": 0.0009173044102140893, "loss": 7.5764, "step": 1156900 }, { "epoch": 4.713363730523511, "grad_norm": 6.087377071380615, "learning_rate": 0.0009169217554060214, "loss": 7.5724, "step": 1157000 }, { "epoch": 4.713363730523511, "eval_MaskedAccuracy": 0.5118616903480597, "eval_loss": 1.590957522392273, "eval_runtime": 166.5876, "eval_samples_per_second": 381.037, "eval_steps_per_second": 1.489, "step": 1157000 }, { "epoch": 4.713771108546893, "grad_norm": 5.474801540374756, "learning_rate": 0.0009165391633831785, "loss": 7.5729, "step": 1157100 }, { "epoch": 4.714178486570274, "grad_norm": 6.61529541015625, "learning_rate": 0.0009161566341606887, "loss": 7.5782, "step": 1157200 }, { "epoch": 4.714585864593656, "grad_norm": 8.765612602233887, "learning_rate": 0.0009157741677536754, "loss": 7.5509, "step": 1157300 }, { "epoch": 4.714993242617037, "grad_norm": 3.340832471847534, "learning_rate": 0.0009153917641772582, "loss": 7.5786, "step": 1157400 }, { "epoch": 4.715400620640419, "grad_norm": 3.8586692810058594, "learning_rate": 0.0009150094234465567, "loss": 7.5683, "step": 1157500 }, { "epoch": 4.7158079986638, "grad_norm": 10.54784870147705, "learning_rate": 0.000914627145576686, "loss": 7.5576, "step": 1157600 }, { "epoch": 4.716215376687181, "grad_norm": 8.375208854675293, "learning_rate": 0.000914244930582762, "loss": 7.5488, "step": 1157700 }, { "epoch": 4.7166227547105635, "grad_norm": 8.476005554199219, "learning_rate": 0.0009138627784798932, "loss": 7.5585, "step": 1157800 }, { "epoch": 4.717030132733944, "grad_norm": 6.5658721923828125, "learning_rate": 0.0009134806892831892, "loss": 7.5327, "step": 1157900 }, { "epoch": 4.717437510757326, "grad_norm": 4.999338150024414, "learning_rate": 0.0009130986630077549, "loss": 7.5549, "step": 1158000 }, { "epoch": 4.717437510757326, "eval_MaskedAccuracy": 0.5116032715326428, "eval_loss": 1.596752405166626, "eval_runtime": 218.3894, "eval_samples_per_second": 290.655, "eval_steps_per_second": 1.136, "step": 1158000 }, { "epoch": 4.717844888780707, "grad_norm": 4.15541410446167, "learning_rate": 0.0009127166996686937, "loss": 7.5502, "step": 1158100 }, { "epoch": 4.718252266804089, "grad_norm": 9.1082763671875, "learning_rate": 0.0009123347992811084, "loss": 7.5645, "step": 1158200 }, { "epoch": 4.71865964482747, "grad_norm": 7.027690410614014, "learning_rate": 0.0009119529618600974, "loss": 7.5473, "step": 1158300 }, { "epoch": 4.719067022850852, "grad_norm": 4.834759712219238, "learning_rate": 0.0009115711874207551, "loss": 7.5449, "step": 1158400 }, { "epoch": 4.719474400874233, "grad_norm": 4.868383407592773, "learning_rate": 0.0009111894759781747, "loss": 7.5653, "step": 1158500 }, { "epoch": 4.719881778897615, "grad_norm": 6.506927490234375, "learning_rate": 0.00091080782754745, "loss": 7.5646, "step": 1158600 }, { "epoch": 4.720289156920996, "grad_norm": 4.405208587646484, "learning_rate": 0.0009104262421436677, "loss": 7.5832, "step": 1158700 }, { "epoch": 4.720696534944378, "grad_norm": 8.431148529052734, "learning_rate": 0.0009100447197819151, "loss": 7.5627, "step": 1158800 }, { "epoch": 4.721103912967759, "grad_norm": 6.002216815948486, "learning_rate": 0.0009096632604772751, "loss": 7.5574, "step": 1158900 }, { "epoch": 4.72151129099114, "grad_norm": 4.2939229011535645, "learning_rate": 0.0009092818642448285, "loss": 7.5764, "step": 1159000 }, { "epoch": 4.72151129099114, "eval_MaskedAccuracy": 0.5107660818319678, "eval_loss": 1.598768949508667, "eval_runtime": 165.1411, "eval_samples_per_second": 384.374, "eval_steps_per_second": 1.502, "step": 1159000 }, { "epoch": 4.7219186690145225, "grad_norm": 4.288996696472168, "learning_rate": 0.0009089005310996544, "loss": 7.5438, "step": 1159100 }, { "epoch": 4.722326047037903, "grad_norm": 5.802402496337891, "learning_rate": 0.0009085192610568289, "loss": 7.57, "step": 1159200 }, { "epoch": 4.722733425061285, "grad_norm": 11.19207763671875, "learning_rate": 0.0009081380541314263, "loss": 7.5694, "step": 1159300 }, { "epoch": 4.723140803084666, "grad_norm": 8.473098754882812, "learning_rate": 0.0009077569103385164, "loss": 7.5503, "step": 1159400 }, { "epoch": 4.723548181108048, "grad_norm": 4.782020568847656, "learning_rate": 0.0009073758296931687, "loss": 7.5529, "step": 1159500 }, { "epoch": 4.723955559131429, "grad_norm": 3.8732857704162598, "learning_rate": 0.0009069948122104485, "loss": 7.5557, "step": 1159600 }, { "epoch": 4.724362937154811, "grad_norm": 5.217780113220215, "learning_rate": 0.0009066138579054214, "loss": 7.5645, "step": 1159700 }, { "epoch": 4.724770315178192, "grad_norm": 7.695767402648926, "learning_rate": 0.0009062329667931455, "loss": 7.5688, "step": 1159800 }, { "epoch": 4.725177693201574, "grad_norm": 13.585648536682129, "learning_rate": 0.0009058521388886819, "loss": 7.5559, "step": 1159900 }, { "epoch": 4.725585071224955, "grad_norm": 5.548020362854004, "learning_rate": 0.0009054713742070851, "loss": 7.5506, "step": 1160000 }, { "epoch": 4.725585071224955, "eval_MaskedAccuracy": 0.5112732356047921, "eval_loss": 1.60223388671875, "eval_runtime": 165.0537, "eval_samples_per_second": 384.578, "eval_steps_per_second": 1.503, "step": 1160000 }, { "epoch": 4.725992449248337, "grad_norm": 3.6210856437683105, "learning_rate": 0.00090509067276341, "loss": 7.5474, "step": 1160100 }, { "epoch": 4.7263998272717185, "grad_norm": 8.424483299255371, "learning_rate": 0.0009047100345727088, "loss": 7.5553, "step": 1160200 }, { "epoch": 4.726807205295099, "grad_norm": 15.483039855957031, "learning_rate": 0.0009043294596500279, "loss": 7.5684, "step": 1160300 }, { "epoch": 4.727214583318481, "grad_norm": 6.544163703918457, "learning_rate": 0.0009039489480104152, "loss": 7.5656, "step": 1160400 }, { "epoch": 4.727621961341862, "grad_norm": 6.401214599609375, "learning_rate": 0.0009035684996689137, "loss": 7.5752, "step": 1160500 }, { "epoch": 4.728029339365244, "grad_norm": 7.120314598083496, "learning_rate": 0.000903188114640565, "loss": 7.5552, "step": 1160600 }, { "epoch": 4.728436717388625, "grad_norm": 11.215267181396484, "learning_rate": 0.0009028077929404073, "loss": 7.5178, "step": 1160700 }, { "epoch": 4.728844095412007, "grad_norm": 8.733551025390625, "learning_rate": 0.0009024275345834758, "loss": 7.5551, "step": 1160800 }, { "epoch": 4.729251473435388, "grad_norm": 6.027173042297363, "learning_rate": 0.0009020473395848047, "loss": 7.549, "step": 1160900 }, { "epoch": 4.72965885145877, "grad_norm": 4.815136909484863, "learning_rate": 0.0009016672079594246, "loss": 7.5705, "step": 1161000 }, { "epoch": 4.72965885145877, "eval_MaskedAccuracy": 0.5118587553350832, "eval_loss": 1.596041202545166, "eval_runtime": 173.4566, "eval_samples_per_second": 365.947, "eval_steps_per_second": 1.43, "step": 1161000 }, { "epoch": 4.730066229482151, "grad_norm": 8.049763679504395, "learning_rate": 0.0009012871397223662, "loss": 7.5641, "step": 1161100 }, { "epoch": 4.730473607505533, "grad_norm": 9.600907325744629, "learning_rate": 0.0009009071348886541, "loss": 7.5797, "step": 1161200 }, { "epoch": 4.7308809855289145, "grad_norm": 3.3699300289154053, "learning_rate": 0.0009005271934733123, "loss": 7.5313, "step": 1161300 }, { "epoch": 4.731288363552296, "grad_norm": 4.550024509429932, "learning_rate": 0.0009001473154913624, "loss": 7.5517, "step": 1161400 }, { "epoch": 4.7316957415756775, "grad_norm": 17.960681915283203, "learning_rate": 0.000899767500957822, "loss": 7.5803, "step": 1161500 }, { "epoch": 4.732103119599058, "grad_norm": 8.197354316711426, "learning_rate": 0.0008993877498877079, "loss": 7.5665, "step": 1161600 }, { "epoch": 4.73251049762244, "grad_norm": 5.120255947113037, "learning_rate": 0.0008990080622960325, "loss": 7.5399, "step": 1161700 }, { "epoch": 4.732917875645821, "grad_norm": 3.9899091720581055, "learning_rate": 0.0008986284381978085, "loss": 7.5394, "step": 1161800 }, { "epoch": 4.733325253669203, "grad_norm": 12.039031982421875, "learning_rate": 0.0008982488776080437, "loss": 7.57, "step": 1161900 }, { "epoch": 4.733732631692584, "grad_norm": 4.0215911865234375, "learning_rate": 0.000897869380541744, "loss": 7.5371, "step": 1162000 }, { "epoch": 4.733732631692584, "eval_MaskedAccuracy": 0.5118360792934855, "eval_loss": 1.5812571048736572, "eval_runtime": 150.9453, "eval_samples_per_second": 420.523, "eval_steps_per_second": 1.643, "step": 1162000 }, { "epoch": 4.734140009715966, "grad_norm": 10.553079605102539, "learning_rate": 0.0008974899470139132, "loss": 7.5778, "step": 1162100 }, { "epoch": 4.734547387739347, "grad_norm": 3.1538913249969482, "learning_rate": 0.0008971105770395535, "loss": 7.5433, "step": 1162200 }, { "epoch": 4.734954765762729, "grad_norm": 4.568304061889648, "learning_rate": 0.000896731270633662, "loss": 7.569, "step": 1162300 }, { "epoch": 4.73536214378611, "grad_norm": 7.588132858276367, "learning_rate": 0.0008963520278112351, "loss": 7.5664, "step": 1162400 }, { "epoch": 4.735769521809492, "grad_norm": 7.682785987854004, "learning_rate": 0.0008959728485872658, "loss": 7.5311, "step": 1162500 }, { "epoch": 4.7361768998328735, "grad_norm": 6.86749267578125, "learning_rate": 0.0008955937329767455, "loss": 7.5475, "step": 1162600 }, { "epoch": 4.736584277856254, "grad_norm": 12.38428020477295, "learning_rate": 0.0008952146809946629, "loss": 7.5966, "step": 1162700 }, { "epoch": 4.7369916558796366, "grad_norm": 12.370417594909668, "learning_rate": 0.0008948356926560034, "loss": 7.5645, "step": 1162800 }, { "epoch": 4.737399033903017, "grad_norm": 3.9510908126831055, "learning_rate": 0.0008944567679757506, "loss": 7.548, "step": 1162900 }, { "epoch": 4.737806411926399, "grad_norm": 7.90590763092041, "learning_rate": 0.0008940779069688855, "loss": 7.5456, "step": 1163000 }, { "epoch": 4.737806411926399, "eval_MaskedAccuracy": 0.5119326063557618, "eval_loss": 1.586570382118225, "eval_runtime": 154.9261, "eval_samples_per_second": 409.718, "eval_steps_per_second": 1.601, "step": 1163000 }, { "epoch": 4.73821378994978, "grad_norm": 5.847414970397949, "learning_rate": 0.000893699109650387, "loss": 7.559, "step": 1163100 }, { "epoch": 4.738621167973162, "grad_norm": 10.857290267944336, "learning_rate": 0.0008933203760352303, "loss": 7.5917, "step": 1163200 }, { "epoch": 4.739028545996543, "grad_norm": 7.471684455871582, "learning_rate": 0.0008929417061383891, "loss": 7.5403, "step": 1163300 }, { "epoch": 4.739435924019925, "grad_norm": 7.344918727874756, "learning_rate": 0.0008925630999748348, "loss": 7.5972, "step": 1163400 }, { "epoch": 4.739843302043306, "grad_norm": 11.058645248413086, "learning_rate": 0.0008921845575595341, "loss": 7.5894, "step": 1163500 }, { "epoch": 4.740250680066688, "grad_norm": 15.42564582824707, "learning_rate": 0.0008918060789074541, "loss": 7.5493, "step": 1163600 }, { "epoch": 4.7406580580900695, "grad_norm": 12.14067554473877, "learning_rate": 0.0008914276640335578, "loss": 7.5389, "step": 1163700 }, { "epoch": 4.741065436113451, "grad_norm": 9.472207069396973, "learning_rate": 0.000891049312952807, "loss": 7.5601, "step": 1163800 }, { "epoch": 4.7414728141368325, "grad_norm": 10.546226501464844, "learning_rate": 0.0008906710256801578, "loss": 7.5863, "step": 1163900 }, { "epoch": 4.741880192160213, "grad_norm": 6.95032262802124, "learning_rate": 0.0008902928022305664, "loss": 7.5769, "step": 1164000 }, { "epoch": 4.741880192160213, "eval_MaskedAccuracy": 0.5117164427856976, "eval_loss": 1.5969483852386475, "eval_runtime": 170.8262, "eval_samples_per_second": 371.582, "eval_steps_per_second": 1.452, "step": 1164000 }, { "epoch": 4.742287570183595, "grad_norm": 9.718832969665527, "learning_rate": 0.0008899146426189865, "loss": 7.5576, "step": 1164100 }, { "epoch": 4.742694948206976, "grad_norm": 11.1882963180542, "learning_rate": 0.0008895365468603693, "loss": 7.5773, "step": 1164200 }, { "epoch": 4.743102326230358, "grad_norm": 8.55040168762207, "learning_rate": 0.0008891585149696628, "loss": 7.5588, "step": 1164300 }, { "epoch": 4.743509704253739, "grad_norm": 13.320935249328613, "learning_rate": 0.0008887805469618123, "loss": 7.5798, "step": 1164400 }, { "epoch": 4.743917082277121, "grad_norm": 7.613979816436768, "learning_rate": 0.0008884026428517607, "loss": 7.5466, "step": 1164500 }, { "epoch": 4.744324460300502, "grad_norm": 8.136162757873535, "learning_rate": 0.0008880248026544489, "loss": 7.59, "step": 1164600 }, { "epoch": 4.744731838323884, "grad_norm": 8.828904151916504, "learning_rate": 0.0008876470263848153, "loss": 7.5592, "step": 1164700 }, { "epoch": 4.745139216347265, "grad_norm": 5.564899444580078, "learning_rate": 0.0008872693140577958, "loss": 7.56, "step": 1164800 }, { "epoch": 4.745546594370647, "grad_norm": 8.49870777130127, "learning_rate": 0.0008868916656883212, "loss": 7.567, "step": 1164900 }, { "epoch": 4.7459539723940285, "grad_norm": 9.09511661529541, "learning_rate": 0.000886514081291322, "loss": 7.5508, "step": 1165000 }, { "epoch": 4.7459539723940285, "eval_MaskedAccuracy": 0.5113661104288739, "eval_loss": 1.6031959056854248, "eval_runtime": 190.2801, "eval_samples_per_second": 333.593, "eval_steps_per_second": 1.303, "step": 1165000 }, { "epoch": 4.74636135041741, "grad_norm": 7.04810905456543, "learning_rate": 0.0008861365608817294, "loss": 7.5461, "step": 1165100 }, { "epoch": 4.746768728440792, "grad_norm": 6.270197868347168, "learning_rate": 0.0008857591044744663, "loss": 7.581, "step": 1165200 }, { "epoch": 4.747176106464172, "grad_norm": 12.531331062316895, "learning_rate": 0.0008853817120844558, "loss": 7.5635, "step": 1165300 }, { "epoch": 4.747583484487554, "grad_norm": 4.336494445800781, "learning_rate": 0.000885004383726619, "loss": 7.5655, "step": 1165400 }, { "epoch": 4.747990862510935, "grad_norm": 5.909377574920654, "learning_rate": 0.0008846271194158728, "loss": 7.5881, "step": 1165500 }, { "epoch": 4.748398240534317, "grad_norm": 5.464470386505127, "learning_rate": 0.0008842499191671333, "loss": 7.5237, "step": 1165600 }, { "epoch": 4.748805618557698, "grad_norm": 6.150482177734375, "learning_rate": 0.0008838727829953133, "loss": 7.5772, "step": 1165700 }, { "epoch": 4.74921299658108, "grad_norm": 18.878080368041992, "learning_rate": 0.0008834957109153224, "loss": 7.5897, "step": 1165800 }, { "epoch": 4.749620374604461, "grad_norm": 5.610697269439697, "learning_rate": 0.0008831187029420681, "loss": 7.5657, "step": 1165900 }, { "epoch": 4.750027752627843, "grad_norm": 10.221170425415039, "learning_rate": 0.0008827417590904548, "loss": 7.5678, "step": 1166000 }, { "epoch": 4.750027752627843, "eval_MaskedAccuracy": 0.5113002197967617, "eval_loss": 1.5943676233291626, "eval_runtime": 164.9484, "eval_samples_per_second": 384.823, "eval_steps_per_second": 1.504, "step": 1166000 }, { "epoch": 4.7504351306512245, "grad_norm": 8.151117324829102, "learning_rate": 0.0008823648793753875, "loss": 7.5369, "step": 1166100 }, { "epoch": 4.750842508674606, "grad_norm": 3.9326717853546143, "learning_rate": 0.0008819880638117646, "loss": 7.5922, "step": 1166200 }, { "epoch": 4.7512498866979875, "grad_norm": 6.830138206481934, "learning_rate": 0.0008816113124144843, "loss": 7.5527, "step": 1166300 }, { "epoch": 4.751657264721369, "grad_norm": 19.084400177001953, "learning_rate": 0.0008812346251984412, "loss": 7.5512, "step": 1166400 }, { "epoch": 4.752064642744751, "grad_norm": 11.66727066040039, "learning_rate": 0.0008808580021785284, "loss": 7.5515, "step": 1166500 }, { "epoch": 4.752472020768131, "grad_norm": 4.68572998046875, "learning_rate": 0.0008804814433696352, "loss": 7.5802, "step": 1166600 }, { "epoch": 4.752879398791513, "grad_norm": 15.044259071350098, "learning_rate": 0.0008801049487866491, "loss": 7.5799, "step": 1166700 }, { "epoch": 4.753286776814894, "grad_norm": 4.469608783721924, "learning_rate": 0.0008797285184444543, "loss": 7.5697, "step": 1166800 }, { "epoch": 4.753694154838276, "grad_norm": 10.37489128112793, "learning_rate": 0.0008793521523579333, "loss": 7.5539, "step": 1166900 }, { "epoch": 4.754101532861657, "grad_norm": 5.180149078369141, "learning_rate": 0.0008789758505419663, "loss": 7.5689, "step": 1167000 }, { "epoch": 4.754101532861657, "eval_MaskedAccuracy": 0.5114207717471436, "eval_loss": 1.5994853973388672, "eval_runtime": 162.4343, "eval_samples_per_second": 390.78, "eval_steps_per_second": 1.527, "step": 1167000 }, { "epoch": 4.754508910885039, "grad_norm": 10.346702575683594, "learning_rate": 0.000878599613011431, "loss": 7.5567, "step": 1167100 }, { "epoch": 4.75491628890842, "grad_norm": 8.587042808532715, "learning_rate": 0.0008782234397812026, "loss": 7.5207, "step": 1167200 }, { "epoch": 4.755323666931802, "grad_norm": 3.1612582206726074, "learning_rate": 0.0008778473308661511, "loss": 7.5726, "step": 1167300 }, { "epoch": 4.7557310449551835, "grad_norm": 4.133459568023682, "learning_rate": 0.000877471286281146, "loss": 7.5411, "step": 1167400 }, { "epoch": 4.756138422978565, "grad_norm": 12.26366138458252, "learning_rate": 0.0008770953060410577, "loss": 7.5428, "step": 1167500 }, { "epoch": 4.756545801001947, "grad_norm": 8.328948974609375, "learning_rate": 0.0008767193901607479, "loss": 7.5525, "step": 1167600 }, { "epoch": 4.756953179025327, "grad_norm": 12.7047119140625, "learning_rate": 0.0008763435386550798, "loss": 7.5579, "step": 1167700 }, { "epoch": 4.75736055704871, "grad_norm": 6.194194316864014, "learning_rate": 0.0008759677515389119, "loss": 7.5937, "step": 1167800 }, { "epoch": 4.75776793507209, "grad_norm": 7.560429096221924, "learning_rate": 0.000875592028827102, "loss": 7.5618, "step": 1167900 }, { "epoch": 4.758175313095472, "grad_norm": 11.472549438476562, "learning_rate": 0.000875216370534502, "loss": 7.5542, "step": 1168000 }, { "epoch": 4.758175313095472, "eval_MaskedAccuracy": 0.5114655179978685, "eval_loss": 1.5953867435455322, "eval_runtime": 164.6842, "eval_samples_per_second": 385.441, "eval_steps_per_second": 1.506, "step": 1168000 }, { "epoch": 4.758582691118853, "grad_norm": 5.689689636230469, "learning_rate": 0.0008748407766759673, "loss": 7.5692, "step": 1168100 }, { "epoch": 4.758990069142235, "grad_norm": 5.820748805999756, "learning_rate": 0.0008744652472663456, "loss": 7.5568, "step": 1168200 }, { "epoch": 4.759397447165616, "grad_norm": 4.162635326385498, "learning_rate": 0.0008740897823204831, "loss": 7.5429, "step": 1168300 }, { "epoch": 4.759804825188998, "grad_norm": 8.76908016204834, "learning_rate": 0.0008737143818532241, "loss": 7.5525, "step": 1168400 }, { "epoch": 4.7602122032123795, "grad_norm": 6.940735816955566, "learning_rate": 0.0008733390458794099, "loss": 7.6008, "step": 1168500 }, { "epoch": 4.760619581235761, "grad_norm": 9.81445026397705, "learning_rate": 0.0008729637744138795, "loss": 7.5677, "step": 1168600 }, { "epoch": 4.7610269592591425, "grad_norm": 3.841867446899414, "learning_rate": 0.0008725885674714701, "loss": 7.5302, "step": 1168700 }, { "epoch": 4.761434337282524, "grad_norm": 6.942512512207031, "learning_rate": 0.0008722134250670156, "loss": 7.5518, "step": 1168800 }, { "epoch": 4.761841715305906, "grad_norm": 6.805759906768799, "learning_rate": 0.0008718383472153469, "loss": 7.5466, "step": 1168900 }, { "epoch": 4.762249093329286, "grad_norm": 7.043669700622559, "learning_rate": 0.0008714633339312923, "loss": 7.5593, "step": 1169000 }, { "epoch": 4.762249093329286, "eval_MaskedAccuracy": 0.5117339740654038, "eval_loss": 1.5920907258987427, "eval_runtime": 199.7228, "eval_samples_per_second": 317.82, "eval_steps_per_second": 1.242, "step": 1169000 }, { "epoch": 4.762656471352668, "grad_norm": 8.697935104370117, "learning_rate": 0.000871088385229681, "loss": 7.5533, "step": 1169100 }, { "epoch": 4.763063849376049, "grad_norm": 8.470230102539062, "learning_rate": 0.0008707135011253334, "loss": 7.5727, "step": 1169200 }, { "epoch": 4.763471227399431, "grad_norm": 10.522871971130371, "learning_rate": 0.0008703386816330725, "loss": 7.5232, "step": 1169300 }, { "epoch": 4.763878605422812, "grad_norm": 6.342298984527588, "learning_rate": 0.0008699639267677165, "loss": 7.5681, "step": 1169400 }, { "epoch": 4.764285983446194, "grad_norm": 9.163177490234375, "learning_rate": 0.0008695892365440817, "loss": 7.5368, "step": 1169500 }, { "epoch": 4.7646933614695754, "grad_norm": 5.029087543487549, "learning_rate": 0.0008692146109769804, "loss": 7.5465, "step": 1169600 }, { "epoch": 4.765100739492957, "grad_norm": 7.584436893463135, "learning_rate": 0.0008688400500812244, "loss": 7.5663, "step": 1169700 }, { "epoch": 4.7655081175163385, "grad_norm": 5.353863716125488, "learning_rate": 0.000868465553871622, "loss": 7.5442, "step": 1169800 }, { "epoch": 4.76591549553972, "grad_norm": 14.737866401672363, "learning_rate": 0.0008680911223629781, "loss": 7.5757, "step": 1169900 }, { "epoch": 4.766322873563102, "grad_norm": 4.93510627746582, "learning_rate": 0.0008677167555700973, "loss": 7.573, "step": 1170000 }, { "epoch": 4.766322873563102, "eval_MaskedAccuracy": 0.5113181159684853, "eval_loss": 1.595564842224121, "eval_runtime": 162.6779, "eval_samples_per_second": 390.194, "eval_steps_per_second": 1.524, "step": 1170000 }, { "epoch": 4.766730251586483, "grad_norm": 14.309826850891113, "learning_rate": 0.0008673424535077804, "loss": 7.5814, "step": 1170100 }, { "epoch": 4.767137629609865, "grad_norm": 3.418689727783203, "learning_rate": 0.0008669682161908257, "loss": 7.5516, "step": 1170200 }, { "epoch": 4.767545007633245, "grad_norm": 16.931493759155273, "learning_rate": 0.000866594043634028, "loss": 7.5483, "step": 1170300 }, { "epoch": 4.767952385656627, "grad_norm": 3.264801263809204, "learning_rate": 0.0008662199358521804, "loss": 7.5704, "step": 1170400 }, { "epoch": 4.768359763680008, "grad_norm": 11.889373779296875, "learning_rate": 0.0008658458928600735, "loss": 7.5637, "step": 1170500 }, { "epoch": 4.76876714170339, "grad_norm": 5.08151388168335, "learning_rate": 0.0008654719146724953, "loss": 7.5387, "step": 1170600 }, { "epoch": 4.769174519726771, "grad_norm": 8.597216606140137, "learning_rate": 0.0008650980013042308, "loss": 7.573, "step": 1170700 }, { "epoch": 4.769581897750153, "grad_norm": 4.913052558898926, "learning_rate": 0.0008647241527700637, "loss": 7.5568, "step": 1170800 }, { "epoch": 4.7699892757735345, "grad_norm": 20.04241180419922, "learning_rate": 0.0008643503690847726, "loss": 7.5769, "step": 1170900 }, { "epoch": 4.770396653796916, "grad_norm": 19.256046295166016, "learning_rate": 0.000863976650263137, "loss": 7.5777, "step": 1171000 }, { "epoch": 4.770396653796916, "eval_MaskedAccuracy": 0.5114205840526601, "eval_loss": 1.5949627161026, "eval_runtime": 160.7449, "eval_samples_per_second": 394.887, "eval_steps_per_second": 1.543, "step": 1171000 }, { "epoch": 4.7708040318202976, "grad_norm": 6.5651960372924805, "learning_rate": 0.0008636029963199313, "loss": 7.5617, "step": 1171100 }, { "epoch": 4.771211409843679, "grad_norm": 4.102616786956787, "learning_rate": 0.0008632294072699271, "loss": 7.5564, "step": 1171200 }, { "epoch": 4.771618787867061, "grad_norm": 9.836995124816895, "learning_rate": 0.0008628558831278951, "loss": 7.5346, "step": 1171300 }, { "epoch": 4.772026165890442, "grad_norm": 6.117199897766113, "learning_rate": 0.0008624824239086028, "loss": 7.5615, "step": 1171400 }, { "epoch": 4.772433543913824, "grad_norm": 8.702661514282227, "learning_rate": 0.0008621090296268162, "loss": 7.5579, "step": 1171500 }, { "epoch": 4.772840921937204, "grad_norm": 8.160992622375488, "learning_rate": 0.0008617357002972964, "loss": 7.5548, "step": 1171600 }, { "epoch": 4.773248299960586, "grad_norm": 4.512524604797363, "learning_rate": 0.0008613624359348026, "loss": 7.5602, "step": 1171700 }, { "epoch": 4.773655677983967, "grad_norm": 6.5624895095825195, "learning_rate": 0.0008609892365540935, "loss": 7.567, "step": 1171800 }, { "epoch": 4.774063056007349, "grad_norm": 7.824966907501221, "learning_rate": 0.0008606161021699222, "loss": 7.533, "step": 1171900 }, { "epoch": 4.7744704340307305, "grad_norm": 8.146249771118164, "learning_rate": 0.0008602430327970409, "loss": 7.5275, "step": 1172000 }, { "epoch": 4.7744704340307305, "eval_MaskedAccuracy": 0.511129574165486, "eval_loss": 1.586852788925171, "eval_runtime": 173.3589, "eval_samples_per_second": 366.154, "eval_steps_per_second": 1.431, "step": 1172000 }, { "epoch": 4.774877812054112, "grad_norm": 9.132033348083496, "learning_rate": 0.0008598700284501994, "loss": 7.5793, "step": 1172100 }, { "epoch": 4.7752851900774935, "grad_norm": 3.6502768993377686, "learning_rate": 0.0008594970891441445, "loss": 7.5517, "step": 1172200 }, { "epoch": 4.775692568100875, "grad_norm": 4.238583564758301, "learning_rate": 0.0008591242148936199, "loss": 7.5552, "step": 1172300 }, { "epoch": 4.776099946124257, "grad_norm": 3.451655626296997, "learning_rate": 0.0008587514057133689, "loss": 7.5518, "step": 1172400 }, { "epoch": 4.776507324147638, "grad_norm": 3.534184694290161, "learning_rate": 0.000858378661618129, "loss": 7.5849, "step": 1172500 }, { "epoch": 4.77691470217102, "grad_norm": 15.55631160736084, "learning_rate": 0.0008580059826226376, "loss": 7.5186, "step": 1172600 }, { "epoch": 4.7773220801944, "grad_norm": 6.297276496887207, "learning_rate": 0.0008576333687416277, "loss": 7.5566, "step": 1172700 }, { "epoch": 4.777729458217783, "grad_norm": 12.850409507751465, "learning_rate": 0.000857260819989832, "loss": 7.5354, "step": 1172800 }, { "epoch": 4.778136836241163, "grad_norm": 6.520290374755859, "learning_rate": 0.0008568883363819779, "loss": 7.5494, "step": 1172900 }, { "epoch": 4.778544214264545, "grad_norm": 4.348905086517334, "learning_rate": 0.0008565159179327939, "loss": 7.5546, "step": 1173000 }, { "epoch": 4.778544214264545, "eval_MaskedAccuracy": 0.5115314028956122, "eval_loss": 1.5912624597549438, "eval_runtime": 185.6518, "eval_samples_per_second": 341.909, "eval_steps_per_second": 1.336, "step": 1173000 }, { "epoch": 4.778951592287926, "grad_norm": 5.397449016571045, "learning_rate": 0.0008561435646570022, "loss": 7.5621, "step": 1173100 }, { "epoch": 4.779358970311308, "grad_norm": 3.4884116649627686, "learning_rate": 0.0008557712765693242, "loss": 7.5594, "step": 1173200 }, { "epoch": 4.7797663483346895, "grad_norm": 8.312586784362793, "learning_rate": 0.0008553990536844777, "loss": 7.5803, "step": 1173300 }, { "epoch": 4.780173726358071, "grad_norm": 4.167957305908203, "learning_rate": 0.0008550268960171794, "loss": 7.584, "step": 1173400 }, { "epoch": 4.780581104381453, "grad_norm": 5.039919376373291, "learning_rate": 0.0008546548035821431, "loss": 7.5604, "step": 1173500 }, { "epoch": 4.780988482404834, "grad_norm": 10.528605461120605, "learning_rate": 0.0008542827763940794, "loss": 7.5642, "step": 1173600 }, { "epoch": 4.781395860428216, "grad_norm": 16.156702041625977, "learning_rate": 0.0008539108144676959, "loss": 7.5513, "step": 1173700 }, { "epoch": 4.781803238451597, "grad_norm": 5.934739589691162, "learning_rate": 0.0008535389178176985, "loss": 7.5693, "step": 1173800 }, { "epoch": 4.782210616474979, "grad_norm": 6.528579235076904, "learning_rate": 0.000853167086458789, "loss": 7.5572, "step": 1173900 }, { "epoch": 4.782617994498359, "grad_norm": 7.340646266937256, "learning_rate": 0.0008527953204056709, "loss": 7.5583, "step": 1174000 }, { "epoch": 4.782617994498359, "eval_MaskedAccuracy": 0.5111816050317756, "eval_loss": 1.5916887521743774, "eval_runtime": 157.6226, "eval_samples_per_second": 402.709, "eval_steps_per_second": 1.573, "step": 1174000 }, { "epoch": 4.783025372521741, "grad_norm": 5.891901969909668, "learning_rate": 0.0008524236196730395, "loss": 7.5695, "step": 1174100 }, { "epoch": 4.783432750545122, "grad_norm": 7.876992702484131, "learning_rate": 0.0008520519842755907, "loss": 7.5495, "step": 1174200 }, { "epoch": 4.783840128568504, "grad_norm": 4.458004951477051, "learning_rate": 0.0008516804142280179, "loss": 7.5643, "step": 1174300 }, { "epoch": 4.7842475065918855, "grad_norm": 9.906769752502441, "learning_rate": 0.0008513089095450113, "loss": 7.5376, "step": 1174400 }, { "epoch": 4.784654884615267, "grad_norm": 5.301359176635742, "learning_rate": 0.0008509374702412575, "loss": 7.5495, "step": 1174500 }, { "epoch": 4.7850622626386485, "grad_norm": 8.92895221710205, "learning_rate": 0.0008505660963314424, "loss": 7.5627, "step": 1174600 }, { "epoch": 4.78546964066203, "grad_norm": 6.8228068351745605, "learning_rate": 0.0008501947878302477, "loss": 7.5624, "step": 1174700 }, { "epoch": 4.785877018685412, "grad_norm": 11.703251838684082, "learning_rate": 0.0008498235447523541, "loss": 7.5339, "step": 1174800 }, { "epoch": 4.786284396708793, "grad_norm": 3.5254666805267334, "learning_rate": 0.0008494523671124373, "loss": 7.5439, "step": 1174900 }, { "epoch": 4.786691774732175, "grad_norm": 12.059759140014648, "learning_rate": 0.000849081254925174, "loss": 7.5601, "step": 1175000 }, { "epoch": 4.786691774732175, "eval_MaskedAccuracy": 0.5108074817477763, "eval_loss": 1.5949333906173706, "eval_runtime": 159.7461, "eval_samples_per_second": 397.356, "eval_steps_per_second": 1.552, "step": 1175000 }, { "epoch": 4.787099152755556, "grad_norm": 7.086734294891357, "learning_rate": 0.0008487102082052353, "loss": 7.5566, "step": 1175100 }, { "epoch": 4.787506530778938, "grad_norm": 7.3957672119140625, "learning_rate": 0.00084833922696729, "loss": 7.5542, "step": 1175200 }, { "epoch": 4.787913908802318, "grad_norm": 8.284390449523926, "learning_rate": 0.0008479683112260062, "loss": 7.5498, "step": 1175300 }, { "epoch": 4.7883212868257, "grad_norm": 3.2272274494171143, "learning_rate": 0.0008475974609960468, "loss": 7.5619, "step": 1175400 }, { "epoch": 4.788728664849081, "grad_norm": 3.680732250213623, "learning_rate": 0.0008472266762920747, "loss": 7.5294, "step": 1175500 }, { "epoch": 4.789136042872463, "grad_norm": 4.4533514976501465, "learning_rate": 0.0008468559571287474, "loss": 7.5329, "step": 1175600 }, { "epoch": 4.7895434208958445, "grad_norm": 15.333439826965332, "learning_rate": 0.0008464853035207225, "loss": 7.5385, "step": 1175700 }, { "epoch": 4.789950798919226, "grad_norm": 4.0509772300720215, "learning_rate": 0.000846114715482654, "loss": 7.5505, "step": 1175800 }, { "epoch": 4.790358176942608, "grad_norm": 9.513604164123535, "learning_rate": 0.0008457441930291938, "loss": 7.5768, "step": 1175900 }, { "epoch": 4.790765554965989, "grad_norm": 7.331367015838623, "learning_rate": 0.0008453737361749897, "loss": 7.5372, "step": 1176000 }, { "epoch": 4.790765554965989, "eval_MaskedAccuracy": 0.5116188169980294, "eval_loss": 1.589592695236206, "eval_runtime": 162.4316, "eval_samples_per_second": 390.786, "eval_steps_per_second": 1.527, "step": 1176000 }, { "epoch": 4.791172932989371, "grad_norm": 5.087259769439697, "learning_rate": 0.0008450033449346889, "loss": 7.553, "step": 1176100 }, { "epoch": 4.791580311012752, "grad_norm": 11.546634674072266, "learning_rate": 0.0008446330193229337, "loss": 7.5676, "step": 1176200 }, { "epoch": 4.791987689036134, "grad_norm": 4.462437629699707, "learning_rate": 0.0008442627593543655, "loss": 7.549, "step": 1176300 }, { "epoch": 4.792395067059515, "grad_norm": 5.836212635040283, "learning_rate": 0.000843892565043623, "loss": 7.5656, "step": 1176400 }, { "epoch": 4.792802445082897, "grad_norm": 16.149566650390625, "learning_rate": 0.0008435224364053408, "loss": 7.5743, "step": 1176500 }, { "epoch": 4.793209823106277, "grad_norm": 5.329946994781494, "learning_rate": 0.0008431523734541532, "loss": 7.5559, "step": 1176600 }, { "epoch": 4.793617201129659, "grad_norm": 3.8502087593078613, "learning_rate": 0.0008427823762046909, "loss": 7.5601, "step": 1176700 }, { "epoch": 4.7940245791530405, "grad_norm": 6.400423049926758, "learning_rate": 0.0008424124446715808, "loss": 7.5447, "step": 1176800 }, { "epoch": 4.794431957176422, "grad_norm": 3.2085399627685547, "learning_rate": 0.0008420425788694498, "loss": 7.5418, "step": 1176900 }, { "epoch": 4.7948393351998035, "grad_norm": 3.724039077758789, "learning_rate": 0.000841672778812919, "loss": 7.5298, "step": 1177000 }, { "epoch": 4.7948393351998035, "eval_MaskedAccuracy": 0.5119181182744716, "eval_loss": 1.5904992818832397, "eval_runtime": 159.9055, "eval_samples_per_second": 396.959, "eval_steps_per_second": 1.551, "step": 1177000 }, { "epoch": 4.795246713223185, "grad_norm": 4.9031782150268555, "learning_rate": 0.0008413030445166099, "loss": 7.5699, "step": 1177100 }, { "epoch": 4.795654091246567, "grad_norm": 10.650490760803223, "learning_rate": 0.0008409333759951393, "loss": 7.546, "step": 1177200 }, { "epoch": 4.796061469269948, "grad_norm": 7.704545497894287, "learning_rate": 0.0008405637732631219, "loss": 7.5634, "step": 1177300 }, { "epoch": 4.79646884729333, "grad_norm": 13.6079740524292, "learning_rate": 0.0008401942363351713, "loss": 7.5523, "step": 1177400 }, { "epoch": 4.796876225316711, "grad_norm": 7.489874362945557, "learning_rate": 0.0008398247652258953, "loss": 7.5635, "step": 1177500 }, { "epoch": 4.797283603340093, "grad_norm": 8.139029502868652, "learning_rate": 0.0008394553599499032, "loss": 7.5421, "step": 1177600 }, { "epoch": 4.797690981363473, "grad_norm": 3.4920918941497803, "learning_rate": 0.0008390860205217983, "loss": 7.5678, "step": 1177700 }, { "epoch": 4.798098359386856, "grad_norm": 9.2100248336792, "learning_rate": 0.0008387167469561832, "loss": 7.5353, "step": 1177800 }, { "epoch": 4.7985057374102364, "grad_norm": 4.074262619018555, "learning_rate": 0.0008383475392676562, "loss": 7.5206, "step": 1177900 }, { "epoch": 4.798913115433618, "grad_norm": 6.044760227203369, "learning_rate": 0.0008379783974708151, "loss": 7.5689, "step": 1178000 }, { "epoch": 4.798913115433618, "eval_MaskedAccuracy": 0.5115592092361615, "eval_loss": 1.5906074047088623, "eval_runtime": 169.0923, "eval_samples_per_second": 375.393, "eval_steps_per_second": 1.467, "step": 1178000 }, { "epoch": 4.7993204934569995, "grad_norm": 5.4311113357543945, "learning_rate": 0.0008376093215802557, "loss": 7.5713, "step": 1178100 }, { "epoch": 4.799727871480381, "grad_norm": 13.861988067626953, "learning_rate": 0.0008372403116105657, "loss": 7.5553, "step": 1178200 }, { "epoch": 4.800135249503763, "grad_norm": 12.188407897949219, "learning_rate": 0.000836871367576337, "loss": 7.5857, "step": 1178300 }, { "epoch": 4.800542627527144, "grad_norm": 8.346395492553711, "learning_rate": 0.0008365024894921546, "loss": 7.5611, "step": 1178400 }, { "epoch": 4.800950005550526, "grad_norm": 10.427240371704102, "learning_rate": 0.0008361336773726031, "loss": 7.5669, "step": 1178500 }, { "epoch": 4.801357383573907, "grad_norm": 7.915243625640869, "learning_rate": 0.0008357649312322624, "loss": 7.5343, "step": 1178600 }, { "epoch": 4.801764761597289, "grad_norm": 15.282696723937988, "learning_rate": 0.0008353962510857115, "loss": 7.5569, "step": 1178700 }, { "epoch": 4.80217213962067, "grad_norm": 4.565001964569092, "learning_rate": 0.000835027636947527, "loss": 7.562, "step": 1178800 }, { "epoch": 4.802579517644052, "grad_norm": 5.097818851470947, "learning_rate": 0.0008346590888322795, "loss": 7.5534, "step": 1178900 }, { "epoch": 4.802986895667432, "grad_norm": 5.627564430236816, "learning_rate": 0.0008342906067545435, "loss": 7.5384, "step": 1179000 }, { "epoch": 4.802986895667432, "eval_MaskedAccuracy": 0.5117527316966278, "eval_loss": 1.585954189300537, "eval_runtime": 172.4997, "eval_samples_per_second": 367.977, "eval_steps_per_second": 1.438, "step": 1179000 }, { "epoch": 4.803394273690814, "grad_norm": 16.744068145751953, "learning_rate": 0.000833922190728886, "loss": 7.5589, "step": 1179100 }, { "epoch": 4.8038016517141955, "grad_norm": 9.47870922088623, "learning_rate": 0.0008335538407698721, "loss": 7.5891, "step": 1179200 }, { "epoch": 4.804209029737577, "grad_norm": 7.305385112762451, "learning_rate": 0.0008331855568920641, "loss": 7.5582, "step": 1179300 }, { "epoch": 4.8046164077609586, "grad_norm": 10.57204532623291, "learning_rate": 0.0008328173391100221, "loss": 7.559, "step": 1179400 }, { "epoch": 4.80502378578434, "grad_norm": 9.74145793914795, "learning_rate": 0.0008324491874383042, "loss": 7.5358, "step": 1179500 }, { "epoch": 4.805431163807722, "grad_norm": 11.683330535888672, "learning_rate": 0.0008320811018914656, "loss": 7.5596, "step": 1179600 }, { "epoch": 4.805838541831103, "grad_norm": 4.935939311981201, "learning_rate": 0.0008317130824840579, "loss": 7.5346, "step": 1179700 }, { "epoch": 4.806245919854485, "grad_norm": 9.072016716003418, "learning_rate": 0.0008313451292306317, "loss": 7.5421, "step": 1179800 }, { "epoch": 4.806653297877866, "grad_norm": 6.465701580047607, "learning_rate": 0.0008309772421457329, "loss": 7.5538, "step": 1179900 }, { "epoch": 4.807060675901248, "grad_norm": 8.924321174621582, "learning_rate": 0.0008306094212439082, "loss": 7.576, "step": 1180000 }, { "epoch": 4.807060675901248, "eval_MaskedAccuracy": 0.5117493672356868, "eval_loss": 1.5885009765625, "eval_runtime": 169.4175, "eval_samples_per_second": 374.672, "eval_steps_per_second": 1.464, "step": 1180000 }, { "epoch": 4.807468053924629, "grad_norm": 11.632453918457031, "learning_rate": 0.0008302416665396979, "loss": 7.5657, "step": 1180100 }, { "epoch": 4.807875431948011, "grad_norm": 5.263997554779053, "learning_rate": 0.0008298739780476414, "loss": 7.5603, "step": 1180200 }, { "epoch": 4.8082828099713915, "grad_norm": 12.246635437011719, "learning_rate": 0.0008295063557822768, "loss": 7.5642, "step": 1180300 }, { "epoch": 4.808690187994773, "grad_norm": 7.712202072143555, "learning_rate": 0.0008291387997581372, "loss": 7.5582, "step": 1180400 }, { "epoch": 4.8090975660181545, "grad_norm": 7.902210712432861, "learning_rate": 0.0008287713099897533, "loss": 7.5842, "step": 1180500 }, { "epoch": 4.809504944041536, "grad_norm": 10.562908172607422, "learning_rate": 0.0008284038864916547, "loss": 7.5277, "step": 1180600 }, { "epoch": 4.809912322064918, "grad_norm": 7.974733829498291, "learning_rate": 0.0008280365292783681, "loss": 7.5559, "step": 1180700 }, { "epoch": 4.810319700088299, "grad_norm": 6.211431980133057, "learning_rate": 0.0008276692383644154, "loss": 7.512, "step": 1180800 }, { "epoch": 4.810727078111681, "grad_norm": 9.059982299804688, "learning_rate": 0.00082730201376432, "loss": 7.5429, "step": 1180900 }, { "epoch": 4.811134456135062, "grad_norm": 3.615178346633911, "learning_rate": 0.0008269348554925981, "loss": 7.5495, "step": 1181000 }, { "epoch": 4.811134456135062, "eval_MaskedAccuracy": 0.51130747334969, "eval_loss": 1.5924112796783447, "eval_runtime": 167.4774, "eval_samples_per_second": 379.012, "eval_steps_per_second": 1.481, "step": 1181000 }, { "epoch": 4.811541834158444, "grad_norm": 3.9623985290527344, "learning_rate": 0.0008265677635637671, "loss": 7.5728, "step": 1181100 }, { "epoch": 4.811949212181825, "grad_norm": 5.541474342346191, "learning_rate": 0.0008262007379923382, "loss": 7.5579, "step": 1181200 }, { "epoch": 4.812356590205207, "grad_norm": 12.991304397583008, "learning_rate": 0.0008258337787928237, "loss": 7.5556, "step": 1181300 }, { "epoch": 4.812763968228588, "grad_norm": 9.231416702270508, "learning_rate": 0.0008254668859797306, "loss": 7.5713, "step": 1181400 }, { "epoch": 4.81317134625197, "grad_norm": 12.248048782348633, "learning_rate": 0.0008251000595675641, "loss": 7.5807, "step": 1181500 }, { "epoch": 4.8135787242753505, "grad_norm": 4.622974395751953, "learning_rate": 0.0008247332995708265, "loss": 7.572, "step": 1181600 }, { "epoch": 4.813986102298732, "grad_norm": 16.859704971313477, "learning_rate": 0.0008243666060040183, "loss": 7.5727, "step": 1181700 }, { "epoch": 4.814393480322114, "grad_norm": 6.884955406188965, "learning_rate": 0.0008239999788816374, "loss": 7.5525, "step": 1181800 }, { "epoch": 4.814800858345495, "grad_norm": 5.5208234786987305, "learning_rate": 0.0008236334182181777, "loss": 7.5362, "step": 1181900 }, { "epoch": 4.815208236368877, "grad_norm": 6.944272518157959, "learning_rate": 0.0008232669240281317, "loss": 7.5922, "step": 1182000 }, { "epoch": 4.815208236368877, "eval_MaskedAccuracy": 0.5108697522849652, "eval_loss": 1.6041796207427979, "eval_runtime": 161.7268, "eval_samples_per_second": 392.489, "eval_steps_per_second": 1.533, "step": 1182000 }, { "epoch": 4.815615614392258, "grad_norm": 4.707577228546143, "learning_rate": 0.0008229004963259892, "loss": 7.5376, "step": 1182100 }, { "epoch": 4.81602299241564, "grad_norm": 12.653528213500977, "learning_rate": 0.0008225341351262367, "loss": 7.5649, "step": 1182200 }, { "epoch": 4.816430370439021, "grad_norm": 5.855892658233643, "learning_rate": 0.0008221678404433578, "loss": 7.5719, "step": 1182300 }, { "epoch": 4.816837748462403, "grad_norm": 15.072898864746094, "learning_rate": 0.0008218016122918338, "loss": 7.5101, "step": 1182400 }, { "epoch": 4.817245126485784, "grad_norm": 6.575567722320557, "learning_rate": 0.0008214354506861457, "loss": 7.5501, "step": 1182500 }, { "epoch": 4.817652504509166, "grad_norm": 3.4672160148620605, "learning_rate": 0.0008210693556407684, "loss": 7.5534, "step": 1182600 }, { "epoch": 4.8180598825325465, "grad_norm": 10.773113250732422, "learning_rate": 0.0008207033271701755, "loss": 7.5582, "step": 1182700 }, { "epoch": 4.818467260555929, "grad_norm": 9.565873146057129, "learning_rate": 0.0008203373652888373, "loss": 7.5465, "step": 1182800 }, { "epoch": 4.8188746385793095, "grad_norm": 7.289404392242432, "learning_rate": 0.0008199714700112242, "loss": 7.5607, "step": 1182900 }, { "epoch": 4.819282016602691, "grad_norm": 3.710710287094116, "learning_rate": 0.0008196056413518006, "loss": 7.5357, "step": 1183000 }, { "epoch": 4.819282016602691, "eval_MaskedAccuracy": 0.5118950024515713, "eval_loss": 1.590827226638794, "eval_runtime": 168.9435, "eval_samples_per_second": 375.723, "eval_steps_per_second": 1.468, "step": 1183000 }, { "epoch": 4.819689394626073, "grad_norm": 3.4109690189361572, "learning_rate": 0.0008192398793250301, "loss": 7.5449, "step": 1183100 }, { "epoch": 4.820096772649454, "grad_norm": 29.629152297973633, "learning_rate": 0.0008188741839453739, "loss": 7.5535, "step": 1183200 }, { "epoch": 4.820504150672836, "grad_norm": 10.11011791229248, "learning_rate": 0.0008185085552272896, "loss": 7.4998, "step": 1183300 }, { "epoch": 4.820911528696217, "grad_norm": 4.354499816894531, "learning_rate": 0.0008181429931852319, "loss": 7.542, "step": 1183400 }, { "epoch": 4.821318906719599, "grad_norm": 6.408851623535156, "learning_rate": 0.0008177774978336531, "loss": 7.5473, "step": 1183500 }, { "epoch": 4.82172628474298, "grad_norm": 16.92336654663086, "learning_rate": 0.0008174120691870037, "loss": 7.5295, "step": 1183600 }, { "epoch": 4.822133662766362, "grad_norm": 16.88974952697754, "learning_rate": 0.000817046707259731, "loss": 7.5474, "step": 1183700 }, { "epoch": 4.822541040789743, "grad_norm": 7.427070617675781, "learning_rate": 0.0008166814120662792, "loss": 7.5787, "step": 1183800 }, { "epoch": 4.822948418813125, "grad_norm": 11.39084243774414, "learning_rate": 0.0008163161836210915, "loss": 7.5496, "step": 1183900 }, { "epoch": 4.8233557968365055, "grad_norm": 6.5332441329956055, "learning_rate": 0.0008159510219386075, "loss": 7.5599, "step": 1184000 }, { "epoch": 4.8233557968365055, "eval_MaskedAccuracy": 0.5120332026114306, "eval_loss": 1.5897085666656494, "eval_runtime": 164.398, "eval_samples_per_second": 386.112, "eval_steps_per_second": 1.509, "step": 1184000 }, { "epoch": 4.823763174859887, "grad_norm": 7.744347095489502, "learning_rate": 0.0008155859270332627, "loss": 7.5517, "step": 1184100 }, { "epoch": 4.824170552883269, "grad_norm": 10.261436462402344, "learning_rate": 0.0008152208989194916, "loss": 7.5532, "step": 1184200 }, { "epoch": 4.82457793090665, "grad_norm": 5.4755754470825195, "learning_rate": 0.0008148559376117268, "loss": 7.5245, "step": 1184300 }, { "epoch": 4.824985308930032, "grad_norm": 17.02247428894043, "learning_rate": 0.000814491043124396, "loss": 7.5803, "step": 1184400 }, { "epoch": 4.825392686953413, "grad_norm": 9.922220230102539, "learning_rate": 0.0008141262154719258, "loss": 7.5631, "step": 1184500 }, { "epoch": 4.825800064976795, "grad_norm": 5.512035369873047, "learning_rate": 0.0008137614546687394, "loss": 7.5538, "step": 1184600 }, { "epoch": 4.826207443000176, "grad_norm": 6.123693943023682, "learning_rate": 0.0008133967607292579, "loss": 7.5373, "step": 1184700 }, { "epoch": 4.826614821023558, "grad_norm": 10.376054763793945, "learning_rate": 0.0008130321336679002, "loss": 7.5408, "step": 1184800 }, { "epoch": 4.827022199046939, "grad_norm": 7.109657287597656, "learning_rate": 0.000812667573499081, "loss": 7.5449, "step": 1184900 }, { "epoch": 4.827429577070321, "grad_norm": 13.123220443725586, "learning_rate": 0.0008123030802372152, "loss": 7.5583, "step": 1185000 }, { "epoch": 4.827429577070321, "eval_MaskedAccuracy": 0.5111163088078949, "eval_loss": 1.6001918315887451, "eval_runtime": 158.7699, "eval_samples_per_second": 399.799, "eval_steps_per_second": 1.562, "step": 1185000 }, { "epoch": 4.827836955093702, "grad_norm": 5.369037628173828, "learning_rate": 0.0008119386538967105, "loss": 7.5462, "step": 1185100 }, { "epoch": 4.828244333117084, "grad_norm": 8.000434875488281, "learning_rate": 0.0008115742944919748, "loss": 7.5668, "step": 1185200 }, { "epoch": 4.8286517111404645, "grad_norm": 10.486366271972656, "learning_rate": 0.0008112100020374161, "loss": 7.5508, "step": 1185300 }, { "epoch": 4.829059089163846, "grad_norm": 20.98023223876953, "learning_rate": 0.000810845776547435, "loss": 7.5701, "step": 1185400 }, { "epoch": 4.829466467187228, "grad_norm": 7.4165730476379395, "learning_rate": 0.0008104816180364309, "loss": 7.5397, "step": 1185500 }, { "epoch": 4.829873845210609, "grad_norm": 8.169611930847168, "learning_rate": 0.0008101175265188017, "loss": 7.5534, "step": 1185600 }, { "epoch": 4.830281223233991, "grad_norm": 6.466330051422119, "learning_rate": 0.000809753502008942, "loss": 7.5308, "step": 1185700 }, { "epoch": 4.830688601257372, "grad_norm": 8.122366905212402, "learning_rate": 0.0008093895445212426, "loss": 7.549, "step": 1185800 }, { "epoch": 4.831095979280754, "grad_norm": 3.603215456008911, "learning_rate": 0.000809025654070093, "loss": 7.538, "step": 1185900 }, { "epoch": 4.831503357304135, "grad_norm": 5.026655673980713, "learning_rate": 0.0008086618306698808, "loss": 7.5431, "step": 1186000 }, { "epoch": 4.831503357304135, "eval_MaskedAccuracy": 0.5110371926782977, "eval_loss": 1.5962096452713013, "eval_runtime": 173.0103, "eval_samples_per_second": 366.891, "eval_steps_per_second": 1.433, "step": 1186000 }, { "epoch": 4.831910735327517, "grad_norm": 4.054526329040527, "learning_rate": 0.0008082980743349883, "loss": 7.5448, "step": 1186100 }, { "epoch": 4.832318113350898, "grad_norm": 7.328049182891846, "learning_rate": 0.0008079343850797972, "loss": 7.5691, "step": 1186200 }, { "epoch": 4.83272549137428, "grad_norm": 5.259098529815674, "learning_rate": 0.0008075707629186859, "loss": 7.5568, "step": 1186300 }, { "epoch": 4.833132869397661, "grad_norm": 9.088693618774414, "learning_rate": 0.0008072072078660324, "loss": 7.5279, "step": 1186400 }, { "epoch": 4.833540247421043, "grad_norm": 8.539562225341797, "learning_rate": 0.0008068437199362077, "loss": 7.5686, "step": 1186500 }, { "epoch": 4.833947625444424, "grad_norm": 18.32861328125, "learning_rate": 0.0008064802991435834, "loss": 7.5616, "step": 1186600 }, { "epoch": 4.834355003467805, "grad_norm": 9.15188217163086, "learning_rate": 0.0008061169455025269, "loss": 7.557, "step": 1186700 }, { "epoch": 4.834762381491187, "grad_norm": 7.382136344909668, "learning_rate": 0.0008057536590274034, "loss": 7.5468, "step": 1186800 }, { "epoch": 4.835169759514568, "grad_norm": 16.520309448242188, "learning_rate": 0.0008053904397325771, "loss": 7.58, "step": 1186900 }, { "epoch": 4.83557713753795, "grad_norm": 12.390761375427246, "learning_rate": 0.0008050272876324063, "loss": 7.5283, "step": 1187000 }, { "epoch": 4.83557713753795, "eval_MaskedAccuracy": 0.5117377525810891, "eval_loss": 1.5956392288208008, "eval_runtime": 171.3553, "eval_samples_per_second": 370.435, "eval_steps_per_second": 1.447, "step": 1187000 }, { "epoch": 4.835984515561331, "grad_norm": 20.251081466674805, "learning_rate": 0.0008046642027412489, "loss": 7.5667, "step": 1187100 }, { "epoch": 4.836391893584713, "grad_norm": 5.811011791229248, "learning_rate": 0.0008043011850734598, "loss": 7.5397, "step": 1187200 }, { "epoch": 4.836799271608094, "grad_norm": 6.2515339851379395, "learning_rate": 0.0008039382346433905, "loss": 7.5672, "step": 1187300 }, { "epoch": 4.837206649631476, "grad_norm": 12.136992454528809, "learning_rate": 0.0008035753514653918, "loss": 7.517, "step": 1187400 }, { "epoch": 4.837614027654857, "grad_norm": 7.822018146514893, "learning_rate": 0.0008032125355538078, "loss": 7.5217, "step": 1187500 }, { "epoch": 4.838021405678239, "grad_norm": 9.882133483886719, "learning_rate": 0.0008028497869229848, "loss": 7.5685, "step": 1187600 }, { "epoch": 4.8384287837016196, "grad_norm": 8.234831809997559, "learning_rate": 0.0008024871055872637, "loss": 7.5238, "step": 1187700 }, { "epoch": 4.838836161725002, "grad_norm": 6.588048934936523, "learning_rate": 0.000802124491560982, "loss": 7.5805, "step": 1187800 }, { "epoch": 4.839243539748383, "grad_norm": 4.4182353019714355, "learning_rate": 0.000801761944858477, "loss": 7.5356, "step": 1187900 }, { "epoch": 4.839650917771764, "grad_norm": 10.374272346496582, "learning_rate": 0.0008013994654940826, "loss": 7.5774, "step": 1188000 }, { "epoch": 4.839650917771764, "eval_MaskedAccuracy": 0.5114369197100814, "eval_loss": 1.5909814834594727, "eval_runtime": 165.4066, "eval_samples_per_second": 383.757, "eval_steps_per_second": 1.499, "step": 1188000 }, { "epoch": 4.840058295795146, "grad_norm": 17.36933135986328, "learning_rate": 0.0008010370534821281, "loss": 7.5422, "step": 1188100 }, { "epoch": 4.840465673818527, "grad_norm": 23.012807846069336, "learning_rate": 0.0008006747088369422, "loss": 7.5512, "step": 1188200 }, { "epoch": 4.840873051841909, "grad_norm": 11.495627403259277, "learning_rate": 0.0008003124315728504, "loss": 7.5815, "step": 1188300 }, { "epoch": 4.84128042986529, "grad_norm": 9.812355041503906, "learning_rate": 0.0007999502217041759, "loss": 7.5426, "step": 1188400 }, { "epoch": 4.841687807888672, "grad_norm": 8.009356498718262, "learning_rate": 0.0007995880792452387, "loss": 7.5432, "step": 1188500 }, { "epoch": 4.842095185912053, "grad_norm": 18.623714447021484, "learning_rate": 0.0007992260042103557, "loss": 7.5557, "step": 1188600 }, { "epoch": 4.842502563935435, "grad_norm": 3.701794385910034, "learning_rate": 0.000798863996613842, "loss": 7.5489, "step": 1188700 }, { "epoch": 4.842909941958816, "grad_norm": 5.531005859375, "learning_rate": 0.0007985020564700094, "loss": 7.563, "step": 1188800 }, { "epoch": 4.843317319982198, "grad_norm": 4.462012767791748, "learning_rate": 0.0007981401837931671, "loss": 7.5386, "step": 1188900 }, { "epoch": 4.843724698005579, "grad_norm": 5.2527875900268555, "learning_rate": 0.000797778378597623, "loss": 7.5612, "step": 1189000 }, { "epoch": 4.843724698005579, "eval_MaskedAccuracy": 0.5116886053689556, "eval_loss": 1.5937349796295166, "eval_runtime": 163.6586, "eval_samples_per_second": 387.856, "eval_steps_per_second": 1.515, "step": 1189000 }, { "epoch": 4.84413207602896, "grad_norm": 7.725368976593018, "learning_rate": 0.0007974166408976809, "loss": 7.5494, "step": 1189100 }, { "epoch": 4.844539454052342, "grad_norm": 13.587323188781738, "learning_rate": 0.0007970549707076406, "loss": 7.5619, "step": 1189200 }, { "epoch": 4.844946832075723, "grad_norm": 8.40877914428711, "learning_rate": 0.0007966933680418032, "loss": 7.5109, "step": 1189300 }, { "epoch": 4.845354210099105, "grad_norm": 18.549039840698242, "learning_rate": 0.0007963318329144626, "loss": 7.5597, "step": 1189400 }, { "epoch": 4.845761588122486, "grad_norm": 13.995420455932617, "learning_rate": 0.0007959703653399143, "loss": 7.5592, "step": 1189500 }, { "epoch": 4.846168966145868, "grad_norm": 4.556239604949951, "learning_rate": 0.0007956089653324473, "loss": 7.5232, "step": 1189600 }, { "epoch": 4.846576344169249, "grad_norm": 3.827169179916382, "learning_rate": 0.00079524763290635, "loss": 7.5391, "step": 1189700 }, { "epoch": 4.846983722192631, "grad_norm": 10.173226356506348, "learning_rate": 0.0007948863680759086, "loss": 7.5328, "step": 1189800 }, { "epoch": 4.847391100216012, "grad_norm": 12.815058708190918, "learning_rate": 0.0007945251708554043, "loss": 7.5205, "step": 1189900 }, { "epoch": 4.847798478239394, "grad_norm": 5.204775333404541, "learning_rate": 0.0007941640412591194, "loss": 7.5524, "step": 1190000 }, { "epoch": 4.847798478239394, "eval_MaskedAccuracy": 0.5115184180846889, "eval_loss": 1.5997650623321533, "eval_runtime": 165.1239, "eval_samples_per_second": 384.414, "eval_steps_per_second": 1.502, "step": 1190000 }, { "epoch": 4.8482058562627754, "grad_norm": 16.823041915893555, "learning_rate": 0.0007938029793013307, "loss": 7.5503, "step": 1190100 }, { "epoch": 4.848613234286157, "grad_norm": 4.9770026206970215, "learning_rate": 0.0007934419849963117, "loss": 7.5683, "step": 1190200 }, { "epoch": 4.849020612309538, "grad_norm": 5.374384880065918, "learning_rate": 0.0007930810583583345, "loss": 7.5581, "step": 1190300 }, { "epoch": 4.849427990332919, "grad_norm": 13.473366737365723, "learning_rate": 0.0007927201994016704, "loss": 7.5634, "step": 1190400 }, { "epoch": 4.849835368356301, "grad_norm": 3.5089282989501953, "learning_rate": 0.0007923594081405834, "loss": 7.5612, "step": 1190500 }, { "epoch": 4.850242746379682, "grad_norm": 3.3681108951568604, "learning_rate": 0.0007919986845893393, "loss": 7.5523, "step": 1190600 }, { "epoch": 4.850650124403064, "grad_norm": 12.124752044677734, "learning_rate": 0.0007916380287621984, "loss": 7.5654, "step": 1190700 }, { "epoch": 4.851057502426445, "grad_norm": 21.249267578125, "learning_rate": 0.0007912774406734202, "loss": 7.5434, "step": 1190800 }, { "epoch": 4.851464880449827, "grad_norm": 6.28021240234375, "learning_rate": 0.0007909169203372618, "loss": 7.5697, "step": 1190900 }, { "epoch": 4.851872258473208, "grad_norm": 7.58031702041626, "learning_rate": 0.000790556467767974, "loss": 7.5487, "step": 1191000 }, { "epoch": 4.851872258473208, "eval_MaskedAccuracy": 0.5123831634801366, "eval_loss": 1.5865329504013062, "eval_runtime": 162.9624, "eval_samples_per_second": 389.513, "eval_steps_per_second": 1.522, "step": 1191000 }, { "epoch": 4.85227963649659, "grad_norm": 17.498788833618164, "learning_rate": 0.0007901960829798082, "loss": 7.52, "step": 1191100 }, { "epoch": 4.852687014519971, "grad_norm": 6.950149059295654, "learning_rate": 0.0007898357659870131, "loss": 7.5632, "step": 1191200 }, { "epoch": 4.853094392543353, "grad_norm": 5.151035308837891, "learning_rate": 0.0007894755168038332, "loss": 7.5828, "step": 1191300 }, { "epoch": 4.8535017705667345, "grad_norm": 4.0569610595703125, "learning_rate": 0.0007891153354445123, "loss": 7.5552, "step": 1191400 }, { "epoch": 4.853909148590116, "grad_norm": 5.584034442901611, "learning_rate": 0.0007887552219232886, "loss": 7.5662, "step": 1191500 }, { "epoch": 4.854316526613497, "grad_norm": 7.946394443511963, "learning_rate": 0.0007883951762543998, "loss": 7.5149, "step": 1191600 }, { "epoch": 4.854723904636878, "grad_norm": 7.690493106842041, "learning_rate": 0.0007880351984520812, "loss": 7.5979, "step": 1191700 }, { "epoch": 4.85513128266026, "grad_norm": 5.265111923217773, "learning_rate": 0.0007876752885305632, "loss": 7.5156, "step": 1191800 }, { "epoch": 4.855538660683641, "grad_norm": 5.356286525726318, "learning_rate": 0.0007873154465040766, "loss": 7.5383, "step": 1191900 }, { "epoch": 4.855946038707023, "grad_norm": 6.729825019836426, "learning_rate": 0.000786955672386848, "loss": 7.541, "step": 1192000 }, { "epoch": 4.855946038707023, "eval_MaskedAccuracy": 0.5114614765556479, "eval_loss": 1.6037479639053345, "eval_runtime": 176.5018, "eval_samples_per_second": 359.634, "eval_steps_per_second": 1.405, "step": 1192000 }, { "epoch": 4.856353416730404, "grad_norm": 16.198883056640625, "learning_rate": 0.0007865959661930994, "loss": 7.5375, "step": 1192100 }, { "epoch": 4.856760794753786, "grad_norm": 5.049928188323975, "learning_rate": 0.0007862363279370542, "loss": 7.5553, "step": 1192200 }, { "epoch": 4.857168172777167, "grad_norm": 12.35220718383789, "learning_rate": 0.0007858767576329285, "loss": 7.5294, "step": 1192300 }, { "epoch": 4.857575550800549, "grad_norm": 16.83810806274414, "learning_rate": 0.0007855172552949396, "loss": 7.541, "step": 1192400 }, { "epoch": 4.8579829288239305, "grad_norm": 12.581002235412598, "learning_rate": 0.0007851578209372996, "loss": 7.5417, "step": 1192500 }, { "epoch": 4.858390306847312, "grad_norm": 4.008618354797363, "learning_rate": 0.000784798454574219, "loss": 7.5609, "step": 1192600 }, { "epoch": 4.858797684870693, "grad_norm": 20.214204788208008, "learning_rate": 0.0007844391562199052, "loss": 7.5383, "step": 1192700 }, { "epoch": 4.859205062894075, "grad_norm": 11.226751327514648, "learning_rate": 0.0007840799258885661, "loss": 7.5476, "step": 1192800 }, { "epoch": 4.859612440917456, "grad_norm": 3.165907144546509, "learning_rate": 0.0007837207635944005, "loss": 7.5658, "step": 1192900 }, { "epoch": 4.860019818940837, "grad_norm": 3.552816867828369, "learning_rate": 0.0007833616693516094, "loss": 7.5542, "step": 1193000 }, { "epoch": 4.860019818940837, "eval_MaskedAccuracy": 0.5119116573491553, "eval_loss": 1.5996180772781372, "eval_runtime": 202.3496, "eval_samples_per_second": 313.695, "eval_steps_per_second": 1.226, "step": 1193000 }, { "epoch": 4.860427196964219, "grad_norm": 7.732130527496338, "learning_rate": 0.000783002643174389, "loss": 7.5853, "step": 1193100 }, { "epoch": 4.8608345749876, "grad_norm": 11.34261703491211, "learning_rate": 0.0007826436850769341, "loss": 7.542, "step": 1193200 }, { "epoch": 4.861241953010982, "grad_norm": 4.273205280303955, "learning_rate": 0.0007822847950734366, "loss": 7.556, "step": 1193300 }, { "epoch": 4.861649331034363, "grad_norm": 5.62993860244751, "learning_rate": 0.0007819259731780844, "loss": 7.5515, "step": 1193400 }, { "epoch": 4.862056709057745, "grad_norm": 11.265458106994629, "learning_rate": 0.0007815672194050635, "loss": 7.5491, "step": 1193500 }, { "epoch": 4.862464087081126, "grad_norm": 4.582005023956299, "learning_rate": 0.0007812085337685592, "loss": 7.5536, "step": 1193600 }, { "epoch": 4.862871465104508, "grad_norm": 8.613621711730957, "learning_rate": 0.0007808499162827504, "loss": 7.5293, "step": 1193700 }, { "epoch": 4.8632788431278895, "grad_norm": 6.918115139007568, "learning_rate": 0.0007804913669618152, "loss": 7.5194, "step": 1193800 }, { "epoch": 4.863686221151271, "grad_norm": 8.471829414367676, "learning_rate": 0.0007801328858199305, "loss": 7.5394, "step": 1193900 }, { "epoch": 4.864093599174652, "grad_norm": 9.388032913208008, "learning_rate": 0.0007797744728712683, "loss": 7.5661, "step": 1194000 }, { "epoch": 4.864093599174652, "eval_MaskedAccuracy": 0.5120076719840594, "eval_loss": 1.5941848754882812, "eval_runtime": 165.3658, "eval_samples_per_second": 383.852, "eval_steps_per_second": 1.5, "step": 1194000 }, { "epoch": 4.864500977198033, "grad_norm": 10.334595680236816, "learning_rate": 0.0007794161281299983, "loss": 7.5442, "step": 1194100 }, { "epoch": 4.864908355221415, "grad_norm": 5.058508396148682, "learning_rate": 0.0007790578516102866, "loss": 7.5621, "step": 1194200 }, { "epoch": 4.865315733244796, "grad_norm": 3.7669878005981445, "learning_rate": 0.0007786996433263002, "loss": 7.5869, "step": 1194300 }, { "epoch": 4.865723111268178, "grad_norm": 8.707598686218262, "learning_rate": 0.0007783415032921997, "loss": 7.5463, "step": 1194400 }, { "epoch": 4.866130489291559, "grad_norm": 12.151708602905273, "learning_rate": 0.0007779834315221447, "loss": 7.5517, "step": 1194500 }, { "epoch": 4.866537867314941, "grad_norm": 10.467202186584473, "learning_rate": 0.0007776254280302912, "loss": 7.5374, "step": 1194600 }, { "epoch": 4.866945245338322, "grad_norm": 7.354379653930664, "learning_rate": 0.0007772674928307932, "loss": 7.5671, "step": 1194700 }, { "epoch": 4.867352623361704, "grad_norm": 25.011062622070312, "learning_rate": 0.0007769096259378013, "loss": 7.548, "step": 1194800 }, { "epoch": 4.8677600013850855, "grad_norm": 5.9411845207214355, "learning_rate": 0.0007765518273654639, "loss": 7.5215, "step": 1194900 }, { "epoch": 4.868167379408467, "grad_norm": 4.374491214752197, "learning_rate": 0.0007761940971279279, "loss": 7.5435, "step": 1195000 }, { "epoch": 4.868167379408467, "eval_MaskedAccuracy": 0.51178161139941, "eval_loss": 1.6017199754714966, "eval_runtime": 167.3714, "eval_samples_per_second": 379.252, "eval_steps_per_second": 1.482, "step": 1195000 }, { "epoch": 4.8685747574318485, "grad_norm": 21.570842742919922, "learning_rate": 0.0007758364352393351, "loss": 7.5455, "step": 1195100 }, { "epoch": 4.86898213545523, "grad_norm": 7.334157466888428, "learning_rate": 0.0007754788417138269, "loss": 7.5505, "step": 1195200 }, { "epoch": 4.869389513478611, "grad_norm": 5.550645351409912, "learning_rate": 0.00077512131656554, "loss": 7.5318, "step": 1195300 }, { "epoch": 4.869796891501992, "grad_norm": 2.8154354095458984, "learning_rate": 0.0007747638598086088, "loss": 7.5582, "step": 1195400 }, { "epoch": 4.870204269525374, "grad_norm": 4.538912773132324, "learning_rate": 0.0007744064714571674, "loss": 7.5342, "step": 1195500 }, { "epoch": 4.870611647548755, "grad_norm": 6.24132776260376, "learning_rate": 0.0007740491515253432, "loss": 7.5108, "step": 1195600 }, { "epoch": 4.871019025572137, "grad_norm": 8.308329582214355, "learning_rate": 0.000773691900027263, "loss": 7.5304, "step": 1195700 }, { "epoch": 4.871426403595518, "grad_norm": 7.756369590759277, "learning_rate": 0.0007733347169770532, "loss": 7.5322, "step": 1195800 }, { "epoch": 4.8718337816189, "grad_norm": 10.87598705291748, "learning_rate": 0.0007729776023888328, "loss": 7.5568, "step": 1195900 }, { "epoch": 4.872241159642281, "grad_norm": 12.378133773803711, "learning_rate": 0.0007726205562767216, "loss": 7.5558, "step": 1196000 }, { "epoch": 4.872241159642281, "eval_MaskedAccuracy": 0.5117830706125536, "eval_loss": 1.602381944656372, "eval_runtime": 164.3121, "eval_samples_per_second": 386.314, "eval_steps_per_second": 1.509, "step": 1196000 }, { "epoch": 4.872648537665663, "grad_norm": 11.273324012756348, "learning_rate": 0.000772263578654835, "loss": 7.5612, "step": 1196100 }, { "epoch": 4.8730559156890445, "grad_norm": 11.511683464050293, "learning_rate": 0.0007719066695372866, "loss": 7.5292, "step": 1196200 }, { "epoch": 4.873463293712426, "grad_norm": 4.152893543243408, "learning_rate": 0.0007715498289381868, "loss": 7.5239, "step": 1196300 }, { "epoch": 4.873870671735808, "grad_norm": 11.901667594909668, "learning_rate": 0.0007711930568716437, "loss": 7.5602, "step": 1196400 }, { "epoch": 4.874278049759189, "grad_norm": 4.4524688720703125, "learning_rate": 0.000770836353351762, "loss": 7.5427, "step": 1196500 }, { "epoch": 4.87468542778257, "grad_norm": 10.268949508666992, "learning_rate": 0.0007704797183926442, "loss": 7.5852, "step": 1196600 }, { "epoch": 4.875092805805951, "grad_norm": 5.763948917388916, "learning_rate": 0.0007701231520083893, "loss": 7.5782, "step": 1196700 }, { "epoch": 4.875500183829333, "grad_norm": 8.539776802062988, "learning_rate": 0.0007697666542130953, "loss": 7.5452, "step": 1196800 }, { "epoch": 4.875907561852714, "grad_norm": 7.586759567260742, "learning_rate": 0.0007694102250208563, "loss": 7.5259, "step": 1196900 }, { "epoch": 4.876314939876096, "grad_norm": 9.62762451171875, "learning_rate": 0.0007690538644457642, "loss": 7.5534, "step": 1197000 }, { "epoch": 4.876314939876096, "eval_MaskedAccuracy": 0.5114081034475745, "eval_loss": 1.598467469215393, "eval_runtime": 173.6027, "eval_samples_per_second": 365.639, "eval_steps_per_second": 1.429, "step": 1197000 }, { "epoch": 4.876722317899477, "grad_norm": 15.170089721679688, "learning_rate": 0.0007686975725019069, "loss": 7.5419, "step": 1197100 }, { "epoch": 4.877129695922859, "grad_norm": 6.346302509307861, "learning_rate": 0.0007683413492033714, "loss": 7.548, "step": 1197200 }, { "epoch": 4.8775370739462405, "grad_norm": 10.324873924255371, "learning_rate": 0.00076798519456424, "loss": 7.5457, "step": 1197300 }, { "epoch": 4.877944451969622, "grad_norm": 9.471755981445312, "learning_rate": 0.0007676291085985937, "loss": 7.5584, "step": 1197400 }, { "epoch": 4.8783518299930035, "grad_norm": 5.598387718200684, "learning_rate": 0.000767273091320512, "loss": 7.5556, "step": 1197500 }, { "epoch": 4.878759208016385, "grad_norm": 8.985330581665039, "learning_rate": 0.0007669171427440681, "loss": 7.537, "step": 1197600 }, { "epoch": 4.879166586039766, "grad_norm": 6.05348014831543, "learning_rate": 0.0007665612628833352, "loss": 7.5667, "step": 1197700 }, { "epoch": 4.879573964063148, "grad_norm": 6.537703990936279, "learning_rate": 0.0007662054517523839, "loss": 7.5642, "step": 1197800 }, { "epoch": 4.879981342086529, "grad_norm": 8.868982315063477, "learning_rate": 0.0007658497093652816, "loss": 7.5475, "step": 1197900 }, { "epoch": 4.88038872010991, "grad_norm": 14.07412338256836, "learning_rate": 0.0007654940357360914, "loss": 7.5654, "step": 1198000 }, { "epoch": 4.88038872010991, "eval_MaskedAccuracy": 0.5122425870288108, "eval_loss": 1.5965555906295776, "eval_runtime": 161.1757, "eval_samples_per_second": 393.831, "eval_steps_per_second": 1.539, "step": 1198000 }, { "epoch": 4.880796098133292, "grad_norm": 12.023533821105957, "learning_rate": 0.0007651384308788752, "loss": 7.5583, "step": 1198100 }, { "epoch": 4.881203476156673, "grad_norm": 10.829926490783691, "learning_rate": 0.0007647828948076922, "loss": 7.5655, "step": 1198200 }, { "epoch": 4.881610854180055, "grad_norm": 6.019172668457031, "learning_rate": 0.0007644274275365985, "loss": 7.5322, "step": 1198300 }, { "epoch": 4.8820182322034364, "grad_norm": 7.629240989685059, "learning_rate": 0.0007640720290796481, "loss": 7.5597, "step": 1198400 }, { "epoch": 4.882425610226818, "grad_norm": 5.705124378204346, "learning_rate": 0.0007637166994508922, "loss": 7.5618, "step": 1198500 }, { "epoch": 4.8828329882501995, "grad_norm": 16.90255355834961, "learning_rate": 0.0007633614386643779, "loss": 7.5585, "step": 1198600 }, { "epoch": 4.883240366273581, "grad_norm": 12.051189422607422, "learning_rate": 0.0007630062467341514, "loss": 7.5594, "step": 1198700 }, { "epoch": 4.883647744296963, "grad_norm": 10.131692886352539, "learning_rate": 0.0007626511236742547, "loss": 7.5269, "step": 1198800 }, { "epoch": 4.884055122320344, "grad_norm": 5.140071392059326, "learning_rate": 0.0007622960694987271, "loss": 7.539, "step": 1198900 }, { "epoch": 4.884462500343725, "grad_norm": 6.4076151847839355, "learning_rate": 0.0007619410842216064, "loss": 7.5137, "step": 1199000 }, { "epoch": 4.884462500343725, "eval_MaskedAccuracy": 0.5120933543858173, "eval_loss": 1.584120512008667, "eval_runtime": 160.8011, "eval_samples_per_second": 394.749, "eval_steps_per_second": 1.542, "step": 1199000 }, { "epoch": 4.884869878367106, "grad_norm": 3.249976873397827, "learning_rate": 0.0007615861678569265, "loss": 7.5453, "step": 1199100 }, { "epoch": 4.885277256390488, "grad_norm": 10.001482009887695, "learning_rate": 0.00076123132041872, "loss": 7.5468, "step": 1199200 }, { "epoch": 4.885684634413869, "grad_norm": 8.136263847351074, "learning_rate": 0.0007608765419210159, "loss": 7.5718, "step": 1199300 }, { "epoch": 4.886092012437251, "grad_norm": 3.457944631576538, "learning_rate": 0.0007605218323778406, "loss": 7.5435, "step": 1199400 }, { "epoch": 4.886499390460632, "grad_norm": 6.13539981842041, "learning_rate": 0.000760167191803218, "loss": 7.5457, "step": 1199500 }, { "epoch": 4.886906768484014, "grad_norm": 19.31534767150879, "learning_rate": 0.0007598126202111678, "loss": 7.5543, "step": 1199600 }, { "epoch": 4.8873141465073955, "grad_norm": 4.478299140930176, "learning_rate": 0.0007594581176157088, "loss": 7.5671, "step": 1199700 }, { "epoch": 4.887721524530777, "grad_norm": 3.900815010070801, "learning_rate": 0.0007591036840308561, "loss": 7.5428, "step": 1199800 }, { "epoch": 4.8881289025541586, "grad_norm": 3.649026393890381, "learning_rate": 0.0007587493194706228, "loss": 7.5303, "step": 1199900 }, { "epoch": 4.88853628057754, "grad_norm": 4.455073356628418, "learning_rate": 0.0007583950239490187, "loss": 7.5484, "step": 1200000 }, { "epoch": 4.88853628057754, "eval_MaskedAccuracy": 0.5117544590661958, "eval_loss": 1.5857608318328857, "eval_runtime": 162.1352, "eval_samples_per_second": 391.5, "eval_steps_per_second": 1.53, "step": 1200000 }, { "epoch": 4.888943658600922, "grad_norm": 10.334717750549316, "learning_rate": 0.0007580407974800497, "loss": 7.5526, "step": 1200100 }, { "epoch": 4.889351036624303, "grad_norm": 4.750763416290283, "learning_rate": 0.0007576866400777221, "loss": 7.5572, "step": 1200200 }, { "epoch": 4.889758414647684, "grad_norm": 3.92103910446167, "learning_rate": 0.000757332551756037, "loss": 7.5074, "step": 1200300 }, { "epoch": 4.890165792671065, "grad_norm": 4.871909141540527, "learning_rate": 0.0007569785325289936, "loss": 7.5334, "step": 1200400 }, { "epoch": 4.890573170694447, "grad_norm": 3.145890474319458, "learning_rate": 0.000756624582410587, "loss": 7.5789, "step": 1200500 }, { "epoch": 4.890980548717828, "grad_norm": 4.591555595397949, "learning_rate": 0.0007562707014148116, "loss": 7.5519, "step": 1200600 }, { "epoch": 4.89138792674121, "grad_norm": 4.398261547088623, "learning_rate": 0.0007559168895556579, "loss": 7.5337, "step": 1200700 }, { "epoch": 4.8917953047645915, "grad_norm": 3.0164170265197754, "learning_rate": 0.0007555631468471149, "loss": 7.5538, "step": 1200800 }, { "epoch": 4.892202682787973, "grad_norm": 7.808837413787842, "learning_rate": 0.000755209473303168, "loss": 7.5553, "step": 1200900 }, { "epoch": 4.8926100608113545, "grad_norm": 6.134701251983643, "learning_rate": 0.0007548558689377991, "loss": 7.5678, "step": 1201000 }, { "epoch": 4.8926100608113545, "eval_MaskedAccuracy": 0.5118214447349616, "eval_loss": 1.5958538055419922, "eval_runtime": 165.0679, "eval_samples_per_second": 384.545, "eval_steps_per_second": 1.502, "step": 1201000 }, { "epoch": 4.893017438834736, "grad_norm": 15.148693084716797, "learning_rate": 0.0007545023337649872, "loss": 7.556, "step": 1201100 }, { "epoch": 4.893424816858118, "grad_norm": 6.748607158660889, "learning_rate": 0.0007541488677987109, "loss": 7.5368, "step": 1201200 }, { "epoch": 4.893832194881499, "grad_norm": 17.395864486694336, "learning_rate": 0.000753795471052944, "loss": 7.5801, "step": 1201300 }, { "epoch": 4.894239572904881, "grad_norm": 14.391432762145996, "learning_rate": 0.0007534421435416585, "loss": 7.5487, "step": 1201400 }, { "epoch": 4.894646950928262, "grad_norm": 7.4796061515808105, "learning_rate": 0.0007530888852788226, "loss": 7.5542, "step": 1201500 }, { "epoch": 4.895054328951643, "grad_norm": 9.131574630737305, "learning_rate": 0.000752735696278403, "loss": 7.552, "step": 1201600 }, { "epoch": 4.895461706975024, "grad_norm": 14.780279159545898, "learning_rate": 0.0007523825765543629, "loss": 7.5478, "step": 1201700 }, { "epoch": 4.895869084998406, "grad_norm": 3.968935251235962, "learning_rate": 0.0007520295261206631, "loss": 7.5934, "step": 1201800 }, { "epoch": 4.896276463021787, "grad_norm": 11.925000190734863, "learning_rate": 0.000751676544991263, "loss": 7.5472, "step": 1201900 }, { "epoch": 4.896683841045169, "grad_norm": 13.403853416442871, "learning_rate": 0.0007513236331801159, "loss": 7.5358, "step": 1202000 }, { "epoch": 4.896683841045169, "eval_MaskedAccuracy": 0.511778153741451, "eval_loss": 1.5913338661193848, "eval_runtime": 312.7231, "eval_samples_per_second": 202.978, "eval_steps_per_second": 0.793, "step": 1202000 }, { "epoch": 4.8970912190685505, "grad_norm": 18.489057540893555, "learning_rate": 0.0007509707907011745, "loss": 7.5609, "step": 1202100 }, { "epoch": 4.897498597091932, "grad_norm": 6.835686206817627, "learning_rate": 0.0007506180175683894, "loss": 7.5582, "step": 1202200 }, { "epoch": 4.897905975115314, "grad_norm": 4.83351469039917, "learning_rate": 0.0007502653137957075, "loss": 7.5397, "step": 1202300 }, { "epoch": 4.898313353138695, "grad_norm": 17.712465286254883, "learning_rate": 0.0007499126793970722, "loss": 7.5395, "step": 1202400 }, { "epoch": 4.898720731162077, "grad_norm": 7.643850803375244, "learning_rate": 0.0007495601143864254, "loss": 7.5416, "step": 1202500 }, { "epoch": 4.899128109185458, "grad_norm": 3.950080156326294, "learning_rate": 0.0007492076187777062, "loss": 7.5261, "step": 1202600 }, { "epoch": 4.899535487208839, "grad_norm": 6.754606246948242, "learning_rate": 0.0007488551925848507, "loss": 7.5333, "step": 1202700 }, { "epoch": 4.899942865232221, "grad_norm": 13.595417976379395, "learning_rate": 0.0007485028358217915, "loss": 7.5228, "step": 1202800 }, { "epoch": 4.900350243255602, "grad_norm": 3.7433109283447266, "learning_rate": 0.0007481505485024593, "loss": 7.5514, "step": 1202900 }, { "epoch": 4.900757621278983, "grad_norm": 4.0549492835998535, "learning_rate": 0.0007477983306407826, "loss": 7.5417, "step": 1203000 }, { "epoch": 4.900757621278983, "eval_MaskedAccuracy": 0.5117592686952326, "eval_loss": 1.5817937850952148, "eval_runtime": 186.8929, "eval_samples_per_second": 339.638, "eval_steps_per_second": 1.327, "step": 1203000 }, { "epoch": 4.901164999302365, "grad_norm": 12.048553466796875, "learning_rate": 0.0007474461822506855, "loss": 7.5358, "step": 1203100 }, { "epoch": 4.9015723773257465, "grad_norm": 3.9458818435668945, "learning_rate": 0.0007470941033460917, "loss": 7.5381, "step": 1203200 }, { "epoch": 4.901979755349128, "grad_norm": 14.209798812866211, "learning_rate": 0.0007467420939409201, "loss": 7.5116, "step": 1203300 }, { "epoch": 4.9023871333725095, "grad_norm": 3.4636948108673096, "learning_rate": 0.0007463901540490868, "loss": 7.5462, "step": 1203400 }, { "epoch": 4.902794511395891, "grad_norm": 8.81868839263916, "learning_rate": 0.0007460382836845061, "loss": 7.5566, "step": 1203500 }, { "epoch": 4.903201889419273, "grad_norm": 8.757772445678711, "learning_rate": 0.0007456864828610907, "loss": 7.5783, "step": 1203600 }, { "epoch": 4.903609267442654, "grad_norm": 20.694164276123047, "learning_rate": 0.0007453347515927477, "loss": 7.555, "step": 1203700 }, { "epoch": 4.904016645466036, "grad_norm": 11.508885383605957, "learning_rate": 0.0007449830898933838, "loss": 7.5654, "step": 1203800 }, { "epoch": 4.904424023489417, "grad_norm": 4.550755500793457, "learning_rate": 0.0007446314977769009, "loss": 7.5359, "step": 1203900 }, { "epoch": 4.904831401512798, "grad_norm": 8.318005561828613, "learning_rate": 0.0007442799752571998, "loss": 7.5477, "step": 1204000 }, { "epoch": 4.904831401512798, "eval_MaskedAccuracy": 0.5113642599044101, "eval_loss": 1.5978615283966064, "eval_runtime": 165.3288, "eval_samples_per_second": 383.938, "eval_steps_per_second": 1.5, "step": 1204000 }, { "epoch": 4.905238779536179, "grad_norm": 12.498735427856445, "learning_rate": 0.0007439285223481781, "loss": 7.5535, "step": 1204100 }, { "epoch": 4.905646157559561, "grad_norm": 9.212076187133789, "learning_rate": 0.0007435771390637318, "loss": 7.532, "step": 1204200 }, { "epoch": 4.906053535582942, "grad_norm": 18.942237854003906, "learning_rate": 0.0007432258254177514, "loss": 7.5348, "step": 1204300 }, { "epoch": 4.906460913606324, "grad_norm": 13.009732246398926, "learning_rate": 0.0007428745814241261, "loss": 7.5604, "step": 1204400 }, { "epoch": 4.9068682916297055, "grad_norm": 7.673408508300781, "learning_rate": 0.0007425234070967434, "loss": 7.5487, "step": 1204500 }, { "epoch": 4.907275669653087, "grad_norm": 2.9380886554718018, "learning_rate": 0.0007421723024494868, "loss": 7.5483, "step": 1204600 }, { "epoch": 4.907683047676469, "grad_norm": 5.434240818023682, "learning_rate": 0.0007418212674962382, "loss": 7.5482, "step": 1204700 }, { "epoch": 4.90809042569985, "grad_norm": 8.399138450622559, "learning_rate": 0.0007414703022508739, "loss": 7.5417, "step": 1204800 }, { "epoch": 4.908497803723232, "grad_norm": 4.983442783355713, "learning_rate": 0.0007411194067272717, "loss": 7.5382, "step": 1204900 }, { "epoch": 4.908905181746613, "grad_norm": 4.560715198516846, "learning_rate": 0.0007407685809393033, "loss": 7.5431, "step": 1205000 }, { "epoch": 4.908905181746613, "eval_MaskedAccuracy": 0.5120175489996054, "eval_loss": 1.6002388000488281, "eval_runtime": 166.2125, "eval_samples_per_second": 381.897, "eval_steps_per_second": 1.492, "step": 1205000 }, { "epoch": 4.909312559769995, "grad_norm": 13.783533096313477, "learning_rate": 0.0007404178249008389, "loss": 7.5595, "step": 1205100 }, { "epoch": 4.909719937793376, "grad_norm": 13.345892906188965, "learning_rate": 0.0007400671386257446, "loss": 7.5346, "step": 1205200 }, { "epoch": 4.910127315816757, "grad_norm": 5.098935604095459, "learning_rate": 0.0007397165221278855, "loss": 7.5577, "step": 1205300 }, { "epoch": 4.910534693840138, "grad_norm": 3.479382276535034, "learning_rate": 0.0007393659754211242, "loss": 7.5385, "step": 1205400 }, { "epoch": 4.91094207186352, "grad_norm": 8.8301362991333, "learning_rate": 0.0007390154985193192, "loss": 7.5576, "step": 1205500 }, { "epoch": 4.9113494498869015, "grad_norm": 5.703672885894775, "learning_rate": 0.0007386650914363256, "loss": 7.5512, "step": 1205600 }, { "epoch": 4.911756827910283, "grad_norm": 8.167853355407715, "learning_rate": 0.0007383147541859996, "loss": 7.5703, "step": 1205700 }, { "epoch": 4.9121642059336645, "grad_norm": 5.829007148742676, "learning_rate": 0.0007379644867821892, "loss": 7.5496, "step": 1205800 }, { "epoch": 4.912571583957046, "grad_norm": 16.226884841918945, "learning_rate": 0.0007376142892387445, "loss": 7.5726, "step": 1205900 }, { "epoch": 4.912978961980428, "grad_norm": 18.150897979736328, "learning_rate": 0.0007372641615695092, "loss": 7.5522, "step": 1206000 }, { "epoch": 4.912978961980428, "eval_MaskedAccuracy": 0.5119257946600297, "eval_loss": 1.587544560432434, "eval_runtime": 159.4616, "eval_samples_per_second": 398.064, "eval_steps_per_second": 1.555, "step": 1206000 }, { "epoch": 4.913386340003809, "grad_norm": 7.121440410614014, "learning_rate": 0.0007369141037883261, "loss": 7.5279, "step": 1206100 }, { "epoch": 4.913793718027191, "grad_norm": 15.683066368103027, "learning_rate": 0.000736564115909035, "loss": 7.5256, "step": 1206200 }, { "epoch": 4.914201096050572, "grad_norm": 10.379250526428223, "learning_rate": 0.0007362141979454733, "loss": 7.5627, "step": 1206300 }, { "epoch": 4.914608474073954, "grad_norm": 11.896134376525879, "learning_rate": 0.0007358643499114731, "loss": 7.5345, "step": 1206400 }, { "epoch": 4.915015852097335, "grad_norm": 4.112306594848633, "learning_rate": 0.0007355145718208677, "loss": 7.5472, "step": 1206500 }, { "epoch": 4.915423230120716, "grad_norm": 4.409004211425781, "learning_rate": 0.0007351648636874854, "loss": 7.5803, "step": 1206600 }, { "epoch": 4.9158306081440974, "grad_norm": 8.464868545532227, "learning_rate": 0.0007348152255251507, "loss": 7.505, "step": 1206700 }, { "epoch": 4.916237986167479, "grad_norm": 7.594976902008057, "learning_rate": 0.0007344656573476882, "loss": 7.5448, "step": 1206800 }, { "epoch": 4.9166453641908605, "grad_norm": 4.152006149291992, "learning_rate": 0.0007341161591689183, "loss": 7.5382, "step": 1206900 }, { "epoch": 4.917052742214242, "grad_norm": 3.523061990737915, "learning_rate": 0.0007337667310026584, "loss": 7.5292, "step": 1207000 }, { "epoch": 4.917052742214242, "eval_MaskedAccuracy": 0.5113573410134107, "eval_loss": 1.6028958559036255, "eval_runtime": 154.3865, "eval_samples_per_second": 411.15, "eval_steps_per_second": 1.606, "step": 1207000 }, { "epoch": 4.917460120237624, "grad_norm": 4.540543556213379, "learning_rate": 0.0007334173728627229, "loss": 7.5733, "step": 1207100 }, { "epoch": 4.917867498261005, "grad_norm": 6.582998275756836, "learning_rate": 0.0007330680847629223, "loss": 7.5597, "step": 1207200 }, { "epoch": 4.918274876284387, "grad_norm": 4.1407904624938965, "learning_rate": 0.0007327188667170672, "loss": 7.5584, "step": 1207300 }, { "epoch": 4.918682254307768, "grad_norm": 13.84073257446289, "learning_rate": 0.0007323697187389647, "loss": 7.5483, "step": 1207400 }, { "epoch": 4.91908963233115, "grad_norm": 6.500828742980957, "learning_rate": 0.0007320206408424183, "loss": 7.5393, "step": 1207500 }, { "epoch": 4.919497010354531, "grad_norm": 3.3657803535461426, "learning_rate": 0.0007316716330412285, "loss": 7.542, "step": 1207600 }, { "epoch": 4.919904388377912, "grad_norm": 4.423755168914795, "learning_rate": 0.000731322695349193, "loss": 7.5362, "step": 1207700 }, { "epoch": 4.920311766401294, "grad_norm": 3.881791353225708, "learning_rate": 0.0007309738277801082, "loss": 7.5344, "step": 1207800 }, { "epoch": 4.920719144424675, "grad_norm": 10.870352745056152, "learning_rate": 0.0007306250303477654, "loss": 7.5518, "step": 1207900 }, { "epoch": 4.9211265224480565, "grad_norm": 4.204110145568848, "learning_rate": 0.0007302763030659553, "loss": 7.5573, "step": 1208000 }, { "epoch": 4.9211265224480565, "eval_MaskedAccuracy": 0.512164743597124, "eval_loss": 1.5818017721176147, "eval_runtime": 187.1196, "eval_samples_per_second": 339.227, "eval_steps_per_second": 1.325, "step": 1208000 }, { "epoch": 4.921533900471438, "grad_norm": 12.669214248657227, "learning_rate": 0.0007299276459484648, "loss": 7.5223, "step": 1208100 }, { "epoch": 4.9219412784948195, "grad_norm": 6.498908996582031, "learning_rate": 0.0007295790590090782, "loss": 7.5455, "step": 1208200 }, { "epoch": 4.922348656518201, "grad_norm": 11.104379653930664, "learning_rate": 0.0007292305422615774, "loss": 7.5407, "step": 1208300 }, { "epoch": 4.922756034541583, "grad_norm": 2.674159526824951, "learning_rate": 0.0007288820957197401, "loss": 7.5591, "step": 1208400 }, { "epoch": 4.923163412564964, "grad_norm": 15.331785202026367, "learning_rate": 0.000728533719397344, "loss": 7.5599, "step": 1208500 }, { "epoch": 4.923570790588346, "grad_norm": 8.375259399414062, "learning_rate": 0.0007281854133081597, "loss": 7.5597, "step": 1208600 }, { "epoch": 4.923978168611727, "grad_norm": 11.889664649963379, "learning_rate": 0.0007278371774659603, "loss": 7.5819, "step": 1208700 }, { "epoch": 4.924385546635109, "grad_norm": 7.2464141845703125, "learning_rate": 0.0007274890118845125, "loss": 7.578, "step": 1208800 }, { "epoch": 4.92479292465849, "grad_norm": 4.1132659912109375, "learning_rate": 0.0007271409165775803, "loss": 7.5622, "step": 1208900 }, { "epoch": 4.925200302681871, "grad_norm": 7.361584186553955, "learning_rate": 0.0007267928915589258, "loss": 7.5246, "step": 1209000 }, { "epoch": 4.925200302681871, "eval_MaskedAccuracy": 0.5119248000298258, "eval_loss": 1.5902751684188843, "eval_runtime": 198.846, "eval_samples_per_second": 319.222, "eval_steps_per_second": 1.247, "step": 1209000 }, { "epoch": 4.9256076807052525, "grad_norm": 8.776246070861816, "learning_rate": 0.000726444936842309, "loss": 7.5099, "step": 1209100 }, { "epoch": 4.926015058728634, "grad_norm": 16.761333465576172, "learning_rate": 0.0007260970524414868, "loss": 7.5745, "step": 1209200 }, { "epoch": 4.9264224367520155, "grad_norm": 7.601357936859131, "learning_rate": 0.0007257492383702113, "loss": 7.5531, "step": 1209300 }, { "epoch": 4.926829814775397, "grad_norm": 3.0784542560577393, "learning_rate": 0.0007254014946422347, "loss": 7.5417, "step": 1209400 }, { "epoch": 4.927237192798779, "grad_norm": 13.504761695861816, "learning_rate": 0.0007250538212713058, "loss": 7.5265, "step": 1209500 }, { "epoch": 4.92764457082216, "grad_norm": 4.976802349090576, "learning_rate": 0.0007247062182711677, "loss": 7.5552, "step": 1209600 }, { "epoch": 4.928051948845542, "grad_norm": 7.255842208862305, "learning_rate": 0.000724358685655566, "loss": 7.5497, "step": 1209700 }, { "epoch": 4.928459326868923, "grad_norm": 6.7689528465271, "learning_rate": 0.0007240112234382386, "loss": 7.5878, "step": 1209800 }, { "epoch": 4.928866704892305, "grad_norm": 3.824451208114624, "learning_rate": 0.0007236638316329236, "loss": 7.5322, "step": 1209900 }, { "epoch": 4.929274082915686, "grad_norm": 9.546440124511719, "learning_rate": 0.0007233165102533543, "loss": 7.5591, "step": 1210000 }, { "epoch": 4.929274082915686, "eval_MaskedAccuracy": 0.5115372377019847, "eval_loss": 1.60365891456604, "eval_runtime": 155.2988, "eval_samples_per_second": 408.735, "eval_steps_per_second": 1.597, "step": 1210000 }, { "epoch": 4.929681460939068, "grad_norm": 5.6259002685546875, "learning_rate": 0.0007229692593132624, "loss": 7.5387, "step": 1210100 }, { "epoch": 4.930088838962449, "grad_norm": 15.719690322875977, "learning_rate": 0.0007226220788263771, "loss": 7.5471, "step": 1210200 }, { "epoch": 4.93049621698583, "grad_norm": 8.795516967773438, "learning_rate": 0.0007222749688064227, "loss": 7.5553, "step": 1210300 }, { "epoch": 4.9309035950092115, "grad_norm": 5.547180652618408, "learning_rate": 0.0007219279292671238, "loss": 7.5224, "step": 1210400 }, { "epoch": 4.931310973032593, "grad_norm": 3.618469476699829, "learning_rate": 0.0007215809602222006, "loss": 7.5578, "step": 1210500 }, { "epoch": 4.931718351055975, "grad_norm": 9.542546272277832, "learning_rate": 0.00072123406168537, "loss": 7.5169, "step": 1210600 }, { "epoch": 4.932125729079356, "grad_norm": 9.178125381469727, "learning_rate": 0.0007208872336703475, "loss": 7.5434, "step": 1210700 }, { "epoch": 4.932533107102738, "grad_norm": 3.8660006523132324, "learning_rate": 0.0007205404761908446, "loss": 7.5454, "step": 1210800 }, { "epoch": 4.932940485126119, "grad_norm": 6.547489166259766, "learning_rate": 0.000720193789260571, "loss": 7.5517, "step": 1210900 }, { "epoch": 4.933347863149501, "grad_norm": 4.7753825187683105, "learning_rate": 0.0007198471728932326, "loss": 7.5577, "step": 1211000 }, { "epoch": 4.933347863149501, "eval_MaskedAccuracy": 0.512251721091413, "eval_loss": 1.5960654020309448, "eval_runtime": 161.1451, "eval_samples_per_second": 393.906, "eval_steps_per_second": 1.539, "step": 1211000 }, { "epoch": 4.933755241172882, "grad_norm": 12.791091918945312, "learning_rate": 0.000719500627102533, "loss": 7.5337, "step": 1211100 }, { "epoch": 4.934162619196264, "grad_norm": 9.975597381591797, "learning_rate": 0.0007191541519021734, "loss": 7.5377, "step": 1211200 }, { "epoch": 4.934569997219645, "grad_norm": 8.221304893493652, "learning_rate": 0.0007188077473058509, "loss": 7.5487, "step": 1211300 }, { "epoch": 4.934977375243027, "grad_norm": 7.139200210571289, "learning_rate": 0.0007184614133272621, "loss": 7.5161, "step": 1211400 }, { "epoch": 4.935384753266408, "grad_norm": 7.881173610687256, "learning_rate": 0.0007181151499800989, "loss": 7.545, "step": 1211500 }, { "epoch": 4.935792131289789, "grad_norm": 13.266868591308594, "learning_rate": 0.0007177689572780508, "loss": 7.5327, "step": 1211600 }, { "epoch": 4.9361995093131705, "grad_norm": 5.918792247772217, "learning_rate": 0.0007174228352348036, "loss": 7.5473, "step": 1211700 }, { "epoch": 4.936606887336552, "grad_norm": 8.716567039489746, "learning_rate": 0.0007170767838640428, "loss": 7.5205, "step": 1211800 }, { "epoch": 4.937014265359934, "grad_norm": 9.766985893249512, "learning_rate": 0.0007167308031794493, "loss": 7.5614, "step": 1211900 }, { "epoch": 4.937421643383315, "grad_norm": 9.446882247924805, "learning_rate": 0.0007163848931947004, "loss": 7.5108, "step": 1212000 }, { "epoch": 4.937421643383315, "eval_MaskedAccuracy": 0.5119367433537259, "eval_loss": 1.5985194444656372, "eval_runtime": 176.7689, "eval_samples_per_second": 359.09, "eval_steps_per_second": 1.403, "step": 1212000 }, { "epoch": 4.937829021406697, "grad_norm": 5.008060455322266, "learning_rate": 0.0007160390539234742, "loss": 7.5541, "step": 1212100 }, { "epoch": 4.938236399430078, "grad_norm": 6.689391136169434, "learning_rate": 0.0007156932853794427, "loss": 7.5419, "step": 1212200 }, { "epoch": 4.93864377745346, "grad_norm": 7.030514717102051, "learning_rate": 0.0007153475875762749, "loss": 7.5473, "step": 1212300 }, { "epoch": 4.939051155476841, "grad_norm": 9.665675163269043, "learning_rate": 0.0007150019605276384, "loss": 7.5458, "step": 1212400 }, { "epoch": 4.939458533500223, "grad_norm": 6.615889549255371, "learning_rate": 0.0007146564042471975, "loss": 7.5769, "step": 1212500 }, { "epoch": 4.939865911523604, "grad_norm": 17.90924644470215, "learning_rate": 0.0007143109187486155, "loss": 7.5709, "step": 1212600 }, { "epoch": 4.940273289546985, "grad_norm": 9.423774719238281, "learning_rate": 0.0007139655040455492, "loss": 7.547, "step": 1212700 }, { "epoch": 4.940680667570367, "grad_norm": 14.463005065917969, "learning_rate": 0.0007136201601516552, "loss": 7.5456, "step": 1212800 }, { "epoch": 4.941088045593748, "grad_norm": 15.357586860656738, "learning_rate": 0.0007132748870805885, "loss": 7.5488, "step": 1212900 }, { "epoch": 4.94149542361713, "grad_norm": 19.023876190185547, "learning_rate": 0.0007129296848459989, "loss": 7.5766, "step": 1213000 }, { "epoch": 4.94149542361713, "eval_MaskedAccuracy": 0.5120620997588456, "eval_loss": 1.595471739768982, "eval_runtime": 168.1678, "eval_samples_per_second": 377.456, "eval_steps_per_second": 1.475, "step": 1213000 }, { "epoch": 4.941902801640511, "grad_norm": 4.344493865966797, "learning_rate": 0.0007125845534615327, "loss": 7.5458, "step": 1213100 }, { "epoch": 4.942310179663893, "grad_norm": 19.665414810180664, "learning_rate": 0.000712239492940836, "loss": 7.5566, "step": 1213200 }, { "epoch": 4.942717557687274, "grad_norm": 12.394417762756348, "learning_rate": 0.0007118945032975498, "loss": 7.5318, "step": 1213300 }, { "epoch": 4.943124935710656, "grad_norm": 8.70321273803711, "learning_rate": 0.0007115495845453144, "loss": 7.5444, "step": 1213400 }, { "epoch": 4.943532313734037, "grad_norm": 7.687065124511719, "learning_rate": 0.0007112047366977659, "loss": 7.5605, "step": 1213500 }, { "epoch": 4.943939691757419, "grad_norm": 8.121780395507812, "learning_rate": 0.0007108599597685379, "loss": 7.5398, "step": 1213600 }, { "epoch": 4.9443470697808, "grad_norm": 3.469874382019043, "learning_rate": 0.0007105152537712617, "loss": 7.5462, "step": 1213700 }, { "epoch": 4.944754447804182, "grad_norm": 12.128028869628906, "learning_rate": 0.0007101706187195658, "loss": 7.5294, "step": 1213800 }, { "epoch": 4.945161825827563, "grad_norm": 18.595476150512695, "learning_rate": 0.0007098260546270746, "loss": 7.553, "step": 1213900 }, { "epoch": 4.945569203850944, "grad_norm": 11.615880966186523, "learning_rate": 0.0007094815615074105, "loss": 7.5637, "step": 1214000 }, { "epoch": 4.945569203850944, "eval_MaskedAccuracy": 0.5119515891990868, "eval_loss": 1.5936355590820312, "eval_runtime": 159.4551, "eval_samples_per_second": 398.081, "eval_steps_per_second": 1.555, "step": 1214000 }, { "epoch": 4.9459765818743255, "grad_norm": 18.023178100585938, "learning_rate": 0.0007091371393741934, "loss": 7.5547, "step": 1214100 }, { "epoch": 4.946383959897707, "grad_norm": 6.357263088226318, "learning_rate": 0.0007087927882410394, "loss": 7.5684, "step": 1214200 }, { "epoch": 4.946791337921089, "grad_norm": 20.91889190673828, "learning_rate": 0.0007084485081215631, "loss": 7.5231, "step": 1214300 }, { "epoch": 4.94719871594447, "grad_norm": 7.93168306350708, "learning_rate": 0.0007081042990293762, "loss": 7.5154, "step": 1214400 }, { "epoch": 4.947606093967852, "grad_norm": 3.209022045135498, "learning_rate": 0.000707760160978087, "loss": 7.5569, "step": 1214500 }, { "epoch": 4.948013471991233, "grad_norm": 5.462933540344238, "learning_rate": 0.0007074160939813006, "loss": 7.5551, "step": 1214600 }, { "epoch": 4.948420850014615, "grad_norm": 7.854034900665283, "learning_rate": 0.0007070720980526205, "loss": 7.5412, "step": 1214700 }, { "epoch": 4.948828228037996, "grad_norm": 5.540817737579346, "learning_rate": 0.0007067281732056463, "loss": 7.5218, "step": 1214800 }, { "epoch": 4.949235606061378, "grad_norm": 5.173191070556641, "learning_rate": 0.0007063843194539759, "loss": 7.5735, "step": 1214900 }, { "epoch": 4.949642984084759, "grad_norm": 7.646540641784668, "learning_rate": 0.0007060405368112032, "loss": 7.5173, "step": 1215000 }, { "epoch": 4.949642984084759, "eval_MaskedAccuracy": 0.5120245841505193, "eval_loss": 1.590340256690979, "eval_runtime": 181.7862, "eval_samples_per_second": 349.179, "eval_steps_per_second": 1.364, "step": 1215000 }, { "epoch": 4.950050362108141, "grad_norm": 5.17561149597168, "learning_rate": 0.000705696825290919, "loss": 7.563, "step": 1215100 }, { "epoch": 4.950457740131522, "grad_norm": 5.196922302246094, "learning_rate": 0.0007053531849067126, "loss": 7.5658, "step": 1215200 }, { "epoch": 4.950865118154903, "grad_norm": 9.764001846313477, "learning_rate": 0.0007050096156721698, "loss": 7.5574, "step": 1215300 }, { "epoch": 4.951272496178285, "grad_norm": 5.893946647644043, "learning_rate": 0.0007046661176008743, "loss": 7.5621, "step": 1215400 }, { "epoch": 4.951679874201666, "grad_norm": 4.240930557250977, "learning_rate": 0.0007043226907064059, "loss": 7.5274, "step": 1215500 }, { "epoch": 4.952087252225048, "grad_norm": 5.049214839935303, "learning_rate": 0.0007039793350023427, "loss": 7.5195, "step": 1215600 }, { "epoch": 4.952494630248429, "grad_norm": 7.266434192657471, "learning_rate": 0.0007036360505022579, "loss": 7.5359, "step": 1215700 }, { "epoch": 4.952902008271811, "grad_norm": 8.328526496887207, "learning_rate": 0.000703292837219725, "loss": 7.5646, "step": 1215800 }, { "epoch": 4.953309386295192, "grad_norm": 9.4588623046875, "learning_rate": 0.0007029496951683128, "loss": 7.5433, "step": 1215900 }, { "epoch": 4.953716764318574, "grad_norm": 16.228639602661133, "learning_rate": 0.0007026066243615868, "loss": 7.5312, "step": 1216000 }, { "epoch": 4.953716764318574, "eval_MaskedAccuracy": 0.5119462318759695, "eval_loss": 1.586015224456787, "eval_runtime": 157.8224, "eval_samples_per_second": 402.199, "eval_steps_per_second": 1.571, "step": 1216000 }, { "epoch": 4.954124142341955, "grad_norm": 6.658722877502441, "learning_rate": 0.0007022636248131115, "loss": 7.5458, "step": 1216100 }, { "epoch": 4.954531520365337, "grad_norm": 19.78124237060547, "learning_rate": 0.0007019206965364466, "loss": 7.5342, "step": 1216200 }, { "epoch": 4.954938898388718, "grad_norm": 13.313947677612305, "learning_rate": 0.0007015778395451491, "loss": 7.5773, "step": 1216300 }, { "epoch": 4.9553462764121, "grad_norm": 4.927038192749023, "learning_rate": 0.000701235053852775, "loss": 7.5592, "step": 1216400 }, { "epoch": 4.955753654435481, "grad_norm": 9.97121810913086, "learning_rate": 0.0007008923394728761, "loss": 7.5425, "step": 1216500 }, { "epoch": 4.956161032458862, "grad_norm": 16.75101661682129, "learning_rate": 0.0007005496964190028, "loss": 7.5683, "step": 1216600 }, { "epoch": 4.956568410482244, "grad_norm": 10.632609367370605, "learning_rate": 0.0007002071247047009, "loss": 7.5703, "step": 1216700 }, { "epoch": 4.956975788505625, "grad_norm": 9.779709815979004, "learning_rate": 0.0006998646243435141, "loss": 7.5214, "step": 1216800 }, { "epoch": 4.957383166529007, "grad_norm": 6.6657795906066895, "learning_rate": 0.0006995221953489826, "loss": 7.5268, "step": 1216900 }, { "epoch": 4.957790544552388, "grad_norm": 18.713905334472656, "learning_rate": 0.0006991798377346457, "loss": 7.5422, "step": 1217000 }, { "epoch": 4.957790544552388, "eval_MaskedAccuracy": 0.5123652625997255, "eval_loss": 1.5796376466751099, "eval_runtime": 164.9988, "eval_samples_per_second": 384.706, "eval_steps_per_second": 1.503, "step": 1217000 }, { "epoch": 4.95819792257577, "grad_norm": 10.145832061767578, "learning_rate": 0.0006988375515140388, "loss": 7.5367, "step": 1217100 }, { "epoch": 4.958605300599151, "grad_norm": 17.383893966674805, "learning_rate": 0.0006984953367006932, "loss": 7.548, "step": 1217200 }, { "epoch": 4.959012678622533, "grad_norm": 3.454498291015625, "learning_rate": 0.0006981531933081381, "loss": 7.5563, "step": 1217300 }, { "epoch": 4.959420056645914, "grad_norm": 13.539298057556152, "learning_rate": 0.0006978111213499014, "loss": 7.5305, "step": 1217400 }, { "epoch": 4.959827434669296, "grad_norm": 9.780884742736816, "learning_rate": 0.0006974691208395057, "loss": 7.5471, "step": 1217500 }, { "epoch": 4.960234812692677, "grad_norm": 3.3330161571502686, "learning_rate": 0.0006971271917904734, "loss": 7.5407, "step": 1217600 }, { "epoch": 4.960642190716058, "grad_norm": 5.288601398468018, "learning_rate": 0.0006967853342163229, "loss": 7.5269, "step": 1217700 }, { "epoch": 4.9610495687394405, "grad_norm": 7.116894245147705, "learning_rate": 0.0006964435481305688, "loss": 7.547, "step": 1217800 }, { "epoch": 4.961456946762821, "grad_norm": 6.527875900268555, "learning_rate": 0.0006961018335467239, "loss": 7.5503, "step": 1217900 }, { "epoch": 4.961864324786203, "grad_norm": 6.521652698516846, "learning_rate": 0.0006957601904782982, "loss": 7.5476, "step": 1218000 }, { "epoch": 4.961864324786203, "eval_MaskedAccuracy": 0.512317709785432, "eval_loss": 1.587033748626709, "eval_runtime": 198.291, "eval_samples_per_second": 320.115, "eval_steps_per_second": 1.251, "step": 1218000 }, { "epoch": 4.962271702809584, "grad_norm": 5.053981304168701, "learning_rate": 0.0006954186189387996, "loss": 7.5563, "step": 1218100 }, { "epoch": 4.962679080832966, "grad_norm": 8.57411003112793, "learning_rate": 0.000695077118941731, "loss": 7.5518, "step": 1218200 }, { "epoch": 4.963086458856347, "grad_norm": 6.027956008911133, "learning_rate": 0.0006947356905005938, "loss": 7.5219, "step": 1218300 }, { "epoch": 4.963493836879729, "grad_norm": 13.150764465332031, "learning_rate": 0.0006943943336288865, "loss": 7.5229, "step": 1218400 }, { "epoch": 4.96390121490311, "grad_norm": 6.306985378265381, "learning_rate": 0.0006940530483401045, "loss": 7.5328, "step": 1218500 }, { "epoch": 4.964308592926492, "grad_norm": 3.376652717590332, "learning_rate": 0.0006937118346477413, "loss": 7.5606, "step": 1218600 }, { "epoch": 4.964715970949873, "grad_norm": 3.4649882316589355, "learning_rate": 0.0006933706925652879, "loss": 7.5298, "step": 1218700 }, { "epoch": 4.965123348973255, "grad_norm": 4.103401184082031, "learning_rate": 0.0006930296221062292, "loss": 7.5757, "step": 1218800 }, { "epoch": 4.9655307269966364, "grad_norm": 5.4330925941467285, "learning_rate": 0.0006926886232840514, "loss": 7.5406, "step": 1218900 }, { "epoch": 4.965938105020017, "grad_norm": 5.642839431762695, "learning_rate": 0.0006923476961122346, "loss": 7.5665, "step": 1219000 }, { "epoch": 4.965938105020017, "eval_MaskedAccuracy": 0.511894358141707, "eval_loss": 1.5927015542984009, "eval_runtime": 162.5161, "eval_samples_per_second": 390.583, "eval_steps_per_second": 1.526, "step": 1219000 }, { "epoch": 4.966345483043399, "grad_norm": 4.599015712738037, "learning_rate": 0.0006920068406042587, "loss": 7.5584, "step": 1219100 }, { "epoch": 4.96675286106678, "grad_norm": 4.394110679626465, "learning_rate": 0.0006916660567735992, "loss": 7.5543, "step": 1219200 }, { "epoch": 4.967160239090162, "grad_norm": 6.67686128616333, "learning_rate": 0.000691325344633728, "loss": 7.5341, "step": 1219300 }, { "epoch": 4.967567617113543, "grad_norm": 7.108152389526367, "learning_rate": 0.0006909847041981161, "loss": 7.539, "step": 1219400 }, { "epoch": 4.967974995136925, "grad_norm": 10.382682800292969, "learning_rate": 0.0006906441354802306, "loss": 7.5425, "step": 1219500 }, { "epoch": 4.968382373160306, "grad_norm": 9.100656509399414, "learning_rate": 0.0006903036384935381, "loss": 7.506, "step": 1219600 }, { "epoch": 4.968789751183688, "grad_norm": 4.32513952255249, "learning_rate": 0.000689963213251497, "loss": 7.515, "step": 1219700 }, { "epoch": 4.969197129207069, "grad_norm": 7.653156280517578, "learning_rate": 0.0006896228597675673, "loss": 7.5366, "step": 1219800 }, { "epoch": 4.969604507230451, "grad_norm": 15.72667121887207, "learning_rate": 0.0006892825780552056, "loss": 7.5449, "step": 1219900 }, { "epoch": 4.970011885253832, "grad_norm": 9.174826622009277, "learning_rate": 0.0006889423681278641, "loss": 7.5405, "step": 1220000 }, { "epoch": 4.970011885253832, "eval_MaskedAccuracy": 0.512115965289989, "eval_loss": 1.590442419052124, "eval_runtime": 160.9373, "eval_samples_per_second": 394.415, "eval_steps_per_second": 1.541, "step": 1220000 }, { "epoch": 4.970419263277214, "grad_norm": 6.958202362060547, "learning_rate": 0.0006886022299989953, "loss": 7.5262, "step": 1220100 }, { "epoch": 4.9708266413005955, "grad_norm": 8.148906707763672, "learning_rate": 0.0006882621636820443, "loss": 7.5343, "step": 1220200 }, { "epoch": 4.971234019323976, "grad_norm": 14.049768447875977, "learning_rate": 0.0006879221691904563, "loss": 7.5337, "step": 1220300 }, { "epoch": 4.971641397347358, "grad_norm": 6.911771297454834, "learning_rate": 0.0006875822465376738, "loss": 7.5715, "step": 1220400 }, { "epoch": 4.972048775370739, "grad_norm": 6.940609455108643, "learning_rate": 0.0006872423957371355, "loss": 7.5754, "step": 1220500 }, { "epoch": 4.972456153394121, "grad_norm": 11.549186706542969, "learning_rate": 0.000686902616802278, "loss": 7.5271, "step": 1220600 }, { "epoch": 4.972863531417502, "grad_norm": 3.5038044452667236, "learning_rate": 0.0006865629097465335, "loss": 7.529, "step": 1220700 }, { "epoch": 4.973270909440884, "grad_norm": 8.903679847717285, "learning_rate": 0.0006862232745833321, "loss": 7.5604, "step": 1220800 }, { "epoch": 4.973678287464265, "grad_norm": 19.229045867919922, "learning_rate": 0.0006858837113261025, "loss": 7.5344, "step": 1220900 }, { "epoch": 4.974085665487647, "grad_norm": 9.70226001739502, "learning_rate": 0.0006855442199882678, "loss": 7.5518, "step": 1221000 }, { "epoch": 4.974085665487647, "eval_MaskedAccuracy": 0.5120600921687889, "eval_loss": 1.584570288658142, "eval_runtime": 157.4361, "eval_samples_per_second": 403.186, "eval_steps_per_second": 1.575, "step": 1221000 }, { "epoch": 4.974493043511028, "grad_norm": 13.518266677856445, "learning_rate": 0.000685204800583253, "loss": 7.553, "step": 1221100 }, { "epoch": 4.97490042153441, "grad_norm": 13.211006164550781, "learning_rate": 0.0006848654531244754, "loss": 7.5281, "step": 1221200 }, { "epoch": 4.9753077995577915, "grad_norm": 12.042580604553223, "learning_rate": 0.0006845261776253507, "loss": 7.5236, "step": 1221300 }, { "epoch": 4.975715177581173, "grad_norm": 4.958207607269287, "learning_rate": 0.0006841869740992928, "loss": 7.5424, "step": 1221400 }, { "epoch": 4.9761225556045545, "grad_norm": 3.994436740875244, "learning_rate": 0.0006838478425597126, "loss": 7.5442, "step": 1221500 }, { "epoch": 4.976529933627935, "grad_norm": 8.007580757141113, "learning_rate": 0.0006835087830200165, "loss": 7.5554, "step": 1221600 }, { "epoch": 4.976937311651317, "grad_norm": 4.834044933319092, "learning_rate": 0.0006831697954936109, "loss": 7.5528, "step": 1221700 }, { "epoch": 4.977344689674698, "grad_norm": 7.90855073928833, "learning_rate": 0.0006828308799938971, "loss": 7.5414, "step": 1221800 }, { "epoch": 4.97775206769808, "grad_norm": 7.272604465484619, "learning_rate": 0.000682492036534274, "loss": 7.5618, "step": 1221900 }, { "epoch": 4.978159445721461, "grad_norm": 13.992048263549805, "learning_rate": 0.000682153265128138, "loss": 7.5439, "step": 1222000 }, { "epoch": 4.978159445721461, "eval_MaskedAccuracy": 0.511423294216237, "eval_loss": 1.590978980064392, "eval_runtime": 180.8448, "eval_samples_per_second": 350.997, "eval_steps_per_second": 1.371, "step": 1222000 }, { "epoch": 4.978566823744843, "grad_norm": 15.217825889587402, "learning_rate": 0.0006818145657888822, "loss": 7.5627, "step": 1222100 }, { "epoch": 4.978974201768224, "grad_norm": 6.383872032165527, "learning_rate": 0.0006814759385298989, "loss": 7.5386, "step": 1222200 }, { "epoch": 4.979381579791606, "grad_norm": 13.450451850891113, "learning_rate": 0.0006811373833645746, "loss": 7.5648, "step": 1222300 }, { "epoch": 4.979788957814987, "grad_norm": 12.187459945678711, "learning_rate": 0.0006807989003062939, "loss": 7.5256, "step": 1222400 }, { "epoch": 4.980196335838369, "grad_norm": 9.030609130859375, "learning_rate": 0.0006804604893684383, "loss": 7.5648, "step": 1222500 }, { "epoch": 4.9806037138617505, "grad_norm": 6.992572784423828, "learning_rate": 0.0006801221505643891, "loss": 7.5238, "step": 1222600 }, { "epoch": 4.981011091885131, "grad_norm": 3.9543697834014893, "learning_rate": 0.0006797838839075205, "loss": 7.5242, "step": 1222700 }, { "epoch": 4.981418469908514, "grad_norm": 9.706334114074707, "learning_rate": 0.0006794456894112076, "loss": 7.5423, "step": 1222800 }, { "epoch": 4.981825847931894, "grad_norm": 4.013082504272461, "learning_rate": 0.0006791075670888206, "loss": 7.5461, "step": 1222900 }, { "epoch": 4.982233225955276, "grad_norm": 6.769832611083984, "learning_rate": 0.0006787695169537265, "loss": 7.518, "step": 1223000 }, { "epoch": 4.982233225955276, "eval_MaskedAccuracy": 0.5118910308254481, "eval_loss": 1.6035711765289307, "eval_runtime": 172.2235, "eval_samples_per_second": 368.568, "eval_steps_per_second": 1.44, "step": 1223000 }, { "epoch": 4.982640603978657, "grad_norm": 9.585030555725098, "learning_rate": 0.0006784315390192912, "loss": 7.5537, "step": 1223100 }, { "epoch": 4.983047982002039, "grad_norm": 14.527358055114746, "learning_rate": 0.0006780936332988761, "loss": 7.5511, "step": 1223200 }, { "epoch": 4.98345536002542, "grad_norm": 9.325721740722656, "learning_rate": 0.0006777557998058404, "loss": 7.5221, "step": 1223300 }, { "epoch": 4.983862738048802, "grad_norm": 6.756707668304443, "learning_rate": 0.0006774180385535401, "loss": 7.5538, "step": 1223400 }, { "epoch": 4.984270116072183, "grad_norm": 9.356431007385254, "learning_rate": 0.0006770803495553293, "loss": 7.5535, "step": 1223500 }, { "epoch": 4.984677494095565, "grad_norm": 3.1493470668792725, "learning_rate": 0.00067674273282456, "loss": 7.5381, "step": 1223600 }, { "epoch": 4.9850848721189465, "grad_norm": 6.802088737487793, "learning_rate": 0.0006764051883745777, "loss": 7.5455, "step": 1223700 }, { "epoch": 4.985492250142328, "grad_norm": 18.060728073120117, "learning_rate": 0.0006760677162187292, "loss": 7.5534, "step": 1223800 }, { "epoch": 4.9858996281657095, "grad_norm": 3.9281394481658936, "learning_rate": 0.0006757303163703543, "loss": 7.5359, "step": 1223900 }, { "epoch": 4.98630700618909, "grad_norm": 12.208638191223145, "learning_rate": 0.0006753929888427948, "loss": 7.5467, "step": 1224000 }, { "epoch": 4.98630700618909, "eval_MaskedAccuracy": 0.5127115042746659, "eval_loss": 1.5897910594940186, "eval_runtime": 161.2034, "eval_samples_per_second": 393.763, "eval_steps_per_second": 1.538, "step": 1224000 }, { "epoch": 4.986714384212472, "grad_norm": 3.983851671218872, "learning_rate": 0.000675055733649386, "loss": 7.545, "step": 1224100 }, { "epoch": 4.987121762235853, "grad_norm": 11.683621406555176, "learning_rate": 0.000674718550803461, "loss": 7.5659, "step": 1224200 }, { "epoch": 4.987529140259235, "grad_norm": 5.003474712371826, "learning_rate": 0.0006743814403183518, "loss": 7.5639, "step": 1224300 }, { "epoch": 4.987936518282616, "grad_norm": 13.809906005859375, "learning_rate": 0.0006740444022073836, "loss": 7.5556, "step": 1224400 }, { "epoch": 4.988343896305998, "grad_norm": 12.487983703613281, "learning_rate": 0.0006737074364838838, "loss": 7.5397, "step": 1224500 }, { "epoch": 4.988751274329379, "grad_norm": 11.203004837036133, "learning_rate": 0.0006733705431611725, "loss": 7.5272, "step": 1224600 }, { "epoch": 4.989158652352761, "grad_norm": 5.616673946380615, "learning_rate": 0.0006730337222525702, "loss": 7.545, "step": 1224700 }, { "epoch": 4.989566030376142, "grad_norm": 6.645087718963623, "learning_rate": 0.0006726969737713924, "loss": 7.5325, "step": 1224800 }, { "epoch": 4.989973408399524, "grad_norm": 8.293774604797363, "learning_rate": 0.0006723602977309536, "loss": 7.5518, "step": 1224900 }, { "epoch": 4.9903807864229055, "grad_norm": 7.347590923309326, "learning_rate": 0.0006720236941445641, "loss": 7.5345, "step": 1225000 }, { "epoch": 4.9903807864229055, "eval_MaskedAccuracy": 0.5120031573281795, "eval_loss": 1.5872365236282349, "eval_runtime": 169.0849, "eval_samples_per_second": 375.409, "eval_steps_per_second": 1.467, "step": 1225000 }, { "epoch": 4.990788164446287, "grad_norm": 6.247953414916992, "learning_rate": 0.0006716871630255315, "loss": 7.5583, "step": 1225100 }, { "epoch": 4.991195542469669, "grad_norm": 9.45461654663086, "learning_rate": 0.0006713507043871601, "loss": 7.5196, "step": 1225200 }, { "epoch": 4.991602920493049, "grad_norm": 9.74386215209961, "learning_rate": 0.0006710143182427533, "loss": 7.5245, "step": 1225300 }, { "epoch": 4.992010298516431, "grad_norm": 15.22220230102539, "learning_rate": 0.0006706780046056085, "loss": 7.5144, "step": 1225400 }, { "epoch": 4.992417676539812, "grad_norm": 12.491472244262695, "learning_rate": 0.0006703417634890228, "loss": 7.5216, "step": 1225500 }, { "epoch": 4.992825054563194, "grad_norm": 13.283463478088379, "learning_rate": 0.00067000559490629, "loss": 7.5587, "step": 1225600 }, { "epoch": 4.993232432586575, "grad_norm": 4.220707416534424, "learning_rate": 0.0006696694988707003, "loss": 7.5687, "step": 1225700 }, { "epoch": 4.993639810609957, "grad_norm": 5.348223686218262, "learning_rate": 0.0006693334753955413, "loss": 7.5399, "step": 1225800 }, { "epoch": 4.994047188633338, "grad_norm": 5.098475456237793, "learning_rate": 0.0006689975244940982, "loss": 7.5416, "step": 1225900 }, { "epoch": 4.99445456665672, "grad_norm": 5.169957637786865, "learning_rate": 0.000668661646179652, "loss": 7.5448, "step": 1226000 }, { "epoch": 4.99445456665672, "eval_MaskedAccuracy": 0.5121886745386547, "eval_loss": 1.59209144115448, "eval_runtime": 169.7167, "eval_samples_per_second": 374.011, "eval_steps_per_second": 1.461, "step": 1226000 }, { "epoch": 4.9948619446801015, "grad_norm": 15.679362297058105, "learning_rate": 0.0006683258404654823, "loss": 7.5624, "step": 1226100 }, { "epoch": 4.995269322703483, "grad_norm": 5.099567890167236, "learning_rate": 0.0006679901073648651, "loss": 7.5409, "step": 1226200 }, { "epoch": 4.9956767007268645, "grad_norm": 18.295846939086914, "learning_rate": 0.0006676544468910742, "loss": 7.5332, "step": 1226300 }, { "epoch": 4.996084078750246, "grad_norm": 3.887827157974243, "learning_rate": 0.0006673188590573797, "loss": 7.5142, "step": 1226400 }, { "epoch": 4.996491456773628, "grad_norm": 4.531190395355225, "learning_rate": 0.0006669833438770492, "loss": 7.5377, "step": 1226500 }, { "epoch": 4.996898834797008, "grad_norm": 4.338390350341797, "learning_rate": 0.0006666479013633482, "loss": 7.5642, "step": 1226600 }, { "epoch": 4.99730621282039, "grad_norm": 6.9320197105407715, "learning_rate": 0.0006663125315295379, "loss": 7.5329, "step": 1226700 }, { "epoch": 4.997713590843771, "grad_norm": 4.566051483154297, "learning_rate": 0.0006659772343888761, "loss": 7.5234, "step": 1226800 }, { "epoch": 4.998120968867153, "grad_norm": 2.9808521270751953, "learning_rate": 0.0006656420099546208, "loss": 7.556, "step": 1226900 }, { "epoch": 4.998528346890534, "grad_norm": 13.253824234008789, "learning_rate": 0.0006653068582400242, "loss": 7.5222, "step": 1227000 }, { "epoch": 4.998528346890534, "eval_MaskedAccuracy": 0.5122886249159206, "eval_loss": 1.5890278816223145, "eval_runtime": 162.5801, "eval_samples_per_second": 390.429, "eval_steps_per_second": 1.525, "step": 1227000 }, { "epoch": 4.998935724913916, "grad_norm": 11.345479965209961, "learning_rate": 0.000664971779258337, "loss": 7.5476, "step": 1227100 }, { "epoch": 4.9993431029372974, "grad_norm": 3.7569751739501953, "learning_rate": 0.0006646367730228067, "loss": 7.5337, "step": 1227200 }, { "epoch": 4.999750480960679, "grad_norm": 14.374944686889648, "learning_rate": 0.0006643018395466777, "loss": 7.5692, "step": 1227300 }, { "epoch": 5.0001578589840605, "grad_norm": 5.712322235107422, "learning_rate": 0.0006639669788431905, "loss": 7.5522, "step": 1227400 }, { "epoch": 5.000565237007442, "grad_norm": 4.443236827850342, "learning_rate": 0.0006636321909255869, "loss": 7.5616, "step": 1227500 }, { "epoch": 5.000972615030824, "grad_norm": 8.470004081726074, "learning_rate": 0.0006632974758071009, "loss": 7.5996, "step": 1227600 }, { "epoch": 5.001379993054205, "grad_norm": 5.000079154968262, "learning_rate": 0.0006629628335009662, "loss": 7.5359, "step": 1227700 }, { "epoch": 5.001787371077586, "grad_norm": 9.659152030944824, "learning_rate": 0.0006626282640204117, "loss": 7.57, "step": 1227800 }, { "epoch": 5.002194749100967, "grad_norm": 3.7673120498657227, "learning_rate": 0.0006622937673786657, "loss": 7.562, "step": 1227900 }, { "epoch": 5.002602127124349, "grad_norm": 6.777065277099609, "learning_rate": 0.000661959343588953, "loss": 7.5599, "step": 1228000 }, { "epoch": 5.002602127124349, "eval_MaskedAccuracy": 0.5117803751789692, "eval_loss": 1.5943812131881714, "eval_runtime": 149.2434, "eval_samples_per_second": 425.319, "eval_steps_per_second": 1.662, "step": 1228000 }, { "epoch": 5.00300950514773, "grad_norm": 13.324762344360352, "learning_rate": 0.0006616249926644944, "loss": 7.5493, "step": 1228100 }, { "epoch": 5.003416883171112, "grad_norm": 13.191681861877441, "learning_rate": 0.0006612907146185087, "loss": 7.5454, "step": 1228200 }, { "epoch": 5.003824261194493, "grad_norm": 7.181677341461182, "learning_rate": 0.0006609565094642122, "loss": 7.5511, "step": 1228300 }, { "epoch": 5.004231639217875, "grad_norm": 21.544034957885742, "learning_rate": 0.0006606223772148165, "loss": 7.5532, "step": 1228400 }, { "epoch": 5.0046390172412565, "grad_norm": 10.261125564575195, "learning_rate": 0.0006602883178835334, "loss": 7.5641, "step": 1228500 }, { "epoch": 5.005046395264638, "grad_norm": 6.927099704742432, "learning_rate": 0.0006599543314835694, "loss": 7.5241, "step": 1228600 }, { "epoch": 5.0054537732880195, "grad_norm": 3.984013557434082, "learning_rate": 0.0006596204180281294, "loss": 7.5216, "step": 1228700 }, { "epoch": 5.005861151311401, "grad_norm": 7.137518405914307, "learning_rate": 0.000659286577530414, "loss": 7.5325, "step": 1228800 }, { "epoch": 5.006268529334783, "grad_norm": 8.60405158996582, "learning_rate": 0.0006589528100036211, "loss": 7.566, "step": 1228900 }, { "epoch": 5.006675907358164, "grad_norm": 13.43557357788086, "learning_rate": 0.0006586191154609472, "loss": 7.5554, "step": 1229000 }, { "epoch": 5.006675907358164, "eval_MaskedAccuracy": 0.5120016789837898, "eval_loss": 1.5948749780654907, "eval_runtime": 147.1504, "eval_samples_per_second": 431.368, "eval_steps_per_second": 1.685, "step": 1229000 }, { "epoch": 5.007083285381545, "grad_norm": 9.201789855957031, "learning_rate": 0.0006582854939155844, "loss": 7.5459, "step": 1229100 }, { "epoch": 5.007490663404926, "grad_norm": 19.81255340576172, "learning_rate": 0.0006579519453807234, "loss": 7.5338, "step": 1229200 }, { "epoch": 5.007898041428308, "grad_norm": 4.990116596221924, "learning_rate": 0.0006576184698695503, "loss": 7.5349, "step": 1229300 }, { "epoch": 5.008305419451689, "grad_norm": 9.439986228942871, "learning_rate": 0.0006572850673952495, "loss": 7.5722, "step": 1229400 }, { "epoch": 5.008712797475071, "grad_norm": 4.5637006759643555, "learning_rate": 0.0006569517379710025, "loss": 7.5144, "step": 1229500 }, { "epoch": 5.0091201754984525, "grad_norm": 13.385419845581055, "learning_rate": 0.0006566184816099879, "loss": 7.5402, "step": 1229600 }, { "epoch": 5.009527553521834, "grad_norm": 8.353857040405273, "learning_rate": 0.0006562852983253799, "loss": 7.5562, "step": 1229700 }, { "epoch": 5.0099349315452155, "grad_norm": 13.3303804397583, "learning_rate": 0.0006559521881303521, "loss": 7.5471, "step": 1229800 }, { "epoch": 5.010342309568597, "grad_norm": 12.460258483886719, "learning_rate": 0.0006556191510380728, "loss": 7.5468, "step": 1229900 }, { "epoch": 5.010749687591979, "grad_norm": 4.557527542114258, "learning_rate": 0.00065528618706171, "loss": 7.5539, "step": 1230000 }, { "epoch": 5.010749687591979, "eval_MaskedAccuracy": 0.5120977751441654, "eval_loss": 1.6034334897994995, "eval_runtime": 147.2259, "eval_samples_per_second": 431.147, "eval_steps_per_second": 1.684, "step": 1230000 }, { "epoch": 5.01115706561536, "grad_norm": 4.789175033569336, "learning_rate": 0.0006549532962144281, "loss": 7.5718, "step": 1230100 }, { "epoch": 5.011564443638742, "grad_norm": 6.723941802978516, "learning_rate": 0.0006546204785093867, "loss": 7.5443, "step": 1230200 }, { "epoch": 5.011971821662122, "grad_norm": 16.7134952545166, "learning_rate": 0.0006542877339597431, "loss": 7.5383, "step": 1230300 }, { "epoch": 5.012379199685504, "grad_norm": 5.229317665100098, "learning_rate": 0.0006539550625786548, "loss": 7.5319, "step": 1230400 }, { "epoch": 5.012786577708885, "grad_norm": 4.620895862579346, "learning_rate": 0.0006536224643792727, "loss": 7.5577, "step": 1230500 }, { "epoch": 5.013193955732267, "grad_norm": 15.162534713745117, "learning_rate": 0.0006532899393747466, "loss": 7.5539, "step": 1230600 }, { "epoch": 5.013601333755648, "grad_norm": 21.066532135009766, "learning_rate": 0.0006529574875782229, "loss": 7.5467, "step": 1230700 }, { "epoch": 5.01400871177903, "grad_norm": 4.801368236541748, "learning_rate": 0.0006526251090028454, "loss": 7.5722, "step": 1230800 }, { "epoch": 5.0144160898024115, "grad_norm": 15.411423683166504, "learning_rate": 0.0006522928036617541, "loss": 7.5509, "step": 1230900 }, { "epoch": 5.014823467825793, "grad_norm": 16.93613624572754, "learning_rate": 0.0006519605715680867, "loss": 7.5444, "step": 1231000 }, { "epoch": 5.014823467825793, "eval_MaskedAccuracy": 0.5123794414665871, "eval_loss": 1.5937520265579224, "eval_runtime": 150.3004, "eval_samples_per_second": 422.328, "eval_steps_per_second": 1.65, "step": 1231000 }, { "epoch": 5.015230845849175, "grad_norm": 4.480098724365234, "learning_rate": 0.0006516284127349796, "loss": 7.5674, "step": 1231100 }, { "epoch": 5.015638223872556, "grad_norm": 10.921262741088867, "learning_rate": 0.000651296327175563, "loss": 7.5748, "step": 1231200 }, { "epoch": 5.016045601895938, "grad_norm": 5.700741291046143, "learning_rate": 0.0006509643149029676, "loss": 7.5292, "step": 1231300 }, { "epoch": 5.016452979919319, "grad_norm": 12.017867088317871, "learning_rate": 0.0006506323759303185, "loss": 7.5319, "step": 1231400 }, { "epoch": 5.016860357942701, "grad_norm": 4.3226094245910645, "learning_rate": 0.0006503005102707396, "loss": 7.5432, "step": 1231500 }, { "epoch": 5.017267735966081, "grad_norm": 4.801712989807129, "learning_rate": 0.0006499687179373513, "loss": 7.552, "step": 1231600 }, { "epoch": 5.017675113989463, "grad_norm": 11.267165184020996, "learning_rate": 0.000649636998943271, "loss": 7.5417, "step": 1231700 }, { "epoch": 5.018082492012844, "grad_norm": 5.242594242095947, "learning_rate": 0.0006493053533016136, "loss": 7.5636, "step": 1231800 }, { "epoch": 5.018489870036226, "grad_norm": 6.6134114265441895, "learning_rate": 0.0006489737810254903, "loss": 7.5765, "step": 1231900 }, { "epoch": 5.0188972480596075, "grad_norm": 12.343679428100586, "learning_rate": 0.0006486422821280105, "loss": 7.54, "step": 1232000 }, { "epoch": 5.0188972480596075, "eval_MaskedAccuracy": 0.5116733871540049, "eval_loss": 1.592746615409851, "eval_runtime": 149.8209, "eval_samples_per_second": 423.679, "eval_steps_per_second": 1.655, "step": 1232000 }, { "epoch": 5.019304626082989, "grad_norm": 6.2411675453186035, "learning_rate": 0.00064831085662228, "loss": 7.5497, "step": 1232100 }, { "epoch": 5.0197120041063705, "grad_norm": 14.235241889953613, "learning_rate": 0.000647979504521401, "loss": 7.5724, "step": 1232200 }, { "epoch": 5.020119382129752, "grad_norm": 9.591054916381836, "learning_rate": 0.0006476482258384742, "loss": 7.5926, "step": 1232300 }, { "epoch": 5.020526760153134, "grad_norm": 7.400914192199707, "learning_rate": 0.0006473170205865959, "loss": 7.5692, "step": 1232400 }, { "epoch": 5.020934138176515, "grad_norm": 7.066140651702881, "learning_rate": 0.0006469858887788625, "loss": 7.5784, "step": 1232500 }, { "epoch": 5.021341516199897, "grad_norm": 4.17603874206543, "learning_rate": 0.0006466548304283648, "loss": 7.57, "step": 1232600 }, { "epoch": 5.021748894223278, "grad_norm": 4.827794075012207, "learning_rate": 0.0006463238455481907, "loss": 7.5599, "step": 1232700 }, { "epoch": 5.022156272246659, "grad_norm": 3.1116693019866943, "learning_rate": 0.0006459929341514248, "loss": 7.5577, "step": 1232800 }, { "epoch": 5.02256365027004, "grad_norm": 6.376616477966309, "learning_rate": 0.0006456620962511512, "loss": 7.5453, "step": 1232900 }, { "epoch": 5.022971028293422, "grad_norm": 5.128245830535889, "learning_rate": 0.00064533133186045, "loss": 7.5689, "step": 1233000 }, { "epoch": 5.022971028293422, "eval_MaskedAccuracy": 0.5119402879554316, "eval_loss": 1.5978705883026123, "eval_runtime": 151.5867, "eval_samples_per_second": 418.744, "eval_steps_per_second": 1.636, "step": 1233000 }, { "epoch": 5.023378406316803, "grad_norm": 4.2856950759887695, "learning_rate": 0.000645000640992397, "loss": 7.5258, "step": 1233100 }, { "epoch": 5.023785784340185, "grad_norm": 6.219287395477295, "learning_rate": 0.000644670023660067, "loss": 7.5731, "step": 1233200 }, { "epoch": 5.0241931623635665, "grad_norm": 11.586651802062988, "learning_rate": 0.00064433947987653, "loss": 7.5728, "step": 1233300 }, { "epoch": 5.024600540386948, "grad_norm": 15.761351585388184, "learning_rate": 0.0006440090096548546, "loss": 7.5649, "step": 1233400 }, { "epoch": 5.02500791841033, "grad_norm": 10.314072608947754, "learning_rate": 0.0006436786130081072, "loss": 7.5498, "step": 1233500 }, { "epoch": 5.025415296433711, "grad_norm": 11.7467041015625, "learning_rate": 0.0006433482899493493, "loss": 7.5529, "step": 1233600 }, { "epoch": 5.025822674457093, "grad_norm": 9.369192123413086, "learning_rate": 0.00064301804049164, "loss": 7.5365, "step": 1233700 }, { "epoch": 5.026230052480474, "grad_norm": 6.06901741027832, "learning_rate": 0.0006426878646480359, "loss": 7.5585, "step": 1233800 }, { "epoch": 5.026637430503856, "grad_norm": 17.563222885131836, "learning_rate": 0.0006423577624315919, "loss": 7.5502, "step": 1233900 }, { "epoch": 5.027044808527236, "grad_norm": 5.640834331512451, "learning_rate": 0.0006420277338553568, "loss": 7.5621, "step": 1234000 }, { "epoch": 5.027044808527236, "eval_MaskedAccuracy": 0.5119214262047922, "eval_loss": 1.594098448753357, "eval_runtime": 171.365, "eval_samples_per_second": 370.414, "eval_steps_per_second": 1.447, "step": 1234000 }, { "epoch": 5.027452186550618, "grad_norm": 4.507350444793701, "learning_rate": 0.0006416977789323789, "loss": 7.5617, "step": 1234100 }, { "epoch": 5.027859564573999, "grad_norm": 7.678382873535156, "learning_rate": 0.0006413678976757045, "loss": 7.5864, "step": 1234200 }, { "epoch": 5.028266942597381, "grad_norm": 11.929092407226562, "learning_rate": 0.0006410380900983741, "loss": 7.5457, "step": 1234300 }, { "epoch": 5.0286743206207625, "grad_norm": 4.17978572845459, "learning_rate": 0.0006407083562134268, "loss": 7.5625, "step": 1234400 }, { "epoch": 5.029081698644144, "grad_norm": 10.068114280700684, "learning_rate": 0.0006403786960338991, "loss": 7.5691, "step": 1234500 }, { "epoch": 5.0294890766675255, "grad_norm": 13.511075019836426, "learning_rate": 0.0006400491095728252, "loss": 7.5597, "step": 1234600 }, { "epoch": 5.029896454690907, "grad_norm": 11.481148719787598, "learning_rate": 0.0006397195968432339, "loss": 7.5419, "step": 1234700 }, { "epoch": 5.030303832714289, "grad_norm": 6.723931789398193, "learning_rate": 0.0006393901578581525, "loss": 7.5497, "step": 1234800 }, { "epoch": 5.03071121073767, "grad_norm": 7.834571838378906, "learning_rate": 0.0006390607926306063, "loss": 7.5659, "step": 1234900 }, { "epoch": 5.031118588761052, "grad_norm": 8.159076690673828, "learning_rate": 0.000638731501173617, "loss": 7.541, "step": 1235000 }, { "epoch": 5.031118588761052, "eval_MaskedAccuracy": 0.5121700753974395, "eval_loss": 1.5960626602172852, "eval_runtime": 154.5013, "eval_samples_per_second": 410.844, "eval_steps_per_second": 1.605, "step": 1235000 }, { "epoch": 5.031525966784433, "grad_norm": 6.948030948638916, "learning_rate": 0.0006384022835002031, "loss": 7.5352, "step": 1235100 }, { "epoch": 5.031933344807815, "grad_norm": 2.986198663711548, "learning_rate": 0.0006380731396233794, "loss": 7.5597, "step": 1235200 }, { "epoch": 5.032340722831195, "grad_norm": 3.1850740909576416, "learning_rate": 0.0006377440695561599, "loss": 7.5361, "step": 1235300 }, { "epoch": 5.032748100854577, "grad_norm": 12.844918251037598, "learning_rate": 0.0006374150733115545, "loss": 7.56, "step": 1235400 }, { "epoch": 5.0331554788779584, "grad_norm": 5.665256977081299, "learning_rate": 0.0006370861509025694, "loss": 7.5411, "step": 1235500 }, { "epoch": 5.03356285690134, "grad_norm": 2.9927115440368652, "learning_rate": 0.0006367573023422098, "loss": 7.5629, "step": 1235600 }, { "epoch": 5.0339702349247215, "grad_norm": 6.249614715576172, "learning_rate": 0.0006364285276434758, "loss": 7.5761, "step": 1235700 }, { "epoch": 5.034377612948103, "grad_norm": 13.659110069274902, "learning_rate": 0.0006360998268193654, "loss": 7.5576, "step": 1235800 }, { "epoch": 5.034784990971485, "grad_norm": 4.137001991271973, "learning_rate": 0.0006357711998828752, "loss": 7.5726, "step": 1235900 }, { "epoch": 5.035192368994866, "grad_norm": 4.4844770431518555, "learning_rate": 0.0006354426468469963, "loss": 7.5949, "step": 1236000 }, { "epoch": 5.035192368994866, "eval_MaskedAccuracy": 0.5122887211273998, "eval_loss": 1.5912740230560303, "eval_runtime": 150.1366, "eval_samples_per_second": 422.788, "eval_steps_per_second": 1.652, "step": 1236000 }, { "epoch": 5.035599747018248, "grad_norm": 7.55839729309082, "learning_rate": 0.0006351141677247188, "loss": 7.5687, "step": 1236100 }, { "epoch": 5.036007125041629, "grad_norm": 4.499295234680176, "learning_rate": 0.0006347857625290293, "loss": 7.5633, "step": 1236200 }, { "epoch": 5.036414503065011, "grad_norm": 13.397000312805176, "learning_rate": 0.0006344574312729102, "loss": 7.5499, "step": 1236300 }, { "epoch": 5.036821881088392, "grad_norm": 5.78963565826416, "learning_rate": 0.0006341291739693437, "loss": 7.5459, "step": 1236400 }, { "epoch": 5.037229259111774, "grad_norm": 11.74659252166748, "learning_rate": 0.000633800990631307, "loss": 7.5468, "step": 1236500 }, { "epoch": 5.037636637135154, "grad_norm": 23.07303810119629, "learning_rate": 0.0006334728812717758, "loss": 7.5352, "step": 1236600 }, { "epoch": 5.038044015158536, "grad_norm": 8.282074928283691, "learning_rate": 0.000633144845903721, "loss": 7.533, "step": 1236700 }, { "epoch": 5.0384513931819175, "grad_norm": 11.630181312561035, "learning_rate": 0.0006328168845401113, "loss": 7.5395, "step": 1236800 }, { "epoch": 5.038858771205299, "grad_norm": 11.355972290039062, "learning_rate": 0.0006324889971939131, "loss": 7.5454, "step": 1236900 }, { "epoch": 5.0392661492286805, "grad_norm": 3.331683397293091, "learning_rate": 0.0006321611838780903, "loss": 7.5635, "step": 1237000 }, { "epoch": 5.0392661492286805, "eval_MaskedAccuracy": 0.5126326265318845, "eval_loss": 1.5927146673202515, "eval_runtime": 160.3941, "eval_samples_per_second": 395.75, "eval_steps_per_second": 1.546, "step": 1237000 }, { "epoch": 5.039673527252062, "grad_norm": 9.400132179260254, "learning_rate": 0.0006318334446056021, "loss": 7.5406, "step": 1237100 }, { "epoch": 5.040080905275444, "grad_norm": 12.600625991821289, "learning_rate": 0.0006315057793894067, "loss": 7.5457, "step": 1237200 }, { "epoch": 5.040488283298825, "grad_norm": 6.932764053344727, "learning_rate": 0.0006311781882424574, "loss": 7.5593, "step": 1237300 }, { "epoch": 5.040895661322207, "grad_norm": 6.773071765899658, "learning_rate": 0.0006308506711777069, "loss": 7.5205, "step": 1237400 }, { "epoch": 5.041303039345588, "grad_norm": 13.089978218078613, "learning_rate": 0.0006305232282081035, "loss": 7.5466, "step": 1237500 }, { "epoch": 5.04171041736897, "grad_norm": 5.462027549743652, "learning_rate": 0.0006301958593465915, "loss": 7.5614, "step": 1237600 }, { "epoch": 5.042117795392351, "grad_norm": 11.837106704711914, "learning_rate": 0.0006298685646061147, "loss": 7.5382, "step": 1237700 }, { "epoch": 5.042525173415732, "grad_norm": 9.655179023742676, "learning_rate": 0.0006295413439996122, "loss": 7.552, "step": 1237800 }, { "epoch": 5.0429325514391135, "grad_norm": 5.208213806152344, "learning_rate": 0.0006292141975400214, "loss": 7.5591, "step": 1237900 }, { "epoch": 5.043339929462495, "grad_norm": 6.39457893371582, "learning_rate": 0.0006288871252402751, "loss": 7.5299, "step": 1238000 }, { "epoch": 5.043339929462495, "eval_MaskedAccuracy": 0.5122286821516949, "eval_loss": 1.5957825183868408, "eval_runtime": 180.9774, "eval_samples_per_second": 350.74, "eval_steps_per_second": 1.37, "step": 1238000 }, { "epoch": 5.0437473074858765, "grad_norm": 3.443537473678589, "learning_rate": 0.000628560127113305, "loss": 7.562, "step": 1238100 }, { "epoch": 5.044154685509258, "grad_norm": 3.330414056777954, "learning_rate": 0.0006282332031720396, "loss": 7.5548, "step": 1238200 }, { "epoch": 5.04456206353264, "grad_norm": 4.9159955978393555, "learning_rate": 0.0006279063534294034, "loss": 7.5721, "step": 1238300 }, { "epoch": 5.044969441556021, "grad_norm": 3.3496997356414795, "learning_rate": 0.0006275795778983178, "loss": 7.5472, "step": 1238400 }, { "epoch": 5.045376819579403, "grad_norm": 2.7423839569091797, "learning_rate": 0.0006272528765917022, "loss": 7.5708, "step": 1238500 }, { "epoch": 5.045784197602784, "grad_norm": 8.792877197265625, "learning_rate": 0.000626926249522474, "loss": 7.5471, "step": 1238600 }, { "epoch": 5.046191575626166, "grad_norm": 12.140045166015625, "learning_rate": 0.0006265996967035461, "loss": 7.5585, "step": 1238700 }, { "epoch": 5.046598953649547, "grad_norm": 7.998610496520996, "learning_rate": 0.0006262732181478278, "loss": 7.5587, "step": 1238800 }, { "epoch": 5.047006331672929, "grad_norm": 5.750330924987793, "learning_rate": 0.000625946813868228, "loss": 7.5774, "step": 1238900 }, { "epoch": 5.047413709696309, "grad_norm": 8.81126880645752, "learning_rate": 0.0006256204838776503, "loss": 7.572, "step": 1239000 }, { "epoch": 5.047413709696309, "eval_MaskedAccuracy": 0.5119281307959642, "eval_loss": 1.5961500406265259, "eval_runtime": 155.0172, "eval_samples_per_second": 409.477, "eval_steps_per_second": 1.6, "step": 1239000 }, { "epoch": 5.047821087719691, "grad_norm": 11.096508979797363, "learning_rate": 0.0006252942281889956, "loss": 7.5435, "step": 1239100 }, { "epoch": 5.0482284657430725, "grad_norm": 11.315113067626953, "learning_rate": 0.0006249680468151646, "loss": 7.5503, "step": 1239200 }, { "epoch": 5.048635843766454, "grad_norm": 9.148561477661133, "learning_rate": 0.0006246419397690517, "loss": 7.5525, "step": 1239300 }, { "epoch": 5.049043221789836, "grad_norm": 5.415726661682129, "learning_rate": 0.0006243159070635498, "loss": 7.5513, "step": 1239400 }, { "epoch": 5.049450599813217, "grad_norm": 6.175321102142334, "learning_rate": 0.0006239899487115492, "loss": 7.5694, "step": 1239500 }, { "epoch": 5.049857977836599, "grad_norm": 3.0635392665863037, "learning_rate": 0.0006236640647259356, "loss": 7.5822, "step": 1239600 }, { "epoch": 5.05026535585998, "grad_norm": 6.307855129241943, "learning_rate": 0.0006233382551195924, "loss": 7.5385, "step": 1239700 }, { "epoch": 5.050672733883362, "grad_norm": 9.618474960327148, "learning_rate": 0.0006230125199054031, "loss": 7.5466, "step": 1239800 }, { "epoch": 5.051080111906743, "grad_norm": 4.653321266174316, "learning_rate": 0.0006226868590962443, "loss": 7.5483, "step": 1239900 }, { "epoch": 5.051487489930125, "grad_norm": 10.856022834777832, "learning_rate": 0.0006223612727049914, "loss": 7.5544, "step": 1240000 }, { "epoch": 5.051487489930125, "eval_MaskedAccuracy": 0.5123791608637853, "eval_loss": 1.594712734222412, "eval_runtime": 174.7706, "eval_samples_per_second": 363.196, "eval_steps_per_second": 1.419, "step": 1240000 }, { "epoch": 5.051894867953506, "grad_norm": 6.180064678192139, "learning_rate": 0.0006220357607445162, "loss": 7.5663, "step": 1240100 }, { "epoch": 5.052302245976888, "grad_norm": 21.003381729125977, "learning_rate": 0.0006217103232276886, "loss": 7.5411, "step": 1240200 }, { "epoch": 5.0527096240002685, "grad_norm": 9.569164276123047, "learning_rate": 0.0006213849601673735, "loss": 7.5279, "step": 1240300 }, { "epoch": 5.05311700202365, "grad_norm": 4.3968095779418945, "learning_rate": 0.0006210596715764357, "loss": 7.5777, "step": 1240400 }, { "epoch": 5.0535243800470315, "grad_norm": 17.27248191833496, "learning_rate": 0.0006207344574677353, "loss": 7.544, "step": 1240500 }, { "epoch": 5.053931758070413, "grad_norm": 3.057788848876953, "learning_rate": 0.0006204093178541299, "loss": 7.5487, "step": 1240600 }, { "epoch": 5.054339136093795, "grad_norm": 17.172550201416016, "learning_rate": 0.0006200842527484734, "loss": 7.5459, "step": 1240700 }, { "epoch": 5.054746514117176, "grad_norm": 16.662858963012695, "learning_rate": 0.0006197592621636174, "loss": 7.5822, "step": 1240800 }, { "epoch": 5.055153892140558, "grad_norm": 12.877495765686035, "learning_rate": 0.0006194343461124117, "loss": 7.5713, "step": 1240900 }, { "epoch": 5.055561270163939, "grad_norm": 4.151196002960205, "learning_rate": 0.0006191095046076997, "loss": 7.5532, "step": 1241000 }, { "epoch": 5.055561270163939, "eval_MaskedAccuracy": 0.512082622485966, "eval_loss": 1.5900789499282837, "eval_runtime": 195.1468, "eval_samples_per_second": 325.273, "eval_steps_per_second": 1.271, "step": 1241000 }, { "epoch": 5.055968648187321, "grad_norm": 6.3274760246276855, "learning_rate": 0.0006187847376623259, "loss": 7.5781, "step": 1241100 }, { "epoch": 5.056376026210702, "grad_norm": 4.799252033233643, "learning_rate": 0.0006184600452891298, "loss": 7.5625, "step": 1241200 }, { "epoch": 5.056783404234084, "grad_norm": 24.22987174987793, "learning_rate": 0.0006181354275009471, "loss": 7.5594, "step": 1241300 }, { "epoch": 5.057190782257465, "grad_norm": 10.769682884216309, "learning_rate": 0.0006178108843106137, "loss": 7.5522, "step": 1241400 }, { "epoch": 5.057598160280847, "grad_norm": 12.865377426147461, "learning_rate": 0.0006174864157309583, "loss": 7.5753, "step": 1241500 }, { "epoch": 5.0580055383042275, "grad_norm": 8.817621231079102, "learning_rate": 0.0006171620217748107, "loss": 7.5563, "step": 1241600 }, { "epoch": 5.058412916327609, "grad_norm": 12.362869262695312, "learning_rate": 0.0006168377024549951, "loss": 7.5794, "step": 1241700 }, { "epoch": 5.058820294350991, "grad_norm": 8.096556663513184, "learning_rate": 0.0006165134577843348, "loss": 7.5728, "step": 1241800 }, { "epoch": 5.059227672374372, "grad_norm": 9.449455261230469, "learning_rate": 0.0006161892877756477, "loss": 7.5657, "step": 1241900 }, { "epoch": 5.059635050397754, "grad_norm": 10.60107707977295, "learning_rate": 0.0006158651924417497, "loss": 7.5518, "step": 1242000 }, { "epoch": 5.059635050397754, "eval_MaskedAccuracy": 0.5116715651411367, "eval_loss": 1.5865345001220703, "eval_runtime": 156.4843, "eval_samples_per_second": 405.638, "eval_steps_per_second": 1.585, "step": 1242000 }, { "epoch": 5.060042428421135, "grad_norm": 17.698286056518555, "learning_rate": 0.000615541171795455, "loss": 7.5416, "step": 1242100 }, { "epoch": 5.060449806444517, "grad_norm": 5.095458984375, "learning_rate": 0.0006152172258495724, "loss": 7.5648, "step": 1242200 }, { "epoch": 5.060857184467898, "grad_norm": 3.6847846508026123, "learning_rate": 0.0006148933546169099, "loss": 7.5441, "step": 1242300 }, { "epoch": 5.06126456249128, "grad_norm": 13.084811210632324, "learning_rate": 0.0006145695581102728, "loss": 7.5808, "step": 1242400 }, { "epoch": 5.061671940514661, "grad_norm": 4.228727340698242, "learning_rate": 0.0006142458363424609, "loss": 7.5572, "step": 1242500 }, { "epoch": 5.062079318538043, "grad_norm": 16.473806381225586, "learning_rate": 0.000613922189326274, "loss": 7.5709, "step": 1242600 }, { "epoch": 5.062486696561424, "grad_norm": 13.398612022399902, "learning_rate": 0.000613598617074508, "loss": 7.5613, "step": 1242700 }, { "epoch": 5.062894074584805, "grad_norm": 5.265367031097412, "learning_rate": 0.0006132751195999541, "loss": 7.5626, "step": 1242800 }, { "epoch": 5.0633014526081865, "grad_norm": 6.05662727355957, "learning_rate": 0.0006129516969154025, "loss": 7.5746, "step": 1242900 }, { "epoch": 5.063708830631568, "grad_norm": 9.01705551147461, "learning_rate": 0.0006126283490336391, "loss": 7.5784, "step": 1243000 }, { "epoch": 5.063708830631568, "eval_MaskedAccuracy": 0.5116898757757985, "eval_loss": 1.5971190929412842, "eval_runtime": 153.5325, "eval_samples_per_second": 413.437, "eval_steps_per_second": 1.615, "step": 1243000 }, { "epoch": 5.06411620865495, "grad_norm": 8.705946922302246, "learning_rate": 0.0006123050759674489, "loss": 7.5422, "step": 1243100 }, { "epoch": 5.064523586678331, "grad_norm": 5.019017696380615, "learning_rate": 0.000611981877729611, "loss": 7.5754, "step": 1243200 }, { "epoch": 5.064930964701713, "grad_norm": 9.691917419433594, "learning_rate": 0.0006116587543329046, "loss": 7.58, "step": 1243300 }, { "epoch": 5.065338342725094, "grad_norm": 4.409297466278076, "learning_rate": 0.0006113357057901047, "loss": 7.5852, "step": 1243400 }, { "epoch": 5.065745720748476, "grad_norm": 5.675930976867676, "learning_rate": 0.0006110127321139818, "loss": 7.5649, "step": 1243500 }, { "epoch": 5.066153098771857, "grad_norm": 4.158321857452393, "learning_rate": 0.0006106898333173048, "loss": 7.5891, "step": 1243600 }, { "epoch": 5.066560476795239, "grad_norm": 11.193013191223145, "learning_rate": 0.0006103670094128405, "loss": 7.545, "step": 1243700 }, { "epoch": 5.06696785481862, "grad_norm": 13.805326461791992, "learning_rate": 0.0006100442604133508, "loss": 7.557, "step": 1243800 }, { "epoch": 5.067375232842002, "grad_norm": 14.540875434875488, "learning_rate": 0.0006097215863315973, "loss": 7.5654, "step": 1243900 }, { "epoch": 5.0677826108653825, "grad_norm": 11.002042770385742, "learning_rate": 0.000609398987180336, "loss": 7.5549, "step": 1244000 }, { "epoch": 5.0677826108653825, "eval_MaskedAccuracy": 0.5124008095192564, "eval_loss": 1.582103967666626, "eval_runtime": 160.788, "eval_samples_per_second": 394.781, "eval_steps_per_second": 1.542, "step": 1244000 }, { "epoch": 5.068189988888764, "grad_norm": 23.186279296875, "learning_rate": 0.0006090764629723212, "loss": 7.5456, "step": 1244100 }, { "epoch": 5.068597366912146, "grad_norm": 3.9477131366729736, "learning_rate": 0.0006087540137203041, "loss": 7.5322, "step": 1244200 }, { "epoch": 5.069004744935527, "grad_norm": 5.8432135581970215, "learning_rate": 0.0006084316394370321, "loss": 7.5411, "step": 1244300 }, { "epoch": 5.069412122958909, "grad_norm": 8.310843467712402, "learning_rate": 0.000608109340135251, "loss": 7.5811, "step": 1244400 }, { "epoch": 5.06981950098229, "grad_norm": 4.559779167175293, "learning_rate": 0.0006077871158277029, "loss": 7.5606, "step": 1244500 }, { "epoch": 5.070226879005672, "grad_norm": 5.4670796394348145, "learning_rate": 0.0006074649665271281, "loss": 7.5285, "step": 1244600 }, { "epoch": 5.070634257029053, "grad_norm": 4.705342769622803, "learning_rate": 0.0006071428922462611, "loss": 7.5644, "step": 1244700 }, { "epoch": 5.071041635052435, "grad_norm": 9.920463562011719, "learning_rate": 0.000606820892997836, "loss": 7.5439, "step": 1244800 }, { "epoch": 5.071449013075816, "grad_norm": 7.15165376663208, "learning_rate": 0.0006064989687945835, "loss": 7.5577, "step": 1244900 }, { "epoch": 5.071856391099198, "grad_norm": 5.455395698547363, "learning_rate": 0.0006061771196492301, "loss": 7.5656, "step": 1245000 }, { "epoch": 5.071856391099198, "eval_MaskedAccuracy": 0.5121866050921413, "eval_loss": 1.5907988548278809, "eval_runtime": 166.2802, "eval_samples_per_second": 381.741, "eval_steps_per_second": 1.491, "step": 1245000 }, { "epoch": 5.072263769122579, "grad_norm": 13.10548210144043, "learning_rate": 0.0006058553455745019, "loss": 7.5728, "step": 1245100 }, { "epoch": 5.072671147145961, "grad_norm": 3.50596022605896, "learning_rate": 0.0006055336465831184, "loss": 7.5844, "step": 1245200 }, { "epoch": 5.0730785251693415, "grad_norm": 11.356693267822266, "learning_rate": 0.0006052120226877984, "loss": 7.6003, "step": 1245300 }, { "epoch": 5.073485903192723, "grad_norm": 5.499568939208984, "learning_rate": 0.0006048904739012585, "loss": 7.5822, "step": 1245400 }, { "epoch": 5.073893281216105, "grad_norm": 12.636957168579102, "learning_rate": 0.0006045690002362105, "loss": 7.5753, "step": 1245500 }, { "epoch": 5.074300659239486, "grad_norm": 4.892049789428711, "learning_rate": 0.0006042476017053644, "loss": 7.5464, "step": 1245600 }, { "epoch": 5.074708037262868, "grad_norm": 13.68791675567627, "learning_rate": 0.0006039262783214258, "loss": 7.5666, "step": 1245700 }, { "epoch": 5.075115415286249, "grad_norm": 17.178817749023438, "learning_rate": 0.0006036050300971004, "loss": 7.5651, "step": 1245800 }, { "epoch": 5.075522793309631, "grad_norm": 8.208373069763184, "learning_rate": 0.0006032838570450868, "loss": 7.5787, "step": 1245900 }, { "epoch": 5.075930171333012, "grad_norm": 9.169124603271484, "learning_rate": 0.000602962759178084, "loss": 7.5959, "step": 1246000 }, { "epoch": 5.075930171333012, "eval_MaskedAccuracy": 0.5118789037473761, "eval_loss": 1.5950679779052734, "eval_runtime": 155.3492, "eval_samples_per_second": 408.602, "eval_steps_per_second": 1.596, "step": 1246000 }, { "epoch": 5.076337549356394, "grad_norm": 9.100870132446289, "learning_rate": 0.000602641736508786, "loss": 7.5746, "step": 1246100 }, { "epoch": 5.076744927379775, "grad_norm": 12.614434242248535, "learning_rate": 0.000602320789049884, "loss": 7.5438, "step": 1246200 }, { "epoch": 5.077152305403157, "grad_norm": 10.327921867370605, "learning_rate": 0.0006019999168140688, "loss": 7.5708, "step": 1246300 }, { "epoch": 5.077559683426538, "grad_norm": 11.155251502990723, "learning_rate": 0.0006016791198140243, "loss": 7.5773, "step": 1246400 }, { "epoch": 5.07796706144992, "grad_norm": 17.12721824645996, "learning_rate": 0.0006013583980624337, "loss": 7.5309, "step": 1246500 }, { "epoch": 5.078374439473301, "grad_norm": 13.454442024230957, "learning_rate": 0.0006010377515719766, "loss": 7.5724, "step": 1246600 }, { "epoch": 5.078781817496682, "grad_norm": 5.540965557098389, "learning_rate": 0.0006007171803553305, "loss": 7.5745, "step": 1246700 }, { "epoch": 5.079189195520064, "grad_norm": 3.1566126346588135, "learning_rate": 0.000600396684425169, "loss": 7.5758, "step": 1246800 }, { "epoch": 5.079596573543445, "grad_norm": 4.9269256591796875, "learning_rate": 0.000600076263794163, "loss": 7.5842, "step": 1246900 }, { "epoch": 5.080003951566827, "grad_norm": 4.288941383361816, "learning_rate": 0.0005997559184749813, "loss": 7.5766, "step": 1247000 }, { "epoch": 5.080003951566827, "eval_MaskedAccuracy": 0.511492453496384, "eval_loss": 1.5896881818771362, "eval_runtime": 178.6538, "eval_samples_per_second": 355.302, "eval_steps_per_second": 1.388, "step": 1247000 }, { "epoch": 5.080411329590208, "grad_norm": 4.120076656341553, "learning_rate": 0.0005994356484802875, "loss": 7.5974, "step": 1247100 }, { "epoch": 5.08081870761359, "grad_norm": 4.301754474639893, "learning_rate": 0.0005991154538227442, "loss": 7.5655, "step": 1247200 }, { "epoch": 5.081226085636971, "grad_norm": 5.2326340675354, "learning_rate": 0.000598795334515011, "loss": 7.5692, "step": 1247300 }, { "epoch": 5.081633463660353, "grad_norm": 4.03275728225708, "learning_rate": 0.0005984752905697425, "loss": 7.5522, "step": 1247400 }, { "epoch": 5.082040841683734, "grad_norm": 3.790022850036621, "learning_rate": 0.0005981553219995932, "loss": 7.5502, "step": 1247500 }, { "epoch": 5.082448219707116, "grad_norm": 10.635153770446777, "learning_rate": 0.0005978354288172125, "loss": 7.555, "step": 1247600 }, { "epoch": 5.082855597730497, "grad_norm": 20.811840057373047, "learning_rate": 0.0005975156110352463, "loss": 7.5726, "step": 1247700 }, { "epoch": 5.083262975753878, "grad_norm": 13.617968559265137, "learning_rate": 0.0005971958686663415, "loss": 7.5655, "step": 1247800 }, { "epoch": 5.08367035377726, "grad_norm": 13.507347106933594, "learning_rate": 0.0005968762017231368, "loss": 7.5567, "step": 1247900 }, { "epoch": 5.084077731800641, "grad_norm": 10.461236953735352, "learning_rate": 0.0005965566102182718, "loss": 7.5358, "step": 1248000 }, { "epoch": 5.084077731800641, "eval_MaskedAccuracy": 0.5126387374378459, "eval_loss": 1.5889209508895874, "eval_runtime": 167.4296, "eval_samples_per_second": 379.121, "eval_steps_per_second": 1.481, "step": 1248000 }, { "epoch": 5.084485109824023, "grad_norm": 4.239201068878174, "learning_rate": 0.0005962370941643803, "loss": 7.5566, "step": 1248100 }, { "epoch": 5.084892487847404, "grad_norm": 3.749708890914917, "learning_rate": 0.0005959176535740955, "loss": 7.5685, "step": 1248200 }, { "epoch": 5.085299865870786, "grad_norm": 5.5102009773254395, "learning_rate": 0.000595598288460047, "loss": 7.5543, "step": 1248300 }, { "epoch": 5.085707243894167, "grad_norm": 4.811374664306641, "learning_rate": 0.0005952789988348602, "loss": 7.5522, "step": 1248400 }, { "epoch": 5.086114621917549, "grad_norm": 4.167360305786133, "learning_rate": 0.0005949597847111571, "loss": 7.5536, "step": 1248500 }, { "epoch": 5.08652199994093, "grad_norm": 14.653417587280273, "learning_rate": 0.0005946406461015605, "loss": 7.5484, "step": 1248600 }, { "epoch": 5.086929377964312, "grad_norm": 11.3723783493042, "learning_rate": 0.0005943215830186857, "loss": 7.5298, "step": 1248700 }, { "epoch": 5.087336755987693, "grad_norm": 5.123376369476318, "learning_rate": 0.0005940025954751483, "loss": 7.5693, "step": 1248800 }, { "epoch": 5.087744134011075, "grad_norm": 7.393428802490234, "learning_rate": 0.0005936836834835579, "loss": 7.5546, "step": 1248900 }, { "epoch": 5.088151512034456, "grad_norm": 12.377934455871582, "learning_rate": 0.0005933648470565252, "loss": 7.5704, "step": 1249000 }, { "epoch": 5.088151512034456, "eval_MaskedAccuracy": 0.5124602522649733, "eval_loss": 1.5873277187347412, "eval_runtime": 161.0087, "eval_samples_per_second": 394.24, "eval_steps_per_second": 1.54, "step": 1249000 }, { "epoch": 5.088558890057837, "grad_norm": 5.779115676879883, "learning_rate": 0.0005930460862066542, "loss": 7.5507, "step": 1249100 }, { "epoch": 5.088966268081219, "grad_norm": 3.380244255065918, "learning_rate": 0.0005927274009465464, "loss": 7.555, "step": 1249200 }, { "epoch": 5.0893736461046, "grad_norm": 9.080333709716797, "learning_rate": 0.0005924087912888035, "loss": 7.5308, "step": 1249300 }, { "epoch": 5.089781024127982, "grad_norm": 10.474891662597656, "learning_rate": 0.0005920902572460193, "loss": 7.5426, "step": 1249400 }, { "epoch": 5.090188402151363, "grad_norm": 14.309752464294434, "learning_rate": 0.000591771798830788, "loss": 7.5422, "step": 1249500 }, { "epoch": 5.090595780174745, "grad_norm": 4.935224533081055, "learning_rate": 0.000591453416055701, "loss": 7.579, "step": 1249600 }, { "epoch": 5.091003158198126, "grad_norm": 7.429482936859131, "learning_rate": 0.0005911351089333443, "loss": 7.5559, "step": 1249700 }, { "epoch": 5.091410536221508, "grad_norm": 24.867961883544922, "learning_rate": 0.0005908168774763031, "loss": 7.5151, "step": 1249800 }, { "epoch": 5.091817914244889, "grad_norm": 10.574111938476562, "learning_rate": 0.0005904987216971586, "loss": 7.5641, "step": 1249900 }, { "epoch": 5.092225292268271, "grad_norm": 3.2641782760620117, "learning_rate": 0.0005901806416084893, "loss": 7.5581, "step": 1250000 }, { "epoch": 5.092225292268271, "eval_MaskedAccuracy": 0.5119926108642491, "eval_loss": 1.5946589708328247, "eval_runtime": 156.1855, "eval_samples_per_second": 406.414, "eval_steps_per_second": 1.588, "step": 1250000 }, { "epoch": 5.0926326702916525, "grad_norm": 7.2194671630859375, "learning_rate": 0.0005898626372228697, "loss": 7.5324, "step": 1250100 }, { "epoch": 5.093040048315034, "grad_norm": 3.0554192066192627, "learning_rate": 0.000589544708552873, "loss": 7.5478, "step": 1250200 }, { "epoch": 5.093447426338415, "grad_norm": 8.473506927490234, "learning_rate": 0.0005892268556110687, "loss": 7.5416, "step": 1250300 }, { "epoch": 5.093854804361796, "grad_norm": 3.3607919216156006, "learning_rate": 0.0005889090784100231, "loss": 7.5222, "step": 1250400 }, { "epoch": 5.094262182385178, "grad_norm": 6.100803852081299, "learning_rate": 0.0005885913769622999, "loss": 7.5557, "step": 1250500 }, { "epoch": 5.094669560408559, "grad_norm": 15.029216766357422, "learning_rate": 0.0005882737512804591, "loss": 7.5462, "step": 1250600 }, { "epoch": 5.095076938431941, "grad_norm": 3.2186503410339355, "learning_rate": 0.0005879562013770584, "loss": 7.5586, "step": 1250700 }, { "epoch": 5.095484316455322, "grad_norm": 4.313271999359131, "learning_rate": 0.0005876387272646515, "loss": 7.5492, "step": 1250800 }, { "epoch": 5.095891694478704, "grad_norm": 6.743495941162109, "learning_rate": 0.0005873213289557906, "loss": 7.5544, "step": 1250900 }, { "epoch": 5.096299072502085, "grad_norm": 5.610095977783203, "learning_rate": 0.0005870040064630246, "loss": 7.5321, "step": 1251000 }, { "epoch": 5.096299072502085, "eval_MaskedAccuracy": 0.5120883219105775, "eval_loss": 1.5959781408309937, "eval_runtime": 155.8264, "eval_samples_per_second": 407.351, "eval_steps_per_second": 1.592, "step": 1251000 }, { "epoch": 5.096706450525467, "grad_norm": 5.0802001953125, "learning_rate": 0.0005866867597988978, "loss": 7.5512, "step": 1251100 }, { "epoch": 5.097113828548848, "grad_norm": 5.563141345977783, "learning_rate": 0.0005863695889759535, "loss": 7.5453, "step": 1251200 }, { "epoch": 5.09752120657223, "grad_norm": 15.891023635864258, "learning_rate": 0.0005860524940067311, "loss": 7.5568, "step": 1251300 }, { "epoch": 5.0979285845956115, "grad_norm": 21.714107513427734, "learning_rate": 0.0005857354749037669, "loss": 7.5554, "step": 1251400 }, { "epoch": 5.098335962618993, "grad_norm": 12.382031440734863, "learning_rate": 0.0005854185316795939, "loss": 7.567, "step": 1251500 }, { "epoch": 5.098743340642374, "grad_norm": 8.557860374450684, "learning_rate": 0.0005851016643467436, "loss": 7.561, "step": 1251600 }, { "epoch": 5.099150718665755, "grad_norm": 8.319543838500977, "learning_rate": 0.0005847848729177425, "loss": 7.554, "step": 1251700 }, { "epoch": 5.099558096689137, "grad_norm": 4.27390718460083, "learning_rate": 0.0005844681574051157, "loss": 7.5624, "step": 1251800 }, { "epoch": 5.099965474712518, "grad_norm": 11.300516128540039, "learning_rate": 0.000584151517821384, "loss": 7.5306, "step": 1251900 }, { "epoch": 5.1003728527359, "grad_norm": 4.060203552246094, "learning_rate": 0.0005838349541790667, "loss": 7.5262, "step": 1252000 }, { "epoch": 5.1003728527359, "eval_MaskedAccuracy": 0.5128182618811351, "eval_loss": 1.5957520008087158, "eval_runtime": 156.8332, "eval_samples_per_second": 404.736, "eval_steps_per_second": 1.581, "step": 1252000 }, { "epoch": 5.100780230759281, "grad_norm": 3.796227216720581, "learning_rate": 0.0005835184664906784, "loss": 7.5764, "step": 1252100 }, { "epoch": 5.101187608782663, "grad_norm": 19.47611427307129, "learning_rate": 0.0005832020547687317, "loss": 7.5604, "step": 1252200 }, { "epoch": 5.101594986806044, "grad_norm": 20.198137283325195, "learning_rate": 0.0005828857190257373, "loss": 7.5396, "step": 1252300 }, { "epoch": 5.102002364829426, "grad_norm": 7.291891574859619, "learning_rate": 0.0005825694592741993, "loss": 7.5283, "step": 1252400 }, { "epoch": 5.1024097428528075, "grad_norm": 3.155221939086914, "learning_rate": 0.0005822532755266229, "loss": 7.5515, "step": 1252500 }, { "epoch": 5.102817120876189, "grad_norm": 13.736040115356445, "learning_rate": 0.0005819371677955081, "loss": 7.5659, "step": 1252600 }, { "epoch": 5.1032244988995705, "grad_norm": 6.547802925109863, "learning_rate": 0.0005816211360933521, "loss": 7.5592, "step": 1252700 }, { "epoch": 5.103631876922951, "grad_norm": 3.539252281188965, "learning_rate": 0.0005813051804326502, "loss": 7.5672, "step": 1252800 }, { "epoch": 5.104039254946333, "grad_norm": 6.332305908203125, "learning_rate": 0.0005809893008258931, "loss": 7.5759, "step": 1252900 }, { "epoch": 5.104446632969714, "grad_norm": 18.98788070678711, "learning_rate": 0.0005806734972855694, "loss": 7.5309, "step": 1253000 }, { "epoch": 5.104446632969714, "eval_MaskedAccuracy": 0.5122840903033515, "eval_loss": 1.603977084159851, "eval_runtime": 198.8013, "eval_samples_per_second": 319.294, "eval_steps_per_second": 1.247, "step": 1253000 }, { "epoch": 5.104854010993096, "grad_norm": 4.694102764129639, "learning_rate": 0.0005803577698241637, "loss": 7.5488, "step": 1253100 }, { "epoch": 5.105261389016477, "grad_norm": 4.3892974853515625, "learning_rate": 0.000580042118454159, "loss": 7.5326, "step": 1253200 }, { "epoch": 5.105668767039859, "grad_norm": 7.1691670417785645, "learning_rate": 0.0005797265431880349, "loss": 7.5478, "step": 1253300 }, { "epoch": 5.10607614506324, "grad_norm": 6.850220680236816, "learning_rate": 0.0005794110440382678, "loss": 7.5936, "step": 1253400 }, { "epoch": 5.106483523086622, "grad_norm": 8.848091125488281, "learning_rate": 0.0005790956210173313, "loss": 7.5667, "step": 1253500 }, { "epoch": 5.106890901110003, "grad_norm": 4.3532233238220215, "learning_rate": 0.0005787802741376956, "loss": 7.5526, "step": 1253600 }, { "epoch": 5.107298279133385, "grad_norm": 10.647968292236328, "learning_rate": 0.0005784650034118275, "loss": 7.5571, "step": 1253700 }, { "epoch": 5.1077056571567665, "grad_norm": 9.989473342895508, "learning_rate": 0.000578149808852191, "loss": 7.5255, "step": 1253800 }, { "epoch": 5.108113035180148, "grad_norm": 10.404735565185547, "learning_rate": 0.0005778346904712488, "loss": 7.5445, "step": 1253900 }, { "epoch": 5.108520413203529, "grad_norm": 15.024141311645508, "learning_rate": 0.0005775196482814576, "loss": 7.5367, "step": 1254000 }, { "epoch": 5.108520413203529, "eval_MaskedAccuracy": 0.5125342825745884, "eval_loss": 1.58397376537323, "eval_runtime": 152.9923, "eval_samples_per_second": 414.897, "eval_steps_per_second": 1.621, "step": 1254000 }, { "epoch": 5.10892779122691, "grad_norm": 14.381766319274902, "learning_rate": 0.0005772046822952739, "loss": 7.5819, "step": 1254100 }, { "epoch": 5.109335169250292, "grad_norm": 10.739044189453125, "learning_rate": 0.000576889792525148, "loss": 7.554, "step": 1254200 }, { "epoch": 5.109742547273673, "grad_norm": 10.537728309631348, "learning_rate": 0.0005765749789835326, "loss": 7.5674, "step": 1254300 }, { "epoch": 5.110149925297055, "grad_norm": 4.679098606109619, "learning_rate": 0.000576260241682872, "loss": 7.5419, "step": 1254400 }, { "epoch": 5.110557303320436, "grad_norm": 3.553837299346924, "learning_rate": 0.0005759455806356104, "loss": 7.5387, "step": 1254500 }, { "epoch": 5.110964681343818, "grad_norm": 13.702073097229004, "learning_rate": 0.0005756309958541866, "loss": 7.5506, "step": 1254600 }, { "epoch": 5.111372059367199, "grad_norm": 5.067311763763428, "learning_rate": 0.0005753164873510391, "loss": 7.5482, "step": 1254700 }, { "epoch": 5.111779437390581, "grad_norm": 6.317410469055176, "learning_rate": 0.0005750020551386011, "loss": 7.5415, "step": 1254800 }, { "epoch": 5.1121868154139625, "grad_norm": 8.441106796264648, "learning_rate": 0.0005746876992293042, "loss": 7.5387, "step": 1254900 }, { "epoch": 5.112594193437344, "grad_norm": 4.294591903686523, "learning_rate": 0.0005743734196355764, "loss": 7.5423, "step": 1255000 }, { "epoch": 5.112594193437344, "eval_MaskedAccuracy": 0.5121129902226159, "eval_loss": 1.5859158039093018, "eval_runtime": 166.2729, "eval_samples_per_second": 381.758, "eval_steps_per_second": 1.492, "step": 1255000 }, { "epoch": 5.1130015714607255, "grad_norm": 6.235103607177734, "learning_rate": 0.0005740592163698434, "loss": 7.528, "step": 1255100 }, { "epoch": 5.113408949484107, "grad_norm": 4.567471027374268, "learning_rate": 0.0005737450894445267, "loss": 7.5364, "step": 1255200 }, { "epoch": 5.113816327507488, "grad_norm": 6.386045932769775, "learning_rate": 0.0005734310388720463, "loss": 7.5939, "step": 1255300 }, { "epoch": 5.114223705530869, "grad_norm": 5.731184482574463, "learning_rate": 0.000573117064664818, "loss": 7.5333, "step": 1255400 }, { "epoch": 5.114631083554251, "grad_norm": 3.82441782951355, "learning_rate": 0.0005728031668352545, "loss": 7.5742, "step": 1255500 }, { "epoch": 5.115038461577632, "grad_norm": 4.16675329208374, "learning_rate": 0.000572489345395765, "loss": 7.5424, "step": 1255600 }, { "epoch": 5.115445839601014, "grad_norm": 10.363410949707031, "learning_rate": 0.0005721756003587576, "loss": 7.5379, "step": 1255700 }, { "epoch": 5.115853217624395, "grad_norm": 13.491689682006836, "learning_rate": 0.0005718619317366372, "loss": 7.5668, "step": 1255800 }, { "epoch": 5.116260595647777, "grad_norm": 21.5654239654541, "learning_rate": 0.0005715483395418032, "loss": 7.5626, "step": 1255900 }, { "epoch": 5.116667973671158, "grad_norm": 14.647242546081543, "learning_rate": 0.0005712348237866539, "loss": 7.5572, "step": 1256000 }, { "epoch": 5.116667973671158, "eval_MaskedAccuracy": 0.5119153561224999, "eval_loss": 1.5987579822540283, "eval_runtime": 167.6983, "eval_samples_per_second": 378.513, "eval_steps_per_second": 1.479, "step": 1256000 }, { "epoch": 5.11707535169454, "grad_norm": 5.562261581420898, "learning_rate": 0.000570921384483585, "loss": 7.5398, "step": 1256100 }, { "epoch": 5.1174827297179215, "grad_norm": 5.86845064163208, "learning_rate": 0.0005706080216449879, "loss": 7.562, "step": 1256200 }, { "epoch": 5.117890107741303, "grad_norm": 5.843132495880127, "learning_rate": 0.0005702947352832513, "loss": 7.5208, "step": 1256300 }, { "epoch": 5.118297485764685, "grad_norm": 9.1304931640625, "learning_rate": 0.0005699815254107612, "loss": 7.5627, "step": 1256400 }, { "epoch": 5.118704863788066, "grad_norm": 9.829513549804688, "learning_rate": 0.0005696683920399006, "loss": 7.5289, "step": 1256500 }, { "epoch": 5.119112241811447, "grad_norm": 3.751936435699463, "learning_rate": 0.0005693553351830492, "loss": 7.5478, "step": 1256600 }, { "epoch": 5.119519619834828, "grad_norm": 20.435016632080078, "learning_rate": 0.0005690423548525833, "loss": 7.5411, "step": 1256700 }, { "epoch": 5.11992699785821, "grad_norm": 5.691356658935547, "learning_rate": 0.0005687294510608776, "loss": 7.5367, "step": 1256800 }, { "epoch": 5.120334375881591, "grad_norm": 6.463367938995361, "learning_rate": 0.0005684166238203031, "loss": 7.5915, "step": 1256900 }, { "epoch": 5.120741753904973, "grad_norm": 4.4170379638671875, "learning_rate": 0.0005681038731432271, "loss": 7.536, "step": 1257000 }, { "epoch": 5.120741753904973, "eval_MaskedAccuracy": 0.5127271519708295, "eval_loss": 1.5867799520492554, "eval_runtime": 169.8144, "eval_samples_per_second": 373.796, "eval_steps_per_second": 1.46, "step": 1257000 }, { "epoch": 5.121149131928354, "grad_norm": 3.14304518699646, "learning_rate": 0.0005677911990420141, "loss": 7.5447, "step": 1257100 }, { "epoch": 5.121556509951736, "grad_norm": 11.655447959899902, "learning_rate": 0.0005674786015290261, "loss": 7.5494, "step": 1257200 }, { "epoch": 5.1219638879751175, "grad_norm": 5.352982044219971, "learning_rate": 0.000567166080616622, "loss": 7.5544, "step": 1257300 }, { "epoch": 5.122371265998499, "grad_norm": 12.675518035888672, "learning_rate": 0.0005668536363171567, "loss": 7.5741, "step": 1257400 }, { "epoch": 5.1227786440218805, "grad_norm": 11.002009391784668, "learning_rate": 0.0005665412686429829, "loss": 7.5242, "step": 1257500 }, { "epoch": 5.123186022045262, "grad_norm": 4.125703811645508, "learning_rate": 0.0005662289776064508, "loss": 7.5405, "step": 1257600 }, { "epoch": 5.123593400068644, "grad_norm": 5.825300693511963, "learning_rate": 0.000565916763219906, "loss": 7.5437, "step": 1257700 }, { "epoch": 5.124000778092024, "grad_norm": 18.037317276000977, "learning_rate": 0.0005656046254956921, "loss": 7.5688, "step": 1257800 }, { "epoch": 5.124408156115406, "grad_norm": 5.741120338439941, "learning_rate": 0.0005652925644461503, "loss": 7.5351, "step": 1257900 }, { "epoch": 5.124815534138787, "grad_norm": 3.35128116607666, "learning_rate": 0.0005649805800836175, "loss": 7.5357, "step": 1258000 }, { "epoch": 5.124815534138787, "eval_MaskedAccuracy": 0.5120024325681323, "eval_loss": 1.593034029006958, "eval_runtime": 180.2955, "eval_samples_per_second": 352.066, "eval_steps_per_second": 1.376, "step": 1258000 }, { "epoch": 5.125222912162169, "grad_norm": 12.68496322631836, "learning_rate": 0.0005646686724204281, "loss": 7.5331, "step": 1258100 }, { "epoch": 5.12563029018555, "grad_norm": 3.733407974243164, "learning_rate": 0.0005643568414689143, "loss": 7.56, "step": 1258200 }, { "epoch": 5.126037668208932, "grad_norm": 8.2514009475708, "learning_rate": 0.0005640450872414042, "loss": 7.5143, "step": 1258300 }, { "epoch": 5.1264450462323135, "grad_norm": 12.680123329162598, "learning_rate": 0.0005637334097502226, "loss": 7.5809, "step": 1258400 }, { "epoch": 5.126852424255695, "grad_norm": 7.681098461151123, "learning_rate": 0.0005634218090076914, "loss": 7.5516, "step": 1258500 }, { "epoch": 5.1272598022790765, "grad_norm": 8.842367172241211, "learning_rate": 0.0005631102850261314, "loss": 7.5352, "step": 1258600 }, { "epoch": 5.127667180302458, "grad_norm": 12.64952278137207, "learning_rate": 0.0005627988378178572, "loss": 7.5737, "step": 1258700 }, { "epoch": 5.12807455832584, "grad_norm": 7.7794718742370605, "learning_rate": 0.0005624874673951821, "loss": 7.5327, "step": 1258800 }, { "epoch": 5.128481936349221, "grad_norm": 8.353389739990234, "learning_rate": 0.0005621761737704163, "loss": 7.5702, "step": 1258900 }, { "epoch": 5.128889314372602, "grad_norm": 7.190156936645508, "learning_rate": 0.0005618649569558682, "loss": 7.5149, "step": 1259000 }, { "epoch": 5.128889314372602, "eval_MaskedAccuracy": 0.5123441037784241, "eval_loss": 1.5870732069015503, "eval_runtime": 165.3347, "eval_samples_per_second": 383.924, "eval_steps_per_second": 1.5, "step": 1259000 }, { "epoch": 5.129296692395983, "grad_norm": 4.980506420135498, "learning_rate": 0.0005615538169638411, "loss": 7.5461, "step": 1259100 }, { "epoch": 5.129704070419365, "grad_norm": 3.817624807357788, "learning_rate": 0.0005612427538066355, "loss": 7.5517, "step": 1259200 }, { "epoch": 5.130111448442746, "grad_norm": 14.45424747467041, "learning_rate": 0.0005609317674965501, "loss": 7.5436, "step": 1259300 }, { "epoch": 5.130518826466128, "grad_norm": 13.708019256591797, "learning_rate": 0.0005606208580458796, "loss": 7.5498, "step": 1259400 }, { "epoch": 5.130926204489509, "grad_norm": 8.454963684082031, "learning_rate": 0.0005603100254669155, "loss": 7.5532, "step": 1259500 }, { "epoch": 5.131333582512891, "grad_norm": 8.445759773254395, "learning_rate": 0.0005599992697719477, "loss": 7.5479, "step": 1259600 }, { "epoch": 5.1317409605362725, "grad_norm": 6.335297107696533, "learning_rate": 0.00055968859097326, "loss": 7.559, "step": 1259700 }, { "epoch": 5.132148338559654, "grad_norm": 6.836855411529541, "learning_rate": 0.0005593779890831371, "loss": 7.5776, "step": 1259800 }, { "epoch": 5.1325557165830356, "grad_norm": 5.868393421173096, "learning_rate": 0.0005590674641138585, "loss": 7.5654, "step": 1259900 }, { "epoch": 5.132963094606417, "grad_norm": 9.056082725524902, "learning_rate": 0.0005587570160777005, "loss": 7.5385, "step": 1260000 }, { "epoch": 5.132963094606417, "eval_MaskedAccuracy": 0.5123997891684868, "eval_loss": 1.595545768737793, "eval_runtime": 178.9284, "eval_samples_per_second": 354.756, "eval_steps_per_second": 1.386, "step": 1260000 }, { "epoch": 5.133370472629799, "grad_norm": 5.885403633117676, "learning_rate": 0.0005584466449869369, "loss": 7.5392, "step": 1260100 }, { "epoch": 5.13377785065318, "grad_norm": 4.102877616882324, "learning_rate": 0.0005581363508538385, "loss": 7.5559, "step": 1260200 }, { "epoch": 5.134185228676561, "grad_norm": 5.089933395385742, "learning_rate": 0.000557826133690672, "loss": 7.5677, "step": 1260300 }, { "epoch": 5.134592606699942, "grad_norm": 15.710661888122559, "learning_rate": 0.000557515993509703, "loss": 7.5531, "step": 1260400 }, { "epoch": 5.134999984723324, "grad_norm": 4.206904411315918, "learning_rate": 0.0005572059303231927, "loss": 7.5491, "step": 1260500 }, { "epoch": 5.135407362746705, "grad_norm": 12.060872077941895, "learning_rate": 0.0005568959441433999, "loss": 7.5683, "step": 1260600 }, { "epoch": 5.135814740770087, "grad_norm": 2.5560874938964844, "learning_rate": 0.00055658603498258, "loss": 7.5888, "step": 1260700 }, { "epoch": 5.1362221187934685, "grad_norm": 7.485671043395996, "learning_rate": 0.000556276202852984, "loss": 7.5713, "step": 1260800 }, { "epoch": 5.13662949681685, "grad_norm": 18.603740692138672, "learning_rate": 0.0005559664477668626, "loss": 7.5626, "step": 1260900 }, { "epoch": 5.1370368748402315, "grad_norm": 3.6558775901794434, "learning_rate": 0.0005556567697364616, "loss": 7.5661, "step": 1261000 }, { "epoch": 5.1370368748402315, "eval_MaskedAccuracy": 0.512043810548891, "eval_loss": 1.5838418006896973, "eval_runtime": 180.6622, "eval_samples_per_second": 351.352, "eval_steps_per_second": 1.373, "step": 1261000 }, { "epoch": 5.137444252863613, "grad_norm": 6.576967239379883, "learning_rate": 0.0005553471687740244, "loss": 7.5592, "step": 1261100 }, { "epoch": 5.137851630886995, "grad_norm": 5.285487174987793, "learning_rate": 0.0005550376448917914, "loss": 7.5759, "step": 1261200 }, { "epoch": 5.138259008910376, "grad_norm": 6.252453327178955, "learning_rate": 0.0005547281981019993, "loss": 7.5457, "step": 1261300 }, { "epoch": 5.138666386933758, "grad_norm": 5.763672351837158, "learning_rate": 0.0005544188284168824, "loss": 7.5586, "step": 1261400 }, { "epoch": 5.139073764957139, "grad_norm": 13.533401489257812, "learning_rate": 0.0005541095358486723, "loss": 7.5717, "step": 1261500 }, { "epoch": 5.13948114298052, "grad_norm": 4.017519474029541, "learning_rate": 0.0005538003204095959, "loss": 7.5298, "step": 1261600 }, { "epoch": 5.139888521003901, "grad_norm": 17.37210464477539, "learning_rate": 0.0005534911821118786, "loss": 7.5192, "step": 1261700 }, { "epoch": 5.140295899027283, "grad_norm": 17.076250076293945, "learning_rate": 0.0005531821209677424, "loss": 7.5374, "step": 1261800 }, { "epoch": 5.140703277050664, "grad_norm": 12.65649127960205, "learning_rate": 0.000552873136989405, "loss": 7.5443, "step": 1261900 }, { "epoch": 5.141110655074046, "grad_norm": 4.699617385864258, "learning_rate": 0.0005525642301890837, "loss": 7.5673, "step": 1262000 }, { "epoch": 5.141110655074046, "eval_MaskedAccuracy": 0.5128783282627524, "eval_loss": 1.5865117311477661, "eval_runtime": 153.5447, "eval_samples_per_second": 413.404, "eval_steps_per_second": 1.615, "step": 1262000 }, { "epoch": 5.1415180330974275, "grad_norm": 17.365907669067383, "learning_rate": 0.0005522554005789904, "loss": 7.5619, "step": 1262100 }, { "epoch": 5.141925411120809, "grad_norm": 8.379057884216309, "learning_rate": 0.0005519466481713351, "loss": 7.5665, "step": 1262200 }, { "epoch": 5.142332789144191, "grad_norm": 14.656353950500488, "learning_rate": 0.0005516379729783243, "loss": 7.5485, "step": 1262300 }, { "epoch": 5.142740167167572, "grad_norm": 9.875476837158203, "learning_rate": 0.0005513293750121628, "loss": 7.5678, "step": 1262400 }, { "epoch": 5.143147545190954, "grad_norm": 2.759429454803467, "learning_rate": 0.0005510208542850499, "loss": 7.545, "step": 1262500 }, { "epoch": 5.143554923214335, "grad_norm": 4.63169002532959, "learning_rate": 0.0005507124108091826, "loss": 7.5508, "step": 1262600 }, { "epoch": 5.143962301237717, "grad_norm": 4.982854843139648, "learning_rate": 0.0005504040445967567, "loss": 7.56, "step": 1262700 }, { "epoch": 5.144369679261097, "grad_norm": 11.580699920654297, "learning_rate": 0.0005500957556599621, "loss": 7.5589, "step": 1262800 }, { "epoch": 5.144777057284479, "grad_norm": 9.614246368408203, "learning_rate": 0.0005497875440109886, "loss": 7.5468, "step": 1262900 }, { "epoch": 5.14518443530786, "grad_norm": 8.304036140441895, "learning_rate": 0.0005494794096620203, "loss": 7.5644, "step": 1263000 }, { "epoch": 5.14518443530786, "eval_MaskedAccuracy": 0.5125698403114065, "eval_loss": 1.5937505960464478, "eval_runtime": 166.4638, "eval_samples_per_second": 381.32, "eval_steps_per_second": 1.49, "step": 1263000 }, { "epoch": 5.145591813331242, "grad_norm": 5.4799113273620605, "learning_rate": 0.0005491713526252387, "loss": 7.5838, "step": 1263100 }, { "epoch": 5.1459991913546235, "grad_norm": 5.485556125640869, "learning_rate": 0.0005488633729128249, "loss": 7.5746, "step": 1263200 }, { "epoch": 5.146406569378005, "grad_norm": 15.55025863647461, "learning_rate": 0.000548555470536954, "loss": 7.5516, "step": 1263300 }, { "epoch": 5.1468139474013865, "grad_norm": 10.720710754394531, "learning_rate": 0.0005482476455097989, "loss": 7.5385, "step": 1263400 }, { "epoch": 5.147221325424768, "grad_norm": 9.393437385559082, "learning_rate": 0.0005479398978435298, "loss": 7.577, "step": 1263500 }, { "epoch": 5.14762870344815, "grad_norm": 9.32558822631836, "learning_rate": 0.0005476322275503147, "loss": 7.5602, "step": 1263600 }, { "epoch": 5.148036081471531, "grad_norm": 4.0196309089660645, "learning_rate": 0.0005473246346423159, "loss": 7.5521, "step": 1263700 }, { "epoch": 5.148443459494913, "grad_norm": 4.326145172119141, "learning_rate": 0.0005470171191316944, "loss": 7.5632, "step": 1263800 }, { "epoch": 5.148850837518294, "grad_norm": 6.749875068664551, "learning_rate": 0.0005467096810306082, "loss": 7.5494, "step": 1263900 }, { "epoch": 5.149258215541675, "grad_norm": 8.418333053588867, "learning_rate": 0.0005464023203512122, "loss": 7.5677, "step": 1264000 }, { "epoch": 5.149258215541675, "eval_MaskedAccuracy": 0.5127325140598666, "eval_loss": 1.5884439945220947, "eval_runtime": 221.1288, "eval_samples_per_second": 287.054, "eval_steps_per_second": 1.122, "step": 1264000 }, { "epoch": 5.149665593565056, "grad_norm": 5.534397125244141, "learning_rate": 0.0005460950371056575, "loss": 7.5374, "step": 1264100 }, { "epoch": 5.150072971588438, "grad_norm": 15.326800346374512, "learning_rate": 0.0005457878313060935, "loss": 7.5759, "step": 1264200 }, { "epoch": 5.150480349611819, "grad_norm": 11.39289665222168, "learning_rate": 0.0005454807029646647, "loss": 7.5524, "step": 1264300 }, { "epoch": 5.150887727635201, "grad_norm": 3.969057083129883, "learning_rate": 0.0005451736520935137, "loss": 7.542, "step": 1264400 }, { "epoch": 5.1512951056585825, "grad_norm": 15.005791664123535, "learning_rate": 0.0005448666787047811, "loss": 7.576, "step": 1264500 }, { "epoch": 5.151702483681964, "grad_norm": 14.393976211547852, "learning_rate": 0.0005445597828106016, "loss": 7.5784, "step": 1264600 }, { "epoch": 5.152109861705346, "grad_norm": 4.292541980743408, "learning_rate": 0.0005442529644231091, "loss": 7.5579, "step": 1264700 }, { "epoch": 5.152517239728727, "grad_norm": 4.117619037628174, "learning_rate": 0.0005439462235544333, "loss": 7.574, "step": 1264800 }, { "epoch": 5.152924617752109, "grad_norm": 5.055161476135254, "learning_rate": 0.0005436395602167026, "loss": 7.5709, "step": 1264900 }, { "epoch": 5.15333199577549, "grad_norm": 3.9017088413238525, "learning_rate": 0.0005433329744220398, "loss": 7.5536, "step": 1265000 }, { "epoch": 5.15333199577549, "eval_MaskedAccuracy": 0.5119708914343247, "eval_loss": 1.5935159921646118, "eval_runtime": 239.3234, "eval_samples_per_second": 265.231, "eval_steps_per_second": 1.036, "step": 1265000 }, { "epoch": 5.153739373798872, "grad_norm": 5.433421611785889, "learning_rate": 0.0005430264661825662, "loss": 7.5654, "step": 1265100 }, { "epoch": 5.154146751822253, "grad_norm": 10.552696228027344, "learning_rate": 0.0005427200355103999, "loss": 7.5633, "step": 1265200 }, { "epoch": 5.154554129845634, "grad_norm": 5.230506896972656, "learning_rate": 0.0005424136824176553, "loss": 7.5879, "step": 1265300 }, { "epoch": 5.154961507869015, "grad_norm": 5.859530448913574, "learning_rate": 0.0005421074069164449, "loss": 7.5353, "step": 1265400 }, { "epoch": 5.155368885892397, "grad_norm": 3.4236106872558594, "learning_rate": 0.0005418012090188764, "loss": 7.5419, "step": 1265500 }, { "epoch": 5.1557762639157785, "grad_norm": 11.217131614685059, "learning_rate": 0.0005414950887370554, "loss": 7.5294, "step": 1265600 }, { "epoch": 5.15618364193916, "grad_norm": 6.577310085296631, "learning_rate": 0.0005411890460830863, "loss": 7.5612, "step": 1265700 }, { "epoch": 5.1565910199625415, "grad_norm": 6.010001182556152, "learning_rate": 0.0005408830810690677, "loss": 7.576, "step": 1265800 }, { "epoch": 5.156998397985923, "grad_norm": 9.843003273010254, "learning_rate": 0.0005405771937070946, "loss": 7.5373, "step": 1265900 }, { "epoch": 5.157405776009305, "grad_norm": 5.558066368103027, "learning_rate": 0.0005402713840092617, "loss": 7.5682, "step": 1266000 }, { "epoch": 5.157405776009305, "eval_MaskedAccuracy": 0.5126519099999005, "eval_loss": 1.585870623588562, "eval_runtime": 176.206, "eval_samples_per_second": 360.237, "eval_steps_per_second": 1.407, "step": 1266000 }, { "epoch": 5.157813154032686, "grad_norm": 12.909948348999023, "learning_rate": 0.0005399656519876595, "loss": 7.5489, "step": 1266100 }, { "epoch": 5.158220532056068, "grad_norm": 16.241851806640625, "learning_rate": 0.000539659997654375, "loss": 7.5387, "step": 1266200 }, { "epoch": 5.158627910079449, "grad_norm": 5.339664459228516, "learning_rate": 0.0005393544210214921, "loss": 7.5225, "step": 1266300 }, { "epoch": 5.159035288102831, "grad_norm": 17.13751792907715, "learning_rate": 0.0005390489221010918, "loss": 7.5019, "step": 1266400 }, { "epoch": 5.159442666126212, "grad_norm": 4.725861549377441, "learning_rate": 0.0005387435009052526, "loss": 7.5521, "step": 1266500 }, { "epoch": 5.159850044149593, "grad_norm": 5.74859094619751, "learning_rate": 0.0005384381574460492, "loss": 7.563, "step": 1266600 }, { "epoch": 5.1602574221729745, "grad_norm": 15.772001266479492, "learning_rate": 0.0005381328917355535, "loss": 7.5229, "step": 1266700 }, { "epoch": 5.160664800196356, "grad_norm": 9.625349998474121, "learning_rate": 0.000537827703785833, "loss": 7.5489, "step": 1266800 }, { "epoch": 5.1610721782197375, "grad_norm": 5.832103729248047, "learning_rate": 0.0005375225936089551, "loss": 7.5486, "step": 1266900 }, { "epoch": 5.161479556243119, "grad_norm": 3.081801652908325, "learning_rate": 0.0005372175612169822, "loss": 7.5265, "step": 1267000 }, { "epoch": 5.161479556243119, "eval_MaskedAccuracy": 0.5128539098904568, "eval_loss": 1.5957610607147217, "eval_runtime": 163.7109, "eval_samples_per_second": 387.732, "eval_steps_per_second": 1.515, "step": 1267000 }, { "epoch": 5.161886934266501, "grad_norm": 4.2837982177734375, "learning_rate": 0.0005369126066219735, "loss": 7.5596, "step": 1267100 }, { "epoch": 5.162294312289882, "grad_norm": 5.668388843536377, "learning_rate": 0.0005366077298359856, "loss": 7.5594, "step": 1267200 }, { "epoch": 5.162701690313264, "grad_norm": 18.58271598815918, "learning_rate": 0.0005363029308710721, "loss": 7.5077, "step": 1267300 }, { "epoch": 5.163109068336645, "grad_norm": 16.35295867919922, "learning_rate": 0.0005359982097392827, "loss": 7.5415, "step": 1267400 }, { "epoch": 5.163516446360027, "grad_norm": 21.181188583374023, "learning_rate": 0.0005356935664526649, "loss": 7.5489, "step": 1267500 }, { "epoch": 5.163923824383408, "grad_norm": 3.084169864654541, "learning_rate": 0.0005353890010232636, "loss": 7.5617, "step": 1267600 }, { "epoch": 5.16433120240679, "grad_norm": 17.424720764160156, "learning_rate": 0.0005350845134631188, "loss": 7.562, "step": 1267700 }, { "epoch": 5.16473858043017, "grad_norm": 3.184140920639038, "learning_rate": 0.0005347801037842686, "loss": 7.5331, "step": 1267800 }, { "epoch": 5.165145958453552, "grad_norm": 6.366086483001709, "learning_rate": 0.0005344757719987482, "loss": 7.5336, "step": 1267900 }, { "epoch": 5.1655533364769335, "grad_norm": 9.245692253112793, "learning_rate": 0.0005341715181185896, "loss": 7.5413, "step": 1268000 }, { "epoch": 5.1655533364769335, "eval_MaskedAccuracy": 0.512463367519968, "eval_loss": 1.5966273546218872, "eval_runtime": 171.4111, "eval_samples_per_second": 370.314, "eval_steps_per_second": 1.447, "step": 1268000 }, { "epoch": 5.165960714500315, "grad_norm": 16.10322380065918, "learning_rate": 0.0005338673421558221, "loss": 7.5161, "step": 1268100 }, { "epoch": 5.1663680925236966, "grad_norm": 3.6817374229431152, "learning_rate": 0.000533563244122471, "loss": 7.523, "step": 1268200 }, { "epoch": 5.166775470547078, "grad_norm": 16.2663631439209, "learning_rate": 0.0005332592240305583, "loss": 7.5354, "step": 1268300 }, { "epoch": 5.16718284857046, "grad_norm": 20.580501556396484, "learning_rate": 0.0005329552818921049, "loss": 7.5454, "step": 1268400 }, { "epoch": 5.167590226593841, "grad_norm": 10.921297073364258, "learning_rate": 0.0005326514177191262, "loss": 7.525, "step": 1268500 }, { "epoch": 5.167997604617223, "grad_norm": 2.968923568725586, "learning_rate": 0.0005323476315236359, "loss": 7.5435, "step": 1268600 }, { "epoch": 5.168404982640604, "grad_norm": 9.835203170776367, "learning_rate": 0.000532043923317644, "loss": 7.5515, "step": 1268700 }, { "epoch": 5.168812360663986, "grad_norm": 6.683423042297363, "learning_rate": 0.000531740293113158, "loss": 7.5591, "step": 1268800 }, { "epoch": 5.169219738687367, "grad_norm": 13.926468849182129, "learning_rate": 0.0005314367409221814, "loss": 7.5541, "step": 1268900 }, { "epoch": 5.169627116710748, "grad_norm": 4.112544059753418, "learning_rate": 0.0005311332667567165, "loss": 7.5419, "step": 1269000 }, { "epoch": 5.169627116710748, "eval_MaskedAccuracy": 0.5119529905206833, "eval_loss": 1.598028302192688, "eval_runtime": 190.2852, "eval_samples_per_second": 333.584, "eval_steps_per_second": 1.303, "step": 1269000 }, { "epoch": 5.1700344947341295, "grad_norm": 6.330892086029053, "learning_rate": 0.0005308298706287607, "loss": 7.579, "step": 1269100 }, { "epoch": 5.170441872757511, "grad_norm": 11.34080696105957, "learning_rate": 0.0005305265525503086, "loss": 7.5401, "step": 1269200 }, { "epoch": 5.1708492507808925, "grad_norm": 14.808900833129883, "learning_rate": 0.000530223312533352, "loss": 7.5109, "step": 1269300 }, { "epoch": 5.171256628804274, "grad_norm": 19.919721603393555, "learning_rate": 0.0005299201505898793, "loss": 7.5375, "step": 1269400 }, { "epoch": 5.171664006827656, "grad_norm": 13.870037078857422, "learning_rate": 0.0005296170667318773, "loss": 7.5282, "step": 1269500 }, { "epoch": 5.172071384851037, "grad_norm": 8.070316314697266, "learning_rate": 0.0005293140609713271, "loss": 7.535, "step": 1269600 }, { "epoch": 5.172478762874419, "grad_norm": 9.52051830291748, "learning_rate": 0.0005290111333202091, "loss": 7.561, "step": 1269700 }, { "epoch": 5.1728861408978, "grad_norm": 19.949111938476562, "learning_rate": 0.0005287082837904989, "loss": 7.5589, "step": 1269800 }, { "epoch": 5.173293518921182, "grad_norm": 6.0597944259643555, "learning_rate": 0.0005284055123941703, "loss": 7.5136, "step": 1269900 }, { "epoch": 5.173700896944563, "grad_norm": 3.6505286693573, "learning_rate": 0.0005281028191431927, "loss": 7.5395, "step": 1270000 }, { "epoch": 5.173700896944563, "eval_MaskedAccuracy": 0.5123596121515993, "eval_loss": 1.5900706052780151, "eval_runtime": 162.7414, "eval_samples_per_second": 390.042, "eval_steps_per_second": 1.524, "step": 1270000 }, { "epoch": 5.174108274967945, "grad_norm": 10.080352783203125, "learning_rate": 0.0005278002040495345, "loss": 7.5478, "step": 1270100 }, { "epoch": 5.174515652991326, "grad_norm": 6.672272205352783, "learning_rate": 0.0005274976671251593, "loss": 7.5676, "step": 1270200 }, { "epoch": 5.174923031014707, "grad_norm": 10.528050422668457, "learning_rate": 0.0005271952083820284, "loss": 7.5677, "step": 1270300 }, { "epoch": 5.1753304090380885, "grad_norm": 19.363574981689453, "learning_rate": 0.0005268928278320986, "loss": 7.5439, "step": 1270400 }, { "epoch": 5.17573778706147, "grad_norm": 3.931839942932129, "learning_rate": 0.0005265905254873242, "loss": 7.5489, "step": 1270500 }, { "epoch": 5.176145165084852, "grad_norm": 11.309305191040039, "learning_rate": 0.0005262883013596573, "loss": 7.5522, "step": 1270600 }, { "epoch": 5.176552543108233, "grad_norm": 13.059709548950195, "learning_rate": 0.0005259861554610477, "loss": 7.5518, "step": 1270700 }, { "epoch": 5.176959921131615, "grad_norm": 7.099419116973877, "learning_rate": 0.0005256840878034389, "loss": 7.5431, "step": 1270800 }, { "epoch": 5.177367299154996, "grad_norm": 2.741917848587036, "learning_rate": 0.0005253820983987747, "loss": 7.5658, "step": 1270900 }, { "epoch": 5.177774677178378, "grad_norm": 4.795065879821777, "learning_rate": 0.0005250801872589931, "loss": 7.5494, "step": 1271000 }, { "epoch": 5.177774677178378, "eval_MaskedAccuracy": 0.5126933325976969, "eval_loss": 1.5832537412643433, "eval_runtime": 172.5992, "eval_samples_per_second": 367.765, "eval_steps_per_second": 1.437, "step": 1271000 }, { "epoch": 5.178182055201759, "grad_norm": 7.33160400390625, "learning_rate": 0.000524778354396032, "loss": 7.5672, "step": 1271100 }, { "epoch": 5.178589433225141, "grad_norm": 18.861373901367188, "learning_rate": 0.0005244765998218236, "loss": 7.5469, "step": 1271200 }, { "epoch": 5.178996811248522, "grad_norm": 4.511181831359863, "learning_rate": 0.0005241749235482972, "loss": 7.5396, "step": 1271300 }, { "epoch": 5.179404189271904, "grad_norm": 6.524885654449463, "learning_rate": 0.0005238733255873815, "loss": 7.5576, "step": 1271400 }, { "epoch": 5.179811567295285, "grad_norm": 3.818176746368408, "learning_rate": 0.0005235718059509989, "loss": 7.5659, "step": 1271500 }, { "epoch": 5.180218945318666, "grad_norm": 13.060426712036133, "learning_rate": 0.00052327036465107, "loss": 7.5366, "step": 1271600 }, { "epoch": 5.1806263233420475, "grad_norm": 22.0394229888916, "learning_rate": 0.0005229690016995128, "loss": 7.5515, "step": 1271700 }, { "epoch": 5.181033701365429, "grad_norm": 19.83888053894043, "learning_rate": 0.0005226677171082413, "loss": 7.5662, "step": 1271800 }, { "epoch": 5.181441079388811, "grad_norm": 9.551691055297852, "learning_rate": 0.0005223665108891677, "loss": 7.5682, "step": 1271900 }, { "epoch": 5.181848457412192, "grad_norm": 4.3937859535217285, "learning_rate": 0.0005220653830541982, "loss": 7.5552, "step": 1272000 }, { "epoch": 5.181848457412192, "eval_MaskedAccuracy": 0.5124723849085815, "eval_loss": 1.5935578346252441, "eval_runtime": 174.9777, "eval_samples_per_second": 362.766, "eval_steps_per_second": 1.417, "step": 1272000 }, { "epoch": 5.182255835435574, "grad_norm": 22.850011825561523, "learning_rate": 0.0005217643336152411, "loss": 7.5562, "step": 1272100 }, { "epoch": 5.182663213458955, "grad_norm": 18.282371520996094, "learning_rate": 0.0005214633625841967, "loss": 7.5065, "step": 1272200 }, { "epoch": 5.183070591482337, "grad_norm": 7.804836273193359, "learning_rate": 0.0005211624699729641, "loss": 7.544, "step": 1272300 }, { "epoch": 5.183477969505718, "grad_norm": 4.658621311187744, "learning_rate": 0.0005208616557934395, "loss": 7.5487, "step": 1272400 }, { "epoch": 5.1838853475291, "grad_norm": 5.6526947021484375, "learning_rate": 0.0005205609200575151, "loss": 7.5616, "step": 1272500 }, { "epoch": 5.184292725552481, "grad_norm": 20.303049087524414, "learning_rate": 0.0005202602627770817, "loss": 7.5462, "step": 1272600 }, { "epoch": 5.184700103575863, "grad_norm": 21.300710678100586, "learning_rate": 0.0005199596839640245, "loss": 7.5363, "step": 1272700 }, { "epoch": 5.1851074815992435, "grad_norm": 8.422891616821289, "learning_rate": 0.0005196591836302279, "loss": 7.5784, "step": 1272800 }, { "epoch": 5.185514859622625, "grad_norm": 15.43104076385498, "learning_rate": 0.0005193587617875722, "loss": 7.5381, "step": 1272900 }, { "epoch": 5.185922237646007, "grad_norm": 9.114885330200195, "learning_rate": 0.0005190584184479343, "loss": 7.5534, "step": 1273000 }, { "epoch": 5.185922237646007, "eval_MaskedAccuracy": 0.5126939014839413, "eval_loss": 1.5904895067214966, "eval_runtime": 173.6511, "eval_samples_per_second": 365.538, "eval_steps_per_second": 1.428, "step": 1273000 }, { "epoch": 5.186329615669388, "grad_norm": 9.559250831604004, "learning_rate": 0.0005187581536231892, "loss": 7.5217, "step": 1273100 }, { "epoch": 5.18673699369277, "grad_norm": 4.708173751831055, "learning_rate": 0.0005184579673252074, "loss": 7.5205, "step": 1273200 }, { "epoch": 5.187144371716151, "grad_norm": 21.92316436767578, "learning_rate": 0.0005181578595658565, "loss": 7.5412, "step": 1273300 }, { "epoch": 5.187551749739533, "grad_norm": 11.953930854797363, "learning_rate": 0.000517857830357002, "loss": 7.5612, "step": 1273400 }, { "epoch": 5.187959127762914, "grad_norm": 17.054977416992188, "learning_rate": 0.0005175578797105055, "loss": 7.5511, "step": 1273500 }, { "epoch": 5.188366505786296, "grad_norm": 9.827713012695312, "learning_rate": 0.0005172580076382258, "loss": 7.5188, "step": 1273600 }, { "epoch": 5.188773883809677, "grad_norm": 20.34474754333496, "learning_rate": 0.0005169582141520181, "loss": 7.5589, "step": 1273700 }, { "epoch": 5.189181261833059, "grad_norm": 7.054084777832031, "learning_rate": 0.0005166584992637351, "loss": 7.5555, "step": 1273800 }, { "epoch": 5.18958863985644, "grad_norm": 7.799508094787598, "learning_rate": 0.0005163588629852259, "loss": 7.558, "step": 1273900 }, { "epoch": 5.189996017879821, "grad_norm": 8.375802993774414, "learning_rate": 0.0005160593053283368, "loss": 7.52, "step": 1274000 }, { "epoch": 5.189996017879821, "eval_MaskedAccuracy": 0.512114161487815, "eval_loss": 1.5887385606765747, "eval_runtime": 165.4391, "eval_samples_per_second": 383.682, "eval_steps_per_second": 1.499, "step": 1274000 }, { "epoch": 5.1904033959032025, "grad_norm": 10.135929107666016, "learning_rate": 0.0005157598263049106, "loss": 7.543, "step": 1274100 }, { "epoch": 5.190810773926584, "grad_norm": 12.61279582977295, "learning_rate": 0.0005154604259267879, "loss": 7.5402, "step": 1274200 }, { "epoch": 5.191218151949966, "grad_norm": 8.79918384552002, "learning_rate": 0.0005151611042058058, "loss": 7.5575, "step": 1274300 }, { "epoch": 5.191625529973347, "grad_norm": 16.091278076171875, "learning_rate": 0.0005148618611537979, "loss": 7.5296, "step": 1274400 }, { "epoch": 5.192032907996729, "grad_norm": 16.06725311279297, "learning_rate": 0.0005145626967825941, "loss": 7.537, "step": 1274500 }, { "epoch": 5.19244028602011, "grad_norm": 18.132741928100586, "learning_rate": 0.0005142636111040232, "loss": 7.5405, "step": 1274600 }, { "epoch": 5.192847664043492, "grad_norm": 4.988110542297363, "learning_rate": 0.0005139646041299082, "loss": 7.533, "step": 1274700 }, { "epoch": 5.193255042066873, "grad_norm": 5.989030838012695, "learning_rate": 0.0005136656758720708, "loss": 7.563, "step": 1274800 }, { "epoch": 5.193662420090255, "grad_norm": 25.162853240966797, "learning_rate": 0.0005133668263423302, "loss": 7.5419, "step": 1274900 }, { "epoch": 5.194069798113636, "grad_norm": 18.128734588623047, "learning_rate": 0.0005130680555525008, "loss": 7.534, "step": 1275000 }, { "epoch": 5.194069798113636, "eval_MaskedAccuracy": 0.5121862551616991, "eval_loss": 1.5897197723388672, "eval_runtime": 160.1237, "eval_samples_per_second": 396.419, "eval_steps_per_second": 1.549, "step": 1275000 }, { "epoch": 5.194477176137018, "grad_norm": 8.414715766906738, "learning_rate": 0.0005127693635143956, "loss": 7.5486, "step": 1275100 }, { "epoch": 5.194884554160399, "grad_norm": 3.6837668418884277, "learning_rate": 0.0005124707502398218, "loss": 7.557, "step": 1275200 }, { "epoch": 5.19529193218378, "grad_norm": 6.873579025268555, "learning_rate": 0.0005121722157405866, "loss": 7.5579, "step": 1275300 }, { "epoch": 5.195699310207162, "grad_norm": 6.990813255310059, "learning_rate": 0.0005118737600284916, "loss": 7.5723, "step": 1275400 }, { "epoch": 5.196106688230543, "grad_norm": 4.58314847946167, "learning_rate": 0.0005115753831153366, "loss": 7.5239, "step": 1275500 }, { "epoch": 5.196514066253925, "grad_norm": 15.989585876464844, "learning_rate": 0.0005112770850129188, "loss": 7.5425, "step": 1275600 }, { "epoch": 5.196921444277306, "grad_norm": 20.096433639526367, "learning_rate": 0.000510978865733031, "loss": 7.5467, "step": 1275700 }, { "epoch": 5.197328822300688, "grad_norm": 10.62798023223877, "learning_rate": 0.0005106807252874631, "loss": 7.5466, "step": 1275800 }, { "epoch": 5.197736200324069, "grad_norm": 21.502344131469727, "learning_rate": 0.0005103826636880029, "loss": 7.503, "step": 1275900 }, { "epoch": 5.198143578347451, "grad_norm": 3.51676082611084, "learning_rate": 0.000510084680946433, "loss": 7.5731, "step": 1276000 }, { "epoch": 5.198143578347451, "eval_MaskedAccuracy": 0.5123767147094139, "eval_loss": 1.5917176008224487, "eval_runtime": 171.0444, "eval_samples_per_second": 371.108, "eval_steps_per_second": 1.45, "step": 1276000 }, { "epoch": 5.198550956370832, "grad_norm": 12.190217018127441, "learning_rate": 0.0005097867770745358, "loss": 7.5173, "step": 1276100 }, { "epoch": 5.198958334394214, "grad_norm": 24.854997634887695, "learning_rate": 0.0005094889520840889, "loss": 7.5543, "step": 1276200 }, { "epoch": 5.199365712417595, "grad_norm": 21.390878677368164, "learning_rate": 0.0005091912059868668, "loss": 7.5561, "step": 1276300 }, { "epoch": 5.199773090440977, "grad_norm": 16.05355453491211, "learning_rate": 0.0005088935387946397, "loss": 7.5494, "step": 1276400 }, { "epoch": 5.200180468464358, "grad_norm": 4.544560432434082, "learning_rate": 0.0005085959505191776, "loss": 7.5553, "step": 1276500 }, { "epoch": 5.200587846487739, "grad_norm": 19.7420654296875, "learning_rate": 0.0005082984411722452, "loss": 7.5475, "step": 1276600 }, { "epoch": 5.200995224511121, "grad_norm": 12.10345458984375, "learning_rate": 0.0005080010107656047, "loss": 7.5647, "step": 1276700 }, { "epoch": 5.201402602534502, "grad_norm": 5.871239185333252, "learning_rate": 0.0005077036593110144, "loss": 7.5453, "step": 1276800 }, { "epoch": 5.201809980557884, "grad_norm": 18.729658126831055, "learning_rate": 0.0005074063868202314, "loss": 7.537, "step": 1276900 }, { "epoch": 5.202217358581265, "grad_norm": 6.017864227294922, "learning_rate": 0.0005071091933050057, "loss": 7.5214, "step": 1277000 }, { "epoch": 5.202217358581265, "eval_MaskedAccuracy": 0.5128177817636642, "eval_loss": 1.5880012512207031, "eval_runtime": 173.3499, "eval_samples_per_second": 366.173, "eval_steps_per_second": 1.431, "step": 1277000 }, { "epoch": 5.202624736604647, "grad_norm": 6.193304061889648, "learning_rate": 0.0005068120787770904, "loss": 7.562, "step": 1277100 }, { "epoch": 5.203032114628028, "grad_norm": 18.89733123779297, "learning_rate": 0.0005065150432482303, "loss": 7.532, "step": 1277200 }, { "epoch": 5.20343949265141, "grad_norm": 6.110525608062744, "learning_rate": 0.0005062180867301693, "loss": 7.5612, "step": 1277300 }, { "epoch": 5.203846870674791, "grad_norm": 16.18763542175293, "learning_rate": 0.000505921209234647, "loss": 7.5379, "step": 1277400 }, { "epoch": 5.204254248698173, "grad_norm": 18.676969528198242, "learning_rate": 0.0005056244107734024, "loss": 7.5401, "step": 1277500 }, { "epoch": 5.204661626721554, "grad_norm": 14.559529304504395, "learning_rate": 0.000505327691358168, "loss": 7.5684, "step": 1277600 }, { "epoch": 5.205069004744936, "grad_norm": 4.3110175132751465, "learning_rate": 0.0005050310510006749, "loss": 7.5665, "step": 1277700 }, { "epoch": 5.205476382768317, "grad_norm": 4.289351463317871, "learning_rate": 0.0005047344897126508, "loss": 7.5634, "step": 1277800 }, { "epoch": 5.205883760791698, "grad_norm": 4.672882556915283, "learning_rate": 0.0005044380075058202, "loss": 7.5554, "step": 1277900 }, { "epoch": 5.20629113881508, "grad_norm": 15.918068885803223, "learning_rate": 0.0005041416043919044, "loss": 7.5788, "step": 1278000 }, { "epoch": 5.20629113881508, "eval_MaskedAccuracy": 0.5121467039824147, "eval_loss": 1.5961333513259888, "eval_runtime": 164.6736, "eval_samples_per_second": 385.465, "eval_steps_per_second": 1.506, "step": 1278000 }, { "epoch": 5.206698516838461, "grad_norm": 19.095203399658203, "learning_rate": 0.0005038452803826239, "loss": 7.5523, "step": 1278100 }, { "epoch": 5.207105894861843, "grad_norm": 21.950645446777344, "learning_rate": 0.0005035490354896918, "loss": 7.5401, "step": 1278200 }, { "epoch": 5.207513272885224, "grad_norm": 5.846652507781982, "learning_rate": 0.0005032528697248206, "loss": 7.5438, "step": 1278300 }, { "epoch": 5.207920650908606, "grad_norm": 3.010402202606201, "learning_rate": 0.0005029567830997202, "loss": 7.5521, "step": 1278400 }, { "epoch": 5.208328028931987, "grad_norm": 8.32205581665039, "learning_rate": 0.0005026607756260952, "loss": 7.5344, "step": 1278500 }, { "epoch": 5.208735406955369, "grad_norm": 5.044190883636475, "learning_rate": 0.000502364847315649, "loss": 7.5528, "step": 1278600 }, { "epoch": 5.20914278497875, "grad_norm": 9.99151611328125, "learning_rate": 0.0005020689981800818, "loss": 7.5759, "step": 1278700 }, { "epoch": 5.209550163002132, "grad_norm": 9.568770408630371, "learning_rate": 0.0005017732282310888, "loss": 7.5386, "step": 1278800 }, { "epoch": 5.2099575410255134, "grad_norm": 27.248090744018555, "learning_rate": 0.0005014775374803644, "loss": 7.5319, "step": 1278900 }, { "epoch": 5.210364919048894, "grad_norm": 5.413463115692139, "learning_rate": 0.0005011819259395981, "loss": 7.537, "step": 1279000 }, { "epoch": 5.210364919048894, "eval_MaskedAccuracy": 0.5122444609892658, "eval_loss": 1.587311029434204, "eval_runtime": 163.1407, "eval_samples_per_second": 389.088, "eval_steps_per_second": 1.52, "step": 1279000 }, { "epoch": 5.210772297072276, "grad_norm": 11.685070991516113, "learning_rate": 0.0005008863936204777, "loss": 7.5437, "step": 1279100 }, { "epoch": 5.211179675095657, "grad_norm": 7.2980265617370605, "learning_rate": 0.0005005909405346861, "loss": 7.5981, "step": 1279200 }, { "epoch": 5.211587053119039, "grad_norm": 18.209922790527344, "learning_rate": 0.0005002955666939064, "loss": 7.545, "step": 1279300 }, { "epoch": 5.21199443114242, "grad_norm": 12.128777503967285, "learning_rate": 0.0005000002721098142, "loss": 7.5637, "step": 1279400 }, { "epoch": 5.212401809165802, "grad_norm": 4.433072566986084, "learning_rate": 0.0004997050567940851, "loss": 7.5295, "step": 1279500 }, { "epoch": 5.212809187189183, "grad_norm": 10.994619369506836, "learning_rate": 0.0004994099207583901, "loss": 7.5315, "step": 1279600 }, { "epoch": 5.213216565212565, "grad_norm": 13.443550109863281, "learning_rate": 0.0004991148640143975, "loss": 7.5605, "step": 1279700 }, { "epoch": 5.213623943235946, "grad_norm": 16.893964767456055, "learning_rate": 0.0004988198865737724, "loss": 7.5842, "step": 1279800 }, { "epoch": 5.214031321259328, "grad_norm": 9.369170188903809, "learning_rate": 0.0004985249884481761, "loss": 7.5525, "step": 1279900 }, { "epoch": 5.214438699282709, "grad_norm": 9.552986145019531, "learning_rate": 0.0004982301696492694, "loss": 7.5436, "step": 1280000 }, { "epoch": 5.214438699282709, "eval_MaskedAccuracy": 0.5130588358646386, "eval_loss": 1.5795327425003052, "eval_runtime": 169.1986, "eval_samples_per_second": 375.157, "eval_steps_per_second": 1.466, "step": 1280000 }, { "epoch": 5.214846077306091, "grad_norm": 3.6762850284576416, "learning_rate": 0.0004979354301887042, "loss": 7.5163, "step": 1280100 }, { "epoch": 5.2152534553294725, "grad_norm": 8.116147994995117, "learning_rate": 0.0004976407700781358, "loss": 7.5847, "step": 1280200 }, { "epoch": 5.215660833352853, "grad_norm": 5.632050514221191, "learning_rate": 0.0004973461893292147, "loss": 7.4982, "step": 1280300 }, { "epoch": 5.216068211376235, "grad_norm": 5.7624053955078125, "learning_rate": 0.0004970516879535864, "loss": 7.5721, "step": 1280400 }, { "epoch": 5.216475589399616, "grad_norm": 6.228050231933594, "learning_rate": 0.0004967572659628934, "loss": 7.5744, "step": 1280500 }, { "epoch": 5.216882967422998, "grad_norm": 3.5868136882781982, "learning_rate": 0.0004964629233687771, "loss": 7.5626, "step": 1280600 }, { "epoch": 5.217290345446379, "grad_norm": 11.111696243286133, "learning_rate": 0.000496168660182874, "loss": 7.5391, "step": 1280700 }, { "epoch": 5.217697723469761, "grad_norm": 4.965041160583496, "learning_rate": 0.0004958744764168174, "loss": 7.5492, "step": 1280800 }, { "epoch": 5.218105101493142, "grad_norm": 12.747413635253906, "learning_rate": 0.000495580372082238, "loss": 7.5441, "step": 1280900 }, { "epoch": 5.218512479516524, "grad_norm": 5.378708362579346, "learning_rate": 0.0004952863471907636, "loss": 7.5121, "step": 1281000 }, { "epoch": 5.218512479516524, "eval_MaskedAccuracy": 0.5126325956417793, "eval_loss": 1.593763828277588, "eval_runtime": 176.2232, "eval_samples_per_second": 360.202, "eval_steps_per_second": 1.407, "step": 1281000 }, { "epoch": 5.218919857539905, "grad_norm": 11.631937026977539, "learning_rate": 0.0004949924017540182, "loss": 7.5425, "step": 1281100 }, { "epoch": 5.219327235563287, "grad_norm": 5.905684947967529, "learning_rate": 0.0004946985357836244, "loss": 7.5377, "step": 1281200 }, { "epoch": 5.2197346135866685, "grad_norm": 5.273624420166016, "learning_rate": 0.0004944047492911988, "loss": 7.5476, "step": 1281300 }, { "epoch": 5.22014199161005, "grad_norm": 9.432669639587402, "learning_rate": 0.0004941110422883577, "loss": 7.5253, "step": 1281400 }, { "epoch": 5.2205493696334315, "grad_norm": 7.141916275024414, "learning_rate": 0.0004938174147867119, "loss": 7.5348, "step": 1281500 }, { "epoch": 5.220956747656812, "grad_norm": 12.515483856201172, "learning_rate": 0.0004935238667978699, "loss": 7.535, "step": 1281600 }, { "epoch": 5.221364125680194, "grad_norm": 4.881894588470459, "learning_rate": 0.0004932303983334376, "loss": 7.5415, "step": 1281700 }, { "epoch": 5.221771503703575, "grad_norm": 3.7656407356262207, "learning_rate": 0.0004929370094050173, "loss": 7.5609, "step": 1281800 }, { "epoch": 5.222178881726957, "grad_norm": 13.714067459106445, "learning_rate": 0.0004926437000242073, "loss": 7.5394, "step": 1281900 }, { "epoch": 5.222586259750338, "grad_norm": 15.318659782409668, "learning_rate": 0.0004923504702026056, "loss": 7.5323, "step": 1282000 }, { "epoch": 5.222586259750338, "eval_MaskedAccuracy": 0.51202640226235, "eval_loss": 1.5868650674819946, "eval_runtime": 169.942, "eval_samples_per_second": 373.516, "eval_steps_per_second": 1.459, "step": 1282000 }, { "epoch": 5.22299363777372, "grad_norm": 4.545812129974365, "learning_rate": 0.0004920573199518039, "loss": 7.5595, "step": 1282100 }, { "epoch": 5.223401015797101, "grad_norm": 18.41389274597168, "learning_rate": 0.0004917642492833929, "loss": 7.5457, "step": 1282200 }, { "epoch": 5.223808393820483, "grad_norm": 5.827291011810303, "learning_rate": 0.0004914712582089581, "loss": 7.5457, "step": 1282300 }, { "epoch": 5.224215771843864, "grad_norm": 13.710026741027832, "learning_rate": 0.0004911783467400837, "loss": 7.5297, "step": 1282400 }, { "epoch": 5.224623149867246, "grad_norm": 7.869463920593262, "learning_rate": 0.0004908855148883501, "loss": 7.5423, "step": 1282500 }, { "epoch": 5.2250305278906275, "grad_norm": 4.054448127746582, "learning_rate": 0.000490592762665335, "loss": 7.553, "step": 1282600 }, { "epoch": 5.225437905914009, "grad_norm": 7.019258975982666, "learning_rate": 0.0004903000900826111, "loss": 7.5315, "step": 1282700 }, { "epoch": 5.22584528393739, "grad_norm": 8.367408752441406, "learning_rate": 0.0004900074971517503, "loss": 7.5601, "step": 1282800 }, { "epoch": 5.226252661960771, "grad_norm": 8.909741401672363, "learning_rate": 0.00048971498388432, "loss": 7.5328, "step": 1282900 }, { "epoch": 5.226660039984153, "grad_norm": 10.273576736450195, "learning_rate": 0.000489422550291885, "loss": 7.5083, "step": 1283000 }, { "epoch": 5.226660039984153, "eval_MaskedAccuracy": 0.5120344032540787, "eval_loss": 1.5840531587600708, "eval_runtime": 162.2577, "eval_samples_per_second": 391.205, "eval_steps_per_second": 1.528, "step": 1283000 }, { "epoch": 5.227067418007534, "grad_norm": 10.882198333740234, "learning_rate": 0.0004891301963860064, "loss": 7.5474, "step": 1283100 }, { "epoch": 5.227474796030916, "grad_norm": 10.444225311279297, "learning_rate": 0.0004888379221782434, "loss": 7.5588, "step": 1283200 }, { "epoch": 5.227882174054297, "grad_norm": 20.341291427612305, "learning_rate": 0.0004885457276801499, "loss": 7.5335, "step": 1283300 }, { "epoch": 5.228289552077679, "grad_norm": 12.155478477478027, "learning_rate": 0.000488253612903279, "loss": 7.5322, "step": 1283400 }, { "epoch": 5.22869693010106, "grad_norm": 6.3683390617370605, "learning_rate": 0.000487961577859181, "loss": 7.5329, "step": 1283500 }, { "epoch": 5.229104308124442, "grad_norm": 9.687567710876465, "learning_rate": 0.00048766962255940034, "loss": 7.5184, "step": 1283600 }, { "epoch": 5.2295116861478235, "grad_norm": 15.411434173583984, "learning_rate": 0.0004873777470154787, "loss": 7.5815, "step": 1283700 }, { "epoch": 5.229919064171205, "grad_norm": 2.9939393997192383, "learning_rate": 0.00048708595123895566, "loss": 7.5516, "step": 1283800 }, { "epoch": 5.2303264421945865, "grad_norm": 8.011006355285645, "learning_rate": 0.0004867942352413677, "loss": 7.5448, "step": 1283900 }, { "epoch": 5.230733820217967, "grad_norm": 12.934003829956055, "learning_rate": 0.000486502599034248, "loss": 7.5224, "step": 1284000 }, { "epoch": 5.230733820217967, "eval_MaskedAccuracy": 0.5120633930333218, "eval_loss": 1.5963746309280396, "eval_runtime": 165.3475, "eval_samples_per_second": 383.894, "eval_steps_per_second": 1.5, "step": 1284000 }, { "epoch": 5.231141198241349, "grad_norm": 8.110870361328125, "learning_rate": 0.0004862110426291262, "loss": 7.5334, "step": 1284100 }, { "epoch": 5.23154857626473, "grad_norm": 28.927682876586914, "learning_rate": 0.00048591956603752894, "loss": 7.5333, "step": 1284200 }, { "epoch": 5.231955954288112, "grad_norm": 18.001243591308594, "learning_rate": 0.0004856281692709795, "loss": 7.5285, "step": 1284300 }, { "epoch": 5.232363332311493, "grad_norm": 11.77944564819336, "learning_rate": 0.0004853368523409993, "loss": 7.5394, "step": 1284400 }, { "epoch": 5.232770710334875, "grad_norm": 11.754374504089355, "learning_rate": 0.0004850456152591052, "loss": 7.5542, "step": 1284500 }, { "epoch": 5.233178088358256, "grad_norm": 4.447332382202148, "learning_rate": 0.0004847544580368111, "loss": 7.5223, "step": 1284600 }, { "epoch": 5.233585466381638, "grad_norm": 12.266335487365723, "learning_rate": 0.0004844633806856289, "loss": 7.4934, "step": 1284700 }, { "epoch": 5.233992844405019, "grad_norm": 4.861476421356201, "learning_rate": 0.00048417238321706577, "loss": 7.5464, "step": 1284800 }, { "epoch": 5.234400222428401, "grad_norm": 3.598330497741699, "learning_rate": 0.00048388146564262567, "loss": 7.5578, "step": 1284900 }, { "epoch": 5.2348076004517825, "grad_norm": 22.25257682800293, "learning_rate": 0.00048359062797381157, "loss": 7.52, "step": 1285000 }, { "epoch": 5.2348076004517825, "eval_MaskedAccuracy": 0.5125263938568052, "eval_loss": 1.5771960020065308, "eval_runtime": 164.8982, "eval_samples_per_second": 384.94, "eval_steps_per_second": 1.504, "step": 1285000 }, { "epoch": 5.235214978475164, "grad_norm": 14.142352104187012, "learning_rate": 0.0004832998702221213, "loss": 7.584, "step": 1285100 }, { "epoch": 5.235622356498546, "grad_norm": 4.911906719207764, "learning_rate": 0.00048300919239904966, "loss": 7.5526, "step": 1285200 }, { "epoch": 5.236029734521926, "grad_norm": 5.236055374145508, "learning_rate": 0.00048271859451608933, "loss": 7.5327, "step": 1285300 }, { "epoch": 5.236437112545308, "grad_norm": 6.5616984367370605, "learning_rate": 0.00048242807658472887, "loss": 7.5313, "step": 1285400 }, { "epoch": 5.236844490568689, "grad_norm": 14.643767356872559, "learning_rate": 0.0004821376386164545, "loss": 7.576, "step": 1285500 }, { "epoch": 5.237251868592071, "grad_norm": 6.73957633972168, "learning_rate": 0.0004818472806227475, "loss": 7.543, "step": 1285600 }, { "epoch": 5.237659246615452, "grad_norm": 10.866485595703125, "learning_rate": 0.00048155700261508835, "loss": 7.5103, "step": 1285700 }, { "epoch": 5.238066624638834, "grad_norm": 4.343795299530029, "learning_rate": 0.00048126680460495314, "loss": 7.5332, "step": 1285800 }, { "epoch": 5.238474002662215, "grad_norm": 12.848427772521973, "learning_rate": 0.0004809766866038141, "loss": 7.5352, "step": 1285900 }, { "epoch": 5.238881380685597, "grad_norm": 6.552394866943359, "learning_rate": 0.0004806866486231429, "loss": 7.5298, "step": 1286000 }, { "epoch": 5.238881380685597, "eval_MaskedAccuracy": 0.5122762140455446, "eval_loss": 1.602086067199707, "eval_runtime": 183.139, "eval_samples_per_second": 346.6, "eval_steps_per_second": 1.354, "step": 1286000 }, { "epoch": 5.2392887587089785, "grad_norm": 3.267075538635254, "learning_rate": 0.0004803966906744046, "loss": 7.521, "step": 1286100 }, { "epoch": 5.23969613673236, "grad_norm": 5.963367462158203, "learning_rate": 0.00048010681276906397, "loss": 7.5325, "step": 1286200 }, { "epoch": 5.2401035147557415, "grad_norm": 7.868770599365234, "learning_rate": 0.0004798170149185809, "loss": 7.5412, "step": 1286300 }, { "epoch": 5.240510892779123, "grad_norm": 6.230511665344238, "learning_rate": 0.00047952729713441244, "loss": 7.5466, "step": 1286400 }, { "epoch": 5.240918270802505, "grad_norm": 21.375364303588867, "learning_rate": 0.0004792376594280131, "loss": 7.5623, "step": 1286500 }, { "epoch": 5.241325648825885, "grad_norm": 5.332259654998779, "learning_rate": 0.00047894810181083335, "loss": 7.53, "step": 1286600 }, { "epoch": 5.241733026849267, "grad_norm": 8.759510040283203, "learning_rate": 0.0004786586242943215, "loss": 7.5094, "step": 1286700 }, { "epoch": 5.242140404872648, "grad_norm": 14.29955768585205, "learning_rate": 0.0004783692268899217, "loss": 7.5341, "step": 1286800 }, { "epoch": 5.24254778289603, "grad_norm": 3.381981134414673, "learning_rate": 0.000478079909609075, "loss": 7.5583, "step": 1286900 }, { "epoch": 5.242955160919411, "grad_norm": 15.60663890838623, "learning_rate": 0.0004777906724632211, "loss": 7.5218, "step": 1287000 }, { "epoch": 5.242955160919411, "eval_MaskedAccuracy": 0.5127360467544204, "eval_loss": 1.5800647735595703, "eval_runtime": 164.4253, "eval_samples_per_second": 386.048, "eval_steps_per_second": 1.508, "step": 1287000 }, { "epoch": 5.243362538942793, "grad_norm": 11.387564659118652, "learning_rate": 0.0004775015154637939, "loss": 7.5437, "step": 1287100 }, { "epoch": 5.2437699169661744, "grad_norm": 6.099433898925781, "learning_rate": 0.00047721243862222483, "loss": 7.5736, "step": 1287200 }, { "epoch": 5.244177294989556, "grad_norm": 5.325205326080322, "learning_rate": 0.00047692344194994433, "loss": 7.5478, "step": 1287300 }, { "epoch": 5.2445846730129375, "grad_norm": 8.52285385131836, "learning_rate": 0.0004766345254583771, "loss": 7.5513, "step": 1287400 }, { "epoch": 5.244992051036319, "grad_norm": 4.5920820236206055, "learning_rate": 0.0004763456891589454, "loss": 7.5396, "step": 1287500 }, { "epoch": 5.245399429059701, "grad_norm": 5.701610565185547, "learning_rate": 0.00047605693306306874, "loss": 7.5753, "step": 1287600 }, { "epoch": 5.245806807083082, "grad_norm": 9.378232955932617, "learning_rate": 0.00047576825718216267, "loss": 7.5534, "step": 1287700 }, { "epoch": 5.246214185106463, "grad_norm": 3.1569764614105225, "learning_rate": 0.0004754796615276406, "loss": 7.5526, "step": 1287800 }, { "epoch": 5.246621563129844, "grad_norm": 7.628342151641846, "learning_rate": 0.0004751911461109123, "loss": 7.565, "step": 1287900 }, { "epoch": 5.247028941153226, "grad_norm": 9.264254570007324, "learning_rate": 0.0004749027109433841, "loss": 7.5454, "step": 1288000 }, { "epoch": 5.247028941153226, "eval_MaskedAccuracy": 0.5126010382443479, "eval_loss": 1.5994163751602173, "eval_runtime": 179.8793, "eval_samples_per_second": 352.881, "eval_steps_per_second": 1.379, "step": 1288000 }, { "epoch": 5.247436319176607, "grad_norm": 11.552118301391602, "learning_rate": 0.0004746143560364597, "loss": 7.5598, "step": 1288100 }, { "epoch": 5.247843697199989, "grad_norm": 16.229381561279297, "learning_rate": 0.0004743260814015388, "loss": 7.5822, "step": 1288200 }, { "epoch": 5.24825107522337, "grad_norm": 3.563021659851074, "learning_rate": 0.0004740378870500186, "loss": 7.5352, "step": 1288300 }, { "epoch": 5.248658453246752, "grad_norm": 18.701969146728516, "learning_rate": 0.0004737497729932932, "loss": 7.4985, "step": 1288400 }, { "epoch": 5.2490658312701335, "grad_norm": 12.15512752532959, "learning_rate": 0.0004734617392427531, "loss": 7.5343, "step": 1288500 }, { "epoch": 5.249473209293515, "grad_norm": 10.886027336120605, "learning_rate": 0.00047317378580978645, "loss": 7.5496, "step": 1288600 }, { "epoch": 5.2498805873168966, "grad_norm": 6.915746212005615, "learning_rate": 0.00047288591270577655, "loss": 7.5567, "step": 1288700 }, { "epoch": 5.250287965340278, "grad_norm": 3.643834352493286, "learning_rate": 0.00047259811994210477, "loss": 7.5528, "step": 1288800 }, { "epoch": 5.25069534336366, "grad_norm": 5.9770827293396, "learning_rate": 0.00047231040753014893, "loss": 7.5479, "step": 1288900 }, { "epoch": 5.25110272138704, "grad_norm": 19.901004791259766, "learning_rate": 0.00047202277548128504, "loss": 7.5477, "step": 1289000 }, { "epoch": 5.25110272138704, "eval_MaskedAccuracy": 0.512350450181298, "eval_loss": 1.5840940475463867, "eval_runtime": 170.3322, "eval_samples_per_second": 372.66, "eval_steps_per_second": 1.456, "step": 1289000 }, { "epoch": 5.251510099410422, "grad_norm": 11.31248664855957, "learning_rate": 0.000471735223806884, "loss": 7.5632, "step": 1289100 }, { "epoch": 5.251917477433803, "grad_norm": 10.792511940002441, "learning_rate": 0.0004714477525183138, "loss": 7.5247, "step": 1289200 }, { "epoch": 5.252324855457185, "grad_norm": 6.50295877456665, "learning_rate": 0.0004711603616269403, "loss": 7.5642, "step": 1289300 }, { "epoch": 5.252732233480566, "grad_norm": 5.041585445404053, "learning_rate": 0.0004708730511441261, "loss": 7.5536, "step": 1289400 }, { "epoch": 5.253139611503948, "grad_norm": 3.0778822898864746, "learning_rate": 0.00047058582108122903, "loss": 7.5834, "step": 1289500 }, { "epoch": 5.2535469895273295, "grad_norm": 7.7733659744262695, "learning_rate": 0.00047029867144960475, "loss": 7.5586, "step": 1289600 }, { "epoch": 5.253954367550711, "grad_norm": 7.688708305358887, "learning_rate": 0.00047001160226060705, "loss": 7.5327, "step": 1289700 }, { "epoch": 5.2543617455740925, "grad_norm": 16.57267189025879, "learning_rate": 0.00046972461352558394, "loss": 7.5596, "step": 1289800 }, { "epoch": 5.254769123597474, "grad_norm": 10.274303436279297, "learning_rate": 0.00046943770525588273, "loss": 7.5482, "step": 1289900 }, { "epoch": 5.255176501620856, "grad_norm": 8.531637191772461, "learning_rate": 0.0004691508774628459, "loss": 7.549, "step": 1290000 }, { "epoch": 5.255176501620856, "eval_MaskedAccuracy": 0.5127083132934412, "eval_loss": 1.5892359018325806, "eval_runtime": 175.776, "eval_samples_per_second": 361.119, "eval_steps_per_second": 1.411, "step": 1290000 }, { "epoch": 5.255583879644237, "grad_norm": 12.287857055664062, "learning_rate": 0.0004688641301578133, "loss": 7.5599, "step": 1290100 }, { "epoch": 5.255991257667619, "grad_norm": 20.044239044189453, "learning_rate": 0.00046857746335212205, "loss": 7.5597, "step": 1290200 }, { "epoch": 5.256398635690999, "grad_norm": 9.0589599609375, "learning_rate": 0.0004682908770571044, "loss": 7.5507, "step": 1290300 }, { "epoch": 5.256806013714381, "grad_norm": 4.308457851409912, "learning_rate": 0.00046800437128409196, "loss": 7.537, "step": 1290400 }, { "epoch": 5.257213391737762, "grad_norm": 10.643133163452148, "learning_rate": 0.0004677179460444113, "loss": 7.5506, "step": 1290500 }, { "epoch": 5.257620769761144, "grad_norm": 4.991848945617676, "learning_rate": 0.0004674316013493865, "loss": 7.5256, "step": 1290600 }, { "epoch": 5.258028147784525, "grad_norm": 14.143113136291504, "learning_rate": 0.00046714533721033824, "loss": 7.5438, "step": 1290700 }, { "epoch": 5.258435525807907, "grad_norm": 5.053746700286865, "learning_rate": 0.00046685915363858305, "loss": 7.5349, "step": 1290800 }, { "epoch": 5.2588429038312885, "grad_norm": 6.166320323944092, "learning_rate": 0.0004665730506454373, "loss": 7.5626, "step": 1290900 }, { "epoch": 5.25925028185467, "grad_norm": 5.733420372009277, "learning_rate": 0.00046628702824221116, "loss": 7.5535, "step": 1291000 }, { "epoch": 5.25925028185467, "eval_MaskedAccuracy": 0.5129774351274916, "eval_loss": 1.5838022232055664, "eval_runtime": 190.6573, "eval_samples_per_second": 332.933, "eval_steps_per_second": 1.301, "step": 1291000 }, { "epoch": 5.259657659878052, "grad_norm": 18.24139976501465, "learning_rate": 0.00046600108644021253, "loss": 7.5392, "step": 1291100 }, { "epoch": 5.260065037901433, "grad_norm": 7.882357597351074, "learning_rate": 0.0004657152252507463, "loss": 7.5146, "step": 1291200 }, { "epoch": 5.260472415924815, "grad_norm": 4.560593605041504, "learning_rate": 0.00046542944468511344, "loss": 7.5257, "step": 1291300 }, { "epoch": 5.260879793948196, "grad_norm": 4.275140285491943, "learning_rate": 0.0004651437447546133, "loss": 7.5555, "step": 1291400 }, { "epoch": 5.261287171971578, "grad_norm": 14.538978576660156, "learning_rate": 0.00046485812547054103, "loss": 7.5422, "step": 1291500 }, { "epoch": 5.261694549994958, "grad_norm": 16.678388595581055, "learning_rate": 0.0004645725868441886, "loss": 7.5793, "step": 1291600 }, { "epoch": 5.26210192801834, "grad_norm": 9.275223731994629, "learning_rate": 0.00046428712888684436, "loss": 7.5627, "step": 1291700 }, { "epoch": 5.262509306041721, "grad_norm": 5.30936861038208, "learning_rate": 0.00046400175160979455, "loss": 7.5381, "step": 1291800 }, { "epoch": 5.262916684065103, "grad_norm": 6.1422600746154785, "learning_rate": 0.0004637164550243218, "loss": 7.5604, "step": 1291900 }, { "epoch": 5.2633240620884845, "grad_norm": 6.921450614929199, "learning_rate": 0.00046343123914170523, "loss": 7.5401, "step": 1292000 }, { "epoch": 5.2633240620884845, "eval_MaskedAccuracy": 0.5128384343371457, "eval_loss": 1.581850528717041, "eval_runtime": 189.9637, "eval_samples_per_second": 334.148, "eval_steps_per_second": 1.306, "step": 1292000 }, { "epoch": 5.263731440111866, "grad_norm": 3.6345386505126953, "learning_rate": 0.0004631461039732202, "loss": 7.5429, "step": 1292100 }, { "epoch": 5.2641388181352475, "grad_norm": 5.500032424926758, "learning_rate": 0.00046286104953014123, "loss": 7.5973, "step": 1292200 }, { "epoch": 5.264546196158629, "grad_norm": 7.4409499168396, "learning_rate": 0.0004625760758237363, "loss": 7.5617, "step": 1292300 }, { "epoch": 5.264953574182011, "grad_norm": 3.640127182006836, "learning_rate": 0.00046229118286527266, "loss": 7.546, "step": 1292400 }, { "epoch": 5.265360952205392, "grad_norm": 3.7148308753967285, "learning_rate": 0.00046200637066601366, "loss": 7.5905, "step": 1292500 }, { "epoch": 5.265768330228774, "grad_norm": 3.0423319339752197, "learning_rate": 0.0004617216392372203, "loss": 7.5471, "step": 1292600 }, { "epoch": 5.266175708252154, "grad_norm": 21.06744956970215, "learning_rate": 0.00046143698859014846, "loss": 7.5274, "step": 1292700 }, { "epoch": 5.266583086275536, "grad_norm": 4.4424848556518555, "learning_rate": 0.00046115241873605244, "loss": 7.5555, "step": 1292800 }, { "epoch": 5.266990464298917, "grad_norm": 3.3005311489105225, "learning_rate": 0.0004608679296861823, "loss": 7.5234, "step": 1292900 }, { "epoch": 5.267397842322299, "grad_norm": 9.025586128234863, "learning_rate": 0.0004605835214517859, "loss": 7.5342, "step": 1293000 }, { "epoch": 5.267397842322299, "eval_MaskedAccuracy": 0.5121078317535273, "eval_loss": 1.5936518907546997, "eval_runtime": 182.2964, "eval_samples_per_second": 348.202, "eval_steps_per_second": 1.36, "step": 1293000 }, { "epoch": 5.26780522034568, "grad_norm": 7.081292629241943, "learning_rate": 0.0004602991940441072, "loss": 7.5365, "step": 1293100 }, { "epoch": 5.268212598369062, "grad_norm": 3.9597582817077637, "learning_rate": 0.00046001494747438705, "loss": 7.5489, "step": 1293200 }, { "epoch": 5.2686199763924435, "grad_norm": 18.178199768066406, "learning_rate": 0.0004597307817538632, "loss": 7.5318, "step": 1293300 }, { "epoch": 5.269027354415825, "grad_norm": 5.263289928436279, "learning_rate": 0.0004594466968937704, "loss": 7.5342, "step": 1293400 }, { "epoch": 5.269434732439207, "grad_norm": 7.937551498413086, "learning_rate": 0.00045916269290534025, "loss": 7.5707, "step": 1293500 }, { "epoch": 5.269842110462588, "grad_norm": 8.918220520019531, "learning_rate": 0.0004588787697998007, "loss": 7.5492, "step": 1293600 }, { "epoch": 5.27024948848597, "grad_norm": 14.999495506286621, "learning_rate": 0.00045859492758837676, "loss": 7.5527, "step": 1293700 }, { "epoch": 5.270656866509351, "grad_norm": 4.848311901092529, "learning_rate": 0.0004583111662822896, "loss": 7.5433, "step": 1293800 }, { "epoch": 5.271064244532733, "grad_norm": 6.287750720977783, "learning_rate": 0.000458027485892759, "loss": 7.559, "step": 1293900 }, { "epoch": 5.271471622556113, "grad_norm": 4.3590989112854, "learning_rate": 0.000457743886431, "loss": 7.5378, "step": 1294000 }, { "epoch": 5.271471622556113, "eval_MaskedAccuracy": 0.5122752705209045, "eval_loss": 1.5980956554412842, "eval_runtime": 164.4552, "eval_samples_per_second": 385.977, "eval_steps_per_second": 1.508, "step": 1294000 }, { "epoch": 5.271879000579495, "grad_norm": 6.657304286956787, "learning_rate": 0.00045746036790822486, "loss": 7.5515, "step": 1294100 }, { "epoch": 5.272286378602876, "grad_norm": 4.288999080657959, "learning_rate": 0.00045717693033564256, "loss": 7.5395, "step": 1294200 }, { "epoch": 5.272693756626258, "grad_norm": 4.8614888191223145, "learning_rate": 0.0004568935737244582, "loss": 7.5536, "step": 1294300 }, { "epoch": 5.2731011346496395, "grad_norm": 9.658740043640137, "learning_rate": 0.0004566102980858755, "loss": 7.5664, "step": 1294400 }, { "epoch": 5.273508512673021, "grad_norm": 17.376310348510742, "learning_rate": 0.00045632710343109303, "loss": 7.5423, "step": 1294500 }, { "epoch": 5.2739158906964025, "grad_norm": 17.489919662475586, "learning_rate": 0.00045604398977130707, "loss": 7.5552, "step": 1294600 }, { "epoch": 5.274323268719784, "grad_norm": 7.263301372528076, "learning_rate": 0.0004557609571177107, "loss": 7.5632, "step": 1294700 }, { "epoch": 5.274730646743166, "grad_norm": 18.212284088134766, "learning_rate": 0.00045547800548149307, "loss": 7.5522, "step": 1294800 }, { "epoch": 5.275138024766547, "grad_norm": 13.712687492370605, "learning_rate": 0.00045519513487384244, "loss": 7.5387, "step": 1294900 }, { "epoch": 5.275545402789929, "grad_norm": 12.00277328491211, "learning_rate": 0.00045491234530594147, "loss": 7.5094, "step": 1295000 }, { "epoch": 5.275545402789929, "eval_MaskedAccuracy": 0.5126189693204896, "eval_loss": 1.583979606628418, "eval_runtime": 177.2105, "eval_samples_per_second": 358.195, "eval_steps_per_second": 1.399, "step": 1295000 }, { "epoch": 5.27595278081331, "grad_norm": 5.409238815307617, "learning_rate": 0.00045462963678897, "loss": 7.538, "step": 1295100 }, { "epoch": 5.276360158836692, "grad_norm": 4.3116960525512695, "learning_rate": 0.00045434700933410496, "loss": 7.5659, "step": 1295200 }, { "epoch": 5.276767536860072, "grad_norm": 4.7283806800842285, "learning_rate": 0.00045406446295252035, "loss": 7.4981, "step": 1295300 }, { "epoch": 5.277174914883454, "grad_norm": 12.936450004577637, "learning_rate": 0.0004537819976553864, "loss": 7.5487, "step": 1295400 }, { "epoch": 5.2775822929068354, "grad_norm": 3.9384841918945312, "learning_rate": 0.00045349961345387077, "loss": 7.4983, "step": 1295500 }, { "epoch": 5.277989670930217, "grad_norm": 18.16901397705078, "learning_rate": 0.0004532173103591378, "loss": 7.5316, "step": 1295600 }, { "epoch": 5.2783970489535985, "grad_norm": 13.225839614868164, "learning_rate": 0.0004529350883823488, "loss": 7.5077, "step": 1295700 }, { "epoch": 5.27880442697698, "grad_norm": 12.580103874206543, "learning_rate": 0.0004526529475346595, "loss": 7.519, "step": 1295800 }, { "epoch": 5.279211805000362, "grad_norm": 9.558238983154297, "learning_rate": 0.00045237088782722657, "loss": 7.5222, "step": 1295900 }, { "epoch": 5.279619183023743, "grad_norm": 9.06231689453125, "learning_rate": 0.00045208890927120104, "loss": 7.5416, "step": 1296000 }, { "epoch": 5.279619183023743, "eval_MaskedAccuracy": 0.5123287434314, "eval_loss": 1.5975998640060425, "eval_runtime": 176.2005, "eval_samples_per_second": 360.249, "eval_steps_per_second": 1.407, "step": 1296000 }, { "epoch": 5.280026561047125, "grad_norm": 7.299903869628906, "learning_rate": 0.00045180701187773077, "loss": 7.5211, "step": 1296100 }, { "epoch": 5.280433939070506, "grad_norm": 9.481529235839844, "learning_rate": 0.0004515251956579602, "loss": 7.5459, "step": 1296200 }, { "epoch": 5.280841317093888, "grad_norm": 20.374595642089844, "learning_rate": 0.00045124346062303195, "loss": 7.5396, "step": 1296300 }, { "epoch": 5.281248695117269, "grad_norm": 14.360052108764648, "learning_rate": 0.0004509618067840833, "loss": 7.5444, "step": 1296400 }, { "epoch": 5.281656073140651, "grad_norm": 7.320785045623779, "learning_rate": 0.0004506802341522505, "loss": 7.5352, "step": 1296500 }, { "epoch": 5.282063451164031, "grad_norm": 25.894439697265625, "learning_rate": 0.0004503987427386654, "loss": 7.5397, "step": 1296600 }, { "epoch": 5.282470829187413, "grad_norm": 6.371466159820557, "learning_rate": 0.00045011733255445683, "loss": 7.5465, "step": 1296700 }, { "epoch": 5.2828782072107945, "grad_norm": 16.651531219482422, "learning_rate": 0.00044983600361075016, "loss": 7.5721, "step": 1296800 }, { "epoch": 5.283285585234176, "grad_norm": 6.61417818069458, "learning_rate": 0.00044955475591866757, "loss": 7.5564, "step": 1296900 }, { "epoch": 5.2836929632575576, "grad_norm": 14.974919319152832, "learning_rate": 0.00044927358948932945, "loss": 7.5771, "step": 1297000 }, { "epoch": 5.2836929632575576, "eval_MaskedAccuracy": 0.5125122033148928, "eval_loss": 1.5861791372299194, "eval_runtime": 158.5479, "eval_samples_per_second": 400.359, "eval_steps_per_second": 1.564, "step": 1297000 }, { "epoch": 5.284100341280939, "grad_norm": 11.632669448852539, "learning_rate": 0.0004489925043338509, "loss": 7.5322, "step": 1297100 }, { "epoch": 5.284507719304321, "grad_norm": 3.8050639629364014, "learning_rate": 0.0004487115004633452, "loss": 7.525, "step": 1297200 }, { "epoch": 5.284915097327702, "grad_norm": 14.257918357849121, "learning_rate": 0.00044843057788892165, "loss": 7.5417, "step": 1297300 }, { "epoch": 5.285322475351084, "grad_norm": 7.380938529968262, "learning_rate": 0.0004481497366216868, "loss": 7.5114, "step": 1297400 }, { "epoch": 5.285729853374465, "grad_norm": 3.516423225402832, "learning_rate": 0.0004478689766727435, "loss": 7.5464, "step": 1297500 }, { "epoch": 5.286137231397847, "grad_norm": 8.229839324951172, "learning_rate": 0.0004475882980531919, "loss": 7.5587, "step": 1297600 }, { "epoch": 5.286544609421227, "grad_norm": 7.977880477905273, "learning_rate": 0.00044730770077412875, "loss": 7.5146, "step": 1297700 }, { "epoch": 5.286951987444609, "grad_norm": 9.316288948059082, "learning_rate": 0.0004470271848466471, "loss": 7.4973, "step": 1297800 }, { "epoch": 5.2873593654679905, "grad_norm": 7.418075084686279, "learning_rate": 0.00044674675028183816, "loss": 7.5378, "step": 1297900 }, { "epoch": 5.287766743491372, "grad_norm": 13.113317489624023, "learning_rate": 0.0004464663970907882, "loss": 7.5471, "step": 1298000 }, { "epoch": 5.287766743491372, "eval_MaskedAccuracy": 0.5122624171890003, "eval_loss": 1.593355417251587, "eval_runtime": 163.0497, "eval_samples_per_second": 389.305, "eval_steps_per_second": 1.521, "step": 1298000 }, { "epoch": 5.2881741215147535, "grad_norm": 4.2268781661987305, "learning_rate": 0.00044618612528458197, "loss": 7.5563, "step": 1298100 }, { "epoch": 5.288581499538135, "grad_norm": 3.9747097492218018, "learning_rate": 0.00044590593487429876, "loss": 7.5446, "step": 1298200 }, { "epoch": 5.288988877561517, "grad_norm": 4.4901862144470215, "learning_rate": 0.0004456258258710172, "loss": 7.5498, "step": 1298300 }, { "epoch": 5.289396255584898, "grad_norm": 15.346317291259766, "learning_rate": 0.0004453457982858112, "loss": 7.517, "step": 1298400 }, { "epoch": 5.28980363360828, "grad_norm": 7.432754993438721, "learning_rate": 0.0004450658521297512, "loss": 7.5307, "step": 1298500 }, { "epoch": 5.290211011631661, "grad_norm": 15.85821533203125, "learning_rate": 0.0004447859874139055, "loss": 7.5595, "step": 1298600 }, { "epoch": 5.290618389655043, "grad_norm": 3.3669979572296143, "learning_rate": 0.00044450620414933833, "loss": 7.5576, "step": 1298700 }, { "epoch": 5.291025767678424, "grad_norm": 8.374113082885742, "learning_rate": 0.0004442265023471109, "loss": 7.5313, "step": 1298800 }, { "epoch": 5.291433145701806, "grad_norm": 13.105899810791016, "learning_rate": 0.0004439468820182816, "loss": 7.5872, "step": 1298900 }, { "epoch": 5.291840523725186, "grad_norm": 5.814199447631836, "learning_rate": 0.00044366734317390614, "loss": 7.5416, "step": 1299000 }, { "epoch": 5.291840523725186, "eval_MaskedAccuracy": 0.5128568836943566, "eval_loss": 1.5909806489944458, "eval_runtime": 157.3225, "eval_samples_per_second": 403.477, "eval_steps_per_second": 1.576, "step": 1299000 }, { "epoch": 5.292247901748568, "grad_norm": 12.49307632446289, "learning_rate": 0.00044338788582503507, "loss": 7.5241, "step": 1299100 }, { "epoch": 5.2926552797719495, "grad_norm": 11.495176315307617, "learning_rate": 0.0004431085099827177, "loss": 7.5259, "step": 1299200 }, { "epoch": 5.293062657795331, "grad_norm": 18.209365844726562, "learning_rate": 0.0004428292156579981, "loss": 7.5491, "step": 1299300 }, { "epoch": 5.293470035818713, "grad_norm": 15.301490783691406, "learning_rate": 0.0004425500028619188, "loss": 7.5406, "step": 1299400 }, { "epoch": 5.293877413842094, "grad_norm": 15.960002899169922, "learning_rate": 0.0004422708716055185, "loss": 7.5214, "step": 1299500 }, { "epoch": 5.294284791865476, "grad_norm": 12.808664321899414, "learning_rate": 0.0004419918218998331, "loss": 7.5618, "step": 1299600 }, { "epoch": 5.294692169888857, "grad_norm": 17.56773567199707, "learning_rate": 0.0004417128537558946, "loss": 7.5269, "step": 1299700 }, { "epoch": 5.295099547912239, "grad_norm": 16.31693458557129, "learning_rate": 0.0004414339671847317, "loss": 7.5218, "step": 1299800 }, { "epoch": 5.29550692593562, "grad_norm": 9.28150749206543, "learning_rate": 0.0004411551621973715, "loss": 7.5507, "step": 1299900 }, { "epoch": 5.295914303959002, "grad_norm": 4.1325249671936035, "learning_rate": 0.00044087643880483604, "loss": 7.5484, "step": 1300000 }, { "epoch": 5.295914303959002, "eval_MaskedAccuracy": 0.5128927888927057, "eval_loss": 1.5847995281219482, "eval_runtime": 158.4338, "eval_samples_per_second": 400.647, "eval_steps_per_second": 1.565, "step": 1300000 }, { "epoch": 5.296321681982383, "grad_norm": 8.101085662841797, "learning_rate": 0.0004405977970181446, "loss": 7.5637, "step": 1300100 }, { "epoch": 5.296729060005765, "grad_norm": 7.415262222290039, "learning_rate": 0.00044031923684831337, "loss": 7.5345, "step": 1300200 }, { "epoch": 5.2971364380291455, "grad_norm": 9.402379035949707, "learning_rate": 0.0004400407583063554, "loss": 7.5537, "step": 1300300 }, { "epoch": 5.297543816052527, "grad_norm": 13.689456939697266, "learning_rate": 0.0004397623614032804, "loss": 7.559, "step": 1300400 }, { "epoch": 5.2979511940759085, "grad_norm": 16.62010955810547, "learning_rate": 0.00043948404615009484, "loss": 7.5558, "step": 1300500 }, { "epoch": 5.29835857209929, "grad_norm": 22.214908599853516, "learning_rate": 0.00043920581255780237, "loss": 7.5524, "step": 1300600 }, { "epoch": 5.298765950122672, "grad_norm": 15.187567710876465, "learning_rate": 0.0004389276606374029, "loss": 7.5675, "step": 1300700 }, { "epoch": 5.299173328146053, "grad_norm": 9.365309715270996, "learning_rate": 0.0004386495903998932, "loss": 7.5302, "step": 1300800 }, { "epoch": 5.299580706169435, "grad_norm": 13.13938045501709, "learning_rate": 0.0004383716018562663, "loss": 7.5481, "step": 1300900 }, { "epoch": 5.299988084192816, "grad_norm": 17.353683471679688, "learning_rate": 0.00043809369501751324, "loss": 7.5162, "step": 1301000 }, { "epoch": 5.299988084192816, "eval_MaskedAccuracy": 0.5124460678659971, "eval_loss": 1.5921168327331543, "eval_runtime": 174.3072, "eval_samples_per_second": 364.162, "eval_steps_per_second": 1.423, "step": 1301000 }, { "epoch": 5.300395462216198, "grad_norm": 5.176187515258789, "learning_rate": 0.0004378158698946208, "loss": 7.5158, "step": 1301100 }, { "epoch": 5.300802840239579, "grad_norm": 29.1513614654541, "learning_rate": 0.0004375381264985733, "loss": 7.5413, "step": 1301200 }, { "epoch": 5.301210218262961, "grad_norm": 8.285941123962402, "learning_rate": 0.00043726046484035075, "loss": 7.5578, "step": 1301300 }, { "epoch": 5.301617596286342, "grad_norm": 20.85906410217285, "learning_rate": 0.0004369828849309314, "loss": 7.5582, "step": 1301400 }, { "epoch": 5.302024974309724, "grad_norm": 9.398735046386719, "learning_rate": 0.0004367053867812896, "loss": 7.5808, "step": 1301500 }, { "epoch": 5.3024323523331045, "grad_norm": 3.6359591484069824, "learning_rate": 0.0004364279704023959, "loss": 7.5516, "step": 1301600 }, { "epoch": 5.302839730356486, "grad_norm": 9.873467445373535, "learning_rate": 0.0004361506358052178, "loss": 7.5499, "step": 1301700 }, { "epoch": 5.303247108379868, "grad_norm": 10.540246963500977, "learning_rate": 0.0004358733830007199, "loss": 7.5431, "step": 1301800 }, { "epoch": 5.303654486403249, "grad_norm": 14.299480438232422, "learning_rate": 0.0004355962119998637, "loss": 7.5704, "step": 1301900 }, { "epoch": 5.304061864426631, "grad_norm": 4.484654426574707, "learning_rate": 0.0004353191228136075, "loss": 7.5667, "step": 1302000 }, { "epoch": 5.304061864426631, "eval_MaskedAccuracy": 0.5124708999618712, "eval_loss": 1.5887837409973145, "eval_runtime": 160.61, "eval_samples_per_second": 395.218, "eval_steps_per_second": 1.544, "step": 1302000 }, { "epoch": 5.304469242450012, "grad_norm": 17.846599578857422, "learning_rate": 0.0004350421154529061, "loss": 7.5863, "step": 1302100 }, { "epoch": 5.304876620473394, "grad_norm": 7.631462574005127, "learning_rate": 0.0004347651899287102, "loss": 7.5704, "step": 1302200 }, { "epoch": 5.305283998496775, "grad_norm": 7.09050989151001, "learning_rate": 0.00043448834625196874, "loss": 7.5547, "step": 1302300 }, { "epoch": 5.305691376520157, "grad_norm": 13.030946731567383, "learning_rate": 0.0004342115844336266, "loss": 7.5227, "step": 1302400 }, { "epoch": 5.306098754543538, "grad_norm": 19.24678611755371, "learning_rate": 0.00043393490448462616, "loss": 7.5469, "step": 1302500 }, { "epoch": 5.30650613256692, "grad_norm": 12.20937728881836, "learning_rate": 0.0004336583064159059, "loss": 7.5525, "step": 1302600 }, { "epoch": 5.3069135105903005, "grad_norm": 4.572175979614258, "learning_rate": 0.0004333817902384007, "loss": 7.5473, "step": 1302700 }, { "epoch": 5.307320888613682, "grad_norm": 7.789766311645508, "learning_rate": 0.0004331053559630435, "loss": 7.5323, "step": 1302800 }, { "epoch": 5.3077282666370635, "grad_norm": 7.178048610687256, "learning_rate": 0.000432829003600763, "loss": 7.5074, "step": 1302900 }, { "epoch": 5.308135644660445, "grad_norm": 14.05922794342041, "learning_rate": 0.0004325527331624844, "loss": 7.5447, "step": 1303000 }, { "epoch": 5.308135644660445, "eval_MaskedAccuracy": 0.5133053084697339, "eval_loss": 1.5874871015548706, "eval_runtime": 173.7811, "eval_samples_per_second": 365.264, "eval_steps_per_second": 1.427, "step": 1303000 }, { "epoch": 5.308543022683827, "grad_norm": 11.715259552001953, "learning_rate": 0.000432276544659131, "loss": 7.5571, "step": 1303100 }, { "epoch": 5.308950400707208, "grad_norm": 5.756912708282471, "learning_rate": 0.0004320004381016209, "loss": 7.5376, "step": 1303200 }, { "epoch": 5.30935777873059, "grad_norm": 19.225601196289062, "learning_rate": 0.00043172441350087116, "loss": 7.5668, "step": 1303300 }, { "epoch": 5.309765156753971, "grad_norm": 13.775434494018555, "learning_rate": 0.00043144847086779356, "loss": 7.5564, "step": 1303400 }, { "epoch": 5.310172534777353, "grad_norm": 11.389219284057617, "learning_rate": 0.00043117261021329814, "loss": 7.5481, "step": 1303500 }, { "epoch": 5.310579912800734, "grad_norm": 15.047318458557129, "learning_rate": 0.00043089683154829133, "loss": 7.5344, "step": 1303600 }, { "epoch": 5.310987290824116, "grad_norm": 7.878559589385986, "learning_rate": 0.0004306211348836748, "loss": 7.5367, "step": 1303700 }, { "epoch": 5.311394668847497, "grad_norm": 15.258581161499023, "learning_rate": 0.00043034552023034985, "loss": 7.5238, "step": 1303800 }, { "epoch": 5.311802046870879, "grad_norm": 4.573291778564453, "learning_rate": 0.00043006998759921263, "loss": 7.538, "step": 1303900 }, { "epoch": 5.3122094248942595, "grad_norm": 5.569906234741211, "learning_rate": 0.00042979453700115654, "loss": 7.5354, "step": 1304000 }, { "epoch": 5.3122094248942595, "eval_MaskedAccuracy": 0.51245261577088, "eval_loss": 1.5892739295959473, "eval_runtime": 157.9994, "eval_samples_per_second": 401.748, "eval_steps_per_second": 1.57, "step": 1304000 }, { "epoch": 5.312616802917641, "grad_norm": 25.85943603515625, "learning_rate": 0.0004295191684470714, "loss": 7.5452, "step": 1304100 }, { "epoch": 5.313024180941023, "grad_norm": 8.31718635559082, "learning_rate": 0.000429243881947844, "loss": 7.5537, "step": 1304200 }, { "epoch": 5.313431558964404, "grad_norm": 5.419006824493408, "learning_rate": 0.0004289686775143576, "loss": 7.5556, "step": 1304300 }, { "epoch": 5.313838936987786, "grad_norm": 5.167078495025635, "learning_rate": 0.000428693555157493, "loss": 7.5566, "step": 1304400 }, { "epoch": 5.314246315011167, "grad_norm": 6.955180644989014, "learning_rate": 0.00042841851488812705, "loss": 7.5608, "step": 1304500 }, { "epoch": 5.314653693034549, "grad_norm": 15.398619651794434, "learning_rate": 0.00042814355671713335, "loss": 7.5092, "step": 1304600 }, { "epoch": 5.31506107105793, "grad_norm": 4.185786247253418, "learning_rate": 0.00042786868065538197, "loss": 7.5604, "step": 1304700 }, { "epoch": 5.315468449081312, "grad_norm": 10.055868148803711, "learning_rate": 0.0004275938867137414, "loss": 7.5345, "step": 1304800 }, { "epoch": 5.315875827104693, "grad_norm": 6.670746803283691, "learning_rate": 0.00042731917490307575, "loss": 7.5433, "step": 1304900 }, { "epoch": 5.316283205128075, "grad_norm": 10.293941497802734, "learning_rate": 0.0004270445452342453, "loss": 7.5425, "step": 1305000 }, { "epoch": 5.316283205128075, "eval_MaskedAccuracy": 0.5132595393285279, "eval_loss": 1.5809528827667236, "eval_runtime": 170.4779, "eval_samples_per_second": 372.342, "eval_steps_per_second": 1.455, "step": 1305000 }, { "epoch": 5.316690583151456, "grad_norm": 9.087493896484375, "learning_rate": 0.0004267699977181075, "loss": 7.5326, "step": 1305100 }, { "epoch": 5.317097961174838, "grad_norm": 9.560127258300781, "learning_rate": 0.0004264955323655171, "loss": 7.52, "step": 1305200 }, { "epoch": 5.3175053391982186, "grad_norm": 2.9714224338531494, "learning_rate": 0.0004262211491873249, "loss": 7.5339, "step": 1305300 }, { "epoch": 5.3179127172216, "grad_norm": 12.012248039245605, "learning_rate": 0.00042594684819437825, "loss": 7.5305, "step": 1305400 }, { "epoch": 5.318320095244982, "grad_norm": 22.136690139770508, "learning_rate": 0.0004256726293975226, "loss": 7.5278, "step": 1305500 }, { "epoch": 5.318727473268363, "grad_norm": 11.67299747467041, "learning_rate": 0.00042539849280759904, "loss": 7.5272, "step": 1305600 }, { "epoch": 5.319134851291745, "grad_norm": 18.251789093017578, "learning_rate": 0.0004251244384354456, "loss": 7.5241, "step": 1305700 }, { "epoch": 5.319542229315126, "grad_norm": 11.959935188293457, "learning_rate": 0.0004248504662918969, "loss": 7.5191, "step": 1305800 }, { "epoch": 5.319949607338508, "grad_norm": 3.573176145553589, "learning_rate": 0.0004245765763877847, "loss": 7.4944, "step": 1305900 }, { "epoch": 5.320356985361889, "grad_norm": 12.186860084533691, "learning_rate": 0.0004243027687339373, "loss": 7.5227, "step": 1306000 }, { "epoch": 5.320356985361889, "eval_MaskedAccuracy": 0.5127432091581579, "eval_loss": 1.5990056991577148, "eval_runtime": 175.7337, "eval_samples_per_second": 361.206, "eval_steps_per_second": 1.411, "step": 1306000 }, { "epoch": 5.320764363385271, "grad_norm": 4.960899829864502, "learning_rate": 0.0004240290433411798, "loss": 7.5576, "step": 1306100 }, { "epoch": 5.321171741408652, "grad_norm": 3.94295334815979, "learning_rate": 0.00042375540022033403, "loss": 7.5552, "step": 1306200 }, { "epoch": 5.321579119432034, "grad_norm": 10.346899032592773, "learning_rate": 0.00042348183938221843, "loss": 7.5124, "step": 1306300 }, { "epoch": 5.321986497455415, "grad_norm": 28.08512306213379, "learning_rate": 0.0004232083608376485, "loss": 7.5388, "step": 1306400 }, { "epoch": 5.322393875478797, "grad_norm": 5.102529048919678, "learning_rate": 0.00042293496459743673, "loss": 7.5298, "step": 1306500 }, { "epoch": 5.322801253502178, "grad_norm": 6.559290885925293, "learning_rate": 0.0004226616506723909, "loss": 7.5538, "step": 1306600 }, { "epoch": 5.323208631525559, "grad_norm": 21.693456649780273, "learning_rate": 0.0004223884190733167, "loss": 7.5271, "step": 1306700 }, { "epoch": 5.323616009548941, "grad_norm": 4.1509809494018555, "learning_rate": 0.00042211526981101715, "loss": 7.5492, "step": 1306800 }, { "epoch": 5.324023387572322, "grad_norm": 10.603466987609863, "learning_rate": 0.00042184220289629115, "loss": 7.5463, "step": 1306900 }, { "epoch": 5.324430765595704, "grad_norm": 5.296978950500488, "learning_rate": 0.0004215692183399348, "loss": 7.5322, "step": 1307000 }, { "epoch": 5.324430765595704, "eval_MaskedAccuracy": 0.5130697215877362, "eval_loss": 1.5880545377731323, "eval_runtime": 171.6127, "eval_samples_per_second": 369.879, "eval_steps_per_second": 1.445, "step": 1307000 }, { "epoch": 5.324838143619085, "grad_norm": 6.5792436599731445, "learning_rate": 0.00042129631615274004, "loss": 7.5571, "step": 1307100 }, { "epoch": 5.325245521642467, "grad_norm": 5.165150165557861, "learning_rate": 0.0004210234963454961, "loss": 7.5089, "step": 1307200 }, { "epoch": 5.325652899665848, "grad_norm": 16.115964889526367, "learning_rate": 0.00042075075892898976, "loss": 7.5474, "step": 1307300 }, { "epoch": 5.32606027768923, "grad_norm": 9.52346420288086, "learning_rate": 0.00042047810391400284, "loss": 7.5395, "step": 1307400 }, { "epoch": 5.326467655712611, "grad_norm": 18.871601104736328, "learning_rate": 0.0004202055313113151, "loss": 7.5501, "step": 1307500 }, { "epoch": 5.326875033735993, "grad_norm": 19.175764083862305, "learning_rate": 0.00041993304113170307, "loss": 7.5676, "step": 1307600 }, { "epoch": 5.327282411759374, "grad_norm": 26.013240814208984, "learning_rate": 0.0004196606333859392, "loss": 7.5426, "step": 1307700 }, { "epoch": 5.327689789782755, "grad_norm": 8.625907897949219, "learning_rate": 0.00041938830808479463, "loss": 7.5382, "step": 1307800 }, { "epoch": 5.328097167806137, "grad_norm": 5.5241618156433105, "learning_rate": 0.00041911606523903437, "loss": 7.5174, "step": 1307900 }, { "epoch": 5.328504545829518, "grad_norm": 7.408745288848877, "learning_rate": 0.0004188439048594226, "loss": 7.5024, "step": 1308000 }, { "epoch": 5.328504545829518, "eval_MaskedAccuracy": 0.5130457488440296, "eval_loss": 1.5893456935882568, "eval_runtime": 167.7813, "eval_samples_per_second": 378.326, "eval_steps_per_second": 1.478, "step": 1308000 }, { "epoch": 5.3289119238529, "grad_norm": 20.802061080932617, "learning_rate": 0.0004185718269567188, "loss": 7.5287, "step": 1308100 }, { "epoch": 5.329319301876281, "grad_norm": 13.224457740783691, "learning_rate": 0.0004182998315416791, "loss": 7.5293, "step": 1308200 }, { "epoch": 5.329726679899663, "grad_norm": 13.797801971435547, "learning_rate": 0.00041802791862505803, "loss": 7.5389, "step": 1308300 }, { "epoch": 5.330134057923044, "grad_norm": 10.780403137207031, "learning_rate": 0.0004177560882176052, "loss": 7.5695, "step": 1308400 }, { "epoch": 5.330541435946426, "grad_norm": 24.98788070678711, "learning_rate": 0.0004174843403300676, "loss": 7.5097, "step": 1308500 }, { "epoch": 5.330948813969807, "grad_norm": 17.747987747192383, "learning_rate": 0.0004172126749731888, "loss": 7.5208, "step": 1308600 }, { "epoch": 5.331356191993189, "grad_norm": 16.77276039123535, "learning_rate": 0.0004169410921577086, "loss": 7.5197, "step": 1308700 }, { "epoch": 5.33176357001657, "grad_norm": 16.676837921142578, "learning_rate": 0.00041666959189436543, "loss": 7.5426, "step": 1308800 }, { "epoch": 5.332170948039952, "grad_norm": 19.803956985473633, "learning_rate": 0.0004163981741938923, "loss": 7.5245, "step": 1308900 }, { "epoch": 5.332578326063333, "grad_norm": 17.46125030517578, "learning_rate": 0.0004161268390670199, "loss": 7.5737, "step": 1309000 }, { "epoch": 5.332578326063333, "eval_MaskedAccuracy": 0.5129704649628153, "eval_loss": 1.5894827842712402, "eval_runtime": 161.9856, "eval_samples_per_second": 391.862, "eval_steps_per_second": 1.531, "step": 1309000 }, { "epoch": 5.332985704086714, "grad_norm": 7.310243129730225, "learning_rate": 0.00041585558652447593, "loss": 7.5035, "step": 1309100 }, { "epoch": 5.333393082110096, "grad_norm": 5.5625319480896, "learning_rate": 0.0004155844165769841, "loss": 7.5365, "step": 1309200 }, { "epoch": 5.333800460133477, "grad_norm": 18.815752029418945, "learning_rate": 0.00041531332923526497, "loss": 7.509, "step": 1309300 }, { "epoch": 5.334207838156859, "grad_norm": 13.04920482635498, "learning_rate": 0.0004150423245100366, "loss": 7.5357, "step": 1309400 }, { "epoch": 5.33461521618024, "grad_norm": 5.087550640106201, "learning_rate": 0.00041477140241201276, "loss": 7.5255, "step": 1309500 }, { "epoch": 5.335022594203622, "grad_norm": 5.4975056648254395, "learning_rate": 0.0004145005629519049, "loss": 7.5456, "step": 1309600 }, { "epoch": 5.335429972227003, "grad_norm": 10.997625350952148, "learning_rate": 0.00041422980614042065, "loss": 7.5531, "step": 1309700 }, { "epoch": 5.335837350250385, "grad_norm": 12.89289665222168, "learning_rate": 0.0004139591319882645, "loss": 7.5631, "step": 1309800 }, { "epoch": 5.336244728273766, "grad_norm": 10.376225471496582, "learning_rate": 0.0004136885405061371, "loss": 7.5583, "step": 1309900 }, { "epoch": 5.336652106297148, "grad_norm": 4.5756516456604, "learning_rate": 0.0004134180317047375, "loss": 7.5299, "step": 1310000 }, { "epoch": 5.336652106297148, "eval_MaskedAccuracy": 0.5123939334447258, "eval_loss": 1.5862807035446167, "eval_runtime": 174.3439, "eval_samples_per_second": 364.085, "eval_steps_per_second": 1.422, "step": 1310000 }, { "epoch": 5.3370594843205295, "grad_norm": 6.5817952156066895, "learning_rate": 0.0004131476055947595, "loss": 7.5295, "step": 1310100 }, { "epoch": 5.337466862343911, "grad_norm": 9.626608848571777, "learning_rate": 0.00041287726218689375, "loss": 7.5487, "step": 1310200 }, { "epoch": 5.337874240367292, "grad_norm": 13.104557991027832, "learning_rate": 0.0004126070014918302, "loss": 7.5388, "step": 1310300 }, { "epoch": 5.338281618390673, "grad_norm": 10.869489669799805, "learning_rate": 0.0004123368235202523, "loss": 7.5224, "step": 1310400 }, { "epoch": 5.338688996414055, "grad_norm": 8.736258506774902, "learning_rate": 0.0004120667282828421, "loss": 7.5527, "step": 1310500 }, { "epoch": 5.339096374437436, "grad_norm": 17.685625076293945, "learning_rate": 0.00041179671579027826, "loss": 7.5197, "step": 1310600 }, { "epoch": 5.339503752460818, "grad_norm": 23.2597713470459, "learning_rate": 0.00041152678605323536, "loss": 7.5461, "step": 1310700 }, { "epoch": 5.339911130484199, "grad_norm": 5.2023162841796875, "learning_rate": 0.0004112569390823852, "loss": 7.5447, "step": 1310800 }, { "epoch": 5.340318508507581, "grad_norm": 10.159354209899902, "learning_rate": 0.0004109871748883967, "loss": 7.5404, "step": 1310900 }, { "epoch": 5.340725886530962, "grad_norm": 13.514359474182129, "learning_rate": 0.00041071749348193447, "loss": 7.5503, "step": 1311000 }, { "epoch": 5.340725886530962, "eval_MaskedAccuracy": 0.5126926209186771, "eval_loss": 1.5796821117401123, "eval_runtime": 183.4886, "eval_samples_per_second": 345.94, "eval_steps_per_second": 1.352, "step": 1311000 }, { "epoch": 5.341133264554344, "grad_norm": 8.904132843017578, "learning_rate": 0.00041044789487366105, "loss": 7.5385, "step": 1311100 }, { "epoch": 5.341540642577725, "grad_norm": 20.313217163085938, "learning_rate": 0.0004101783790742347, "loss": 7.5527, "step": 1311200 }, { "epoch": 5.341948020601107, "grad_norm": 17.716442108154297, "learning_rate": 0.0004099089460943104, "loss": 7.5334, "step": 1311300 }, { "epoch": 5.3423553986244885, "grad_norm": 15.50544261932373, "learning_rate": 0.0004096395959445412, "loss": 7.5475, "step": 1311400 }, { "epoch": 5.34276277664787, "grad_norm": 3.44032621383667, "learning_rate": 0.00040937032863557567, "loss": 7.542, "step": 1311500 }, { "epoch": 5.343170154671251, "grad_norm": 4.819403171539307, "learning_rate": 0.00040910114417805815, "loss": 7.5638, "step": 1311600 }, { "epoch": 5.343577532694632, "grad_norm": 12.281195640563965, "learning_rate": 0.0004088320425826329, "loss": 7.529, "step": 1311700 }, { "epoch": 5.343984910718014, "grad_norm": 12.83141803741455, "learning_rate": 0.00040856302385993814, "loss": 7.5529, "step": 1311800 }, { "epoch": 5.344392288741395, "grad_norm": 3.5825467109680176, "learning_rate": 0.0004082940880206096, "loss": 7.52, "step": 1311900 }, { "epoch": 5.344799666764777, "grad_norm": 18.3764705657959, "learning_rate": 0.00040802523507527984, "loss": 7.5457, "step": 1312000 }, { "epoch": 5.344799666764777, "eval_MaskedAccuracy": 0.5125034261440891, "eval_loss": 1.5950565338134766, "eval_runtime": 198.6051, "eval_samples_per_second": 319.609, "eval_steps_per_second": 1.249, "step": 1312000 }, { "epoch": 5.345207044788158, "grad_norm": 8.003653526306152, "learning_rate": 0.00040775646503457764, "loss": 7.5668, "step": 1312100 }, { "epoch": 5.34561442281154, "grad_norm": 13.133673667907715, "learning_rate": 0.0004074877779091297, "loss": 7.5334, "step": 1312200 }, { "epoch": 5.346021800834921, "grad_norm": 6.1893486976623535, "learning_rate": 0.00040721917370955814, "loss": 7.5845, "step": 1312300 }, { "epoch": 5.346429178858303, "grad_norm": 9.516192436218262, "learning_rate": 0.00040695065244648204, "loss": 7.5335, "step": 1312400 }, { "epoch": 5.3468365568816845, "grad_norm": 21.823017120361328, "learning_rate": 0.00040668221413051774, "loss": 7.4975, "step": 1312500 }, { "epoch": 5.347243934905066, "grad_norm": 12.48024845123291, "learning_rate": 0.0004064138587722779, "loss": 7.5477, "step": 1312600 }, { "epoch": 5.347651312928447, "grad_norm": 14.158882141113281, "learning_rate": 0.00040614558638237246, "loss": 7.5063, "step": 1312700 }, { "epoch": 5.348058690951828, "grad_norm": 7.386480808258057, "learning_rate": 0.00040587739697140753, "loss": 7.5617, "step": 1312800 }, { "epoch": 5.34846606897521, "grad_norm": 8.702553749084473, "learning_rate": 0.00040560929054998617, "loss": 7.5234, "step": 1312900 }, { "epoch": 5.348873446998591, "grad_norm": 13.563511848449707, "learning_rate": 0.00040534126712870813, "loss": 7.5658, "step": 1313000 }, { "epoch": 5.348873446998591, "eval_MaskedAccuracy": 0.5124022373641511, "eval_loss": 1.5936604738235474, "eval_runtime": 183.3132, "eval_samples_per_second": 346.271, "eval_steps_per_second": 1.353, "step": 1313000 }, { "epoch": 5.349280825021973, "grad_norm": 9.220017433166504, "learning_rate": 0.00040507332671816873, "loss": 7.559, "step": 1313100 }, { "epoch": 5.349688203045354, "grad_norm": 10.695631980895996, "learning_rate": 0.000404805469328962, "loss": 7.5328, "step": 1313200 }, { "epoch": 5.350095581068736, "grad_norm": 10.029516220092773, "learning_rate": 0.00040453769497167803, "loss": 7.5139, "step": 1313300 }, { "epoch": 5.350502959092117, "grad_norm": 12.86770248413086, "learning_rate": 0.0004042700036569035, "loss": 7.5435, "step": 1313400 }, { "epoch": 5.350910337115499, "grad_norm": 5.891438961029053, "learning_rate": 0.00040400239539522084, "loss": 7.5367, "step": 1313500 }, { "epoch": 5.35131771513888, "grad_norm": 8.02744197845459, "learning_rate": 0.0004037348701972099, "loss": 7.5255, "step": 1313600 }, { "epoch": 5.351725093162262, "grad_norm": 19.7856388092041, "learning_rate": 0.00040346742807344845, "loss": 7.5718, "step": 1313700 }, { "epoch": 5.3521324711856435, "grad_norm": 9.17331600189209, "learning_rate": 0.00040320006903450976, "loss": 7.5278, "step": 1313800 }, { "epoch": 5.352539849209025, "grad_norm": 5.395483016967773, "learning_rate": 0.0004029327930909628, "loss": 7.552, "step": 1313900 }, { "epoch": 5.352947227232406, "grad_norm": 7.13218879699707, "learning_rate": 0.0004026656002533755, "loss": 7.5595, "step": 1314000 }, { "epoch": 5.352947227232406, "eval_MaskedAccuracy": 0.5126087552568914, "eval_loss": 1.5851060152053833, "eval_runtime": 182.1563, "eval_samples_per_second": 348.47, "eval_steps_per_second": 1.361, "step": 1314000 }, { "epoch": 5.353354605255787, "grad_norm": 13.182037353515625, "learning_rate": 0.0004023984905323112, "loss": 7.5605, "step": 1314100 }, { "epoch": 5.353761983279169, "grad_norm": 12.281851768493652, "learning_rate": 0.00040213146393833, "loss": 7.5284, "step": 1314200 }, { "epoch": 5.35416936130255, "grad_norm": 5.3217949867248535, "learning_rate": 0.000401864520481989, "loss": 7.5249, "step": 1314300 }, { "epoch": 5.354576739325932, "grad_norm": 20.306541442871094, "learning_rate": 0.0004015976601738416, "loss": 7.5326, "step": 1314400 }, { "epoch": 5.354984117349313, "grad_norm": 6.144830226898193, "learning_rate": 0.0004013308830244386, "loss": 7.5371, "step": 1314500 }, { "epoch": 5.355391495372695, "grad_norm": 5.244483470916748, "learning_rate": 0.00040106418904432725, "loss": 7.5152, "step": 1314600 }, { "epoch": 5.355798873396076, "grad_norm": 5.141518592834473, "learning_rate": 0.00040079757824405083, "loss": 7.5004, "step": 1314700 }, { "epoch": 5.356206251419458, "grad_norm": 8.617745399475098, "learning_rate": 0.00040053105063415055, "loss": 7.5898, "step": 1314800 }, { "epoch": 5.3566136294428395, "grad_norm": 6.266997337341309, "learning_rate": 0.00040026460622516355, "loss": 7.4993, "step": 1314900 }, { "epoch": 5.357021007466221, "grad_norm": 22.494998931884766, "learning_rate": 0.0003999982450276234, "loss": 7.5457, "step": 1315000 }, { "epoch": 5.357021007466221, "eval_MaskedAccuracy": 0.5129410097786962, "eval_loss": 1.588871717453003, "eval_runtime": 165.149, "eval_samples_per_second": 384.356, "eval_steps_per_second": 1.502, "step": 1315000 }, { "epoch": 5.3574283854896025, "grad_norm": 23.0819091796875, "learning_rate": 0.0003997319670520608, "loss": 7.5326, "step": 1315100 }, { "epoch": 5.357835763512984, "grad_norm": 22.44310760498047, "learning_rate": 0.0003994657723090032, "loss": 7.5444, "step": 1315200 }, { "epoch": 5.358243141536365, "grad_norm": 8.22282886505127, "learning_rate": 0.00039919966080897485, "loss": 7.5606, "step": 1315300 }, { "epoch": 5.358650519559746, "grad_norm": 20.609516143798828, "learning_rate": 0.0003989336325624969, "loss": 7.5357, "step": 1315400 }, { "epoch": 5.359057897583128, "grad_norm": 6.432299613952637, "learning_rate": 0.0003986676875800865, "loss": 7.5586, "step": 1315500 }, { "epoch": 5.359465275606509, "grad_norm": 6.51712703704834, "learning_rate": 0.00039840182587225713, "loss": 7.5492, "step": 1315600 }, { "epoch": 5.359872653629891, "grad_norm": 8.8421630859375, "learning_rate": 0.00039813604744952066, "loss": 7.5369, "step": 1315700 }, { "epoch": 5.360280031653272, "grad_norm": 4.876821994781494, "learning_rate": 0.0003978703523223846, "loss": 7.5407, "step": 1315800 }, { "epoch": 5.360687409676654, "grad_norm": 9.427638053894043, "learning_rate": 0.000397604740501353, "loss": 7.5552, "step": 1315900 }, { "epoch": 5.3610947877000354, "grad_norm": 12.400432586669922, "learning_rate": 0.00039733921199692725, "loss": 7.5291, "step": 1316000 }, { "epoch": 5.3610947877000354, "eval_MaskedAccuracy": 0.5122422487512329, "eval_loss": 1.597111701965332, "eval_runtime": 194.252, "eval_samples_per_second": 326.771, "eval_steps_per_second": 1.277, "step": 1316000 }, { "epoch": 5.361502165723417, "grad_norm": 18.319015502929688, "learning_rate": 0.00039707376681960555, "loss": 7.5306, "step": 1316100 }, { "epoch": 5.3619095437467985, "grad_norm": 6.772778034210205, "learning_rate": 0.0003968084049798815, "loss": 7.5176, "step": 1316200 }, { "epoch": 5.36231692177018, "grad_norm": 10.569714546203613, "learning_rate": 0.00039654312648824665, "loss": 7.52, "step": 1316300 }, { "epoch": 5.362724299793562, "grad_norm": 12.517595291137695, "learning_rate": 0.00039627793135518866, "loss": 7.5064, "step": 1316400 }, { "epoch": 5.363131677816943, "grad_norm": 8.718416213989258, "learning_rate": 0.0003960128195911921, "loss": 7.5677, "step": 1316500 }, { "epoch": 5.363539055840324, "grad_norm": 18.29831314086914, "learning_rate": 0.0003957477912067381, "loss": 7.5475, "step": 1316600 }, { "epoch": 5.363946433863705, "grad_norm": 19.1815242767334, "learning_rate": 0.00039548284621230564, "loss": 7.5555, "step": 1316700 }, { "epoch": 5.364353811887087, "grad_norm": 8.209659576416016, "learning_rate": 0.0003952179846183688, "loss": 7.5113, "step": 1316800 }, { "epoch": 5.364761189910468, "grad_norm": 2.8940536975860596, "learning_rate": 0.00039495320643539913, "loss": 7.5219, "step": 1316900 }, { "epoch": 5.36516856793385, "grad_norm": 6.335155487060547, "learning_rate": 0.0003946885116738643, "loss": 7.5488, "step": 1317000 }, { "epoch": 5.36516856793385, "eval_MaskedAccuracy": 0.5125834465282173, "eval_loss": 1.5992813110351562, "eval_runtime": 170.2243, "eval_samples_per_second": 372.896, "eval_steps_per_second": 1.457, "step": 1317000 }, { "epoch": 5.365575945957231, "grad_norm": 3.8846523761749268, "learning_rate": 0.0003944239003442292, "loss": 7.5064, "step": 1317100 }, { "epoch": 5.365983323980613, "grad_norm": 5.232937812805176, "learning_rate": 0.0003941593724569558, "loss": 7.5324, "step": 1317200 }, { "epoch": 5.3663907020039945, "grad_norm": 12.589492797851562, "learning_rate": 0.0003938949280225023, "loss": 7.5335, "step": 1317300 }, { "epoch": 5.366798080027376, "grad_norm": 28.246875762939453, "learning_rate": 0.00039363056705132285, "loss": 7.5515, "step": 1317400 }, { "epoch": 5.3672054580507575, "grad_norm": 10.634908676147461, "learning_rate": 0.00039336628955386925, "loss": 7.5299, "step": 1317500 }, { "epoch": 5.367612836074139, "grad_norm": 13.630500793457031, "learning_rate": 0.00039310209554058943, "loss": 7.5211, "step": 1317600 }, { "epoch": 5.36802021409752, "grad_norm": 5.530129909515381, "learning_rate": 0.0003928379850219299, "loss": 7.5065, "step": 1317700 }, { "epoch": 5.368427592120901, "grad_norm": 6.207881450653076, "learning_rate": 0.0003925739580083312, "loss": 7.5259, "step": 1317800 }, { "epoch": 5.368834970144283, "grad_norm": 13.467637062072754, "learning_rate": 0.0003923100145102327, "loss": 7.5184, "step": 1317900 }, { "epoch": 5.369242348167664, "grad_norm": 3.3274197578430176, "learning_rate": 0.00039204615453806795, "loss": 7.5238, "step": 1318000 }, { "epoch": 5.369242348167664, "eval_MaskedAccuracy": 0.5131843151826292, "eval_loss": 1.5855181217193604, "eval_runtime": 175.5552, "eval_samples_per_second": 361.573, "eval_steps_per_second": 1.413, "step": 1318000 }, { "epoch": 5.369649726191046, "grad_norm": 11.497682571411133, "learning_rate": 0.00039178237810227033, "loss": 7.5357, "step": 1318100 }, { "epoch": 5.370057104214427, "grad_norm": 5.816246032714844, "learning_rate": 0.0003915186852132668, "loss": 7.5541, "step": 1318200 }, { "epoch": 5.370464482237809, "grad_norm": 4.2336297035217285, "learning_rate": 0.00039125507588148357, "loss": 7.542, "step": 1318300 }, { "epoch": 5.3708718602611905, "grad_norm": 8.212498664855957, "learning_rate": 0.00039099155011734267, "loss": 7.5107, "step": 1318400 }, { "epoch": 5.371279238284572, "grad_norm": 3.440297842025757, "learning_rate": 0.00039072810793126187, "loss": 7.5382, "step": 1318500 }, { "epoch": 5.3716866163079535, "grad_norm": 7.391533851623535, "learning_rate": 0.00039046474933365717, "loss": 7.5435, "step": 1318600 }, { "epoch": 5.372093994331335, "grad_norm": 4.354636192321777, "learning_rate": 0.00039020147433494, "loss": 7.5329, "step": 1318700 }, { "epoch": 5.372501372354717, "grad_norm": 29.759435653686523, "learning_rate": 0.0003899382829455192, "loss": 7.5259, "step": 1318800 }, { "epoch": 5.372908750378098, "grad_norm": 5.569962024688721, "learning_rate": 0.00038967517517580043, "loss": 7.5521, "step": 1318900 }, { "epoch": 5.373316128401479, "grad_norm": 2.95422101020813, "learning_rate": 0.00038941215103618535, "loss": 7.5121, "step": 1319000 }, { "epoch": 5.373316128401479, "eval_MaskedAccuracy": 0.5129268518777247, "eval_loss": 1.5870742797851562, "eval_runtime": 202.1288, "eval_samples_per_second": 314.037, "eval_steps_per_second": 1.227, "step": 1319000 }, { "epoch": 5.37372350642486, "grad_norm": 3.4544146060943604, "learning_rate": 0.00038914921053707344, "loss": 7.5696, "step": 1319100 }, { "epoch": 5.374130884448242, "grad_norm": 4.413163185119629, "learning_rate": 0.00038888635368885925, "loss": 7.546, "step": 1319200 }, { "epoch": 5.374538262471623, "grad_norm": 6.508971691131592, "learning_rate": 0.0003886235805019351, "loss": 7.5441, "step": 1319300 }, { "epoch": 5.374945640495005, "grad_norm": 13.9869384765625, "learning_rate": 0.00038836089098669065, "loss": 7.519, "step": 1319400 }, { "epoch": 5.375353018518386, "grad_norm": 4.619554042816162, "learning_rate": 0.0003880982851535106, "loss": 7.5285, "step": 1319500 }, { "epoch": 5.375760396541768, "grad_norm": 17.703474044799805, "learning_rate": 0.0003878357630127769, "loss": 7.5462, "step": 1319600 }, { "epoch": 5.3761677745651495, "grad_norm": 22.187644958496094, "learning_rate": 0.00038757332457486893, "loss": 7.5366, "step": 1319700 }, { "epoch": 5.376575152588531, "grad_norm": 10.332889556884766, "learning_rate": 0.00038731096985016207, "loss": 7.5217, "step": 1319800 }, { "epoch": 5.376982530611913, "grad_norm": 22.152936935424805, "learning_rate": 0.0003870486988490289, "loss": 7.5483, "step": 1319900 }, { "epoch": 5.377389908635294, "grad_norm": 7.143672943115234, "learning_rate": 0.0003867865115818383, "loss": 7.5636, "step": 1320000 }, { "epoch": 5.377389908635294, "eval_MaskedAccuracy": 0.5130267379412599, "eval_loss": 1.5818291902542114, "eval_runtime": 173.6924, "eval_samples_per_second": 365.451, "eval_steps_per_second": 1.428, "step": 1320000 }, { "epoch": 5.377797286658676, "grad_norm": 17.466697692871094, "learning_rate": 0.00038652440805895567, "loss": 7.5344, "step": 1320100 }, { "epoch": 5.378204664682057, "grad_norm": 3.5994436740875244, "learning_rate": 0.0003862623882907438, "loss": 7.5153, "step": 1320200 }, { "epoch": 5.378612042705438, "grad_norm": 5.591103553771973, "learning_rate": 0.0003860004522875611, "loss": 7.5348, "step": 1320300 }, { "epoch": 5.379019420728819, "grad_norm": 3.714869737625122, "learning_rate": 0.0003857386000597642, "loss": 7.5269, "step": 1320400 }, { "epoch": 5.379426798752201, "grad_norm": 20.146677017211914, "learning_rate": 0.0003854768316177052, "loss": 7.5617, "step": 1320500 }, { "epoch": 5.379834176775582, "grad_norm": 14.925349235534668, "learning_rate": 0.00038521514697173286, "loss": 7.5187, "step": 1320600 }, { "epoch": 5.380241554798964, "grad_norm": 5.3701581954956055, "learning_rate": 0.00038495354613219296, "loss": 7.5165, "step": 1320700 }, { "epoch": 5.3806489328223455, "grad_norm": 8.008158683776855, "learning_rate": 0.0003846920291094284, "loss": 7.5342, "step": 1320800 }, { "epoch": 5.381056310845727, "grad_norm": 12.682861328125, "learning_rate": 0.00038443059591377807, "loss": 7.5345, "step": 1320900 }, { "epoch": 5.3814636888691085, "grad_norm": 6.351812839508057, "learning_rate": 0.00038416924655557767, "loss": 7.5084, "step": 1321000 }, { "epoch": 5.3814636888691085, "eval_MaskedAccuracy": 0.5129036387186814, "eval_loss": 1.5889383554458618, "eval_runtime": 165.3506, "eval_samples_per_second": 383.887, "eval_steps_per_second": 1.5, "step": 1321000 }, { "epoch": 5.38187106689249, "grad_norm": 14.423798561096191, "learning_rate": 0.0003839079810451603, "loss": 7.5513, "step": 1321100 }, { "epoch": 5.382278444915872, "grad_norm": 14.717927932739258, "learning_rate": 0.00038364679939285483, "loss": 7.5411, "step": 1321200 }, { "epoch": 5.382685822939253, "grad_norm": 14.870012283325195, "learning_rate": 0.0003833857016089872, "loss": 7.5271, "step": 1321300 }, { "epoch": 5.383093200962635, "grad_norm": 19.216617584228516, "learning_rate": 0.0003831246877038793, "loss": 7.568, "step": 1321400 }, { "epoch": 5.383500578986016, "grad_norm": 4.725645065307617, "learning_rate": 0.0003828637576878519, "loss": 7.5502, "step": 1321500 }, { "epoch": 5.383907957009397, "grad_norm": 3.9537508487701416, "learning_rate": 0.00038260291157121945, "loss": 7.5641, "step": 1321600 }, { "epoch": 5.384315335032778, "grad_norm": 22.767629623413086, "learning_rate": 0.0003823421493642955, "loss": 7.5407, "step": 1321700 }, { "epoch": 5.38472271305616, "grad_norm": 8.632490158081055, "learning_rate": 0.00038208147107738927, "loss": 7.5435, "step": 1321800 }, { "epoch": 5.385130091079541, "grad_norm": 4.072729587554932, "learning_rate": 0.00038182087672080706, "loss": 7.5354, "step": 1321900 }, { "epoch": 5.385537469102923, "grad_norm": 8.678680419921875, "learning_rate": 0.0003815603663048508, "loss": 7.5438, "step": 1322000 }, { "epoch": 5.385537469102923, "eval_MaskedAccuracy": 0.512812164448944, "eval_loss": 1.593866229057312, "eval_runtime": 169.3801, "eval_samples_per_second": 374.755, "eval_steps_per_second": 1.464, "step": 1322000 }, { "epoch": 5.3859448471263045, "grad_norm": 22.05284309387207, "learning_rate": 0.0003812999398398203, "loss": 7.52, "step": 1322100 }, { "epoch": 5.386352225149686, "grad_norm": 3.8710389137268066, "learning_rate": 0.00038103959733601155, "loss": 7.5524, "step": 1322200 }, { "epoch": 5.386759603173068, "grad_norm": 19.8353214263916, "learning_rate": 0.0003807793388037175, "loss": 7.5244, "step": 1322300 }, { "epoch": 5.387166981196449, "grad_norm": 10.54450511932373, "learning_rate": 0.0003805191642532271, "loss": 7.5627, "step": 1322400 }, { "epoch": 5.387574359219831, "grad_norm": 4.97101354598999, "learning_rate": 0.0003802590736948262, "loss": 7.5435, "step": 1322500 }, { "epoch": 5.387981737243212, "grad_norm": 13.744295120239258, "learning_rate": 0.00037999906713879865, "loss": 7.5629, "step": 1322600 }, { "epoch": 5.388389115266593, "grad_norm": 18.679704666137695, "learning_rate": 0.000379739144595423, "loss": 7.5163, "step": 1322700 }, { "epoch": 5.388796493289974, "grad_norm": 13.629724502563477, "learning_rate": 0.00037947930607497586, "loss": 7.5317, "step": 1322800 }, { "epoch": 5.389203871313356, "grad_norm": 27.627920150756836, "learning_rate": 0.0003792195515877298, "loss": 7.5291, "step": 1322900 }, { "epoch": 5.389611249336737, "grad_norm": 13.59952449798584, "learning_rate": 0.00037895988114395397, "loss": 7.5524, "step": 1323000 }, { "epoch": 5.389611249336737, "eval_MaskedAccuracy": 0.5128496255588495, "eval_loss": 1.5980591773986816, "eval_runtime": 169.592, "eval_samples_per_second": 374.286, "eval_steps_per_second": 1.462, "step": 1323000 }, { "epoch": 5.390018627360119, "grad_norm": 9.709085464477539, "learning_rate": 0.0003787002947539152, "loss": 7.5235, "step": 1323100 }, { "epoch": 5.3904260053835005, "grad_norm": 14.340139389038086, "learning_rate": 0.0003784407924278759, "loss": 7.5406, "step": 1323200 }, { "epoch": 5.390833383406882, "grad_norm": 7.7949957847595215, "learning_rate": 0.000378181374176096, "loss": 7.5504, "step": 1323300 }, { "epoch": 5.3912407614302635, "grad_norm": 17.28707504272461, "learning_rate": 0.00037792204000883093, "loss": 7.5238, "step": 1323400 }, { "epoch": 5.391648139453645, "grad_norm": 7.211428642272949, "learning_rate": 0.0003776627899363344, "loss": 7.5173, "step": 1323500 }, { "epoch": 5.392055517477027, "grad_norm": 13.542436599731445, "learning_rate": 0.0003774036239688558, "loss": 7.5393, "step": 1323600 }, { "epoch": 5.392462895500408, "grad_norm": 25.06650161743164, "learning_rate": 0.000377144542116641, "loss": 7.537, "step": 1323700 }, { "epoch": 5.39287027352379, "grad_norm": 17.492645263671875, "learning_rate": 0.00037688554438993285, "loss": 7.5252, "step": 1323800 }, { "epoch": 5.393277651547171, "grad_norm": 8.449779510498047, "learning_rate": 0.0003766266307989712, "loss": 7.5459, "step": 1323900 }, { "epoch": 5.393685029570552, "grad_norm": 8.3551664352417, "learning_rate": 0.00037636780135399223, "loss": 7.5279, "step": 1324000 }, { "epoch": 5.393685029570552, "eval_MaskedAccuracy": 0.5134369896521969, "eval_loss": 1.5781093835830688, "eval_runtime": 172.788, "eval_samples_per_second": 367.363, "eval_steps_per_second": 1.435, "step": 1324000 }, { "epoch": 5.394092407593933, "grad_norm": 13.627609252929688, "learning_rate": 0.00037610905606522865, "loss": 7.5107, "step": 1324100 }, { "epoch": 5.394499785617315, "grad_norm": 6.587964057922363, "learning_rate": 0.0003758503949429101, "loss": 7.5158, "step": 1324200 }, { "epoch": 5.3949071636406964, "grad_norm": 27.416866302490234, "learning_rate": 0.0003755918179972632, "loss": 7.536, "step": 1324300 }, { "epoch": 5.395314541664078, "grad_norm": 8.633523941040039, "learning_rate": 0.00037533332523851043, "loss": 7.5323, "step": 1324400 }, { "epoch": 5.3957219196874595, "grad_norm": 16.36502456665039, "learning_rate": 0.0003750749166768711, "loss": 7.5265, "step": 1324500 }, { "epoch": 5.396129297710841, "grad_norm": 14.699762344360352, "learning_rate": 0.0003748165923225625, "loss": 7.5216, "step": 1324600 }, { "epoch": 5.396536675734223, "grad_norm": 17.163429260253906, "learning_rate": 0.00037455835218579657, "loss": 7.5513, "step": 1324700 }, { "epoch": 5.396944053757604, "grad_norm": 10.71406364440918, "learning_rate": 0.0003743001962767837, "loss": 7.548, "step": 1324800 }, { "epoch": 5.397351431780986, "grad_norm": 11.657567024230957, "learning_rate": 0.00037404212460572986, "loss": 7.5181, "step": 1324900 }, { "epoch": 5.397758809804367, "grad_norm": 15.483115196228027, "learning_rate": 0.0003737841371828378, "loss": 7.5, "step": 1325000 }, { "epoch": 5.397758809804367, "eval_MaskedAccuracy": 0.5131526883538816, "eval_loss": 1.5845218896865845, "eval_runtime": 169.8272, "eval_samples_per_second": 373.768, "eval_steps_per_second": 1.46, "step": 1325000 }, { "epoch": 5.398166187827749, "grad_norm": 4.039661884307861, "learning_rate": 0.0003735262340183077, "loss": 7.515, "step": 1325100 }, { "epoch": 5.39857356585113, "grad_norm": 5.575165271759033, "learning_rate": 0.00037326841512233555, "loss": 7.5221, "step": 1325200 }, { "epoch": 5.398980943874511, "grad_norm": 7.704547882080078, "learning_rate": 0.00037301068050511396, "loss": 7.5164, "step": 1325300 }, { "epoch": 5.399388321897892, "grad_norm": 7.4974188804626465, "learning_rate": 0.0003727530301768332, "loss": 7.5379, "step": 1325400 }, { "epoch": 5.399795699921274, "grad_norm": 9.015436172485352, "learning_rate": 0.00037249546414767855, "loss": 7.5171, "step": 1325500 }, { "epoch": 5.4002030779446555, "grad_norm": 8.185972213745117, "learning_rate": 0.0003722379824278342, "loss": 7.545, "step": 1325600 }, { "epoch": 5.400610455968037, "grad_norm": 13.704870223999023, "learning_rate": 0.00037198058502747904, "loss": 7.5315, "step": 1325700 }, { "epoch": 5.4010178339914185, "grad_norm": 7.266266345977783, "learning_rate": 0.0003717232719567897, "loss": 7.549, "step": 1325800 }, { "epoch": 5.4014252120148, "grad_norm": 4.926082611083984, "learning_rate": 0.00037146604322593887, "loss": 7.5379, "step": 1325900 }, { "epoch": 5.401832590038182, "grad_norm": 21.833553314208984, "learning_rate": 0.0003712088988450965, "loss": 7.5369, "step": 1326000 }, { "epoch": 5.401832590038182, "eval_MaskedAccuracy": 0.5133101537241762, "eval_loss": 1.5868409872055054, "eval_runtime": 178.0688, "eval_samples_per_second": 356.469, "eval_steps_per_second": 1.393, "step": 1326000 }, { "epoch": 5.402239968061563, "grad_norm": 6.728618621826172, "learning_rate": 0.00037095183882442844, "loss": 7.5212, "step": 1326100 }, { "epoch": 5.402647346084945, "grad_norm": 7.52299165725708, "learning_rate": 0.00037069486317409755, "loss": 7.5444, "step": 1326200 }, { "epoch": 5.403054724108326, "grad_norm": 20.929426193237305, "learning_rate": 0.000370437971904264, "loss": 7.5385, "step": 1326300 }, { "epoch": 5.403462102131708, "grad_norm": 18.907255172729492, "learning_rate": 0.000370181165025084, "loss": 7.5352, "step": 1326400 }, { "epoch": 5.403869480155089, "grad_norm": 7.164705753326416, "learning_rate": 0.0003699244425467094, "loss": 7.5369, "step": 1326500 }, { "epoch": 5.40427685817847, "grad_norm": 8.887213706970215, "learning_rate": 0.0003696678044792916, "loss": 7.5353, "step": 1326600 }, { "epoch": 5.4046842362018515, "grad_norm": 25.259241104125977, "learning_rate": 0.0003694112508329763, "loss": 7.5614, "step": 1326700 }, { "epoch": 5.405091614225233, "grad_norm": 9.49890422821045, "learning_rate": 0.0003691547816179058, "loss": 7.5566, "step": 1326800 }, { "epoch": 5.4054989922486145, "grad_norm": 11.74758529663086, "learning_rate": 0.0003688983968442207, "loss": 7.5704, "step": 1326900 }, { "epoch": 5.405906370271996, "grad_norm": 23.76457405090332, "learning_rate": 0.0003686420965220563, "loss": 7.5434, "step": 1327000 }, { "epoch": 5.405906370271996, "eval_MaskedAccuracy": 0.5125708183474138, "eval_loss": 1.5922502279281616, "eval_runtime": 177.1038, "eval_samples_per_second": 358.411, "eval_steps_per_second": 1.4, "step": 1327000 }, { "epoch": 5.406313748295378, "grad_norm": 8.575128555297852, "learning_rate": 0.0003683858806615459, "loss": 7.5408, "step": 1327100 }, { "epoch": 5.406721126318759, "grad_norm": 14.53542423248291, "learning_rate": 0.000368129749272819, "loss": 7.5233, "step": 1327200 }, { "epoch": 5.407128504342141, "grad_norm": 17.9052734375, "learning_rate": 0.00036787370236600185, "loss": 7.5332, "step": 1327300 }, { "epoch": 5.407535882365522, "grad_norm": 4.918173789978027, "learning_rate": 0.0003676177399512174, "loss": 7.5047, "step": 1327400 }, { "epoch": 5.407943260388904, "grad_norm": 7.67805290222168, "learning_rate": 0.0003673618620385856, "loss": 7.5447, "step": 1327500 }, { "epoch": 5.408350638412285, "grad_norm": 7.62180757522583, "learning_rate": 0.00036710606863822197, "loss": 7.5551, "step": 1327600 }, { "epoch": 5.408758016435666, "grad_norm": 10.066807746887207, "learning_rate": 0.00036685035976024026, "loss": 7.4909, "step": 1327700 }, { "epoch": 5.409165394459047, "grad_norm": 14.012504577636719, "learning_rate": 0.00036659473541474926, "loss": 7.5455, "step": 1327800 }, { "epoch": 5.409572772482429, "grad_norm": 18.850902557373047, "learning_rate": 0.00036633919561185514, "loss": 7.536, "step": 1327900 }, { "epoch": 5.4099801505058105, "grad_norm": 7.739994049072266, "learning_rate": 0.00036608374036166115, "loss": 7.5027, "step": 1328000 }, { "epoch": 5.4099801505058105, "eval_MaskedAccuracy": 0.5132354166429296, "eval_loss": 1.581290602684021, "eval_runtime": 175.2553, "eval_samples_per_second": 362.192, "eval_steps_per_second": 1.415, "step": 1328000 }, { "epoch": 5.410387528529192, "grad_norm": 11.632523536682129, "learning_rate": 0.00036582836967426603, "loss": 7.5485, "step": 1328100 }, { "epoch": 5.410794906552574, "grad_norm": 8.318305015563965, "learning_rate": 0.000365573083559767, "loss": 7.5681, "step": 1328200 }, { "epoch": 5.411202284575955, "grad_norm": 8.158488273620605, "learning_rate": 0.0003653178820282565, "loss": 7.5407, "step": 1328300 }, { "epoch": 5.411609662599337, "grad_norm": 3.6198344230651855, "learning_rate": 0.00036506276508982367, "loss": 7.5366, "step": 1328400 }, { "epoch": 5.412017040622718, "grad_norm": 6.183091640472412, "learning_rate": 0.00036480773275455493, "loss": 7.5503, "step": 1328500 }, { "epoch": 5.4124244186461, "grad_norm": 8.025540351867676, "learning_rate": 0.0003645527850325331, "loss": 7.5346, "step": 1328600 }, { "epoch": 5.412831796669481, "grad_norm": 8.579042434692383, "learning_rate": 0.00036429792193383735, "loss": 7.5301, "step": 1328700 }, { "epoch": 5.413239174692863, "grad_norm": 17.611928939819336, "learning_rate": 0.0003640431434685441, "loss": 7.5073, "step": 1328800 }, { "epoch": 5.413646552716244, "grad_norm": 14.009963989257812, "learning_rate": 0.0003637884496467265, "loss": 7.52, "step": 1328900 }, { "epoch": 5.414053930739625, "grad_norm": 2.990170955657959, "learning_rate": 0.00036353384047845347, "loss": 7.5237, "step": 1329000 }, { "epoch": 5.414053930739625, "eval_MaskedAccuracy": 0.5129120832677998, "eval_loss": 1.5914942026138306, "eval_runtime": 172.7778, "eval_samples_per_second": 367.385, "eval_steps_per_second": 1.435, "step": 1329000 }, { "epoch": 5.4144613087630065, "grad_norm": 13.124401092529297, "learning_rate": 0.00036327931597379096, "loss": 7.5323, "step": 1329100 }, { "epoch": 5.414868686786388, "grad_norm": 7.4388251304626465, "learning_rate": 0.0003630248761428018, "loss": 7.5581, "step": 1329200 }, { "epoch": 5.4152760648097695, "grad_norm": 21.128225326538086, "learning_rate": 0.0003627705209955451, "loss": 7.5452, "step": 1329300 }, { "epoch": 5.415683442833151, "grad_norm": 6.522050857543945, "learning_rate": 0.0003625162505420775, "loss": 7.5497, "step": 1329400 }, { "epoch": 5.416090820856533, "grad_norm": 5.448185443878174, "learning_rate": 0.0003622620647924505, "loss": 7.534, "step": 1329500 }, { "epoch": 5.416498198879914, "grad_norm": 4.216091632843018, "learning_rate": 0.00036200796375671475, "loss": 7.5572, "step": 1329600 }, { "epoch": 5.416905576903296, "grad_norm": 3.4206042289733887, "learning_rate": 0.0003617539474449165, "loss": 7.5092, "step": 1329700 }, { "epoch": 5.417312954926677, "grad_norm": 5.928034782409668, "learning_rate": 0.0003615000158670968, "loss": 7.5278, "step": 1329800 }, { "epoch": 5.417720332950059, "grad_norm": 4.560810565948486, "learning_rate": 0.0003612461690332955, "loss": 7.5227, "step": 1329900 }, { "epoch": 5.41812771097344, "grad_norm": 8.247055053710938, "learning_rate": 0.0003609924069535495, "loss": 7.5479, "step": 1330000 }, { "epoch": 5.41812771097344, "eval_MaskedAccuracy": 0.5127908292357723, "eval_loss": 1.5952818393707275, "eval_runtime": 176.3326, "eval_samples_per_second": 359.979, "eval_steps_per_second": 1.406, "step": 1330000 }, { "epoch": 5.418535088996822, "grad_norm": 8.054097175598145, "learning_rate": 0.00036073872963789057, "loss": 7.5763, "step": 1330100 }, { "epoch": 5.418942467020203, "grad_norm": 4.887831211090088, "learning_rate": 0.00036048513709634737, "loss": 7.5659, "step": 1330200 }, { "epoch": 5.419349845043584, "grad_norm": 9.02286148071289, "learning_rate": 0.00036023162933894714, "loss": 7.5615, "step": 1330300 }, { "epoch": 5.4197572230669655, "grad_norm": 9.907876968383789, "learning_rate": 0.0003599782063757112, "loss": 7.5057, "step": 1330400 }, { "epoch": 5.420164601090347, "grad_norm": 15.303842544555664, "learning_rate": 0.0003597248682166587, "loss": 7.5373, "step": 1330500 }, { "epoch": 5.420571979113729, "grad_norm": 7.663967609405518, "learning_rate": 0.0003594716148718062, "loss": 7.5297, "step": 1330600 }, { "epoch": 5.42097935713711, "grad_norm": 19.221603393554688, "learning_rate": 0.0003592184463511656, "loss": 7.5419, "step": 1330700 }, { "epoch": 5.421386735160492, "grad_norm": 14.464127540588379, "learning_rate": 0.0003589653626647467, "loss": 7.5402, "step": 1330800 }, { "epoch": 5.421794113183873, "grad_norm": 15.968949317932129, "learning_rate": 0.0003587123638225548, "loss": 7.5249, "step": 1330900 }, { "epoch": 5.422201491207255, "grad_norm": 12.885955810546875, "learning_rate": 0.0003584594498345925, "loss": 7.5548, "step": 1331000 }, { "epoch": 5.422201491207255, "eval_MaskedAccuracy": 0.5131306422582581, "eval_loss": 1.5863968133926392, "eval_runtime": 179.8542, "eval_samples_per_second": 352.93, "eval_steps_per_second": 1.379, "step": 1331000 }, { "epoch": 5.422608869230636, "grad_norm": 3.5550923347473145, "learning_rate": 0.00035820662071085855, "loss": 7.5337, "step": 1331100 }, { "epoch": 5.423016247254018, "grad_norm": 27.15935707092285, "learning_rate": 0.00035795387646134826, "loss": 7.5472, "step": 1331200 }, { "epoch": 5.423423625277399, "grad_norm": 6.2725090980529785, "learning_rate": 0.0003577012170960547, "loss": 7.5348, "step": 1331300 }, { "epoch": 5.423831003300781, "grad_norm": 19.44696807861328, "learning_rate": 0.0003574486426249667, "loss": 7.5198, "step": 1331400 }, { "epoch": 5.424238381324162, "grad_norm": 18.16598129272461, "learning_rate": 0.0003571961530580688, "loss": 7.5305, "step": 1331500 }, { "epoch": 5.424645759347543, "grad_norm": 18.06147575378418, "learning_rate": 0.000356943748405345, "loss": 7.5041, "step": 1331600 }, { "epoch": 5.4250531373709245, "grad_norm": 5.330445289611816, "learning_rate": 0.0003566914286767737, "loss": 7.5474, "step": 1331700 }, { "epoch": 5.425460515394306, "grad_norm": 17.969959259033203, "learning_rate": 0.00035643919388232985, "loss": 7.5306, "step": 1331800 }, { "epoch": 5.425867893417688, "grad_norm": 11.571163177490234, "learning_rate": 0.00035618704403198666, "loss": 7.5164, "step": 1331900 }, { "epoch": 5.426275271441069, "grad_norm": 18.80744171142578, "learning_rate": 0.0003559349791357114, "loss": 7.5245, "step": 1332000 }, { "epoch": 5.426275271441069, "eval_MaskedAccuracy": 0.5136524945248668, "eval_loss": 1.5814142227172852, "eval_runtime": 174.3108, "eval_samples_per_second": 364.154, "eval_steps_per_second": 1.423, "step": 1332000 }, { "epoch": 5.426682649464451, "grad_norm": 8.176673889160156, "learning_rate": 0.00035568299920347095, "loss": 7.5307, "step": 1332100 }, { "epoch": 5.427090027487832, "grad_norm": 3.8952736854553223, "learning_rate": 0.00035543110424522727, "loss": 7.5446, "step": 1332200 }, { "epoch": 5.427497405511214, "grad_norm": 5.171550273895264, "learning_rate": 0.00035517929427093846, "loss": 7.5641, "step": 1332300 }, { "epoch": 5.427904783534595, "grad_norm": 5.871838092803955, "learning_rate": 0.0003549275692905596, "loss": 7.5226, "step": 1332400 }, { "epoch": 5.428312161557977, "grad_norm": 16.73641014099121, "learning_rate": 0.0003546759293140439, "loss": 7.5418, "step": 1332500 }, { "epoch": 5.428719539581358, "grad_norm": 3.0405664443969727, "learning_rate": 0.0003544243743513398, "loss": 7.5329, "step": 1332600 }, { "epoch": 5.429126917604739, "grad_norm": 14.436090469360352, "learning_rate": 0.00035417290441239186, "loss": 7.533, "step": 1332700 }, { "epoch": 5.4295342956281205, "grad_norm": 10.883188247680664, "learning_rate": 0.00035392151950714226, "loss": 7.5392, "step": 1332800 }, { "epoch": 5.429941673651502, "grad_norm": 12.811883926391602, "learning_rate": 0.00035367021964552973, "loss": 7.4935, "step": 1332900 }, { "epoch": 5.430349051674884, "grad_norm": 4.134091854095459, "learning_rate": 0.00035341900483748973, "loss": 7.5361, "step": 1333000 }, { "epoch": 5.430349051674884, "eval_MaskedAccuracy": 0.5130643101503585, "eval_loss": 1.586102843284607, "eval_runtime": 175.0351, "eval_samples_per_second": 362.647, "eval_steps_per_second": 1.417, "step": 1333000 }, { "epoch": 5.430756429698265, "grad_norm": 22.417848587036133, "learning_rate": 0.00035316787509295403, "loss": 7.527, "step": 1333100 }, { "epoch": 5.431163807721647, "grad_norm": 6.5187578201293945, "learning_rate": 0.00035291683042185036, "loss": 7.5296, "step": 1333200 }, { "epoch": 5.431571185745028, "grad_norm": 19.404090881347656, "learning_rate": 0.0003526658708341051, "loss": 7.521, "step": 1333300 }, { "epoch": 5.43197856376841, "grad_norm": 5.884498596191406, "learning_rate": 0.00035241499633963883, "loss": 7.5207, "step": 1333400 }, { "epoch": 5.432385941791791, "grad_norm": 14.977967262268066, "learning_rate": 0.00035216420694837085, "loss": 7.5263, "step": 1333500 }, { "epoch": 5.432793319815173, "grad_norm": 6.032514572143555, "learning_rate": 0.0003519135026702161, "loss": 7.5368, "step": 1333600 }, { "epoch": 5.433200697838554, "grad_norm": 17.459970474243164, "learning_rate": 0.00035166288351508627, "loss": 7.5152, "step": 1333700 }, { "epoch": 5.433608075861936, "grad_norm": 5.257842540740967, "learning_rate": 0.00035141234949288955, "loss": 7.5326, "step": 1333800 }, { "epoch": 5.434015453885317, "grad_norm": 9.040240287780762, "learning_rate": 0.00035116190061353086, "loss": 7.5341, "step": 1333900 }, { "epoch": 5.434422831908698, "grad_norm": 5.258974075317383, "learning_rate": 0.0003509115368869117, "loss": 7.5446, "step": 1334000 }, { "epoch": 5.434422831908698, "eval_MaskedAccuracy": 0.5131344451019411, "eval_loss": 1.5840940475463867, "eval_runtime": 155.4309, "eval_samples_per_second": 408.387, "eval_steps_per_second": 1.596, "step": 1334000 }, { "epoch": 5.4348302099320795, "grad_norm": 10.722369194030762, "learning_rate": 0.00035066125832293026, "loss": 7.5392, "step": 1334100 }, { "epoch": 5.435237587955461, "grad_norm": 11.396703720092773, "learning_rate": 0.00035041106493148187, "loss": 7.5322, "step": 1334200 }, { "epoch": 5.435644965978843, "grad_norm": 6.298478603363037, "learning_rate": 0.0003501609567224579, "loss": 7.5113, "step": 1334300 }, { "epoch": 5.436052344002224, "grad_norm": 10.318392753601074, "learning_rate": 0.00034991093370574536, "loss": 7.5359, "step": 1334400 }, { "epoch": 5.436459722025606, "grad_norm": 28.445934295654297, "learning_rate": 0.00034966099589123006, "loss": 7.545, "step": 1334500 }, { "epoch": 5.436867100048987, "grad_norm": 6.695615768432617, "learning_rate": 0.0003494111432887935, "loss": 7.5127, "step": 1334600 }, { "epoch": 5.437274478072369, "grad_norm": 22.194202423095703, "learning_rate": 0.00034916137590831295, "loss": 7.5225, "step": 1334700 }, { "epoch": 5.43768185609575, "grad_norm": 15.1741361618042, "learning_rate": 0.0003489116937596636, "loss": 7.5247, "step": 1334800 }, { "epoch": 5.438089234119132, "grad_norm": 2.9347405433654785, "learning_rate": 0.0003486620968527162, "loss": 7.5406, "step": 1334900 }, { "epoch": 5.438496612142513, "grad_norm": 21.040611267089844, "learning_rate": 0.00034841258519733957, "loss": 7.5374, "step": 1335000 }, { "epoch": 5.438496612142513, "eval_MaskedAccuracy": 0.5131448649528033, "eval_loss": 1.5918980836868286, "eval_runtime": 164.1497, "eval_samples_per_second": 386.696, "eval_steps_per_second": 1.511, "step": 1335000 }, { "epoch": 5.438903990165895, "grad_norm": 3.5905072689056396, "learning_rate": 0.00034816315880339764, "loss": 7.5396, "step": 1335100 }, { "epoch": 5.439311368189276, "grad_norm": 5.706892967224121, "learning_rate": 0.00034791381768075155, "loss": 7.538, "step": 1335200 }, { "epoch": 5.439718746212657, "grad_norm": 10.50268268585205, "learning_rate": 0.00034766456183925884, "loss": 7.5207, "step": 1335300 }, { "epoch": 5.440126124236039, "grad_norm": 18.290935516357422, "learning_rate": 0.0003474153912887737, "loss": 7.5295, "step": 1335400 }, { "epoch": 5.44053350225942, "grad_norm": 12.509167671203613, "learning_rate": 0.00034716630603914833, "loss": 7.5343, "step": 1335500 }, { "epoch": 5.440940880282802, "grad_norm": 7.119658946990967, "learning_rate": 0.00034691730610023053, "loss": 7.4918, "step": 1335600 }, { "epoch": 5.441348258306183, "grad_norm": 2.9584312438964844, "learning_rate": 0.0003466683914818631, "loss": 7.5193, "step": 1335700 }, { "epoch": 5.441755636329565, "grad_norm": 26.875782012939453, "learning_rate": 0.00034641956219388773, "loss": 7.5529, "step": 1335800 }, { "epoch": 5.442163014352946, "grad_norm": 15.270933151245117, "learning_rate": 0.0003461708182461422, "loss": 7.5507, "step": 1335900 }, { "epoch": 5.442570392376328, "grad_norm": 5.244901657104492, "learning_rate": 0.00034592215964846017, "loss": 7.5237, "step": 1336000 }, { "epoch": 5.442570392376328, "eval_MaskedAccuracy": 0.5129465714972707, "eval_loss": 1.5902272462844849, "eval_runtime": 158.804, "eval_samples_per_second": 399.713, "eval_steps_per_second": 1.562, "step": 1336000 }, { "epoch": 5.442977770399709, "grad_norm": 4.501123905181885, "learning_rate": 0.0003456735864106727, "loss": 7.5205, "step": 1336100 }, { "epoch": 5.443385148423091, "grad_norm": 13.229872703552246, "learning_rate": 0.0003454250985426075, "loss": 7.5779, "step": 1336200 }, { "epoch": 5.443792526446472, "grad_norm": 18.62929916381836, "learning_rate": 0.0003451766960540884, "loss": 7.515, "step": 1336300 }, { "epoch": 5.444199904469854, "grad_norm": 9.902000427246094, "learning_rate": 0.0003449283789549361, "loss": 7.5232, "step": 1336400 }, { "epoch": 5.4446072824932354, "grad_norm": 10.179045677185059, "learning_rate": 0.0003446801472549677, "loss": 7.5605, "step": 1336500 }, { "epoch": 5.445014660516616, "grad_norm": 13.637742042541504, "learning_rate": 0.0003444320009639968, "loss": 7.5518, "step": 1336600 }, { "epoch": 5.445422038539998, "grad_norm": 14.29509449005127, "learning_rate": 0.000344183940091835, "loss": 7.5242, "step": 1336700 }, { "epoch": 5.445829416563379, "grad_norm": 10.59730339050293, "learning_rate": 0.0003439359646482886, "loss": 7.5534, "step": 1336800 }, { "epoch": 5.446236794586761, "grad_norm": 9.562889099121094, "learning_rate": 0.00034368807464316164, "loss": 7.5266, "step": 1336900 }, { "epoch": 5.446644172610142, "grad_norm": 19.545717239379883, "learning_rate": 0.0003434402700862544, "loss": 7.5524, "step": 1337000 }, { "epoch": 5.446644172610142, "eval_MaskedAccuracy": 0.5134409661704566, "eval_loss": 1.581125020980835, "eval_runtime": 195.5915, "eval_samples_per_second": 324.533, "eval_steps_per_second": 1.268, "step": 1337000 }, { "epoch": 5.447051550633524, "grad_norm": 12.941215515136719, "learning_rate": 0.00034319255098736514, "loss": 7.5372, "step": 1337100 }, { "epoch": 5.447458928656905, "grad_norm": 14.04662036895752, "learning_rate": 0.0003429449173562859, "loss": 7.518, "step": 1337200 }, { "epoch": 5.447866306680287, "grad_norm": 17.593795776367188, "learning_rate": 0.00034269736920280764, "loss": 7.541, "step": 1337300 }, { "epoch": 5.448273684703668, "grad_norm": 5.950951099395752, "learning_rate": 0.00034244990653671715, "loss": 7.5448, "step": 1337400 }, { "epoch": 5.44868106272705, "grad_norm": 6.166128635406494, "learning_rate": 0.000342202529367798, "loss": 7.5412, "step": 1337500 }, { "epoch": 5.449088440750431, "grad_norm": 5.826472282409668, "learning_rate": 0.0003419552377058305, "loss": 7.5065, "step": 1337600 }, { "epoch": 5.449495818773812, "grad_norm": 17.794090270996094, "learning_rate": 0.0003417080315605913, "loss": 7.535, "step": 1337700 }, { "epoch": 5.449903196797194, "grad_norm": 10.942166328430176, "learning_rate": 0.00034146091094185393, "loss": 7.5487, "step": 1337800 }, { "epoch": 5.450310574820575, "grad_norm": 11.414239883422852, "learning_rate": 0.00034121387585938766, "loss": 7.5501, "step": 1337900 }, { "epoch": 5.450717952843957, "grad_norm": 3.393474817276001, "learning_rate": 0.00034096692632296007, "loss": 7.5384, "step": 1338000 }, { "epoch": 5.450717952843957, "eval_MaskedAccuracy": 0.5130633760258678, "eval_loss": 1.596960425376892, "eval_runtime": 163.0323, "eval_samples_per_second": 389.346, "eval_steps_per_second": 1.521, "step": 1338000 }, { "epoch": 5.451125330867338, "grad_norm": 23.581764221191406, "learning_rate": 0.00034072006234233375, "loss": 7.572, "step": 1338100 }, { "epoch": 5.45153270889072, "grad_norm": 2.6095852851867676, "learning_rate": 0.00034047328392726857, "loss": 7.5234, "step": 1338200 }, { "epoch": 5.451940086914101, "grad_norm": 10.084429740905762, "learning_rate": 0.00034022659108752106, "loss": 7.5147, "step": 1338300 }, { "epoch": 5.452347464937483, "grad_norm": 3.5106518268585205, "learning_rate": 0.00033997998383284404, "loss": 7.5262, "step": 1338400 }, { "epoch": 5.452754842960864, "grad_norm": 13.30595588684082, "learning_rate": 0.0003397334621729879, "loss": 7.5234, "step": 1338500 }, { "epoch": 5.453162220984246, "grad_norm": 15.96445083618164, "learning_rate": 0.0003394870261176992, "loss": 7.5267, "step": 1338600 }, { "epoch": 5.453569599007627, "grad_norm": 10.994308471679688, "learning_rate": 0.00033924067567671996, "loss": 7.5092, "step": 1338700 }, { "epoch": 5.453976977031009, "grad_norm": 17.248014450073242, "learning_rate": 0.0003389944108597901, "loss": 7.5469, "step": 1338800 }, { "epoch": 5.4543843550543905, "grad_norm": 2.9977152347564697, "learning_rate": 0.0003387482316766457, "loss": 7.5468, "step": 1338900 }, { "epoch": 5.454791733077771, "grad_norm": 3.5361149311065674, "learning_rate": 0.0003385021381370198, "loss": 7.5357, "step": 1339000 }, { "epoch": 5.454791733077771, "eval_MaskedAccuracy": 0.5134931230086092, "eval_loss": 1.5927139520645142, "eval_runtime": 176.011, "eval_samples_per_second": 360.637, "eval_steps_per_second": 1.409, "step": 1339000 }, { "epoch": 5.455199111101153, "grad_norm": 5.578920841217041, "learning_rate": 0.00033825613025064176, "loss": 7.5595, "step": 1339100 }, { "epoch": 5.455606489124534, "grad_norm": 7.370936870574951, "learning_rate": 0.00033801020802723777, "loss": 7.5487, "step": 1339200 }, { "epoch": 5.456013867147916, "grad_norm": 14.652971267700195, "learning_rate": 0.00033776437147653, "loss": 7.5554, "step": 1339300 }, { "epoch": 5.456421245171297, "grad_norm": 6.171076774597168, "learning_rate": 0.00033751862060823725, "loss": 7.5181, "step": 1339400 }, { "epoch": 5.456828623194679, "grad_norm": 3.990514039993286, "learning_rate": 0.000337272955432076, "loss": 7.5047, "step": 1339500 }, { "epoch": 5.45723600121806, "grad_norm": 6.304797172546387, "learning_rate": 0.0003370273759577589, "loss": 7.5187, "step": 1339600 }, { "epoch": 5.457643379241442, "grad_norm": 11.207032203674316, "learning_rate": 0.00033678188219499503, "loss": 7.4948, "step": 1339700 }, { "epoch": 5.458050757264823, "grad_norm": 8.533391952514648, "learning_rate": 0.0003365364741534893, "loss": 7.5275, "step": 1339800 }, { "epoch": 5.458458135288205, "grad_norm": 4.139130115509033, "learning_rate": 0.0003362911518429445, "loss": 7.532, "step": 1339900 }, { "epoch": 5.458865513311586, "grad_norm": 6.50508451461792, "learning_rate": 0.00033604591527305953, "loss": 7.4986, "step": 1340000 }, { "epoch": 5.458865513311586, "eval_MaskedAccuracy": 0.512827248144668, "eval_loss": 1.5947595834732056, "eval_runtime": 181.6673, "eval_samples_per_second": 349.408, "eval_steps_per_second": 1.365, "step": 1340000 }, { "epoch": 5.459272891334968, "grad_norm": 10.115270614624023, "learning_rate": 0.0003358007644535302, "loss": 7.5262, "step": 1340100 }, { "epoch": 5.4596802693583495, "grad_norm": 21.376296997070312, "learning_rate": 0.00033555569939404845, "loss": 7.5373, "step": 1340200 }, { "epoch": 5.46008764738173, "grad_norm": 19.60698127746582, "learning_rate": 0.00033531072010430237, "loss": 7.5546, "step": 1340300 }, { "epoch": 5.460495025405112, "grad_norm": 15.67002010345459, "learning_rate": 0.00033506582659397757, "loss": 7.5188, "step": 1340400 }, { "epoch": 5.460902403428493, "grad_norm": 4.400627136230469, "learning_rate": 0.00033482101887275694, "loss": 7.5232, "step": 1340500 }, { "epoch": 5.461309781451875, "grad_norm": 6.973476886749268, "learning_rate": 0.0003345762969503184, "loss": 7.5498, "step": 1340600 }, { "epoch": 5.461717159475256, "grad_norm": 12.007425308227539, "learning_rate": 0.0003343316608363366, "loss": 7.5003, "step": 1340700 }, { "epoch": 5.462124537498638, "grad_norm": 12.796014785766602, "learning_rate": 0.00033408711054048443, "loss": 7.5597, "step": 1340800 }, { "epoch": 5.462531915522019, "grad_norm": 16.323322296142578, "learning_rate": 0.00033384264607242987, "loss": 7.5058, "step": 1340900 }, { "epoch": 5.462939293545401, "grad_norm": 19.669464111328125, "learning_rate": 0.00033359826744183696, "loss": 7.5373, "step": 1341000 }, { "epoch": 5.462939293545401, "eval_MaskedAccuracy": 0.5132753471187822, "eval_loss": 1.5841223001480103, "eval_runtime": 189.2486, "eval_samples_per_second": 335.411, "eval_steps_per_second": 1.31, "step": 1341000 }, { "epoch": 5.463346671568782, "grad_norm": 11.022880554199219, "learning_rate": 0.00033335397465836833, "loss": 7.5201, "step": 1341100 }, { "epoch": 5.463754049592164, "grad_norm": 14.854443550109863, "learning_rate": 0.00033310976773168183, "loss": 7.5231, "step": 1341200 }, { "epoch": 5.4641614276155455, "grad_norm": 8.195154190063477, "learning_rate": 0.00033286564667143234, "loss": 7.5346, "step": 1341300 }, { "epoch": 5.464568805638927, "grad_norm": 5.181168556213379, "learning_rate": 0.0003326216114872715, "loss": 7.5321, "step": 1341400 }, { "epoch": 5.4649761836623085, "grad_norm": 13.370020866394043, "learning_rate": 0.0003323776621888467, "loss": 7.5162, "step": 1341500 }, { "epoch": 5.465383561685689, "grad_norm": 31.435993194580078, "learning_rate": 0.00033213379878580315, "loss": 7.5352, "step": 1341600 }, { "epoch": 5.465790939709071, "grad_norm": 3.4957010746002197, "learning_rate": 0.00033189002128778154, "loss": 7.4939, "step": 1341700 }, { "epoch": 5.466198317732452, "grad_norm": 9.091877937316895, "learning_rate": 0.0003316463297044193, "loss": 7.5208, "step": 1341800 }, { "epoch": 5.466605695755834, "grad_norm": 8.437361717224121, "learning_rate": 0.0003314027240453526, "loss": 7.5717, "step": 1341900 }, { "epoch": 5.467013073779215, "grad_norm": 10.271653175354004, "learning_rate": 0.00033115920432021123, "loss": 7.5145, "step": 1342000 }, { "epoch": 5.467013073779215, "eval_MaskedAccuracy": 0.5129229004959125, "eval_loss": 1.5858826637268066, "eval_runtime": 151.4268, "eval_samples_per_second": 419.186, "eval_steps_per_second": 1.638, "step": 1342000 }, { "epoch": 5.467420451802597, "grad_norm": 6.712301731109619, "learning_rate": 0.0003309157705386229, "loss": 7.5501, "step": 1342100 }, { "epoch": 5.467827829825978, "grad_norm": 5.213397026062012, "learning_rate": 0.00033067242271021206, "loss": 7.4886, "step": 1342200 }, { "epoch": 5.46823520784936, "grad_norm": 23.394397735595703, "learning_rate": 0.0003304291608445994, "loss": 7.5303, "step": 1342300 }, { "epoch": 5.468642585872741, "grad_norm": 13.85262393951416, "learning_rate": 0.00033018598495140216, "loss": 7.5308, "step": 1342400 }, { "epoch": 5.469049963896123, "grad_norm": 4.306583404541016, "learning_rate": 0.00032994289504023453, "loss": 7.5401, "step": 1342500 }, { "epoch": 5.4694573419195045, "grad_norm": 21.96116065979004, "learning_rate": 0.00032969989112070774, "loss": 7.5259, "step": 1342600 }, { "epoch": 5.469864719942885, "grad_norm": 11.738424301147461, "learning_rate": 0.0003294569732024284, "loss": 7.5423, "step": 1342700 }, { "epoch": 5.470272097966267, "grad_norm": 16.211389541625977, "learning_rate": 0.0003292141412950006, "loss": 7.5346, "step": 1342800 }, { "epoch": 5.470679475989648, "grad_norm": 13.16845417022705, "learning_rate": 0.00032897139540802474, "loss": 7.5355, "step": 1342900 }, { "epoch": 5.47108685401303, "grad_norm": 4.776648998260498, "learning_rate": 0.0003287287355510974, "loss": 7.517, "step": 1343000 }, { "epoch": 5.47108685401303, "eval_MaskedAccuracy": 0.5125800603311347, "eval_loss": 1.594915509223938, "eval_runtime": 157.1588, "eval_samples_per_second": 403.897, "eval_steps_per_second": 1.578, "step": 1343000 }, { "epoch": 5.471494232036411, "grad_norm": 4.625259876251221, "learning_rate": 0.000328486161733813, "loss": 7.5365, "step": 1343100 }, { "epoch": 5.471901610059793, "grad_norm": 5.41280460357666, "learning_rate": 0.0003282436739657615, "loss": 7.5352, "step": 1343200 }, { "epoch": 5.472308988083174, "grad_norm": 12.514579772949219, "learning_rate": 0.0003280012722565299, "loss": 7.531, "step": 1343300 }, { "epoch": 5.472716366106556, "grad_norm": 13.552356719970703, "learning_rate": 0.0003277589566157006, "loss": 7.5604, "step": 1343400 }, { "epoch": 5.473123744129937, "grad_norm": 13.721244812011719, "learning_rate": 0.0003275167270528554, "loss": 7.5526, "step": 1343500 }, { "epoch": 5.473531122153319, "grad_norm": 11.623786926269531, "learning_rate": 0.00032727458357756985, "loss": 7.5298, "step": 1343600 }, { "epoch": 5.4739385001767005, "grad_norm": 16.389251708984375, "learning_rate": 0.00032703252619941737, "loss": 7.5368, "step": 1343700 }, { "epoch": 5.474345878200082, "grad_norm": 8.868416786193848, "learning_rate": 0.00032679055492796744, "loss": 7.5088, "step": 1343800 }, { "epoch": 5.4747532562234635, "grad_norm": 3.096803903579712, "learning_rate": 0.0003265486697727869, "loss": 7.5462, "step": 1343900 }, { "epoch": 5.475160634246844, "grad_norm": 21.188514709472656, "learning_rate": 0.0003263068707434388, "loss": 7.5692, "step": 1344000 }, { "epoch": 5.475160634246844, "eval_MaskedAccuracy": 0.5128599134834798, "eval_loss": 1.588036298751831, "eval_runtime": 159.4299, "eval_samples_per_second": 398.144, "eval_steps_per_second": 1.556, "step": 1344000 }, { "epoch": 5.475568012270226, "grad_norm": 3.874368906021118, "learning_rate": 0.0003260651578494824, "loss": 7.5354, "step": 1344100 }, { "epoch": 5.475975390293607, "grad_norm": 19.807144165039062, "learning_rate": 0.00032582353110047424, "loss": 7.5309, "step": 1344200 }, { "epoch": 5.476382768316989, "grad_norm": 6.514845848083496, "learning_rate": 0.00032558199050596574, "loss": 7.5299, "step": 1344300 }, { "epoch": 5.47679014634037, "grad_norm": 12.320228576660156, "learning_rate": 0.0003253405360755088, "loss": 7.5375, "step": 1344400 }, { "epoch": 5.477197524363752, "grad_norm": 4.334640026092529, "learning_rate": 0.0003250991678186477, "loss": 7.496, "step": 1344500 }, { "epoch": 5.477604902387133, "grad_norm": 18.309972763061523, "learning_rate": 0.0003248578857449255, "loss": 7.5116, "step": 1344600 }, { "epoch": 5.478012280410515, "grad_norm": 3.038743019104004, "learning_rate": 0.0003246166898638813, "loss": 7.5474, "step": 1344700 }, { "epoch": 5.4784196584338964, "grad_norm": 5.224091529846191, "learning_rate": 0.000324375580185051, "loss": 7.5399, "step": 1344800 }, { "epoch": 5.478827036457278, "grad_norm": 11.377028465270996, "learning_rate": 0.00032413455671796655, "loss": 7.524, "step": 1344900 }, { "epoch": 5.4792344144806595, "grad_norm": 4.814363956451416, "learning_rate": 0.0003238936194721577, "loss": 7.5648, "step": 1345000 }, { "epoch": 5.4792344144806595, "eval_MaskedAccuracy": 0.512634868952215, "eval_loss": 1.5925509929656982, "eval_runtime": 157.3071, "eval_samples_per_second": 403.516, "eval_steps_per_second": 1.577, "step": 1345000 }, { "epoch": 5.479641792504041, "grad_norm": 9.213382720947266, "learning_rate": 0.00032365276845714927, "loss": 7.5141, "step": 1345100 }, { "epoch": 5.480049170527423, "grad_norm": 11.836997032165527, "learning_rate": 0.0003234120036824633, "loss": 7.522, "step": 1345200 }, { "epoch": 5.480456548550803, "grad_norm": 10.100482940673828, "learning_rate": 0.0003231713251576189, "loss": 7.5438, "step": 1345300 }, { "epoch": 5.480863926574185, "grad_norm": 6.054364204406738, "learning_rate": 0.00032293073289213145, "loss": 7.5527, "step": 1345400 }, { "epoch": 5.481271304597566, "grad_norm": 3.237656593322754, "learning_rate": 0.0003226902268955121, "loss": 7.4976, "step": 1345500 }, { "epoch": 5.481678682620948, "grad_norm": 4.5124101638793945, "learning_rate": 0.0003224498071772703, "loss": 7.538, "step": 1345600 }, { "epoch": 5.482086060644329, "grad_norm": 12.343074798583984, "learning_rate": 0.00032220947374691025, "loss": 7.4971, "step": 1345700 }, { "epoch": 5.482493438667711, "grad_norm": 4.376140594482422, "learning_rate": 0.0003219692266139342, "loss": 7.4965, "step": 1345800 }, { "epoch": 5.482900816691092, "grad_norm": 4.290546894073486, "learning_rate": 0.0003217290657878396, "loss": 7.5243, "step": 1345900 }, { "epoch": 5.483308194714474, "grad_norm": 12.027759552001953, "learning_rate": 0.0003214889912781226, "loss": 7.5142, "step": 1346000 }, { "epoch": 5.483308194714474, "eval_MaskedAccuracy": 0.5125970006508773, "eval_loss": 1.5909134149551392, "eval_runtime": 171.889, "eval_samples_per_second": 369.285, "eval_steps_per_second": 1.443, "step": 1346000 }, { "epoch": 5.4837155727378555, "grad_norm": 3.6816511154174805, "learning_rate": 0.0003212490030942748, "loss": 7.5326, "step": 1346100 }, { "epoch": 5.484122950761237, "grad_norm": 5.182373523712158, "learning_rate": 0.00032100910124578284, "loss": 7.5531, "step": 1346200 }, { "epoch": 5.4845303287846185, "grad_norm": 12.852604866027832, "learning_rate": 0.00032076928574213204, "loss": 7.5406, "step": 1346300 }, { "epoch": 5.484937706808, "grad_norm": 15.884415626525879, "learning_rate": 0.000320529556592803, "loss": 7.5342, "step": 1346400 }, { "epoch": 5.485345084831382, "grad_norm": 18.99367904663086, "learning_rate": 0.00032028991380727423, "loss": 7.5479, "step": 1346500 }, { "epoch": 5.485752462854762, "grad_norm": 13.396120071411133, "learning_rate": 0.00032005035739501994, "loss": 7.5298, "step": 1346600 }, { "epoch": 5.486159840878144, "grad_norm": 15.541460037231445, "learning_rate": 0.00031981088736551095, "loss": 7.5432, "step": 1346700 }, { "epoch": 5.486567218901525, "grad_norm": 5.564704895019531, "learning_rate": 0.0003195715037282145, "loss": 7.5431, "step": 1346800 }, { "epoch": 5.486974596924907, "grad_norm": 15.751948356628418, "learning_rate": 0.0003193322064925953, "loss": 7.516, "step": 1346900 }, { "epoch": 5.487381974948288, "grad_norm": 16.45070457458496, "learning_rate": 0.00031909299566811364, "loss": 7.5259, "step": 1347000 }, { "epoch": 5.487381974948288, "eval_MaskedAccuracy": 0.5127089098006233, "eval_loss": 1.5921605825424194, "eval_runtime": 161.6336, "eval_samples_per_second": 392.715, "eval_steps_per_second": 1.534, "step": 1347000 }, { "epoch": 5.48778935297167, "grad_norm": 16.948579788208008, "learning_rate": 0.000318853871264227, "loss": 7.4926, "step": 1347100 }, { "epoch": 5.4881967309950515, "grad_norm": 4.223629951477051, "learning_rate": 0.0003186148332903895, "loss": 7.5477, "step": 1347200 }, { "epoch": 5.488604109018433, "grad_norm": 4.958531856536865, "learning_rate": 0.00031837588175605047, "loss": 7.5472, "step": 1347300 }, { "epoch": 5.4890114870418145, "grad_norm": 3.535315990447998, "learning_rate": 0.0003181370166706577, "loss": 7.535, "step": 1347400 }, { "epoch": 5.489418865065196, "grad_norm": 14.249555587768555, "learning_rate": 0.00031789823804365553, "loss": 7.5191, "step": 1347500 }, { "epoch": 5.489826243088578, "grad_norm": 3.990600347518921, "learning_rate": 0.0003176595458844832, "loss": 7.5293, "step": 1347600 }, { "epoch": 5.490233621111958, "grad_norm": 9.97349739074707, "learning_rate": 0.0003174209402025778, "loss": 7.5425, "step": 1347700 }, { "epoch": 5.49064099913534, "grad_norm": 5.024134635925293, "learning_rate": 0.00031718242100737274, "loss": 7.5202, "step": 1347800 }, { "epoch": 5.491048377158721, "grad_norm": 10.962465286254883, "learning_rate": 0.00031694398830829786, "loss": 7.5292, "step": 1347900 }, { "epoch": 5.491455755182103, "grad_norm": 9.160889625549316, "learning_rate": 0.0003167056421147788, "loss": 7.5487, "step": 1348000 }, { "epoch": 5.491455755182103, "eval_MaskedAccuracy": 0.5124560960872646, "eval_loss": 1.5988479852676392, "eval_runtime": 159.5393, "eval_samples_per_second": 397.871, "eval_steps_per_second": 1.554, "step": 1348000 }, { "epoch": 5.491863133205484, "grad_norm": 7.978877067565918, "learning_rate": 0.00031646738243624046, "loss": 7.5229, "step": 1348100 }, { "epoch": 5.492270511228866, "grad_norm": 14.206701278686523, "learning_rate": 0.0003162292092821014, "loss": 7.5523, "step": 1348200 }, { "epoch": 5.492677889252247, "grad_norm": 9.733171463012695, "learning_rate": 0.0003159911226617778, "loss": 7.5467, "step": 1348300 }, { "epoch": 5.493085267275629, "grad_norm": 14.264520645141602, "learning_rate": 0.00031575312258468296, "loss": 7.528, "step": 1348400 }, { "epoch": 5.4934926452990105, "grad_norm": 4.115435600280762, "learning_rate": 0.0003155152090602261, "loss": 7.5435, "step": 1348500 }, { "epoch": 5.493900023322392, "grad_norm": 9.625788688659668, "learning_rate": 0.0003152773820978131, "loss": 7.5394, "step": 1348600 }, { "epoch": 5.494307401345774, "grad_norm": 8.65286636352539, "learning_rate": 0.0003150396417068466, "loss": 7.5411, "step": 1348700 }, { "epoch": 5.494714779369155, "grad_norm": 5.142483711242676, "learning_rate": 0.0003148019878967261, "loss": 7.5598, "step": 1348800 }, { "epoch": 5.495122157392537, "grad_norm": 10.273721694946289, "learning_rate": 0.0003145644206768466, "loss": 7.5265, "step": 1348900 }, { "epoch": 5.495529535415917, "grad_norm": 9.339920997619629, "learning_rate": 0.0003143269400566013, "loss": 7.5208, "step": 1349000 }, { "epoch": 5.495529535415917, "eval_MaskedAccuracy": 0.5130556723890132, "eval_loss": 1.5973339080810547, "eval_runtime": 164.6236, "eval_samples_per_second": 385.583, "eval_steps_per_second": 1.506, "step": 1349000 }, { "epoch": 5.495936913439299, "grad_norm": 4.450901985168457, "learning_rate": 0.00031408954604537854, "loss": 7.5344, "step": 1349100 }, { "epoch": 5.49634429146268, "grad_norm": 10.155784606933594, "learning_rate": 0.0003138522386525638, "loss": 7.5323, "step": 1349200 }, { "epoch": 5.496751669486062, "grad_norm": 15.838692665100098, "learning_rate": 0.0003136150178875386, "loss": 7.5505, "step": 1349300 }, { "epoch": 5.497159047509443, "grad_norm": 5.904865264892578, "learning_rate": 0.0003133778837596825, "loss": 7.5431, "step": 1349400 }, { "epoch": 5.497566425532825, "grad_norm": 9.229758262634277, "learning_rate": 0.00031314083627837034, "loss": 7.5248, "step": 1349500 }, { "epoch": 5.4979738035562065, "grad_norm": 17.848678588867188, "learning_rate": 0.00031290387545297336, "loss": 7.5719, "step": 1349600 }, { "epoch": 5.498381181579588, "grad_norm": 16.38506507873535, "learning_rate": 0.00031266700129286055, "loss": 7.5813, "step": 1349700 }, { "epoch": 5.4987885596029695, "grad_norm": 9.744184494018555, "learning_rate": 0.00031243021380739707, "loss": 7.5311, "step": 1349800 }, { "epoch": 5.499195937626351, "grad_norm": 6.805414199829102, "learning_rate": 0.0003121935130059439, "loss": 7.5201, "step": 1349900 }, { "epoch": 5.499603315649733, "grad_norm": 15.914021492004395, "learning_rate": 0.000311956898897859, "loss": 7.5276, "step": 1350000 }, { "epoch": 5.499603315649733, "eval_MaskedAccuracy": 0.5128690428060072, "eval_loss": 1.5992703437805176, "eval_runtime": 159.9813, "eval_samples_per_second": 396.771, "eval_steps_per_second": 1.55, "step": 1350000 }, { "epoch": 5.500010693673114, "grad_norm": 14.615961074829102, "learning_rate": 0.000311720371492497, "loss": 7.5482, "step": 1350100 }, { "epoch": 5.500418071696496, "grad_norm": 15.448691368103027, "learning_rate": 0.0003114839307992093, "loss": 7.5288, "step": 1350200 }, { "epoch": 5.500825449719876, "grad_norm": 2.8489630222320557, "learning_rate": 0.00031124757682734366, "loss": 7.5294, "step": 1350300 }, { "epoch": 5.501232827743258, "grad_norm": 4.686595439910889, "learning_rate": 0.0003110113095862444, "loss": 7.5541, "step": 1350400 }, { "epoch": 5.501640205766639, "grad_norm": 4.597413063049316, "learning_rate": 0.00031077512908525226, "loss": 7.5222, "step": 1350500 }, { "epoch": 5.502047583790021, "grad_norm": 9.691584587097168, "learning_rate": 0.0003105390353337044, "loss": 7.5245, "step": 1350600 }, { "epoch": 5.502454961813402, "grad_norm": 11.693702697753906, "learning_rate": 0.0003103030283409355, "loss": 7.5349, "step": 1350700 }, { "epoch": 5.502862339836784, "grad_norm": 5.796513557434082, "learning_rate": 0.0003100671081162762, "loss": 7.5517, "step": 1350800 }, { "epoch": 5.5032697178601655, "grad_norm": 9.841246604919434, "learning_rate": 0.00030983127466905285, "loss": 7.5717, "step": 1350900 }, { "epoch": 5.503677095883547, "grad_norm": 21.618650436401367, "learning_rate": 0.0003095955280085905, "loss": 7.5008, "step": 1351000 }, { "epoch": 5.503677095883547, "eval_MaskedAccuracy": 0.5133096525036167, "eval_loss": 1.5842745304107666, "eval_runtime": 154.528, "eval_samples_per_second": 410.773, "eval_steps_per_second": 1.605, "step": 1351000 }, { "epoch": 5.504084473906929, "grad_norm": 13.956299781799316, "learning_rate": 0.0003093598681442084, "loss": 7.5189, "step": 1351100 }, { "epoch": 5.50449185193031, "grad_norm": 6.590954303741455, "learning_rate": 0.00030912429508522376, "loss": 7.5171, "step": 1351200 }, { "epoch": 5.504899229953692, "grad_norm": 4.142152309417725, "learning_rate": 0.0003088888088409498, "loss": 7.5613, "step": 1351300 }, { "epoch": 5.505306607977072, "grad_norm": 3.0190186500549316, "learning_rate": 0.00030865340942069715, "loss": 7.5455, "step": 1351400 }, { "epoch": 5.505713986000455, "grad_norm": 9.084418296813965, "learning_rate": 0.00030841809683377215, "loss": 7.508, "step": 1351500 }, { "epoch": 5.506121364023835, "grad_norm": 10.555480003356934, "learning_rate": 0.00030818287108947773, "loss": 7.5278, "step": 1351600 }, { "epoch": 5.506528742047217, "grad_norm": 3.6321165561676025, "learning_rate": 0.0003079477321971144, "loss": 7.5506, "step": 1351700 }, { "epoch": 5.506936120070598, "grad_norm": 8.326172828674316, "learning_rate": 0.0003077126801659775, "loss": 7.5182, "step": 1351800 }, { "epoch": 5.50734349809398, "grad_norm": 6.167407989501953, "learning_rate": 0.00030747771500536027, "loss": 7.4815, "step": 1351900 }, { "epoch": 5.5077508761173615, "grad_norm": 6.568759441375732, "learning_rate": 0.00030724283672455205, "loss": 7.5345, "step": 1352000 }, { "epoch": 5.5077508761173615, "eval_MaskedAccuracy": 0.513475285674313, "eval_loss": 1.5810248851776123, "eval_runtime": 153.0557, "eval_samples_per_second": 414.725, "eval_steps_per_second": 1.62, "step": 1352000 }, { "epoch": 5.508158254140743, "grad_norm": 4.821125030517578, "learning_rate": 0.00030700804533283937, "loss": 7.5277, "step": 1352100 }, { "epoch": 5.5085656321641245, "grad_norm": 8.832671165466309, "learning_rate": 0.0003067733408395036, "loss": 7.5149, "step": 1352200 }, { "epoch": 5.508973010187506, "grad_norm": 16.641756057739258, "learning_rate": 0.0003065387232538247, "loss": 7.552, "step": 1352300 }, { "epoch": 5.509380388210888, "grad_norm": 5.799544334411621, "learning_rate": 0.0003063041925850786, "loss": 7.4953, "step": 1352400 }, { "epoch": 5.509787766234269, "grad_norm": 16.126590728759766, "learning_rate": 0.0003060697488425371, "loss": 7.5339, "step": 1352500 }, { "epoch": 5.510195144257651, "grad_norm": 4.758677005767822, "learning_rate": 0.00030583539203546926, "loss": 7.5393, "step": 1352600 }, { "epoch": 5.510602522281031, "grad_norm": 9.810537338256836, "learning_rate": 0.0003056011221731403, "loss": 7.5374, "step": 1352700 }, { "epoch": 5.511009900304413, "grad_norm": 5.438594341278076, "learning_rate": 0.0003053669392648126, "loss": 7.5487, "step": 1352800 }, { "epoch": 5.511417278327794, "grad_norm": 3.1264357566833496, "learning_rate": 0.0003051328433197442, "loss": 7.5228, "step": 1352900 }, { "epoch": 5.511824656351176, "grad_norm": 5.903149127960205, "learning_rate": 0.0003048988343471898, "loss": 7.5106, "step": 1353000 }, { "epoch": 5.511824656351176, "eval_MaskedAccuracy": 0.5130156749728397, "eval_loss": 1.593740463256836, "eval_runtime": 150.775, "eval_samples_per_second": 420.998, "eval_steps_per_second": 1.645, "step": 1353000 }, { "epoch": 5.512232034374557, "grad_norm": 4.0846757888793945, "learning_rate": 0.00030466491235640186, "loss": 7.5441, "step": 1353100 }, { "epoch": 5.512639412397939, "grad_norm": 4.778141975402832, "learning_rate": 0.00030443107735662797, "loss": 7.5308, "step": 1353200 }, { "epoch": 5.5130467904213205, "grad_norm": 5.338253974914551, "learning_rate": 0.0003041973293571125, "loss": 7.5573, "step": 1353300 }, { "epoch": 5.513454168444702, "grad_norm": 4.985092639923096, "learning_rate": 0.0003039636683670981, "loss": 7.5476, "step": 1353400 }, { "epoch": 5.513861546468084, "grad_norm": 13.974929809570312, "learning_rate": 0.0003037300943958216, "loss": 7.5322, "step": 1353500 }, { "epoch": 5.514268924491465, "grad_norm": 22.895973205566406, "learning_rate": 0.00030349660745251764, "loss": 7.5185, "step": 1353600 }, { "epoch": 5.514676302514847, "grad_norm": 2.834916353225708, "learning_rate": 0.0003032632075464173, "loss": 7.5345, "step": 1353700 }, { "epoch": 5.515083680538228, "grad_norm": 8.161876678466797, "learning_rate": 0.00030302989468674804, "loss": 7.521, "step": 1353800 }, { "epoch": 5.51549105856161, "grad_norm": 19.494930267333984, "learning_rate": 0.000302796668882734, "loss": 7.5327, "step": 1353900 }, { "epoch": 5.51589843658499, "grad_norm": 19.22008514404297, "learning_rate": 0.0003025635301435959, "loss": 7.5419, "step": 1354000 }, { "epoch": 5.51589843658499, "eval_MaskedAccuracy": 0.5136081749109903, "eval_loss": 1.5779005289077759, "eval_runtime": 159.4313, "eval_samples_per_second": 398.14, "eval_steps_per_second": 1.556, "step": 1354000 }, { "epoch": 5.516305814608372, "grad_norm": 17.956283569335938, "learning_rate": 0.00030233047847855085, "loss": 7.5323, "step": 1354100 }, { "epoch": 5.516713192631753, "grad_norm": 9.527266502380371, "learning_rate": 0.0003020975138968126, "loss": 7.5419, "step": 1354200 }, { "epoch": 5.517120570655135, "grad_norm": 5.125702381134033, "learning_rate": 0.00030186463640759177, "loss": 7.5371, "step": 1354300 }, { "epoch": 5.5175279486785165, "grad_norm": 11.493252754211426, "learning_rate": 0.0003016318460200952, "loss": 7.5557, "step": 1354400 }, { "epoch": 5.517935326701898, "grad_norm": 12.407708168029785, "learning_rate": 0.0003013991427435262, "loss": 7.4964, "step": 1354500 }, { "epoch": 5.5183427047252795, "grad_norm": 5.3618974685668945, "learning_rate": 0.00030116652658708455, "loss": 7.5349, "step": 1354600 }, { "epoch": 5.518750082748661, "grad_norm": 10.639982223510742, "learning_rate": 0.0003009339975599669, "loss": 7.5238, "step": 1354700 }, { "epoch": 5.519157460772043, "grad_norm": 6.850088596343994, "learning_rate": 0.00030070155567136646, "loss": 7.5272, "step": 1354800 }, { "epoch": 5.519564838795424, "grad_norm": 5.3476128578186035, "learning_rate": 0.00030046920093047304, "loss": 7.5304, "step": 1354900 }, { "epoch": 5.519972216818806, "grad_norm": 38.98796844482422, "learning_rate": 0.0003002369333464727, "loss": 7.5189, "step": 1355000 }, { "epoch": 5.519972216818806, "eval_MaskedAccuracy": 0.5131691850320736, "eval_loss": 1.588597297668457, "eval_runtime": 179.1814, "eval_samples_per_second": 354.255, "eval_steps_per_second": 1.384, "step": 1355000 }, { "epoch": 5.520379594842187, "grad_norm": 14.150798797607422, "learning_rate": 0.000300004752928548, "loss": 7.5311, "step": 1355100 }, { "epoch": 5.520786972865569, "grad_norm": 17.762676239013672, "learning_rate": 0.00029977265968587907, "loss": 7.5403, "step": 1355200 }, { "epoch": 5.521194350888949, "grad_norm": 11.598910331726074, "learning_rate": 0.00029954065362764154, "loss": 7.5492, "step": 1355300 }, { "epoch": 5.521601728912331, "grad_norm": 13.182674407958984, "learning_rate": 0.00029930873476300725, "loss": 7.5275, "step": 1355400 }, { "epoch": 5.5220091069357125, "grad_norm": 13.883099555969238, "learning_rate": 0.00029907690310114514, "loss": 7.5435, "step": 1355500 }, { "epoch": 5.522416484959094, "grad_norm": 8.441153526306152, "learning_rate": 0.0002988451586512213, "loss": 7.5111, "step": 1355600 }, { "epoch": 5.5228238629824755, "grad_norm": 11.882668495178223, "learning_rate": 0.000298613501422398, "loss": 7.5076, "step": 1355700 }, { "epoch": 5.523231241005857, "grad_norm": 25.98122215270996, "learning_rate": 0.00029838193142383375, "loss": 7.5322, "step": 1355800 }, { "epoch": 5.523638619029239, "grad_norm": 11.909917831420898, "learning_rate": 0.0002981504486646836, "loss": 7.5161, "step": 1355900 }, { "epoch": 5.52404599705262, "grad_norm": 13.028166770935059, "learning_rate": 0.00029791905315409826, "loss": 7.5113, "step": 1356000 }, { "epoch": 5.52404599705262, "eval_MaskedAccuracy": 0.5132002872127008, "eval_loss": 1.586454153060913, "eval_runtime": 156.1492, "eval_samples_per_second": 406.509, "eval_steps_per_second": 1.588, "step": 1356000 }, { "epoch": 5.524453375076002, "grad_norm": 5.624743938446045, "learning_rate": 0.0002976877449012282, "loss": 7.5335, "step": 1356100 }, { "epoch": 5.524860753099383, "grad_norm": 7.9186530113220215, "learning_rate": 0.00029745652391521653, "loss": 7.5268, "step": 1356200 }, { "epoch": 5.525268131122765, "grad_norm": 4.483052730560303, "learning_rate": 0.0002972253902052055, "loss": 7.5209, "step": 1356300 }, { "epoch": 5.525675509146145, "grad_norm": 4.488603591918945, "learning_rate": 0.0002969943437803331, "loss": 7.5226, "step": 1356400 }, { "epoch": 5.526082887169528, "grad_norm": 4.400208473205566, "learning_rate": 0.0002967633846497328, "loss": 7.5213, "step": 1356500 }, { "epoch": 5.526490265192908, "grad_norm": 3.501458168029785, "learning_rate": 0.00029653251282253694, "loss": 7.5381, "step": 1356600 }, { "epoch": 5.52689764321629, "grad_norm": 3.340562582015991, "learning_rate": 0.0002963017283078713, "loss": 7.5285, "step": 1356700 }, { "epoch": 5.5273050212396715, "grad_norm": 5.059596538543701, "learning_rate": 0.00029607103111486266, "loss": 7.5223, "step": 1356800 }, { "epoch": 5.527712399263053, "grad_norm": 3.181023120880127, "learning_rate": 0.00029584042125262974, "loss": 7.5276, "step": 1356900 }, { "epoch": 5.5281197772864346, "grad_norm": 15.770975112915039, "learning_rate": 0.00029560989873029077, "loss": 7.5245, "step": 1357000 }, { "epoch": 5.5281197772864346, "eval_MaskedAccuracy": 0.51299556931503, "eval_loss": 1.5948057174682617, "eval_runtime": 153.1605, "eval_samples_per_second": 414.441, "eval_steps_per_second": 1.619, "step": 1357000 }, { "epoch": 5.528527155309816, "grad_norm": 5.648419380187988, "learning_rate": 0.00029537946355695895, "loss": 7.5298, "step": 1357100 }, { "epoch": 5.528934533333198, "grad_norm": 27.536808013916016, "learning_rate": 0.0002951491157417453, "loss": 7.5066, "step": 1357200 }, { "epoch": 5.529341911356579, "grad_norm": 4.438647270202637, "learning_rate": 0.0002949188552937557, "loss": 7.5134, "step": 1357300 }, { "epoch": 5.529749289379961, "grad_norm": 10.924798965454102, "learning_rate": 0.00029468868222209423, "loss": 7.477, "step": 1357400 }, { "epoch": 5.530156667403342, "grad_norm": 3.520021438598633, "learning_rate": 0.0002944585965358611, "loss": 7.5538, "step": 1357500 }, { "epoch": 5.530564045426724, "grad_norm": 11.51445484161377, "learning_rate": 0.0002942285982441522, "loss": 7.533, "step": 1357600 }, { "epoch": 5.530971423450104, "grad_norm": 5.935987949371338, "learning_rate": 0.00029399868735606075, "loss": 7.5299, "step": 1357700 }, { "epoch": 5.531378801473486, "grad_norm": 9.059341430664062, "learning_rate": 0.00029376886388067644, "loss": 7.5327, "step": 1357800 }, { "epoch": 5.5317861794968675, "grad_norm": 17.713075637817383, "learning_rate": 0.0002935391278270857, "loss": 7.5326, "step": 1357900 }, { "epoch": 5.532193557520249, "grad_norm": 4.2288947105407715, "learning_rate": 0.00029330947920437074, "loss": 7.533, "step": 1358000 }, { "epoch": 5.532193557520249, "eval_MaskedAccuracy": 0.5129954861303083, "eval_loss": 1.5838136672973633, "eval_runtime": 156.0729, "eval_samples_per_second": 406.707, "eval_steps_per_second": 1.589, "step": 1358000 }, { "epoch": 5.5326009355436305, "grad_norm": 12.103240966796875, "learning_rate": 0.0002930799180216107, "loss": 7.5148, "step": 1358100 }, { "epoch": 5.533008313567012, "grad_norm": 3.613372325897217, "learning_rate": 0.00029285044428788186, "loss": 7.5113, "step": 1358200 }, { "epoch": 5.533415691590394, "grad_norm": 11.579863548278809, "learning_rate": 0.00029262105801225653, "loss": 7.5425, "step": 1358300 }, { "epoch": 5.533823069613775, "grad_norm": 5.354294300079346, "learning_rate": 0.00029239175920380353, "loss": 7.5302, "step": 1358400 }, { "epoch": 5.534230447637157, "grad_norm": 4.83122444152832, "learning_rate": 0.00029216254787158783, "loss": 7.5372, "step": 1358500 }, { "epoch": 5.534637825660538, "grad_norm": 19.561399459838867, "learning_rate": 0.0002919334240246718, "loss": 7.5163, "step": 1358600 }, { "epoch": 5.53504520368392, "grad_norm": 22.466936111450195, "learning_rate": 0.00029170438767211327, "loss": 7.54, "step": 1358700 }, { "epoch": 5.535452581707301, "grad_norm": 5.240617752075195, "learning_rate": 0.00029147543882296865, "loss": 7.5319, "step": 1358800 }, { "epoch": 5.535859959730683, "grad_norm": 5.821611404418945, "learning_rate": 0.0002912465774862884, "loss": 7.5175, "step": 1358900 }, { "epoch": 5.536267337754063, "grad_norm": 26.9658260345459, "learning_rate": 0.00029101780367112094, "loss": 7.5253, "step": 1359000 }, { "epoch": 5.536267337754063, "eval_MaskedAccuracy": 0.5132573493254089, "eval_loss": 1.5926933288574219, "eval_runtime": 163.301, "eval_samples_per_second": 388.706, "eval_steps_per_second": 1.519, "step": 1359000 }, { "epoch": 5.536674715777445, "grad_norm": 4.119058609008789, "learning_rate": 0.00029078911738651106, "loss": 7.5531, "step": 1359100 }, { "epoch": 5.5370820938008265, "grad_norm": 3.332658529281616, "learning_rate": 0.0002905605186414999, "loss": 7.5236, "step": 1359200 }, { "epoch": 5.537489471824208, "grad_norm": 8.222732543945312, "learning_rate": 0.00029033200744512473, "loss": 7.5292, "step": 1359300 }, { "epoch": 5.53789684984759, "grad_norm": 11.296479225158691, "learning_rate": 0.0002901035838064204, "loss": 7.5207, "step": 1359400 }, { "epoch": 5.538304227870971, "grad_norm": 12.594161033630371, "learning_rate": 0.00028987524773441744, "loss": 7.5036, "step": 1359500 }, { "epoch": 5.538711605894353, "grad_norm": 8.876920700073242, "learning_rate": 0.00028964699923814336, "loss": 7.5404, "step": 1359600 }, { "epoch": 5.539118983917734, "grad_norm": 4.410617828369141, "learning_rate": 0.00028941883832662177, "loss": 7.525, "step": 1359700 }, { "epoch": 5.539526361941116, "grad_norm": 6.2033538818359375, "learning_rate": 0.00028919076500887394, "loss": 7.5347, "step": 1359800 }, { "epoch": 5.539933739964497, "grad_norm": 9.682522773742676, "learning_rate": 0.0002889627792939154, "loss": 7.5422, "step": 1359900 }, { "epoch": 5.540341117987879, "grad_norm": 18.145429611206055, "learning_rate": 0.0002887348811907606, "loss": 7.5459, "step": 1360000 }, { "epoch": 5.540341117987879, "eval_MaskedAccuracy": 0.5131748225935678, "eval_loss": 1.59002685546875, "eval_runtime": 153.5122, "eval_samples_per_second": 413.492, "eval_steps_per_second": 1.616, "step": 1360000 }, { "epoch": 5.54074849601126, "grad_norm": 18.38419532775879, "learning_rate": 0.0002885070707084194, "loss": 7.5203, "step": 1360100 }, { "epoch": 5.541155874034642, "grad_norm": 14.297173500061035, "learning_rate": 0.00028827934785589807, "loss": 7.5336, "step": 1360200 }, { "epoch": 5.5415632520580225, "grad_norm": 6.6692633628845215, "learning_rate": 0.0002880517126422007, "loss": 7.5225, "step": 1360300 }, { "epoch": 5.541970630081404, "grad_norm": 11.438497543334961, "learning_rate": 0.0002878241650763257, "loss": 7.5433, "step": 1360400 }, { "epoch": 5.5423780081047855, "grad_norm": 3.0007681846618652, "learning_rate": 0.0002875967051672699, "loss": 7.5249, "step": 1360500 }, { "epoch": 5.542785386128167, "grad_norm": 4.182516098022461, "learning_rate": 0.00028736933292402515, "loss": 7.5555, "step": 1360600 }, { "epoch": 5.543192764151549, "grad_norm": 7.271773338317871, "learning_rate": 0.00028714204835558077, "loss": 7.4932, "step": 1360700 }, { "epoch": 5.54360014217493, "grad_norm": 9.270909309387207, "learning_rate": 0.00028691485147092435, "loss": 7.4946, "step": 1360800 }, { "epoch": 5.544007520198312, "grad_norm": 5.700077533721924, "learning_rate": 0.00028668774227903716, "loss": 7.5015, "step": 1360900 }, { "epoch": 5.544414898221693, "grad_norm": 30.443313598632812, "learning_rate": 0.0002864607207888979, "loss": 7.5364, "step": 1361000 }, { "epoch": 5.544414898221693, "eval_MaskedAccuracy": 0.5132442484916322, "eval_loss": 1.5818790197372437, "eval_runtime": 150.6778, "eval_samples_per_second": 421.27, "eval_steps_per_second": 1.646, "step": 1361000 }, { "epoch": 5.544822276245075, "grad_norm": 10.397770881652832, "learning_rate": 0.00028623378700948156, "loss": 7.4893, "step": 1361100 }, { "epoch": 5.545229654268456, "grad_norm": 7.334549427032471, "learning_rate": 0.00028600694094976076, "loss": 7.4905, "step": 1361200 }, { "epoch": 5.545637032291838, "grad_norm": 8.588781356811523, "learning_rate": 0.0002857801826187033, "loss": 7.4941, "step": 1361300 }, { "epoch": 5.546044410315218, "grad_norm": 16.37862777709961, "learning_rate": 0.0002855535120252745, "loss": 7.5173, "step": 1361400 }, { "epoch": 5.546451788338601, "grad_norm": 5.2927165031433105, "learning_rate": 0.0002853269291784358, "loss": 7.4858, "step": 1361500 }, { "epoch": 5.5468591663619815, "grad_norm": 3.8727829456329346, "learning_rate": 0.00028510043408714567, "loss": 7.519, "step": 1361600 }, { "epoch": 5.547266544385363, "grad_norm": 10.189196586608887, "learning_rate": 0.0002848740267603579, "loss": 7.5252, "step": 1361700 }, { "epoch": 5.547673922408745, "grad_norm": 7.4850029945373535, "learning_rate": 0.00028464770720702397, "loss": 7.5186, "step": 1361800 }, { "epoch": 5.548081300432126, "grad_norm": 5.111363887786865, "learning_rate": 0.0002844214754360912, "loss": 7.5465, "step": 1361900 }, { "epoch": 5.548488678455508, "grad_norm": 8.511821746826172, "learning_rate": 0.00028419533145650455, "loss": 7.5218, "step": 1362000 }, { "epoch": 5.548488678455508, "eval_MaskedAccuracy": 0.5126793560835217, "eval_loss": 1.594093680381775, "eval_runtime": 163.6107, "eval_samples_per_second": 387.97, "eval_steps_per_second": 1.516, "step": 1362000 }, { "epoch": 5.548896056478889, "grad_norm": 10.877079963684082, "learning_rate": 0.0002839692752772038, "loss": 7.545, "step": 1362100 }, { "epoch": 5.549303434502271, "grad_norm": 3.9173827171325684, "learning_rate": 0.00028374330690712654, "loss": 7.5129, "step": 1362200 }, { "epoch": 5.549710812525652, "grad_norm": 5.803036212921143, "learning_rate": 0.0002835174263552065, "loss": 7.5282, "step": 1362300 }, { "epoch": 5.550118190549034, "grad_norm": 4.023270606994629, "learning_rate": 0.0002832916336303739, "loss": 7.5377, "step": 1362400 }, { "epoch": 5.550525568572415, "grad_norm": 7.85737943649292, "learning_rate": 0.00028306592874155565, "loss": 7.524, "step": 1362500 }, { "epoch": 5.550932946595797, "grad_norm": 5.284960746765137, "learning_rate": 0.000282840311697675, "loss": 7.5251, "step": 1362600 }, { "epoch": 5.5513403246191775, "grad_norm": 7.164961338043213, "learning_rate": 0.0002826147825076513, "loss": 7.5379, "step": 1362700 }, { "epoch": 5.551747702642559, "grad_norm": 2.8433749675750732, "learning_rate": 0.0002823893411804015, "loss": 7.5438, "step": 1362800 }, { "epoch": 5.5521550806659405, "grad_norm": 4.6245341300964355, "learning_rate": 0.0002821639877248384, "loss": 7.5236, "step": 1362900 }, { "epoch": 5.552562458689322, "grad_norm": 6.554775238037109, "learning_rate": 0.0002819387221498716, "loss": 7.5347, "step": 1363000 }, { "epoch": 5.552562458689322, "eval_MaskedAccuracy": 0.5128569088679895, "eval_loss": 1.594756007194519, "eval_runtime": 186.1972, "eval_samples_per_second": 340.907, "eval_steps_per_second": 1.332, "step": 1363000 }, { "epoch": 5.552969836712704, "grad_norm": 4.0478668212890625, "learning_rate": 0.00028171354446440683, "loss": 7.5135, "step": 1363100 }, { "epoch": 5.553377214736085, "grad_norm": 29.875524520874023, "learning_rate": 0.0002814884546773468, "loss": 7.5379, "step": 1363200 }, { "epoch": 5.553784592759467, "grad_norm": 6.030183792114258, "learning_rate": 0.00028126345279759024, "loss": 7.5234, "step": 1363300 }, { "epoch": 5.554191970782848, "grad_norm": 8.036584854125977, "learning_rate": 0.00028103853883403273, "loss": 7.5175, "step": 1363400 }, { "epoch": 5.55459934880623, "grad_norm": 20.940343856811523, "learning_rate": 0.000280813712795566, "loss": 7.5369, "step": 1363500 }, { "epoch": 5.555006726829611, "grad_norm": 3.3316187858581543, "learning_rate": 0.00028058897469107955, "loss": 7.5088, "step": 1363600 }, { "epoch": 5.555414104852993, "grad_norm": 18.483346939086914, "learning_rate": 0.00028036432452945736, "loss": 7.5237, "step": 1363700 }, { "epoch": 5.555821482876374, "grad_norm": 7.354155540466309, "learning_rate": 0.00028013976231958153, "loss": 7.5373, "step": 1363800 }, { "epoch": 5.556228860899756, "grad_norm": 28.18296241760254, "learning_rate": 0.0002799152880703307, "loss": 7.5265, "step": 1363900 }, { "epoch": 5.5566362389231365, "grad_norm": 16.456531524658203, "learning_rate": 0.00027969090179057903, "loss": 7.5101, "step": 1364000 }, { "epoch": 5.5566362389231365, "eval_MaskedAccuracy": 0.5128969486337993, "eval_loss": 1.5997278690338135, "eval_runtime": 174.6059, "eval_samples_per_second": 363.539, "eval_steps_per_second": 1.42, "step": 1364000 }, { "epoch": 5.557043616946518, "grad_norm": 3.2516138553619385, "learning_rate": 0.00027946660348919793, "loss": 7.5338, "step": 1364100 }, { "epoch": 5.5574509949699, "grad_norm": 14.053657531738281, "learning_rate": 0.0002792423931750547, "loss": 7.5362, "step": 1364200 }, { "epoch": 5.557858372993281, "grad_norm": 15.374994277954102, "learning_rate": 0.0002790182708570141, "loss": 7.5311, "step": 1364300 }, { "epoch": 5.558265751016663, "grad_norm": 9.491127967834473, "learning_rate": 0.0002787942365439364, "loss": 7.4937, "step": 1364400 }, { "epoch": 5.558673129040044, "grad_norm": 16.13951873779297, "learning_rate": 0.0002785702902446793, "loss": 7.5448, "step": 1364500 }, { "epoch": 5.559080507063426, "grad_norm": 15.348613739013672, "learning_rate": 0.00027834643196809655, "loss": 7.5298, "step": 1364600 }, { "epoch": 5.559487885086807, "grad_norm": 5.019101619720459, "learning_rate": 0.00027812266172303737, "loss": 7.5188, "step": 1364700 }, { "epoch": 5.559895263110189, "grad_norm": 11.431594848632812, "learning_rate": 0.0002778989795183505, "loss": 7.529, "step": 1364800 }, { "epoch": 5.56030264113357, "grad_norm": 12.806009292602539, "learning_rate": 0.0002776753853628783, "loss": 7.4886, "step": 1364900 }, { "epoch": 5.560710019156952, "grad_norm": 20.750797271728516, "learning_rate": 0.0002774518792654607, "loss": 7.5241, "step": 1365000 }, { "epoch": 5.560710019156952, "eval_MaskedAccuracy": 0.5137996287967581, "eval_loss": 1.578283429145813, "eval_runtime": 172.6123, "eval_samples_per_second": 367.737, "eval_steps_per_second": 1.437, "step": 1365000 }, { "epoch": 5.561117397180333, "grad_norm": 10.64334774017334, "learning_rate": 0.0002772284612349343, "loss": 7.5076, "step": 1365100 }, { "epoch": 5.561524775203715, "grad_norm": 9.669614791870117, "learning_rate": 0.0002770051312801313, "loss": 7.5107, "step": 1365200 }, { "epoch": 5.5619321532270956, "grad_norm": 18.724956512451172, "learning_rate": 0.00027678188940988193, "loss": 7.5083, "step": 1365300 }, { "epoch": 5.562339531250477, "grad_norm": 4.32308292388916, "learning_rate": 0.00027655873563301164, "loss": 7.509, "step": 1365400 }, { "epoch": 5.562746909273859, "grad_norm": 23.26671028137207, "learning_rate": 0.00027633566995834325, "loss": 7.4798, "step": 1365500 }, { "epoch": 5.56315428729724, "grad_norm": 6.140720367431641, "learning_rate": 0.0002761126923946954, "loss": 7.5345, "step": 1365600 }, { "epoch": 5.563561665320622, "grad_norm": 8.759008407592773, "learning_rate": 0.00027588980295088403, "loss": 7.5419, "step": 1365700 }, { "epoch": 5.563969043344003, "grad_norm": 4.176877021789551, "learning_rate": 0.0002756670016357203, "loss": 7.515, "step": 1365800 }, { "epoch": 5.564376421367385, "grad_norm": 3.5711328983306885, "learning_rate": 0.000275444288458014, "loss": 7.5399, "step": 1365900 }, { "epoch": 5.564783799390766, "grad_norm": 14.740789413452148, "learning_rate": 0.00027522166342656935, "loss": 7.5484, "step": 1366000 }, { "epoch": 5.564783799390766, "eval_MaskedAccuracy": 0.5134230409177121, "eval_loss": 1.5808014869689941, "eval_runtime": 177.2087, "eval_samples_per_second": 358.199, "eval_steps_per_second": 1.399, "step": 1366000 }, { "epoch": 5.565191177414148, "grad_norm": 19.410436630249023, "learning_rate": 0.0002749991265501877, "loss": 7.5297, "step": 1366100 }, { "epoch": 5.565598555437529, "grad_norm": 12.808557510375977, "learning_rate": 0.0002747766778376676, "loss": 7.5098, "step": 1366200 }, { "epoch": 5.566005933460911, "grad_norm": 11.51230239868164, "learning_rate": 0.00027455431729780314, "loss": 7.5165, "step": 1366300 }, { "epoch": 5.5664133114842915, "grad_norm": 9.870705604553223, "learning_rate": 0.0002743320449393862, "loss": 7.5126, "step": 1366400 }, { "epoch": 5.566820689507674, "grad_norm": 10.526515007019043, "learning_rate": 0.00027410986077120434, "loss": 7.5364, "step": 1366500 }, { "epoch": 5.567228067531055, "grad_norm": 4.075404167175293, "learning_rate": 0.00027388776480204017, "loss": 7.5229, "step": 1366600 }, { "epoch": 5.567635445554436, "grad_norm": 6.086880207061768, "learning_rate": 0.0002736657570406758, "loss": 7.5175, "step": 1366700 }, { "epoch": 5.568042823577818, "grad_norm": 11.606575965881348, "learning_rate": 0.0002734438374958877, "loss": 7.5473, "step": 1366800 }, { "epoch": 5.568450201601199, "grad_norm": 6.996771335601807, "learning_rate": 0.00027322200617645035, "loss": 7.54, "step": 1366900 }, { "epoch": 5.568857579624581, "grad_norm": 8.935172080993652, "learning_rate": 0.00027300026309113375, "loss": 7.5155, "step": 1367000 }, { "epoch": 5.568857579624581, "eval_MaskedAccuracy": 0.5129643316707965, "eval_loss": 1.5887912511825562, "eval_runtime": 168.8896, "eval_samples_per_second": 375.843, "eval_steps_per_second": 1.468, "step": 1367000 }, { "epoch": 5.569264957647962, "grad_norm": 14.77796459197998, "learning_rate": 0.0002727786082487044, "loss": 7.5348, "step": 1367100 }, { "epoch": 5.569672335671344, "grad_norm": 5.776971817016602, "learning_rate": 0.0002725570416579253, "loss": 7.4868, "step": 1367200 }, { "epoch": 5.570079713694725, "grad_norm": 3.8741743564605713, "learning_rate": 0.0002723355633275565, "loss": 7.527, "step": 1367300 }, { "epoch": 5.570487091718107, "grad_norm": 17.044811248779297, "learning_rate": 0.00027211417326635396, "loss": 7.5317, "step": 1367400 }, { "epoch": 5.570894469741488, "grad_norm": 3.809858798980713, "learning_rate": 0.0002718928714830707, "loss": 7.5072, "step": 1367500 }, { "epoch": 5.57130184776487, "grad_norm": 16.063383102416992, "learning_rate": 0.0002716716579864553, "loss": 7.5465, "step": 1367600 }, { "epoch": 5.571709225788251, "grad_norm": 18.389799118041992, "learning_rate": 0.00027145053278525413, "loss": 7.5387, "step": 1367700 }, { "epoch": 5.572116603811632, "grad_norm": 2.9917666912078857, "learning_rate": 0.00027122949588820913, "loss": 7.5321, "step": 1367800 }, { "epoch": 5.572523981835014, "grad_norm": 4.532822608947754, "learning_rate": 0.000271008547304059, "loss": 7.5473, "step": 1367900 }, { "epoch": 5.572931359858395, "grad_norm": 6.056094169616699, "learning_rate": 0.00027078768704153915, "loss": 7.4983, "step": 1368000 }, { "epoch": 5.572931359858395, "eval_MaskedAccuracy": 0.5129696851753682, "eval_loss": 1.5889724493026733, "eval_runtime": 180.1956, "eval_samples_per_second": 352.262, "eval_steps_per_second": 1.376, "step": 1368000 }, { "epoch": 5.573338737881777, "grad_norm": 4.154176235198975, "learning_rate": 0.00027056691510938163, "loss": 7.547, "step": 1368100 }, { "epoch": 5.573746115905158, "grad_norm": 19.629858016967773, "learning_rate": 0.00027034623151631456, "loss": 7.5552, "step": 1368200 }, { "epoch": 5.57415349392854, "grad_norm": 6.804897308349609, "learning_rate": 0.00027012563627106304, "loss": 7.5076, "step": 1368300 }, { "epoch": 5.574560871951921, "grad_norm": 3.384577512741089, "learning_rate": 0.00026990512938234785, "loss": 7.5544, "step": 1368400 }, { "epoch": 5.574968249975303, "grad_norm": 9.741466522216797, "learning_rate": 0.00026968471085888707, "loss": 7.5361, "step": 1368500 }, { "epoch": 5.575375627998684, "grad_norm": 7.53199577331543, "learning_rate": 0.00026946438070939464, "loss": 7.5592, "step": 1368600 }, { "epoch": 5.575783006022066, "grad_norm": 5.295625686645508, "learning_rate": 0.00026924413894258245, "loss": 7.5428, "step": 1368700 }, { "epoch": 5.576190384045447, "grad_norm": 4.960668563842773, "learning_rate": 0.0002690239855671566, "loss": 7.5395, "step": 1368800 }, { "epoch": 5.576597762068829, "grad_norm": 3.418715000152588, "learning_rate": 0.0002688039205918214, "loss": 7.5282, "step": 1368900 }, { "epoch": 5.57700514009221, "grad_norm": 7.8770976066589355, "learning_rate": 0.00026858394402527736, "loss": 7.5509, "step": 1369000 }, { "epoch": 5.57700514009221, "eval_MaskedAccuracy": 0.5129721241031361, "eval_loss": 1.5839240550994873, "eval_runtime": 176.2656, "eval_samples_per_second": 360.116, "eval_steps_per_second": 1.407, "step": 1369000 }, { "epoch": 5.577412518115591, "grad_norm": 3.534111261367798, "learning_rate": 0.000268364055876221, "loss": 7.502, "step": 1369100 }, { "epoch": 5.577819896138973, "grad_norm": 5.81339168548584, "learning_rate": 0.0002681442561533461, "loss": 7.5105, "step": 1369200 }, { "epoch": 5.578227274162354, "grad_norm": 7.141969203948975, "learning_rate": 0.0002679245448653422, "loss": 7.5268, "step": 1369300 }, { "epoch": 5.578634652185736, "grad_norm": 5.7050018310546875, "learning_rate": 0.0002677049220208963, "loss": 7.5165, "step": 1369400 }, { "epoch": 5.579042030209117, "grad_norm": 3.8554022312164307, "learning_rate": 0.00026748538762869025, "loss": 7.5172, "step": 1369500 }, { "epoch": 5.579449408232499, "grad_norm": 5.558248996734619, "learning_rate": 0.00026726594169740374, "loss": 7.5197, "step": 1369600 }, { "epoch": 5.57985678625588, "grad_norm": 4.008612155914307, "learning_rate": 0.0002670465842357129, "loss": 7.5379, "step": 1369700 }, { "epoch": 5.580264164279262, "grad_norm": 2.9142096042633057, "learning_rate": 0.00026682731525229023, "loss": 7.5163, "step": 1369800 }, { "epoch": 5.580671542302643, "grad_norm": 12.786030769348145, "learning_rate": 0.00026660813475580427, "loss": 7.504, "step": 1369900 }, { "epoch": 5.581078920326025, "grad_norm": 23.525224685668945, "learning_rate": 0.0002663890427549205, "loss": 7.5243, "step": 1370000 }, { "epoch": 5.581078920326025, "eval_MaskedAccuracy": 0.5129621062762936, "eval_loss": 1.5875533819198608, "eval_runtime": 175.4683, "eval_samples_per_second": 361.752, "eval_steps_per_second": 1.413, "step": 1370000 }, { "epoch": 5.5814862983494065, "grad_norm": 4.683876037597656, "learning_rate": 0.000266170039258301, "loss": 7.5268, "step": 1370100 }, { "epoch": 5.581893676372788, "grad_norm": 7.082616806030273, "learning_rate": 0.00026595112427460363, "loss": 7.5114, "step": 1370200 }, { "epoch": 5.582301054396169, "grad_norm": 17.532716751098633, "learning_rate": 0.0002657322978124838, "loss": 7.5402, "step": 1370300 }, { "epoch": 5.58270843241955, "grad_norm": 7.782750129699707, "learning_rate": 0.00026551355988059283, "loss": 7.5323, "step": 1370400 }, { "epoch": 5.583115810442932, "grad_norm": 13.5018892288208, "learning_rate": 0.00026529491048757846, "loss": 7.5541, "step": 1370500 }, { "epoch": 5.583523188466313, "grad_norm": 13.190478324890137, "learning_rate": 0.0002650763496420836, "loss": 7.5248, "step": 1370600 }, { "epoch": 5.583930566489695, "grad_norm": 33.09577560424805, "learning_rate": 0.00026485787735275145, "loss": 7.5121, "step": 1370700 }, { "epoch": 5.584337944513076, "grad_norm": 12.808554649353027, "learning_rate": 0.00026463949362821924, "loss": 7.515, "step": 1370800 }, { "epoch": 5.584745322536458, "grad_norm": 15.17452621459961, "learning_rate": 0.00026442119847712014, "loss": 7.5279, "step": 1370900 }, { "epoch": 5.585152700559839, "grad_norm": 4.880710601806641, "learning_rate": 0.0002642029919080844, "loss": 7.5566, "step": 1371000 }, { "epoch": 5.585152700559839, "eval_MaskedAccuracy": 0.5137202102691227, "eval_loss": 1.5842161178588867, "eval_runtime": 159.5518, "eval_samples_per_second": 397.839, "eval_steps_per_second": 1.554, "step": 1371000 }, { "epoch": 5.585560078583221, "grad_norm": 4.810088157653809, "learning_rate": 0.0002639848739297388, "loss": 7.5377, "step": 1371100 }, { "epoch": 5.585967456606602, "grad_norm": 27.958778381347656, "learning_rate": 0.0002637668445507071, "loss": 7.539, "step": 1371200 }, { "epoch": 5.586374834629984, "grad_norm": 15.798528671264648, "learning_rate": 0.00026354890377960856, "loss": 7.5078, "step": 1371300 }, { "epoch": 5.586782212653365, "grad_norm": 23.275903701782227, "learning_rate": 0.0002633310516250602, "loss": 7.5168, "step": 1371400 }, { "epoch": 5.587189590676747, "grad_norm": 15.988101959228516, "learning_rate": 0.0002631132880956747, "loss": 7.4983, "step": 1371500 }, { "epoch": 5.587596968700128, "grad_norm": 11.174032211303711, "learning_rate": 0.00026289561320006117, "loss": 7.5152, "step": 1371600 }, { "epoch": 5.588004346723509, "grad_norm": 6.301385402679443, "learning_rate": 0.00026267802694682636, "loss": 7.5195, "step": 1371700 }, { "epoch": 5.588411724746891, "grad_norm": 16.60630989074707, "learning_rate": 0.00026246052934457154, "loss": 7.5452, "step": 1371800 }, { "epoch": 5.588819102770272, "grad_norm": 5.048940658569336, "learning_rate": 0.0002622431204018957, "loss": 7.5028, "step": 1371900 }, { "epoch": 5.589226480793654, "grad_norm": 12.170727729797363, "learning_rate": 0.00026202580012739456, "loss": 7.5008, "step": 1372000 }, { "epoch": 5.589226480793654, "eval_MaskedAccuracy": 0.513522222905867, "eval_loss": 1.5896488428115845, "eval_runtime": 194.5768, "eval_samples_per_second": 326.226, "eval_steps_per_second": 1.275, "step": 1372000 }, { "epoch": 5.589633858817035, "grad_norm": 6.673583030700684, "learning_rate": 0.0002618085685296599, "loss": 7.5521, "step": 1372100 }, { "epoch": 5.590041236840417, "grad_norm": 18.239015579223633, "learning_rate": 0.00026159142561728017, "loss": 7.5297, "step": 1372200 }, { "epoch": 5.590448614863798, "grad_norm": 4.859915733337402, "learning_rate": 0.0002613743713988397, "loss": 7.5231, "step": 1372300 }, { "epoch": 5.59085599288718, "grad_norm": 12.0966157913208, "learning_rate": 0.0002611574058829198, "loss": 7.554, "step": 1372400 }, { "epoch": 5.5912633709105615, "grad_norm": 19.52140235900879, "learning_rate": 0.0002609405290780991, "loss": 7.5323, "step": 1372500 }, { "epoch": 5.591670748933943, "grad_norm": 7.564475059509277, "learning_rate": 0.00026072374099295057, "loss": 7.5306, "step": 1372600 }, { "epoch": 5.592078126957324, "grad_norm": 10.323042869567871, "learning_rate": 0.00026050704163604627, "loss": 7.5441, "step": 1372700 }, { "epoch": 5.592485504980705, "grad_norm": 20.985734939575195, "learning_rate": 0.0002602904310159531, "loss": 7.5266, "step": 1372800 }, { "epoch": 5.592892883004087, "grad_norm": 6.601797580718994, "learning_rate": 0.00026007390914123434, "loss": 7.5029, "step": 1372900 }, { "epoch": 5.593300261027468, "grad_norm": 3.467031955718994, "learning_rate": 0.00025985747602045114, "loss": 7.5091, "step": 1373000 }, { "epoch": 5.593300261027468, "eval_MaskedAccuracy": 0.51330253407165, "eval_loss": 1.5910897254943848, "eval_runtime": 180.8446, "eval_samples_per_second": 350.998, "eval_steps_per_second": 1.371, "step": 1373000 }, { "epoch": 5.59370763905085, "grad_norm": 11.178659439086914, "learning_rate": 0.0002596411316621598, "loss": 7.53, "step": 1373100 }, { "epoch": 5.594115017074231, "grad_norm": 4.633257865905762, "learning_rate": 0.00025942487607491353, "loss": 7.5179, "step": 1373200 }, { "epoch": 5.594522395097613, "grad_norm": 23.054452896118164, "learning_rate": 0.0002592087092672618, "loss": 7.5005, "step": 1373300 }, { "epoch": 5.594929773120994, "grad_norm": 9.557489395141602, "learning_rate": 0.000258992631247751, "loss": 7.5183, "step": 1373400 }, { "epoch": 5.595337151144376, "grad_norm": 16.355045318603516, "learning_rate": 0.00025877664202492433, "loss": 7.5329, "step": 1373500 }, { "epoch": 5.595744529167757, "grad_norm": 17.38948631286621, "learning_rate": 0.00025856074160732017, "loss": 7.4872, "step": 1373600 }, { "epoch": 5.596151907191139, "grad_norm": 8.638805389404297, "learning_rate": 0.00025834493000347487, "loss": 7.5224, "step": 1373700 }, { "epoch": 5.5965592852145205, "grad_norm": 12.201508522033691, "learning_rate": 0.0002581292072219205, "loss": 7.5543, "step": 1373800 }, { "epoch": 5.596966663237902, "grad_norm": 4.477809906005859, "learning_rate": 0.0002579135732711859, "loss": 7.5445, "step": 1373900 }, { "epoch": 5.597374041261283, "grad_norm": 3.688843011856079, "learning_rate": 0.00025769802815979595, "loss": 7.5382, "step": 1374000 }, { "epoch": 5.597374041261283, "eval_MaskedAccuracy": 0.5132077913466938, "eval_loss": 1.5854042768478394, "eval_runtime": 157.7601, "eval_samples_per_second": 402.358, "eval_steps_per_second": 1.572, "step": 1374000 }, { "epoch": 5.597781419284664, "grad_norm": 5.155825138092041, "learning_rate": 0.00025748257189627243, "loss": 7.5234, "step": 1374100 }, { "epoch": 5.598188797308046, "grad_norm": 17.257415771484375, "learning_rate": 0.00025726720448913324, "loss": 7.508, "step": 1374200 }, { "epoch": 5.598596175331427, "grad_norm": 6.936742305755615, "learning_rate": 0.0002570519259468934, "loss": 7.5381, "step": 1374300 }, { "epoch": 5.599003553354809, "grad_norm": 7.681789398193359, "learning_rate": 0.0002568367362780638, "loss": 7.5254, "step": 1374400 }, { "epoch": 5.59941093137819, "grad_norm": 10.017396926879883, "learning_rate": 0.00025662163549115146, "loss": 7.5129, "step": 1374500 }, { "epoch": 5.599818309401572, "grad_norm": 3.6523165702819824, "learning_rate": 0.0002564066235946601, "loss": 7.5331, "step": 1374600 }, { "epoch": 5.600225687424953, "grad_norm": 12.0110502243042, "learning_rate": 0.0002561917005970929, "loss": 7.5079, "step": 1374700 }, { "epoch": 5.600633065448335, "grad_norm": 2.950870990753174, "learning_rate": 0.0002559768665069448, "loss": 7.5354, "step": 1374800 }, { "epoch": 5.6010404434717165, "grad_norm": 9.420726776123047, "learning_rate": 0.0002557621213327099, "loss": 7.5336, "step": 1374900 }, { "epoch": 5.601447821495098, "grad_norm": 3.371947765350342, "learning_rate": 0.00025554746508287847, "loss": 7.5341, "step": 1375000 }, { "epoch": 5.601447821495098, "eval_MaskedAccuracy": 0.5132219491246219, "eval_loss": 1.587613821029663, "eval_runtime": 157.1488, "eval_samples_per_second": 403.923, "eval_steps_per_second": 1.578, "step": 1375000 }, { "epoch": 5.6018551995184795, "grad_norm": 7.334743022918701, "learning_rate": 0.00025533289776593645, "loss": 7.5165, "step": 1375100 }, { "epoch": 5.602262577541861, "grad_norm": 12.500086784362793, "learning_rate": 0.0002551184193903668, "loss": 7.4944, "step": 1375200 }, { "epoch": 5.602669955565242, "grad_norm": 10.016807556152344, "learning_rate": 0.0002549040299646498, "loss": 7.5259, "step": 1375300 }, { "epoch": 5.603077333588623, "grad_norm": 5.238979339599609, "learning_rate": 0.00025468972949726045, "loss": 7.5177, "step": 1375400 }, { "epoch": 5.603484711612005, "grad_norm": 2.421661376953125, "learning_rate": 0.00025447551799667186, "loss": 7.549, "step": 1375500 }, { "epoch": 5.603892089635386, "grad_norm": 10.061813354492188, "learning_rate": 0.0002542613954713525, "loss": 7.5242, "step": 1375600 }, { "epoch": 5.604299467658768, "grad_norm": 7.082102298736572, "learning_rate": 0.0002540473619297686, "loss": 7.5315, "step": 1375700 }, { "epoch": 5.604706845682149, "grad_norm": 4.16160774230957, "learning_rate": 0.0002538334173803819, "loss": 7.515, "step": 1375800 }, { "epoch": 5.605114223705531, "grad_norm": 3.8698904514312744, "learning_rate": 0.0002536195618316509, "loss": 7.5255, "step": 1375900 }, { "epoch": 5.6055216017289125, "grad_norm": 3.6110451221466064, "learning_rate": 0.0002534057952920291, "loss": 7.5211, "step": 1376000 }, { "epoch": 5.6055216017289125, "eval_MaskedAccuracy": 0.5130806544962152, "eval_loss": 1.5900852680206299, "eval_runtime": 433.231, "eval_samples_per_second": 146.518, "eval_steps_per_second": 0.572, "step": 1376000 }, { "epoch": 5.605928979752294, "grad_norm": 9.983222007751465, "learning_rate": 0.0002531921177699691, "loss": 7.538, "step": 1376100 }, { "epoch": 5.6063363577756755, "grad_norm": 2.765918016433716, "learning_rate": 0.0002529785292739179, "loss": 7.5385, "step": 1376200 }, { "epoch": 5.606743735799057, "grad_norm": 15.373408317565918, "learning_rate": 0.0002527650298123204, "loss": 7.5486, "step": 1376300 }, { "epoch": 5.607151113822438, "grad_norm": 5.65013313293457, "learning_rate": 0.00025255161939361686, "loss": 7.514, "step": 1376400 }, { "epoch": 5.60755849184582, "grad_norm": 12.013772964477539, "learning_rate": 0.00025233829802624394, "loss": 7.5298, "step": 1376500 }, { "epoch": 5.607965869869201, "grad_norm": 4.139561176300049, "learning_rate": 0.0002521250657186372, "loss": 7.5365, "step": 1376600 }, { "epoch": 5.608373247892582, "grad_norm": 3.9420621395111084, "learning_rate": 0.00025191192247922594, "loss": 7.5278, "step": 1376700 }, { "epoch": 5.608780625915964, "grad_norm": 4.168811798095703, "learning_rate": 0.0002516988683164364, "loss": 7.5323, "step": 1376800 }, { "epoch": 5.609188003939345, "grad_norm": 3.7715203762054443, "learning_rate": 0.00025148590323869214, "loss": 7.5225, "step": 1376900 }, { "epoch": 5.609595381962727, "grad_norm": 8.349024772644043, "learning_rate": 0.00025127302725441313, "loss": 7.5179, "step": 1377000 }, { "epoch": 5.609595381962727, "eval_MaskedAccuracy": 0.513069277780445, "eval_loss": 1.5956621170043945, "eval_runtime": 193.6876, "eval_samples_per_second": 327.724, "eval_steps_per_second": 1.28, "step": 1377000 }, { "epoch": 5.610002759986108, "grad_norm": 4.570725917816162, "learning_rate": 0.0002510602403720149, "loss": 7.5396, "step": 1377100 }, { "epoch": 5.61041013800949, "grad_norm": 3.7946763038635254, "learning_rate": 0.00025084754259991005, "loss": 7.5329, "step": 1377200 }, { "epoch": 5.6108175160328715, "grad_norm": 30.738468170166016, "learning_rate": 0.000250634933946508, "loss": 7.5269, "step": 1377300 }, { "epoch": 5.611224894056253, "grad_norm": 7.892461776733398, "learning_rate": 0.0002504224144202143, "loss": 7.5212, "step": 1377400 }, { "epoch": 5.6116322720796346, "grad_norm": 17.25091552734375, "learning_rate": 0.00025020998402943095, "loss": 7.5222, "step": 1377500 }, { "epoch": 5.612039650103016, "grad_norm": 14.010287284851074, "learning_rate": 0.0002499976427825564, "loss": 7.5463, "step": 1377600 }, { "epoch": 5.612447028126397, "grad_norm": 15.319863319396973, "learning_rate": 0.00024978539068798596, "loss": 7.5458, "step": 1377700 }, { "epoch": 5.612854406149778, "grad_norm": 6.391556262969971, "learning_rate": 0.0002495732277541104, "loss": 7.5139, "step": 1377800 }, { "epoch": 5.61326178417316, "grad_norm": 11.232641220092773, "learning_rate": 0.00024936115398931764, "loss": 7.5126, "step": 1377900 }, { "epoch": 5.613669162196541, "grad_norm": 13.635534286499023, "learning_rate": 0.0002491491694019927, "loss": 7.5191, "step": 1378000 }, { "epoch": 5.613669162196541, "eval_MaskedAccuracy": 0.5130278719767443, "eval_loss": 1.5895746946334839, "eval_runtime": 189.5981, "eval_samples_per_second": 334.792, "eval_steps_per_second": 1.308, "step": 1378000 }, { "epoch": 5.614076540219923, "grad_norm": 11.969842910766602, "learning_rate": 0.00024893727400051603, "loss": 7.5269, "step": 1378100 }, { "epoch": 5.614483918243304, "grad_norm": 14.331548690795898, "learning_rate": 0.00024872546779326516, "loss": 7.5179, "step": 1378200 }, { "epoch": 5.614891296266686, "grad_norm": 17.87909698486328, "learning_rate": 0.00024851375078861445, "loss": 7.5224, "step": 1378300 }, { "epoch": 5.6152986742900675, "grad_norm": 7.2406907081604, "learning_rate": 0.00024830212299493324, "loss": 7.4959, "step": 1378400 }, { "epoch": 5.615706052313449, "grad_norm": 18.738910675048828, "learning_rate": 0.0002480905844205876, "loss": 7.5388, "step": 1378500 }, { "epoch": 5.6161134303368305, "grad_norm": 7.424437046051025, "learning_rate": 0.00024787913507394357, "loss": 7.5473, "step": 1378600 }, { "epoch": 5.616520808360212, "grad_norm": 9.2700777053833, "learning_rate": 0.00024766777496335876, "loss": 7.5144, "step": 1378700 }, { "epoch": 5.616928186383594, "grad_norm": 9.531607627868652, "learning_rate": 0.0002474565040971902, "loss": 7.5488, "step": 1378800 }, { "epoch": 5.617335564406975, "grad_norm": 3.692396879196167, "learning_rate": 0.00024724532248379005, "loss": 7.5224, "step": 1378900 }, { "epoch": 5.617742942430356, "grad_norm": 4.1397271156311035, "learning_rate": 0.00024703423013150796, "loss": 7.5122, "step": 1379000 }, { "epoch": 5.617742942430356, "eval_MaskedAccuracy": 0.5126794755519473, "eval_loss": 1.5880870819091797, "eval_runtime": 165.4931, "eval_samples_per_second": 383.557, "eval_steps_per_second": 1.499, "step": 1379000 }, { "epoch": 5.618150320453737, "grad_norm": 7.830399990081787, "learning_rate": 0.00024682322704868917, "loss": 7.5619, "step": 1379100 }, { "epoch": 5.618557698477119, "grad_norm": 6.649756908416748, "learning_rate": 0.00024661231324367536, "loss": 7.545, "step": 1379200 }, { "epoch": 5.6189650765005, "grad_norm": 8.903409957885742, "learning_rate": 0.00024640148872480557, "loss": 7.5419, "step": 1379300 }, { "epoch": 5.619372454523882, "grad_norm": 19.530412673950195, "learning_rate": 0.00024619075350041487, "loss": 7.5328, "step": 1379400 }, { "epoch": 5.619779832547263, "grad_norm": 3.8053641319274902, "learning_rate": 0.0002459801075788348, "loss": 7.5338, "step": 1379500 }, { "epoch": 5.620187210570645, "grad_norm": 2.8859376907348633, "learning_rate": 0.000245769550968393, "loss": 7.4906, "step": 1379600 }, { "epoch": 5.6205945885940265, "grad_norm": 7.343949317932129, "learning_rate": 0.00024555908367741413, "loss": 7.513, "step": 1379700 }, { "epoch": 5.621001966617408, "grad_norm": 9.09073257446289, "learning_rate": 0.000245348705714219, "loss": 7.517, "step": 1379800 }, { "epoch": 5.62140934464079, "grad_norm": 9.085687637329102, "learning_rate": 0.00024513841708712514, "loss": 7.5531, "step": 1379900 }, { "epoch": 5.621816722664171, "grad_norm": 6.09578800201416, "learning_rate": 0.00024492821780444613, "loss": 7.5081, "step": 1380000 }, { "epoch": 5.621816722664171, "eval_MaskedAccuracy": 0.5126112671805607, "eval_loss": 1.5826091766357422, "eval_runtime": 156.9029, "eval_samples_per_second": 404.556, "eval_steps_per_second": 1.581, "step": 1380000 }, { "epoch": 5.622224100687553, "grad_norm": 5.202302932739258, "learning_rate": 0.0002447181078744922, "loss": 7.5039, "step": 1380100 }, { "epoch": 5.622631478710934, "grad_norm": 6.4186482429504395, "learning_rate": 0.0002445080873055705, "loss": 7.5086, "step": 1380200 }, { "epoch": 5.623038856734315, "grad_norm": 6.249260425567627, "learning_rate": 0.0002442981561059843, "loss": 7.5084, "step": 1380300 }, { "epoch": 5.623446234757696, "grad_norm": 25.378747940063477, "learning_rate": 0.00024408831428403244, "loss": 7.4975, "step": 1380400 }, { "epoch": 5.623853612781078, "grad_norm": 6.305699825286865, "learning_rate": 0.00024387856184801042, "loss": 7.5652, "step": 1380500 }, { "epoch": 5.624260990804459, "grad_norm": 16.556028366088867, "learning_rate": 0.0002436688988062125, "loss": 7.5275, "step": 1380600 }, { "epoch": 5.624668368827841, "grad_norm": 8.81778335571289, "learning_rate": 0.00024345932516692744, "loss": 7.5607, "step": 1380700 }, { "epoch": 5.6250757468512225, "grad_norm": 16.383466720581055, "learning_rate": 0.00024324984093844108, "loss": 7.5306, "step": 1380800 }, { "epoch": 5.625483124874604, "grad_norm": 16.238239288330078, "learning_rate": 0.00024304044612903487, "loss": 7.5024, "step": 1380900 }, { "epoch": 5.6258905028979855, "grad_norm": 9.679112434387207, "learning_rate": 0.00024283114074698773, "loss": 7.5448, "step": 1381000 }, { "epoch": 5.6258905028979855, "eval_MaskedAccuracy": 0.5131659569757718, "eval_loss": 1.5812820196151733, "eval_runtime": 158.3051, "eval_samples_per_second": 400.973, "eval_steps_per_second": 1.567, "step": 1381000 }, { "epoch": 5.626297880921367, "grad_norm": 7.569660186767578, "learning_rate": 0.0002426219248005747, "loss": 7.5287, "step": 1381100 }, { "epoch": 5.626705258944749, "grad_norm": 24.3963680267334, "learning_rate": 0.00024241279829806685, "loss": 7.5214, "step": 1381200 }, { "epoch": 5.62711263696813, "grad_norm": 14.51697063446045, "learning_rate": 0.00024220376124773225, "loss": 7.5592, "step": 1381300 }, { "epoch": 5.627520014991511, "grad_norm": 6.675385475158691, "learning_rate": 0.00024199481365783535, "loss": 7.5481, "step": 1381400 }, { "epoch": 5.627927393014893, "grad_norm": 7.06484842300415, "learning_rate": 0.00024178595553663704, "loss": 7.4988, "step": 1381500 }, { "epoch": 5.628334771038274, "grad_norm": 8.65516185760498, "learning_rate": 0.0002415771868923942, "loss": 7.5275, "step": 1381600 }, { "epoch": 5.628742149061655, "grad_norm": 10.394768714904785, "learning_rate": 0.0002413685077333604, "loss": 7.5179, "step": 1381700 }, { "epoch": 5.629149527085037, "grad_norm": 24.104524612426758, "learning_rate": 0.00024115991806778638, "loss": 7.5205, "step": 1381800 }, { "epoch": 5.629556905108418, "grad_norm": 19.68671417236328, "learning_rate": 0.00024095141790391903, "loss": 7.5044, "step": 1381900 }, { "epoch": 5.6299642831318, "grad_norm": 8.048469543457031, "learning_rate": 0.00024074300725000108, "loss": 7.514, "step": 1382000 }, { "epoch": 5.6299642831318, "eval_MaskedAccuracy": 0.5139233389983908, "eval_loss": 1.5746879577636719, "eval_runtime": 157.8807, "eval_samples_per_second": 402.05, "eval_steps_per_second": 1.571, "step": 1382000 }, { "epoch": 5.6303716611551815, "grad_norm": 3.5602033138275146, "learning_rate": 0.00024053468611427194, "loss": 7.5481, "step": 1382100 }, { "epoch": 5.630779039178563, "grad_norm": 5.874471664428711, "learning_rate": 0.00024032645450496788, "loss": 7.5237, "step": 1382200 }, { "epoch": 5.631186417201945, "grad_norm": 17.948442459106445, "learning_rate": 0.00024011831243032125, "loss": 7.5058, "step": 1382300 }, { "epoch": 5.631593795225326, "grad_norm": 3.3917994499206543, "learning_rate": 0.0002399102598985611, "loss": 7.497, "step": 1382400 }, { "epoch": 5.632001173248708, "grad_norm": 4.5032830238342285, "learning_rate": 0.00023970229691791284, "loss": 7.5504, "step": 1382500 }, { "epoch": 5.632408551272089, "grad_norm": 13.444269180297852, "learning_rate": 0.00023949442349659885, "loss": 7.5114, "step": 1382600 }, { "epoch": 5.63281592929547, "grad_norm": 22.66585922241211, "learning_rate": 0.00023928663964283714, "loss": 7.534, "step": 1382700 }, { "epoch": 5.633223307318851, "grad_norm": 20.383052825927734, "learning_rate": 0.00023907894536484236, "loss": 7.548, "step": 1382800 }, { "epoch": 5.633630685342233, "grad_norm": 2.88193416595459, "learning_rate": 0.00023887134067082606, "loss": 7.5167, "step": 1382900 }, { "epoch": 5.634038063365614, "grad_norm": 13.426657676696777, "learning_rate": 0.00023866382556899545, "loss": 7.5217, "step": 1383000 }, { "epoch": 5.634038063365614, "eval_MaskedAccuracy": 0.5131976539834424, "eval_loss": 1.5839974880218506, "eval_runtime": 445.6287, "eval_samples_per_second": 142.441, "eval_steps_per_second": 0.557, "step": 1383000 }, { "epoch": 5.634445441388996, "grad_norm": 2.9683921337127686, "learning_rate": 0.00023845640006755502, "loss": 7.52, "step": 1383100 }, { "epoch": 5.6348528194123775, "grad_norm": 13.596526145935059, "learning_rate": 0.0002382490641747058, "loss": 7.5181, "step": 1383200 }, { "epoch": 5.635260197435759, "grad_norm": 17.86842155456543, "learning_rate": 0.00023804181789864416, "loss": 7.5123, "step": 1383300 }, { "epoch": 5.6356675754591405, "grad_norm": 3.347900867462158, "learning_rate": 0.00023783466124756416, "loss": 7.5022, "step": 1383400 }, { "epoch": 5.636074953482522, "grad_norm": 13.34920883178711, "learning_rate": 0.00023762759422965576, "loss": 7.4968, "step": 1383500 }, { "epoch": 5.636482331505904, "grad_norm": 3.989408493041992, "learning_rate": 0.00023742061685310518, "loss": 7.5358, "step": 1383600 }, { "epoch": 5.636889709529285, "grad_norm": 7.5562028884887695, "learning_rate": 0.0002372137291260952, "loss": 7.5279, "step": 1383700 }, { "epoch": 5.637297087552667, "grad_norm": 26.748794555664062, "learning_rate": 0.00023700693105680555, "loss": 7.5495, "step": 1383800 }, { "epoch": 5.637704465576048, "grad_norm": 20.54895782470703, "learning_rate": 0.00023680022265341207, "loss": 7.5356, "step": 1383900 }, { "epoch": 5.638111843599429, "grad_norm": 13.316396713256836, "learning_rate": 0.00023659360392408734, "loss": 7.5386, "step": 1384000 }, { "epoch": 5.638111843599429, "eval_MaskedAccuracy": 0.5128924797401614, "eval_loss": 1.5910016298294067, "eval_runtime": 154.0194, "eval_samples_per_second": 412.13, "eval_steps_per_second": 1.61, "step": 1384000 }, { "epoch": 5.63851922162281, "grad_norm": 15.915701866149902, "learning_rate": 0.0002363870748769995, "loss": 7.5486, "step": 1384100 }, { "epoch": 5.638926599646192, "grad_norm": 9.00930404663086, "learning_rate": 0.0002361806355203137, "loss": 7.5278, "step": 1384200 }, { "epoch": 5.6393339776695734, "grad_norm": 3.5040266513824463, "learning_rate": 0.00023597428586219238, "loss": 7.4803, "step": 1384300 }, { "epoch": 5.639741355692955, "grad_norm": 15.426980972290039, "learning_rate": 0.00023576802591079282, "loss": 7.5592, "step": 1384400 }, { "epoch": 5.6401487337163365, "grad_norm": 3.2443149089813232, "learning_rate": 0.00023556185567426936, "loss": 7.5452, "step": 1384500 }, { "epoch": 5.640556111739718, "grad_norm": 12.899211883544922, "learning_rate": 0.00023535577516077416, "loss": 7.5314, "step": 1384600 }, { "epoch": 5.6409634897631, "grad_norm": 18.656158447265625, "learning_rate": 0.00023514978437845372, "loss": 7.5093, "step": 1384700 }, { "epoch": 5.641370867786481, "grad_norm": 7.391315937042236, "learning_rate": 0.0002349438833354528, "loss": 7.5285, "step": 1384800 }, { "epoch": 5.641778245809863, "grad_norm": 20.103830337524414, "learning_rate": 0.00023473807203991124, "loss": 7.5072, "step": 1384900 }, { "epoch": 5.642185623833244, "grad_norm": 4.780278205871582, "learning_rate": 0.0002345323504999661, "loss": 7.5332, "step": 1385000 }, { "epoch": 5.642185623833244, "eval_MaskedAccuracy": 0.5127545823469942, "eval_loss": 1.5882014036178589, "eval_runtime": 166.3087, "eval_samples_per_second": 381.676, "eval_steps_per_second": 1.491, "step": 1385000 }, { "epoch": 5.642593001856626, "grad_norm": 13.922639846801758, "learning_rate": 0.00023432671872374995, "loss": 7.5407, "step": 1385100 }, { "epoch": 5.643000379880007, "grad_norm": 5.8329010009765625, "learning_rate": 0.00023412117671939335, "loss": 7.5332, "step": 1385200 }, { "epoch": 5.643407757903388, "grad_norm": 6.772289276123047, "learning_rate": 0.00023391572449502207, "loss": 7.5326, "step": 1385300 }, { "epoch": 5.643815135926769, "grad_norm": 7.5275115966796875, "learning_rate": 0.00023371036205875896, "loss": 7.5267, "step": 1385400 }, { "epoch": 5.644222513950151, "grad_norm": 8.430474281311035, "learning_rate": 0.00023350508941872292, "loss": 7.527, "step": 1385500 }, { "epoch": 5.6446298919735325, "grad_norm": 5.418252944946289, "learning_rate": 0.00023329990658302953, "loss": 7.5444, "step": 1385600 }, { "epoch": 5.645037269996914, "grad_norm": 9.187230110168457, "learning_rate": 0.0002330948135597908, "loss": 7.5594, "step": 1385700 }, { "epoch": 5.6454446480202956, "grad_norm": 5.811594009399414, "learning_rate": 0.00023288981035711476, "loss": 7.5045, "step": 1385800 }, { "epoch": 5.645852026043677, "grad_norm": 9.740527153015137, "learning_rate": 0.00023268489698310686, "loss": 7.5262, "step": 1385900 }, { "epoch": 5.646259404067059, "grad_norm": 10.950613975524902, "learning_rate": 0.00023248007344586785, "loss": 7.5199, "step": 1386000 }, { "epoch": 5.646259404067059, "eval_MaskedAccuracy": 0.5131919074783816, "eval_loss": 1.582820177078247, "eval_runtime": 154.2318, "eval_samples_per_second": 411.562, "eval_steps_per_second": 1.608, "step": 1386000 }, { "epoch": 5.64666678209044, "grad_norm": 15.783064842224121, "learning_rate": 0.00023227533975349614, "loss": 7.5395, "step": 1386100 }, { "epoch": 5.647074160113822, "grad_norm": 15.486825942993164, "learning_rate": 0.00023207069591408527, "loss": 7.5392, "step": 1386200 }, { "epoch": 5.647481538137203, "grad_norm": 13.45315933227539, "learning_rate": 0.00023186614193572652, "loss": 7.527, "step": 1386300 }, { "epoch": 5.647888916160584, "grad_norm": 22.64778709411621, "learning_rate": 0.00023166167782650706, "loss": 7.5207, "step": 1386400 }, { "epoch": 5.648296294183966, "grad_norm": 5.565525531768799, "learning_rate": 0.00023145730359450939, "loss": 7.5063, "step": 1386500 }, { "epoch": 5.648703672207347, "grad_norm": 11.855425834655762, "learning_rate": 0.00023125301924781488, "loss": 7.5222, "step": 1386600 }, { "epoch": 5.6491110502307285, "grad_norm": 6.1692304611206055, "learning_rate": 0.000231048824794499, "loss": 7.5396, "step": 1386700 }, { "epoch": 5.64951842825411, "grad_norm": 14.76517105102539, "learning_rate": 0.00023084472024263516, "loss": 7.5008, "step": 1386800 }, { "epoch": 5.6499258062774915, "grad_norm": 8.494549751281738, "learning_rate": 0.00023064070560029277, "loss": 7.555, "step": 1386900 }, { "epoch": 5.650333184300873, "grad_norm": 4.44266939163208, "learning_rate": 0.00023043678087553737, "loss": 7.5118, "step": 1387000 }, { "epoch": 5.650333184300873, "eval_MaskedAccuracy": 0.5126597984216629, "eval_loss": 1.5913329124450684, "eval_runtime": 161.6774, "eval_samples_per_second": 392.609, "eval_steps_per_second": 1.534, "step": 1387000 }, { "epoch": 5.650740562324255, "grad_norm": 3.5420591831207275, "learning_rate": 0.0002302329460764315, "loss": 7.5164, "step": 1387100 }, { "epoch": 5.651147940347636, "grad_norm": 3.3939616680145264, "learning_rate": 0.00023002920121103343, "loss": 7.4813, "step": 1387200 }, { "epoch": 5.651555318371018, "grad_norm": 17.22228240966797, "learning_rate": 0.0002298255462873982, "loss": 7.5138, "step": 1387300 }, { "epoch": 5.651962696394399, "grad_norm": 7.133488655090332, "learning_rate": 0.0002296219813135776, "loss": 7.5223, "step": 1387400 }, { "epoch": 5.652370074417781, "grad_norm": 10.650275230407715, "learning_rate": 0.00022941850629761977, "loss": 7.5269, "step": 1387500 }, { "epoch": 5.652777452441162, "grad_norm": 9.090660095214844, "learning_rate": 0.00022921512124756893, "loss": 7.512, "step": 1387600 }, { "epoch": 5.653184830464543, "grad_norm": 5.821437835693359, "learning_rate": 0.0002290118261714661, "loss": 7.5196, "step": 1387700 }, { "epoch": 5.653592208487924, "grad_norm": 5.73036527633667, "learning_rate": 0.00022880862107734873, "loss": 7.5401, "step": 1387800 }, { "epoch": 5.653999586511306, "grad_norm": 16.467464447021484, "learning_rate": 0.0002286055059732504, "loss": 7.518, "step": 1387900 }, { "epoch": 5.6544069645346875, "grad_norm": 3.790493965148926, "learning_rate": 0.00022840248086720166, "loss": 7.53, "step": 1388000 }, { "epoch": 5.6544069645346875, "eval_MaskedAccuracy": 0.5129876134496691, "eval_loss": 1.5923545360565186, "eval_runtime": 159.6839, "eval_samples_per_second": 397.51, "eval_steps_per_second": 1.553, "step": 1388000 }, { "epoch": 5.654814342558069, "grad_norm": 7.764084339141846, "learning_rate": 0.00022819954576722883, "loss": 7.5189, "step": 1388100 }, { "epoch": 5.655221720581451, "grad_norm": 5.0881476402282715, "learning_rate": 0.00022799670068135514, "loss": 7.5257, "step": 1388200 }, { "epoch": 5.655629098604832, "grad_norm": 11.76297664642334, "learning_rate": 0.0002277939456176003, "loss": 7.5083, "step": 1388300 }, { "epoch": 5.656036476628214, "grad_norm": 7.922427654266357, "learning_rate": 0.00022759128058397906, "loss": 7.5557, "step": 1388400 }, { "epoch": 5.656443854651595, "grad_norm": 10.830290794372559, "learning_rate": 0.00022738870558850616, "loss": 7.5255, "step": 1388500 }, { "epoch": 5.656851232674977, "grad_norm": 3.2147727012634277, "learning_rate": 0.00022718622063918882, "loss": 7.5072, "step": 1388600 }, { "epoch": 5.657258610698358, "grad_norm": 14.751627922058105, "learning_rate": 0.0002269838257440331, "loss": 7.521, "step": 1388700 }, { "epoch": 5.65766598872174, "grad_norm": 3.6608469486236572, "learning_rate": 0.0002267815209110401, "loss": 7.5121, "step": 1388800 }, { "epoch": 5.658073366745121, "grad_norm": 6.23715353012085, "learning_rate": 0.0002265793061482088, "loss": 7.5392, "step": 1388900 }, { "epoch": 5.658480744768502, "grad_norm": 12.295629501342773, "learning_rate": 0.0002263771814635332, "loss": 7.5265, "step": 1389000 }, { "epoch": 5.658480744768502, "eval_MaskedAccuracy": 0.5132895588478967, "eval_loss": 1.5833216905593872, "eval_runtime": 162.9295, "eval_samples_per_second": 389.592, "eval_steps_per_second": 1.522, "step": 1389000 }, { "epoch": 5.6588881227918835, "grad_norm": 20.044307708740234, "learning_rate": 0.0002261751468650044, "loss": 7.5064, "step": 1389100 }, { "epoch": 5.659295500815265, "grad_norm": 7.284054756164551, "learning_rate": 0.00022597320236061015, "loss": 7.5474, "step": 1389200 }, { "epoch": 5.6597028788386465, "grad_norm": 9.859037399291992, "learning_rate": 0.00022577134795833398, "loss": 7.5277, "step": 1389300 }, { "epoch": 5.660110256862028, "grad_norm": 11.014314651489258, "learning_rate": 0.00022556958366615716, "loss": 7.5124, "step": 1389400 }, { "epoch": 5.66051763488541, "grad_norm": 9.872495651245117, "learning_rate": 0.00022536790949205572, "loss": 7.5045, "step": 1389500 }, { "epoch": 5.660925012908791, "grad_norm": 13.823678970336914, "learning_rate": 0.00022516632544400283, "loss": 7.5307, "step": 1389600 }, { "epoch": 5.661332390932173, "grad_norm": 4.973620891571045, "learning_rate": 0.00022496483152996877, "loss": 7.5371, "step": 1389700 }, { "epoch": 5.661739768955554, "grad_norm": 23.51268196105957, "learning_rate": 0.00022476342775791958, "loss": 7.5074, "step": 1389800 }, { "epoch": 5.662147146978936, "grad_norm": 8.796059608459473, "learning_rate": 0.00022456211413581742, "loss": 7.5343, "step": 1389900 }, { "epoch": 5.662554525002317, "grad_norm": 4.088870048522949, "learning_rate": 0.00022436089067162134, "loss": 7.5248, "step": 1390000 }, { "epoch": 5.662554525002317, "eval_MaskedAccuracy": 0.5131125215970762, "eval_loss": 1.5887385606765747, "eval_runtime": 171.0249, "eval_samples_per_second": 371.151, "eval_steps_per_second": 1.45, "step": 1390000 }, { "epoch": 5.662961903025699, "grad_norm": 4.518383979797363, "learning_rate": 0.00022415975737328708, "loss": 7.5212, "step": 1390100 }, { "epoch": 5.66336928104908, "grad_norm": 7.672942638397217, "learning_rate": 0.00022395871424876662, "loss": 7.5618, "step": 1390200 }, { "epoch": 5.663776659072461, "grad_norm": 7.145913124084473, "learning_rate": 0.00022375776130600808, "loss": 7.5308, "step": 1390300 }, { "epoch": 5.6641840370958425, "grad_norm": 4.374745845794678, "learning_rate": 0.0002235568985529557, "loss": 7.5439, "step": 1390400 }, { "epoch": 5.664591415119224, "grad_norm": 5.299991607666016, "learning_rate": 0.0002233561259975516, "loss": 7.4934, "step": 1390500 }, { "epoch": 5.664998793142606, "grad_norm": 4.063174247741699, "learning_rate": 0.00022315544364773294, "loss": 7.5518, "step": 1390600 }, { "epoch": 5.665406171165987, "grad_norm": 5.611782073974609, "learning_rate": 0.00022295485151143358, "loss": 7.5503, "step": 1390700 }, { "epoch": 5.665813549189369, "grad_norm": 7.833299160003662, "learning_rate": 0.00022275434959658427, "loss": 7.5251, "step": 1390800 }, { "epoch": 5.66622092721275, "grad_norm": 3.6235079765319824, "learning_rate": 0.00022255393791111166, "loss": 7.5212, "step": 1390900 }, { "epoch": 5.666628305236132, "grad_norm": 17.92323875427246, "learning_rate": 0.0002223536164629392, "loss": 7.5201, "step": 1391000 }, { "epoch": 5.666628305236132, "eval_MaskedAccuracy": 0.513330936714161, "eval_loss": 1.5948268175125122, "eval_runtime": 155.461, "eval_samples_per_second": 408.308, "eval_steps_per_second": 1.595, "step": 1391000 }, { "epoch": 5.667035683259513, "grad_norm": 11.434247016906738, "learning_rate": 0.00022215338525998705, "loss": 7.5327, "step": 1391100 }, { "epoch": 5.667443061282895, "grad_norm": 17.200960159301758, "learning_rate": 0.00022195324431017064, "loss": 7.5173, "step": 1391200 }, { "epoch": 5.667850439306276, "grad_norm": 4.2296600341796875, "learning_rate": 0.0002217531936214034, "loss": 7.5445, "step": 1391300 }, { "epoch": 5.668257817329657, "grad_norm": 19.114154815673828, "learning_rate": 0.00022155323320159428, "loss": 7.5339, "step": 1391400 }, { "epoch": 5.668665195353039, "grad_norm": 3.479971408843994, "learning_rate": 0.00022135336305864826, "loss": 7.4977, "step": 1391500 }, { "epoch": 5.66907257337642, "grad_norm": 18.817155838012695, "learning_rate": 0.00022115358320046774, "loss": 7.4731, "step": 1391600 }, { "epoch": 5.6694799513998015, "grad_norm": 19.09714126586914, "learning_rate": 0.00022095389363495108, "loss": 7.5146, "step": 1391700 }, { "epoch": 5.669887329423183, "grad_norm": 3.2621140480041504, "learning_rate": 0.00022075429436999304, "loss": 7.5305, "step": 1391800 }, { "epoch": 5.670294707446565, "grad_norm": 3.465158462524414, "learning_rate": 0.00022055478541348447, "loss": 7.5481, "step": 1391900 }, { "epoch": 5.670702085469946, "grad_norm": 4.687906265258789, "learning_rate": 0.00022035536677331334, "loss": 7.5082, "step": 1392000 }, { "epoch": 5.670702085469946, "eval_MaskedAccuracy": 0.5138050215018688, "eval_loss": 1.5798211097717285, "eval_runtime": 162.4081, "eval_samples_per_second": 390.843, "eval_steps_per_second": 1.527, "step": 1392000 }, { "epoch": 5.671109463493328, "grad_norm": 20.191240310668945, "learning_rate": 0.00022015603845736378, "loss": 7.5108, "step": 1392100 }, { "epoch": 5.671516841516709, "grad_norm": 4.79811954498291, "learning_rate": 0.00021995680047351605, "loss": 7.5145, "step": 1392200 }, { "epoch": 5.671924219540091, "grad_norm": 4.349195957183838, "learning_rate": 0.0002197576528296474, "loss": 7.5234, "step": 1392300 }, { "epoch": 5.672331597563472, "grad_norm": 10.458327293395996, "learning_rate": 0.00021955859553362997, "loss": 7.5282, "step": 1392400 }, { "epoch": 5.672738975586854, "grad_norm": 14.85787582397461, "learning_rate": 0.0002193596285933358, "loss": 7.5352, "step": 1392500 }, { "epoch": 5.673146353610235, "grad_norm": 16.2830753326416, "learning_rate": 0.00021916075201662972, "loss": 7.5384, "step": 1392600 }, { "epoch": 5.673553731633616, "grad_norm": 3.118499279022217, "learning_rate": 0.00021896196581137482, "loss": 7.5212, "step": 1392700 }, { "epoch": 5.6739611096569975, "grad_norm": 11.2678804397583, "learning_rate": 0.00021876326998542995, "loss": 7.5187, "step": 1392800 }, { "epoch": 5.674368487680379, "grad_norm": 4.308579444885254, "learning_rate": 0.00021856466454665076, "loss": 7.5367, "step": 1392900 }, { "epoch": 5.674775865703761, "grad_norm": 3.5810940265655518, "learning_rate": 0.00021836614950288875, "loss": 7.5346, "step": 1393000 }, { "epoch": 5.674775865703761, "eval_MaskedAccuracy": 0.5134186143691946, "eval_loss": 1.5889108180999756, "eval_runtime": 157.2879, "eval_samples_per_second": 403.566, "eval_steps_per_second": 1.577, "step": 1393000 }, { "epoch": 5.675183243727142, "grad_norm": 24.302597045898438, "learning_rate": 0.00021816772486199303, "loss": 7.5473, "step": 1393100 }, { "epoch": 5.675590621750524, "grad_norm": 30.983192443847656, "learning_rate": 0.00021796939063180816, "loss": 7.5455, "step": 1393200 }, { "epoch": 5.675997999773905, "grad_norm": 16.238937377929688, "learning_rate": 0.00021777114682017506, "loss": 7.5368, "step": 1393300 }, { "epoch": 5.676405377797287, "grad_norm": 33.0887565612793, "learning_rate": 0.00021757299343493143, "loss": 7.5063, "step": 1393400 }, { "epoch": 5.676812755820668, "grad_norm": 16.486955642700195, "learning_rate": 0.00021737493048391116, "loss": 7.5209, "step": 1393500 }, { "epoch": 5.67722013384405, "grad_norm": 10.487832069396973, "learning_rate": 0.00021717695797494484, "loss": 7.5412, "step": 1393600 }, { "epoch": 5.677627511867431, "grad_norm": 16.049772262573242, "learning_rate": 0.0002169790759158599, "loss": 7.523, "step": 1393700 }, { "epoch": 5.678034889890813, "grad_norm": 33.27151870727539, "learning_rate": 0.00021678128431447906, "loss": 7.5293, "step": 1393800 }, { "epoch": 5.678442267914194, "grad_norm": 16.418254852294922, "learning_rate": 0.00021658358317862232, "loss": 7.4919, "step": 1393900 }, { "epoch": 5.678849645937575, "grad_norm": 7.597526550292969, "learning_rate": 0.00021638597251610604, "loss": 7.517, "step": 1394000 }, { "epoch": 5.678849645937575, "eval_MaskedAccuracy": 0.5131666246289983, "eval_loss": 1.5822957754135132, "eval_runtime": 175.9943, "eval_samples_per_second": 360.671, "eval_steps_per_second": 1.409, "step": 1394000 }, { "epoch": 5.6792570239609566, "grad_norm": 15.38145637512207, "learning_rate": 0.0002161884523347422, "loss": 7.4891, "step": 1394100 }, { "epoch": 5.679664401984338, "grad_norm": 18.405000686645508, "learning_rate": 0.00021599102264234054, "loss": 7.5428, "step": 1394200 }, { "epoch": 5.68007178000772, "grad_norm": 14.354438781738281, "learning_rate": 0.0002157936834467065, "loss": 7.5265, "step": 1394300 }, { "epoch": 5.680479158031101, "grad_norm": 2.9753637313842773, "learning_rate": 0.00021559643475564105, "loss": 7.4968, "step": 1394400 }, { "epoch": 5.680886536054483, "grad_norm": 18.088481903076172, "learning_rate": 0.00021539927657694357, "loss": 7.5363, "step": 1394500 }, { "epoch": 5.681293914077864, "grad_norm": 12.306862831115723, "learning_rate": 0.0002152022089184082, "loss": 7.5192, "step": 1394600 }, { "epoch": 5.681701292101246, "grad_norm": 14.621867179870605, "learning_rate": 0.00021500523178782634, "loss": 7.4961, "step": 1394700 }, { "epoch": 5.682108670124627, "grad_norm": 12.882950782775879, "learning_rate": 0.00021480834519298526, "loss": 7.5043, "step": 1394800 }, { "epoch": 5.682516048148009, "grad_norm": 3.314741849899292, "learning_rate": 0.0002146115491416694, "loss": 7.5298, "step": 1394900 }, { "epoch": 5.68292342617139, "grad_norm": 3.0651979446411133, "learning_rate": 0.00021441484364165885, "loss": 7.5249, "step": 1395000 }, { "epoch": 5.68292342617139, "eval_MaskedAccuracy": 0.5132118971430525, "eval_loss": 1.588690161705017, "eval_runtime": 173.4457, "eval_samples_per_second": 365.97, "eval_steps_per_second": 1.43, "step": 1395000 }, { "epoch": 5.683330804194772, "grad_norm": 4.461178302764893, "learning_rate": 0.00021421822870073016, "loss": 7.5057, "step": 1395100 }, { "epoch": 5.683738182218153, "grad_norm": 9.283483505249023, "learning_rate": 0.00021402170432665717, "loss": 7.5257, "step": 1395200 }, { "epoch": 5.684145560241534, "grad_norm": 18.940427780151367, "learning_rate": 0.00021382527052720888, "loss": 7.5487, "step": 1395300 }, { "epoch": 5.684552938264916, "grad_norm": 4.460453510284424, "learning_rate": 0.000213628927310152, "loss": 7.5051, "step": 1395400 }, { "epoch": 5.684960316288297, "grad_norm": 9.872069358825684, "learning_rate": 0.00021343267468324837, "loss": 7.5086, "step": 1395500 }, { "epoch": 5.685367694311679, "grad_norm": 15.643628120422363, "learning_rate": 0.00021323651265425773, "loss": 7.5264, "step": 1395600 }, { "epoch": 5.68577507233506, "grad_norm": 10.625282287597656, "learning_rate": 0.00021304044123093455, "loss": 7.5514, "step": 1395700 }, { "epoch": 5.686182450358442, "grad_norm": 22.49959945678711, "learning_rate": 0.00021284446042103128, "loss": 7.5237, "step": 1395800 }, { "epoch": 5.686589828381823, "grad_norm": 5.622125148773193, "learning_rate": 0.00021264857023229572, "loss": 7.5121, "step": 1395900 }, { "epoch": 5.686997206405205, "grad_norm": 8.442076683044434, "learning_rate": 0.00021245277067247258, "loss": 7.5048, "step": 1396000 }, { "epoch": 5.686997206405205, "eval_MaskedAccuracy": 0.5133545845147907, "eval_loss": 1.5889184474945068, "eval_runtime": 157.9874, "eval_samples_per_second": 401.779, "eval_steps_per_second": 1.57, "step": 1396000 }, { "epoch": 5.687404584428586, "grad_norm": 24.410968780517578, "learning_rate": 0.0002122570617493028, "loss": 7.5229, "step": 1396100 }, { "epoch": 5.687811962451968, "grad_norm": 4.419765472412109, "learning_rate": 0.00021206144347052364, "loss": 7.5385, "step": 1396200 }, { "epoch": 5.688219340475349, "grad_norm": 18.640249252319336, "learning_rate": 0.00021186591584386957, "loss": 7.5353, "step": 1396300 }, { "epoch": 5.68862671849873, "grad_norm": 12.499750137329102, "learning_rate": 0.00021167047887706911, "loss": 7.5206, "step": 1396400 }, { "epoch": 5.6890340965221124, "grad_norm": 14.130266189575195, "learning_rate": 0.00021147513257785102, "loss": 7.5055, "step": 1396500 }, { "epoch": 5.689441474545493, "grad_norm": 12.074944496154785, "learning_rate": 0.0002112798769539376, "loss": 7.5404, "step": 1396600 }, { "epoch": 5.689848852568875, "grad_norm": 22.056108474731445, "learning_rate": 0.0002110847120130487, "loss": 7.5548, "step": 1396700 }, { "epoch": 5.690256230592256, "grad_norm": 13.590144157409668, "learning_rate": 0.00021088963776289945, "loss": 7.4977, "step": 1396800 }, { "epoch": 5.690663608615638, "grad_norm": 4.762839317321777, "learning_rate": 0.00021069465421120264, "loss": 7.5441, "step": 1396900 }, { "epoch": 5.691070986639019, "grad_norm": 15.463266372680664, "learning_rate": 0.00021049976136566724, "loss": 7.541, "step": 1397000 }, { "epoch": 5.691070986639019, "eval_MaskedAccuracy": 0.5128581112931987, "eval_loss": 1.5882012844085693, "eval_runtime": 165.056, "eval_samples_per_second": 384.572, "eval_steps_per_second": 1.503, "step": 1397000 }, { "epoch": 5.691478364662401, "grad_norm": 6.944849014282227, "learning_rate": 0.0002103049592339978, "loss": 7.5231, "step": 1397100 }, { "epoch": 5.691885742685782, "grad_norm": 5.5081586837768555, "learning_rate": 0.0002101102478238965, "loss": 7.5173, "step": 1397200 }, { "epoch": 5.692293120709164, "grad_norm": 12.3670654296875, "learning_rate": 0.00020991562714306082, "loss": 7.5551, "step": 1397300 }, { "epoch": 5.692700498732545, "grad_norm": 10.130189895629883, "learning_rate": 0.0002097210971991854, "loss": 7.5331, "step": 1397400 }, { "epoch": 5.693107876755927, "grad_norm": 20.905229568481445, "learning_rate": 0.00020952665799996094, "loss": 7.5056, "step": 1397500 }, { "epoch": 5.693515254779308, "grad_norm": 9.281001091003418, "learning_rate": 0.00020933230955307489, "loss": 7.54, "step": 1397600 }, { "epoch": 5.693922632802689, "grad_norm": 3.8841283321380615, "learning_rate": 0.00020913805186621034, "loss": 7.5051, "step": 1397700 }, { "epoch": 5.694330010826071, "grad_norm": 11.44265365600586, "learning_rate": 0.00020894388494704813, "loss": 7.5378, "step": 1397800 }, { "epoch": 5.694737388849452, "grad_norm": 30.42966079711914, "learning_rate": 0.00020874980880326453, "loss": 7.5033, "step": 1397900 }, { "epoch": 5.695144766872834, "grad_norm": 3.4537734985351562, "learning_rate": 0.0002085558234425317, "loss": 7.5083, "step": 1398000 }, { "epoch": 5.695144766872834, "eval_MaskedAccuracy": 0.5137838148019644, "eval_loss": 1.5854030847549438, "eval_runtime": 193.3485, "eval_samples_per_second": 328.298, "eval_steps_per_second": 1.283, "step": 1398000 }, { "epoch": 5.695552144896215, "grad_norm": 8.677281379699707, "learning_rate": 0.00020836192887251987, "loss": 7.5314, "step": 1398100 }, { "epoch": 5.695959522919597, "grad_norm": 7.363255500793457, "learning_rate": 0.00020816812510089387, "loss": 7.4975, "step": 1398200 }, { "epoch": 5.696366900942978, "grad_norm": 20.45794677734375, "learning_rate": 0.000207974412135316, "loss": 7.5032, "step": 1398300 }, { "epoch": 5.69677427896636, "grad_norm": 22.967304229736328, "learning_rate": 0.00020778078998344554, "loss": 7.5389, "step": 1398400 }, { "epoch": 5.697181656989741, "grad_norm": 15.457267761230469, "learning_rate": 0.0002075872586529364, "loss": 7.5257, "step": 1398500 }, { "epoch": 5.697589035013123, "grad_norm": 12.36081600189209, "learning_rate": 0.00020739381815144056, "loss": 7.5254, "step": 1398600 }, { "epoch": 5.697996413036504, "grad_norm": 24.141223907470703, "learning_rate": 0.0002072004684866058, "loss": 7.5318, "step": 1398700 }, { "epoch": 5.698403791059886, "grad_norm": 16.874622344970703, "learning_rate": 0.0002070072096660758, "loss": 7.5378, "step": 1398800 }, { "epoch": 5.6988111690832675, "grad_norm": 22.59729766845703, "learning_rate": 0.00020681404169749123, "loss": 7.521, "step": 1398900 }, { "epoch": 5.699218547106648, "grad_norm": 14.58524227142334, "learning_rate": 0.00020662096458848965, "loss": 7.5406, "step": 1399000 }, { "epoch": 5.699218547106648, "eval_MaskedAccuracy": 0.5133135149270909, "eval_loss": 1.5805004835128784, "eval_runtime": 156.5144, "eval_samples_per_second": 405.56, "eval_steps_per_second": 1.585, "step": 1399000 }, { "epoch": 5.69962592513003, "grad_norm": 17.078554153442383, "learning_rate": 0.00020642797834670342, "loss": 7.52, "step": 1399100 }, { "epoch": 5.700033303153411, "grad_norm": 18.19713020324707, "learning_rate": 0.00020623508297976315, "loss": 7.5297, "step": 1399200 }, { "epoch": 5.700440681176793, "grad_norm": 13.093094825744629, "learning_rate": 0.0002060422784952944, "loss": 7.5375, "step": 1399300 }, { "epoch": 5.700848059200174, "grad_norm": 21.07042121887207, "learning_rate": 0.00020584956490091988, "loss": 7.5094, "step": 1399400 }, { "epoch": 5.701255437223556, "grad_norm": 19.204206466674805, "learning_rate": 0.00020565694220425895, "loss": 7.5414, "step": 1399500 }, { "epoch": 5.701662815246937, "grad_norm": 7.8062424659729, "learning_rate": 0.00020546441041292704, "loss": 7.5067, "step": 1399600 }, { "epoch": 5.702070193270319, "grad_norm": 3.5404536724090576, "learning_rate": 0.00020527196953453562, "loss": 7.5441, "step": 1399700 }, { "epoch": 5.7024775712937, "grad_norm": 14.274609565734863, "learning_rate": 0.0002050796195766931, "loss": 7.5387, "step": 1399800 }, { "epoch": 5.702884949317082, "grad_norm": 16.907634735107422, "learning_rate": 0.00020488736054700426, "loss": 7.4932, "step": 1399900 }, { "epoch": 5.703292327340463, "grad_norm": 21.000276565551758, "learning_rate": 0.0002046951924530697, "loss": 7.5337, "step": 1400000 }, { "epoch": 5.703292327340463, "eval_MaskedAccuracy": 0.5128022263675616, "eval_loss": 1.5850491523742676, "eval_runtime": 164.8747, "eval_samples_per_second": 384.995, "eval_steps_per_second": 1.504, "step": 1400000 }, { "epoch": 5.703699705363845, "grad_norm": 2.6224594116210938, "learning_rate": 0.00020450311530248696, "loss": 7.5409, "step": 1400100 }, { "epoch": 5.7041070833872265, "grad_norm": 13.088146209716797, "learning_rate": 0.0002043111291028501, "loss": 7.5092, "step": 1400200 }, { "epoch": 5.704514461410607, "grad_norm": 10.771458625793457, "learning_rate": 0.00020411923386174911, "loss": 7.517, "step": 1400300 }, { "epoch": 5.704921839433989, "grad_norm": 13.163960456848145, "learning_rate": 0.00020392742958677085, "loss": 7.5096, "step": 1400400 }, { "epoch": 5.70532921745737, "grad_norm": 7.284150123596191, "learning_rate": 0.00020373571628549817, "loss": 7.4986, "step": 1400500 }, { "epoch": 5.705736595480752, "grad_norm": 17.328292846679688, "learning_rate": 0.00020354409396551118, "loss": 7.516, "step": 1400600 }, { "epoch": 5.706143973504133, "grad_norm": 5.250231742858887, "learning_rate": 0.0002033525626343847, "loss": 7.4923, "step": 1400700 }, { "epoch": 5.706551351527515, "grad_norm": 2.9675095081329346, "learning_rate": 0.00020316112229969195, "loss": 7.5409, "step": 1400800 }, { "epoch": 5.706958729550896, "grad_norm": 6.556509971618652, "learning_rate": 0.00020296977296900087, "loss": 7.5035, "step": 1400900 }, { "epoch": 5.707366107574278, "grad_norm": 3.9646568298339844, "learning_rate": 0.0002027785146498769, "loss": 7.4983, "step": 1401000 }, { "epoch": 5.707366107574278, "eval_MaskedAccuracy": 0.5134965311932747, "eval_loss": 1.5908129215240479, "eval_runtime": 174.5783, "eval_samples_per_second": 363.596, "eval_steps_per_second": 1.421, "step": 1401000 }, { "epoch": 5.707773485597659, "grad_norm": 24.632753372192383, "learning_rate": 0.00020258734734988145, "loss": 7.5003, "step": 1401100 }, { "epoch": 5.708180863621041, "grad_norm": 33.74378967285156, "learning_rate": 0.0002023962710765723, "loss": 7.5498, "step": 1401200 }, { "epoch": 5.7085882416444225, "grad_norm": 5.798527717590332, "learning_rate": 0.0002022052858375038, "loss": 7.5268, "step": 1401300 }, { "epoch": 5.708995619667803, "grad_norm": 27.04881477355957, "learning_rate": 0.00020201439164022641, "loss": 7.545, "step": 1401400 }, { "epoch": 5.7094029976911855, "grad_norm": 12.537012100219727, "learning_rate": 0.0002018235884922875, "loss": 7.528, "step": 1401500 }, { "epoch": 5.709810375714566, "grad_norm": 4.775035858154297, "learning_rate": 0.00020163287640123008, "loss": 7.5073, "step": 1401600 }, { "epoch": 5.710217753737948, "grad_norm": 20.909391403198242, "learning_rate": 0.00020144225537459468, "loss": 7.5329, "step": 1401700 }, { "epoch": 5.710625131761329, "grad_norm": 5.003806114196777, "learning_rate": 0.00020125172541991693, "loss": 7.5185, "step": 1401800 }, { "epoch": 5.711032509784711, "grad_norm": 28.107206344604492, "learning_rate": 0.00020106128654472983, "loss": 7.5203, "step": 1401900 }, { "epoch": 5.711439887808092, "grad_norm": 3.4706532955169678, "learning_rate": 0.00020087093875656263, "loss": 7.5077, "step": 1402000 }, { "epoch": 5.711439887808092, "eval_MaskedAccuracy": 0.5129085953446926, "eval_loss": 1.5857908725738525, "eval_runtime": 169.7335, "eval_samples_per_second": 373.974, "eval_steps_per_second": 1.461, "step": 1402000 }, { "epoch": 5.711847265831474, "grad_norm": 5.496380805969238, "learning_rate": 0.00020068068206294015, "loss": 7.5444, "step": 1402100 }, { "epoch": 5.712254643854855, "grad_norm": 4.842715263366699, "learning_rate": 0.00020049051647138503, "loss": 7.5026, "step": 1402200 }, { "epoch": 5.712662021878237, "grad_norm": 3.697951316833496, "learning_rate": 0.00020030044198941484, "loss": 7.5159, "step": 1402300 }, { "epoch": 5.713069399901618, "grad_norm": 34.285362243652344, "learning_rate": 0.00020011045862454473, "loss": 7.4916, "step": 1402400 }, { "epoch": 5.713476777925, "grad_norm": 22.446369171142578, "learning_rate": 0.00019992056638428533, "loss": 7.5062, "step": 1402500 }, { "epoch": 5.7138841559483815, "grad_norm": 22.62911605834961, "learning_rate": 0.0001997307652761449, "loss": 7.4998, "step": 1402600 }, { "epoch": 5.714291533971762, "grad_norm": 10.011300086975098, "learning_rate": 0.00019954105530762653, "loss": 7.5392, "step": 1402700 }, { "epoch": 5.714698911995144, "grad_norm": 4.304636001586914, "learning_rate": 0.00019935143648623096, "loss": 7.516, "step": 1402800 }, { "epoch": 5.715106290018525, "grad_norm": 5.8564043045043945, "learning_rate": 0.00019916190881945424, "loss": 7.5233, "step": 1402900 }, { "epoch": 5.715513668041907, "grad_norm": 4.471909046173096, "learning_rate": 0.00019897247231478999, "loss": 7.5239, "step": 1403000 }, { "epoch": 5.715513668041907, "eval_MaskedAccuracy": 0.5131129358615463, "eval_loss": 1.5917563438415527, "eval_runtime": 173.1445, "eval_samples_per_second": 366.607, "eval_steps_per_second": 1.432, "step": 1403000 }, { "epoch": 5.715921046065288, "grad_norm": 31.28317642211914, "learning_rate": 0.00019878312697972728, "loss": 7.4928, "step": 1403100 }, { "epoch": 5.71632842408867, "grad_norm": 3.0140678882598877, "learning_rate": 0.00019859387282175216, "loss": 7.5469, "step": 1403200 }, { "epoch": 5.716735802112051, "grad_norm": 4.018800258636475, "learning_rate": 0.00019840470984834659, "loss": 7.5205, "step": 1403300 }, { "epoch": 5.717143180135433, "grad_norm": 5.32080078125, "learning_rate": 0.0001982156380669897, "loss": 7.5117, "step": 1403400 }, { "epoch": 5.717550558158814, "grad_norm": 16.230554580688477, "learning_rate": 0.000198026657485156, "loss": 7.5254, "step": 1403500 }, { "epoch": 5.717957936182196, "grad_norm": 16.40062141418457, "learning_rate": 0.0001978377681103172, "loss": 7.5317, "step": 1403600 }, { "epoch": 5.7183653142055775, "grad_norm": 13.31722640991211, "learning_rate": 0.00019764896994994092, "loss": 7.4973, "step": 1403700 }, { "epoch": 5.718772692228959, "grad_norm": 4.894228458404541, "learning_rate": 0.00019746026301149165, "loss": 7.5494, "step": 1403800 }, { "epoch": 5.7191800702523405, "grad_norm": 3.2217085361480713, "learning_rate": 0.00019727164730242935, "loss": 7.5501, "step": 1403900 }, { "epoch": 5.719587448275721, "grad_norm": 5.319243431091309, "learning_rate": 0.00019708312283021187, "loss": 7.5266, "step": 1404000 }, { "epoch": 5.719587448275721, "eval_MaskedAccuracy": 0.5128442257461067, "eval_loss": 1.5894906520843506, "eval_runtime": 160.973, "eval_samples_per_second": 394.327, "eval_steps_per_second": 1.541, "step": 1404000 }, { "epoch": 5.719994826299103, "grad_norm": 9.104679107666016, "learning_rate": 0.00019689468960229188, "loss": 7.5095, "step": 1404100 }, { "epoch": 5.720402204322484, "grad_norm": 5.5225138664245605, "learning_rate": 0.0001967063476261195, "loss": 7.5199, "step": 1404200 }, { "epoch": 5.720809582345866, "grad_norm": 9.274515151977539, "learning_rate": 0.0001965180969091406, "loss": 7.5142, "step": 1404300 }, { "epoch": 5.721216960369247, "grad_norm": 22.511634826660156, "learning_rate": 0.00019632993745879792, "loss": 7.5138, "step": 1404400 }, { "epoch": 5.721624338392629, "grad_norm": 15.016294479370117, "learning_rate": 0.0001961418692825308, "loss": 7.5143, "step": 1404500 }, { "epoch": 5.72203171641601, "grad_norm": 3.427358388900757, "learning_rate": 0.00019595389238777434, "loss": 7.5305, "step": 1404600 }, { "epoch": 5.722439094439392, "grad_norm": 21.848268508911133, "learning_rate": 0.00019576600678195975, "loss": 7.5188, "step": 1404700 }, { "epoch": 5.7228464724627734, "grad_norm": 23.56867790222168, "learning_rate": 0.00019557821247251575, "loss": 7.5326, "step": 1404800 }, { "epoch": 5.723253850486155, "grad_norm": 18.3046932220459, "learning_rate": 0.0001953905094668669, "loss": 7.5557, "step": 1404900 }, { "epoch": 5.7236612285095365, "grad_norm": 17.465869903564453, "learning_rate": 0.00019520289777243339, "loss": 7.539, "step": 1405000 }, { "epoch": 5.7236612285095365, "eval_MaskedAccuracy": 0.513291535950472, "eval_loss": 1.5922725200653076, "eval_runtime": 157.6129, "eval_samples_per_second": 402.733, "eval_steps_per_second": 1.573, "step": 1405000 }, { "epoch": 5.724068606532918, "grad_norm": 7.314059734344482, "learning_rate": 0.00019501537739663327, "loss": 7.5351, "step": 1405100 }, { "epoch": 5.7244759845563, "grad_norm": 18.200645446777344, "learning_rate": 0.00019482794834688, "loss": 7.493, "step": 1405200 }, { "epoch": 5.72488336257968, "grad_norm": 22.57877540588379, "learning_rate": 0.00019464061063058336, "loss": 7.5048, "step": 1405300 }, { "epoch": 5.725290740603062, "grad_norm": 3.4958767890930176, "learning_rate": 0.00019445336425515016, "loss": 7.5469, "step": 1405400 }, { "epoch": 5.725698118626443, "grad_norm": 23.43501091003418, "learning_rate": 0.00019426620922798318, "loss": 7.5286, "step": 1405500 }, { "epoch": 5.726105496649825, "grad_norm": 4.3498334884643555, "learning_rate": 0.00019407914555648158, "loss": 7.499, "step": 1405600 }, { "epoch": 5.726512874673206, "grad_norm": 6.705742359161377, "learning_rate": 0.00019389217324804108, "loss": 7.5264, "step": 1405700 }, { "epoch": 5.726920252696588, "grad_norm": 15.036599159240723, "learning_rate": 0.00019370529231005365, "loss": 7.5237, "step": 1405800 }, { "epoch": 5.727327630719969, "grad_norm": 10.892059326171875, "learning_rate": 0.00019351850274990787, "loss": 7.4968, "step": 1405900 }, { "epoch": 5.727735008743351, "grad_norm": 7.121788501739502, "learning_rate": 0.00019333180457498813, "loss": 7.5085, "step": 1406000 }, { "epoch": 5.727735008743351, "eval_MaskedAccuracy": 0.5135558206290997, "eval_loss": 1.5852932929992676, "eval_runtime": 158.0285, "eval_samples_per_second": 401.675, "eval_steps_per_second": 1.569, "step": 1406000 }, { "epoch": 5.7281423867667325, "grad_norm": 10.672481536865234, "learning_rate": 0.00019314519779267604, "loss": 7.5342, "step": 1406100 }, { "epoch": 5.728549764790114, "grad_norm": 22.060361862182617, "learning_rate": 0.00019295868241034892, "loss": 7.5069, "step": 1406200 }, { "epoch": 5.7289571428134956, "grad_norm": 15.186068534851074, "learning_rate": 0.00019277225843537975, "loss": 7.505, "step": 1406300 }, { "epoch": 5.729364520836876, "grad_norm": 8.485790252685547, "learning_rate": 0.00019258592587514053, "loss": 7.5102, "step": 1406400 }, { "epoch": 5.729771898860259, "grad_norm": 12.340378761291504, "learning_rate": 0.00019239968473699762, "loss": 7.5189, "step": 1406500 }, { "epoch": 5.730179276883639, "grad_norm": 16.11414909362793, "learning_rate": 0.0001922135350283136, "loss": 7.5215, "step": 1406600 }, { "epoch": 5.730586654907021, "grad_norm": 24.906097412109375, "learning_rate": 0.0001920274767564485, "loss": 7.5037, "step": 1406700 }, { "epoch": 5.730994032930402, "grad_norm": 5.577622413635254, "learning_rate": 0.00019184150992875788, "loss": 7.5285, "step": 1406800 }, { "epoch": 5.731401410953784, "grad_norm": 15.005581855773926, "learning_rate": 0.00019165563455259379, "loss": 7.5308, "step": 1406900 }, { "epoch": 5.731808788977165, "grad_norm": 11.045219421386719, "learning_rate": 0.00019146985063530535, "loss": 7.5216, "step": 1407000 }, { "epoch": 5.731808788977165, "eval_MaskedAccuracy": 0.5137690953558148, "eval_loss": 1.5858678817749023, "eval_runtime": 154.7819, "eval_samples_per_second": 410.1, "eval_steps_per_second": 1.602, "step": 1407000 }, { "epoch": 5.732216167000547, "grad_norm": 3.7264459133148193, "learning_rate": 0.000191284158184237, "loss": 7.5264, "step": 1407100 }, { "epoch": 5.7326235450239285, "grad_norm": 9.076326370239258, "learning_rate": 0.00019109855720673066, "loss": 7.5453, "step": 1407200 }, { "epoch": 5.73303092304731, "grad_norm": 9.742874145507812, "learning_rate": 0.00019091304771012374, "loss": 7.4892, "step": 1407300 }, { "epoch": 5.7334383010706915, "grad_norm": 4.512001991271973, "learning_rate": 0.000190727629701751, "loss": 7.4888, "step": 1407400 }, { "epoch": 5.733845679094073, "grad_norm": 14.355818748474121, "learning_rate": 0.00019054230318894222, "loss": 7.5074, "step": 1407500 }, { "epoch": 5.734253057117455, "grad_norm": 12.80526351928711, "learning_rate": 0.00019035706817902527, "loss": 7.5143, "step": 1407600 }, { "epoch": 5.734660435140835, "grad_norm": 4.367220401763916, "learning_rate": 0.00019017192467932244, "loss": 7.5077, "step": 1407700 }, { "epoch": 5.735067813164217, "grad_norm": 9.353137016296387, "learning_rate": 0.00018998687269715448, "loss": 7.5162, "step": 1407800 }, { "epoch": 5.735475191187598, "grad_norm": 20.973501205444336, "learning_rate": 0.000189801912239837, "loss": 7.5106, "step": 1407900 }, { "epoch": 5.73588256921098, "grad_norm": 13.765401840209961, "learning_rate": 0.00018961704331468273, "loss": 7.4998, "step": 1408000 }, { "epoch": 5.73588256921098, "eval_MaskedAccuracy": 0.513909088501633, "eval_loss": 1.575904130935669, "eval_runtime": 167.9919, "eval_samples_per_second": 377.851, "eval_steps_per_second": 1.476, "step": 1408000 }, { "epoch": 5.736289947234361, "grad_norm": 8.200135231018066, "learning_rate": 0.000189432265929, "loss": 7.5783, "step": 1408100 }, { "epoch": 5.736697325257743, "grad_norm": 6.398289680480957, "learning_rate": 0.00018924758009009434, "loss": 7.5039, "step": 1408200 }, { "epoch": 5.737104703281124, "grad_norm": 16.132312774658203, "learning_rate": 0.0001890629858052666, "loss": 7.5471, "step": 1408300 }, { "epoch": 5.737512081304506, "grad_norm": 13.6964693069458, "learning_rate": 0.00018887848308181616, "loss": 7.5508, "step": 1408400 }, { "epoch": 5.7379194593278875, "grad_norm": 14.69521713256836, "learning_rate": 0.0001886940719270369, "loss": 7.5162, "step": 1408500 }, { "epoch": 5.738326837351269, "grad_norm": 21.59804344177246, "learning_rate": 0.0001885097523482192, "loss": 7.5262, "step": 1408600 }, { "epoch": 5.738734215374651, "grad_norm": 21.779382705688477, "learning_rate": 0.00018832552435265053, "loss": 7.5196, "step": 1408700 }, { "epoch": 5.739141593398032, "grad_norm": 19.573837280273438, "learning_rate": 0.00018814138794761453, "loss": 7.5391, "step": 1408800 }, { "epoch": 5.739548971421414, "grad_norm": 17.48353385925293, "learning_rate": 0.0001879573431403909, "loss": 7.5005, "step": 1408900 }, { "epoch": 5.739956349444794, "grad_norm": 5.65546178817749, "learning_rate": 0.0001877733899382556, "loss": 7.4985, "step": 1409000 }, { "epoch": 5.739956349444794, "eval_MaskedAccuracy": 0.5129975344928691, "eval_loss": 1.5885475873947144, "eval_runtime": 154.3446, "eval_samples_per_second": 411.261, "eval_steps_per_second": 1.607, "step": 1409000 }, { "epoch": 5.740363727468176, "grad_norm": 10.080263137817383, "learning_rate": 0.0001875895283484819, "loss": 7.5066, "step": 1409100 }, { "epoch": 5.740771105491557, "grad_norm": 15.176826477050781, "learning_rate": 0.00018740575837833862, "loss": 7.5081, "step": 1409200 }, { "epoch": 5.741178483514939, "grad_norm": 14.322775840759277, "learning_rate": 0.00018722208003509099, "loss": 7.5071, "step": 1409300 }, { "epoch": 5.74158586153832, "grad_norm": 19.6398868560791, "learning_rate": 0.00018703849332600078, "loss": 7.5159, "step": 1409400 }, { "epoch": 5.741993239561702, "grad_norm": 8.177549362182617, "learning_rate": 0.00018685499825832612, "loss": 7.4846, "step": 1409500 }, { "epoch": 5.7424006175850835, "grad_norm": 13.5286283493042, "learning_rate": 0.00018667159483932183, "loss": 7.5117, "step": 1409600 }, { "epoch": 5.742807995608465, "grad_norm": 11.124225616455078, "learning_rate": 0.0001864882830762387, "loss": 7.5177, "step": 1409700 }, { "epoch": 5.7432153736318465, "grad_norm": 14.455705642700195, "learning_rate": 0.00018630506297632408, "loss": 7.5385, "step": 1409800 }, { "epoch": 5.743622751655228, "grad_norm": 23.371421813964844, "learning_rate": 0.00018612193454682123, "loss": 7.4981, "step": 1409900 }, { "epoch": 5.74403012967861, "grad_norm": 16.195911407470703, "learning_rate": 0.00018593889779497077, "loss": 7.5055, "step": 1410000 }, { "epoch": 5.74403012967861, "eval_MaskedAccuracy": 0.5131065060582505, "eval_loss": 1.5892151594161987, "eval_runtime": 151.2469, "eval_samples_per_second": 419.685, "eval_steps_per_second": 1.64, "step": 1410000 }, { "epoch": 5.744437507701991, "grad_norm": 15.349532127380371, "learning_rate": 0.00018575595272800902, "loss": 7.5182, "step": 1410100 }, { "epoch": 5.744844885725373, "grad_norm": 12.161064147949219, "learning_rate": 0.0001855730993531674, "loss": 7.54, "step": 1410200 }, { "epoch": 5.745252263748753, "grad_norm": 13.149774551391602, "learning_rate": 0.00018539033767767736, "loss": 7.5185, "step": 1410300 }, { "epoch": 5.745659641772135, "grad_norm": 15.56015396118164, "learning_rate": 0.0001852076677087636, "loss": 7.5248, "step": 1410400 }, { "epoch": 5.746067019795516, "grad_norm": 16.218429565429688, "learning_rate": 0.0001850250894536475, "loss": 7.5329, "step": 1410500 }, { "epoch": 5.746474397818898, "grad_norm": 13.44758129119873, "learning_rate": 0.00018484260291954825, "loss": 7.5114, "step": 1410600 }, { "epoch": 5.746881775842279, "grad_norm": 17.162885665893555, "learning_rate": 0.00018466020811367938, "loss": 7.5122, "step": 1410700 }, { "epoch": 5.747289153865661, "grad_norm": 10.99380874633789, "learning_rate": 0.0001844779050432526, "loss": 7.5164, "step": 1410800 }, { "epoch": 5.7476965318890425, "grad_norm": 4.253710746765137, "learning_rate": 0.00018429569371547562, "loss": 7.5313, "step": 1410900 }, { "epoch": 5.748103909912424, "grad_norm": 30.04854965209961, "learning_rate": 0.00018411357413755162, "loss": 7.5054, "step": 1411000 }, { "epoch": 5.748103909912424, "eval_MaskedAccuracy": 0.5134891987970711, "eval_loss": 1.581412672996521, "eval_runtime": 157.7877, "eval_samples_per_second": 402.287, "eval_steps_per_second": 1.572, "step": 1411000 }, { "epoch": 5.748511287935806, "grad_norm": 6.196340560913086, "learning_rate": 0.00018393154631668115, "loss": 7.5082, "step": 1411100 }, { "epoch": 5.748918665959187, "grad_norm": 13.771200180053711, "learning_rate": 0.00018374961026006048, "loss": 7.5089, "step": 1411200 }, { "epoch": 5.749326043982569, "grad_norm": 19.914575576782227, "learning_rate": 0.00018356776597488263, "loss": 7.4891, "step": 1411300 }, { "epoch": 5.749733422005949, "grad_norm": 23.734725952148438, "learning_rate": 0.00018338601346833694, "loss": 7.5522, "step": 1411400 }, { "epoch": 5.750140800029332, "grad_norm": 4.969549179077148, "learning_rate": 0.000183204352747609, "loss": 7.5143, "step": 1411500 }, { "epoch": 5.750548178052712, "grad_norm": 8.582549095153809, "learning_rate": 0.00018302278381988101, "loss": 7.5302, "step": 1411600 }, { "epoch": 5.750955556076094, "grad_norm": 2.790029287338257, "learning_rate": 0.000182841306692331, "loss": 7.5042, "step": 1411700 }, { "epoch": 5.751362934099475, "grad_norm": 8.083197593688965, "learning_rate": 0.0001826599213721339, "loss": 7.4969, "step": 1411800 }, { "epoch": 5.751770312122857, "grad_norm": 4.746738910675049, "learning_rate": 0.00018247862786646105, "loss": 7.4961, "step": 1411900 }, { "epoch": 5.7521776901462385, "grad_norm": 17.360036849975586, "learning_rate": 0.0001822974261824794, "loss": 7.5505, "step": 1412000 }, { "epoch": 5.7521776901462385, "eval_MaskedAccuracy": 0.5132593198820746, "eval_loss": 1.5834758281707764, "eval_runtime": 173.8437, "eval_samples_per_second": 365.133, "eval_steps_per_second": 1.427, "step": 1412000 }, { "epoch": 5.75258506816962, "grad_norm": 6.999335765838623, "learning_rate": 0.00018211631632735368, "loss": 7.4944, "step": 1412100 }, { "epoch": 5.7529924461930015, "grad_norm": 6.317979335784912, "learning_rate": 0.0001819352983082432, "loss": 7.5337, "step": 1412200 }, { "epoch": 5.753399824216383, "grad_norm": 9.501605033874512, "learning_rate": 0.00018175437213230518, "loss": 7.512, "step": 1412300 }, { "epoch": 5.753807202239765, "grad_norm": 5.46494722366333, "learning_rate": 0.00018157353780669228, "loss": 7.5546, "step": 1412400 }, { "epoch": 5.754214580263146, "grad_norm": 3.670668125152588, "learning_rate": 0.00018139279533855413, "loss": 7.5186, "step": 1412500 }, { "epoch": 5.754621958286528, "grad_norm": 11.392471313476562, "learning_rate": 0.0001812121447350363, "loss": 7.5155, "step": 1412600 }, { "epoch": 5.755029336309908, "grad_norm": 15.43734359741211, "learning_rate": 0.00018103158600328107, "loss": 7.5315, "step": 1412700 }, { "epoch": 5.75543671433329, "grad_norm": 16.57461166381836, "learning_rate": 0.00018085111915042642, "loss": 7.5286, "step": 1412800 }, { "epoch": 5.755844092356671, "grad_norm": 3.595592498779297, "learning_rate": 0.0001806707441836077, "loss": 7.5372, "step": 1412900 }, { "epoch": 5.756251470380053, "grad_norm": 21.875734329223633, "learning_rate": 0.00018049046110995607, "loss": 7.5122, "step": 1413000 }, { "epoch": 5.756251470380053, "eval_MaskedAccuracy": 0.5138528158223182, "eval_loss": 1.5747325420379639, "eval_runtime": 176.1096, "eval_samples_per_second": 360.435, "eval_steps_per_second": 1.408, "step": 1413000 }, { "epoch": 5.7566588484034344, "grad_norm": 4.31231164932251, "learning_rate": 0.00018031026993659837, "loss": 7.5071, "step": 1413100 }, { "epoch": 5.757066226426816, "grad_norm": 3.410404682159424, "learning_rate": 0.00018013017067065935, "loss": 7.4807, "step": 1413200 }, { "epoch": 5.7574736044501975, "grad_norm": 3.8272275924682617, "learning_rate": 0.00017995016331925865, "loss": 7.5132, "step": 1413300 }, { "epoch": 5.757880982473579, "grad_norm": 5.379833698272705, "learning_rate": 0.0001797702478895133, "loss": 7.5148, "step": 1413400 }, { "epoch": 5.758288360496961, "grad_norm": 3.9139113426208496, "learning_rate": 0.0001795904243885363, "loss": 7.5202, "step": 1413500 }, { "epoch": 5.758695738520342, "grad_norm": 5.3844828605651855, "learning_rate": 0.00017941069282343726, "loss": 7.5246, "step": 1413600 }, { "epoch": 5.759103116543724, "grad_norm": 19.920928955078125, "learning_rate": 0.00017923105320132105, "loss": 7.5016, "step": 1413700 }, { "epoch": 5.759510494567105, "grad_norm": 8.671945571899414, "learning_rate": 0.00017905150552929088, "loss": 7.4962, "step": 1413800 }, { "epoch": 5.759917872590487, "grad_norm": 6.017998695373535, "learning_rate": 0.0001788720498144446, "loss": 7.4906, "step": 1413900 }, { "epoch": 5.760325250613867, "grad_norm": 8.696832656860352, "learning_rate": 0.00017869268606387725, "loss": 7.5044, "step": 1414000 }, { "epoch": 5.760325250613867, "eval_MaskedAccuracy": 0.5131223645917778, "eval_loss": 1.5926177501678467, "eval_runtime": 162.3583, "eval_samples_per_second": 390.962, "eval_steps_per_second": 1.527, "step": 1414000 }, { "epoch": 5.760732628637249, "grad_norm": 19.09294891357422, "learning_rate": 0.0001785134142846798, "loss": 7.5388, "step": 1414100 }, { "epoch": 5.76114000666063, "grad_norm": 8.947702407836914, "learning_rate": 0.0001783342344839394, "loss": 7.5146, "step": 1414200 }, { "epoch": 5.761547384684012, "grad_norm": 8.759279251098633, "learning_rate": 0.00017815514666874116, "loss": 7.5019, "step": 1414300 }, { "epoch": 5.7619547627073935, "grad_norm": 36.35895919799805, "learning_rate": 0.0001779761508461649, "loss": 7.5131, "step": 1414400 }, { "epoch": 5.762362140730775, "grad_norm": 3.747073173522949, "learning_rate": 0.00017779724702328753, "loss": 7.5365, "step": 1414500 }, { "epoch": 5.7627695187541566, "grad_norm": 9.462847709655762, "learning_rate": 0.0001776184352071813, "loss": 7.5144, "step": 1414600 }, { "epoch": 5.763176896777538, "grad_norm": 4.828328609466553, "learning_rate": 0.0001774397154049161, "loss": 7.5307, "step": 1414700 }, { "epoch": 5.76358427480092, "grad_norm": 24.189462661743164, "learning_rate": 0.00017726108762355767, "loss": 7.4861, "step": 1414800 }, { "epoch": 5.763991652824301, "grad_norm": 8.334132194519043, "learning_rate": 0.00017708255187016793, "loss": 7.518, "step": 1414900 }, { "epoch": 5.764399030847683, "grad_norm": 24.44731330871582, "learning_rate": 0.00017690410815180598, "loss": 7.5354, "step": 1415000 }, { "epoch": 5.764399030847683, "eval_MaskedAccuracy": 0.5132570388452132, "eval_loss": 1.5811527967453003, "eval_runtime": 170.6061, "eval_samples_per_second": 372.062, "eval_steps_per_second": 1.454, "step": 1415000 }, { "epoch": 5.764806408871064, "grad_norm": 22.014970779418945, "learning_rate": 0.00017672575647552594, "loss": 7.5485, "step": 1415100 }, { "epoch": 5.765213786894446, "grad_norm": 3.618098020553589, "learning_rate": 0.0001765474968483792, "loss": 7.5262, "step": 1415200 }, { "epoch": 5.765621164917826, "grad_norm": 7.878652095794678, "learning_rate": 0.0001763693292774133, "loss": 7.5139, "step": 1415300 }, { "epoch": 5.766028542941208, "grad_norm": 4.511267185211182, "learning_rate": 0.00017619125376967205, "loss": 7.5011, "step": 1415400 }, { "epoch": 5.7664359209645895, "grad_norm": 13.622882843017578, "learning_rate": 0.00017601327033219623, "loss": 7.5164, "step": 1415500 }, { "epoch": 5.766843298987971, "grad_norm": 6.8950090408325195, "learning_rate": 0.0001758353789720218, "loss": 7.506, "step": 1415600 }, { "epoch": 5.7672506770113525, "grad_norm": 15.81292724609375, "learning_rate": 0.00017565757969618246, "loss": 7.5314, "step": 1415700 }, { "epoch": 5.767658055034734, "grad_norm": 19.056610107421875, "learning_rate": 0.00017547987251170674, "loss": 7.5093, "step": 1415800 }, { "epoch": 5.768065433058116, "grad_norm": 4.119966983795166, "learning_rate": 0.0001753022574256211, "loss": 7.5192, "step": 1415900 }, { "epoch": 5.768472811081497, "grad_norm": 15.643582344055176, "learning_rate": 0.00017512473444494694, "loss": 7.5324, "step": 1416000 }, { "epoch": 5.768472811081497, "eval_MaskedAccuracy": 0.5141732353414097, "eval_loss": 1.5801136493682861, "eval_runtime": 169.0959, "eval_samples_per_second": 375.385, "eval_steps_per_second": 1.467, "step": 1416000 }, { "epoch": 5.768880189104879, "grad_norm": 13.730713844299316, "learning_rate": 0.00017494730357670368, "loss": 7.5006, "step": 1416100 }, { "epoch": 5.76928756712826, "grad_norm": 6.921182155609131, "learning_rate": 0.0001747699648279048, "loss": 7.491, "step": 1416200 }, { "epoch": 5.769694945151642, "grad_norm": 5.342350482940674, "learning_rate": 0.0001745927182055627, "loss": 7.5168, "step": 1416300 }, { "epoch": 5.770102323175022, "grad_norm": 11.932868957519531, "learning_rate": 0.00017441556371668417, "loss": 7.5134, "step": 1416400 }, { "epoch": 5.770509701198405, "grad_norm": 11.356863975524902, "learning_rate": 0.00017423850136827324, "loss": 7.4981, "step": 1416500 }, { "epoch": 5.770917079221785, "grad_norm": 25.6007080078125, "learning_rate": 0.00017406153116733024, "loss": 7.5063, "step": 1416600 }, { "epoch": 5.771324457245167, "grad_norm": 17.394763946533203, "learning_rate": 0.00017388465312085177, "loss": 7.5262, "step": 1416700 }, { "epoch": 5.7717318352685485, "grad_norm": 27.250417709350586, "learning_rate": 0.0001737078672358303, "loss": 7.4939, "step": 1416800 }, { "epoch": 5.77213921329193, "grad_norm": 4.442460536956787, "learning_rate": 0.00017353117351925565, "loss": 7.4952, "step": 1416900 }, { "epoch": 5.772546591315312, "grad_norm": 12.650249481201172, "learning_rate": 0.0001733545719781134, "loss": 7.5142, "step": 1417000 }, { "epoch": 5.772546591315312, "eval_MaskedAccuracy": 0.5134382189619834, "eval_loss": 1.5805528163909912, "eval_runtime": 173.0323, "eval_samples_per_second": 366.845, "eval_steps_per_second": 1.433, "step": 1417000 }, { "epoch": 5.772953969338693, "grad_norm": 20.84615135192871, "learning_rate": 0.00017317806261938522, "loss": 7.54, "step": 1417100 }, { "epoch": 5.773361347362075, "grad_norm": 13.096055030822754, "learning_rate": 0.0001730016454500498, "loss": 7.5417, "step": 1417200 }, { "epoch": 5.773768725385456, "grad_norm": 19.751548767089844, "learning_rate": 0.00017282532047708182, "loss": 7.5181, "step": 1417300 }, { "epoch": 5.774176103408838, "grad_norm": 22.788862228393555, "learning_rate": 0.00017264908770745163, "loss": 7.522, "step": 1417400 }, { "epoch": 5.774583481432219, "grad_norm": 4.602299213409424, "learning_rate": 0.00017247294714812782, "loss": 7.5088, "step": 1417500 }, { "epoch": 5.774990859455601, "grad_norm": 3.314424991607666, "learning_rate": 0.0001722968988060734, "loss": 7.5029, "step": 1417600 }, { "epoch": 5.775398237478981, "grad_norm": 4.984609603881836, "learning_rate": 0.0001721209426882492, "loss": 7.5298, "step": 1417700 }, { "epoch": 5.775805615502363, "grad_norm": 4.923085689544678, "learning_rate": 0.00017194507880161113, "loss": 7.5344, "step": 1417800 }, { "epoch": 5.7762129935257445, "grad_norm": 9.468344688415527, "learning_rate": 0.00017176930715311214, "loss": 7.5023, "step": 1417900 }, { "epoch": 5.776620371549126, "grad_norm": 4.969415664672852, "learning_rate": 0.00017159362774970134, "loss": 7.5011, "step": 1418000 }, { "epoch": 5.776620371549126, "eval_MaskedAccuracy": 0.5141010918163169, "eval_loss": 1.5732693672180176, "eval_runtime": 167.0545, "eval_samples_per_second": 379.972, "eval_steps_per_second": 1.485, "step": 1418000 }, { "epoch": 5.7770277495725075, "grad_norm": 19.92969512939453, "learning_rate": 0.00017141804059832467, "loss": 7.5398, "step": 1418100 }, { "epoch": 5.777435127595889, "grad_norm": 14.730072975158691, "learning_rate": 0.00017124254570592313, "loss": 7.538, "step": 1418200 }, { "epoch": 5.777842505619271, "grad_norm": 29.363462448120117, "learning_rate": 0.00017106714307943587, "loss": 7.4936, "step": 1418300 }, { "epoch": 5.778249883642652, "grad_norm": 13.004620552062988, "learning_rate": 0.00017089183272579728, "loss": 7.5343, "step": 1418400 }, { "epoch": 5.778657261666034, "grad_norm": 17.11601448059082, "learning_rate": 0.00017071661465193835, "loss": 7.5099, "step": 1418500 }, { "epoch": 5.779064639689415, "grad_norm": 15.091670036315918, "learning_rate": 0.00017054148886478597, "loss": 7.525, "step": 1418600 }, { "epoch": 5.779472017712797, "grad_norm": 5.1989521980285645, "learning_rate": 0.0001703664553712643, "loss": 7.5075, "step": 1418700 }, { "epoch": 5.779879395736178, "grad_norm": 3.2822389602661133, "learning_rate": 0.00017019151417829297, "loss": 7.5084, "step": 1418800 }, { "epoch": 5.78028677375956, "grad_norm": 15.049131393432617, "learning_rate": 0.00017001666529278836, "loss": 7.4896, "step": 1418900 }, { "epoch": 5.78069415178294, "grad_norm": 3.6609060764312744, "learning_rate": 0.0001698419087216636, "loss": 7.5162, "step": 1419000 }, { "epoch": 5.78069415178294, "eval_MaskedAccuracy": 0.5130734803741487, "eval_loss": 1.594419240951538, "eval_runtime": 175.0282, "eval_samples_per_second": 362.661, "eval_steps_per_second": 1.417, "step": 1419000 }, { "epoch": 5.781101529806322, "grad_norm": 15.390701293945312, "learning_rate": 0.0001696672444718274, "loss": 7.5092, "step": 1419100 }, { "epoch": 5.7815089078297035, "grad_norm": 8.904915809631348, "learning_rate": 0.0001694926725501855, "loss": 7.553, "step": 1419200 }, { "epoch": 5.781916285853085, "grad_norm": 11.09874439239502, "learning_rate": 0.000169318192963639, "loss": 7.5293, "step": 1419300 }, { "epoch": 5.782323663876467, "grad_norm": 16.156326293945312, "learning_rate": 0.00016914380571908682, "loss": 7.4898, "step": 1419400 }, { "epoch": 5.782731041899848, "grad_norm": 34.805091857910156, "learning_rate": 0.00016896951082342305, "loss": 7.5133, "step": 1419500 }, { "epoch": 5.78313841992323, "grad_norm": 8.421262741088867, "learning_rate": 0.0001687953082835382, "loss": 7.5162, "step": 1419600 }, { "epoch": 5.783545797946611, "grad_norm": 3.0364320278167725, "learning_rate": 0.00016862119810631988, "loss": 7.4995, "step": 1419700 }, { "epoch": 5.783953175969993, "grad_norm": 12.082828521728516, "learning_rate": 0.00016844718029865153, "loss": 7.5227, "step": 1419800 }, { "epoch": 5.784360553993374, "grad_norm": 4.081977844238281, "learning_rate": 0.0001682732548674128, "loss": 7.5162, "step": 1419900 }, { "epoch": 5.784767932016756, "grad_norm": 9.002562522888184, "learning_rate": 0.00016809942181947988, "loss": 7.5442, "step": 1420000 }, { "epoch": 5.784767932016756, "eval_MaskedAccuracy": 0.513356205793277, "eval_loss": 1.5863076448440552, "eval_runtime": 181.1339, "eval_samples_per_second": 350.437, "eval_steps_per_second": 1.369, "step": 1420000 }, { "epoch": 5.785175310040137, "grad_norm": 16.186351776123047, "learning_rate": 0.0001679256811617254, "loss": 7.5288, "step": 1420100 }, { "epoch": 5.785582688063519, "grad_norm": 8.277743339538574, "learning_rate": 0.0001677520329010184, "loss": 7.5296, "step": 1420200 }, { "epoch": 5.7859900660868995, "grad_norm": 16.341815948486328, "learning_rate": 0.00016757847704422393, "loss": 7.5262, "step": 1420300 }, { "epoch": 5.786397444110281, "grad_norm": 16.191190719604492, "learning_rate": 0.00016740501359820348, "loss": 7.5162, "step": 1420400 }, { "epoch": 5.7868048221336625, "grad_norm": 19.828542709350586, "learning_rate": 0.00016723164256981544, "loss": 7.5429, "step": 1420500 }, { "epoch": 5.787212200157044, "grad_norm": 3.413541316986084, "learning_rate": 0.0001670583639659137, "loss": 7.5456, "step": 1420600 }, { "epoch": 5.787619578180426, "grad_norm": 15.310487747192383, "learning_rate": 0.00016688517779334924, "loss": 7.5234, "step": 1420700 }, { "epoch": 5.788026956203807, "grad_norm": 13.728946685791016, "learning_rate": 0.00016671208405896876, "loss": 7.5219, "step": 1420800 }, { "epoch": 5.788434334227189, "grad_norm": 15.777775764465332, "learning_rate": 0.00016653908276961587, "loss": 7.5322, "step": 1420900 }, { "epoch": 5.78884171225057, "grad_norm": 17.24802017211914, "learning_rate": 0.00016636617393213, "loss": 7.5075, "step": 1421000 }, { "epoch": 5.78884171225057, "eval_MaskedAccuracy": 0.5130345098445821, "eval_loss": 1.5933095216751099, "eval_runtime": 190.5582, "eval_samples_per_second": 333.106, "eval_steps_per_second": 1.301, "step": 1421000 }, { "epoch": 5.789249090273952, "grad_norm": 4.251654624938965, "learning_rate": 0.00016619335755334685, "loss": 7.5079, "step": 1421100 }, { "epoch": 5.789656468297333, "grad_norm": 31.306337356567383, "learning_rate": 0.00016602063364009952, "loss": 7.4902, "step": 1421200 }, { "epoch": 5.790063846320715, "grad_norm": 22.56920051574707, "learning_rate": 0.000165848002199216, "loss": 7.4929, "step": 1421300 }, { "epoch": 5.7904712243440954, "grad_norm": 10.100997924804688, "learning_rate": 0.00016567546323752134, "loss": 7.5175, "step": 1421400 }, { "epoch": 5.790878602367478, "grad_norm": 3.6591389179229736, "learning_rate": 0.0001655030167618374, "loss": 7.5263, "step": 1421500 }, { "epoch": 5.7912859803908585, "grad_norm": 8.138897895812988, "learning_rate": 0.00016533066277898173, "loss": 7.509, "step": 1421600 }, { "epoch": 5.79169335841424, "grad_norm": 23.298463821411133, "learning_rate": 0.00016515840129576847, "loss": 7.5223, "step": 1421700 }, { "epoch": 5.792100736437622, "grad_norm": 17.69959831237793, "learning_rate": 0.00016498623231900794, "loss": 7.5256, "step": 1421800 }, { "epoch": 5.792508114461003, "grad_norm": 14.322881698608398, "learning_rate": 0.00016481415585550686, "loss": 7.5199, "step": 1421900 }, { "epoch": 5.792915492484385, "grad_norm": 19.614898681640625, "learning_rate": 0.00016464217191206835, "loss": 7.5402, "step": 1422000 }, { "epoch": 5.792915492484385, "eval_MaskedAccuracy": 0.5130547197273498, "eval_loss": 1.587104320526123, "eval_runtime": 165.2217, "eval_samples_per_second": 384.187, "eval_steps_per_second": 1.501, "step": 1422000 }, { "epoch": 5.793322870507766, "grad_norm": 22.410751342773438, "learning_rate": 0.0001644702804954915, "loss": 7.5272, "step": 1422100 }, { "epoch": 5.793730248531148, "grad_norm": 14.932026863098145, "learning_rate": 0.00016429848161257302, "loss": 7.5039, "step": 1422200 }, { "epoch": 5.794137626554529, "grad_norm": 20.829832077026367, "learning_rate": 0.00016412677527010444, "loss": 7.4982, "step": 1422300 }, { "epoch": 5.794545004577911, "grad_norm": 24.133546829223633, "learning_rate": 0.00016395516147487416, "loss": 7.4878, "step": 1422400 }, { "epoch": 5.794952382601292, "grad_norm": 12.178784370422363, "learning_rate": 0.0001637836402336674, "loss": 7.525, "step": 1422500 }, { "epoch": 5.795359760624674, "grad_norm": 15.152470588684082, "learning_rate": 0.00016361221155326516, "loss": 7.5057, "step": 1422600 }, { "epoch": 5.7957671386480545, "grad_norm": 9.138758659362793, "learning_rate": 0.0001634408754404445, "loss": 7.5014, "step": 1422700 }, { "epoch": 5.796174516671436, "grad_norm": 18.050846099853516, "learning_rate": 0.0001632696319019799, "loss": 7.5523, "step": 1422800 }, { "epoch": 5.7965818946948175, "grad_norm": 19.427255630493164, "learning_rate": 0.00016309848094464097, "loss": 7.5159, "step": 1422900 }, { "epoch": 5.796989272718199, "grad_norm": 10.630948066711426, "learning_rate": 0.0001629274225751945, "loss": 7.5131, "step": 1423000 }, { "epoch": 5.796989272718199, "eval_MaskedAccuracy": 0.5133400752451206, "eval_loss": 1.588206171989441, "eval_runtime": 157.6728, "eval_samples_per_second": 402.58, "eval_steps_per_second": 1.573, "step": 1423000 }, { "epoch": 5.797396650741581, "grad_norm": 4.26202917098999, "learning_rate": 0.00016275645680040323, "loss": 7.5006, "step": 1423100 }, { "epoch": 5.797804028764962, "grad_norm": 8.990422248840332, "learning_rate": 0.00016258558362702648, "loss": 7.51, "step": 1423200 }, { "epoch": 5.798211406788344, "grad_norm": 13.804823875427246, "learning_rate": 0.0001624148030618199, "loss": 7.5149, "step": 1423300 }, { "epoch": 5.798618784811725, "grad_norm": 12.746509552001953, "learning_rate": 0.00016224411511153497, "loss": 7.5131, "step": 1423400 }, { "epoch": 5.799026162835107, "grad_norm": 10.440051078796387, "learning_rate": 0.00016207351978292017, "loss": 7.5118, "step": 1423500 }, { "epoch": 5.799433540858488, "grad_norm": 24.82433319091797, "learning_rate": 0.00016190301708272, "loss": 7.49, "step": 1423600 }, { "epoch": 5.79984091888187, "grad_norm": 5.237053871154785, "learning_rate": 0.00016173260701767515, "loss": 7.4757, "step": 1423700 }, { "epoch": 5.800248296905251, "grad_norm": 14.583934783935547, "learning_rate": 0.00016156228959452316, "loss": 7.5304, "step": 1423800 }, { "epoch": 5.800655674928633, "grad_norm": 48.17268371582031, "learning_rate": 0.00016139206481999757, "loss": 7.5201, "step": 1423900 }, { "epoch": 5.8010630529520135, "grad_norm": 3.127892255783081, "learning_rate": 0.00016122193270082808, "loss": 7.5152, "step": 1424000 }, { "epoch": 5.8010630529520135, "eval_MaskedAccuracy": 0.5130820432215011, "eval_loss": 1.5851596593856812, "eval_runtime": 173.9997, "eval_samples_per_second": 364.805, "eval_steps_per_second": 1.425, "step": 1424000 }, { "epoch": 5.801470430975395, "grad_norm": 14.792893409729004, "learning_rate": 0.00016105189324374018, "loss": 7.5313, "step": 1424100 }, { "epoch": 5.801877808998777, "grad_norm": 15.091297149658203, "learning_rate": 0.00016088194645545793, "loss": 7.5159, "step": 1424200 }, { "epoch": 5.802285187022158, "grad_norm": 26.00152587890625, "learning_rate": 0.00016071209234269965, "loss": 7.5087, "step": 1424300 }, { "epoch": 5.80269256504554, "grad_norm": 14.155200004577637, "learning_rate": 0.00016054233091218048, "loss": 7.5039, "step": 1424400 }, { "epoch": 5.803099943068921, "grad_norm": 39.95311737060547, "learning_rate": 0.00016037266217061209, "loss": 7.5115, "step": 1424500 }, { "epoch": 5.803507321092303, "grad_norm": 3.3049662113189697, "learning_rate": 0.00016020308612470257, "loss": 7.4999, "step": 1424600 }, { "epoch": 5.803914699115684, "grad_norm": 5.990292072296143, "learning_rate": 0.00016003360278115547, "loss": 7.5388, "step": 1424700 }, { "epoch": 5.804322077139066, "grad_norm": 7.175048351287842, "learning_rate": 0.000159864212146672, "loss": 7.5164, "step": 1424800 }, { "epoch": 5.804729455162447, "grad_norm": 31.079818725585938, "learning_rate": 0.00015969491422794874, "loss": 7.5047, "step": 1424900 }, { "epoch": 5.805136833185829, "grad_norm": 12.982433319091797, "learning_rate": 0.00015952570903167928, "loss": 7.5094, "step": 1425000 }, { "epoch": 5.805136833185829, "eval_MaskedAccuracy": 0.5132631796938145, "eval_loss": 1.581839919090271, "eval_runtime": 159.0711, "eval_samples_per_second": 399.042, "eval_steps_per_second": 1.559, "step": 1425000 }, { "epoch": 5.80554421120921, "grad_norm": 4.535707473754883, "learning_rate": 0.00015935659656455287, "loss": 7.5213, "step": 1425100 }, { "epoch": 5.805951589232592, "grad_norm": 11.968395233154297, "learning_rate": 0.0001591875768332556, "loss": 7.524, "step": 1425200 }, { "epoch": 5.806358967255973, "grad_norm": 6.857604503631592, "learning_rate": 0.00015901864984446974, "loss": 7.5128, "step": 1425300 }, { "epoch": 5.806766345279354, "grad_norm": 3.4068024158477783, "learning_rate": 0.00015884981560487378, "loss": 7.5053, "step": 1425400 }, { "epoch": 5.807173723302736, "grad_norm": 12.815781593322754, "learning_rate": 0.00015868107412114293, "loss": 7.495, "step": 1425500 }, { "epoch": 5.807581101326117, "grad_norm": 12.488129615783691, "learning_rate": 0.00015851242539994783, "loss": 7.5018, "step": 1425600 }, { "epoch": 5.807988479349499, "grad_norm": 10.342538833618164, "learning_rate": 0.00015834386944795683, "loss": 7.5324, "step": 1425700 }, { "epoch": 5.80839585737288, "grad_norm": 22.352279663085938, "learning_rate": 0.00015817540627183327, "loss": 7.4829, "step": 1425800 }, { "epoch": 5.808803235396262, "grad_norm": 19.0584774017334, "learning_rate": 0.00015800703587823768, "loss": 7.5372, "step": 1425900 }, { "epoch": 5.809210613419643, "grad_norm": 13.602452278137207, "learning_rate": 0.00015783875827382656, "loss": 7.4858, "step": 1426000 }, { "epoch": 5.809210613419643, "eval_MaskedAccuracy": 0.5133862877492475, "eval_loss": 1.5769734382629395, "eval_runtime": 173.2155, "eval_samples_per_second": 366.457, "eval_steps_per_second": 1.432, "step": 1426000 }, { "epoch": 5.809617991443025, "grad_norm": 9.673832893371582, "learning_rate": 0.00015767057346525206, "loss": 7.5318, "step": 1426100 }, { "epoch": 5.810025369466406, "grad_norm": 6.554992198944092, "learning_rate": 0.0001575024814591651, "loss": 7.533, "step": 1426200 }, { "epoch": 5.810432747489788, "grad_norm": 3.984445095062256, "learning_rate": 0.00015733448226221014, "loss": 7.5076, "step": 1426300 }, { "epoch": 5.8108401255131685, "grad_norm": 9.314393997192383, "learning_rate": 0.00015716657588102918, "loss": 7.5052, "step": 1426400 }, { "epoch": 5.811247503536551, "grad_norm": 11.197092056274414, "learning_rate": 0.00015699876232226084, "loss": 7.5127, "step": 1426500 }, { "epoch": 5.811654881559932, "grad_norm": 7.632115840911865, "learning_rate": 0.00015683104159253924, "loss": 7.5313, "step": 1426600 }, { "epoch": 5.812062259583313, "grad_norm": 3.435426950454712, "learning_rate": 0.00015666341369849574, "loss": 7.4944, "step": 1426700 }, { "epoch": 5.812469637606695, "grad_norm": 4.164614677429199, "learning_rate": 0.00015649587864675693, "loss": 7.5022, "step": 1426800 }, { "epoch": 5.812877015630076, "grad_norm": 2.644254684448242, "learning_rate": 0.0001563284364439469, "loss": 7.5023, "step": 1426900 }, { "epoch": 5.813284393653458, "grad_norm": 26.294357299804688, "learning_rate": 0.00015616108709668518, "loss": 7.4974, "step": 1427000 }, { "epoch": 5.813284393653458, "eval_MaskedAccuracy": 0.5134179198324821, "eval_loss": 1.5896483659744263, "eval_runtime": 186.502, "eval_samples_per_second": 340.35, "eval_steps_per_second": 1.33, "step": 1427000 }, { "epoch": 5.813691771676839, "grad_norm": 3.466761827468872, "learning_rate": 0.00015599383061158854, "loss": 7.5437, "step": 1427100 }, { "epoch": 5.814099149700221, "grad_norm": 23.73604393005371, "learning_rate": 0.0001558266669952689, "loss": 7.5146, "step": 1427200 }, { "epoch": 5.814506527723602, "grad_norm": 13.889824867248535, "learning_rate": 0.00015565959625433542, "loss": 7.5074, "step": 1427300 }, { "epoch": 5.814913905746984, "grad_norm": 4.097889423370361, "learning_rate": 0.00015549261839539318, "loss": 7.5213, "step": 1427400 }, { "epoch": 5.815321283770365, "grad_norm": 8.272371292114258, "learning_rate": 0.00015532573342504365, "loss": 7.5008, "step": 1427500 }, { "epoch": 5.815728661793747, "grad_norm": 11.328155517578125, "learning_rate": 0.00015515894134988513, "loss": 7.5225, "step": 1427600 }, { "epoch": 5.816136039817128, "grad_norm": 10.377781867980957, "learning_rate": 0.000154992242176511, "loss": 7.5254, "step": 1427700 }, { "epoch": 5.816543417840509, "grad_norm": 24.338708877563477, "learning_rate": 0.00015482563591151275, "loss": 7.5151, "step": 1427800 }, { "epoch": 5.816950795863891, "grad_norm": 19.643484115600586, "learning_rate": 0.00015465912256147637, "loss": 7.5253, "step": 1427900 }, { "epoch": 5.817358173887272, "grad_norm": 19.863996505737305, "learning_rate": 0.00015449270213298592, "loss": 7.499, "step": 1428000 }, { "epoch": 5.817358173887272, "eval_MaskedAccuracy": 0.514176879805612, "eval_loss": 1.5824474096298218, "eval_runtime": 153.1606, "eval_samples_per_second": 414.441, "eval_steps_per_second": 1.619, "step": 1428000 }, { "epoch": 5.817765551910654, "grad_norm": 29.033245086669922, "learning_rate": 0.00015432637463261924, "loss": 7.49, "step": 1428100 }, { "epoch": 5.818172929934035, "grad_norm": 7.3021240234375, "learning_rate": 0.00015416014006695397, "loss": 7.5345, "step": 1428200 }, { "epoch": 5.818580307957417, "grad_norm": 4.279619216918945, "learning_rate": 0.0001539939984425615, "loss": 7.5325, "step": 1428300 }, { "epoch": 5.818987685980798, "grad_norm": 11.395988464355469, "learning_rate": 0.00015382794976601027, "loss": 7.4974, "step": 1428400 }, { "epoch": 5.81939506400418, "grad_norm": 4.971868515014648, "learning_rate": 0.00015366199404386524, "loss": 7.5171, "step": 1428500 }, { "epoch": 5.819802442027561, "grad_norm": 11.167874336242676, "learning_rate": 0.0001534961312826874, "loss": 7.4983, "step": 1428600 }, { "epoch": 5.820209820050943, "grad_norm": 8.332849502563477, "learning_rate": 0.00015333036148903434, "loss": 7.4879, "step": 1428700 }, { "epoch": 5.820617198074324, "grad_norm": 14.909524917602539, "learning_rate": 0.00015316468466945965, "loss": 7.5394, "step": 1428800 }, { "epoch": 5.821024576097706, "grad_norm": 12.37127685546875, "learning_rate": 0.00015299910083051344, "loss": 7.4944, "step": 1428900 }, { "epoch": 5.821431954121087, "grad_norm": 15.7769136428833, "learning_rate": 0.00015283360997874208, "loss": 7.5223, "step": 1429000 }, { "epoch": 5.821431954121087, "eval_MaskedAccuracy": 0.51342468818523, "eval_loss": 1.5892704725265503, "eval_runtime": 191.8393, "eval_samples_per_second": 330.881, "eval_steps_per_second": 1.293, "step": 1429000 }, { "epoch": 5.821839332144468, "grad_norm": 27.168245315551758, "learning_rate": 0.00015266821212068886, "loss": 7.5041, "step": 1429100 }, { "epoch": 5.82224671016785, "grad_norm": 2.8287925720214844, "learning_rate": 0.00015250290726289198, "loss": 7.5165, "step": 1429200 }, { "epoch": 5.822654088191231, "grad_norm": 4.237165451049805, "learning_rate": 0.00015233769541188734, "loss": 7.5103, "step": 1429300 }, { "epoch": 5.823061466214613, "grad_norm": 15.783507347106934, "learning_rate": 0.0001521725765742062, "loss": 7.5219, "step": 1429400 }, { "epoch": 5.823468844237994, "grad_norm": 10.036375999450684, "learning_rate": 0.00015200755075637698, "loss": 7.5448, "step": 1429500 }, { "epoch": 5.823876222261376, "grad_norm": 6.849361896514893, "learning_rate": 0.0001518426179649242, "loss": 7.5031, "step": 1429600 }, { "epoch": 5.824283600284757, "grad_norm": 9.938248634338379, "learning_rate": 0.0001516777782063682, "loss": 7.528, "step": 1429700 }, { "epoch": 5.824690978308139, "grad_norm": 12.386895179748535, "learning_rate": 0.00015151303148722609, "loss": 7.5247, "step": 1429800 }, { "epoch": 5.82509835633152, "grad_norm": 10.73524284362793, "learning_rate": 0.00015134837781401118, "loss": 7.5242, "step": 1429900 }, { "epoch": 5.825505734354902, "grad_norm": 4.935142993927002, "learning_rate": 0.00015118381719323332, "loss": 7.5333, "step": 1430000 }, { "epoch": 5.825505734354902, "eval_MaskedAccuracy": 0.5131773851836039, "eval_loss": 1.5934122800827026, "eval_runtime": 161.6301, "eval_samples_per_second": 392.724, "eval_steps_per_second": 1.534, "step": 1430000 }, { "epoch": 5.8259131123782835, "grad_norm": 8.51754093170166, "learning_rate": 0.00015101934963139766, "loss": 7.5085, "step": 1430100 }, { "epoch": 5.826320490401665, "grad_norm": 18.393491744995117, "learning_rate": 0.00015085497513500743, "loss": 7.5272, "step": 1430200 }, { "epoch": 5.826727868425046, "grad_norm": 31.64918327331543, "learning_rate": 0.00015069069371056098, "loss": 7.5202, "step": 1430300 }, { "epoch": 5.827135246448427, "grad_norm": 3.425719976425171, "learning_rate": 0.00015052650536455327, "loss": 7.5535, "step": 1430400 }, { "epoch": 5.827542624471809, "grad_norm": 18.87350082397461, "learning_rate": 0.0001503624101034753, "loss": 7.5311, "step": 1430500 }, { "epoch": 5.82795000249519, "grad_norm": 3.842937707901001, "learning_rate": 0.0001501984079338148, "loss": 7.5187, "step": 1430600 }, { "epoch": 5.828357380518572, "grad_norm": 8.800285339355469, "learning_rate": 0.00015003449886205513, "loss": 7.491, "step": 1430700 }, { "epoch": 5.828764758541953, "grad_norm": 6.007112979888916, "learning_rate": 0.00014987068289467755, "loss": 7.4973, "step": 1430800 }, { "epoch": 5.829172136565335, "grad_norm": 32.813323974609375, "learning_rate": 0.00014970696003815774, "loss": 7.5156, "step": 1430900 }, { "epoch": 5.829579514588716, "grad_norm": 18.104747772216797, "learning_rate": 0.00014954333029896898, "loss": 7.5164, "step": 1431000 }, { "epoch": 5.829579514588716, "eval_MaskedAccuracy": 0.5130792734574849, "eval_loss": 1.5899848937988281, "eval_runtime": 164.4066, "eval_samples_per_second": 386.092, "eval_steps_per_second": 1.508, "step": 1431000 }, { "epoch": 5.829986892612098, "grad_norm": 9.131357192993164, "learning_rate": 0.00014937979368357993, "loss": 7.5244, "step": 1431100 }, { "epoch": 5.830394270635479, "grad_norm": 3.4502384662628174, "learning_rate": 0.00014921635019845647, "loss": 7.5191, "step": 1431200 }, { "epoch": 5.830801648658861, "grad_norm": 3.947558879852295, "learning_rate": 0.00014905299985006035, "loss": 7.5305, "step": 1431300 }, { "epoch": 5.831209026682242, "grad_norm": 40.45591354370117, "learning_rate": 0.00014888974264484967, "loss": 7.5185, "step": 1431400 }, { "epoch": 5.831616404705624, "grad_norm": 5.565196990966797, "learning_rate": 0.00014872657858927866, "loss": 7.538, "step": 1431500 }, { "epoch": 5.832023782729005, "grad_norm": 25.74686050415039, "learning_rate": 0.00014856350768979834, "loss": 7.4946, "step": 1431600 }, { "epoch": 5.832431160752386, "grad_norm": 6.645580291748047, "learning_rate": 0.00014840052995285583, "loss": 7.5205, "step": 1431700 }, { "epoch": 5.832838538775768, "grad_norm": 12.331221580505371, "learning_rate": 0.00014823764538489446, "loss": 7.516, "step": 1431800 }, { "epoch": 5.833245916799149, "grad_norm": 13.032149314880371, "learning_rate": 0.00014807485399235352, "loss": 7.5061, "step": 1431900 }, { "epoch": 5.833653294822531, "grad_norm": 9.871225357055664, "learning_rate": 0.00014791215578166864, "loss": 7.5037, "step": 1432000 }, { "epoch": 5.833653294822531, "eval_MaskedAccuracy": 0.5136045432551434, "eval_loss": 1.5865452289581299, "eval_runtime": 167.9042, "eval_samples_per_second": 378.049, "eval_steps_per_second": 1.477, "step": 1432000 }, { "epoch": 5.834060672845912, "grad_norm": 13.493284225463867, "learning_rate": 0.0001477495507592737, "loss": 7.4847, "step": 1432100 }, { "epoch": 5.834468050869294, "grad_norm": 4.093759536743164, "learning_rate": 0.00014758703893159636, "loss": 7.5054, "step": 1432200 }, { "epoch": 5.834875428892675, "grad_norm": 14.402068138122559, "learning_rate": 0.00014742462030506188, "loss": 7.551, "step": 1432300 }, { "epoch": 5.835282806916057, "grad_norm": 4.072689533233643, "learning_rate": 0.00014726229488609119, "loss": 7.5064, "step": 1432400 }, { "epoch": 5.8356901849394385, "grad_norm": 26.50486946105957, "learning_rate": 0.00014710006268110211, "loss": 7.4957, "step": 1432500 }, { "epoch": 5.83609756296282, "grad_norm": 10.46023941040039, "learning_rate": 0.00014693792369650866, "loss": 7.5267, "step": 1432600 }, { "epoch": 5.836504940986201, "grad_norm": 18.413921356201172, "learning_rate": 0.00014677587793872037, "loss": 7.5463, "step": 1432700 }, { "epoch": 5.836912319009582, "grad_norm": 9.61780071258545, "learning_rate": 0.00014661392541414457, "loss": 7.5262, "step": 1432800 }, { "epoch": 5.837319697032964, "grad_norm": 19.005826950073242, "learning_rate": 0.00014645206612918388, "loss": 7.5296, "step": 1432900 }, { "epoch": 5.837727075056345, "grad_norm": 7.550987243652344, "learning_rate": 0.0001462903000902375, "loss": 7.5098, "step": 1433000 }, { "epoch": 5.837727075056345, "eval_MaskedAccuracy": 0.5138815917066393, "eval_loss": 1.5830613374710083, "eval_runtime": 172.1335, "eval_samples_per_second": 368.76, "eval_steps_per_second": 1.441, "step": 1433000 }, { "epoch": 5.838134453079727, "grad_norm": 4.856964111328125, "learning_rate": 0.00014612862730370085, "loss": 7.4893, "step": 1433100 }, { "epoch": 5.838541831103108, "grad_norm": 10.035447120666504, "learning_rate": 0.00014596704777596547, "loss": 7.5173, "step": 1433200 }, { "epoch": 5.83894920912649, "grad_norm": 26.232330322265625, "learning_rate": 0.00014580556151341995, "loss": 7.5221, "step": 1433300 }, { "epoch": 5.839356587149871, "grad_norm": 7.708934307098389, "learning_rate": 0.00014564416852244855, "loss": 7.5182, "step": 1433400 }, { "epoch": 5.839763965173253, "grad_norm": 7.310030460357666, "learning_rate": 0.0001454828688094316, "loss": 7.5197, "step": 1433500 }, { "epoch": 5.8401713431966344, "grad_norm": 17.08176612854004, "learning_rate": 0.00014532166238074643, "loss": 7.5286, "step": 1433600 }, { "epoch": 5.840578721220016, "grad_norm": 10.942180633544922, "learning_rate": 0.00014516054924276658, "loss": 7.4983, "step": 1433700 }, { "epoch": 5.8409860992433975, "grad_norm": 3.745558023452759, "learning_rate": 0.0001449995294018614, "loss": 7.504, "step": 1433800 }, { "epoch": 5.841393477266779, "grad_norm": 3.755819797515869, "learning_rate": 0.0001448386028643968, "loss": 7.5042, "step": 1433900 }, { "epoch": 5.84180085529016, "grad_norm": 4.809333801269531, "learning_rate": 0.00014467776963673486, "loss": 7.4873, "step": 1434000 }, { "epoch": 5.84180085529016, "eval_MaskedAccuracy": 0.5132207532249975, "eval_loss": 1.5858699083328247, "eval_runtime": 174.3579, "eval_samples_per_second": 364.056, "eval_steps_per_second": 1.422, "step": 1434000 }, { "epoch": 5.842208233313541, "grad_norm": 8.828631401062012, "learning_rate": 0.0001445170297252353, "loss": 7.5113, "step": 1434100 }, { "epoch": 5.842615611336923, "grad_norm": 7.251721382141113, "learning_rate": 0.0001443563831362521, "loss": 7.526, "step": 1434200 }, { "epoch": 5.843022989360304, "grad_norm": 7.140491962432861, "learning_rate": 0.00014419582987613673, "loss": 7.5218, "step": 1434300 }, { "epoch": 5.843430367383686, "grad_norm": 9.469985961914062, "learning_rate": 0.00014403536995123657, "loss": 7.4989, "step": 1434400 }, { "epoch": 5.843837745407067, "grad_norm": 20.95997428894043, "learning_rate": 0.0001438750033678955, "loss": 7.5093, "step": 1434500 }, { "epoch": 5.844245123430449, "grad_norm": 25.432119369506836, "learning_rate": 0.00014371473013245387, "loss": 7.4939, "step": 1434600 }, { "epoch": 5.84465250145383, "grad_norm": 30.063068389892578, "learning_rate": 0.00014355455025124794, "loss": 7.5289, "step": 1434700 }, { "epoch": 5.845059879477212, "grad_norm": 4.15604829788208, "learning_rate": 0.00014339446373061077, "loss": 7.5243, "step": 1434800 }, { "epoch": 5.8454672575005935, "grad_norm": 5.345333099365234, "learning_rate": 0.00014323447057687104, "loss": 7.5189, "step": 1434900 }, { "epoch": 5.845874635523975, "grad_norm": 7.9726433753967285, "learning_rate": 0.00014307457079635436, "loss": 7.5263, "step": 1435000 }, { "epoch": 5.845874635523975, "eval_MaskedAccuracy": 0.5138395214722764, "eval_loss": 1.5867456197738647, "eval_runtime": 163.9514, "eval_samples_per_second": 387.163, "eval_steps_per_second": 1.513, "step": 1435000 }, { "epoch": 5.8462820135473565, "grad_norm": 3.4750051498413086, "learning_rate": 0.00014291476439538204, "loss": 7.5256, "step": 1435100 }, { "epoch": 5.846689391570738, "grad_norm": 5.654435634613037, "learning_rate": 0.00014275505138027232, "loss": 7.5117, "step": 1435200 }, { "epoch": 5.847096769594119, "grad_norm": 25.007041931152344, "learning_rate": 0.00014259543175734006, "loss": 7.4898, "step": 1435300 }, { "epoch": 5.8475041476175, "grad_norm": 14.512124061584473, "learning_rate": 0.00014243590553289517, "loss": 7.5383, "step": 1435400 }, { "epoch": 5.847911525640882, "grad_norm": 4.9716291427612305, "learning_rate": 0.00014227647271324506, "loss": 7.5172, "step": 1435500 }, { "epoch": 5.848318903664263, "grad_norm": 12.040976524353027, "learning_rate": 0.0001421171333046923, "loss": 7.5086, "step": 1435600 }, { "epoch": 5.848726281687645, "grad_norm": 26.50587272644043, "learning_rate": 0.00014195788731353722, "loss": 7.5304, "step": 1435700 }, { "epoch": 5.849133659711026, "grad_norm": 7.013994216918945, "learning_rate": 0.0001417987347460752, "loss": 7.5006, "step": 1435800 }, { "epoch": 5.849541037734408, "grad_norm": 20.126253128051758, "learning_rate": 0.00014163967560859838, "loss": 7.5247, "step": 1435900 }, { "epoch": 5.8499484157577895, "grad_norm": 9.635932922363281, "learning_rate": 0.0001414807099073949, "loss": 7.5034, "step": 1436000 }, { "epoch": 5.8499484157577895, "eval_MaskedAccuracy": 0.5133615099935407, "eval_loss": 1.5901832580566406, "eval_runtime": 154.7722, "eval_samples_per_second": 410.125, "eval_steps_per_second": 1.602, "step": 1436000 }, { "epoch": 5.850355793781171, "grad_norm": 23.026901245117188, "learning_rate": 0.00014132183764875054, "loss": 7.5276, "step": 1436100 }, { "epoch": 5.8507631718045525, "grad_norm": 9.818020820617676, "learning_rate": 0.00014116305883894568, "loss": 7.5121, "step": 1436200 }, { "epoch": 5.851170549827934, "grad_norm": 3.112827777862549, "learning_rate": 0.00014100437348425773, "loss": 7.5157, "step": 1436300 }, { "epoch": 5.851577927851315, "grad_norm": 3.668189525604248, "learning_rate": 0.0001408457815909602, "loss": 7.5273, "step": 1436400 }, { "epoch": 5.851985305874697, "grad_norm": 5.518843650817871, "learning_rate": 0.00014068728316532367, "loss": 7.524, "step": 1436500 }, { "epoch": 5.852392683898078, "grad_norm": 13.072844505310059, "learning_rate": 0.0001405288782136139, "loss": 7.4977, "step": 1436600 }, { "epoch": 5.852800061921459, "grad_norm": 7.497927188873291, "learning_rate": 0.0001403705667420937, "loss": 7.5197, "step": 1436700 }, { "epoch": 5.853207439944841, "grad_norm": 7.040710926055908, "learning_rate": 0.00014021234875702211, "loss": 7.546, "step": 1436800 }, { "epoch": 5.853614817968222, "grad_norm": 7.165277481079102, "learning_rate": 0.00014005422426465413, "loss": 7.5479, "step": 1436900 }, { "epoch": 5.854022195991604, "grad_norm": 16.675113677978516, "learning_rate": 0.0001398961932712411, "loss": 7.5055, "step": 1437000 }, { "epoch": 5.854022195991604, "eval_MaskedAccuracy": 0.5135469924658986, "eval_loss": 1.5882340669631958, "eval_runtime": 158.4854, "eval_samples_per_second": 400.516, "eval_steps_per_second": 1.565, "step": 1437000 }, { "epoch": 5.854429574014985, "grad_norm": 6.026208400726318, "learning_rate": 0.000139738255783031, "loss": 7.5223, "step": 1437100 }, { "epoch": 5.854836952038367, "grad_norm": 6.308501243591309, "learning_rate": 0.00013958041180626798, "loss": 7.5261, "step": 1437200 }, { "epoch": 5.8552443300617485, "grad_norm": 5.170680522918701, "learning_rate": 0.0001394226613471928, "loss": 7.4833, "step": 1437300 }, { "epoch": 5.85565170808513, "grad_norm": 4.743289947509766, "learning_rate": 0.00013926500441204147, "loss": 7.5125, "step": 1437400 }, { "epoch": 5.856059086108512, "grad_norm": 5.6337571144104, "learning_rate": 0.00013910744100704736, "loss": 7.4908, "step": 1437500 }, { "epoch": 5.856466464131893, "grad_norm": 3.7580676078796387, "learning_rate": 0.00013894997113844025, "loss": 7.5016, "step": 1437600 }, { "epoch": 5.856873842155274, "grad_norm": 4.790675640106201, "learning_rate": 0.00013879259481244487, "loss": 7.5079, "step": 1437700 }, { "epoch": 5.857281220178655, "grad_norm": 21.260013580322266, "learning_rate": 0.00013863531203528356, "loss": 7.5416, "step": 1437800 }, { "epoch": 5.857688598202037, "grad_norm": 19.645368576049805, "learning_rate": 0.00013847812281317457, "loss": 7.5363, "step": 1437900 }, { "epoch": 5.858095976225418, "grad_norm": 25.295434951782227, "learning_rate": 0.00013832102715233204, "loss": 7.494, "step": 1438000 }, { "epoch": 5.858095976225418, "eval_MaskedAccuracy": 0.5135544458623212, "eval_loss": 1.5859079360961914, "eval_runtime": 160.8898, "eval_samples_per_second": 394.531, "eval_steps_per_second": 1.541, "step": 1438000 }, { "epoch": 5.8585033542488, "grad_norm": 10.009047508239746, "learning_rate": 0.00013816402505896781, "loss": 7.5017, "step": 1438100 }, { "epoch": 5.858910732272181, "grad_norm": 5.341984272003174, "learning_rate": 0.00013800711653928768, "loss": 7.5598, "step": 1438200 }, { "epoch": 5.859318110295563, "grad_norm": 14.228255271911621, "learning_rate": 0.0001378503015994967, "loss": 7.5264, "step": 1438300 }, { "epoch": 5.8597254883189445, "grad_norm": 10.650612831115723, "learning_rate": 0.0001376935802457934, "loss": 7.5107, "step": 1438400 }, { "epoch": 5.860132866342326, "grad_norm": 20.671329498291016, "learning_rate": 0.00013753695248437388, "loss": 7.5118, "step": 1438500 }, { "epoch": 5.8605402443657075, "grad_norm": 10.042603492736816, "learning_rate": 0.00013738041832143092, "loss": 7.5162, "step": 1438600 }, { "epoch": 5.860947622389089, "grad_norm": 4.876357555389404, "learning_rate": 0.00013722397776315314, "loss": 7.5134, "step": 1438700 }, { "epoch": 5.861355000412471, "grad_norm": 10.248143196105957, "learning_rate": 0.00013706763081572533, "loss": 7.5192, "step": 1438800 }, { "epoch": 5.861762378435852, "grad_norm": 11.08122444152832, "learning_rate": 0.0001369113774853283, "loss": 7.5037, "step": 1438900 }, { "epoch": 5.862169756459233, "grad_norm": 17.589916229248047, "learning_rate": 0.0001367552177781404, "loss": 7.5149, "step": 1439000 }, { "epoch": 5.862169756459233, "eval_MaskedAccuracy": 0.5135288636038816, "eval_loss": 1.5831726789474487, "eval_runtime": 165.6449, "eval_samples_per_second": 383.205, "eval_steps_per_second": 1.497, "step": 1439000 }, { "epoch": 5.862577134482614, "grad_norm": 8.701517105102539, "learning_rate": 0.00013659915170033456, "loss": 7.5096, "step": 1439100 }, { "epoch": 5.862984512505996, "grad_norm": 4.694896221160889, "learning_rate": 0.0001364431792580816, "loss": 7.5167, "step": 1439200 }, { "epoch": 5.863391890529377, "grad_norm": 32.01338577270508, "learning_rate": 0.0001362873004575476, "loss": 7.5181, "step": 1439300 }, { "epoch": 5.863799268552759, "grad_norm": 19.998645782470703, "learning_rate": 0.00013613151530489546, "loss": 7.474, "step": 1439400 }, { "epoch": 5.86420664657614, "grad_norm": 14.526906967163086, "learning_rate": 0.00013597582380628413, "loss": 7.5189, "step": 1439500 }, { "epoch": 5.864614024599522, "grad_norm": 3.2489209175109863, "learning_rate": 0.00013582022596786894, "loss": 7.5216, "step": 1439600 }, { "epoch": 5.8650214026229035, "grad_norm": 7.698704242706299, "learning_rate": 0.0001356647217958015, "loss": 7.5021, "step": 1439700 }, { "epoch": 5.865428780646285, "grad_norm": 9.126141548156738, "learning_rate": 0.00013550931129622946, "loss": 7.5083, "step": 1439800 }, { "epoch": 5.865836158669667, "grad_norm": 15.408364295959473, "learning_rate": 0.00013535399447529715, "loss": 7.5032, "step": 1439900 }, { "epoch": 5.866243536693048, "grad_norm": 20.983110427856445, "learning_rate": 0.00013519877133914525, "loss": 7.4762, "step": 1440000 }, { "epoch": 5.866243536693048, "eval_MaskedAccuracy": 0.5134474078317008, "eval_loss": 1.5856764316558838, "eval_runtime": 165.5224, "eval_samples_per_second": 383.489, "eval_steps_per_second": 1.498, "step": 1440000 }, { "epoch": 5.86665091471643, "grad_norm": 2.6975395679473877, "learning_rate": 0.00013504364189391042, "loss": 7.5193, "step": 1440100 }, { "epoch": 5.867058292739811, "grad_norm": 9.552655220031738, "learning_rate": 0.00013488860614572603, "loss": 7.4968, "step": 1440200 }, { "epoch": 5.867465670763192, "grad_norm": 4.0049662590026855, "learning_rate": 0.00013473366410072108, "loss": 7.5209, "step": 1440300 }, { "epoch": 5.867873048786573, "grad_norm": 18.50998306274414, "learning_rate": 0.00013457881576502177, "loss": 7.509, "step": 1440400 }, { "epoch": 5.868280426809955, "grad_norm": 14.251355171203613, "learning_rate": 0.00013442406114474987, "loss": 7.4969, "step": 1440500 }, { "epoch": 5.868687804833336, "grad_norm": 7.884382724761963, "learning_rate": 0.0001342694002460231, "loss": 7.5282, "step": 1440600 }, { "epoch": 5.869095182856718, "grad_norm": 21.401798248291016, "learning_rate": 0.00013411483307495698, "loss": 7.4923, "step": 1440700 }, { "epoch": 5.8695025608800995, "grad_norm": 41.12330627441406, "learning_rate": 0.00013396035963766155, "loss": 7.5121, "step": 1440800 }, { "epoch": 5.869909938903481, "grad_norm": 16.419288635253906, "learning_rate": 0.00013380597994024443, "loss": 7.4969, "step": 1440900 }, { "epoch": 5.8703173169268625, "grad_norm": 12.37655258178711, "learning_rate": 0.00013365169398880881, "loss": 7.501, "step": 1441000 }, { "epoch": 5.8703173169268625, "eval_MaskedAccuracy": 0.5133409005214465, "eval_loss": 1.5829401016235352, "eval_runtime": 163.5677, "eval_samples_per_second": 388.072, "eval_steps_per_second": 1.516, "step": 1441000 }, { "epoch": 5.870724694950244, "grad_norm": 6.052454948425293, "learning_rate": 0.00013349750178945474, "loss": 7.5154, "step": 1441100 }, { "epoch": 5.871132072973626, "grad_norm": 5.5943379402160645, "learning_rate": 0.0001333434033482782, "loss": 7.5461, "step": 1441200 }, { "epoch": 5.871539450997007, "grad_norm": 7.4150471687316895, "learning_rate": 0.00013318939867137166, "loss": 7.5171, "step": 1441300 }, { "epoch": 5.871946829020388, "grad_norm": 4.02662467956543, "learning_rate": 0.00013303548776482353, "loss": 7.5081, "step": 1441400 }, { "epoch": 5.87235420704377, "grad_norm": 14.457428932189941, "learning_rate": 0.00013288167063471825, "loss": 7.4945, "step": 1441500 }, { "epoch": 5.872761585067151, "grad_norm": 10.926926612854004, "learning_rate": 0.0001327279472871378, "loss": 7.5316, "step": 1441600 }, { "epoch": 5.873168963090532, "grad_norm": 14.696325302124023, "learning_rate": 0.00013257431772815918, "loss": 7.4902, "step": 1441700 }, { "epoch": 5.873576341113914, "grad_norm": 18.471006393432617, "learning_rate": 0.00013242078196385674, "loss": 7.4907, "step": 1441800 }, { "epoch": 5.8739837191372954, "grad_norm": 6.5945143699646, "learning_rate": 0.00013226734000029944, "loss": 7.5085, "step": 1441900 }, { "epoch": 5.874391097160677, "grad_norm": 8.984210014343262, "learning_rate": 0.00013211399184355515, "loss": 7.5213, "step": 1442000 }, { "epoch": 5.874391097160677, "eval_MaskedAccuracy": 0.513202929537671, "eval_loss": 1.5851502418518066, "eval_runtime": 165.5599, "eval_samples_per_second": 383.402, "eval_steps_per_second": 1.498, "step": 1442000 }, { "epoch": 5.8747984751840585, "grad_norm": 9.500515937805176, "learning_rate": 0.00013196073749968635, "loss": 7.4766, "step": 1442100 }, { "epoch": 5.87520585320744, "grad_norm": 4.075962066650391, "learning_rate": 0.0001318075769747516, "loss": 7.5027, "step": 1442200 }, { "epoch": 5.875613231230822, "grad_norm": 25.375219345092773, "learning_rate": 0.00013165451027480574, "loss": 7.4876, "step": 1442300 }, { "epoch": 5.876020609254203, "grad_norm": 40.18368148803711, "learning_rate": 0.000131501537405901, "loss": 7.5283, "step": 1442400 }, { "epoch": 5.876427987277585, "grad_norm": 7.134500980377197, "learning_rate": 0.00013134865837408468, "loss": 7.5371, "step": 1442500 }, { "epoch": 5.876835365300966, "grad_norm": 5.869187831878662, "learning_rate": 0.0001311958731854011, "loss": 7.5293, "step": 1442600 }, { "epoch": 5.877242743324347, "grad_norm": 32.77632522583008, "learning_rate": 0.00013104318184589084, "loss": 7.5221, "step": 1442700 }, { "epoch": 5.877650121347728, "grad_norm": 14.90440845489502, "learning_rate": 0.00013089058436159008, "loss": 7.5192, "step": 1442800 }, { "epoch": 5.87805749937111, "grad_norm": 31.729684829711914, "learning_rate": 0.0001307380807385323, "loss": 7.5285, "step": 1442900 }, { "epoch": 5.878464877394491, "grad_norm": 10.556347846984863, "learning_rate": 0.0001305856709827465, "loss": 7.4979, "step": 1443000 }, { "epoch": 5.878464877394491, "eval_MaskedAccuracy": 0.5139184009130835, "eval_loss": 1.5758243799209595, "eval_runtime": 171.8788, "eval_samples_per_second": 369.307, "eval_steps_per_second": 1.443, "step": 1443000 }, { "epoch": 5.878872255417873, "grad_norm": 5.144359588623047, "learning_rate": 0.0001304333551002587, "loss": 7.5262, "step": 1443100 }, { "epoch": 5.8792796334412545, "grad_norm": 2.1949470043182373, "learning_rate": 0.00013028113309708996, "loss": 7.5334, "step": 1443200 }, { "epoch": 5.879687011464636, "grad_norm": 33.62776184082031, "learning_rate": 0.00013012900497925934, "loss": 7.5258, "step": 1443300 }, { "epoch": 5.8800943894880175, "grad_norm": 34.01841735839844, "learning_rate": 0.0001299769707527811, "loss": 7.5322, "step": 1443400 }, { "epoch": 5.880501767511399, "grad_norm": 23.741214752197266, "learning_rate": 0.00012982503042366543, "loss": 7.5244, "step": 1443500 }, { "epoch": 5.880909145534781, "grad_norm": 12.783524513244629, "learning_rate": 0.00012967318399791965, "loss": 7.5436, "step": 1443600 }, { "epoch": 5.881316523558162, "grad_norm": 4.492242813110352, "learning_rate": 0.00012952143148154698, "loss": 7.5224, "step": 1443700 }, { "epoch": 5.881723901581544, "grad_norm": 27.41019058227539, "learning_rate": 0.0001293697728805472, "loss": 7.5373, "step": 1443800 }, { "epoch": 5.882131279604925, "grad_norm": 4.752145767211914, "learning_rate": 0.00012921820820091558, "loss": 7.5271, "step": 1443900 }, { "epoch": 5.882538657628306, "grad_norm": 3.3152313232421875, "learning_rate": 0.00012906673744864512, "loss": 7.5065, "step": 1444000 }, { "epoch": 5.882538657628306, "eval_MaskedAccuracy": 0.5136492446660499, "eval_loss": 1.5915486812591553, "eval_runtime": 167.6905, "eval_samples_per_second": 378.531, "eval_steps_per_second": 1.479, "step": 1444000 }, { "epoch": 5.882946035651687, "grad_norm": 10.958313941955566, "learning_rate": 0.00012891536062972382, "loss": 7.5251, "step": 1444100 }, { "epoch": 5.883353413675069, "grad_norm": 3.695143699645996, "learning_rate": 0.00012876407775013672, "loss": 7.4796, "step": 1444200 }, { "epoch": 5.8837607916984505, "grad_norm": 9.321735382080078, "learning_rate": 0.00012861288881586464, "loss": 7.5384, "step": 1444300 }, { "epoch": 5.884168169721832, "grad_norm": 20.033639907836914, "learning_rate": 0.0001284617938328845, "loss": 7.524, "step": 1444400 }, { "epoch": 5.8845755477452135, "grad_norm": 25.62944984436035, "learning_rate": 0.00012831079280717066, "loss": 7.5156, "step": 1444500 }, { "epoch": 5.884982925768595, "grad_norm": 4.379734039306641, "learning_rate": 0.00012815988574469282, "loss": 7.536, "step": 1444600 }, { "epoch": 5.885390303791977, "grad_norm": 19.151958465576172, "learning_rate": 0.00012800907265141665, "loss": 7.502, "step": 1444700 }, { "epoch": 5.885797681815358, "grad_norm": 21.18988609313965, "learning_rate": 0.00012785835353330477, "loss": 7.5286, "step": 1444800 }, { "epoch": 5.88620505983874, "grad_norm": 16.10911750793457, "learning_rate": 0.0001277077283963162, "loss": 7.5095, "step": 1444900 }, { "epoch": 5.886612437862121, "grad_norm": 24.481395721435547, "learning_rate": 0.00012755719724640558, "loss": 7.5323, "step": 1445000 }, { "epoch": 5.886612437862121, "eval_MaskedAccuracy": 0.5135959303774668, "eval_loss": 1.5905863046646118, "eval_runtime": 177.6913, "eval_samples_per_second": 357.226, "eval_steps_per_second": 1.396, "step": 1445000 }, { "epoch": 5.887019815885503, "grad_norm": 23.361230850219727, "learning_rate": 0.00012740676008952476, "loss": 7.4898, "step": 1445100 }, { "epoch": 5.887427193908884, "grad_norm": 3.7567873001098633, "learning_rate": 0.00012725641693162084, "loss": 7.5047, "step": 1445200 }, { "epoch": 5.887834571932265, "grad_norm": 3.618382215499878, "learning_rate": 0.0001271061677786378, "loss": 7.5332, "step": 1445300 }, { "epoch": 5.888241949955646, "grad_norm": 4.32653284072876, "learning_rate": 0.00012695601263651587, "loss": 7.5139, "step": 1445400 }, { "epoch": 5.888649327979028, "grad_norm": 17.843873977661133, "learning_rate": 0.00012680595151119133, "loss": 7.5246, "step": 1445500 }, { "epoch": 5.8890567060024095, "grad_norm": 27.57331085205078, "learning_rate": 0.00012665598440859713, "loss": 7.506, "step": 1445600 }, { "epoch": 5.889464084025791, "grad_norm": 9.948957443237305, "learning_rate": 0.00012650611133466165, "loss": 7.5069, "step": 1445700 }, { "epoch": 5.889871462049173, "grad_norm": 14.954339981079102, "learning_rate": 0.0001263563322953109, "loss": 7.5173, "step": 1445800 }, { "epoch": 5.890278840072554, "grad_norm": 8.692944526672363, "learning_rate": 0.00012620664729646582, "loss": 7.504, "step": 1445900 }, { "epoch": 5.890686218095936, "grad_norm": 26.2714786529541, "learning_rate": 0.00012605705634404497, "loss": 7.5458, "step": 1446000 }, { "epoch": 5.890686218095936, "eval_MaskedAccuracy": 0.5133637309369398, "eval_loss": 1.5888253450393677, "eval_runtime": 169.9806, "eval_samples_per_second": 373.431, "eval_steps_per_second": 1.459, "step": 1446000 }, { "epoch": 5.891093596119317, "grad_norm": 4.7359418869018555, "learning_rate": 0.00012590755944396224, "loss": 7.5019, "step": 1446100 }, { "epoch": 5.891500974142699, "grad_norm": 18.73127555847168, "learning_rate": 0.00012575815660212795, "loss": 7.5389, "step": 1446200 }, { "epoch": 5.89190835216608, "grad_norm": 18.924087524414062, "learning_rate": 0.0001256088478244491, "loss": 7.4981, "step": 1446300 }, { "epoch": 5.892315730189461, "grad_norm": 15.551505088806152, "learning_rate": 0.00012545963311682816, "loss": 7.4889, "step": 1446400 }, { "epoch": 5.892723108212843, "grad_norm": 10.590399742126465, "learning_rate": 0.0001253105124851645, "loss": 7.532, "step": 1446500 }, { "epoch": 5.893130486236224, "grad_norm": 13.321043014526367, "learning_rate": 0.00012516148593535385, "loss": 7.5364, "step": 1446600 }, { "epoch": 5.8935378642596055, "grad_norm": 11.357898712158203, "learning_rate": 0.00012501255347328795, "loss": 7.4776, "step": 1446700 }, { "epoch": 5.893945242282987, "grad_norm": 3.8171656131744385, "learning_rate": 0.00012486371510485507, "loss": 7.5042, "step": 1446800 }, { "epoch": 5.8943526203063685, "grad_norm": 10.532248497009277, "learning_rate": 0.00012471497083593915, "loss": 7.5628, "step": 1446900 }, { "epoch": 5.89475999832975, "grad_norm": 18.82876205444336, "learning_rate": 0.0001245663206724209, "loss": 7.5004, "step": 1447000 }, { "epoch": 5.89475999832975, "eval_MaskedAccuracy": 0.5139297068543455, "eval_loss": 1.5773016214370728, "eval_runtime": 168.4882, "eval_samples_per_second": 376.739, "eval_steps_per_second": 1.472, "step": 1447000 }, { "epoch": 5.895167376353132, "grad_norm": 6.096856594085693, "learning_rate": 0.00012441776462017795, "loss": 7.5222, "step": 1447100 }, { "epoch": 5.895574754376513, "grad_norm": 24.732280731201172, "learning_rate": 0.0001242693026850828, "loss": 7.4954, "step": 1447200 }, { "epoch": 5.895982132399895, "grad_norm": 25.817455291748047, "learning_rate": 0.00012412093487300518, "loss": 7.5074, "step": 1447300 }, { "epoch": 5.896389510423276, "grad_norm": 19.49112892150879, "learning_rate": 0.00012397266118981045, "loss": 7.5271, "step": 1447400 }, { "epoch": 5.896796888446658, "grad_norm": 19.085458755493164, "learning_rate": 0.00012382448164136153, "loss": 7.5066, "step": 1447500 }, { "epoch": 5.897204266470039, "grad_norm": 22.932804107666016, "learning_rate": 0.00012367639623351624, "loss": 7.52, "step": 1447600 }, { "epoch": 5.89761164449342, "grad_norm": 10.887919425964355, "learning_rate": 0.00012352840497212907, "loss": 7.5119, "step": 1447700 }, { "epoch": 5.898019022516801, "grad_norm": 3.4162111282348633, "learning_rate": 0.00012338050786305126, "loss": 7.5051, "step": 1447800 }, { "epoch": 5.898426400540183, "grad_norm": 3.4957773685455322, "learning_rate": 0.00012323270491212935, "loss": 7.5177, "step": 1447900 }, { "epoch": 5.8988337785635645, "grad_norm": 8.30463981628418, "learning_rate": 0.0001230849961252072, "loss": 7.4906, "step": 1448000 }, { "epoch": 5.8988337785635645, "eval_MaskedAccuracy": 0.5137562541685923, "eval_loss": 1.5782514810562134, "eval_runtime": 161.3628, "eval_samples_per_second": 393.374, "eval_steps_per_second": 1.537, "step": 1448000 }, { "epoch": 5.899241156586946, "grad_norm": 3.286139726638794, "learning_rate": 0.0001229373815081249, "loss": 7.5286, "step": 1448100 }, { "epoch": 5.899648534610328, "grad_norm": 18.14317512512207, "learning_rate": 0.00012278986106671787, "loss": 7.5051, "step": 1448200 }, { "epoch": 5.900055912633709, "grad_norm": 7.864316940307617, "learning_rate": 0.00012264243480681892, "loss": 7.5175, "step": 1448300 }, { "epoch": 5.900463290657091, "grad_norm": 17.623258590698242, "learning_rate": 0.00012249510273425597, "loss": 7.5062, "step": 1448400 }, { "epoch": 5.900870668680472, "grad_norm": 14.369786262512207, "learning_rate": 0.000122347864854854, "loss": 7.529, "step": 1448500 }, { "epoch": 5.901278046703854, "grad_norm": 21.708314895629883, "learning_rate": 0.00012220072117443488, "loss": 7.5125, "step": 1448600 }, { "epoch": 5.901685424727235, "grad_norm": 10.244361877441406, "learning_rate": 0.00012205367169881467, "loss": 7.563, "step": 1448700 }, { "epoch": 5.902092802750617, "grad_norm": 16.77159309387207, "learning_rate": 0.00012190671643380743, "loss": 7.4899, "step": 1448800 }, { "epoch": 5.902500180773998, "grad_norm": 19.80599021911621, "learning_rate": 0.00012175985538522329, "loss": 7.5095, "step": 1448900 }, { "epoch": 5.902907558797379, "grad_norm": 7.711851119995117, "learning_rate": 0.00012161308855886834, "loss": 7.5223, "step": 1449000 }, { "epoch": 5.902907558797379, "eval_MaskedAccuracy": 0.5132495013050716, "eval_loss": 1.5839108228683472, "eval_runtime": 171.2586, "eval_samples_per_second": 370.644, "eval_steps_per_second": 1.448, "step": 1449000 }, { "epoch": 5.9033149368207605, "grad_norm": 8.166816711425781, "learning_rate": 0.0001214664159605456, "loss": 7.4815, "step": 1449100 }, { "epoch": 5.903722314844142, "grad_norm": 14.676194190979004, "learning_rate": 0.00012131983759605326, "loss": 7.5067, "step": 1449200 }, { "epoch": 5.9041296928675235, "grad_norm": 6.128334999084473, "learning_rate": 0.00012117335347118658, "loss": 7.4995, "step": 1449300 }, { "epoch": 5.904537070890905, "grad_norm": 3.75400972366333, "learning_rate": 0.00012102696359173672, "loss": 7.5163, "step": 1449400 }, { "epoch": 5.904944448914287, "grad_norm": 22.000093460083008, "learning_rate": 0.0001208806679634915, "loss": 7.5036, "step": 1449500 }, { "epoch": 5.905351826937668, "grad_norm": 10.487384796142578, "learning_rate": 0.00012073446659223455, "loss": 7.5393, "step": 1449600 }, { "epoch": 5.90575920496105, "grad_norm": 13.914013862609863, "learning_rate": 0.00012058835948374605, "loss": 7.5098, "step": 1449700 }, { "epoch": 5.906166582984431, "grad_norm": 4.168239116668701, "learning_rate": 0.00012044234664380251, "loss": 7.5023, "step": 1449800 }, { "epoch": 5.906573961007813, "grad_norm": 73.41988372802734, "learning_rate": 0.00012029642807817591, "loss": 7.4953, "step": 1449900 }, { "epoch": 5.906981339031194, "grad_norm": 4.312287330627441, "learning_rate": 0.00012015060379263657, "loss": 7.5129, "step": 1450000 }, { "epoch": 5.906981339031194, "eval_MaskedAccuracy": 0.5135990392072747, "eval_loss": 1.5788543224334717, "eval_runtime": 164.1998, "eval_samples_per_second": 386.578, "eval_steps_per_second": 1.51, "step": 1450000 }, { "epoch": 5.907388717054576, "grad_norm": 29.152294158935547, "learning_rate": 0.00012000487379294871, "loss": 7.4977, "step": 1450100 }, { "epoch": 5.907796095077957, "grad_norm": 16.983320236206055, "learning_rate": 0.00011985923808487424, "loss": 7.4934, "step": 1450200 }, { "epoch": 5.908203473101338, "grad_norm": 10.714752197265625, "learning_rate": 0.00011971369667417034, "loss": 7.5055, "step": 1450300 }, { "epoch": 5.9086108511247195, "grad_norm": 3.5559794902801514, "learning_rate": 0.00011956824956659175, "loss": 7.4962, "step": 1450400 }, { "epoch": 5.909018229148101, "grad_norm": 6.410406112670898, "learning_rate": 0.00011942289676788828, "loss": 7.506, "step": 1450500 }, { "epoch": 5.909425607171483, "grad_norm": 14.903667449951172, "learning_rate": 0.00011927763828380662, "loss": 7.5288, "step": 1450600 }, { "epoch": 5.909832985194864, "grad_norm": 17.27276611328125, "learning_rate": 0.00011913247412008981, "loss": 7.5025, "step": 1450700 }, { "epoch": 5.910240363218246, "grad_norm": 14.003911972045898, "learning_rate": 0.00011898740428247663, "loss": 7.5199, "step": 1450800 }, { "epoch": 5.910647741241627, "grad_norm": 23.43582534790039, "learning_rate": 0.00011884242877670301, "loss": 7.4905, "step": 1450900 }, { "epoch": 5.911055119265009, "grad_norm": 11.995508193969727, "learning_rate": 0.00011869754760849997, "loss": 7.5087, "step": 1451000 }, { "epoch": 5.911055119265009, "eval_MaskedAccuracy": 0.51343927019386, "eval_loss": 1.5892750024795532, "eval_runtime": 175.4648, "eval_samples_per_second": 361.759, "eval_steps_per_second": 1.413, "step": 1451000 }, { "epoch": 5.91146249728839, "grad_norm": 17.342601776123047, "learning_rate": 0.00011855276078359587, "loss": 7.4915, "step": 1451100 }, { "epoch": 5.911869875311772, "grad_norm": 14.83423137664795, "learning_rate": 0.0001184080683077145, "loss": 7.5277, "step": 1451200 }, { "epoch": 5.912277253335153, "grad_norm": 15.177057266235352, "learning_rate": 0.00011826347018657685, "loss": 7.5006, "step": 1451300 }, { "epoch": 5.912684631358534, "grad_norm": 11.612956047058105, "learning_rate": 0.00011811896642589933, "loss": 7.4908, "step": 1451400 }, { "epoch": 5.913092009381916, "grad_norm": 19.39689064025879, "learning_rate": 0.00011797455703139484, "loss": 7.4941, "step": 1451500 }, { "epoch": 5.913499387405297, "grad_norm": 3.6763217449188232, "learning_rate": 0.00011783024200877304, "loss": 7.5167, "step": 1451600 }, { "epoch": 5.9139067654286785, "grad_norm": 12.86085033416748, "learning_rate": 0.0001176860213637391, "loss": 7.498, "step": 1451700 }, { "epoch": 5.91431414345206, "grad_norm": 4.068489074707031, "learning_rate": 0.00011754189510199496, "loss": 7.4984, "step": 1451800 }, { "epoch": 5.914721521475442, "grad_norm": 20.809120178222656, "learning_rate": 0.00011739786322923863, "loss": 7.4948, "step": 1451900 }, { "epoch": 5.915128899498823, "grad_norm": 11.219196319580078, "learning_rate": 0.00011725392575116451, "loss": 7.5258, "step": 1452000 }, { "epoch": 5.915128899498823, "eval_MaskedAccuracy": 0.5137053143283565, "eval_loss": 1.5857536792755127, "eval_runtime": 186.5472, "eval_samples_per_second": 340.268, "eval_steps_per_second": 1.329, "step": 1452000 }, { "epoch": 5.915536277522205, "grad_norm": 13.42127799987793, "learning_rate": 0.0001171100826734637, "loss": 7.5294, "step": 1452100 }, { "epoch": 5.915943655545586, "grad_norm": 8.402169227600098, "learning_rate": 0.00011696633400182227, "loss": 7.4891, "step": 1452200 }, { "epoch": 5.916351033568968, "grad_norm": 17.03421974182129, "learning_rate": 0.00011682267974192385, "loss": 7.4852, "step": 1452300 }, { "epoch": 5.916758411592349, "grad_norm": 3.7382123470306396, "learning_rate": 0.000116679119899448, "loss": 7.5025, "step": 1452400 }, { "epoch": 5.917165789615731, "grad_norm": 19.800050735473633, "learning_rate": 0.00011653565448006997, "loss": 7.4884, "step": 1452500 }, { "epoch": 5.917573167639112, "grad_norm": 3.0500950813293457, "learning_rate": 0.00011639228348946203, "loss": 7.5367, "step": 1452600 }, { "epoch": 5.917980545662493, "grad_norm": 8.724006652832031, "learning_rate": 0.00011624900693329203, "loss": 7.4985, "step": 1452700 }, { "epoch": 5.9183879236858745, "grad_norm": 19.91343116760254, "learning_rate": 0.00011610582481722494, "loss": 7.5106, "step": 1452800 }, { "epoch": 5.918795301709256, "grad_norm": 16.161819458007812, "learning_rate": 0.00011596273714692142, "loss": 7.4874, "step": 1452900 }, { "epoch": 5.919202679732638, "grad_norm": 4.49916934967041, "learning_rate": 0.00011581974392803816, "loss": 7.5098, "step": 1453000 }, { "epoch": 5.919202679732638, "eval_MaskedAccuracy": 0.5143894777400708, "eval_loss": 1.5754600763320923, "eval_runtime": 158.4013, "eval_samples_per_second": 400.729, "eval_steps_per_second": 1.566, "step": 1453000 }, { "epoch": 5.919610057756019, "grad_norm": 23.63644027709961, "learning_rate": 0.00011567684516622845, "loss": 7.4949, "step": 1453100 }, { "epoch": 5.920017435779401, "grad_norm": 23.471284866333008, "learning_rate": 0.00011553404086714239, "loss": 7.5216, "step": 1453200 }, { "epoch": 5.920424813802782, "grad_norm": 16.059326171875, "learning_rate": 0.00011539133103642536, "loss": 7.537, "step": 1453300 }, { "epoch": 5.920832191826164, "grad_norm": 5.33090877532959, "learning_rate": 0.00011524871567971943, "loss": 7.5013, "step": 1453400 }, { "epoch": 5.921239569849545, "grad_norm": 6.212322235107422, "learning_rate": 0.00011510619480266355, "loss": 7.5425, "step": 1453500 }, { "epoch": 5.921646947872927, "grad_norm": 23.205650329589844, "learning_rate": 0.00011496376841089155, "loss": 7.5143, "step": 1453600 }, { "epoch": 5.922054325896308, "grad_norm": 20.84681510925293, "learning_rate": 0.00011482143651003477, "loss": 7.5228, "step": 1453700 }, { "epoch": 5.92246170391969, "grad_norm": 17.809091567993164, "learning_rate": 0.00011467919910571971, "loss": 7.5239, "step": 1453800 }, { "epoch": 5.922869081943071, "grad_norm": 5.129265308380127, "learning_rate": 0.00011453705620357022, "loss": 7.5259, "step": 1453900 }, { "epoch": 5.923276459966452, "grad_norm": 13.345703125, "learning_rate": 0.00011439500780920652, "loss": 7.5022, "step": 1454000 }, { "epoch": 5.923276459966452, "eval_MaskedAccuracy": 0.5135653023197638, "eval_loss": 1.5850971937179565, "eval_runtime": 182.5633, "eval_samples_per_second": 347.693, "eval_steps_per_second": 1.358, "step": 1454000 }, { "epoch": 5.923683837989834, "grad_norm": 9.327455520629883, "learning_rate": 0.00011425305392824364, "loss": 7.4987, "step": 1454100 }, { "epoch": 5.924091216013215, "grad_norm": 14.787358283996582, "learning_rate": 0.00011411119456629445, "loss": 7.5079, "step": 1454200 }, { "epoch": 5.924498594036597, "grad_norm": 6.930863857269287, "learning_rate": 0.00011396942972896683, "loss": 7.5335, "step": 1454300 }, { "epoch": 5.924905972059978, "grad_norm": 11.233614921569824, "learning_rate": 0.00011382775942186596, "loss": 7.5241, "step": 1454400 }, { "epoch": 5.92531335008336, "grad_norm": 4.68874454498291, "learning_rate": 0.00011368618365059263, "loss": 7.5484, "step": 1454500 }, { "epoch": 5.925720728106741, "grad_norm": 10.873495101928711, "learning_rate": 0.00011354470242074426, "loss": 7.494, "step": 1454600 }, { "epoch": 5.926128106130123, "grad_norm": 11.247530937194824, "learning_rate": 0.00011340331573791407, "loss": 7.4829, "step": 1454700 }, { "epoch": 5.926535484153504, "grad_norm": 39.476524353027344, "learning_rate": 0.00011326202360769187, "loss": 7.5259, "step": 1454800 }, { "epoch": 5.926942862176886, "grad_norm": 25.049827575683594, "learning_rate": 0.00011312082603566377, "loss": 7.5139, "step": 1454900 }, { "epoch": 5.927350240200267, "grad_norm": 3.6227469444274902, "learning_rate": 0.00011297972302741214, "loss": 7.5229, "step": 1455000 }, { "epoch": 5.927350240200267, "eval_MaskedAccuracy": 0.5139925969125861, "eval_loss": 1.578850507736206, "eval_runtime": 176.3056, "eval_samples_per_second": 360.034, "eval_steps_per_second": 1.407, "step": 1455000 }, { "epoch": 5.927757618223649, "grad_norm": 19.786165237426758, "learning_rate": 0.00011283871458851566, "loss": 7.5219, "step": 1455100 }, { "epoch": 5.92816499624703, "grad_norm": 3.4540088176727295, "learning_rate": 0.0001126978007245488, "loss": 7.536, "step": 1455200 }, { "epoch": 5.928572374270411, "grad_norm": 26.75469207763672, "learning_rate": 0.0001125569814410828, "loss": 7.4783, "step": 1455300 }, { "epoch": 5.928979752293793, "grad_norm": 15.556597709655762, "learning_rate": 0.00011241625674368492, "loss": 7.4901, "step": 1455400 }, { "epoch": 5.929387130317174, "grad_norm": 19.530500411987305, "learning_rate": 0.000112275626637919, "loss": 7.5292, "step": 1455500 }, { "epoch": 5.929794508340556, "grad_norm": 23.01983642578125, "learning_rate": 0.00011213509112934449, "loss": 7.4952, "step": 1455600 }, { "epoch": 5.930201886363937, "grad_norm": 28.697982788085938, "learning_rate": 0.00011199465022351797, "loss": 7.5308, "step": 1455700 }, { "epoch": 5.930609264387319, "grad_norm": 9.541047096252441, "learning_rate": 0.00011185430392599124, "loss": 7.5132, "step": 1455800 }, { "epoch": 5.9310166424107, "grad_norm": 14.119759559631348, "learning_rate": 0.00011171405224231382, "loss": 7.5025, "step": 1455900 }, { "epoch": 5.931424020434082, "grad_norm": 3.3149988651275635, "learning_rate": 0.00011157389517802986, "loss": 7.5053, "step": 1456000 }, { "epoch": 5.931424020434082, "eval_MaskedAccuracy": 0.514069930094781, "eval_loss": 1.580933690071106, "eval_runtime": 170.3279, "eval_samples_per_second": 372.669, "eval_steps_per_second": 1.456, "step": 1456000 }, { "epoch": 5.931831398457463, "grad_norm": 8.514421463012695, "learning_rate": 0.00011143383273868073, "loss": 7.518, "step": 1456100 }, { "epoch": 5.932238776480845, "grad_norm": 21.946008682250977, "learning_rate": 0.00011129386492980401, "loss": 7.5054, "step": 1456200 }, { "epoch": 5.932646154504226, "grad_norm": 6.1492228507995605, "learning_rate": 0.00011115399175693329, "loss": 7.5188, "step": 1456300 }, { "epoch": 5.933053532527607, "grad_norm": 24.58721351623535, "learning_rate": 0.00011101421322559837, "loss": 7.5078, "step": 1456400 }, { "epoch": 5.9334609105509895, "grad_norm": 11.320701599121094, "learning_rate": 0.00011087452934132561, "loss": 7.4834, "step": 1456500 }, { "epoch": 5.93386828857437, "grad_norm": 13.344386100769043, "learning_rate": 0.00011073494010963742, "loss": 7.5097, "step": 1456600 }, { "epoch": 5.934275666597752, "grad_norm": 3.499661922454834, "learning_rate": 0.00011059544553605261, "loss": 7.51, "step": 1456700 }, { "epoch": 5.934683044621133, "grad_norm": 14.360859870910645, "learning_rate": 0.0001104560456260858, "loss": 7.4833, "step": 1456800 }, { "epoch": 5.935090422644515, "grad_norm": 14.348841667175293, "learning_rate": 0.00011031674038524853, "loss": 7.5203, "step": 1456900 }, { "epoch": 5.935497800667896, "grad_norm": 6.4998459815979, "learning_rate": 0.00011017752981904802, "loss": 7.5111, "step": 1457000 }, { "epoch": 5.935497800667896, "eval_MaskedAccuracy": 0.5138158737279147, "eval_loss": 1.5889171361923218, "eval_runtime": 192.7508, "eval_samples_per_second": 329.316, "eval_steps_per_second": 1.287, "step": 1457000 }, { "epoch": 5.935905178691278, "grad_norm": 9.365138053894043, "learning_rate": 0.00011003841393298827, "loss": 7.4937, "step": 1457100 }, { "epoch": 5.936312556714659, "grad_norm": 2.9428069591522217, "learning_rate": 0.00010989939273256926, "loss": 7.5282, "step": 1457200 }, { "epoch": 5.936719934738041, "grad_norm": 19.085132598876953, "learning_rate": 0.00010976046622328699, "loss": 7.484, "step": 1457300 }, { "epoch": 5.937127312761422, "grad_norm": 15.751285552978516, "learning_rate": 0.00010962163441063402, "loss": 7.5371, "step": 1457400 }, { "epoch": 5.937534690784804, "grad_norm": 14.46463680267334, "learning_rate": 0.0001094828973000994, "loss": 7.5145, "step": 1457500 }, { "epoch": 5.937942068808185, "grad_norm": 5.172186374664307, "learning_rate": 0.00010934425489716794, "loss": 7.5101, "step": 1457600 }, { "epoch": 5.938349446831566, "grad_norm": 38.2766227722168, "learning_rate": 0.00010920570720732101, "loss": 7.5323, "step": 1457700 }, { "epoch": 5.938756824854948, "grad_norm": 3.047947406768799, "learning_rate": 0.00010906725423603598, "loss": 7.5255, "step": 1457800 }, { "epoch": 5.939164202878329, "grad_norm": 2.5195424556732178, "learning_rate": 0.00010892889598878674, "loss": 7.4827, "step": 1457900 }, { "epoch": 5.939571580901711, "grad_norm": 14.40636157989502, "learning_rate": 0.00010879063247104339, "loss": 7.5231, "step": 1458000 }, { "epoch": 5.939571580901711, "eval_MaskedAccuracy": 0.5136709774046209, "eval_loss": 1.5871533155441284, "eval_runtime": 173.1303, "eval_samples_per_second": 366.637, "eval_steps_per_second": 1.432, "step": 1458000 }, { "epoch": 5.939978958925092, "grad_norm": 3.6198933124542236, "learning_rate": 0.0001086524636882722, "loss": 7.5091, "step": 1458100 }, { "epoch": 5.940386336948474, "grad_norm": 5.195720195770264, "learning_rate": 0.00010851438964593575, "loss": 7.5194, "step": 1458200 }, { "epoch": 5.940793714971855, "grad_norm": 5.755271911621094, "learning_rate": 0.0001083764103494929, "loss": 7.4978, "step": 1458300 }, { "epoch": 5.941201092995237, "grad_norm": 11.181453704833984, "learning_rate": 0.00010823852580439825, "loss": 7.5298, "step": 1458400 }, { "epoch": 5.941608471018618, "grad_norm": 6.790501117706299, "learning_rate": 0.00010810073601610384, "loss": 7.4772, "step": 1458500 }, { "epoch": 5.942015849042, "grad_norm": 37.07013702392578, "learning_rate": 0.00010796304099005648, "loss": 7.5224, "step": 1458600 }, { "epoch": 5.942423227065381, "grad_norm": 15.961553573608398, "learning_rate": 0.00010782544073170042, "loss": 7.4948, "step": 1458700 }, { "epoch": 5.942830605088763, "grad_norm": 34.687110900878906, "learning_rate": 0.00010768793524647601, "loss": 7.5462, "step": 1458800 }, { "epoch": 5.9432379831121445, "grad_norm": 25.230497360229492, "learning_rate": 0.0001075505245398189, "loss": 7.5262, "step": 1458900 }, { "epoch": 5.943645361135525, "grad_norm": 19.544490814208984, "learning_rate": 0.0001074132086171622, "loss": 7.4948, "step": 1459000 }, { "epoch": 5.943645361135525, "eval_MaskedAccuracy": 0.5138457305683033, "eval_loss": 1.585815191268921, "eval_runtime": 179.7689, "eval_samples_per_second": 353.098, "eval_steps_per_second": 1.38, "step": 1459000 }, { "epoch": 5.944052739158907, "grad_norm": 8.752004623413086, "learning_rate": 0.00010727598748393458, "loss": 7.5053, "step": 1459100 }, { "epoch": 5.944460117182288, "grad_norm": 33.22947692871094, "learning_rate": 0.00010713886114556092, "loss": 7.477, "step": 1459200 }, { "epoch": 5.94486749520567, "grad_norm": 24.97207260131836, "learning_rate": 0.00010700182960746278, "loss": 7.5359, "step": 1459300 }, { "epoch": 5.945274873229051, "grad_norm": 13.16974925994873, "learning_rate": 0.0001068648928750579, "loss": 7.5337, "step": 1459400 }, { "epoch": 5.945682251252433, "grad_norm": 18.190574645996094, "learning_rate": 0.00010672805095375971, "loss": 7.5315, "step": 1459500 }, { "epoch": 5.946089629275814, "grad_norm": 19.884090423583984, "learning_rate": 0.00010659130384897889, "loss": 7.5197, "step": 1459600 }, { "epoch": 5.946497007299196, "grad_norm": 11.915521621704102, "learning_rate": 0.00010645465156612104, "loss": 7.5041, "step": 1459700 }, { "epoch": 5.946904385322577, "grad_norm": 36.03401565551758, "learning_rate": 0.0001063180941105886, "loss": 7.5012, "step": 1459800 }, { "epoch": 5.947311763345959, "grad_norm": 6.755247116088867, "learning_rate": 0.00010618163148778167, "loss": 7.4913, "step": 1459900 }, { "epoch": 5.94771914136934, "grad_norm": 3.6575753688812256, "learning_rate": 0.00010604526370309458, "loss": 7.4957, "step": 1460000 }, { "epoch": 5.94771914136934, "eval_MaskedAccuracy": 0.5137182524719381, "eval_loss": 1.5914576053619385, "eval_runtime": 170.9962, "eval_samples_per_second": 371.213, "eval_steps_per_second": 1.45, "step": 1460000 }, { "epoch": 5.948126519392722, "grad_norm": 3.3408427238464355, "learning_rate": 0.00010590899076191884, "loss": 7.5273, "step": 1460100 }, { "epoch": 5.9485338974161035, "grad_norm": 6.367294788360596, "learning_rate": 0.00010577281266964175, "loss": 7.5127, "step": 1460200 }, { "epoch": 5.948941275439484, "grad_norm": 3.021536350250244, "learning_rate": 0.00010563672943164713, "loss": 7.5185, "step": 1460300 }, { "epoch": 5.949348653462866, "grad_norm": 17.60310173034668, "learning_rate": 0.00010550074105331562, "loss": 7.4955, "step": 1460400 }, { "epoch": 5.949756031486247, "grad_norm": 13.842937469482422, "learning_rate": 0.00010536484754002345, "loss": 7.513, "step": 1460500 }, { "epoch": 5.950163409509629, "grad_norm": 13.60439395904541, "learning_rate": 0.00010522904889714263, "loss": 7.519, "step": 1460600 }, { "epoch": 5.95057078753301, "grad_norm": 15.470193862915039, "learning_rate": 0.00010509334513004263, "loss": 7.4751, "step": 1460700 }, { "epoch": 5.950978165556392, "grad_norm": 11.470630645751953, "learning_rate": 0.00010495773624408801, "loss": 7.5312, "step": 1460800 }, { "epoch": 5.951385543579773, "grad_norm": 24.447980880737305, "learning_rate": 0.00010482222224464061, "loss": 7.5286, "step": 1460900 }, { "epoch": 5.951792921603155, "grad_norm": 22.97228240966797, "learning_rate": 0.00010468680313705783, "loss": 7.4935, "step": 1461000 }, { "epoch": 5.951792921603155, "eval_MaskedAccuracy": 0.513794019307914, "eval_loss": 1.5820680856704712, "eval_runtime": 171.5518, "eval_samples_per_second": 370.011, "eval_steps_per_second": 1.446, "step": 1461000 }, { "epoch": 5.952200299626536, "grad_norm": 26.28276252746582, "learning_rate": 0.00010455147892669349, "loss": 7.5011, "step": 1461100 }, { "epoch": 5.952607677649918, "grad_norm": 3.126626491546631, "learning_rate": 0.0001044162496188975, "loss": 7.5204, "step": 1461200 }, { "epoch": 5.9530150556732995, "grad_norm": 3.186384439468384, "learning_rate": 0.00010428111521901671, "loss": 7.5096, "step": 1461300 }, { "epoch": 5.95342243369668, "grad_norm": 5.268251419067383, "learning_rate": 0.00010414607573239319, "loss": 7.5026, "step": 1461400 }, { "epoch": 5.9538298117200625, "grad_norm": 11.56948184967041, "learning_rate": 0.00010401113116436601, "loss": 7.5108, "step": 1461500 }, { "epoch": 5.954237189743443, "grad_norm": 28.242420196533203, "learning_rate": 0.00010387628152026999, "loss": 7.4937, "step": 1461600 }, { "epoch": 5.954644567766825, "grad_norm": 4.825844764709473, "learning_rate": 0.0001037415268054368, "loss": 7.5388, "step": 1461700 }, { "epoch": 5.955051945790206, "grad_norm": 41.179134368896484, "learning_rate": 0.00010360686702519378, "loss": 7.5047, "step": 1461800 }, { "epoch": 5.955459323813588, "grad_norm": 11.953980445861816, "learning_rate": 0.00010347230218486505, "loss": 7.4873, "step": 1461900 }, { "epoch": 5.955866701836969, "grad_norm": 18.22443199157715, "learning_rate": 0.00010333783228977027, "loss": 7.5001, "step": 1462000 }, { "epoch": 5.955866701836969, "eval_MaskedAccuracy": 0.5136876045121863, "eval_loss": 1.5788699388504028, "eval_runtime": 170.8875, "eval_samples_per_second": 371.449, "eval_steps_per_second": 1.451, "step": 1462000 }, { "epoch": 5.956274079860351, "grad_norm": 3.943265199661255, "learning_rate": 0.00010320345734522625, "loss": 7.4993, "step": 1462100 }, { "epoch": 5.956681457883732, "grad_norm": 3.824932098388672, "learning_rate": 0.00010306917735654511, "loss": 7.4942, "step": 1462200 }, { "epoch": 5.957088835907114, "grad_norm": 16.49104118347168, "learning_rate": 0.00010293499232903566, "loss": 7.5071, "step": 1462300 }, { "epoch": 5.957496213930495, "grad_norm": 4.29388952255249, "learning_rate": 0.00010280090226800353, "loss": 7.5231, "step": 1462400 }, { "epoch": 5.957903591953877, "grad_norm": 5.01418924331665, "learning_rate": 0.00010266690717874919, "loss": 7.4935, "step": 1462500 }, { "epoch": 5.9583109699772585, "grad_norm": 14.47789192199707, "learning_rate": 0.00010253300706657099, "loss": 7.5057, "step": 1462600 }, { "epoch": 5.958718348000639, "grad_norm": 24.14881134033203, "learning_rate": 0.00010239920193676215, "loss": 7.5101, "step": 1462700 }, { "epoch": 5.959125726024021, "grad_norm": 16.402585983276367, "learning_rate": 0.00010226549179461274, "loss": 7.4822, "step": 1462800 }, { "epoch": 5.959533104047402, "grad_norm": 22.45849609375, "learning_rate": 0.00010213187664540925, "loss": 7.5053, "step": 1462900 }, { "epoch": 5.959940482070784, "grad_norm": 33.035240173339844, "learning_rate": 0.00010199835649443415, "loss": 7.5059, "step": 1463000 }, { "epoch": 5.959940482070784, "eval_MaskedAccuracy": 0.5138496094674514, "eval_loss": 1.5820598602294922, "eval_runtime": 173.585, "eval_samples_per_second": 365.677, "eval_steps_per_second": 1.429, "step": 1463000 }, { "epoch": 5.960347860094165, "grad_norm": 4.521730899810791, "learning_rate": 0.00010186493134696622, "loss": 7.5253, "step": 1463100 }, { "epoch": 5.960755238117547, "grad_norm": 5.788667678833008, "learning_rate": 0.00010173160120828051, "loss": 7.5097, "step": 1463200 }, { "epoch": 5.961162616140928, "grad_norm": 15.684574127197266, "learning_rate": 0.00010159836608364796, "loss": 7.5296, "step": 1463300 }, { "epoch": 5.96156999416431, "grad_norm": 13.343859672546387, "learning_rate": 0.00010146522597833619, "loss": 7.5307, "step": 1463400 }, { "epoch": 5.961977372187691, "grad_norm": 3.1307003498077393, "learning_rate": 0.0001013321808976096, "loss": 7.5201, "step": 1463500 }, { "epoch": 5.962384750211073, "grad_norm": 17.754648208618164, "learning_rate": 0.000101199230846727, "loss": 7.5361, "step": 1463600 }, { "epoch": 5.9627921282344545, "grad_norm": 6.762691974639893, "learning_rate": 0.00010106637583094499, "loss": 7.4972, "step": 1463700 }, { "epoch": 5.963199506257836, "grad_norm": 6.7893548011779785, "learning_rate": 0.00010093361585551683, "loss": 7.5174, "step": 1463800 }, { "epoch": 5.9636068842812175, "grad_norm": 13.800431251525879, "learning_rate": 0.00010080095092569082, "loss": 7.5069, "step": 1463900 }, { "epoch": 5.964014262304598, "grad_norm": 6.065407752990723, "learning_rate": 0.00010066838104671137, "loss": 7.4887, "step": 1464000 }, { "epoch": 5.964014262304598, "eval_MaskedAccuracy": 0.5136984718314406, "eval_loss": 1.5877317190170288, "eval_runtime": 187.7424, "eval_samples_per_second": 338.102, "eval_steps_per_second": 1.321, "step": 1464000 }, { "epoch": 5.96442164032798, "grad_norm": 15.611747741699219, "learning_rate": 0.00010053590622382053, "loss": 7.4778, "step": 1464100 }, { "epoch": 5.964829018351361, "grad_norm": 24.867883682250977, "learning_rate": 0.00010040352646225486, "loss": 7.5207, "step": 1464200 }, { "epoch": 5.965236396374743, "grad_norm": 13.83848762512207, "learning_rate": 0.0001002712417672487, "loss": 7.5243, "step": 1464300 }, { "epoch": 5.965643774398124, "grad_norm": 16.843660354614258, "learning_rate": 0.00010013905214403175, "loss": 7.4785, "step": 1464400 }, { "epoch": 5.966051152421506, "grad_norm": 5.088784217834473, "learning_rate": 0.00010000695759783033, "loss": 7.5036, "step": 1464500 }, { "epoch": 5.966458530444887, "grad_norm": 18.821393966674805, "learning_rate": 9.987495813386656e-05, "loss": 7.5131, "step": 1464600 }, { "epoch": 5.966865908468269, "grad_norm": 15.2603120803833, "learning_rate": 9.974305375735924e-05, "loss": 7.4984, "step": 1464700 }, { "epoch": 5.9672732864916505, "grad_norm": 5.545129776000977, "learning_rate": 9.961124447352308e-05, "loss": 7.5165, "step": 1464800 }, { "epoch": 5.967680664515032, "grad_norm": 2.6632354259490967, "learning_rate": 9.94795302875693e-05, "loss": 7.4969, "step": 1464900 }, { "epoch": 5.9680880425384135, "grad_norm": 11.412628173828125, "learning_rate": 9.934791120470565e-05, "loss": 7.4946, "step": 1465000 }, { "epoch": 5.9680880425384135, "eval_MaskedAccuracy": 0.5141648127293885, "eval_loss": 1.5856399536132812, "eval_runtime": 168.2749, "eval_samples_per_second": 377.216, "eval_steps_per_second": 1.474, "step": 1465000 }, { "epoch": 5.968495420561795, "grad_norm": 13.691043853759766, "learning_rate": 9.921638723013513e-05, "loss": 7.5057, "step": 1465100 }, { "epoch": 5.968902798585177, "grad_norm": 11.590338706970215, "learning_rate": 9.908495836905769e-05, "loss": 7.528, "step": 1465200 }, { "epoch": 5.969310176608557, "grad_norm": 9.705368041992188, "learning_rate": 9.895362462666984e-05, "loss": 7.5023, "step": 1465300 }, { "epoch": 5.969717554631939, "grad_norm": 3.938136100769043, "learning_rate": 9.88223860081638e-05, "loss": 7.5202, "step": 1465400 }, { "epoch": 5.97012493265532, "grad_norm": 17.26057243347168, "learning_rate": 9.869124251872763e-05, "loss": 7.5211, "step": 1465500 }, { "epoch": 5.970532310678702, "grad_norm": 18.391721725463867, "learning_rate": 9.856019416354689e-05, "loss": 7.5244, "step": 1465600 }, { "epoch": 5.970939688702083, "grad_norm": 17.442569732666016, "learning_rate": 9.842924094780166e-05, "loss": 7.5026, "step": 1465700 }, { "epoch": 5.971347066725465, "grad_norm": 33.836708068847656, "learning_rate": 9.829838287667e-05, "loss": 7.5119, "step": 1465800 }, { "epoch": 5.971754444748846, "grad_norm": 3.290839195251465, "learning_rate": 9.81676199553254e-05, "loss": 7.5054, "step": 1465900 }, { "epoch": 5.972161822772228, "grad_norm": 22.256479263305664, "learning_rate": 9.803695218893726e-05, "loss": 7.4874, "step": 1466000 }, { "epoch": 5.972161822772228, "eval_MaskedAccuracy": 0.513176038920254, "eval_loss": 1.5861608982086182, "eval_runtime": 197.0123, "eval_samples_per_second": 322.193, "eval_steps_per_second": 1.259, "step": 1466000 }, { "epoch": 5.9725692007956095, "grad_norm": 5.800668239593506, "learning_rate": 9.790637958267198e-05, "loss": 7.5038, "step": 1466100 }, { "epoch": 5.972976578818991, "grad_norm": 30.10011100769043, "learning_rate": 9.777590214169132e-05, "loss": 7.5013, "step": 1466200 }, { "epoch": 5.9733839568423726, "grad_norm": 3.665769338607788, "learning_rate": 9.764551987115402e-05, "loss": 7.5282, "step": 1466300 }, { "epoch": 5.973791334865753, "grad_norm": 22.723546981811523, "learning_rate": 9.75152327762147e-05, "loss": 7.4963, "step": 1466400 }, { "epoch": 5.974198712889136, "grad_norm": 24.522565841674805, "learning_rate": 9.738504086202449e-05, "loss": 7.5091, "step": 1466500 }, { "epoch": 5.974606090912516, "grad_norm": 19.31745147705078, "learning_rate": 9.725494413373036e-05, "loss": 7.5246, "step": 1466600 }, { "epoch": 5.975013468935898, "grad_norm": 3.7023141384124756, "learning_rate": 9.712494259647596e-05, "loss": 7.4691, "step": 1466700 }, { "epoch": 5.975420846959279, "grad_norm": 14.211570739746094, "learning_rate": 9.699503625540065e-05, "loss": 7.5027, "step": 1466800 }, { "epoch": 5.975828224982661, "grad_norm": 13.158486366271973, "learning_rate": 9.686522511564062e-05, "loss": 7.5129, "step": 1466900 }, { "epoch": 5.976235603006042, "grad_norm": 16.808256149291992, "learning_rate": 9.673550918232768e-05, "loss": 7.5369, "step": 1467000 }, { "epoch": 5.976235603006042, "eval_MaskedAccuracy": 0.5137634491849503, "eval_loss": 1.58700430393219, "eval_runtime": 171.3642, "eval_samples_per_second": 370.416, "eval_steps_per_second": 1.447, "step": 1467000 }, { "epoch": 5.976642981029424, "grad_norm": 19.31085205078125, "learning_rate": 9.66058884605905e-05, "loss": 7.493, "step": 1467100 }, { "epoch": 5.9770503590528055, "grad_norm": 9.289825439453125, "learning_rate": 9.647636295555366e-05, "loss": 7.5255, "step": 1467200 }, { "epoch": 5.977457737076187, "grad_norm": 18.02328109741211, "learning_rate": 9.634693267233765e-05, "loss": 7.4943, "step": 1467300 }, { "epoch": 5.9778651150995685, "grad_norm": 5.039106369018555, "learning_rate": 9.621759761605976e-05, "loss": 7.5191, "step": 1467400 }, { "epoch": 5.97827249312295, "grad_norm": 21.01978874206543, "learning_rate": 9.608835779183349e-05, "loss": 7.5306, "step": 1467500 }, { "epoch": 5.978679871146332, "grad_norm": 6.009078502655029, "learning_rate": 9.595921320476804e-05, "loss": 7.487, "step": 1467600 }, { "epoch": 5.979087249169712, "grad_norm": 6.737271308898926, "learning_rate": 9.583016385996886e-05, "loss": 7.4964, "step": 1467700 }, { "epoch": 5.979494627193094, "grad_norm": 35.005550384521484, "learning_rate": 9.570120976253911e-05, "loss": 7.5056, "step": 1467800 }, { "epoch": 5.979902005216475, "grad_norm": 3.025550127029419, "learning_rate": 9.557235091757648e-05, "loss": 7.5109, "step": 1467900 }, { "epoch": 5.980309383239857, "grad_norm": 6.011305809020996, "learning_rate": 9.544358733017512e-05, "loss": 7.5341, "step": 1468000 }, { "epoch": 5.980309383239857, "eval_MaskedAccuracy": 0.5141501315477184, "eval_loss": 1.5844790935516357, "eval_runtime": 158.4112, "eval_samples_per_second": 400.704, "eval_steps_per_second": 1.566, "step": 1468000 }, { "epoch": 5.980716761263238, "grad_norm": 9.449957847595215, "learning_rate": 9.531491900542593e-05, "loss": 7.5071, "step": 1468100 }, { "epoch": 5.98112413928662, "grad_norm": 14.28373908996582, "learning_rate": 9.518634594841594e-05, "loss": 7.5178, "step": 1468200 }, { "epoch": 5.981531517310001, "grad_norm": 20.214340209960938, "learning_rate": 9.505786816422842e-05, "loss": 7.5069, "step": 1468300 }, { "epoch": 5.981938895333383, "grad_norm": 8.812238693237305, "learning_rate": 9.492948565794244e-05, "loss": 7.5183, "step": 1468400 }, { "epoch": 5.9823462733567645, "grad_norm": 25.215444564819336, "learning_rate": 9.480119843463404e-05, "loss": 7.5348, "step": 1468500 }, { "epoch": 5.982753651380146, "grad_norm": 26.55904197692871, "learning_rate": 9.46730064993748e-05, "loss": 7.5074, "step": 1468600 }, { "epoch": 5.983161029403528, "grad_norm": 21.207361221313477, "learning_rate": 9.454490985723302e-05, "loss": 7.514, "step": 1468700 }, { "epoch": 5.983568407426909, "grad_norm": 30.02730369567871, "learning_rate": 9.44169085132728e-05, "loss": 7.5086, "step": 1468800 }, { "epoch": 5.983975785450291, "grad_norm": 24.488086700439453, "learning_rate": 9.428900247255503e-05, "loss": 7.5401, "step": 1468900 }, { "epoch": 5.984383163473671, "grad_norm": 31.024044036865234, "learning_rate": 9.416119174013635e-05, "loss": 7.5347, "step": 1469000 }, { "epoch": 5.984383163473671, "eval_MaskedAccuracy": 0.5133502734147514, "eval_loss": 1.5921359062194824, "eval_runtime": 168.351, "eval_samples_per_second": 377.046, "eval_steps_per_second": 1.473, "step": 1469000 }, { "epoch": 5.984790541497053, "grad_norm": 5.456093788146973, "learning_rate": 9.403347632106972e-05, "loss": 7.5337, "step": 1469100 }, { "epoch": 5.985197919520434, "grad_norm": 5.64096736907959, "learning_rate": 9.390585622040481e-05, "loss": 7.5264, "step": 1469200 }, { "epoch": 5.985605297543816, "grad_norm": 27.000465393066406, "learning_rate": 9.377833144318656e-05, "loss": 7.5282, "step": 1469300 }, { "epoch": 5.986012675567197, "grad_norm": 2.9788384437561035, "learning_rate": 9.365090199445708e-05, "loss": 7.5108, "step": 1469400 }, { "epoch": 5.986420053590579, "grad_norm": 6.45781946182251, "learning_rate": 9.352356787925448e-05, "loss": 7.5074, "step": 1469500 }, { "epoch": 5.9868274316139605, "grad_norm": 5.0068583488464355, "learning_rate": 9.33963291026124e-05, "loss": 7.4929, "step": 1469600 }, { "epoch": 5.987234809637342, "grad_norm": 24.099658966064453, "learning_rate": 9.326918566956142e-05, "loss": 7.5002, "step": 1469700 }, { "epoch": 5.9876421876607235, "grad_norm": 8.808025360107422, "learning_rate": 9.314213758512856e-05, "loss": 7.5103, "step": 1469800 }, { "epoch": 5.988049565684105, "grad_norm": 22.51898193359375, "learning_rate": 9.301518485433673e-05, "loss": 7.5148, "step": 1469900 }, { "epoch": 5.988456943707487, "grad_norm": 22.935468673706055, "learning_rate": 9.288832748220463e-05, "loss": 7.5077, "step": 1470000 }, { "epoch": 5.988456943707487, "eval_MaskedAccuracy": 0.5140249406513584, "eval_loss": 1.5843276977539062, "eval_runtime": 167.0962, "eval_samples_per_second": 379.877, "eval_steps_per_second": 1.484, "step": 1470000 }, { "epoch": 5.988864321730867, "grad_norm": 5.14185094833374, "learning_rate": 9.276156547374807e-05, "loss": 7.5165, "step": 1470100 }, { "epoch": 5.98927169975425, "grad_norm": 19.748701095581055, "learning_rate": 9.263489883397812e-05, "loss": 7.5202, "step": 1470200 }, { "epoch": 5.98967907777763, "grad_norm": 14.636068344116211, "learning_rate": 9.250832756790298e-05, "loss": 7.5375, "step": 1470300 }, { "epoch": 5.990086455801012, "grad_norm": 2.6870667934417725, "learning_rate": 9.238185168052653e-05, "loss": 7.4993, "step": 1470400 }, { "epoch": 5.990493833824393, "grad_norm": 16.517555236816406, "learning_rate": 9.225547117684914e-05, "loss": 7.5019, "step": 1470500 }, { "epoch": 5.990901211847775, "grad_norm": 9.952735900878906, "learning_rate": 9.212918606186695e-05, "loss": 7.5014, "step": 1470600 }, { "epoch": 5.991308589871156, "grad_norm": 3.9285147190093994, "learning_rate": 9.200299634057354e-05, "loss": 7.5063, "step": 1470700 }, { "epoch": 5.991715967894538, "grad_norm": 14.709887504577637, "learning_rate": 9.1876902017957e-05, "loss": 7.5175, "step": 1470800 }, { "epoch": 5.9921233459179195, "grad_norm": 36.995018005371094, "learning_rate": 9.175090309900246e-05, "loss": 7.5189, "step": 1470900 }, { "epoch": 5.992530723941301, "grad_norm": 11.792192459106445, "learning_rate": 9.162499958869204e-05, "loss": 7.4943, "step": 1471000 }, { "epoch": 5.992530723941301, "eval_MaskedAccuracy": 0.5135675904019984, "eval_loss": 1.588697075843811, "eval_runtime": 163.4904, "eval_samples_per_second": 388.255, "eval_steps_per_second": 1.517, "step": 1471000 }, { "epoch": 5.992938101964683, "grad_norm": 16.327465057373047, "learning_rate": 9.149919149200308e-05, "loss": 7.5165, "step": 1471100 }, { "epoch": 5.993345479988064, "grad_norm": 21.664289474487305, "learning_rate": 9.137347881390924e-05, "loss": 7.4989, "step": 1471200 }, { "epoch": 5.993752858011446, "grad_norm": 5.597540855407715, "learning_rate": 9.124786155938075e-05, "loss": 7.5394, "step": 1471300 }, { "epoch": 5.994160236034826, "grad_norm": 3.8632829189300537, "learning_rate": 9.112233973338396e-05, "loss": 7.5079, "step": 1471400 }, { "epoch": 5.994567614058209, "grad_norm": 5.042380332946777, "learning_rate": 9.099691334088161e-05, "loss": 7.5295, "step": 1471500 }, { "epoch": 5.994974992081589, "grad_norm": 6.029579162597656, "learning_rate": 9.087158238683211e-05, "loss": 7.4962, "step": 1471600 }, { "epoch": 5.995382370104971, "grad_norm": 14.387858390808105, "learning_rate": 9.074634687619045e-05, "loss": 7.5266, "step": 1471700 }, { "epoch": 5.995789748128352, "grad_norm": 4.79557466506958, "learning_rate": 9.062120681390815e-05, "loss": 7.5172, "step": 1471800 }, { "epoch": 5.996197126151734, "grad_norm": 2.618720054626465, "learning_rate": 9.049616220493274e-05, "loss": 7.5104, "step": 1471900 }, { "epoch": 5.9966045041751155, "grad_norm": 8.536681175231934, "learning_rate": 9.037121305420762e-05, "loss": 7.4938, "step": 1472000 }, { "epoch": 5.9966045041751155, "eval_MaskedAccuracy": 0.5139832638053828, "eval_loss": 1.5836459398269653, "eval_runtime": 179.3431, "eval_samples_per_second": 353.936, "eval_steps_per_second": 1.383, "step": 1472000 }, { "epoch": 5.997011882198497, "grad_norm": 3.333171844482422, "learning_rate": 9.024635936667268e-05, "loss": 7.4803, "step": 1472100 }, { "epoch": 5.9974192602218785, "grad_norm": 17.23170280456543, "learning_rate": 9.012160114726437e-05, "loss": 7.4951, "step": 1472200 }, { "epoch": 5.99782663824526, "grad_norm": 5.657397270202637, "learning_rate": 8.999693840091489e-05, "loss": 7.4899, "step": 1472300 }, { "epoch": 5.998234016268642, "grad_norm": 2.1915388107299805, "learning_rate": 8.987237113255245e-05, "loss": 7.5321, "step": 1472400 }, { "epoch": 5.998641394292023, "grad_norm": 16.36842918395996, "learning_rate": 8.974789934710257e-05, "loss": 7.5004, "step": 1472500 }, { "epoch": 5.999048772315405, "grad_norm": 12.492622375488281, "learning_rate": 8.962352304948547e-05, "loss": 7.5078, "step": 1472600 }, { "epoch": 5.999456150338785, "grad_norm": 4.250843524932861, "learning_rate": 8.949924224461917e-05, "loss": 7.5268, "step": 1472700 }, { "epoch": 5.999863528362167, "grad_norm": 3.642354965209961, "learning_rate": 8.937505693741677e-05, "loss": 7.5178, "step": 1472800 }, { "epoch": 6.000270906385548, "grad_norm": 12.447785377502441, "learning_rate": 8.925096713278803e-05, "loss": 7.5448, "step": 1472900 }, { "epoch": 6.00067828440893, "grad_norm": 5.121676445007324, "learning_rate": 8.912697283563867e-05, "loss": 7.5269, "step": 1473000 }, { "epoch": 6.00067828440893, "eval_MaskedAccuracy": 0.513692675246052, "eval_loss": 1.5907717943191528, "eval_runtime": 152.3174, "eval_samples_per_second": 416.735, "eval_steps_per_second": 1.628, "step": 1473000 }, { "epoch": 6.0010856624323115, "grad_norm": 3.62019681930542, "learning_rate": 8.900307405087142e-05, "loss": 7.5368, "step": 1473100 }, { "epoch": 6.001493040455693, "grad_norm": 19.750944137573242, "learning_rate": 8.887927078338407e-05, "loss": 7.5676, "step": 1473200 }, { "epoch": 6.0019004184790745, "grad_norm": 3.51492977142334, "learning_rate": 8.875556303807164e-05, "loss": 7.5278, "step": 1473300 }, { "epoch": 6.002307796502456, "grad_norm": 3.6750075817108154, "learning_rate": 8.863195081982469e-05, "loss": 7.5525, "step": 1473400 }, { "epoch": 6.002715174525838, "grad_norm": 4.643901824951172, "learning_rate": 8.850843413353042e-05, "loss": 7.5, "step": 1473500 }, { "epoch": 6.003122552549219, "grad_norm": 7.770880222320557, "learning_rate": 8.838501298407213e-05, "loss": 7.5233, "step": 1473600 }, { "epoch": 6.003529930572601, "grad_norm": 3.9565768241882324, "learning_rate": 8.826168737632878e-05, "loss": 7.5046, "step": 1473700 }, { "epoch": 6.003937308595982, "grad_norm": 47.52876663208008, "learning_rate": 8.813845731517678e-05, "loss": 7.4973, "step": 1473800 }, { "epoch": 6.004344686619364, "grad_norm": 5.450173854827881, "learning_rate": 8.80153228054878e-05, "loss": 7.521, "step": 1473900 }, { "epoch": 6.004752064642744, "grad_norm": 2.5958971977233887, "learning_rate": 8.789228385213007e-05, "loss": 7.5098, "step": 1474000 }, { "epoch": 6.004752064642744, "eval_MaskedAccuracy": 0.5133457696758731, "eval_loss": 1.5857244729995728, "eval_runtime": 149.2937, "eval_samples_per_second": 425.175, "eval_steps_per_second": 1.661, "step": 1474000 }, { "epoch": 6.005159442666126, "grad_norm": 4.340638160705566, "learning_rate": 8.776934045996828e-05, "loss": 7.5252, "step": 1474100 }, { "epoch": 6.005566820689507, "grad_norm": 6.678283214569092, "learning_rate": 8.764649263386264e-05, "loss": 7.4868, "step": 1474200 }, { "epoch": 6.005974198712889, "grad_norm": 11.900827407836914, "learning_rate": 8.752374037866983e-05, "loss": 7.5235, "step": 1474300 }, { "epoch": 6.0063815767362705, "grad_norm": 3.1901357173919678, "learning_rate": 8.740108369924347e-05, "loss": 7.5153, "step": 1474400 }, { "epoch": 6.006788954759652, "grad_norm": 6.039796352386475, "learning_rate": 8.727852260043249e-05, "loss": 7.5251, "step": 1474500 }, { "epoch": 6.0071963327830336, "grad_norm": 9.42049789428711, "learning_rate": 8.715605708708207e-05, "loss": 7.528, "step": 1474600 }, { "epoch": 6.007603710806415, "grad_norm": 3.2921876907348633, "learning_rate": 8.703368716403451e-05, "loss": 7.5019, "step": 1474700 }, { "epoch": 6.008011088829797, "grad_norm": 4.643259525299072, "learning_rate": 8.691141283612734e-05, "loss": 7.5136, "step": 1474800 }, { "epoch": 6.008418466853178, "grad_norm": 2.806891441345215, "learning_rate": 8.678923410819515e-05, "loss": 7.5136, "step": 1474900 }, { "epoch": 6.00882584487656, "grad_norm": 6.464155197143555, "learning_rate": 8.666715098506755e-05, "loss": 7.5112, "step": 1475000 }, { "epoch": 6.00882584487656, "eval_MaskedAccuracy": 0.5135272810651529, "eval_loss": 1.587581992149353, "eval_runtime": 153.6876, "eval_samples_per_second": 413.02, "eval_steps_per_second": 1.614, "step": 1475000 }, { "epoch": 6.009233222899941, "grad_norm": 3.559119701385498, "learning_rate": 8.654516347157198e-05, "loss": 7.5267, "step": 1475100 }, { "epoch": 6.009640600923322, "grad_norm": 19.796335220336914, "learning_rate": 8.642327157253099e-05, "loss": 7.5369, "step": 1475200 }, { "epoch": 6.010047978946703, "grad_norm": 3.9808058738708496, "learning_rate": 8.630147529276317e-05, "loss": 7.5266, "step": 1475300 }, { "epoch": 6.010455356970085, "grad_norm": 2.821082592010498, "learning_rate": 8.617977463708416e-05, "loss": 7.5325, "step": 1475400 }, { "epoch": 6.0108627349934665, "grad_norm": 6.1518025398254395, "learning_rate": 8.605816961030575e-05, "loss": 7.5449, "step": 1475500 }, { "epoch": 6.011270113016848, "grad_norm": 20.01251220703125, "learning_rate": 8.593666021723478e-05, "loss": 7.5365, "step": 1475600 }, { "epoch": 6.0116774910402295, "grad_norm": 3.697535514831543, "learning_rate": 8.581524646267592e-05, "loss": 7.4953, "step": 1475700 }, { "epoch": 6.012084869063611, "grad_norm": 4.0473785400390625, "learning_rate": 8.569392835142931e-05, "loss": 7.5204, "step": 1475800 }, { "epoch": 6.012492247086993, "grad_norm": 4.885238170623779, "learning_rate": 8.55727058882911e-05, "loss": 7.5096, "step": 1475900 }, { "epoch": 6.012899625110374, "grad_norm": 10.347676277160645, "learning_rate": 8.545157907805378e-05, "loss": 7.5226, "step": 1476000 }, { "epoch": 6.012899625110374, "eval_MaskedAccuracy": 0.5145368133630774, "eval_loss": 1.5742132663726807, "eval_runtime": 166.2148, "eval_samples_per_second": 381.891, "eval_steps_per_second": 1.492, "step": 1476000 }, { "epoch": 6.013307003133756, "grad_norm": 8.436140060424805, "learning_rate": 8.533054792550602e-05, "loss": 7.5097, "step": 1476100 }, { "epoch": 6.013714381157137, "grad_norm": 18.134437561035156, "learning_rate": 8.520961243543283e-05, "loss": 7.5142, "step": 1476200 }, { "epoch": 6.014121759180519, "grad_norm": 8.203875541687012, "learning_rate": 8.508877261261593e-05, "loss": 7.538, "step": 1476300 }, { "epoch": 6.0145291372039, "grad_norm": 4.324888229370117, "learning_rate": 8.496802846183241e-05, "loss": 7.5197, "step": 1476400 }, { "epoch": 6.014936515227281, "grad_norm": 7.974706172943115, "learning_rate": 8.484737998785591e-05, "loss": 7.5017, "step": 1476500 }, { "epoch": 6.015343893250662, "grad_norm": 4.344145774841309, "learning_rate": 8.472682719545635e-05, "loss": 7.4817, "step": 1476600 }, { "epoch": 6.015751271274044, "grad_norm": 43.55150604248047, "learning_rate": 8.46063700894001e-05, "loss": 7.531, "step": 1476700 }, { "epoch": 6.0161586492974255, "grad_norm": 24.808927536010742, "learning_rate": 8.448600867444912e-05, "loss": 7.5286, "step": 1476800 }, { "epoch": 6.016566027320807, "grad_norm": 29.526124954223633, "learning_rate": 8.436574295536182e-05, "loss": 7.5118, "step": 1476900 }, { "epoch": 6.016973405344189, "grad_norm": 12.216218948364258, "learning_rate": 8.424557293689314e-05, "loss": 7.4915, "step": 1477000 }, { "epoch": 6.016973405344189, "eval_MaskedAccuracy": 0.5140047141803903, "eval_loss": 1.5815221071243286, "eval_runtime": 175.0698, "eval_samples_per_second": 362.575, "eval_steps_per_second": 1.417, "step": 1477000 }, { "epoch": 6.01738078336757, "grad_norm": 12.379630088806152, "learning_rate": 8.412549862379428e-05, "loss": 7.5399, "step": 1477100 }, { "epoch": 6.017788161390952, "grad_norm": 7.037223815917969, "learning_rate": 8.400552002081206e-05, "loss": 7.4965, "step": 1477200 }, { "epoch": 6.018195539414333, "grad_norm": 13.78564739227295, "learning_rate": 8.388563713269037e-05, "loss": 7.5165, "step": 1477300 }, { "epoch": 6.018602917437715, "grad_norm": 17.575111389160156, "learning_rate": 8.376584996416819e-05, "loss": 7.5105, "step": 1477400 }, { "epoch": 6.019010295461096, "grad_norm": 42.46109390258789, "learning_rate": 8.364615851998164e-05, "loss": 7.5303, "step": 1477500 }, { "epoch": 6.019417673484478, "grad_norm": 29.17548942565918, "learning_rate": 8.352656280486201e-05, "loss": 7.5214, "step": 1477600 }, { "epoch": 6.019825051507858, "grad_norm": 32.769691467285156, "learning_rate": 8.340706282353902e-05, "loss": 7.5035, "step": 1477700 }, { "epoch": 6.02023242953124, "grad_norm": 10.002124786376953, "learning_rate": 8.328765858073651e-05, "loss": 7.5399, "step": 1477800 }, { "epoch": 6.0206398075546215, "grad_norm": 31.22835922241211, "learning_rate": 8.316835008117497e-05, "loss": 7.5106, "step": 1477900 }, { "epoch": 6.021047185578003, "grad_norm": 8.925127029418945, "learning_rate": 8.30491373295714e-05, "loss": 7.5622, "step": 1478000 }, { "epoch": 6.021047185578003, "eval_MaskedAccuracy": 0.5135413131190498, "eval_loss": 1.5861700773239136, "eval_runtime": 151.9379, "eval_samples_per_second": 417.776, "eval_steps_per_second": 1.632, "step": 1478000 }, { "epoch": 6.0214545636013845, "grad_norm": 19.77899169921875, "learning_rate": 8.293002033063891e-05, "loss": 7.5126, "step": 1478100 }, { "epoch": 6.021861941624766, "grad_norm": 25.79058837890625, "learning_rate": 8.281099908908722e-05, "loss": 7.471, "step": 1478200 }, { "epoch": 6.022269319648148, "grad_norm": 16.387996673583984, "learning_rate": 8.269207360962114e-05, "loss": 7.5214, "step": 1478300 }, { "epoch": 6.022676697671529, "grad_norm": 3.027670383453369, "learning_rate": 8.257324389694275e-05, "loss": 7.5149, "step": 1478400 }, { "epoch": 6.023084075694911, "grad_norm": 23.706575393676758, "learning_rate": 8.245450995575031e-05, "loss": 7.5386, "step": 1478500 }, { "epoch": 6.023491453718292, "grad_norm": 21.659406661987305, "learning_rate": 8.233587179073771e-05, "loss": 7.5462, "step": 1478600 }, { "epoch": 6.023898831741674, "grad_norm": 16.330801010131836, "learning_rate": 8.221732940659524e-05, "loss": 7.5242, "step": 1478700 }, { "epoch": 6.024306209765055, "grad_norm": 19.215543746948242, "learning_rate": 8.209888280801004e-05, "loss": 7.4945, "step": 1478800 }, { "epoch": 6.024713587788437, "grad_norm": 21.308149337768555, "learning_rate": 8.198053199966429e-05, "loss": 7.5352, "step": 1478900 }, { "epoch": 6.025120965811817, "grad_norm": 3.8174691200256348, "learning_rate": 8.186227698623734e-05, "loss": 7.5282, "step": 1479000 }, { "epoch": 6.025120965811817, "eval_MaskedAccuracy": 0.5139456962781497, "eval_loss": 1.5708402395248413, "eval_runtime": 168.6338, "eval_samples_per_second": 376.413, "eval_steps_per_second": 1.471, "step": 1479000 }, { "epoch": 6.025528343835199, "grad_norm": 11.857566833496094, "learning_rate": 8.174411777240445e-05, "loss": 7.5141, "step": 1479100 }, { "epoch": 6.0259357218585805, "grad_norm": 20.858306884765625, "learning_rate": 8.162605436283722e-05, "loss": 7.5072, "step": 1479200 }, { "epoch": 6.026343099881962, "grad_norm": 9.31919002532959, "learning_rate": 8.150808676220307e-05, "loss": 7.532, "step": 1479300 }, { "epoch": 6.026750477905344, "grad_norm": 16.40489959716797, "learning_rate": 8.139021497516625e-05, "loss": 7.4986, "step": 1479400 }, { "epoch": 6.027157855928725, "grad_norm": 20.209657669067383, "learning_rate": 8.127243900638636e-05, "loss": 7.5194, "step": 1479500 }, { "epoch": 6.027565233952107, "grad_norm": 13.192978858947754, "learning_rate": 8.115475886051974e-05, "loss": 7.5556, "step": 1479600 }, { "epoch": 6.027972611975488, "grad_norm": 3.7593281269073486, "learning_rate": 8.103717454221929e-05, "loss": 7.5251, "step": 1479700 }, { "epoch": 6.02837998999887, "grad_norm": 8.86939525604248, "learning_rate": 8.091968605613393e-05, "loss": 7.5129, "step": 1479800 }, { "epoch": 6.028787368022251, "grad_norm": 7.07117223739624, "learning_rate": 8.080229340690793e-05, "loss": 7.5095, "step": 1479900 }, { "epoch": 6.029194746045633, "grad_norm": 33.47684860229492, "learning_rate": 8.068499659918283e-05, "loss": 7.5375, "step": 1480000 }, { "epoch": 6.029194746045633, "eval_MaskedAccuracy": 0.5141075699483529, "eval_loss": 1.578412413597107, "eval_runtime": 188.9794, "eval_samples_per_second": 335.889, "eval_steps_per_second": 1.312, "step": 1480000 }, { "epoch": 6.029602124069014, "grad_norm": 22.33747100830078, "learning_rate": 8.056779563759619e-05, "loss": 7.5462, "step": 1480100 }, { "epoch": 6.030009502092395, "grad_norm": 4.381908416748047, "learning_rate": 8.045069052678106e-05, "loss": 7.544, "step": 1480200 }, { "epoch": 6.0304168801157765, "grad_norm": 25.629236221313477, "learning_rate": 8.033368127136756e-05, "loss": 7.5352, "step": 1480300 }, { "epoch": 6.030824258139158, "grad_norm": 20.314191818237305, "learning_rate": 8.02167678759819e-05, "loss": 7.5364, "step": 1480400 }, { "epoch": 6.0312316361625395, "grad_norm": 15.726969718933105, "learning_rate": 8.00999503452459e-05, "loss": 7.5213, "step": 1480500 }, { "epoch": 6.031639014185921, "grad_norm": 17.58500862121582, "learning_rate": 7.998322868377814e-05, "loss": 7.5023, "step": 1480600 }, { "epoch": 6.032046392209303, "grad_norm": 20.897462844848633, "learning_rate": 7.986660289619344e-05, "loss": 7.502, "step": 1480700 }, { "epoch": 6.032453770232684, "grad_norm": 20.865142822265625, "learning_rate": 7.975007298710239e-05, "loss": 7.5286, "step": 1480800 }, { "epoch": 6.032861148256066, "grad_norm": 25.1275691986084, "learning_rate": 7.963363896111185e-05, "loss": 7.5269, "step": 1480900 }, { "epoch": 6.033268526279447, "grad_norm": 28.700515747070312, "learning_rate": 7.951730082282565e-05, "loss": 7.4916, "step": 1481000 }, { "epoch": 6.033268526279447, "eval_MaskedAccuracy": 0.5133240535375019, "eval_loss": 1.5870534181594849, "eval_runtime": 173.3375, "eval_samples_per_second": 366.199, "eval_steps_per_second": 1.431, "step": 1481000 }, { "epoch": 6.033675904302829, "grad_norm": 30.533897399902344, "learning_rate": 7.940105857684282e-05, "loss": 7.5085, "step": 1481100 }, { "epoch": 6.03408328232621, "grad_norm": 10.506007194519043, "learning_rate": 7.928491222775904e-05, "loss": 7.5175, "step": 1481200 }, { "epoch": 6.034490660349592, "grad_norm": 8.078603744506836, "learning_rate": 7.916886178016647e-05, "loss": 7.5289, "step": 1481300 }, { "epoch": 6.034898038372973, "grad_norm": 9.919642448425293, "learning_rate": 7.905290723865287e-05, "loss": 7.531, "step": 1481400 }, { "epoch": 6.035305416396354, "grad_norm": 6.306997776031494, "learning_rate": 7.89370486078031e-05, "loss": 7.5023, "step": 1481500 }, { "epoch": 6.0357127944197355, "grad_norm": 20.438024520874023, "learning_rate": 7.88212858921965e-05, "loss": 7.5102, "step": 1481600 }, { "epoch": 6.036120172443117, "grad_norm": 14.660018920898438, "learning_rate": 7.870561909641109e-05, "loss": 7.5188, "step": 1481700 }, { "epoch": 6.036527550466499, "grad_norm": 5.2086615562438965, "learning_rate": 7.859004822501947e-05, "loss": 7.5134, "step": 1481800 }, { "epoch": 6.03693492848988, "grad_norm": 6.651661396026611, "learning_rate": 7.847457328259038e-05, "loss": 7.529, "step": 1481900 }, { "epoch": 6.037342306513262, "grad_norm": 21.134342193603516, "learning_rate": 7.835919427368926e-05, "loss": 7.5063, "step": 1482000 }, { "epoch": 6.037342306513262, "eval_MaskedAccuracy": 0.5138702898446108, "eval_loss": 1.595697283744812, "eval_runtime": 157.862, "eval_samples_per_second": 402.098, "eval_steps_per_second": 1.571, "step": 1482000 }, { "epoch": 6.037749684536643, "grad_norm": 4.608585834503174, "learning_rate": 7.82439112028781e-05, "loss": 7.4962, "step": 1482100 }, { "epoch": 6.038157062560025, "grad_norm": 9.62073802947998, "learning_rate": 7.812872407471408e-05, "loss": 7.486, "step": 1482200 }, { "epoch": 6.038564440583406, "grad_norm": 2.1867218017578125, "learning_rate": 7.801363289375163e-05, "loss": 7.5352, "step": 1482300 }, { "epoch": 6.038971818606788, "grad_norm": 21.22916603088379, "learning_rate": 7.789863766454078e-05, "loss": 7.5152, "step": 1482400 }, { "epoch": 6.039379196630169, "grad_norm": 3.346815347671509, "learning_rate": 7.778373839162766e-05, "loss": 7.5238, "step": 1482500 }, { "epoch": 6.039786574653551, "grad_norm": 12.451679229736328, "learning_rate": 7.766893507955478e-05, "loss": 7.5203, "step": 1482600 }, { "epoch": 6.0401939526769315, "grad_norm": 27.90259552001953, "learning_rate": 7.755422773286169e-05, "loss": 7.5063, "step": 1482700 }, { "epoch": 6.040601330700313, "grad_norm": 14.531145095825195, "learning_rate": 7.743961635608251e-05, "loss": 7.5414, "step": 1482800 }, { "epoch": 6.0410087087236946, "grad_norm": 29.668895721435547, "learning_rate": 7.732510095374884e-05, "loss": 7.5228, "step": 1482900 }, { "epoch": 6.041416086747076, "grad_norm": 32.814239501953125, "learning_rate": 7.721068153038813e-05, "loss": 7.4897, "step": 1483000 }, { "epoch": 6.041416086747076, "eval_MaskedAccuracy": 0.513304971145958, "eval_loss": 1.585016131401062, "eval_runtime": 173.8014, "eval_samples_per_second": 365.221, "eval_steps_per_second": 1.427, "step": 1483000 }, { "epoch": 6.041823464770458, "grad_norm": 5.910619258880615, "learning_rate": 7.709635809052385e-05, "loss": 7.5483, "step": 1483100 }, { "epoch": 6.042230842793839, "grad_norm": 18.325368881225586, "learning_rate": 7.698213063867615e-05, "loss": 7.5097, "step": 1483200 }, { "epoch": 6.042638220817221, "grad_norm": 4.285280704498291, "learning_rate": 7.686799917936054e-05, "loss": 7.5598, "step": 1483300 }, { "epoch": 6.043045598840602, "grad_norm": 5.864536285400391, "learning_rate": 7.675396371708956e-05, "loss": 7.5594, "step": 1483400 }, { "epoch": 6.043452976863984, "grad_norm": 3.1510560512542725, "learning_rate": 7.664002425637147e-05, "loss": 7.4977, "step": 1483500 }, { "epoch": 6.043860354887365, "grad_norm": 13.101836204528809, "learning_rate": 7.65261808017107e-05, "loss": 7.5379, "step": 1483600 }, { "epoch": 6.044267732910747, "grad_norm": 17.246509552001953, "learning_rate": 7.641243335760868e-05, "loss": 7.5224, "step": 1483700 }, { "epoch": 6.044675110934128, "grad_norm": 25.920820236206055, "learning_rate": 7.629878192856211e-05, "loss": 7.5453, "step": 1483800 }, { "epoch": 6.04508248895751, "grad_norm": 16.338682174682617, "learning_rate": 7.618522651906443e-05, "loss": 7.5451, "step": 1483900 }, { "epoch": 6.0454898669808905, "grad_norm": 17.48977279663086, "learning_rate": 7.607176713360497e-05, "loss": 7.5639, "step": 1484000 }, { "epoch": 6.0454898669808905, "eval_MaskedAccuracy": 0.5138310179083767, "eval_loss": 1.586430311203003, "eval_runtime": 196.0605, "eval_samples_per_second": 323.757, "eval_steps_per_second": 1.265, "step": 1484000 }, { "epoch": 6.045897245004272, "grad_norm": 10.223199844360352, "learning_rate": 7.595840377666913e-05, "loss": 7.516, "step": 1484100 }, { "epoch": 6.046304623027654, "grad_norm": 4.165820598602295, "learning_rate": 7.584513645273925e-05, "loss": 7.5316, "step": 1484200 }, { "epoch": 6.046712001051035, "grad_norm": 18.81446647644043, "learning_rate": 7.57319651662932e-05, "loss": 7.5156, "step": 1484300 }, { "epoch": 6.047119379074417, "grad_norm": 19.572067260742188, "learning_rate": 7.561888992180515e-05, "loss": 7.5306, "step": 1484400 }, { "epoch": 6.047526757097798, "grad_norm": 21.238670349121094, "learning_rate": 7.550591072374554e-05, "loss": 7.5215, "step": 1484500 }, { "epoch": 6.04793413512118, "grad_norm": 15.118731498718262, "learning_rate": 7.539302757658103e-05, "loss": 7.5368, "step": 1484600 }, { "epoch": 6.048341513144561, "grad_norm": 10.186454772949219, "learning_rate": 7.528024048477497e-05, "loss": 7.5218, "step": 1484700 }, { "epoch": 6.048748891167943, "grad_norm": 12.231056213378906, "learning_rate": 7.516754945278571e-05, "loss": 7.5073, "step": 1484800 }, { "epoch": 6.049156269191324, "grad_norm": 17.024585723876953, "learning_rate": 7.5054954485069e-05, "loss": 7.5014, "step": 1484900 }, { "epoch": 6.049563647214706, "grad_norm": 4.59981107711792, "learning_rate": 7.494245558607634e-05, "loss": 7.5547, "step": 1485000 }, { "epoch": 6.049563647214706, "eval_MaskedAccuracy": 0.5133585626392425, "eval_loss": 1.5897096395492554, "eval_runtime": 160.2187, "eval_samples_per_second": 396.183, "eval_steps_per_second": 1.548, "step": 1485000 }, { "epoch": 6.049971025238087, "grad_norm": 21.608034133911133, "learning_rate": 7.483005276025505e-05, "loss": 7.5247, "step": 1485100 }, { "epoch": 6.050378403261468, "grad_norm": 22.101882934570312, "learning_rate": 7.471774601204954e-05, "loss": 7.4833, "step": 1485200 }, { "epoch": 6.05078578128485, "grad_norm": 14.103845596313477, "learning_rate": 7.460553534589917e-05, "loss": 7.5188, "step": 1485300 }, { "epoch": 6.051193159308231, "grad_norm": 16.816926956176758, "learning_rate": 7.449342076624102e-05, "loss": 7.5451, "step": 1485400 }, { "epoch": 6.051600537331613, "grad_norm": 13.526528358459473, "learning_rate": 7.438140227750652e-05, "loss": 7.5259, "step": 1485500 }, { "epoch": 6.052007915354994, "grad_norm": 18.043670654296875, "learning_rate": 7.426947988412573e-05, "loss": 7.5241, "step": 1485600 }, { "epoch": 6.052415293378376, "grad_norm": 3.703565835952759, "learning_rate": 7.415765359052263e-05, "loss": 7.5698, "step": 1485700 }, { "epoch": 6.052822671401757, "grad_norm": 3.1997010707855225, "learning_rate": 7.404592340111874e-05, "loss": 7.5117, "step": 1485800 }, { "epoch": 6.053230049425139, "grad_norm": 26.271591186523438, "learning_rate": 7.393428932033097e-05, "loss": 7.5212, "step": 1485900 }, { "epoch": 6.05363742744852, "grad_norm": 11.788737297058105, "learning_rate": 7.382275135257303e-05, "loss": 7.5569, "step": 1486000 }, { "epoch": 6.05363742744852, "eval_MaskedAccuracy": 0.5138277712720543, "eval_loss": 1.5856705904006958, "eval_runtime": 175.3261, "eval_samples_per_second": 362.045, "eval_steps_per_second": 1.415, "step": 1486000 }, { "epoch": 6.054044805471902, "grad_norm": 16.922420501708984, "learning_rate": 7.371130950225481e-05, "loss": 7.5364, "step": 1486100 }, { "epoch": 6.054452183495283, "grad_norm": 13.531429290771484, "learning_rate": 7.359996377378155e-05, "loss": 7.5092, "step": 1486200 }, { "epoch": 6.054859561518665, "grad_norm": 14.451566696166992, "learning_rate": 7.34887141715562e-05, "loss": 7.5153, "step": 1486300 }, { "epoch": 6.055266939542046, "grad_norm": 21.359676361083984, "learning_rate": 7.33775606999764e-05, "loss": 7.546, "step": 1486400 }, { "epoch": 6.055674317565427, "grad_norm": 20.718141555786133, "learning_rate": 7.326650336343699e-05, "loss": 7.5072, "step": 1486500 }, { "epoch": 6.056081695588809, "grad_norm": 13.19158935546875, "learning_rate": 7.315554216632842e-05, "loss": 7.5253, "step": 1486600 }, { "epoch": 6.05648907361219, "grad_norm": 13.698712348937988, "learning_rate": 7.304467711303758e-05, "loss": 7.5042, "step": 1486700 }, { "epoch": 6.056896451635572, "grad_norm": 16.147371292114258, "learning_rate": 7.293390820794776e-05, "loss": 7.5272, "step": 1486800 }, { "epoch": 6.057303829658953, "grad_norm": 17.229768753051758, "learning_rate": 7.282323545543808e-05, "loss": 7.5149, "step": 1486900 }, { "epoch": 6.057711207682335, "grad_norm": 12.138152122497559, "learning_rate": 7.271265885988438e-05, "loss": 7.5274, "step": 1487000 }, { "epoch": 6.057711207682335, "eval_MaskedAccuracy": 0.5139290309555314, "eval_loss": 1.5757960081100464, "eval_runtime": 158.3188, "eval_samples_per_second": 400.938, "eval_steps_per_second": 1.566, "step": 1487000 }, { "epoch": 6.058118585705716, "grad_norm": 5.158970832824707, "learning_rate": 7.260217842565783e-05, "loss": 7.5205, "step": 1487100 }, { "epoch": 6.058525963729098, "grad_norm": 20.5587158203125, "learning_rate": 7.249179415712652e-05, "loss": 7.5331, "step": 1487200 }, { "epoch": 6.058933341752479, "grad_norm": 16.001752853393555, "learning_rate": 7.238150605865461e-05, "loss": 7.5402, "step": 1487300 }, { "epoch": 6.059340719775861, "grad_norm": 19.773279190063477, "learning_rate": 7.227131413460207e-05, "loss": 7.5562, "step": 1487400 }, { "epoch": 6.059748097799242, "grad_norm": 20.93306541442871, "learning_rate": 7.216121838932572e-05, "loss": 7.5483, "step": 1487500 }, { "epoch": 6.060155475822624, "grad_norm": 9.476699829101562, "learning_rate": 7.205121882717846e-05, "loss": 7.5344, "step": 1487600 }, { "epoch": 6.060562853846005, "grad_norm": 13.896587371826172, "learning_rate": 7.194131545250865e-05, "loss": 7.5394, "step": 1487700 }, { "epoch": 6.060970231869386, "grad_norm": 21.827335357666016, "learning_rate": 7.183150826966167e-05, "loss": 7.5509, "step": 1487800 }, { "epoch": 6.061377609892768, "grad_norm": 10.596880912780762, "learning_rate": 7.172179728297853e-05, "loss": 7.5221, "step": 1487900 }, { "epoch": 6.061784987916149, "grad_norm": 14.482085227966309, "learning_rate": 7.161218249679691e-05, "loss": 7.508, "step": 1488000 }, { "epoch": 6.061784987916149, "eval_MaskedAccuracy": 0.5136656771419797, "eval_loss": 1.5972084999084473, "eval_runtime": 158.4216, "eval_samples_per_second": 400.678, "eval_steps_per_second": 1.565, "step": 1488000 }, { "epoch": 6.062192365939531, "grad_norm": 19.168067932128906, "learning_rate": 7.150266391545025e-05, "loss": 7.5234, "step": 1488100 }, { "epoch": 6.062599743962912, "grad_norm": 9.889928817749023, "learning_rate": 7.139324154326872e-05, "loss": 7.5054, "step": 1488200 }, { "epoch": 6.063007121986294, "grad_norm": 21.029836654663086, "learning_rate": 7.128391538457812e-05, "loss": 7.5124, "step": 1488300 }, { "epoch": 6.063414500009675, "grad_norm": 13.195849418640137, "learning_rate": 7.117468544370075e-05, "loss": 7.5186, "step": 1488400 }, { "epoch": 6.063821878033057, "grad_norm": 13.294780731201172, "learning_rate": 7.106555172495504e-05, "loss": 7.5509, "step": 1488500 }, { "epoch": 6.064229256056438, "grad_norm": 12.833215713500977, "learning_rate": 7.095651423265603e-05, "loss": 7.561, "step": 1488600 }, { "epoch": 6.06463663407982, "grad_norm": 10.44172191619873, "learning_rate": 7.084757297111431e-05, "loss": 7.5101, "step": 1488700 }, { "epoch": 6.065044012103201, "grad_norm": 2.6616299152374268, "learning_rate": 7.07387279446363e-05, "loss": 7.5579, "step": 1488800 }, { "epoch": 6.065451390126583, "grad_norm": 5.75972318649292, "learning_rate": 7.06299791575263e-05, "loss": 7.508, "step": 1488900 }, { "epoch": 6.065858768149964, "grad_norm": 12.5253267288208, "learning_rate": 7.052132661408304e-05, "loss": 7.5384, "step": 1489000 }, { "epoch": 6.065858768149964, "eval_MaskedAccuracy": 0.5136171818785219, "eval_loss": 1.5813449621200562, "eval_runtime": 181.8537, "eval_samples_per_second": 349.05, "eval_steps_per_second": 1.364, "step": 1489000 }, { "epoch": 6.066266146173345, "grad_norm": 21.288951873779297, "learning_rate": 7.0412770318602e-05, "loss": 7.5523, "step": 1489100 }, { "epoch": 6.066673524196727, "grad_norm": 16.027528762817383, "learning_rate": 7.03043102753754e-05, "loss": 7.5512, "step": 1489200 }, { "epoch": 6.067080902220108, "grad_norm": 16.88258171081543, "learning_rate": 7.019594648869112e-05, "loss": 7.5768, "step": 1489300 }, { "epoch": 6.06748828024349, "grad_norm": 2.8600540161132812, "learning_rate": 7.008767896283354e-05, "loss": 7.5026, "step": 1489400 }, { "epoch": 6.067895658266871, "grad_norm": 18.037826538085938, "learning_rate": 6.997950770208234e-05, "loss": 7.5338, "step": 1489500 }, { "epoch": 6.068303036290253, "grad_norm": 16.59130096435547, "learning_rate": 6.987143271071482e-05, "loss": 7.5303, "step": 1489600 }, { "epoch": 6.068710414313634, "grad_norm": 2.7622907161712646, "learning_rate": 6.97634539930036e-05, "loss": 7.5182, "step": 1489700 }, { "epoch": 6.069117792337016, "grad_norm": 15.692123413085938, "learning_rate": 6.965557155321751e-05, "loss": 7.526, "step": 1489800 }, { "epoch": 6.069525170360397, "grad_norm": 13.806824684143066, "learning_rate": 6.954778539562206e-05, "loss": 7.5205, "step": 1489900 }, { "epoch": 6.069932548383779, "grad_norm": 18.76637840270996, "learning_rate": 6.944009552447811e-05, "loss": 7.5312, "step": 1490000 }, { "epoch": 6.069932548383779, "eval_MaskedAccuracy": 0.5137253195890193, "eval_loss": 1.580369234085083, "eval_runtime": 156.3748, "eval_samples_per_second": 405.922, "eval_steps_per_second": 1.586, "step": 1490000 }, { "epoch": 6.0703399264071605, "grad_norm": 13.451923370361328, "learning_rate": 6.933250194404374e-05, "loss": 7.5104, "step": 1490100 }, { "epoch": 6.070747304430541, "grad_norm": 5.400201797485352, "learning_rate": 6.922500465857215e-05, "loss": 7.5199, "step": 1490200 }, { "epoch": 6.071154682453923, "grad_norm": 6.814541339874268, "learning_rate": 6.911760367231377e-05, "loss": 7.5038, "step": 1490300 }, { "epoch": 6.071562060477304, "grad_norm": 20.72386932373047, "learning_rate": 6.90102989895144e-05, "loss": 7.5477, "step": 1490400 }, { "epoch": 6.071969438500686, "grad_norm": 17.206645965576172, "learning_rate": 6.890309061441635e-05, "loss": 7.5428, "step": 1490500 }, { "epoch": 6.072376816524067, "grad_norm": 16.003599166870117, "learning_rate": 6.879597855125853e-05, "loss": 7.5401, "step": 1490600 }, { "epoch": 6.072784194547449, "grad_norm": 12.124286651611328, "learning_rate": 6.868896280427548e-05, "loss": 7.5281, "step": 1490700 }, { "epoch": 6.07319157257083, "grad_norm": 12.789175033569336, "learning_rate": 6.858204337769781e-05, "loss": 7.535, "step": 1490800 }, { "epoch": 6.073598950594212, "grad_norm": 8.151293754577637, "learning_rate": 6.847522027575316e-05, "loss": 7.523, "step": 1490900 }, { "epoch": 6.074006328617593, "grad_norm": 17.697446823120117, "learning_rate": 6.836849350266432e-05, "loss": 7.5407, "step": 1491000 }, { "epoch": 6.074006328617593, "eval_MaskedAccuracy": 0.5141354898984748, "eval_loss": 1.5804506540298462, "eval_runtime": 153.9659, "eval_samples_per_second": 412.273, "eval_steps_per_second": 1.611, "step": 1491000 }, { "epoch": 6.074413706640975, "grad_norm": 15.482162475585938, "learning_rate": 6.826186306265088e-05, "loss": 7.5333, "step": 1491100 }, { "epoch": 6.074821084664356, "grad_norm": 8.309759140014648, "learning_rate": 6.81553289599287e-05, "loss": 7.539, "step": 1491200 }, { "epoch": 6.075228462687738, "grad_norm": 16.621870040893555, "learning_rate": 6.804889119870934e-05, "loss": 7.542, "step": 1491300 }, { "epoch": 6.0756358407111195, "grad_norm": 30.539064407348633, "learning_rate": 6.79425497832011e-05, "loss": 7.5703, "step": 1491400 }, { "epoch": 6.0760432187345, "grad_norm": 20.90303611755371, "learning_rate": 6.783630471760798e-05, "loss": 7.5265, "step": 1491500 }, { "epoch": 6.076450596757882, "grad_norm": 15.229084014892578, "learning_rate": 6.7730156006131e-05, "loss": 7.5384, "step": 1491600 }, { "epoch": 6.076857974781263, "grad_norm": 3.7409682273864746, "learning_rate": 6.762410365296634e-05, "loss": 7.5134, "step": 1491700 }, { "epoch": 6.077265352804645, "grad_norm": 4.02523946762085, "learning_rate": 6.751814766230639e-05, "loss": 7.5619, "step": 1491800 }, { "epoch": 6.077672730828026, "grad_norm": 6.49707555770874, "learning_rate": 6.741228803834111e-05, "loss": 7.5522, "step": 1491900 }, { "epoch": 6.078080108851408, "grad_norm": 24.099685668945312, "learning_rate": 6.730652478525492e-05, "loss": 7.5671, "step": 1492000 }, { "epoch": 6.078080108851408, "eval_MaskedAccuracy": 0.5134329376238207, "eval_loss": 1.5875269174575806, "eval_runtime": 230.1251, "eval_samples_per_second": 275.833, "eval_steps_per_second": 1.078, "step": 1492000 }, { "epoch": 6.078487486874789, "grad_norm": 11.89902400970459, "learning_rate": 6.720085790722995e-05, "loss": 7.5227, "step": 1492100 }, { "epoch": 6.078894864898171, "grad_norm": 37.37939453125, "learning_rate": 6.709528740844305e-05, "loss": 7.5143, "step": 1492200 }, { "epoch": 6.079302242921552, "grad_norm": 14.41089916229248, "learning_rate": 6.698981329306816e-05, "loss": 7.5072, "step": 1492300 }, { "epoch": 6.079709620944934, "grad_norm": 22.131799697875977, "learning_rate": 6.688443556527573e-05, "loss": 7.5069, "step": 1492400 }, { "epoch": 6.0801169989683155, "grad_norm": 4.21099853515625, "learning_rate": 6.677915422923131e-05, "loss": 7.5656, "step": 1492500 }, { "epoch": 6.080524376991697, "grad_norm": 30.27763557434082, "learning_rate": 6.667396928909737e-05, "loss": 7.5174, "step": 1492600 }, { "epoch": 6.080931755015078, "grad_norm": 8.610176086425781, "learning_rate": 6.65688807490325e-05, "loss": 7.5211, "step": 1492700 }, { "epoch": 6.081339133038459, "grad_norm": 20.53519630432129, "learning_rate": 6.646388861319148e-05, "loss": 7.5401, "step": 1492800 }, { "epoch": 6.081746511061841, "grad_norm": 8.410408020019531, "learning_rate": 6.635899288572525e-05, "loss": 7.5366, "step": 1492900 }, { "epoch": 6.082153889085222, "grad_norm": 9.369873046875, "learning_rate": 6.625419357078061e-05, "loss": 7.5128, "step": 1493000 }, { "epoch": 6.082153889085222, "eval_MaskedAccuracy": 0.5137086489015725, "eval_loss": 1.576980471611023, "eval_runtime": 165.4792, "eval_samples_per_second": 383.589, "eval_steps_per_second": 1.499, "step": 1493000 }, { "epoch": 6.082561267108604, "grad_norm": 12.149567604064941, "learning_rate": 6.614949067250127e-05, "loss": 7.5179, "step": 1493100 }, { "epoch": 6.082968645131985, "grad_norm": 2.9722039699554443, "learning_rate": 6.604488419502623e-05, "loss": 7.5354, "step": 1493200 }, { "epoch": 6.083376023155367, "grad_norm": 13.29632568359375, "learning_rate": 6.594037414249129e-05, "loss": 7.5287, "step": 1493300 }, { "epoch": 6.083783401178748, "grad_norm": 24.41215705871582, "learning_rate": 6.58359605190286e-05, "loss": 7.5353, "step": 1493400 }, { "epoch": 6.08419077920213, "grad_norm": 5.133878707885742, "learning_rate": 6.57316433287656e-05, "loss": 7.5244, "step": 1493500 }, { "epoch": 6.0845981572255115, "grad_norm": 14.606828689575195, "learning_rate": 6.562742257582729e-05, "loss": 7.5008, "step": 1493600 }, { "epoch": 6.085005535248893, "grad_norm": 7.373438358306885, "learning_rate": 6.552329826433337e-05, "loss": 7.5131, "step": 1493700 }, { "epoch": 6.0854129132722745, "grad_norm": 3.4081225395202637, "learning_rate": 6.541927039840082e-05, "loss": 7.4887, "step": 1493800 }, { "epoch": 6.085820291295656, "grad_norm": 15.155529022216797, "learning_rate": 6.531533898214251e-05, "loss": 7.5363, "step": 1493900 }, { "epoch": 6.086227669319037, "grad_norm": 11.844385147094727, "learning_rate": 6.521150401966698e-05, "loss": 7.5261, "step": 1494000 }, { "epoch": 6.086227669319037, "eval_MaskedAccuracy": 0.513277743180825, "eval_loss": 1.5814414024353027, "eval_runtime": 158.5885, "eval_samples_per_second": 400.256, "eval_steps_per_second": 1.564, "step": 1494000 }, { "epoch": 6.086635047342418, "grad_norm": 22.268585205078125, "learning_rate": 6.510776551507941e-05, "loss": 7.5169, "step": 1494100 }, { "epoch": 6.0870424253658, "grad_norm": 35.34524154663086, "learning_rate": 6.50041234724817e-05, "loss": 7.5192, "step": 1494200 }, { "epoch": 6.087449803389181, "grad_norm": 6.962163925170898, "learning_rate": 6.49005778959707e-05, "loss": 7.5179, "step": 1494300 }, { "epoch": 6.087857181412563, "grad_norm": 3.497898817062378, "learning_rate": 6.479712878964077e-05, "loss": 7.5114, "step": 1494400 }, { "epoch": 6.088264559435944, "grad_norm": 8.635665893554688, "learning_rate": 6.469377615758141e-05, "loss": 7.5209, "step": 1494500 }, { "epoch": 6.088671937459326, "grad_norm": 20.112213134765625, "learning_rate": 6.459052000387842e-05, "loss": 7.5197, "step": 1494600 }, { "epoch": 6.089079315482707, "grad_norm": 4.1904473304748535, "learning_rate": 6.44873603326146e-05, "loss": 7.51, "step": 1494700 }, { "epoch": 6.089486693506089, "grad_norm": 8.575403213500977, "learning_rate": 6.438429714786817e-05, "loss": 7.5089, "step": 1494800 }, { "epoch": 6.0898940715294705, "grad_norm": 28.999231338500977, "learning_rate": 6.42813304537138e-05, "loss": 7.5137, "step": 1494900 }, { "epoch": 6.090301449552852, "grad_norm": 13.976512908935547, "learning_rate": 6.41784602542223e-05, "loss": 7.5096, "step": 1495000 }, { "epoch": 6.090301449552852, "eval_MaskedAccuracy": 0.5134130855731317, "eval_loss": 1.5810376405715942, "eval_runtime": 157.1439, "eval_samples_per_second": 403.935, "eval_steps_per_second": 1.578, "step": 1495000 }, { "epoch": 6.0907088275762336, "grad_norm": 9.604857444763184, "learning_rate": 6.40756865534607e-05, "loss": 7.5231, "step": 1495100 }, { "epoch": 6.091116205599614, "grad_norm": 12.27506160736084, "learning_rate": 6.39730093554921e-05, "loss": 7.5165, "step": 1495200 }, { "epoch": 6.091523583622996, "grad_norm": 4.105710983276367, "learning_rate": 6.387042866437618e-05, "loss": 7.5047, "step": 1495300 }, { "epoch": 6.091930961646377, "grad_norm": 3.478956699371338, "learning_rate": 6.37679444841684e-05, "loss": 7.4837, "step": 1495400 }, { "epoch": 6.092338339669759, "grad_norm": 12.801264762878418, "learning_rate": 6.366555681891957e-05, "loss": 7.5093, "step": 1495500 }, { "epoch": 6.09274571769314, "grad_norm": 3.358525037765503, "learning_rate": 6.356326567267916e-05, "loss": 7.5539, "step": 1495600 }, { "epoch": 6.093153095716522, "grad_norm": 22.82572364807129, "learning_rate": 6.346107104949042e-05, "loss": 7.5156, "step": 1495700 }, { "epoch": 6.093560473739903, "grad_norm": 2.934875011444092, "learning_rate": 6.335897295339384e-05, "loss": 7.5188, "step": 1495800 }, { "epoch": 6.093967851763285, "grad_norm": 3.2453079223632812, "learning_rate": 6.325697138842585e-05, "loss": 7.4781, "step": 1495900 }, { "epoch": 6.0943752297866665, "grad_norm": 4.031073093414307, "learning_rate": 6.315506635861938e-05, "loss": 7.535, "step": 1496000 }, { "epoch": 6.0943752297866665, "eval_MaskedAccuracy": 0.5137616484666648, "eval_loss": 1.5853080749511719, "eval_runtime": 164.0886, "eval_samples_per_second": 386.84, "eval_steps_per_second": 1.511, "step": 1496000 }, { "epoch": 6.094782607810048, "grad_norm": 3.092862606048584, "learning_rate": 6.305325786800281e-05, "loss": 7.5226, "step": 1496100 }, { "epoch": 6.0951899858334295, "grad_norm": 12.842656135559082, "learning_rate": 6.295154592060126e-05, "loss": 7.5319, "step": 1496200 }, { "epoch": 6.095597363856811, "grad_norm": 6.992317199707031, "learning_rate": 6.284993052043654e-05, "loss": 7.5205, "step": 1496300 }, { "epoch": 6.096004741880193, "grad_norm": 4.310663223266602, "learning_rate": 6.274841167152531e-05, "loss": 7.4866, "step": 1496400 }, { "epoch": 6.096412119903573, "grad_norm": 9.675249099731445, "learning_rate": 6.264698937788139e-05, "loss": 7.5292, "step": 1496500 }, { "epoch": 6.096819497926955, "grad_norm": 4.376608848571777, "learning_rate": 6.25456636435147e-05, "loss": 7.505, "step": 1496600 }, { "epoch": 6.097226875950336, "grad_norm": 9.331419944763184, "learning_rate": 6.244443447243108e-05, "loss": 7.4936, "step": 1496700 }, { "epoch": 6.097634253973718, "grad_norm": 4.6320977210998535, "learning_rate": 6.234330186863277e-05, "loss": 7.537, "step": 1496800 }, { "epoch": 6.098041631997099, "grad_norm": 4.060110092163086, "learning_rate": 6.224226583611792e-05, "loss": 7.5192, "step": 1496900 }, { "epoch": 6.098449010020481, "grad_norm": 4.308634281158447, "learning_rate": 6.214132637888117e-05, "loss": 7.523, "step": 1497000 }, { "epoch": 6.098449010020481, "eval_MaskedAccuracy": 0.5131019622926449, "eval_loss": 1.59047532081604, "eval_runtime": 164.3752, "eval_samples_per_second": 386.165, "eval_steps_per_second": 1.509, "step": 1497000 }, { "epoch": 6.098856388043862, "grad_norm": 3.2699756622314453, "learning_rate": 6.204048350091304e-05, "loss": 7.4865, "step": 1497100 }, { "epoch": 6.099263766067244, "grad_norm": 5.7099833488464355, "learning_rate": 6.193973720620037e-05, "loss": 7.4747, "step": 1497200 }, { "epoch": 6.0996711440906255, "grad_norm": 3.1226887702941895, "learning_rate": 6.183908749872627e-05, "loss": 7.5386, "step": 1497300 }, { "epoch": 6.100078522114007, "grad_norm": 9.971474647521973, "learning_rate": 6.173853438246976e-05, "loss": 7.5233, "step": 1497400 }, { "epoch": 6.100485900137389, "grad_norm": 41.65095520019531, "learning_rate": 6.163807786140696e-05, "loss": 7.5278, "step": 1497500 }, { "epoch": 6.10089327816077, "grad_norm": 15.143765449523926, "learning_rate": 6.153771793950887e-05, "loss": 7.5195, "step": 1497600 }, { "epoch": 6.101300656184151, "grad_norm": 44.65937042236328, "learning_rate": 6.143745462074318e-05, "loss": 7.5296, "step": 1497700 }, { "epoch": 6.101708034207532, "grad_norm": 8.260921478271484, "learning_rate": 6.133728790907413e-05, "loss": 7.5214, "step": 1497800 }, { "epoch": 6.102115412230914, "grad_norm": 3.633737564086914, "learning_rate": 6.123721780846173e-05, "loss": 7.5076, "step": 1497900 }, { "epoch": 6.102522790254295, "grad_norm": 6.263742923736572, "learning_rate": 6.113724432286207e-05, "loss": 7.4943, "step": 1498000 }, { "epoch": 6.102522790254295, "eval_MaskedAccuracy": 0.5135091319582853, "eval_loss": 1.5897711515426636, "eval_runtime": 165.0235, "eval_samples_per_second": 384.648, "eval_steps_per_second": 1.503, "step": 1498000 }, { "epoch": 6.102930168277677, "grad_norm": 38.498966217041016, "learning_rate": 6.103736745622785e-05, "loss": 7.5505, "step": 1498100 }, { "epoch": 6.103337546301058, "grad_norm": 35.71931076049805, "learning_rate": 6.093758721250758e-05, "loss": 7.5099, "step": 1498200 }, { "epoch": 6.10374492432444, "grad_norm": 9.26229190826416, "learning_rate": 6.0837903595646224e-05, "loss": 7.5418, "step": 1498300 }, { "epoch": 6.1041523023478215, "grad_norm": 26.859647750854492, "learning_rate": 6.07383166095847e-05, "loss": 7.5456, "step": 1498400 }, { "epoch": 6.104559680371203, "grad_norm": 2.4605488777160645, "learning_rate": 6.063882625826058e-05, "loss": 7.491, "step": 1498500 }, { "epoch": 6.1049670583945845, "grad_norm": 44.589752197265625, "learning_rate": 6.053943254560676e-05, "loss": 7.4924, "step": 1498600 }, { "epoch": 6.105374436417966, "grad_norm": 3.9444704055786133, "learning_rate": 6.0440135475553265e-05, "loss": 7.5357, "step": 1498700 }, { "epoch": 6.105781814441348, "grad_norm": 22.431640625, "learning_rate": 6.034093505202546e-05, "loss": 7.5317, "step": 1498800 }, { "epoch": 6.106189192464729, "grad_norm": 12.822697639465332, "learning_rate": 6.024183127894574e-05, "loss": 7.5083, "step": 1498900 }, { "epoch": 6.10659657048811, "grad_norm": 19.53934669494629, "learning_rate": 6.014282416023171e-05, "loss": 7.5185, "step": 1499000 }, { "epoch": 6.10659657048811, "eval_MaskedAccuracy": 0.5143693548931055, "eval_loss": 1.572809100151062, "eval_runtime": 162.4281, "eval_samples_per_second": 390.794, "eval_steps_per_second": 1.527, "step": 1499000 }, { "epoch": 6.107003948511491, "grad_norm": 22.403322219848633, "learning_rate": 6.0043913699797884e-05, "loss": 7.5242, "step": 1499100 }, { "epoch": 6.107411326534873, "grad_norm": 10.545846939086914, "learning_rate": 5.994509990155476e-05, "loss": 7.5222, "step": 1499200 }, { "epoch": 6.107818704558254, "grad_norm": 39.69652557373047, "learning_rate": 5.9846382769408816e-05, "loss": 7.5152, "step": 1499300 }, { "epoch": 6.108226082581636, "grad_norm": 26.553680419921875, "learning_rate": 5.97477623072627e-05, "loss": 7.5034, "step": 1499400 }, { "epoch": 6.108633460605017, "grad_norm": 15.599024772644043, "learning_rate": 5.964923851901581e-05, "loss": 7.5221, "step": 1499500 }, { "epoch": 6.109040838628399, "grad_norm": 20.136043548583984, "learning_rate": 5.955081140856323e-05, "loss": 7.5389, "step": 1499600 }, { "epoch": 6.1094482166517805, "grad_norm": 16.722702026367188, "learning_rate": 5.9452480979796396e-05, "loss": 7.523, "step": 1499700 }, { "epoch": 6.109855594675162, "grad_norm": 3.7607839107513428, "learning_rate": 5.9354247236602686e-05, "loss": 7.51, "step": 1499800 }, { "epoch": 6.110262972698544, "grad_norm": 15.021867752075195, "learning_rate": 5.925611018286561e-05, "loss": 7.5226, "step": 1499900 }, { "epoch": 6.110670350721925, "grad_norm": 3.6220667362213135, "learning_rate": 5.915806982246545e-05, "loss": 7.5128, "step": 1500000 }, { "epoch": 6.110670350721925, "eval_MaskedAccuracy": 0.5136200421896125, "eval_loss": 1.5887588262557983, "eval_runtime": 171.5534, "eval_samples_per_second": 370.007, "eval_steps_per_second": 1.446, "step": 1500000 }, { "epoch": 6.111077728745307, "grad_norm": 16.785629272460938, "learning_rate": 5.906012615927822e-05, "loss": 7.5288, "step": 1500100 }, { "epoch": 6.111485106768687, "grad_norm": 29.793237686157227, "learning_rate": 5.8962279197175594e-05, "loss": 7.5424, "step": 1500200 }, { "epoch": 6.111892484792069, "grad_norm": 12.748746871948242, "learning_rate": 5.886452894002673e-05, "loss": 7.533, "step": 1500300 }, { "epoch": 6.11229986281545, "grad_norm": 18.582733154296875, "learning_rate": 5.876687539169579e-05, "loss": 7.5181, "step": 1500400 }, { "epoch": 6.112707240838832, "grad_norm": 4.814263343811035, "learning_rate": 5.866931855604359e-05, "loss": 7.5212, "step": 1500500 }, { "epoch": 6.113114618862213, "grad_norm": 10.030101776123047, "learning_rate": 5.8571858436927295e-05, "loss": 7.5309, "step": 1500600 }, { "epoch": 6.113521996885595, "grad_norm": 32.51324462890625, "learning_rate": 5.847449503819984e-05, "loss": 7.5445, "step": 1500700 }, { "epoch": 6.1139293749089765, "grad_norm": 6.151972770690918, "learning_rate": 5.83772283637103e-05, "loss": 7.505, "step": 1500800 }, { "epoch": 6.114336752932358, "grad_norm": 9.38499927520752, "learning_rate": 5.8280058417304595e-05, "loss": 7.5265, "step": 1500900 }, { "epoch": 6.1147441309557395, "grad_norm": 8.609825134277344, "learning_rate": 5.818298520282393e-05, "loss": 7.4827, "step": 1501000 }, { "epoch": 6.1147441309557395, "eval_MaskedAccuracy": 0.5139675753318365, "eval_loss": 1.581284523010254, "eval_runtime": 154.483, "eval_samples_per_second": 410.893, "eval_steps_per_second": 1.605, "step": 1501000 }, { "epoch": 6.115151508979121, "grad_norm": 15.545623779296875, "learning_rate": 5.808600872410646e-05, "loss": 7.5159, "step": 1501100 }, { "epoch": 6.115558887002503, "grad_norm": 5.443942546844482, "learning_rate": 5.798912898498633e-05, "loss": 7.5376, "step": 1501200 }, { "epoch": 6.115966265025884, "grad_norm": 21.568159103393555, "learning_rate": 5.789234598929311e-05, "loss": 7.5197, "step": 1501300 }, { "epoch": 6.116373643049266, "grad_norm": 16.5999813079834, "learning_rate": 5.779565974085333e-05, "loss": 7.5445, "step": 1501400 }, { "epoch": 6.116781021072646, "grad_norm": 24.65265464782715, "learning_rate": 5.76990702434901e-05, "loss": 7.52, "step": 1501500 }, { "epoch": 6.117188399096028, "grad_norm": 10.785026550292969, "learning_rate": 5.7602577501021594e-05, "loss": 7.5149, "step": 1501600 }, { "epoch": 6.117595777119409, "grad_norm": 24.120969772338867, "learning_rate": 5.750618151726301e-05, "loss": 7.5243, "step": 1501700 }, { "epoch": 6.118003155142791, "grad_norm": 5.7547993659973145, "learning_rate": 5.7409882296024915e-05, "loss": 7.4853, "step": 1501800 }, { "epoch": 6.1184105331661724, "grad_norm": 11.60728931427002, "learning_rate": 5.731367984111482e-05, "loss": 7.5163, "step": 1501900 }, { "epoch": 6.118817911189554, "grad_norm": 14.78231143951416, "learning_rate": 5.7217574156336114e-05, "loss": 7.5376, "step": 1502000 }, { "epoch": 6.118817911189554, "eval_MaskedAccuracy": 0.5125785951915778, "eval_loss": 1.5911993980407715, "eval_runtime": 161.9036, "eval_samples_per_second": 392.06, "eval_steps_per_second": 1.532, "step": 1502000 }, { "epoch": 6.1192252892129355, "grad_norm": 29.454513549804688, "learning_rate": 5.712156524548849e-05, "loss": 7.5158, "step": 1502100 }, { "epoch": 6.119632667236317, "grad_norm": 22.591447830200195, "learning_rate": 5.7025653112367435e-05, "loss": 7.5205, "step": 1502200 }, { "epoch": 6.120040045259699, "grad_norm": 17.294477462768555, "learning_rate": 5.692983776076528e-05, "loss": 7.5469, "step": 1502300 }, { "epoch": 6.12044742328308, "grad_norm": 4.483959197998047, "learning_rate": 5.683411919446954e-05, "loss": 7.5126, "step": 1502400 }, { "epoch": 6.120854801306462, "grad_norm": 8.0223388671875, "learning_rate": 5.673849741726482e-05, "loss": 7.5208, "step": 1502500 }, { "epoch": 6.121262179329843, "grad_norm": 8.338587760925293, "learning_rate": 5.6642972432931614e-05, "loss": 7.5289, "step": 1502600 }, { "epoch": 6.121669557353224, "grad_norm": 9.082412719726562, "learning_rate": 5.654754424524656e-05, "loss": 7.5317, "step": 1502700 }, { "epoch": 6.122076935376605, "grad_norm": 5.886031627655029, "learning_rate": 5.6452212857982136e-05, "loss": 7.5404, "step": 1502800 }, { "epoch": 6.122484313399987, "grad_norm": 4.301381587982178, "learning_rate": 5.635697827490751e-05, "loss": 7.5355, "step": 1502900 }, { "epoch": 6.122891691423368, "grad_norm": 21.232097625732422, "learning_rate": 5.626184049978793e-05, "loss": 7.5442, "step": 1503000 }, { "epoch": 6.122891691423368, "eval_MaskedAccuracy": 0.5132958692081369, "eval_loss": 1.5840513706207275, "eval_runtime": 167.5835, "eval_samples_per_second": 378.772, "eval_steps_per_second": 1.48, "step": 1503000 }, { "epoch": 6.12329906944675, "grad_norm": 7.63557767868042, "learning_rate": 5.616679953638451e-05, "loss": 7.5196, "step": 1503100 }, { "epoch": 6.1237064474701315, "grad_norm": 20.64089012145996, "learning_rate": 5.6071855388454756e-05, "loss": 7.5087, "step": 1503200 }, { "epoch": 6.124113825493513, "grad_norm": 3.5224153995513916, "learning_rate": 5.5977008059752304e-05, "loss": 7.5247, "step": 1503300 }, { "epoch": 6.1245212035168946, "grad_norm": 10.130736351013184, "learning_rate": 5.5882257554026586e-05, "loss": 7.5134, "step": 1503400 }, { "epoch": 6.124928581540276, "grad_norm": 9.070799827575684, "learning_rate": 5.578760387502442e-05, "loss": 7.5448, "step": 1503500 }, { "epoch": 6.125335959563658, "grad_norm": 15.010527610778809, "learning_rate": 5.569304702648751e-05, "loss": 7.5372, "step": 1503600 }, { "epoch": 6.125743337587039, "grad_norm": 30.21007537841797, "learning_rate": 5.5598587012154466e-05, "loss": 7.5026, "step": 1503700 }, { "epoch": 6.126150715610421, "grad_norm": 34.350929260253906, "learning_rate": 5.5504223835759575e-05, "loss": 7.5271, "step": 1503800 }, { "epoch": 6.126558093633802, "grad_norm": 5.31763219833374, "learning_rate": 5.540995750103348e-05, "loss": 7.5206, "step": 1503900 }, { "epoch": 6.126965471657183, "grad_norm": 14.548882484436035, "learning_rate": 5.531578801170296e-05, "loss": 7.5329, "step": 1504000 }, { "epoch": 6.126965471657183, "eval_MaskedAccuracy": 0.5132471680824442, "eval_loss": 1.5844782590866089, "eval_runtime": 164.7584, "eval_samples_per_second": 385.267, "eval_steps_per_second": 1.505, "step": 1504000 }, { "epoch": 6.127372849680564, "grad_norm": 4.667428493499756, "learning_rate": 5.522171537149146e-05, "loss": 7.525, "step": 1504100 }, { "epoch": 6.127780227703946, "grad_norm": 21.363624572753906, "learning_rate": 5.512773958411767e-05, "loss": 7.5196, "step": 1504200 }, { "epoch": 6.1281876057273275, "grad_norm": 17.82527732849121, "learning_rate": 5.5033860653297334e-05, "loss": 7.5239, "step": 1504300 }, { "epoch": 6.128594983750709, "grad_norm": 28.17323112487793, "learning_rate": 5.4940078582741645e-05, "loss": 7.5321, "step": 1504400 }, { "epoch": 6.1290023617740905, "grad_norm": 23.739816665649414, "learning_rate": 5.4846393376158575e-05, "loss": 7.563, "step": 1504500 }, { "epoch": 6.129409739797472, "grad_norm": 10.429069519042969, "learning_rate": 5.475280503725186e-05, "loss": 7.5084, "step": 1504600 }, { "epoch": 6.129817117820854, "grad_norm": 10.665395736694336, "learning_rate": 5.465931356972176e-05, "loss": 7.5192, "step": 1504700 }, { "epoch": 6.130224495844235, "grad_norm": 18.882339477539062, "learning_rate": 5.456591897726429e-05, "loss": 7.5131, "step": 1504800 }, { "epoch": 6.130631873867617, "grad_norm": 15.698235511779785, "learning_rate": 5.447262126357175e-05, "loss": 7.519, "step": 1504900 }, { "epoch": 6.131039251890998, "grad_norm": 9.932863235473633, "learning_rate": 5.437942043233305e-05, "loss": 7.5235, "step": 1505000 }, { "epoch": 6.131039251890998, "eval_MaskedAccuracy": 0.5135489777019518, "eval_loss": 1.5909253358840942, "eval_runtime": 205.4964, "eval_samples_per_second": 308.891, "eval_steps_per_second": 1.207, "step": 1505000 }, { "epoch": 6.13144662991438, "grad_norm": 11.029427528381348, "learning_rate": 5.4286316487232534e-05, "loss": 7.5275, "step": 1505100 }, { "epoch": 6.13185400793776, "grad_norm": 15.226016998291016, "learning_rate": 5.419330943195129e-05, "loss": 7.5026, "step": 1505200 }, { "epoch": 6.132261385961142, "grad_norm": 8.539198875427246, "learning_rate": 5.410039927016648e-05, "loss": 7.5387, "step": 1505300 }, { "epoch": 6.132668763984523, "grad_norm": 3.748371124267578, "learning_rate": 5.400758600555089e-05, "loss": 7.5239, "step": 1505400 }, { "epoch": 6.133076142007905, "grad_norm": 19.625595092773438, "learning_rate": 5.391486964177449e-05, "loss": 7.5148, "step": 1505500 }, { "epoch": 6.1334835200312865, "grad_norm": 18.956878662109375, "learning_rate": 5.3822250182502516e-05, "loss": 7.5166, "step": 1505600 }, { "epoch": 6.133890898054668, "grad_norm": 16.757740020751953, "learning_rate": 5.372972763139697e-05, "loss": 7.5499, "step": 1505700 }, { "epoch": 6.13429827607805, "grad_norm": 27.8960018157959, "learning_rate": 5.3637301992115615e-05, "loss": 7.484, "step": 1505800 }, { "epoch": 6.134705654101431, "grad_norm": 16.285743713378906, "learning_rate": 5.354497326831234e-05, "loss": 7.5491, "step": 1505900 }, { "epoch": 6.135113032124813, "grad_norm": 3.0651907920837402, "learning_rate": 5.345274146363771e-05, "loss": 7.5537, "step": 1506000 }, { "epoch": 6.135113032124813, "eval_MaskedAccuracy": 0.5133615278180056, "eval_loss": 1.5865812301635742, "eval_runtime": 173.4388, "eval_samples_per_second": 365.985, "eval_steps_per_second": 1.43, "step": 1506000 }, { "epoch": 6.135520410148194, "grad_norm": 8.346098899841309, "learning_rate": 5.336060658173798e-05, "loss": 7.556, "step": 1506100 }, { "epoch": 6.135927788171576, "grad_norm": 7.614825248718262, "learning_rate": 5.3268568626255686e-05, "loss": 7.5436, "step": 1506200 }, { "epoch": 6.136335166194957, "grad_norm": 24.567646026611328, "learning_rate": 5.317662760082977e-05, "loss": 7.5348, "step": 1506300 }, { "epoch": 6.136742544218338, "grad_norm": 12.6890287399292, "learning_rate": 5.3084783509094744e-05, "loss": 7.5319, "step": 1506400 }, { "epoch": 6.137149922241719, "grad_norm": 17.08099365234375, "learning_rate": 5.299303635468242e-05, "loss": 7.533, "step": 1506500 }, { "epoch": 6.137557300265101, "grad_norm": 13.692460060119629, "learning_rate": 5.290138614121951e-05, "loss": 7.4986, "step": 1506600 }, { "epoch": 6.1379646782884825, "grad_norm": 9.78744125366211, "learning_rate": 5.280983287232919e-05, "loss": 7.5201, "step": 1506700 }, { "epoch": 6.138372056311864, "grad_norm": 10.857870101928711, "learning_rate": 5.271837655163163e-05, "loss": 7.5503, "step": 1506800 }, { "epoch": 6.1387794343352455, "grad_norm": 18.279020309448242, "learning_rate": 5.2627017182742455e-05, "loss": 7.5017, "step": 1506900 }, { "epoch": 6.139186812358627, "grad_norm": 10.454538345336914, "learning_rate": 5.2535754769273487e-05, "loss": 7.5279, "step": 1507000 }, { "epoch": 6.139186812358627, "eval_MaskedAccuracy": 0.5133254952590061, "eval_loss": 1.5855728387832642, "eval_runtime": 174.1098, "eval_samples_per_second": 364.574, "eval_steps_per_second": 1.424, "step": 1507000 }, { "epoch": 6.139594190382009, "grad_norm": 9.251683235168457, "learning_rate": 5.244458931483292e-05, "loss": 7.5185, "step": 1507100 }, { "epoch": 6.14000156840539, "grad_norm": 22.32879066467285, "learning_rate": 5.2353520823024836e-05, "loss": 7.5305, "step": 1507200 }, { "epoch": 6.140408946428772, "grad_norm": 22.638103485107422, "learning_rate": 5.2262549297449725e-05, "loss": 7.5294, "step": 1507300 }, { "epoch": 6.140816324452153, "grad_norm": 22.381141662597656, "learning_rate": 5.2171674741704125e-05, "loss": 7.5399, "step": 1507400 }, { "epoch": 6.141223702475535, "grad_norm": 5.647333145141602, "learning_rate": 5.2080897159381016e-05, "loss": 7.4983, "step": 1507500 }, { "epoch": 6.141631080498916, "grad_norm": 24.05869483947754, "learning_rate": 5.199021655406925e-05, "loss": 7.5205, "step": 1507600 }, { "epoch": 6.142038458522297, "grad_norm": 22.714885711669922, "learning_rate": 5.1899632929353695e-05, "loss": 7.5062, "step": 1507700 }, { "epoch": 6.142445836545678, "grad_norm": 15.513699531555176, "learning_rate": 5.180914628881572e-05, "loss": 7.5508, "step": 1507800 }, { "epoch": 6.14285321456906, "grad_norm": 5.411686897277832, "learning_rate": 5.17187566360328e-05, "loss": 7.5067, "step": 1507900 }, { "epoch": 6.1432605925924415, "grad_norm": 15.956671714782715, "learning_rate": 5.162846397457855e-05, "loss": 7.5378, "step": 1508000 }, { "epoch": 6.1432605925924415, "eval_MaskedAccuracy": 0.5133262331475851, "eval_loss": 1.5871728658676147, "eval_runtime": 176.6592, "eval_samples_per_second": 359.313, "eval_steps_per_second": 1.404, "step": 1508000 }, { "epoch": 6.143667970615823, "grad_norm": 10.043844223022461, "learning_rate": 5.153826830802267e-05, "loss": 7.5047, "step": 1508100 }, { "epoch": 6.144075348639205, "grad_norm": 8.08338737487793, "learning_rate": 5.1448169639931225e-05, "loss": 7.5137, "step": 1508200 }, { "epoch": 6.144482726662586, "grad_norm": 14.614755630493164, "learning_rate": 5.1358167973865915e-05, "loss": 7.5398, "step": 1508300 }, { "epoch": 6.144890104685968, "grad_norm": 13.929628372192383, "learning_rate": 5.1268263313385335e-05, "loss": 7.5202, "step": 1508400 }, { "epoch": 6.145297482709349, "grad_norm": 11.499002456665039, "learning_rate": 5.117845566204373e-05, "loss": 7.5273, "step": 1508500 }, { "epoch": 6.145704860732731, "grad_norm": 6.545090675354004, "learning_rate": 5.108874502339197e-05, "loss": 7.5356, "step": 1508600 }, { "epoch": 6.146112238756112, "grad_norm": 19.642810821533203, "learning_rate": 5.099913140097616e-05, "loss": 7.5384, "step": 1508700 }, { "epoch": 6.146519616779494, "grad_norm": 12.959432601928711, "learning_rate": 5.0909614798340014e-05, "loss": 7.5248, "step": 1508800 }, { "epoch": 6.146926994802875, "grad_norm": 15.121339797973633, "learning_rate": 5.082019521902198e-05, "loss": 7.5185, "step": 1508900 }, { "epoch": 6.147334372826256, "grad_norm": 6.627651214599609, "learning_rate": 5.073087266655732e-05, "loss": 7.5288, "step": 1509000 }, { "epoch": 6.147334372826256, "eval_MaskedAccuracy": 0.5135563572028291, "eval_loss": 1.5837535858154297, "eval_runtime": 168.5913, "eval_samples_per_second": 376.508, "eval_steps_per_second": 1.471, "step": 1509000 }, { "epoch": 6.1477417508496375, "grad_norm": 16.430068969726562, "learning_rate": 5.064164714447782e-05, "loss": 7.5318, "step": 1509100 }, { "epoch": 6.148149128873019, "grad_norm": 5.408413887023926, "learning_rate": 5.055251865631089e-05, "loss": 7.5317, "step": 1509200 }, { "epoch": 6.1485565068964005, "grad_norm": 6.283885478973389, "learning_rate": 5.046348720558015e-05, "loss": 7.5328, "step": 1509300 }, { "epoch": 6.148963884919782, "grad_norm": 13.609271049499512, "learning_rate": 5.037455279580546e-05, "loss": 7.5264, "step": 1509400 }, { "epoch": 6.149371262943164, "grad_norm": 19.50933265686035, "learning_rate": 5.02857154305028e-05, "loss": 7.5452, "step": 1509500 }, { "epoch": 6.149778640966545, "grad_norm": 22.08439064025879, "learning_rate": 5.019697511318486e-05, "loss": 7.5474, "step": 1509600 }, { "epoch": 6.150186018989927, "grad_norm": 18.241321563720703, "learning_rate": 5.010833184735959e-05, "loss": 7.495, "step": 1509700 }, { "epoch": 6.150593397013308, "grad_norm": 15.491229057312012, "learning_rate": 5.001978563653154e-05, "loss": 7.5639, "step": 1509800 }, { "epoch": 6.15100077503669, "grad_norm": 3.1370866298675537, "learning_rate": 4.9931336484201755e-05, "loss": 7.5406, "step": 1509900 }, { "epoch": 6.151408153060071, "grad_norm": 18.79566192626953, "learning_rate": 4.984298439386675e-05, "loss": 7.5138, "step": 1510000 }, { "epoch": 6.151408153060071, "eval_MaskedAccuracy": 0.5132923806800755, "eval_loss": 1.5832467079162598, "eval_runtime": 199.0901, "eval_samples_per_second": 318.831, "eval_steps_per_second": 1.246, "step": 1510000 }, { "epoch": 6.151815531083453, "grad_norm": 20.028486251831055, "learning_rate": 4.975472936901961e-05, "loss": 7.5247, "step": 1510100 }, { "epoch": 6.1522229091068334, "grad_norm": 4.8658599853515625, "learning_rate": 4.966657141314969e-05, "loss": 7.5394, "step": 1510200 }, { "epoch": 6.152630287130215, "grad_norm": 36.72107696533203, "learning_rate": 4.9578510529742375e-05, "loss": 7.518, "step": 1510300 }, { "epoch": 6.1530376651535965, "grad_norm": 37.59674835205078, "learning_rate": 4.949054672227884e-05, "loss": 7.5205, "step": 1510400 }, { "epoch": 6.153445043176978, "grad_norm": 3.0896642208099365, "learning_rate": 4.940267999423704e-05, "loss": 7.5015, "step": 1510500 }, { "epoch": 6.15385242120036, "grad_norm": 5.979048252105713, "learning_rate": 4.9314910349090614e-05, "loss": 7.5147, "step": 1510600 }, { "epoch": 6.154259799223741, "grad_norm": 38.82964324951172, "learning_rate": 4.9227237790310136e-05, "loss": 7.5136, "step": 1510700 }, { "epoch": 6.154667177247123, "grad_norm": 7.174513816833496, "learning_rate": 4.913966232136086e-05, "loss": 7.5285, "step": 1510800 }, { "epoch": 6.155074555270504, "grad_norm": 20.411285400390625, "learning_rate": 4.905218394570606e-05, "loss": 7.5316, "step": 1510900 }, { "epoch": 6.155481933293886, "grad_norm": 39.270877838134766, "learning_rate": 4.896480266680332e-05, "loss": 7.5294, "step": 1511000 }, { "epoch": 6.155481933293886, "eval_MaskedAccuracy": 0.5138560436909618, "eval_loss": 1.5828372240066528, "eval_runtime": 168.6993, "eval_samples_per_second": 376.267, "eval_steps_per_second": 1.47, "step": 1511000 }, { "epoch": 6.155889311317267, "grad_norm": 4.460048198699951, "learning_rate": 4.887751848810792e-05, "loss": 7.5489, "step": 1511100 }, { "epoch": 6.156296689340649, "grad_norm": 3.7612500190734863, "learning_rate": 4.879033141307078e-05, "loss": 7.5246, "step": 1511200 }, { "epoch": 6.15670406736403, "grad_norm": 5.42179536819458, "learning_rate": 4.8703241445137996e-05, "loss": 7.5126, "step": 1511300 }, { "epoch": 6.157111445387411, "grad_norm": 18.84461212158203, "learning_rate": 4.861624858775356e-05, "loss": 7.5481, "step": 1511400 }, { "epoch": 6.1575188234107925, "grad_norm": 10.883890151977539, "learning_rate": 4.85293528443564e-05, "loss": 7.4938, "step": 1511500 }, { "epoch": 6.157926201434174, "grad_norm": 30.8356990814209, "learning_rate": 4.8442554218382146e-05, "loss": 7.5118, "step": 1511600 }, { "epoch": 6.1583335794575556, "grad_norm": 9.673890113830566, "learning_rate": 4.835585271326236e-05, "loss": 7.514, "step": 1511700 }, { "epoch": 6.158740957480937, "grad_norm": 23.722261428833008, "learning_rate": 4.826924833242485e-05, "loss": 7.5246, "step": 1511800 }, { "epoch": 6.159148335504319, "grad_norm": 16.473833084106445, "learning_rate": 4.8182741079293106e-05, "loss": 7.534, "step": 1511900 }, { "epoch": 6.1595557135277, "grad_norm": 8.175965309143066, "learning_rate": 4.809633095728808e-05, "loss": 7.4927, "step": 1512000 }, { "epoch": 6.1595557135277, "eval_MaskedAccuracy": 0.5136075506337975, "eval_loss": 1.5865976810455322, "eval_runtime": 163.032, "eval_samples_per_second": 389.347, "eval_steps_per_second": 1.521, "step": 1512000 }, { "epoch": 6.159963091551082, "grad_norm": 3.847217082977295, "learning_rate": 4.801001796982517e-05, "loss": 7.5415, "step": 1512100 }, { "epoch": 6.160370469574463, "grad_norm": 3.657407522201538, "learning_rate": 4.792380212031735e-05, "loss": 7.5284, "step": 1512200 }, { "epoch": 6.160777847597845, "grad_norm": 14.559707641601562, "learning_rate": 4.7837683412172776e-05, "loss": 7.507, "step": 1512300 }, { "epoch": 6.161185225621226, "grad_norm": 3.464966297149658, "learning_rate": 4.7751661848796645e-05, "loss": 7.5167, "step": 1512400 }, { "epoch": 6.161592603644608, "grad_norm": 4.260078430175781, "learning_rate": 4.766573743358945e-05, "loss": 7.5152, "step": 1512500 }, { "epoch": 6.161999981667989, "grad_norm": 4.311563968658447, "learning_rate": 4.757991016994856e-05, "loss": 7.4896, "step": 1512600 }, { "epoch": 6.16240735969137, "grad_norm": 7.310108661651611, "learning_rate": 4.749418006126697e-05, "loss": 7.512, "step": 1512700 }, { "epoch": 6.1628147377147515, "grad_norm": 16.643510818481445, "learning_rate": 4.740854711093398e-05, "loss": 7.5254, "step": 1512800 }, { "epoch": 6.163222115738133, "grad_norm": 15.554253578186035, "learning_rate": 4.732301132233517e-05, "loss": 7.5593, "step": 1512900 }, { "epoch": 6.163629493761515, "grad_norm": 11.946845054626465, "learning_rate": 4.7237572698852495e-05, "loss": 7.5142, "step": 1513000 }, { "epoch": 6.163629493761515, "eval_MaskedAccuracy": 0.5135302717421424, "eval_loss": 1.58794105052948, "eval_runtime": 152.674, "eval_samples_per_second": 415.762, "eval_steps_per_second": 1.624, "step": 1513000 }, { "epoch": 6.164036871784896, "grad_norm": 21.967002868652344, "learning_rate": 4.7152231243863595e-05, "loss": 7.5124, "step": 1513100 }, { "epoch": 6.164444249808278, "grad_norm": 17.184104919433594, "learning_rate": 4.706698696074239e-05, "loss": 7.4828, "step": 1513200 }, { "epoch": 6.164851627831659, "grad_norm": 6.202548027038574, "learning_rate": 4.698183985285888e-05, "loss": 7.5394, "step": 1513300 }, { "epoch": 6.165259005855041, "grad_norm": 24.404335021972656, "learning_rate": 4.6896789923580024e-05, "loss": 7.5371, "step": 1513400 }, { "epoch": 6.165666383878422, "grad_norm": 11.398333549499512, "learning_rate": 4.681183717626802e-05, "loss": 7.5247, "step": 1513500 }, { "epoch": 6.166073761901804, "grad_norm": 6.04208517074585, "learning_rate": 4.6726981614281246e-05, "loss": 7.5326, "step": 1513600 }, { "epoch": 6.166481139925185, "grad_norm": 22.410314559936523, "learning_rate": 4.6642223240974716e-05, "loss": 7.5411, "step": 1513700 }, { "epoch": 6.166888517948567, "grad_norm": 2.2639403343200684, "learning_rate": 4.6557562059699315e-05, "loss": 7.5159, "step": 1513800 }, { "epoch": 6.167295895971948, "grad_norm": 12.063501358032227, "learning_rate": 4.647299807380204e-05, "loss": 7.5088, "step": 1513900 }, { "epoch": 6.167703273995329, "grad_norm": 2.2603001594543457, "learning_rate": 4.6388531286626527e-05, "loss": 7.4819, "step": 1514000 }, { "epoch": 6.167703273995329, "eval_MaskedAccuracy": 0.5137995287013225, "eval_loss": 1.583763599395752, "eval_runtime": 161.2835, "eval_samples_per_second": 393.568, "eval_steps_per_second": 1.538, "step": 1514000 }, { "epoch": 6.168110652018711, "grad_norm": 2.884631633758545, "learning_rate": 4.63041617015118e-05, "loss": 7.5158, "step": 1514100 }, { "epoch": 6.168518030042092, "grad_norm": 27.567920684814453, "learning_rate": 4.621988932179374e-05, "loss": 7.5014, "step": 1514200 }, { "epoch": 6.168925408065474, "grad_norm": 10.135107040405273, "learning_rate": 4.613571415080356e-05, "loss": 7.5129, "step": 1514300 }, { "epoch": 6.169332786088855, "grad_norm": 3.33738374710083, "learning_rate": 4.6051636191870015e-05, "loss": 7.5095, "step": 1514400 }, { "epoch": 6.169740164112237, "grad_norm": 11.586241722106934, "learning_rate": 4.5967655448316196e-05, "loss": 7.5481, "step": 1514500 }, { "epoch": 6.170147542135618, "grad_norm": 23.621122360229492, "learning_rate": 4.5883771923463146e-05, "loss": 7.517, "step": 1514600 }, { "epoch": 6.170554920159, "grad_norm": 4.466002464294434, "learning_rate": 4.5799985620626517e-05, "loss": 7.5084, "step": 1514700 }, { "epoch": 6.170962298182381, "grad_norm": 3.7232768535614014, "learning_rate": 4.571629654311955e-05, "loss": 7.4998, "step": 1514800 }, { "epoch": 6.171369676205763, "grad_norm": 6.155814170837402, "learning_rate": 4.563270469425045e-05, "loss": 7.5222, "step": 1514900 }, { "epoch": 6.171777054229144, "grad_norm": 8.497859954833984, "learning_rate": 4.5549210077323844e-05, "loss": 7.4826, "step": 1515000 }, { "epoch": 6.171777054229144, "eval_MaskedAccuracy": 0.5135340781686724, "eval_loss": 1.5922369956970215, "eval_runtime": 161.6478, "eval_samples_per_second": 392.681, "eval_steps_per_second": 1.534, "step": 1515000 }, { "epoch": 6.172184432252526, "grad_norm": 4.6390204429626465, "learning_rate": 4.5465812695641516e-05, "loss": 7.5042, "step": 1515100 }, { "epoch": 6.1725918102759065, "grad_norm": 3.622335910797119, "learning_rate": 4.538251255249984e-05, "loss": 7.5045, "step": 1515200 }, { "epoch": 6.172999188299288, "grad_norm": 9.310247421264648, "learning_rate": 4.529930965119211e-05, "loss": 7.566, "step": 1515300 }, { "epoch": 6.17340656632267, "grad_norm": 4.652536869049072, "learning_rate": 4.521620399500846e-05, "loss": 7.4942, "step": 1515400 }, { "epoch": 6.173813944346051, "grad_norm": 2.640489101409912, "learning_rate": 4.513319558723418e-05, "loss": 7.4872, "step": 1515500 }, { "epoch": 6.174221322369433, "grad_norm": 3.247558355331421, "learning_rate": 4.5050284431150864e-05, "loss": 7.5346, "step": 1515600 }, { "epoch": 6.174628700392814, "grad_norm": 2.5709688663482666, "learning_rate": 4.496747053003655e-05, "loss": 7.5257, "step": 1515700 }, { "epoch": 6.175036078416196, "grad_norm": 6.035948753356934, "learning_rate": 4.4884753887165394e-05, "loss": 7.5302, "step": 1515800 }, { "epoch": 6.175443456439577, "grad_norm": 22.370046615600586, "learning_rate": 4.4802134505807394e-05, "loss": 7.5076, "step": 1515900 }, { "epoch": 6.175850834462959, "grad_norm": 2.9051003456115723, "learning_rate": 4.4719612389229146e-05, "loss": 7.5285, "step": 1516000 }, { "epoch": 6.175850834462959, "eval_MaskedAccuracy": 0.5134246717936268, "eval_loss": 1.5783573389053345, "eval_runtime": 165.1756, "eval_samples_per_second": 384.294, "eval_steps_per_second": 1.501, "step": 1516000 }, { "epoch": 6.17625821248634, "grad_norm": 5.093085289001465, "learning_rate": 4.463718754069321e-05, "loss": 7.4848, "step": 1516100 }, { "epoch": 6.176665590509722, "grad_norm": 3.852621555328369, "learning_rate": 4.455485996345793e-05, "loss": 7.5193, "step": 1516200 }, { "epoch": 6.177072968533103, "grad_norm": 2.822230339050293, "learning_rate": 4.4472629660778647e-05, "loss": 7.5146, "step": 1516300 }, { "epoch": 6.177480346556484, "grad_norm": 4.570256233215332, "learning_rate": 4.4390496635905943e-05, "loss": 7.5227, "step": 1516400 }, { "epoch": 6.177887724579866, "grad_norm": 13.537642478942871, "learning_rate": 4.430846089208739e-05, "loss": 7.5323, "step": 1516500 }, { "epoch": 6.178295102603247, "grad_norm": 19.775623321533203, "learning_rate": 4.4226522432566154e-05, "loss": 7.4875, "step": 1516600 }, { "epoch": 6.178702480626629, "grad_norm": 26.371177673339844, "learning_rate": 4.414468126058138e-05, "loss": 7.5135, "step": 1516700 }, { "epoch": 6.17910985865001, "grad_norm": 10.75178337097168, "learning_rate": 4.406293737936908e-05, "loss": 7.5432, "step": 1516800 }, { "epoch": 6.179517236673392, "grad_norm": 9.740500450134277, "learning_rate": 4.398129079216093e-05, "loss": 7.5124, "step": 1516900 }, { "epoch": 6.179924614696773, "grad_norm": 4.074995994567871, "learning_rate": 4.38997415021846e-05, "loss": 7.5364, "step": 1517000 }, { "epoch": 6.179924614696773, "eval_MaskedAccuracy": 0.513934802638679, "eval_loss": 1.575881004333496, "eval_runtime": 158.1913, "eval_samples_per_second": 401.261, "eval_steps_per_second": 1.568, "step": 1517000 }, { "epoch": 6.180331992720155, "grad_norm": 8.33947467803955, "learning_rate": 4.3818289512664364e-05, "loss": 7.5176, "step": 1517100 }, { "epoch": 6.180739370743536, "grad_norm": 3.4337880611419678, "learning_rate": 4.3736934826820645e-05, "loss": 7.5319, "step": 1517200 }, { "epoch": 6.181146748766918, "grad_norm": 6.742792129516602, "learning_rate": 4.365567744786916e-05, "loss": 7.5248, "step": 1517300 }, { "epoch": 6.181554126790299, "grad_norm": 6.558507442474365, "learning_rate": 4.3574517379023636e-05, "loss": 7.4924, "step": 1517400 }, { "epoch": 6.181961504813681, "grad_norm": 6.90985107421875, "learning_rate": 4.3493454623491785e-05, "loss": 7.5271, "step": 1517500 }, { "epoch": 6.182368882837062, "grad_norm": 8.464466094970703, "learning_rate": 4.3412489184478695e-05, "loss": 7.5067, "step": 1517600 }, { "epoch": 6.182776260860443, "grad_norm": 3.467275619506836, "learning_rate": 4.333162106518546e-05, "loss": 7.5163, "step": 1517700 }, { "epoch": 6.183183638883825, "grad_norm": 3.563931703567505, "learning_rate": 4.32508502688092e-05, "loss": 7.5211, "step": 1517800 }, { "epoch": 6.183591016907206, "grad_norm": 5.884230613708496, "learning_rate": 4.317017679854316e-05, "loss": 7.5059, "step": 1517900 }, { "epoch": 6.183998394930588, "grad_norm": 16.4660587310791, "learning_rate": 4.3089600657576995e-05, "loss": 7.5391, "step": 1518000 }, { "epoch": 6.183998394930588, "eval_MaskedAccuracy": 0.5139440589898651, "eval_loss": 1.5817654132843018, "eval_runtime": 168.5904, "eval_samples_per_second": 376.51, "eval_steps_per_second": 1.471, "step": 1518000 }, { "epoch": 6.184405772953969, "grad_norm": 3.888291120529175, "learning_rate": 4.300912184909597e-05, "loss": 7.5127, "step": 1518100 }, { "epoch": 6.184813150977351, "grad_norm": 3.8665812015533447, "learning_rate": 4.292874037628201e-05, "loss": 7.531, "step": 1518200 }, { "epoch": 6.185220529000732, "grad_norm": 19.788267135620117, "learning_rate": 4.284845624231284e-05, "loss": 7.5273, "step": 1518300 }, { "epoch": 6.185627907024114, "grad_norm": 23.828781127929688, "learning_rate": 4.2768269450363166e-05, "loss": 7.4969, "step": 1518400 }, { "epoch": 6.186035285047495, "grad_norm": 5.883404731750488, "learning_rate": 4.2688180003602385e-05, "loss": 7.5205, "step": 1518500 }, { "epoch": 6.186442663070877, "grad_norm": 5.5435638427734375, "learning_rate": 4.260818790519722e-05, "loss": 7.531, "step": 1518600 }, { "epoch": 6.186850041094258, "grad_norm": 15.866876602172852, "learning_rate": 4.252829315831023e-05, "loss": 7.5274, "step": 1518700 }, { "epoch": 6.18725741911764, "grad_norm": 26.80928611755371, "learning_rate": 4.244849576610024e-05, "loss": 7.5126, "step": 1518800 }, { "epoch": 6.1876647971410215, "grad_norm": 10.450453758239746, "learning_rate": 4.236879573172178e-05, "loss": 7.5225, "step": 1518900 }, { "epoch": 6.188072175164402, "grad_norm": 10.120019912719727, "learning_rate": 4.228919305832569e-05, "loss": 7.5081, "step": 1519000 }, { "epoch": 6.188072175164402, "eval_MaskedAccuracy": 0.5138981129712925, "eval_loss": 1.5910037755966187, "eval_runtime": 167.1661, "eval_samples_per_second": 379.718, "eval_steps_per_second": 1.484, "step": 1519000 }, { "epoch": 6.188479553187784, "grad_norm": 7.581277847290039, "learning_rate": 4.220968774905958e-05, "loss": 7.5142, "step": 1519100 }, { "epoch": 6.188886931211165, "grad_norm": 16.236650466918945, "learning_rate": 4.2130279807066024e-05, "loss": 7.5143, "step": 1519200 }, { "epoch": 6.189294309234547, "grad_norm": 2.630388021469116, "learning_rate": 4.205096923548509e-05, "loss": 7.5378, "step": 1519300 }, { "epoch": 6.189701687257928, "grad_norm": 5.415067195892334, "learning_rate": 4.197175603745242e-05, "loss": 7.526, "step": 1519400 }, { "epoch": 6.19010906528131, "grad_norm": 3.192352533340454, "learning_rate": 4.189264021609915e-05, "loss": 7.4874, "step": 1519500 }, { "epoch": 6.190516443304691, "grad_norm": 9.589027404785156, "learning_rate": 4.181362177455327e-05, "loss": 7.4991, "step": 1519600 }, { "epoch": 6.190923821328073, "grad_norm": 19.218538284301758, "learning_rate": 4.1734700715939324e-05, "loss": 7.5384, "step": 1519700 }, { "epoch": 6.191331199351454, "grad_norm": 10.557171821594238, "learning_rate": 4.165587704337692e-05, "loss": 7.5177, "step": 1519800 }, { "epoch": 6.191738577374836, "grad_norm": 12.861437797546387, "learning_rate": 4.157715075998255e-05, "loss": 7.5136, "step": 1519900 }, { "epoch": 6.192145955398217, "grad_norm": 4.201754093170166, "learning_rate": 4.1498521868868894e-05, "loss": 7.5285, "step": 1520000 }, { "epoch": 6.192145955398217, "eval_MaskedAccuracy": 0.5133526491845718, "eval_loss": 1.5832812786102295, "eval_runtime": 162.1236, "eval_samples_per_second": 391.529, "eval_steps_per_second": 1.53, "step": 1520000 }, { "epoch": 6.192553333421599, "grad_norm": 11.104284286499023, "learning_rate": 4.1419990373144196e-05, "loss": 7.5235, "step": 1520100 }, { "epoch": 6.19296071144498, "grad_norm": 3.3993284702301025, "learning_rate": 4.1341556275913546e-05, "loss": 7.5234, "step": 1520200 }, { "epoch": 6.193368089468361, "grad_norm": 22.561065673828125, "learning_rate": 4.126321958027776e-05, "loss": 7.5138, "step": 1520300 }, { "epoch": 6.193775467491743, "grad_norm": 4.123826026916504, "learning_rate": 4.118498028933399e-05, "loss": 7.5266, "step": 1520400 }, { "epoch": 6.194182845515124, "grad_norm": 15.193856239318848, "learning_rate": 4.1106838406175176e-05, "loss": 7.5349, "step": 1520500 }, { "epoch": 6.194590223538506, "grad_norm": 3.9755845069885254, "learning_rate": 4.102879393389102e-05, "loss": 7.5272, "step": 1520600 }, { "epoch": 6.194997601561887, "grad_norm": 7.0829291343688965, "learning_rate": 4.095084687556674e-05, "loss": 7.5289, "step": 1520700 }, { "epoch": 6.195404979585269, "grad_norm": 3.483314275741577, "learning_rate": 4.0872997234284283e-05, "loss": 7.5391, "step": 1520800 }, { "epoch": 6.19581235760865, "grad_norm": 17.247650146484375, "learning_rate": 4.0795245013121084e-05, "loss": 7.5351, "step": 1520900 }, { "epoch": 6.196219735632032, "grad_norm": 6.554419994354248, "learning_rate": 4.071759021515159e-05, "loss": 7.5394, "step": 1521000 }, { "epoch": 6.196219735632032, "eval_MaskedAccuracy": 0.5134685783312823, "eval_loss": 1.588786005973816, "eval_runtime": 169.899, "eval_samples_per_second": 373.61, "eval_steps_per_second": 1.46, "step": 1521000 }, { "epoch": 6.196627113655413, "grad_norm": 5.016470909118652, "learning_rate": 4.064003284344553e-05, "loss": 7.5088, "step": 1521100 }, { "epoch": 6.197034491678795, "grad_norm": 6.9189043045043945, "learning_rate": 4.056257290106897e-05, "loss": 7.5119, "step": 1521200 }, { "epoch": 6.1974418697021765, "grad_norm": 3.6506412029266357, "learning_rate": 4.0485210391085124e-05, "loss": 7.5276, "step": 1521300 }, { "epoch": 6.197849247725557, "grad_norm": 13.37964916229248, "learning_rate": 4.040794531655236e-05, "loss": 7.5099, "step": 1521400 }, { "epoch": 6.198256625748939, "grad_norm": 16.245330810546875, "learning_rate": 4.033077768052481e-05, "loss": 7.5414, "step": 1521500 }, { "epoch": 6.19866400377232, "grad_norm": 3.8642942905426025, "learning_rate": 4.025370748605361e-05, "loss": 7.524, "step": 1521600 }, { "epoch": 6.199071381795702, "grad_norm": 2.934980630874634, "learning_rate": 4.0176734736185714e-05, "loss": 7.5104, "step": 1521700 }, { "epoch": 6.199478759819083, "grad_norm": 6.9958062171936035, "learning_rate": 4.009985943396432e-05, "loss": 7.5184, "step": 1521800 }, { "epoch": 6.199886137842465, "grad_norm": 6.103975772857666, "learning_rate": 4.0023081582428976e-05, "loss": 7.5436, "step": 1521900 }, { "epoch": 6.200293515865846, "grad_norm": 3.3343257904052734, "learning_rate": 3.994640118461442e-05, "loss": 7.5268, "step": 1522000 }, { "epoch": 6.200293515865846, "eval_MaskedAccuracy": 0.5136080354808839, "eval_loss": 1.5869801044464111, "eval_runtime": 171.0967, "eval_samples_per_second": 370.995, "eval_steps_per_second": 1.449, "step": 1522000 }, { "epoch": 6.200700893889228, "grad_norm": 5.00773811340332, "learning_rate": 3.986981824355302e-05, "loss": 7.5252, "step": 1522100 }, { "epoch": 6.201108271912609, "grad_norm": 6.265585899353027, "learning_rate": 3.979333276227185e-05, "loss": 7.5237, "step": 1522200 }, { "epoch": 6.201515649935991, "grad_norm": 3.543001651763916, "learning_rate": 3.9716944743795216e-05, "loss": 7.5433, "step": 1522300 }, { "epoch": 6.2019230279593724, "grad_norm": 17.54358673095703, "learning_rate": 3.9640654191142873e-05, "loss": 7.5434, "step": 1522400 }, { "epoch": 6.202330405982754, "grad_norm": 2.7225570678710938, "learning_rate": 3.9564461107331214e-05, "loss": 7.5082, "step": 1522500 }, { "epoch": 6.2027377840061355, "grad_norm": 4.40688943862915, "learning_rate": 3.948836549537261e-05, "loss": 7.5155, "step": 1522600 }, { "epoch": 6.203145162029516, "grad_norm": 7.3498101234436035, "learning_rate": 3.94123673582752e-05, "loss": 7.4926, "step": 1522700 }, { "epoch": 6.203552540052898, "grad_norm": 4.559058666229248, "learning_rate": 3.9336466699043944e-05, "loss": 7.5342, "step": 1522800 }, { "epoch": 6.203959918076279, "grad_norm": 9.517656326293945, "learning_rate": 3.9260663520679325e-05, "loss": 7.5407, "step": 1522900 }, { "epoch": 6.204367296099661, "grad_norm": 11.327088356018066, "learning_rate": 3.9184957826178475e-05, "loss": 7.5095, "step": 1523000 }, { "epoch": 6.204367296099661, "eval_MaskedAccuracy": 0.5134649629722284, "eval_loss": 1.58461594581604, "eval_runtime": 175.035, "eval_samples_per_second": 362.647, "eval_steps_per_second": 1.417, "step": 1523000 }, { "epoch": 6.204774674123042, "grad_norm": 8.06092357635498, "learning_rate": 3.910934961853442e-05, "loss": 7.5309, "step": 1523100 }, { "epoch": 6.205182052146424, "grad_norm": 6.9076457023620605, "learning_rate": 3.903383890073621e-05, "loss": 7.5178, "step": 1523200 }, { "epoch": 6.205589430169805, "grad_norm": 15.919775009155273, "learning_rate": 3.8958425675769104e-05, "loss": 7.5187, "step": 1523300 }, { "epoch": 6.205996808193187, "grad_norm": 5.286274433135986, "learning_rate": 3.8883109946615025e-05, "loss": 7.5551, "step": 1523400 }, { "epoch": 6.206404186216568, "grad_norm": 23.460248947143555, "learning_rate": 3.880789171625117e-05, "loss": 7.514, "step": 1523500 }, { "epoch": 6.20681156423995, "grad_norm": 5.665782928466797, "learning_rate": 3.873277098765169e-05, "loss": 7.528, "step": 1523600 }, { "epoch": 6.2072189422633315, "grad_norm": 9.24010181427002, "learning_rate": 3.865774776378606e-05, "loss": 7.5389, "step": 1523700 }, { "epoch": 6.207626320286713, "grad_norm": 5.158371448516846, "learning_rate": 3.85828220476206e-05, "loss": 7.5244, "step": 1523800 }, { "epoch": 6.2080336983100946, "grad_norm": 9.36806583404541, "learning_rate": 3.8507993842117614e-05, "loss": 7.5292, "step": 1523900 }, { "epoch": 6.208441076333475, "grad_norm": 11.890572547912598, "learning_rate": 3.843326315023518e-05, "loss": 7.509, "step": 1524000 }, { "epoch": 6.208441076333475, "eval_MaskedAccuracy": 0.5138990814876041, "eval_loss": 1.584986686706543, "eval_runtime": 156.982, "eval_samples_per_second": 404.352, "eval_steps_per_second": 1.58, "step": 1524000 }, { "epoch": 6.208848454356857, "grad_norm": 7.550594806671143, "learning_rate": 3.835862997492808e-05, "loss": 7.5279, "step": 1524100 }, { "epoch": 6.209255832380238, "grad_norm": 5.89973783493042, "learning_rate": 3.828409431914691e-05, "loss": 7.5038, "step": 1524200 }, { "epoch": 6.20966321040362, "grad_norm": 10.729607582092285, "learning_rate": 3.820965618583845e-05, "loss": 7.5017, "step": 1524300 }, { "epoch": 6.210070588427001, "grad_norm": 21.639862060546875, "learning_rate": 3.8135315577945516e-05, "loss": 7.5341, "step": 1524400 }, { "epoch": 6.210477966450383, "grad_norm": 7.035912990570068, "learning_rate": 3.806107249840707e-05, "loss": 7.5192, "step": 1524500 }, { "epoch": 6.210885344473764, "grad_norm": 21.44525718688965, "learning_rate": 3.79869269501588e-05, "loss": 7.5253, "step": 1524600 }, { "epoch": 6.211292722497146, "grad_norm": 19.483362197875977, "learning_rate": 3.7912878936131835e-05, "loss": 7.5164, "step": 1524700 }, { "epoch": 6.2117001005205275, "grad_norm": 15.843347549438477, "learning_rate": 3.7838928459253625e-05, "loss": 7.5308, "step": 1524800 }, { "epoch": 6.212107478543909, "grad_norm": 6.50057315826416, "learning_rate": 3.7765075522447876e-05, "loss": 7.505, "step": 1524900 }, { "epoch": 6.2125148565672905, "grad_norm": 11.148566246032715, "learning_rate": 3.769132012863448e-05, "loss": 7.5198, "step": 1525000 }, { "epoch": 6.2125148565672905, "eval_MaskedAccuracy": 0.5138115478730861, "eval_loss": 1.5826061964035034, "eval_runtime": 158.9275, "eval_samples_per_second": 399.402, "eval_steps_per_second": 1.56, "step": 1525000 }, { "epoch": 6.212922234590672, "grad_norm": 3.6122076511383057, "learning_rate": 3.7617662280729385e-05, "loss": 7.5164, "step": 1525100 }, { "epoch": 6.213329612614053, "grad_norm": 6.010428428649902, "learning_rate": 3.7544101981644156e-05, "loss": 7.5452, "step": 1525200 }, { "epoch": 6.213736990637434, "grad_norm": 3.211630344390869, "learning_rate": 3.7470639234287856e-05, "loss": 7.5441, "step": 1525300 }, { "epoch": 6.214144368660816, "grad_norm": 27.52581214904785, "learning_rate": 3.739727404156428e-05, "loss": 7.5137, "step": 1525400 }, { "epoch": 6.214551746684197, "grad_norm": 30.541147232055664, "learning_rate": 3.7324006406374174e-05, "loss": 7.5236, "step": 1525500 }, { "epoch": 6.214959124707579, "grad_norm": 3.39619779586792, "learning_rate": 3.725083633161411e-05, "loss": 7.5184, "step": 1525600 }, { "epoch": 6.21536650273096, "grad_norm": 4.325729846954346, "learning_rate": 3.717776382017712e-05, "loss": 7.5417, "step": 1525700 }, { "epoch": 6.215773880754342, "grad_norm": 24.767698287963867, "learning_rate": 3.710478887495175e-05, "loss": 7.5108, "step": 1525800 }, { "epoch": 6.216181258777723, "grad_norm": 27.488677978515625, "learning_rate": 3.7031911498823575e-05, "loss": 7.544, "step": 1525900 }, { "epoch": 6.216588636801105, "grad_norm": 10.902069091796875, "learning_rate": 3.6959131694673304e-05, "loss": 7.5298, "step": 1526000 }, { "epoch": 6.216588636801105, "eval_MaskedAccuracy": 0.5132878047846802, "eval_loss": 1.5875064134597778, "eval_runtime": 166.3401, "eval_samples_per_second": 381.604, "eval_steps_per_second": 1.491, "step": 1526000 }, { "epoch": 6.2169960148244865, "grad_norm": 19.047046661376953, "learning_rate": 3.688644946537874e-05, "loss": 7.506, "step": 1526100 }, { "epoch": 6.217403392847868, "grad_norm": 3.6258020401000977, "learning_rate": 3.6813864813812876e-05, "loss": 7.5216, "step": 1526200 }, { "epoch": 6.21781077087125, "grad_norm": 9.296761512756348, "learning_rate": 3.6741377742846076e-05, "loss": 7.5151, "step": 1526300 }, { "epoch": 6.21821814889463, "grad_norm": 3.2167930603027344, "learning_rate": 3.666898825534362e-05, "loss": 7.512, "step": 1526400 }, { "epoch": 6.218625526918012, "grad_norm": 6.6329498291015625, "learning_rate": 3.659669635416776e-05, "loss": 7.4934, "step": 1526500 }, { "epoch": 6.219032904941393, "grad_norm": 10.958048820495605, "learning_rate": 3.652450204217629e-05, "loss": 7.5153, "step": 1526600 }, { "epoch": 6.219440282964775, "grad_norm": 4.174325942993164, "learning_rate": 3.6452405322223724e-05, "loss": 7.5157, "step": 1526700 }, { "epoch": 6.219847660988156, "grad_norm": 3.164076328277588, "learning_rate": 3.638040619716043e-05, "loss": 7.5035, "step": 1526800 }, { "epoch": 6.220255039011538, "grad_norm": 9.154379844665527, "learning_rate": 3.6308504669832514e-05, "loss": 7.5006, "step": 1526900 }, { "epoch": 6.220662417034919, "grad_norm": 3.227844476699829, "learning_rate": 3.623670074308285e-05, "loss": 7.5327, "step": 1527000 }, { "epoch": 6.220662417034919, "eval_MaskedAccuracy": 0.5142326860902728, "eval_loss": 1.5779016017913818, "eval_runtime": 430.3278, "eval_samples_per_second": 147.506, "eval_steps_per_second": 0.576, "step": 1527000 }, { "epoch": 6.221069795058301, "grad_norm": 3.534832000732422, "learning_rate": 3.616499441975046e-05, "loss": 7.5095, "step": 1527100 }, { "epoch": 6.2214771730816825, "grad_norm": 2.9680981636047363, "learning_rate": 3.609338570267012e-05, "loss": 7.4944, "step": 1527200 }, { "epoch": 6.221884551105064, "grad_norm": 8.061978340148926, "learning_rate": 3.602187459467277e-05, "loss": 7.5392, "step": 1527300 }, { "epoch": 6.2222919291284455, "grad_norm": 2.630624532699585, "learning_rate": 3.595046109858575e-05, "loss": 7.5047, "step": 1527400 }, { "epoch": 6.222699307151827, "grad_norm": 5.866968154907227, "learning_rate": 3.587914521723235e-05, "loss": 7.5182, "step": 1527500 }, { "epoch": 6.223106685175209, "grad_norm": 7.416944980621338, "learning_rate": 3.580792695343208e-05, "loss": 7.4837, "step": 1527600 }, { "epoch": 6.223514063198589, "grad_norm": 8.481433868408203, "learning_rate": 3.5736806310000895e-05, "loss": 7.5198, "step": 1527700 }, { "epoch": 6.223921441221971, "grad_norm": 18.279815673828125, "learning_rate": 3.56657832897501e-05, "loss": 7.5131, "step": 1527800 }, { "epoch": 6.224328819245352, "grad_norm": 8.598742485046387, "learning_rate": 3.5594857895487754e-05, "loss": 7.5433, "step": 1527900 }, { "epoch": 6.224736197268734, "grad_norm": 22.971736907958984, "learning_rate": 3.5524030130018395e-05, "loss": 7.5203, "step": 1528000 }, { "epoch": 6.224736197268734, "eval_MaskedAccuracy": 0.5146985033384197, "eval_loss": 1.5790197849273682, "eval_runtime": 159.5687, "eval_samples_per_second": 397.797, "eval_steps_per_second": 1.554, "step": 1528000 }, { "epoch": 6.225143575292115, "grad_norm": 20.58827781677246, "learning_rate": 3.545329999614137e-05, "loss": 7.5162, "step": 1528100 }, { "epoch": 6.225550953315497, "grad_norm": 13.78675651550293, "learning_rate": 3.538266749665359e-05, "loss": 7.531, "step": 1528200 }, { "epoch": 6.225958331338878, "grad_norm": 16.073307037353516, "learning_rate": 3.531213263434752e-05, "loss": 7.5065, "step": 1528300 }, { "epoch": 6.22636570936226, "grad_norm": 9.83819580078125, "learning_rate": 3.5241695412011664e-05, "loss": 7.5093, "step": 1528400 }, { "epoch": 6.2267730873856415, "grad_norm": 2.6550517082214355, "learning_rate": 3.517135583243071e-05, "loss": 7.5435, "step": 1528500 }, { "epoch": 6.227180465409023, "grad_norm": 13.715490341186523, "learning_rate": 3.5101113898385543e-05, "loss": 7.5137, "step": 1528600 }, { "epoch": 6.227587843432405, "grad_norm": 7.189116477966309, "learning_rate": 3.503096961265335e-05, "loss": 7.5168, "step": 1528700 }, { "epoch": 6.227995221455786, "grad_norm": 15.785459518432617, "learning_rate": 3.496092297800741e-05, "loss": 7.4834, "step": 1528800 }, { "epoch": 6.228402599479168, "grad_norm": 5.474700450897217, "learning_rate": 3.489097399721666e-05, "loss": 7.5014, "step": 1528900 }, { "epoch": 6.228809977502548, "grad_norm": 9.012724876403809, "learning_rate": 3.482112267304694e-05, "loss": 7.5346, "step": 1529000 }, { "epoch": 6.228809977502548, "eval_MaskedAccuracy": 0.5141386005417878, "eval_loss": 1.5893563032150269, "eval_runtime": 166.1825, "eval_samples_per_second": 381.966, "eval_steps_per_second": 1.492, "step": 1529000 }, { "epoch": 6.22921735552593, "grad_norm": 9.357582092285156, "learning_rate": 3.475136900825944e-05, "loss": 7.5225, "step": 1529100 }, { "epoch": 6.229624733549311, "grad_norm": 4.897263526916504, "learning_rate": 3.4681713005612175e-05, "loss": 7.5036, "step": 1529200 }, { "epoch": 6.230032111572693, "grad_norm": 5.631476402282715, "learning_rate": 3.4612154667859114e-05, "loss": 7.4929, "step": 1529300 }, { "epoch": 6.230439489596074, "grad_norm": 4.668044090270996, "learning_rate": 3.4542693997750315e-05, "loss": 7.5267, "step": 1529400 }, { "epoch": 6.230846867619456, "grad_norm": 9.86772346496582, "learning_rate": 3.447333099803139e-05, "loss": 7.4955, "step": 1529500 }, { "epoch": 6.2312542456428375, "grad_norm": 14.227707862854004, "learning_rate": 3.440406567144532e-05, "loss": 7.5219, "step": 1529600 }, { "epoch": 6.231661623666219, "grad_norm": 6.231542110443115, "learning_rate": 3.433489802073016e-05, "loss": 7.5418, "step": 1529700 }, { "epoch": 6.2320690016896005, "grad_norm": 5.305866241455078, "learning_rate": 3.426582804862045e-05, "loss": 7.5455, "step": 1529800 }, { "epoch": 6.232476379712982, "grad_norm": 3.2614071369171143, "learning_rate": 3.419685575784688e-05, "loss": 7.4905, "step": 1529900 }, { "epoch": 6.232883757736364, "grad_norm": 3.040539264678955, "learning_rate": 3.4127981151136506e-05, "loss": 7.505, "step": 1530000 }, { "epoch": 6.232883757736364, "eval_MaskedAccuracy": 0.514141880386851, "eval_loss": 1.5848793983459473, "eval_runtime": 162.9599, "eval_samples_per_second": 389.519, "eval_steps_per_second": 1.522, "step": 1530000 }, { "epoch": 6.233291135759745, "grad_norm": 13.58469009399414, "learning_rate": 3.4059204231212264e-05, "loss": 7.5222, "step": 1530100 }, { "epoch": 6.233698513783126, "grad_norm": 11.106911659240723, "learning_rate": 3.39905250007929e-05, "loss": 7.5219, "step": 1530200 }, { "epoch": 6.234105891806507, "grad_norm": 18.91869354248047, "learning_rate": 3.392194346259416e-05, "loss": 7.5355, "step": 1530300 }, { "epoch": 6.234513269829889, "grad_norm": 4.576848983764648, "learning_rate": 3.385345961932732e-05, "loss": 7.5324, "step": 1530400 }, { "epoch": 6.23492064785327, "grad_norm": 7.7829108238220215, "learning_rate": 3.378507347369977e-05, "loss": 7.5082, "step": 1530500 }, { "epoch": 6.235328025876652, "grad_norm": 4.414244651794434, "learning_rate": 3.371678502841532e-05, "loss": 7.5208, "step": 1530600 }, { "epoch": 6.2357354039000334, "grad_norm": 9.419246673583984, "learning_rate": 3.364859428617363e-05, "loss": 7.5306, "step": 1530700 }, { "epoch": 6.236142781923415, "grad_norm": 9.71709156036377, "learning_rate": 3.358050124967097e-05, "loss": 7.5313, "step": 1530800 }, { "epoch": 6.2365501599467965, "grad_norm": 6.383625030517578, "learning_rate": 3.3512505921598716e-05, "loss": 7.5133, "step": 1530900 }, { "epoch": 6.236957537970178, "grad_norm": 4.312445640563965, "learning_rate": 3.3444608304645974e-05, "loss": 7.4945, "step": 1531000 }, { "epoch": 6.236957537970178, "eval_MaskedAccuracy": 0.51399551571728, "eval_loss": 1.588167428970337, "eval_runtime": 160.9326, "eval_samples_per_second": 394.426, "eval_steps_per_second": 1.541, "step": 1531000 }, { "epoch": 6.23736491599356, "grad_norm": 6.269240379333496, "learning_rate": 3.3376808401496074e-05, "loss": 7.5257, "step": 1531100 }, { "epoch": 6.237772294016941, "grad_norm": 4.7655863761901855, "learning_rate": 3.330910621483068e-05, "loss": 7.5186, "step": 1531200 }, { "epoch": 6.238179672040323, "grad_norm": 5.086463451385498, "learning_rate": 3.324150174732555e-05, "loss": 7.5303, "step": 1531300 }, { "epoch": 6.238587050063703, "grad_norm": 5.014735221862793, "learning_rate": 3.317399500165379e-05, "loss": 7.5043, "step": 1531400 }, { "epoch": 6.238994428087085, "grad_norm": 4.463014602661133, "learning_rate": 3.31065859804846e-05, "loss": 7.5297, "step": 1531500 }, { "epoch": 6.239401806110466, "grad_norm": 2.4507670402526855, "learning_rate": 3.303927468648234e-05, "loss": 7.5197, "step": 1531600 }, { "epoch": 6.239809184133848, "grad_norm": 8.728959083557129, "learning_rate": 3.2972061122308544e-05, "loss": 7.5108, "step": 1531700 }, { "epoch": 6.240216562157229, "grad_norm": 4.4223504066467285, "learning_rate": 3.29049452906207e-05, "loss": 7.5097, "step": 1531800 }, { "epoch": 6.240623940180611, "grad_norm": 3.243269681930542, "learning_rate": 3.283792719407194e-05, "loss": 7.5469, "step": 1531900 }, { "epoch": 6.2410313182039925, "grad_norm": 14.289515495300293, "learning_rate": 3.2771006835311846e-05, "loss": 7.5332, "step": 1532000 }, { "epoch": 6.2410313182039925, "eval_MaskedAccuracy": 0.5137271034820764, "eval_loss": 1.579954981803894, "eval_runtime": 165.4127, "eval_samples_per_second": 383.743, "eval_steps_per_second": 1.499, "step": 1532000 }, { "epoch": 6.241438696227374, "grad_norm": 8.382390975952148, "learning_rate": 3.2704184216986484e-05, "loss": 7.5481, "step": 1532100 }, { "epoch": 6.2418460742507555, "grad_norm": 5.989925861358643, "learning_rate": 3.2637459341737396e-05, "loss": 7.5342, "step": 1532200 }, { "epoch": 6.242253452274137, "grad_norm": 7.9933762550354, "learning_rate": 3.257083221220268e-05, "loss": 7.509, "step": 1532300 }, { "epoch": 6.242660830297519, "grad_norm": 12.534502983093262, "learning_rate": 3.250430283101642e-05, "loss": 7.5119, "step": 1532400 }, { "epoch": 6.2430682083209, "grad_norm": 7.783627510070801, "learning_rate": 3.2437871200809e-05, "loss": 7.5304, "step": 1532500 }, { "epoch": 6.243475586344282, "grad_norm": 10.44333553314209, "learning_rate": 3.237153732420656e-05, "loss": 7.52, "step": 1532600 }, { "epoch": 6.243882964367662, "grad_norm": 8.387107849121094, "learning_rate": 3.2305301203831845e-05, "loss": 7.5243, "step": 1532700 }, { "epoch": 6.244290342391044, "grad_norm": 9.744940757751465, "learning_rate": 3.223916284230359e-05, "loss": 7.5026, "step": 1532800 }, { "epoch": 6.244697720414425, "grad_norm": 17.366146087646484, "learning_rate": 3.217312224223639e-05, "loss": 7.5411, "step": 1532900 }, { "epoch": 6.245105098437807, "grad_norm": 11.222664833068848, "learning_rate": 3.210717940624129e-05, "loss": 7.5166, "step": 1533000 }, { "epoch": 6.245105098437807, "eval_MaskedAccuracy": 0.5139523056378643, "eval_loss": 1.584529995918274, "eval_runtime": 162.7064, "eval_samples_per_second": 390.126, "eval_steps_per_second": 1.524, "step": 1533000 }, { "epoch": 6.2455124764611885, "grad_norm": 10.221765518188477, "learning_rate": 3.2041334336925136e-05, "loss": 7.5256, "step": 1533100 }, { "epoch": 6.24591985448457, "grad_norm": 6.283722877502441, "learning_rate": 3.1975587036891684e-05, "loss": 7.5122, "step": 1533200 }, { "epoch": 6.2463272325079515, "grad_norm": 3.4898505210876465, "learning_rate": 3.190993750873981e-05, "loss": 7.5054, "step": 1533300 }, { "epoch": 6.246734610531333, "grad_norm": 5.075881004333496, "learning_rate": 3.1844385755065e-05, "loss": 7.5287, "step": 1533400 }, { "epoch": 6.247141988554715, "grad_norm": 7.452518939971924, "learning_rate": 3.1778931778459104e-05, "loss": 7.5467, "step": 1533500 }, { "epoch": 6.247549366578096, "grad_norm": 5.194850444793701, "learning_rate": 3.1713575581509666e-05, "loss": 7.5174, "step": 1533600 }, { "epoch": 6.247956744601478, "grad_norm": 5.435227394104004, "learning_rate": 3.164831716680072e-05, "loss": 7.5003, "step": 1533700 }, { "epoch": 6.248364122624859, "grad_norm": 8.2947416305542, "learning_rate": 3.15831565369123e-05, "loss": 7.5104, "step": 1533800 }, { "epoch": 6.248771500648241, "grad_norm": 11.16115951538086, "learning_rate": 3.151809369442027e-05, "loss": 7.5256, "step": 1533900 }, { "epoch": 6.249178878671621, "grad_norm": 2.5040526390075684, "learning_rate": 3.1453128641897385e-05, "loss": 7.5049, "step": 1534000 }, { "epoch": 6.249178878671621, "eval_MaskedAccuracy": 0.5138741488608206, "eval_loss": 1.5921443700790405, "eval_runtime": 186.762, "eval_samples_per_second": 339.876, "eval_steps_per_second": 1.328, "step": 1534000 }, { "epoch": 6.249586256695003, "grad_norm": 7.071235179901123, "learning_rate": 3.138826138191145e-05, "loss": 7.4999, "step": 1534100 }, { "epoch": 6.249993634718384, "grad_norm": 12.430635452270508, "learning_rate": 3.1323491917027747e-05, "loss": 7.5131, "step": 1534200 }, { "epoch": 6.250401012741766, "grad_norm": 23.40462303161621, "learning_rate": 3.125882024980636e-05, "loss": 7.5258, "step": 1534300 }, { "epoch": 6.2508083907651475, "grad_norm": 9.99567985534668, "learning_rate": 3.119424638280426e-05, "loss": 7.5132, "step": 1534400 }, { "epoch": 6.251215768788529, "grad_norm": 7.518438816070557, "learning_rate": 3.112977031857426e-05, "loss": 7.505, "step": 1534500 }, { "epoch": 6.251623146811911, "grad_norm": 5.996940612792969, "learning_rate": 3.106539205966588e-05, "loss": 7.4946, "step": 1534600 }, { "epoch": 6.252030524835292, "grad_norm": 14.199740409851074, "learning_rate": 3.100111160862424e-05, "loss": 7.5317, "step": 1534700 }, { "epoch": 6.252437902858674, "grad_norm": 9.197056770324707, "learning_rate": 3.0936928967990237e-05, "loss": 7.5315, "step": 1534800 }, { "epoch": 6.252845280882055, "grad_norm": 16.446992874145508, "learning_rate": 3.087284414030175e-05, "loss": 7.5215, "step": 1534900 }, { "epoch": 6.253252658905437, "grad_norm": 4.474786281585693, "learning_rate": 3.08088571280925e-05, "loss": 7.5534, "step": 1535000 }, { "epoch": 6.253252658905437, "eval_MaskedAccuracy": 0.5135466027244717, "eval_loss": 1.5894109010696411, "eval_runtime": 151.873, "eval_samples_per_second": 417.955, "eval_steps_per_second": 1.633, "step": 1535000 }, { "epoch": 6.253660036928818, "grad_norm": 5.775058746337891, "learning_rate": 3.074496793389151e-05, "loss": 7.5043, "step": 1535100 }, { "epoch": 6.254067414952199, "grad_norm": 12.942586898803711, "learning_rate": 3.068117656022559e-05, "loss": 7.5248, "step": 1535200 }, { "epoch": 6.25447479297558, "grad_norm": 2.3693482875823975, "learning_rate": 3.061748300961626e-05, "loss": 7.5243, "step": 1535300 }, { "epoch": 6.254882170998962, "grad_norm": 2.6879940032958984, "learning_rate": 3.055388728458171e-05, "loss": 7.5414, "step": 1535400 }, { "epoch": 6.2552895490223435, "grad_norm": 3.1551711559295654, "learning_rate": 3.0490389387636312e-05, "loss": 7.5223, "step": 1535500 }, { "epoch": 6.255696927045725, "grad_norm": 2.420809745788574, "learning_rate": 3.0426989321290488e-05, "loss": 7.4876, "step": 1535600 }, { "epoch": 6.2561043050691065, "grad_norm": 2.420135974884033, "learning_rate": 3.0363687088050593e-05, "loss": 7.5174, "step": 1535700 }, { "epoch": 6.256511683092488, "grad_norm": 10.525727272033691, "learning_rate": 3.0300482690419547e-05, "loss": 7.5498, "step": 1535800 }, { "epoch": 6.25691906111587, "grad_norm": 6.917478084564209, "learning_rate": 3.023737613089595e-05, "loss": 7.5377, "step": 1535900 }, { "epoch": 6.257326439139251, "grad_norm": 7.227997779846191, "learning_rate": 3.0174367411974718e-05, "loss": 7.5425, "step": 1536000 }, { "epoch": 6.257326439139251, "eval_MaskedAccuracy": 0.5133481368811607, "eval_loss": 1.5899070501327515, "eval_runtime": 167.0447, "eval_samples_per_second": 379.994, "eval_steps_per_second": 1.485, "step": 1536000 }, { "epoch": 6.257733817162633, "grad_norm": 20.933719635009766, "learning_rate": 3.01114565361472e-05, "loss": 7.529, "step": 1536100 }, { "epoch": 6.258141195186014, "grad_norm": 7.92340087890625, "learning_rate": 3.00486435059006e-05, "loss": 7.5243, "step": 1536200 }, { "epoch": 6.258548573209396, "grad_norm": 3.8228554725646973, "learning_rate": 2.9985928323717917e-05, "loss": 7.5142, "step": 1536300 }, { "epoch": 6.258955951232776, "grad_norm": 7.7100419998168945, "learning_rate": 2.9923310992078577e-05, "loss": 7.5011, "step": 1536400 }, { "epoch": 6.259363329256158, "grad_norm": 13.193169593811035, "learning_rate": 2.9860791513458714e-05, "loss": 7.5286, "step": 1536500 }, { "epoch": 6.259770707279539, "grad_norm": 14.40185260772705, "learning_rate": 2.9798369890329746e-05, "loss": 7.5324, "step": 1536600 }, { "epoch": 6.260178085302921, "grad_norm": 2.7055928707122803, "learning_rate": 2.9736046125159188e-05, "loss": 7.514, "step": 1536700 }, { "epoch": 6.2605854633263025, "grad_norm": 6.486224174499512, "learning_rate": 2.967382022041179e-05, "loss": 7.5186, "step": 1536800 }, { "epoch": 6.260992841349684, "grad_norm": 4.19632625579834, "learning_rate": 2.961169217854702e-05, "loss": 7.523, "step": 1536900 }, { "epoch": 6.261400219373066, "grad_norm": 10.856332778930664, "learning_rate": 2.9549662002021355e-05, "loss": 7.5569, "step": 1537000 }, { "epoch": 6.261400219373066, "eval_MaskedAccuracy": 0.5135443953707618, "eval_loss": 1.5890820026397705, "eval_runtime": 180.6532, "eval_samples_per_second": 351.369, "eval_steps_per_second": 1.373, "step": 1537000 }, { "epoch": 6.261807597396447, "grad_norm": 3.8834424018859863, "learning_rate": 2.9487729693287067e-05, "loss": 7.4853, "step": 1537100 }, { "epoch": 6.262214975419829, "grad_norm": 10.020374298095703, "learning_rate": 2.9425895254793066e-05, "loss": 7.54, "step": 1537200 }, { "epoch": 6.26262235344321, "grad_norm": 6.428145408630371, "learning_rate": 2.936415868898367e-05, "loss": 7.529, "step": 1537300 }, { "epoch": 6.263029731466592, "grad_norm": 16.600547790527344, "learning_rate": 2.9302519998299572e-05, "loss": 7.5321, "step": 1537400 }, { "epoch": 6.263437109489973, "grad_norm": 10.559361457824707, "learning_rate": 2.924097918517798e-05, "loss": 7.5275, "step": 1537500 }, { "epoch": 6.263844487513355, "grad_norm": 19.23090171813965, "learning_rate": 2.9179536252051844e-05, "loss": 7.541, "step": 1537600 }, { "epoch": 6.264251865536735, "grad_norm": 20.963829040527344, "learning_rate": 2.9118191201349954e-05, "loss": 7.5423, "step": 1537700 }, { "epoch": 6.264659243560117, "grad_norm": 7.682584762573242, "learning_rate": 2.9056944035498205e-05, "loss": 7.5433, "step": 1537800 }, { "epoch": 6.2650666215834985, "grad_norm": 6.028975009918213, "learning_rate": 2.8995794756917608e-05, "loss": 7.5453, "step": 1537900 }, { "epoch": 6.26547399960688, "grad_norm": 32.538124084472656, "learning_rate": 2.8934743368025824e-05, "loss": 7.5045, "step": 1538000 }, { "epoch": 6.26547399960688, "eval_MaskedAccuracy": 0.5139198570778778, "eval_loss": 1.5882899761199951, "eval_runtime": 167.5515, "eval_samples_per_second": 378.845, "eval_steps_per_second": 1.48, "step": 1538000 }, { "epoch": 6.2658813776302615, "grad_norm": 24.366785049438477, "learning_rate": 2.8873789871236668e-05, "loss": 7.5334, "step": 1538100 }, { "epoch": 6.266288755653643, "grad_norm": 21.920454025268555, "learning_rate": 2.8812934268959743e-05, "loss": 7.5098, "step": 1538200 }, { "epoch": 6.266696133677025, "grad_norm": 10.484559059143066, "learning_rate": 2.875217656360108e-05, "loss": 7.523, "step": 1538300 }, { "epoch": 6.267103511700406, "grad_norm": 9.51395320892334, "learning_rate": 2.869151675756289e-05, "loss": 7.5014, "step": 1538400 }, { "epoch": 6.267510889723788, "grad_norm": 10.454301834106445, "learning_rate": 2.8630954853243323e-05, "loss": 7.5519, "step": 1538500 }, { "epoch": 6.267918267747169, "grad_norm": 10.728215217590332, "learning_rate": 2.8570490853036567e-05, "loss": 7.5193, "step": 1538600 }, { "epoch": 6.268325645770551, "grad_norm": 6.6606340408325195, "learning_rate": 2.8510124759333137e-05, "loss": 7.5209, "step": 1538700 }, { "epoch": 6.268733023793932, "grad_norm": 10.098960876464844, "learning_rate": 2.844985657451964e-05, "loss": 7.5453, "step": 1538800 }, { "epoch": 6.269140401817314, "grad_norm": 10.603069305419922, "learning_rate": 2.83896863009789e-05, "loss": 7.5287, "step": 1538900 }, { "epoch": 6.2695477798406944, "grad_norm": 2.9649927616119385, "learning_rate": 2.832961394108973e-05, "loss": 7.5094, "step": 1539000 }, { "epoch": 6.2695477798406944, "eval_MaskedAccuracy": 0.5134989380885026, "eval_loss": 1.5853767395019531, "eval_runtime": 158.3888, "eval_samples_per_second": 400.761, "eval_steps_per_second": 1.566, "step": 1539000 }, { "epoch": 6.269955157864076, "grad_norm": 9.790529251098633, "learning_rate": 2.826963949722687e-05, "loss": 7.4891, "step": 1539100 }, { "epoch": 6.2703625358874575, "grad_norm": 19.240970611572266, "learning_rate": 2.8209762971761705e-05, "loss": 7.5075, "step": 1539200 }, { "epoch": 6.270769913910839, "grad_norm": 7.542076587677002, "learning_rate": 2.8149984367061514e-05, "loss": 7.537, "step": 1539300 }, { "epoch": 6.271177291934221, "grad_norm": 7.113610744476318, "learning_rate": 2.8090303685489716e-05, "loss": 7.5219, "step": 1539400 }, { "epoch": 6.271584669957602, "grad_norm": 14.183826446533203, "learning_rate": 2.8030720929405435e-05, "loss": 7.5073, "step": 1539500 }, { "epoch": 6.271992047980984, "grad_norm": 13.633611679077148, "learning_rate": 2.7971236101164614e-05, "loss": 7.507, "step": 1539600 }, { "epoch": 6.272399426004365, "grad_norm": 35.740501403808594, "learning_rate": 2.791184920311897e-05, "loss": 7.5294, "step": 1539700 }, { "epoch": 6.272806804027747, "grad_norm": 9.583195686340332, "learning_rate": 2.7852560237616386e-05, "loss": 7.501, "step": 1539800 }, { "epoch": 6.273214182051128, "grad_norm": 7.148852825164795, "learning_rate": 2.7793369207000513e-05, "loss": 7.5262, "step": 1539900 }, { "epoch": 6.27362156007451, "grad_norm": 8.172830581665039, "learning_rate": 2.773427611361206e-05, "loss": 7.5402, "step": 1540000 }, { "epoch": 6.27362156007451, "eval_MaskedAccuracy": 0.5130502717006363, "eval_loss": 1.5896717309951782, "eval_runtime": 162.3102, "eval_samples_per_second": 391.078, "eval_steps_per_second": 1.528, "step": 1540000 }, { "epoch": 6.274028938097891, "grad_norm": 7.16539192199707, "learning_rate": 2.7675280959786952e-05, "loss": 7.5082, "step": 1540100 }, { "epoch": 6.274436316121272, "grad_norm": 10.530797958374023, "learning_rate": 2.761638374785754e-05, "loss": 7.5043, "step": 1540200 }, { "epoch": 6.2748436941446535, "grad_norm": 4.062539577484131, "learning_rate": 2.755758448015251e-05, "loss": 7.529, "step": 1540300 }, { "epoch": 6.275251072168035, "grad_norm": 4.5090179443359375, "learning_rate": 2.7498883158996543e-05, "loss": 7.526, "step": 1540400 }, { "epoch": 6.2756584501914165, "grad_norm": 5.734238147735596, "learning_rate": 2.7440279786710342e-05, "loss": 7.5248, "step": 1540500 }, { "epoch": 6.276065828214798, "grad_norm": 4.6185784339904785, "learning_rate": 2.7381774365610962e-05, "loss": 7.5338, "step": 1540600 }, { "epoch": 6.27647320623818, "grad_norm": 6.488243103027344, "learning_rate": 2.732336689801083e-05, "loss": 7.5309, "step": 1540700 }, { "epoch": 6.276880584261561, "grad_norm": 16.597230911254883, "learning_rate": 2.7265057386219804e-05, "loss": 7.5216, "step": 1540800 }, { "epoch": 6.277287962284943, "grad_norm": 13.147786140441895, "learning_rate": 2.7206845832542878e-05, "loss": 7.4982, "step": 1540900 }, { "epoch": 6.277695340308324, "grad_norm": 3.276792526245117, "learning_rate": 2.7148732239281367e-05, "loss": 7.506, "step": 1541000 }, { "epoch": 6.277695340308324, "eval_MaskedAccuracy": 0.5135812078446895, "eval_loss": 1.5850106477737427, "eval_runtime": 162.1925, "eval_samples_per_second": 391.362, "eval_steps_per_second": 1.529, "step": 1541000 }, { "epoch": 6.278102718331706, "grad_norm": 14.958015441894531, "learning_rate": 2.7090716608733243e-05, "loss": 7.5147, "step": 1541100 }, { "epoch": 6.278510096355087, "grad_norm": 4.234958648681641, "learning_rate": 2.7032798943191818e-05, "loss": 7.4796, "step": 1541200 }, { "epoch": 6.278917474378469, "grad_norm": 4.868026256561279, "learning_rate": 2.697497924494681e-05, "loss": 7.5059, "step": 1541300 }, { "epoch": 6.2793248524018495, "grad_norm": 10.32494831085205, "learning_rate": 2.6917257516284467e-05, "loss": 7.5118, "step": 1541400 }, { "epoch": 6.279732230425231, "grad_norm": 4.418272495269775, "learning_rate": 2.685963375948653e-05, "loss": 7.5131, "step": 1541500 }, { "epoch": 6.2801396084486125, "grad_norm": 6.862038612365723, "learning_rate": 2.6802107976831325e-05, "loss": 7.5109, "step": 1541600 }, { "epoch": 6.280546986471994, "grad_norm": 2.388366460800171, "learning_rate": 2.6744680170592958e-05, "loss": 7.4737, "step": 1541700 }, { "epoch": 6.280954364495376, "grad_norm": 2.647691488265991, "learning_rate": 2.6687350343042097e-05, "loss": 7.541, "step": 1541800 }, { "epoch": 6.281361742518757, "grad_norm": 6.9391374588012695, "learning_rate": 2.663011849644541e-05, "loss": 7.5037, "step": 1541900 }, { "epoch": 6.281769120542139, "grad_norm": 6.617556571960449, "learning_rate": 2.6572984633065148e-05, "loss": 7.5134, "step": 1542000 }, { "epoch": 6.281769120542139, "eval_MaskedAccuracy": 0.5137466046271422, "eval_loss": 1.5850129127502441, "eval_runtime": 164.3074, "eval_samples_per_second": 386.325, "eval_steps_per_second": 1.509, "step": 1542000 }, { "epoch": 6.28217649856552, "grad_norm": 3.4860963821411133, "learning_rate": 2.6515948755160534e-05, "loss": 7.5133, "step": 1542100 }, { "epoch": 6.282583876588902, "grad_norm": 8.580883026123047, "learning_rate": 2.645901086498634e-05, "loss": 7.5207, "step": 1542200 }, { "epoch": 6.282991254612283, "grad_norm": 3.5101170539855957, "learning_rate": 2.640217096479354e-05, "loss": 7.5088, "step": 1542300 }, { "epoch": 6.283398632635665, "grad_norm": 2.9503724575042725, "learning_rate": 2.634542905682933e-05, "loss": 7.5506, "step": 1542400 }, { "epoch": 6.283806010659046, "grad_norm": 6.986172676086426, "learning_rate": 2.628878514333717e-05, "loss": 7.5387, "step": 1542500 }, { "epoch": 6.284213388682428, "grad_norm": 4.252118110656738, "learning_rate": 2.6232239226556366e-05, "loss": 7.5069, "step": 1542600 }, { "epoch": 6.2846207667058085, "grad_norm": 2.297511100769043, "learning_rate": 2.61757913087226e-05, "loss": 7.4889, "step": 1542700 }, { "epoch": 6.28502814472919, "grad_norm": 3.1588618755340576, "learning_rate": 2.6119441392067372e-05, "loss": 7.4981, "step": 1542800 }, { "epoch": 6.285435522752572, "grad_norm": 6.755675792694092, "learning_rate": 2.6063189478818877e-05, "loss": 7.5002, "step": 1542900 }, { "epoch": 6.285842900775953, "grad_norm": 6.133284091949463, "learning_rate": 2.600703557120061e-05, "loss": 7.5354, "step": 1543000 }, { "epoch": 6.285842900775953, "eval_MaskedAccuracy": 0.5135435694030508, "eval_loss": 1.5861613750457764, "eval_runtime": 166.0879, "eval_samples_per_second": 382.183, "eval_steps_per_second": 1.493, "step": 1543000 }, { "epoch": 6.286250278799335, "grad_norm": 19.410377502441406, "learning_rate": 2.595097967143269e-05, "loss": 7.4911, "step": 1543100 }, { "epoch": 6.286657656822716, "grad_norm": 22.64906883239746, "learning_rate": 2.5895021781731402e-05, "loss": 7.5217, "step": 1543200 }, { "epoch": 6.287065034846098, "grad_norm": 4.644437789916992, "learning_rate": 2.5839161904309486e-05, "loss": 7.4868, "step": 1543300 }, { "epoch": 6.287472412869479, "grad_norm": 5.192335605621338, "learning_rate": 2.5783400041374586e-05, "loss": 7.5224, "step": 1543400 }, { "epoch": 6.287879790892861, "grad_norm": 3.651285409927368, "learning_rate": 2.572773619513192e-05, "loss": 7.5123, "step": 1543500 }, { "epoch": 6.288287168916242, "grad_norm": 3.404053211212158, "learning_rate": 2.567217036778169e-05, "loss": 7.5286, "step": 1543600 }, { "epoch": 6.288694546939624, "grad_norm": 11.100665092468262, "learning_rate": 2.5616702561521106e-05, "loss": 7.5262, "step": 1543700 }, { "epoch": 6.289101924963005, "grad_norm": 6.327223300933838, "learning_rate": 2.556133277854287e-05, "loss": 7.5167, "step": 1543800 }, { "epoch": 6.289509302986387, "grad_norm": 6.676726818084717, "learning_rate": 2.550606102103614e-05, "loss": 7.5107, "step": 1543900 }, { "epoch": 6.2899166810097675, "grad_norm": 3.6032087802886963, "learning_rate": 2.5450887291186158e-05, "loss": 7.5253, "step": 1544000 }, { "epoch": 6.2899166810097675, "eval_MaskedAccuracy": 0.5136579686421746, "eval_loss": 1.5964300632476807, "eval_runtime": 153.8311, "eval_samples_per_second": 412.634, "eval_steps_per_second": 1.612, "step": 1544000 }, { "epoch": 6.290324059033149, "grad_norm": 4.43002986907959, "learning_rate": 2.5395811591173996e-05, "loss": 7.5194, "step": 1544100 }, { "epoch": 6.290731437056531, "grad_norm": 3.6076090335845947, "learning_rate": 2.5340833923177492e-05, "loss": 7.5204, "step": 1544200 }, { "epoch": 6.291138815079912, "grad_norm": 5.127583980560303, "learning_rate": 2.5285954289369706e-05, "loss": 7.518, "step": 1544300 }, { "epoch": 6.291546193103294, "grad_norm": 3.441530227661133, "learning_rate": 2.5231172691920852e-05, "loss": 7.4968, "step": 1544400 }, { "epoch": 6.291953571126675, "grad_norm": 2.1325786113739014, "learning_rate": 2.517648913299621e-05, "loss": 7.492, "step": 1544500 }, { "epoch": 6.292360949150057, "grad_norm": 6.765881061553955, "learning_rate": 2.5121903614757894e-05, "loss": 7.5395, "step": 1544600 }, { "epoch": 6.292768327173438, "grad_norm": 3.79774808883667, "learning_rate": 2.506741613936427e-05, "loss": 7.5131, "step": 1544700 }, { "epoch": 6.29317570519682, "grad_norm": 3.0351104736328125, "learning_rate": 2.5013026708969013e-05, "loss": 7.5093, "step": 1544800 }, { "epoch": 6.293583083220201, "grad_norm": 6.138299465179443, "learning_rate": 2.4958735325722757e-05, "loss": 7.5447, "step": 1544900 }, { "epoch": 6.293990461243583, "grad_norm": 7.833155155181885, "learning_rate": 2.490454199177196e-05, "loss": 7.5159, "step": 1545000 }, { "epoch": 6.293990461243583, "eval_MaskedAccuracy": 0.5137496808109812, "eval_loss": 1.5737380981445312, "eval_runtime": 170.0133, "eval_samples_per_second": 373.359, "eval_steps_per_second": 1.459, "step": 1545000 }, { "epoch": 6.294397839266964, "grad_norm": 2.2542214393615723, "learning_rate": 2.485044670925896e-05, "loss": 7.5076, "step": 1545100 }, { "epoch": 6.294805217290345, "grad_norm": 3.1822574138641357, "learning_rate": 2.479644948032277e-05, "loss": 7.5026, "step": 1545200 }, { "epoch": 6.295212595313727, "grad_norm": 5.585951328277588, "learning_rate": 2.4742550307097957e-05, "loss": 7.555, "step": 1545300 }, { "epoch": 6.295619973337108, "grad_norm": 15.215363502502441, "learning_rate": 2.468874919171529e-05, "loss": 7.5263, "step": 1545400 }, { "epoch": 6.29602735136049, "grad_norm": 8.550811767578125, "learning_rate": 2.4635046136302315e-05, "loss": 7.5195, "step": 1545500 }, { "epoch": 6.296434729383871, "grad_norm": 4.234350681304932, "learning_rate": 2.458144114298169e-05, "loss": 7.528, "step": 1545600 }, { "epoch": 6.296842107407253, "grad_norm": 3.0124824047088623, "learning_rate": 2.4527934213873043e-05, "loss": 7.5405, "step": 1545700 }, { "epoch": 6.297249485430634, "grad_norm": 3.5371623039245605, "learning_rate": 2.4474525351091507e-05, "loss": 7.5232, "step": 1545800 }, { "epoch": 6.297656863454016, "grad_norm": 4.27669620513916, "learning_rate": 2.4421214556748977e-05, "loss": 7.5253, "step": 1545900 }, { "epoch": 6.298064241477397, "grad_norm": 15.751446723937988, "learning_rate": 2.4368001832952584e-05, "loss": 7.5517, "step": 1546000 }, { "epoch": 6.298064241477397, "eval_MaskedAccuracy": 0.5144174046331489, "eval_loss": 1.5825165510177612, "eval_runtime": 154.0075, "eval_samples_per_second": 412.162, "eval_steps_per_second": 1.61, "step": 1546000 }, { "epoch": 6.298471619500779, "grad_norm": 2.991913080215454, "learning_rate": 2.4314887181806695e-05, "loss": 7.5131, "step": 1546100 }, { "epoch": 6.29887899752416, "grad_norm": 4.033088207244873, "learning_rate": 2.426187060541071e-05, "loss": 7.551, "step": 1546200 }, { "epoch": 6.299286375547542, "grad_norm": 2.9316580295562744, "learning_rate": 2.420895210586127e-05, "loss": 7.5507, "step": 1546300 }, { "epoch": 6.2996937535709225, "grad_norm": 17.982954025268555, "learning_rate": 2.4156131685249995e-05, "loss": 7.5173, "step": 1546400 }, { "epoch": 6.300101131594304, "grad_norm": 2.853330135345459, "learning_rate": 2.4103409345665162e-05, "loss": 7.5133, "step": 1546500 }, { "epoch": 6.300508509617686, "grad_norm": 16.396175384521484, "learning_rate": 2.4050785089191555e-05, "loss": 7.5211, "step": 1546600 }, { "epoch": 6.300915887641067, "grad_norm": 3.7255969047546387, "learning_rate": 2.399825891790913e-05, "loss": 7.4858, "step": 1546700 }, { "epoch": 6.301323265664449, "grad_norm": 4.791287422180176, "learning_rate": 2.39458308338951e-05, "loss": 7.5008, "step": 1546800 }, { "epoch": 6.30173064368783, "grad_norm": 4.549746513366699, "learning_rate": 2.389350083922225e-05, "loss": 7.5088, "step": 1546900 }, { "epoch": 6.302138021711212, "grad_norm": 4.2029924392700195, "learning_rate": 2.384126893595865e-05, "loss": 7.5219, "step": 1547000 }, { "epoch": 6.302138021711212, "eval_MaskedAccuracy": 0.5134780985385201, "eval_loss": 1.5871491432189941, "eval_runtime": 183.8151, "eval_samples_per_second": 345.325, "eval_steps_per_second": 1.349, "step": 1547000 }, { "epoch": 6.302545399734593, "grad_norm": 3.2818925380706787, "learning_rate": 2.3789135126170228e-05, "loss": 7.52, "step": 1547100 }, { "epoch": 6.302952777757975, "grad_norm": 4.296736240386963, "learning_rate": 2.3737099411917534e-05, "loss": 7.5068, "step": 1547200 }, { "epoch": 6.303360155781356, "grad_norm": 14.635024070739746, "learning_rate": 2.368516179525814e-05, "loss": 7.4914, "step": 1547300 }, { "epoch": 6.303767533804738, "grad_norm": 3.2431273460388184, "learning_rate": 2.3633322278245203e-05, "loss": 7.5023, "step": 1547400 }, { "epoch": 6.304174911828119, "grad_norm": 3.529853343963623, "learning_rate": 2.3581580862928547e-05, "loss": 7.5368, "step": 1547500 }, { "epoch": 6.304582289851501, "grad_norm": 2.9401190280914307, "learning_rate": 2.3529937551353235e-05, "loss": 7.5288, "step": 1547600 }, { "epoch": 6.304989667874882, "grad_norm": 5.547223091125488, "learning_rate": 2.3478392345561626e-05, "loss": 7.4997, "step": 1547700 }, { "epoch": 6.305397045898263, "grad_norm": 4.2137932777404785, "learning_rate": 2.3426945247591316e-05, "loss": 7.496, "step": 1547800 }, { "epoch": 6.305804423921645, "grad_norm": 3.1072821617126465, "learning_rate": 2.3375596259476027e-05, "loss": 7.5234, "step": 1547900 }, { "epoch": 6.306211801945026, "grad_norm": 2.5990145206451416, "learning_rate": 2.332434538324618e-05, "loss": 7.498, "step": 1548000 }, { "epoch": 6.306211801945026, "eval_MaskedAccuracy": 0.5140046198565413, "eval_loss": 1.5883818864822388, "eval_runtime": 158.6248, "eval_samples_per_second": 400.164, "eval_steps_per_second": 1.563, "step": 1548000 }, { "epoch": 6.306619179968408, "grad_norm": 3.3525407314300537, "learning_rate": 2.3273192620928023e-05, "loss": 7.5136, "step": 1548100 }, { "epoch": 6.307026557991789, "grad_norm": 20.4449520111084, "learning_rate": 2.3222137974543414e-05, "loss": 7.5274, "step": 1548200 }, { "epoch": 6.307433936015171, "grad_norm": 3.1215569972991943, "learning_rate": 2.3171181446111346e-05, "loss": 7.5342, "step": 1548300 }, { "epoch": 6.307841314038552, "grad_norm": 3.0884578227996826, "learning_rate": 2.3120323037646223e-05, "loss": 7.5303, "step": 1548400 }, { "epoch": 6.308248692061934, "grad_norm": 3.8087260723114014, "learning_rate": 2.3069562751158775e-05, "loss": 7.56, "step": 1548500 }, { "epoch": 6.308656070085315, "grad_norm": 8.330710411071777, "learning_rate": 2.3018900588655905e-05, "loss": 7.4897, "step": 1548600 }, { "epoch": 6.309063448108697, "grad_norm": 4.615542888641357, "learning_rate": 2.2968336552140338e-05, "loss": 7.537, "step": 1548700 }, { "epoch": 6.309470826132078, "grad_norm": 4.869720458984375, "learning_rate": 2.2917870643611423e-05, "loss": 7.5145, "step": 1548800 }, { "epoch": 6.30987820415546, "grad_norm": 3.726224899291992, "learning_rate": 2.2867502865064127e-05, "loss": 7.4863, "step": 1548900 }, { "epoch": 6.310285582178841, "grad_norm": 3.5653560161590576, "learning_rate": 2.2817233218489566e-05, "loss": 7.5168, "step": 1549000 }, { "epoch": 6.310285582178841, "eval_MaskedAccuracy": 0.5135034707953396, "eval_loss": 1.5824087858200073, "eval_runtime": 160.2875, "eval_samples_per_second": 396.013, "eval_steps_per_second": 1.547, "step": 1549000 }, { "epoch": 6.310692960202222, "grad_norm": 3.5148019790649414, "learning_rate": 2.2767061705875443e-05, "loss": 7.5085, "step": 1549100 }, { "epoch": 6.311100338225604, "grad_norm": 4.2641119956970215, "learning_rate": 2.271698832920565e-05, "loss": 7.5447, "step": 1549200 }, { "epoch": 6.311507716248985, "grad_norm": 4.688575744628906, "learning_rate": 2.2667013090459345e-05, "loss": 7.528, "step": 1549300 }, { "epoch": 6.311915094272367, "grad_norm": 5.592088222503662, "learning_rate": 2.2617135991612394e-05, "loss": 7.5138, "step": 1549400 }, { "epoch": 6.312322472295748, "grad_norm": 3.5355324745178223, "learning_rate": 2.2567357034636735e-05, "loss": 7.5125, "step": 1549500 }, { "epoch": 6.31272985031913, "grad_norm": 3.9834372997283936, "learning_rate": 2.2517676221500475e-05, "loss": 7.5466, "step": 1549600 }, { "epoch": 6.313137228342511, "grad_norm": 3.0744802951812744, "learning_rate": 2.246809355416788e-05, "loss": 7.5269, "step": 1549700 }, { "epoch": 6.313544606365893, "grad_norm": 5.867956161499023, "learning_rate": 2.2418609034598952e-05, "loss": 7.5172, "step": 1549800 }, { "epoch": 6.313951984389274, "grad_norm": 4.516674995422363, "learning_rate": 2.2369222664750145e-05, "loss": 7.5246, "step": 1549900 }, { "epoch": 6.314359362412656, "grad_norm": 3.641747236251831, "learning_rate": 2.2319934446574075e-05, "loss": 7.5129, "step": 1550000 }, { "epoch": 6.314359362412656, "eval_MaskedAccuracy": 0.5141479137000701, "eval_loss": 1.585058331489563, "eval_runtime": 177.9819, "eval_samples_per_second": 356.643, "eval_steps_per_second": 1.393, "step": 1550000 }, { "epoch": 6.3147667404360375, "grad_norm": 3.76349139213562, "learning_rate": 2.2270744382019154e-05, "loss": 7.4931, "step": 1550100 }, { "epoch": 6.315174118459418, "grad_norm": 2.777132511138916, "learning_rate": 2.2221652473030475e-05, "loss": 7.5116, "step": 1550200 }, { "epoch": 6.3155814964828, "grad_norm": 21.24774742126465, "learning_rate": 2.2172658721548655e-05, "loss": 7.517, "step": 1550300 }, { "epoch": 6.315988874506181, "grad_norm": 11.929756164550781, "learning_rate": 2.2123763129510794e-05, "loss": 7.5168, "step": 1550400 }, { "epoch": 6.316396252529563, "grad_norm": 3.7070703506469727, "learning_rate": 2.2074965698849787e-05, "loss": 7.4922, "step": 1550500 }, { "epoch": 6.316803630552944, "grad_norm": 11.011404037475586, "learning_rate": 2.202626643149497e-05, "loss": 7.5135, "step": 1550600 }, { "epoch": 6.317211008576326, "grad_norm": 12.582701683044434, "learning_rate": 2.1977665329371752e-05, "loss": 7.5118, "step": 1550700 }, { "epoch": 6.317618386599707, "grad_norm": 12.216609001159668, "learning_rate": 2.192916239440169e-05, "loss": 7.5235, "step": 1550800 }, { "epoch": 6.318025764623089, "grad_norm": 7.560016632080078, "learning_rate": 2.1880757628501945e-05, "loss": 7.516, "step": 1550900 }, { "epoch": 6.31843314264647, "grad_norm": 15.873871803283691, "learning_rate": 2.1832451033586595e-05, "loss": 7.5127, "step": 1551000 }, { "epoch": 6.31843314264647, "eval_MaskedAccuracy": 0.5135422715831626, "eval_loss": 1.5892564058303833, "eval_runtime": 157.5923, "eval_samples_per_second": 402.786, "eval_steps_per_second": 1.574, "step": 1551000 }, { "epoch": 6.318840520669852, "grad_norm": 4.224386215209961, "learning_rate": 2.1784242611565508e-05, "loss": 7.5359, "step": 1551100 }, { "epoch": 6.3192478986932334, "grad_norm": 14.862460136413574, "learning_rate": 2.1736132364344478e-05, "loss": 7.5073, "step": 1551200 }, { "epoch": 6.319655276716615, "grad_norm": 13.30049991607666, "learning_rate": 2.1688120293825685e-05, "loss": 7.4914, "step": 1551300 }, { "epoch": 6.320062654739996, "grad_norm": 6.62608003616333, "learning_rate": 2.164020640190713e-05, "loss": 7.5474, "step": 1551400 }, { "epoch": 6.320470032763377, "grad_norm": 8.579277992248535, "learning_rate": 2.1592390690483152e-05, "loss": 7.5181, "step": 1551500 }, { "epoch": 6.320877410786759, "grad_norm": 6.059241771697998, "learning_rate": 2.154467316144405e-05, "loss": 7.4969, "step": 1551600 }, { "epoch": 6.32128478881014, "grad_norm": 12.62930965423584, "learning_rate": 2.1497053816676465e-05, "loss": 7.5222, "step": 1551700 }, { "epoch": 6.321692166833522, "grad_norm": 8.592183113098145, "learning_rate": 2.1449532658063176e-05, "loss": 7.5298, "step": 1551800 }, { "epoch": 6.322099544856903, "grad_norm": 11.156678199768066, "learning_rate": 2.1402109687482526e-05, "loss": 7.505, "step": 1551900 }, { "epoch": 6.322506922880285, "grad_norm": 7.913949489593506, "learning_rate": 2.1354784906810086e-05, "loss": 7.5041, "step": 1552000 }, { "epoch": 6.322506922880285, "eval_MaskedAccuracy": 0.5137276939288936, "eval_loss": 1.5954068899154663, "eval_runtime": 157.1305, "eval_samples_per_second": 403.97, "eval_steps_per_second": 1.578, "step": 1552000 }, { "epoch": 6.322914300903666, "grad_norm": 5.202408790588379, "learning_rate": 2.130755831791617e-05, "loss": 7.4768, "step": 1552100 }, { "epoch": 6.323321678927048, "grad_norm": 9.220905303955078, "learning_rate": 2.1260429922668334e-05, "loss": 7.53, "step": 1552200 }, { "epoch": 6.323729056950429, "grad_norm": 8.458250045776367, "learning_rate": 2.1213399722929684e-05, "loss": 7.4975, "step": 1552300 }, { "epoch": 6.324136434973811, "grad_norm": 21.704702377319336, "learning_rate": 2.1166467720559447e-05, "loss": 7.5063, "step": 1552400 }, { "epoch": 6.3245438129971925, "grad_norm": 5.595561504364014, "learning_rate": 2.1119633917413532e-05, "loss": 7.5118, "step": 1552500 }, { "epoch": 6.324951191020574, "grad_norm": 16.391130447387695, "learning_rate": 2.1072898315342825e-05, "loss": 7.486, "step": 1552600 }, { "epoch": 6.325358569043955, "grad_norm": 17.90876007080078, "learning_rate": 2.1026260916195762e-05, "loss": 7.5579, "step": 1552700 }, { "epoch": 6.325765947067336, "grad_norm": 3.0364785194396973, "learning_rate": 2.0979721721815476e-05, "loss": 7.5202, "step": 1552800 }, { "epoch": 6.326173325090718, "grad_norm": 2.3322737216949463, "learning_rate": 2.093328073404243e-05, "loss": 7.5142, "step": 1552900 }, { "epoch": 6.326580703114099, "grad_norm": 7.759130477905273, "learning_rate": 2.0886937954712498e-05, "loss": 7.499, "step": 1553000 }, { "epoch": 6.326580703114099, "eval_MaskedAccuracy": 0.5136656005610186, "eval_loss": 1.5845599174499512, "eval_runtime": 151.8068, "eval_samples_per_second": 418.137, "eval_steps_per_second": 1.634, "step": 1553000 }, { "epoch": 6.326988081137481, "grad_norm": 5.862680912017822, "learning_rate": 2.0840693385657803e-05, "loss": 7.5128, "step": 1553100 }, { "epoch": 6.327395459160862, "grad_norm": 3.3214495182037354, "learning_rate": 2.0794547028706842e-05, "loss": 7.501, "step": 1553200 }, { "epoch": 6.327802837184244, "grad_norm": 4.727343559265137, "learning_rate": 2.0748498885683912e-05, "loss": 7.5259, "step": 1553300 }, { "epoch": 6.328210215207625, "grad_norm": 12.341339111328125, "learning_rate": 2.0702548958409484e-05, "loss": 7.4982, "step": 1553400 }, { "epoch": 6.328617593231007, "grad_norm": 9.620864868164062, "learning_rate": 2.0656697248700478e-05, "loss": 7.5229, "step": 1553500 }, { "epoch": 6.3290249712543885, "grad_norm": 3.068958044052124, "learning_rate": 2.06109437583692e-05, "loss": 7.5123, "step": 1553600 }, { "epoch": 6.32943234927777, "grad_norm": 27.00047492980957, "learning_rate": 2.05652884892248e-05, "loss": 7.4955, "step": 1553700 }, { "epoch": 6.3298397273011515, "grad_norm": 12.979219436645508, "learning_rate": 2.0519731443072178e-05, "loss": 7.5298, "step": 1553800 }, { "epoch": 6.330247105324533, "grad_norm": 3.2111220359802246, "learning_rate": 2.0474272621712674e-05, "loss": 7.4988, "step": 1553900 }, { "epoch": 6.330654483347914, "grad_norm": 3.862499952316284, "learning_rate": 2.0428912026943186e-05, "loss": 7.524, "step": 1554000 }, { "epoch": 6.330654483347914, "eval_MaskedAccuracy": 0.5132917372360736, "eval_loss": 1.5841280221939087, "eval_runtime": 151.9416, "eval_samples_per_second": 417.766, "eval_steps_per_second": 1.632, "step": 1554000 }, { "epoch": 6.331061861371295, "grad_norm": 8.490737915039062, "learning_rate": 2.0383649660557302e-05, "loss": 7.5266, "step": 1554100 }, { "epoch": 6.331469239394677, "grad_norm": 9.859129905700684, "learning_rate": 2.033848552434442e-05, "loss": 7.525, "step": 1554200 }, { "epoch": 6.331876617418058, "grad_norm": 5.727023601531982, "learning_rate": 2.029341962009012e-05, "loss": 7.5259, "step": 1554300 }, { "epoch": 6.33228399544144, "grad_norm": 11.75688648223877, "learning_rate": 2.024845194957605e-05, "loss": 7.4937, "step": 1554400 }, { "epoch": 6.332691373464821, "grad_norm": 6.944560527801514, "learning_rate": 2.0203582514580015e-05, "loss": 7.5038, "step": 1554500 }, { "epoch": 6.333098751488203, "grad_norm": 6.843235969543457, "learning_rate": 2.0158811316875938e-05, "loss": 7.5152, "step": 1554600 }, { "epoch": 6.333506129511584, "grad_norm": 4.89893102645874, "learning_rate": 2.0114138358234142e-05, "loss": 7.5155, "step": 1554700 }, { "epoch": 6.333913507534966, "grad_norm": 7.306215286254883, "learning_rate": 2.006956364042024e-05, "loss": 7.5191, "step": 1554800 }, { "epoch": 6.3343208855583475, "grad_norm": 5.872547626495361, "learning_rate": 2.0025087165197002e-05, "loss": 7.4958, "step": 1554900 }, { "epoch": 6.334728263581729, "grad_norm": 3.7879750728607178, "learning_rate": 1.9980708934322866e-05, "loss": 7.5321, "step": 1555000 }, { "epoch": 6.334728263581729, "eval_MaskedAccuracy": 0.5140193556513327, "eval_loss": 1.5914322137832642, "eval_runtime": 153.473, "eval_samples_per_second": 413.597, "eval_steps_per_second": 1.616, "step": 1555000 }, { "epoch": 6.335135641605111, "grad_norm": 8.08877182006836, "learning_rate": 1.9936428949551795e-05, "loss": 7.5091, "step": 1555100 }, { "epoch": 6.335543019628491, "grad_norm": 6.396212577819824, "learning_rate": 1.9892247212634958e-05, "loss": 7.5296, "step": 1555200 }, { "epoch": 6.335950397651873, "grad_norm": 5.900701522827148, "learning_rate": 1.984816372531885e-05, "loss": 7.508, "step": 1555300 }, { "epoch": 6.336357775675254, "grad_norm": 3.2566726207733154, "learning_rate": 1.9804178489346125e-05, "loss": 7.5052, "step": 1555400 }, { "epoch": 6.336765153698636, "grad_norm": 3.6948935985565186, "learning_rate": 1.9760291506456324e-05, "loss": 7.5557, "step": 1555500 }, { "epoch": 6.337172531722017, "grad_norm": 6.9162068367004395, "learning_rate": 1.9716502778384026e-05, "loss": 7.5245, "step": 1555600 }, { "epoch": 6.337579909745399, "grad_norm": 4.947198390960693, "learning_rate": 1.9672812306860425e-05, "loss": 7.5214, "step": 1555700 }, { "epoch": 6.33798728776878, "grad_norm": 10.452269554138184, "learning_rate": 1.9629220093613275e-05, "loss": 7.5273, "step": 1555800 }, { "epoch": 6.338394665792162, "grad_norm": 4.318188667297363, "learning_rate": 1.9585726140365742e-05, "loss": 7.4954, "step": 1555900 }, { "epoch": 6.3388020438155435, "grad_norm": 13.407175064086914, "learning_rate": 1.954233044883721e-05, "loss": 7.5215, "step": 1556000 }, { "epoch": 6.3388020438155435, "eval_MaskedAccuracy": 0.5139136207573256, "eval_loss": 1.5885847806930542, "eval_runtime": 152.5531, "eval_samples_per_second": 416.091, "eval_steps_per_second": 1.626, "step": 1556000 }, { "epoch": 6.339209421838925, "grad_norm": 4.189796447753906, "learning_rate": 1.9499033020743675e-05, "loss": 7.4855, "step": 1556100 }, { "epoch": 6.3396167998623065, "grad_norm": 3.5075576305389404, "learning_rate": 1.9455833857796718e-05, "loss": 7.5178, "step": 1556200 }, { "epoch": 6.340024177885688, "grad_norm": 3.9251222610473633, "learning_rate": 1.941273296170434e-05, "loss": 7.5157, "step": 1556300 }, { "epoch": 6.340431555909069, "grad_norm": 5.285815715789795, "learning_rate": 1.9369730334170382e-05, "loss": 7.5364, "step": 1556400 }, { "epoch": 6.34083893393245, "grad_norm": 4.4611496925354, "learning_rate": 1.9326825976895263e-05, "loss": 7.5316, "step": 1556500 }, { "epoch": 6.341246311955832, "grad_norm": 3.0326576232910156, "learning_rate": 1.928401989157483e-05, "loss": 7.5232, "step": 1556600 }, { "epoch": 6.341653689979213, "grad_norm": 8.034393310546875, "learning_rate": 1.9241312079901564e-05, "loss": 7.4537, "step": 1556700 }, { "epoch": 6.342061068002595, "grad_norm": 7.434968948364258, "learning_rate": 1.9198702543564097e-05, "loss": 7.5038, "step": 1556800 }, { "epoch": 6.342468446025976, "grad_norm": 5.263147354125977, "learning_rate": 1.9156191284246835e-05, "loss": 7.523, "step": 1556900 }, { "epoch": 6.342875824049358, "grad_norm": 8.264650344848633, "learning_rate": 1.9113778303630574e-05, "loss": 7.4962, "step": 1557000 }, { "epoch": 6.342875824049358, "eval_MaskedAccuracy": 0.5140052365365326, "eval_loss": 1.5736886262893677, "eval_runtime": 153.4389, "eval_samples_per_second": 413.689, "eval_steps_per_second": 1.616, "step": 1557000 }, { "epoch": 6.343283202072739, "grad_norm": 4.549633026123047, "learning_rate": 1.9071463603392052e-05, "loss": 7.5306, "step": 1557100 }, { "epoch": 6.343690580096121, "grad_norm": 6.143858432769775, "learning_rate": 1.9029247185204394e-05, "loss": 7.516, "step": 1557200 }, { "epoch": 6.3440979581195025, "grad_norm": 5.988174915313721, "learning_rate": 1.898712905073646e-05, "loss": 7.4866, "step": 1557300 }, { "epoch": 6.344505336142884, "grad_norm": 7.643118858337402, "learning_rate": 1.8945109201653615e-05, "loss": 7.5416, "step": 1557400 }, { "epoch": 6.344912714166266, "grad_norm": 6.2892656326293945, "learning_rate": 1.8903187639616777e-05, "loss": 7.5139, "step": 1557500 }, { "epoch": 6.345320092189647, "grad_norm": 3.603832960128784, "learning_rate": 1.886136436628375e-05, "loss": 7.4909, "step": 1557600 }, { "epoch": 6.345727470213028, "grad_norm": 5.11536979675293, "learning_rate": 1.881963938330741e-05, "loss": 7.5166, "step": 1557700 }, { "epoch": 6.346134848236409, "grad_norm": 45.51047134399414, "learning_rate": 1.877801269233813e-05, "loss": 7.5349, "step": 1557800 }, { "epoch": 6.346542226259791, "grad_norm": 33.30308532714844, "learning_rate": 1.8736484295020995e-05, "loss": 7.5352, "step": 1557900 }, { "epoch": 6.346949604283172, "grad_norm": 16.877361297607422, "learning_rate": 1.869505419299835e-05, "loss": 7.5148, "step": 1558000 }, { "epoch": 6.346949604283172, "eval_MaskedAccuracy": 0.5138252345160843, "eval_loss": 1.5794929265975952, "eval_runtime": 153.5837, "eval_samples_per_second": 413.299, "eval_steps_per_second": 1.615, "step": 1558000 }, { "epoch": 6.347356982306554, "grad_norm": 26.333221435546875, "learning_rate": 1.865372238790754e-05, "loss": 7.5101, "step": 1558100 }, { "epoch": 6.347764360329935, "grad_norm": 4.551407814025879, "learning_rate": 1.8612488881383453e-05, "loss": 7.5327, "step": 1558200 }, { "epoch": 6.348171738353317, "grad_norm": 5.632599353790283, "learning_rate": 1.857135367505535e-05, "loss": 7.502, "step": 1558300 }, { "epoch": 6.3485791163766985, "grad_norm": 8.501633644104004, "learning_rate": 1.8530316770550363e-05, "loss": 7.5343, "step": 1558400 }, { "epoch": 6.34898649440008, "grad_norm": 4.900854587554932, "learning_rate": 1.8489378169490323e-05, "loss": 7.5467, "step": 1558500 }, { "epoch": 6.3493938724234615, "grad_norm": 13.692998886108398, "learning_rate": 1.8448537873493815e-05, "loss": 7.5175, "step": 1558600 }, { "epoch": 6.349801250446843, "grad_norm": 4.244467258453369, "learning_rate": 1.840779588417597e-05, "loss": 7.5005, "step": 1558700 }, { "epoch": 6.350208628470225, "grad_norm": 25.029279708862305, "learning_rate": 1.8367152203147006e-05, "loss": 7.5113, "step": 1558800 }, { "epoch": 6.350616006493606, "grad_norm": 4.887223243713379, "learning_rate": 1.8326606832013838e-05, "loss": 7.5298, "step": 1558900 }, { "epoch": 6.351023384516987, "grad_norm": 3.9406700134277344, "learning_rate": 1.8286159772379687e-05, "loss": 7.5382, "step": 1559000 }, { "epoch": 6.351023384516987, "eval_MaskedAccuracy": 0.5132906179554274, "eval_loss": 1.5843143463134766, "eval_runtime": 158.9723, "eval_samples_per_second": 399.29, "eval_steps_per_second": 1.56, "step": 1559000 }, { "epoch": 6.351430762540368, "grad_norm": 6.848637104034424, "learning_rate": 1.8245811025843708e-05, "loss": 7.5133, "step": 1559100 }, { "epoch": 6.35183814056375, "grad_norm": 8.351176261901855, "learning_rate": 1.820556059400087e-05, "loss": 7.5041, "step": 1559200 }, { "epoch": 6.352245518587131, "grad_norm": 2.5494906902313232, "learning_rate": 1.8165408478442223e-05, "loss": 7.5183, "step": 1559300 }, { "epoch": 6.352652896610513, "grad_norm": 4.4983038902282715, "learning_rate": 1.812535468075584e-05, "loss": 7.5481, "step": 1559400 }, { "epoch": 6.3530602746338944, "grad_norm": 3.0655312538146973, "learning_rate": 1.8085399202524756e-05, "loss": 7.5151, "step": 1559500 }, { "epoch": 6.353467652657276, "grad_norm": 4.258170127868652, "learning_rate": 1.8045542045328738e-05, "loss": 7.5208, "step": 1559600 }, { "epoch": 6.3538750306806575, "grad_norm": 5.146317005157471, "learning_rate": 1.800578321074391e-05, "loss": 7.51, "step": 1559700 }, { "epoch": 6.354282408704039, "grad_norm": 7.607929229736328, "learning_rate": 1.79661227003414e-05, "loss": 7.5159, "step": 1559800 }, { "epoch": 6.354689786727421, "grad_norm": 4.449549674987793, "learning_rate": 1.792656051568987e-05, "loss": 7.4969, "step": 1559900 }, { "epoch": 6.355097164750802, "grad_norm": 3.163651704788208, "learning_rate": 1.7887096658353308e-05, "loss": 7.5189, "step": 1560000 }, { "epoch": 6.355097164750802, "eval_MaskedAccuracy": 0.513831468785368, "eval_loss": 1.5857032537460327, "eval_runtime": 161.4409, "eval_samples_per_second": 393.184, "eval_steps_per_second": 1.536, "step": 1560000 }, { "epoch": 6.355504542774184, "grad_norm": 6.928320407867432, "learning_rate": 1.7847731129891757e-05, "loss": 7.5037, "step": 1560100 }, { "epoch": 6.355911920797564, "grad_norm": 5.489813804626465, "learning_rate": 1.7808463931861707e-05, "loss": 7.5059, "step": 1560200 }, { "epoch": 6.356319298820946, "grad_norm": 2.625778913497925, "learning_rate": 1.776929506581545e-05, "loss": 7.5187, "step": 1560300 }, { "epoch": 6.356726676844327, "grad_norm": 4.402017593383789, "learning_rate": 1.7730224533301418e-05, "loss": 7.501, "step": 1560400 }, { "epoch": 6.357134054867709, "grad_norm": 9.853277206420898, "learning_rate": 1.7691252335864725e-05, "loss": 7.4903, "step": 1560500 }, { "epoch": 6.35754143289109, "grad_norm": 4.044744968414307, "learning_rate": 1.7652378475045767e-05, "loss": 7.5104, "step": 1560600 }, { "epoch": 6.357948810914472, "grad_norm": 16.034936904907227, "learning_rate": 1.76136029523816e-05, "loss": 7.5028, "step": 1560700 }, { "epoch": 6.3583561889378535, "grad_norm": 11.029666900634766, "learning_rate": 1.757492576940514e-05, "loss": 7.5431, "step": 1560800 }, { "epoch": 6.358763566961235, "grad_norm": 7.335923671722412, "learning_rate": 1.7536346927645733e-05, "loss": 7.4897, "step": 1560900 }, { "epoch": 6.3591709449846165, "grad_norm": 2.46474552154541, "learning_rate": 1.7497866428628268e-05, "loss": 7.5329, "step": 1561000 }, { "epoch": 6.3591709449846165, "eval_MaskedAccuracy": 0.5139506078771523, "eval_loss": 1.5861738920211792, "eval_runtime": 163.5106, "eval_samples_per_second": 388.207, "eval_steps_per_second": 1.517, "step": 1561000 }, { "epoch": 6.359578323007998, "grad_norm": 3.765188455581665, "learning_rate": 1.745948427387403e-05, "loss": 7.5007, "step": 1561100 }, { "epoch": 6.35998570103138, "grad_norm": 3.644688606262207, "learning_rate": 1.7421200464900977e-05, "loss": 7.4808, "step": 1561200 }, { "epoch": 6.360393079054761, "grad_norm": 3.6915690898895264, "learning_rate": 1.738301500322235e-05, "loss": 7.5105, "step": 1561300 }, { "epoch": 6.360800457078142, "grad_norm": 8.23080062866211, "learning_rate": 1.7344927890347818e-05, "loss": 7.4961, "step": 1561400 }, { "epoch": 6.361207835101523, "grad_norm": 2.4154865741729736, "learning_rate": 1.7306939127783145e-05, "loss": 7.5105, "step": 1561500 }, { "epoch": 6.361615213124905, "grad_norm": 4.842697620391846, "learning_rate": 1.726904871703026e-05, "loss": 7.5211, "step": 1561600 }, { "epoch": 6.362022591148286, "grad_norm": 3.899458885192871, "learning_rate": 1.723125665958718e-05, "loss": 7.5516, "step": 1561700 }, { "epoch": 6.362429969171668, "grad_norm": 5.276438236236572, "learning_rate": 1.71935629569483e-05, "loss": 7.5145, "step": 1561800 }, { "epoch": 6.3628373471950495, "grad_norm": 20.220172882080078, "learning_rate": 1.7155967610603337e-05, "loss": 7.5103, "step": 1561900 }, { "epoch": 6.363244725218431, "grad_norm": 8.841887474060059, "learning_rate": 1.7118470622038957e-05, "loss": 7.529, "step": 1562000 }, { "epoch": 6.363244725218431, "eval_MaskedAccuracy": 0.5141052740534775, "eval_loss": 1.5849666595458984, "eval_runtime": 160.3905, "eval_samples_per_second": 395.759, "eval_steps_per_second": 1.546, "step": 1562000 }, { "epoch": 6.3636521032418125, "grad_norm": 15.423996925354004, "learning_rate": 1.708107199273767e-05, "loss": 7.4776, "step": 1562100 }, { "epoch": 6.364059481265194, "grad_norm": 8.583870887756348, "learning_rate": 1.7043771724177823e-05, "loss": 7.4877, "step": 1562200 }, { "epoch": 6.364466859288576, "grad_norm": 7.320174694061279, "learning_rate": 1.7006569817834142e-05, "loss": 7.5218, "step": 1562300 }, { "epoch": 6.364874237311957, "grad_norm": 15.85832405090332, "learning_rate": 1.6969466275177817e-05, "loss": 7.5264, "step": 1562400 }, { "epoch": 6.365281615335339, "grad_norm": 3.109506368637085, "learning_rate": 1.6932461097674995e-05, "loss": 7.4765, "step": 1562500 }, { "epoch": 6.36568899335872, "grad_norm": 2.3691346645355225, "learning_rate": 1.6895554286789353e-05, "loss": 7.5039, "step": 1562600 }, { "epoch": 6.366096371382101, "grad_norm": 10.584104537963867, "learning_rate": 1.6858745843979863e-05, "loss": 7.4992, "step": 1562700 }, { "epoch": 6.366503749405482, "grad_norm": 7.2751946449279785, "learning_rate": 1.6822035770701287e-05, "loss": 7.5276, "step": 1562800 }, { "epoch": 6.366911127428864, "grad_norm": 5.5859761238098145, "learning_rate": 1.6785424068405414e-05, "loss": 7.5109, "step": 1562900 }, { "epoch": 6.367318505452245, "grad_norm": 14.105605125427246, "learning_rate": 1.6748910738539864e-05, "loss": 7.5131, "step": 1563000 }, { "epoch": 6.367318505452245, "eval_MaskedAccuracy": 0.5144510631890555, "eval_loss": 1.588063359260559, "eval_runtime": 162.8593, "eval_samples_per_second": 389.76, "eval_steps_per_second": 1.523, "step": 1563000 }, { "epoch": 6.367725883475627, "grad_norm": 10.73180103302002, "learning_rate": 1.6712495782547813e-05, "loss": 7.4896, "step": 1563100 }, { "epoch": 6.3681332614990085, "grad_norm": 13.278154373168945, "learning_rate": 1.667617920186908e-05, "loss": 7.4768, "step": 1563200 }, { "epoch": 6.36854063952239, "grad_norm": 11.352387428283691, "learning_rate": 1.6639960997939677e-05, "loss": 7.5252, "step": 1563300 }, { "epoch": 6.368948017545772, "grad_norm": 4.686359882354736, "learning_rate": 1.6603841172191113e-05, "loss": 7.4968, "step": 1563400 }, { "epoch": 6.369355395569153, "grad_norm": 9.022072792053223, "learning_rate": 1.6567819726051885e-05, "loss": 7.5334, "step": 1563500 }, { "epoch": 6.369762773592535, "grad_norm": 5.520549774169922, "learning_rate": 1.6531896660945527e-05, "loss": 7.5159, "step": 1563600 }, { "epoch": 6.370170151615916, "grad_norm": 4.639460563659668, "learning_rate": 1.6496071978292727e-05, "loss": 7.5079, "step": 1563700 }, { "epoch": 6.370577529639298, "grad_norm": 2.850416660308838, "learning_rate": 1.6460345679509558e-05, "loss": 7.5264, "step": 1563800 }, { "epoch": 6.370984907662679, "grad_norm": 5.534444808959961, "learning_rate": 1.6424717766008685e-05, "loss": 7.5156, "step": 1563900 }, { "epoch": 6.37139228568606, "grad_norm": 23.55628776550293, "learning_rate": 1.638918823919837e-05, "loss": 7.5402, "step": 1564000 }, { "epoch": 6.37139228568606, "eval_MaskedAccuracy": 0.5138514868274114, "eval_loss": 1.5761414766311646, "eval_runtime": 161.0599, "eval_samples_per_second": 394.114, "eval_steps_per_second": 1.54, "step": 1564000 }, { "epoch": 6.371799663709441, "grad_norm": 8.773727416992188, "learning_rate": 1.635375710048358e-05, "loss": 7.5176, "step": 1564100 }, { "epoch": 6.372207041732823, "grad_norm": 7.016404628753662, "learning_rate": 1.6318424351264843e-05, "loss": 7.5226, "step": 1564200 }, { "epoch": 6.3726144197562045, "grad_norm": 5.481175422668457, "learning_rate": 1.6283189992939363e-05, "loss": 7.4929, "step": 1564300 }, { "epoch": 6.373021797779586, "grad_norm": 10.70868968963623, "learning_rate": 1.624805402689984e-05, "loss": 7.5332, "step": 1564400 }, { "epoch": 6.3734291758029675, "grad_norm": 3.794466972351074, "learning_rate": 1.6213016454535483e-05, "loss": 7.5421, "step": 1564500 }, { "epoch": 6.373836553826349, "grad_norm": 23.68807601928711, "learning_rate": 1.6178077277231848e-05, "loss": 7.5147, "step": 1564600 }, { "epoch": 6.374243931849731, "grad_norm": 6.079192638397217, "learning_rate": 1.6143236496369767e-05, "loss": 7.5127, "step": 1564700 }, { "epoch": 6.374651309873112, "grad_norm": 8.446185111999512, "learning_rate": 1.6108494113326774e-05, "loss": 7.5139, "step": 1564800 }, { "epoch": 6.375058687896494, "grad_norm": 3.0648036003112793, "learning_rate": 1.607385012947678e-05, "loss": 7.4996, "step": 1564900 }, { "epoch": 6.375466065919875, "grad_norm": 4.1956915855407715, "learning_rate": 1.6039304546189004e-05, "loss": 7.5252, "step": 1565000 }, { "epoch": 6.375466065919875, "eval_MaskedAccuracy": 0.5143203620581782, "eval_loss": 1.5797115564346313, "eval_runtime": 152.8501, "eval_samples_per_second": 415.283, "eval_steps_per_second": 1.623, "step": 1565000 }, { "epoch": 6.375873443943257, "grad_norm": 3.805178642272949, "learning_rate": 1.600485736482963e-05, "loss": 7.5215, "step": 1565100 }, { "epoch": 6.376280821966637, "grad_norm": 17.14861488342285, "learning_rate": 1.5970508586760117e-05, "loss": 7.5125, "step": 1565200 }, { "epoch": 6.376688199990019, "grad_norm": 4.253889560699463, "learning_rate": 1.5936258213338865e-05, "loss": 7.5076, "step": 1565300 }, { "epoch": 6.3770955780134, "grad_norm": 8.798266410827637, "learning_rate": 1.590210624591957e-05, "loss": 7.4796, "step": 1565400 }, { "epoch": 6.377502956036782, "grad_norm": 5.30897855758667, "learning_rate": 1.5868052685852874e-05, "loss": 7.5318, "step": 1565500 }, { "epoch": 6.3779103340601635, "grad_norm": 6.551238536834717, "learning_rate": 1.583409753448474e-05, "loss": 7.5349, "step": 1565600 }, { "epoch": 6.378317712083545, "grad_norm": 8.6677885055542, "learning_rate": 1.5800240793157513e-05, "loss": 7.5319, "step": 1565700 }, { "epoch": 6.378725090106927, "grad_norm": 3.4282755851745605, "learning_rate": 1.576648246321022e-05, "loss": 7.5404, "step": 1565800 }, { "epoch": 6.379132468130308, "grad_norm": 5.345614910125732, "learning_rate": 1.5732822545977163e-05, "loss": 7.5257, "step": 1565900 }, { "epoch": 6.37953984615369, "grad_norm": 5.274416923522949, "learning_rate": 1.569926104278932e-05, "loss": 7.5251, "step": 1566000 }, { "epoch": 6.37953984615369, "eval_MaskedAccuracy": 0.5136792658576195, "eval_loss": 1.5815542936325073, "eval_runtime": 150.9756, "eval_samples_per_second": 420.439, "eval_steps_per_second": 1.643, "step": 1566000 }, { "epoch": 6.379947224177071, "grad_norm": 3.6069445610046387, "learning_rate": 1.566579795497325e-05, "loss": 7.5443, "step": 1566100 }, { "epoch": 6.380354602200453, "grad_norm": 3.70088529586792, "learning_rate": 1.563243328385192e-05, "loss": 7.5102, "step": 1566200 }, { "epoch": 6.380761980223834, "grad_norm": 3.7640960216522217, "learning_rate": 1.559916703074499e-05, "loss": 7.512, "step": 1566300 }, { "epoch": 6.381169358247215, "grad_norm": 7.848387241363525, "learning_rate": 1.5565999196966772e-05, "loss": 7.5263, "step": 1566400 }, { "epoch": 6.381576736270596, "grad_norm": 4.667734622955322, "learning_rate": 1.5532929783829444e-05, "loss": 7.5417, "step": 1566500 }, { "epoch": 6.381984114293978, "grad_norm": 7.917548656463623, "learning_rate": 1.5499958792639604e-05, "loss": 7.5363, "step": 1566600 }, { "epoch": 6.3823914923173595, "grad_norm": 3.24745512008667, "learning_rate": 1.546708622470142e-05, "loss": 7.5244, "step": 1566700 }, { "epoch": 6.382798870340741, "grad_norm": 7.438083648681641, "learning_rate": 1.5434312081314263e-05, "loss": 7.5467, "step": 1566800 }, { "epoch": 6.3832062483641225, "grad_norm": 4.114525318145752, "learning_rate": 1.540163636377395e-05, "loss": 7.5005, "step": 1566900 }, { "epoch": 6.383613626387504, "grad_norm": 9.119081497192383, "learning_rate": 1.536905907337213e-05, "loss": 7.5167, "step": 1567000 }, { "epoch": 6.383613626387504, "eval_MaskedAccuracy": 0.5140174664665519, "eval_loss": 1.583139181137085, "eval_runtime": 158.7008, "eval_samples_per_second": 399.973, "eval_steps_per_second": 1.563, "step": 1567000 }, { "epoch": 6.384021004410886, "grad_norm": 14.878480911254883, "learning_rate": 1.5336580211396862e-05, "loss": 7.5111, "step": 1567100 }, { "epoch": 6.384428382434267, "grad_norm": 2.7665915489196777, "learning_rate": 1.53041997791323e-05, "loss": 7.5391, "step": 1567200 }, { "epoch": 6.384835760457649, "grad_norm": 3.556154251098633, "learning_rate": 1.527191777785848e-05, "loss": 7.5148, "step": 1567300 }, { "epoch": 6.38524313848103, "grad_norm": 3.4475274085998535, "learning_rate": 1.5239734208852098e-05, "loss": 7.525, "step": 1567400 }, { "epoch": 6.385650516504412, "grad_norm": 5.463974475860596, "learning_rate": 1.5207649073384855e-05, "loss": 7.5549, "step": 1567500 }, { "epoch": 6.386057894527793, "grad_norm": 12.315871238708496, "learning_rate": 1.5175662372725715e-05, "loss": 7.5281, "step": 1567600 }, { "epoch": 6.386465272551174, "grad_norm": 4.808372974395752, "learning_rate": 1.5143774108138895e-05, "loss": 7.5086, "step": 1567700 }, { "epoch": 6.386872650574555, "grad_norm": 3.1701083183288574, "learning_rate": 1.5111984280885853e-05, "loss": 7.4936, "step": 1567800 }, { "epoch": 6.387280028597937, "grad_norm": 4.402795791625977, "learning_rate": 1.508029289222251e-05, "loss": 7.5438, "step": 1567900 }, { "epoch": 6.3876874066213185, "grad_norm": 4.104953289031982, "learning_rate": 1.5048699943402289e-05, "loss": 7.5011, "step": 1568000 }, { "epoch": 6.3876874066213185, "eval_MaskedAccuracy": 0.5136394569990271, "eval_loss": 1.5913456678390503, "eval_runtime": 161.1058, "eval_samples_per_second": 394.002, "eval_steps_per_second": 1.539, "step": 1568000 }, { "epoch": 6.3880947846447, "grad_norm": 3.2469029426574707, "learning_rate": 1.5017205435674459e-05, "loss": 7.5338, "step": 1568100 }, { "epoch": 6.388502162668082, "grad_norm": 7.085195541381836, "learning_rate": 1.4985809370283602e-05, "loss": 7.5311, "step": 1568200 }, { "epoch": 6.388909540691463, "grad_norm": 6.949836254119873, "learning_rate": 1.4954511748471229e-05, "loss": 7.5079, "step": 1568300 }, { "epoch": 6.389316918714845, "grad_norm": 6.27656888961792, "learning_rate": 1.4923312571474701e-05, "loss": 7.5389, "step": 1568400 }, { "epoch": 6.389724296738226, "grad_norm": 9.262924194335938, "learning_rate": 1.4892211840527508e-05, "loss": 7.5219, "step": 1568500 }, { "epoch": 6.390131674761608, "grad_norm": 3.329096555709839, "learning_rate": 1.4861209556859257e-05, "loss": 7.5307, "step": 1568600 }, { "epoch": 6.390539052784989, "grad_norm": 8.52308177947998, "learning_rate": 1.4830305721695655e-05, "loss": 7.502, "step": 1568700 }, { "epoch": 6.390946430808371, "grad_norm": 7.067248821258545, "learning_rate": 1.4799500336258288e-05, "loss": 7.5316, "step": 1568800 }, { "epoch": 6.391353808831752, "grad_norm": 9.274039268493652, "learning_rate": 1.4768793401765402e-05, "loss": 7.5129, "step": 1568900 }, { "epoch": 6.391761186855133, "grad_norm": 11.559528350830078, "learning_rate": 1.4738184919430536e-05, "loss": 7.5088, "step": 1569000 }, { "epoch": 6.391761186855133, "eval_MaskedAccuracy": 0.5130918001902454, "eval_loss": 1.5823763608932495, "eval_runtime": 170.4927, "eval_samples_per_second": 372.309, "eval_steps_per_second": 1.455, "step": 1569000 }, { "epoch": 6.3921685648785145, "grad_norm": 16.58714485168457, "learning_rate": 1.4707674890464179e-05, "loss": 7.4964, "step": 1569100 }, { "epoch": 6.392575942901896, "grad_norm": 8.326848030090332, "learning_rate": 1.467726331607267e-05, "loss": 7.5183, "step": 1569200 }, { "epoch": 6.3929833209252775, "grad_norm": 14.714838027954102, "learning_rate": 1.4646950197457929e-05, "loss": 7.4614, "step": 1569300 }, { "epoch": 6.393390698948659, "grad_norm": 2.6382505893707275, "learning_rate": 1.4616735535818535e-05, "loss": 7.5017, "step": 1569400 }, { "epoch": 6.393798076972041, "grad_norm": 5.560884952545166, "learning_rate": 1.4586619332349186e-05, "loss": 7.5287, "step": 1569500 }, { "epoch": 6.394205454995422, "grad_norm": 13.803794860839844, "learning_rate": 1.4556601588240438e-05, "loss": 7.5174, "step": 1569600 }, { "epoch": 6.394612833018804, "grad_norm": 4.421621799468994, "learning_rate": 1.4526682304678949e-05, "loss": 7.5351, "step": 1569700 }, { "epoch": 6.395020211042185, "grad_norm": 9.589630126953125, "learning_rate": 1.4496861482847798e-05, "loss": 7.5342, "step": 1569800 }, { "epoch": 6.395427589065567, "grad_norm": 4.070805072784424, "learning_rate": 1.446713912392588e-05, "loss": 7.4844, "step": 1569900 }, { "epoch": 6.395834967088948, "grad_norm": 12.339241981506348, "learning_rate": 1.4437515229088253e-05, "loss": 7.52, "step": 1570000 }, { "epoch": 6.395834967088948, "eval_MaskedAccuracy": 0.5141620454833628, "eval_loss": 1.5788440704345703, "eval_runtime": 175.9873, "eval_samples_per_second": 360.685, "eval_steps_per_second": 1.409, "step": 1570000 }, { "epoch": 6.39624234511233, "grad_norm": 7.558104038238525, "learning_rate": 1.4407989799506074e-05, "loss": 7.4956, "step": 1570100 }, { "epoch": 6.3966497231357105, "grad_norm": 6.623258113861084, "learning_rate": 1.4378562836346616e-05, "loss": 7.5005, "step": 1570200 }, { "epoch": 6.397057101159092, "grad_norm": 9.98888874053955, "learning_rate": 1.4349234340773585e-05, "loss": 7.4881, "step": 1570300 }, { "epoch": 6.3974644791824735, "grad_norm": 5.980010986328125, "learning_rate": 1.4320004313945938e-05, "loss": 7.5196, "step": 1570400 }, { "epoch": 6.397871857205855, "grad_norm": 4.522244453430176, "learning_rate": 1.429087275701985e-05, "loss": 7.5148, "step": 1570500 }, { "epoch": 6.398279235229237, "grad_norm": 2.6743404865264893, "learning_rate": 1.4261839671146826e-05, "loss": 7.5055, "step": 1570600 }, { "epoch": 6.398686613252618, "grad_norm": 3.34663987159729, "learning_rate": 1.4232905057474483e-05, "loss": 7.5199, "step": 1570700 }, { "epoch": 6.399093991276, "grad_norm": 10.206490516662598, "learning_rate": 1.4204068917147113e-05, "loss": 7.5251, "step": 1570800 }, { "epoch": 6.399501369299381, "grad_norm": 4.575131893157959, "learning_rate": 1.4175331251304563e-05, "loss": 7.5042, "step": 1570900 }, { "epoch": 6.399908747322763, "grad_norm": 2.9370667934417725, "learning_rate": 1.4146692061082811e-05, "loss": 7.5048, "step": 1571000 }, { "epoch": 6.399908747322763, "eval_MaskedAccuracy": 0.5134899479556091, "eval_loss": 1.5859872102737427, "eval_runtime": 152.4059, "eval_samples_per_second": 416.493, "eval_steps_per_second": 1.627, "step": 1571000 }, { "epoch": 6.400316125346144, "grad_norm": 4.213624477386475, "learning_rate": 1.411815134761482e-05, "loss": 7.5082, "step": 1571100 }, { "epoch": 6.400723503369526, "grad_norm": 12.737908363342285, "learning_rate": 1.4089709112028228e-05, "loss": 7.4791, "step": 1571200 }, { "epoch": 6.401130881392907, "grad_norm": 9.245916366577148, "learning_rate": 1.4061365355447684e-05, "loss": 7.4917, "step": 1571300 }, { "epoch": 6.401538259416288, "grad_norm": 6.643359184265137, "learning_rate": 1.4033120078993914e-05, "loss": 7.5082, "step": 1571400 }, { "epoch": 6.4019456374396695, "grad_norm": 5.419043064117432, "learning_rate": 1.400497328378352e-05, "loss": 7.5098, "step": 1571500 }, { "epoch": 6.402353015463051, "grad_norm": 3.9829065799713135, "learning_rate": 1.3976924970929485e-05, "loss": 7.4848, "step": 1571600 }, { "epoch": 6.4027603934864326, "grad_norm": 6.7930827140808105, "learning_rate": 1.3948975141540344e-05, "loss": 7.5118, "step": 1571700 }, { "epoch": 6.403167771509814, "grad_norm": 9.234661102294922, "learning_rate": 1.3921123796721353e-05, "loss": 7.5022, "step": 1571800 }, { "epoch": 6.403575149533196, "grad_norm": 7.889405250549316, "learning_rate": 1.3893370937573576e-05, "loss": 7.5065, "step": 1571900 }, { "epoch": 6.403982527556577, "grad_norm": 3.2302517890930176, "learning_rate": 1.3865716565194205e-05, "loss": 7.5146, "step": 1572000 }, { "epoch": 6.403982527556577, "eval_MaskedAccuracy": 0.5136081210469635, "eval_loss": 1.5881190299987793, "eval_runtime": 164.6993, "eval_samples_per_second": 385.405, "eval_steps_per_second": 1.506, "step": 1572000 }, { "epoch": 6.404389905579959, "grad_norm": 5.261856555938721, "learning_rate": 1.3838160680676323e-05, "loss": 7.5059, "step": 1572100 }, { "epoch": 6.40479728360334, "grad_norm": 4.127289772033691, "learning_rate": 1.38107032851099e-05, "loss": 7.5274, "step": 1572200 }, { "epoch": 6.405204661626722, "grad_norm": 3.562062978744507, "learning_rate": 1.3783344379580229e-05, "loss": 7.5379, "step": 1572300 }, { "epoch": 6.405612039650103, "grad_norm": 3.4243810176849365, "learning_rate": 1.3756083965168728e-05, "loss": 7.4834, "step": 1572400 }, { "epoch": 6.406019417673485, "grad_norm": 4.1015143394470215, "learning_rate": 1.3728922042953475e-05, "loss": 7.5272, "step": 1572500 }, { "epoch": 6.406426795696866, "grad_norm": 3.1138134002685547, "learning_rate": 1.3701858614008138e-05, "loss": 7.4946, "step": 1572600 }, { "epoch": 6.406834173720247, "grad_norm": 5.690712928771973, "learning_rate": 1.3674893679402758e-05, "loss": 7.5256, "step": 1572700 }, { "epoch": 6.4072415517436285, "grad_norm": 5.451653003692627, "learning_rate": 1.3648027240203498e-05, "loss": 7.5216, "step": 1572800 }, { "epoch": 6.40764892976701, "grad_norm": 5.183475494384766, "learning_rate": 1.3621259297472122e-05, "loss": 7.5454, "step": 1572900 }, { "epoch": 6.408056307790392, "grad_norm": 4.239622592926025, "learning_rate": 1.3594589852267604e-05, "loss": 7.4933, "step": 1573000 }, { "epoch": 6.408056307790392, "eval_MaskedAccuracy": 0.5141377363086594, "eval_loss": 1.58185875415802, "eval_runtime": 223.9942, "eval_samples_per_second": 283.382, "eval_steps_per_second": 1.107, "step": 1573000 }, { "epoch": 6.408463685813773, "grad_norm": 4.208160877227783, "learning_rate": 1.3568018905643638e-05, "loss": 7.5127, "step": 1573100 }, { "epoch": 6.408871063837155, "grad_norm": 5.692768573760986, "learning_rate": 1.3541546458650899e-05, "loss": 7.5169, "step": 1573200 }, { "epoch": 6.409278441860536, "grad_norm": 15.91248893737793, "learning_rate": 1.3515172512336166e-05, "loss": 7.5104, "step": 1573300 }, { "epoch": 6.409685819883918, "grad_norm": 5.291548252105713, "learning_rate": 1.3488897067742353e-05, "loss": 7.5227, "step": 1573400 }, { "epoch": 6.410093197907299, "grad_norm": 7.054795265197754, "learning_rate": 1.3462720125907653e-05, "loss": 7.502, "step": 1573500 }, { "epoch": 6.410500575930681, "grad_norm": 8.391228675842285, "learning_rate": 1.3436641687867511e-05, "loss": 7.5509, "step": 1573600 }, { "epoch": 6.410907953954062, "grad_norm": 2.310081720352173, "learning_rate": 1.3410661754652617e-05, "loss": 7.5209, "step": 1573700 }, { "epoch": 6.411315331977444, "grad_norm": 5.13197135925293, "learning_rate": 1.3384780327290112e-05, "loss": 7.5029, "step": 1573800 }, { "epoch": 6.411722710000825, "grad_norm": 2.7105376720428467, "learning_rate": 1.3358997406803501e-05, "loss": 7.5187, "step": 1573900 }, { "epoch": 6.412130088024206, "grad_norm": 5.311029434204102, "learning_rate": 1.333331299421189e-05, "loss": 7.4873, "step": 1574000 }, { "epoch": 6.412130088024206, "eval_MaskedAccuracy": 0.5134961140972867, "eval_loss": 1.5884283781051636, "eval_runtime": 165.2433, "eval_samples_per_second": 384.137, "eval_steps_per_second": 1.501, "step": 1574000 }, { "epoch": 6.412537466047588, "grad_norm": 4.1327900886535645, "learning_rate": 1.3307727090530741e-05, "loss": 7.4987, "step": 1574100 }, { "epoch": 6.412944844070969, "grad_norm": 3.0449063777923584, "learning_rate": 1.328223969677193e-05, "loss": 7.5378, "step": 1574200 }, { "epoch": 6.413352222094351, "grad_norm": 11.36555290222168, "learning_rate": 1.325685081394237e-05, "loss": 7.4879, "step": 1574300 }, { "epoch": 6.413759600117732, "grad_norm": 3.7845590114593506, "learning_rate": 1.3231560443046744e-05, "loss": 7.5143, "step": 1574400 }, { "epoch": 6.414166978141114, "grad_norm": 13.816906929016113, "learning_rate": 1.3206368585084195e-05, "loss": 7.5011, "step": 1574500 }, { "epoch": 6.414574356164495, "grad_norm": 12.928583145141602, "learning_rate": 1.318127524105082e-05, "loss": 7.4846, "step": 1574600 }, { "epoch": 6.414981734187877, "grad_norm": 9.984003067016602, "learning_rate": 1.3156280411938826e-05, "loss": 7.5424, "step": 1574700 }, { "epoch": 6.415389112211258, "grad_norm": 15.186087608337402, "learning_rate": 1.3131384098736561e-05, "loss": 7.5193, "step": 1574800 }, { "epoch": 6.41579649023464, "grad_norm": 3.359351634979248, "learning_rate": 1.310658630242818e-05, "loss": 7.5105, "step": 1574900 }, { "epoch": 6.416203868258021, "grad_norm": 5.87614631652832, "learning_rate": 1.3081887023994011e-05, "loss": 7.5329, "step": 1575000 }, { "epoch": 6.416203868258021, "eval_MaskedAccuracy": 0.5141055018772199, "eval_loss": 1.5858298540115356, "eval_runtime": 160.8309, "eval_samples_per_second": 394.675, "eval_steps_per_second": 1.542, "step": 1575000 }, { "epoch": 6.416611246281403, "grad_norm": 3.7395567893981934, "learning_rate": 1.3057286264410479e-05, "loss": 7.5396, "step": 1575100 }, { "epoch": 6.4170186243047835, "grad_norm": 7.1665940284729, "learning_rate": 1.3032784024650144e-05, "loss": 7.5205, "step": 1575200 }, { "epoch": 6.417426002328165, "grad_norm": 2.775817632675171, "learning_rate": 1.3008380305681945e-05, "loss": 7.5332, "step": 1575300 }, { "epoch": 6.417833380351547, "grad_norm": 11.79802131652832, "learning_rate": 1.2984075108470668e-05, "loss": 7.4973, "step": 1575400 }, { "epoch": 6.418240758374928, "grad_norm": 18.12425994873047, "learning_rate": 1.2959868433977236e-05, "loss": 7.523, "step": 1575500 }, { "epoch": 6.41864813639831, "grad_norm": 4.883094787597656, "learning_rate": 1.2935760283158411e-05, "loss": 7.5362, "step": 1575600 }, { "epoch": 6.419055514421691, "grad_norm": 5.846532344818115, "learning_rate": 1.2911750656967353e-05, "loss": 7.5088, "step": 1575700 }, { "epoch": 6.419462892445073, "grad_norm": 3.527452230453491, "learning_rate": 1.2887839556353617e-05, "loss": 7.5245, "step": 1575800 }, { "epoch": 6.419870270468454, "grad_norm": 15.958311080932617, "learning_rate": 1.286402698226233e-05, "loss": 7.5036, "step": 1575900 }, { "epoch": 6.420277648491836, "grad_norm": 8.36783218383789, "learning_rate": 1.2840312935635002e-05, "loss": 7.5173, "step": 1576000 }, { "epoch": 6.420277648491836, "eval_MaskedAccuracy": 0.5133222906597588, "eval_loss": 1.5864717960357666, "eval_runtime": 157.9601, "eval_samples_per_second": 401.848, "eval_steps_per_second": 1.57, "step": 1576000 }, { "epoch": 6.420685026515217, "grad_norm": 7.594944000244141, "learning_rate": 1.2816697417408736e-05, "loss": 7.499, "step": 1576100 }, { "epoch": 6.421092404538599, "grad_norm": 8.242910385131836, "learning_rate": 1.2793180428517868e-05, "loss": 7.4981, "step": 1576200 }, { "epoch": 6.42149978256198, "grad_norm": 13.289484977722168, "learning_rate": 1.2769761969891999e-05, "loss": 7.5153, "step": 1576300 }, { "epoch": 6.421907160585361, "grad_norm": 5.258137226104736, "learning_rate": 1.2746442042456588e-05, "loss": 7.5096, "step": 1576400 }, { "epoch": 6.422314538608743, "grad_norm": 12.008098602294922, "learning_rate": 1.2723220647133773e-05, "loss": 7.5106, "step": 1576500 }, { "epoch": 6.422721916632124, "grad_norm": 12.856749534606934, "learning_rate": 1.2700097784841802e-05, "loss": 7.504, "step": 1576600 }, { "epoch": 6.423129294655506, "grad_norm": 12.059041976928711, "learning_rate": 1.2677073456494521e-05, "loss": 7.5214, "step": 1576700 }, { "epoch": 6.423536672678887, "grad_norm": 10.986212730407715, "learning_rate": 1.265414766300243e-05, "loss": 7.5158, "step": 1576800 }, { "epoch": 6.423944050702269, "grad_norm": 12.987505912780762, "learning_rate": 1.263132040527213e-05, "loss": 7.4991, "step": 1576900 }, { "epoch": 6.42435142872565, "grad_norm": 8.581718444824219, "learning_rate": 1.2608591684205837e-05, "loss": 7.5328, "step": 1577000 }, { "epoch": 6.42435142872565, "eval_MaskedAccuracy": 0.5135896450978338, "eval_loss": 1.5826035737991333, "eval_runtime": 153.6081, "eval_samples_per_second": 413.233, "eval_steps_per_second": 1.614, "step": 1577000 }, { "epoch": 6.424758806749032, "grad_norm": 5.405650615692139, "learning_rate": 1.2585961500701854e-05, "loss": 7.4957, "step": 1577100 }, { "epoch": 6.425166184772413, "grad_norm": 10.33969497680664, "learning_rate": 1.2563429855655443e-05, "loss": 7.5112, "step": 1577200 }, { "epoch": 6.425573562795795, "grad_norm": 20.685165405273438, "learning_rate": 1.2540996749957157e-05, "loss": 7.5436, "step": 1577300 }, { "epoch": 6.425980940819176, "grad_norm": 8.307595252990723, "learning_rate": 1.251866218449396e-05, "loss": 7.5399, "step": 1577400 }, { "epoch": 6.426388318842558, "grad_norm": 9.786670684814453, "learning_rate": 1.2496426160148366e-05, "loss": 7.4785, "step": 1577500 }, { "epoch": 6.426795696865939, "grad_norm": 17.664812088012695, "learning_rate": 1.2474288677800156e-05, "loss": 7.484, "step": 1577600 }, { "epoch": 6.42720307488932, "grad_norm": 17.866188049316406, "learning_rate": 1.2452249738324344e-05, "loss": 7.5178, "step": 1577700 }, { "epoch": 6.427610452912702, "grad_norm": 10.082836151123047, "learning_rate": 1.2430309342592104e-05, "loss": 7.5313, "step": 1577800 }, { "epoch": 6.428017830936083, "grad_norm": 7.397581577301025, "learning_rate": 1.2408467491471011e-05, "loss": 7.5053, "step": 1577900 }, { "epoch": 6.428425208959465, "grad_norm": 17.505754470825195, "learning_rate": 1.2386724185824467e-05, "loss": 7.485, "step": 1578000 }, { "epoch": 6.428425208959465, "eval_MaskedAccuracy": 0.514182775242332, "eval_loss": 1.5848979949951172, "eval_runtime": 152.7666, "eval_samples_per_second": 415.51, "eval_steps_per_second": 1.623, "step": 1578000 }, { "epoch": 6.428832586982846, "grad_norm": 19.836593627929688, "learning_rate": 1.2365079426512016e-05, "loss": 7.5257, "step": 1578100 }, { "epoch": 6.429239965006228, "grad_norm": 5.72556734085083, "learning_rate": 1.2343533214389587e-05, "loss": 7.511, "step": 1578200 }, { "epoch": 6.429647343029609, "grad_norm": 11.72077751159668, "learning_rate": 1.232208555030923e-05, "loss": 7.5083, "step": 1578300 }, { "epoch": 6.430054721052991, "grad_norm": 11.613635063171387, "learning_rate": 1.2300736435118291e-05, "loss": 7.5003, "step": 1578400 }, { "epoch": 6.430462099076372, "grad_norm": 6.428407669067383, "learning_rate": 1.2279485869661065e-05, "loss": 7.4788, "step": 1578500 }, { "epoch": 6.430869477099754, "grad_norm": 6.941511631011963, "learning_rate": 1.2258333854777984e-05, "loss": 7.5334, "step": 1578600 }, { "epoch": 6.431276855123135, "grad_norm": 21.483781814575195, "learning_rate": 1.2237280391305018e-05, "loss": 7.4977, "step": 1578700 }, { "epoch": 6.431684233146517, "grad_norm": 19.6728572845459, "learning_rate": 1.221632548007456e-05, "loss": 7.5281, "step": 1578800 }, { "epoch": 6.4320916111698985, "grad_norm": 9.46621322631836, "learning_rate": 1.219546912191512e-05, "loss": 7.5062, "step": 1578900 }, { "epoch": 6.432498989193279, "grad_norm": 6.844732761383057, "learning_rate": 1.2174711317651337e-05, "loss": 7.4939, "step": 1579000 }, { "epoch": 6.432498989193279, "eval_MaskedAccuracy": 0.5135044825236108, "eval_loss": 1.5933804512023926, "eval_runtime": 160.879, "eval_samples_per_second": 394.557, "eval_steps_per_second": 1.542, "step": 1579000 }, { "epoch": 6.432906367216661, "grad_norm": 19.491687774658203, "learning_rate": 1.2154052068103675e-05, "loss": 7.5178, "step": 1579100 }, { "epoch": 6.433313745240042, "grad_norm": 8.367425918579102, "learning_rate": 1.2133491374089022e-05, "loss": 7.518, "step": 1579200 }, { "epoch": 6.433721123263424, "grad_norm": 8.211845397949219, "learning_rate": 1.2113029236420363e-05, "loss": 7.5384, "step": 1579300 }, { "epoch": 6.434128501286805, "grad_norm": 8.700518608093262, "learning_rate": 1.2092665655906547e-05, "loss": 7.5014, "step": 1579400 }, { "epoch": 6.434535879310187, "grad_norm": 15.144051551818848, "learning_rate": 1.2072400633352532e-05, "loss": 7.5121, "step": 1579500 }, { "epoch": 6.434943257333568, "grad_norm": 12.884886741638184, "learning_rate": 1.2052234169559683e-05, "loss": 7.5048, "step": 1579600 }, { "epoch": 6.43535063535695, "grad_norm": 2.9523839950561523, "learning_rate": 1.2032166265325214e-05, "loss": 7.4923, "step": 1579700 }, { "epoch": 6.435758013380331, "grad_norm": 10.289582252502441, "learning_rate": 1.2012196921442453e-05, "loss": 7.5141, "step": 1579800 }, { "epoch": 6.436165391403713, "grad_norm": 12.150877952575684, "learning_rate": 1.1992326138701138e-05, "loss": 7.4757, "step": 1579900 }, { "epoch": 6.436572769427094, "grad_norm": 7.628262042999268, "learning_rate": 1.1972553917886819e-05, "loss": 7.5078, "step": 1580000 }, { "epoch": 6.436572769427094, "eval_MaskedAccuracy": 0.5136051176975212, "eval_loss": 1.576341152191162, "eval_runtime": 192.0348, "eval_samples_per_second": 330.544, "eval_steps_per_second": 1.291, "step": 1580000 }, { "epoch": 6.436980147450476, "grad_norm": 10.1820707321167, "learning_rate": 1.1952880259780914e-05, "loss": 7.4935, "step": 1580100 }, { "epoch": 6.437387525473857, "grad_norm": 5.5103044509887695, "learning_rate": 1.1933305165161531e-05, "loss": 7.5113, "step": 1580200 }, { "epoch": 6.437794903497238, "grad_norm": 3.9326303005218506, "learning_rate": 1.1913828634802331e-05, "loss": 7.5108, "step": 1580300 }, { "epoch": 6.43820228152062, "grad_norm": 12.331070899963379, "learning_rate": 1.1894450669473385e-05, "loss": 7.5332, "step": 1580400 }, { "epoch": 6.438609659544001, "grad_norm": 8.85410213470459, "learning_rate": 1.1875171269940862e-05, "loss": 7.5079, "step": 1580500 }, { "epoch": 6.439017037567383, "grad_norm": 9.459521293640137, "learning_rate": 1.1855990436967075e-05, "loss": 7.4942, "step": 1580600 }, { "epoch": 6.439424415590764, "grad_norm": 7.890523910522461, "learning_rate": 1.1836908171310452e-05, "loss": 7.4848, "step": 1580700 }, { "epoch": 6.439831793614146, "grad_norm": 25.802410125732422, "learning_rate": 1.1817924473724993e-05, "loss": 7.5027, "step": 1580800 }, { "epoch": 6.440239171637527, "grad_norm": 5.142045974731445, "learning_rate": 1.1799039344961371e-05, "loss": 7.5088, "step": 1580900 }, { "epoch": 6.440646549660909, "grad_norm": 7.51984167098999, "learning_rate": 1.1780252785766629e-05, "loss": 7.5149, "step": 1581000 }, { "epoch": 6.440646549660909, "eval_MaskedAccuracy": 0.5139996727286729, "eval_loss": 1.575803518295288, "eval_runtime": 161.763, "eval_samples_per_second": 392.401, "eval_steps_per_second": 1.533, "step": 1581000 }, { "epoch": 6.44105392768429, "grad_norm": 3.9571900367736816, "learning_rate": 1.1761564796882889e-05, "loss": 7.5341, "step": 1581100 }, { "epoch": 6.441461305707672, "grad_norm": 2.9896321296691895, "learning_rate": 1.1742975379049166e-05, "loss": 7.5011, "step": 1581200 }, { "epoch": 6.4418686837310535, "grad_norm": 10.502874374389648, "learning_rate": 1.1724484533000904e-05, "loss": 7.4909, "step": 1581300 }, { "epoch": 6.442276061754434, "grad_norm": 10.83086109161377, "learning_rate": 1.1706092259468554e-05, "loss": 7.504, "step": 1581400 }, { "epoch": 6.442683439777816, "grad_norm": 2.590233564376831, "learning_rate": 1.1687798559179252e-05, "loss": 7.549, "step": 1581500 }, { "epoch": 6.443090817801197, "grad_norm": 4.046431064605713, "learning_rate": 1.1669603432856774e-05, "loss": 7.5088, "step": 1581600 }, { "epoch": 6.443498195824579, "grad_norm": 9.649344444274902, "learning_rate": 1.165150688121996e-05, "loss": 7.5432, "step": 1581700 }, { "epoch": 6.44390557384796, "grad_norm": 3.338127374649048, "learning_rate": 1.1633508904984299e-05, "loss": 7.5146, "step": 1581800 }, { "epoch": 6.444312951871342, "grad_norm": 18.291717529296875, "learning_rate": 1.1615609504861685e-05, "loss": 7.5071, "step": 1581900 }, { "epoch": 6.444720329894723, "grad_norm": 3.2955408096313477, "learning_rate": 1.1597808681559567e-05, "loss": 7.5291, "step": 1582000 }, { "epoch": 6.444720329894723, "eval_MaskedAccuracy": 0.5137903031881628, "eval_loss": 1.5809904336929321, "eval_runtime": 159.7544, "eval_samples_per_second": 397.335, "eval_steps_per_second": 1.552, "step": 1582000 }, { "epoch": 6.445127707918105, "grad_norm": 11.436583518981934, "learning_rate": 1.1580106435781803e-05, "loss": 7.5191, "step": 1582100 }, { "epoch": 6.445535085941486, "grad_norm": 5.290562629699707, "learning_rate": 1.156250276822811e-05, "loss": 7.5033, "step": 1582200 }, { "epoch": 6.445942463964868, "grad_norm": 3.2766430377960205, "learning_rate": 1.1544997679594573e-05, "loss": 7.4911, "step": 1582300 }, { "epoch": 6.4463498419882495, "grad_norm": 7.670306205749512, "learning_rate": 1.1527591170573147e-05, "loss": 7.5289, "step": 1582400 }, { "epoch": 6.446757220011631, "grad_norm": 6.270802974700928, "learning_rate": 1.1510283241851915e-05, "loss": 7.5287, "step": 1582500 }, { "epoch": 6.4471645980350125, "grad_norm": 4.065499305725098, "learning_rate": 1.1493073894115615e-05, "loss": 7.5492, "step": 1582600 }, { "epoch": 6.447571976058393, "grad_norm": 7.180081844329834, "learning_rate": 1.1475963128043996e-05, "loss": 7.5232, "step": 1582700 }, { "epoch": 6.447979354081775, "grad_norm": 5.466172218322754, "learning_rate": 1.1458950944314066e-05, "loss": 7.5087, "step": 1582800 }, { "epoch": 6.448386732105156, "grad_norm": 3.34328031539917, "learning_rate": 1.1442037343598078e-05, "loss": 7.495, "step": 1582900 }, { "epoch": 6.448794110128538, "grad_norm": 8.226127624511719, "learning_rate": 1.1425222326565008e-05, "loss": 7.4944, "step": 1583000 }, { "epoch": 6.448794110128538, "eval_MaskedAccuracy": 0.5132201020882595, "eval_loss": 1.5804826021194458, "eval_runtime": 237.9313, "eval_samples_per_second": 266.783, "eval_steps_per_second": 1.042, "step": 1583000 }, { "epoch": 6.449201488151919, "grad_norm": 12.55417251586914, "learning_rate": 1.1408505893879386e-05, "loss": 7.5197, "step": 1583100 }, { "epoch": 6.449608866175301, "grad_norm": 10.142425537109375, "learning_rate": 1.139188804620213e-05, "loss": 7.535, "step": 1583200 }, { "epoch": 6.450016244198682, "grad_norm": 6.337942600250244, "learning_rate": 1.1375368784190545e-05, "loss": 7.5153, "step": 1583300 }, { "epoch": 6.450423622222064, "grad_norm": 13.39480972290039, "learning_rate": 1.1358948108497264e-05, "loss": 7.5049, "step": 1583400 }, { "epoch": 6.450831000245445, "grad_norm": 6.626671314239502, "learning_rate": 1.1342626019771572e-05, "loss": 7.5002, "step": 1583500 }, { "epoch": 6.451238378268827, "grad_norm": 2.58742356300354, "learning_rate": 1.1326402518659157e-05, "loss": 7.5136, "step": 1583600 }, { "epoch": 6.4516457562922085, "grad_norm": 5.8868184089660645, "learning_rate": 1.1310277605800997e-05, "loss": 7.5153, "step": 1583700 }, { "epoch": 6.45205313431559, "grad_norm": 8.069238662719727, "learning_rate": 1.1294251281834755e-05, "loss": 7.5035, "step": 1583800 }, { "epoch": 6.4524605123389716, "grad_norm": 5.264612197875977, "learning_rate": 1.12783235473942e-05, "loss": 7.5259, "step": 1583900 }, { "epoch": 6.452867890362352, "grad_norm": 16.8353271484375, "learning_rate": 1.1262494403108685e-05, "loss": 7.4895, "step": 1584000 }, { "epoch": 6.452867890362352, "eval_MaskedAccuracy": 0.5131780757268724, "eval_loss": 1.5877623558044434, "eval_runtime": 168.9802, "eval_samples_per_second": 375.642, "eval_steps_per_second": 1.468, "step": 1584000 }, { "epoch": 6.453275268385734, "grad_norm": 9.595535278320312, "learning_rate": 1.124676384960422e-05, "loss": 7.4899, "step": 1584100 }, { "epoch": 6.453682646409115, "grad_norm": 3.171947479248047, "learning_rate": 1.1231131887502687e-05, "loss": 7.5199, "step": 1584200 }, { "epoch": 6.454090024432497, "grad_norm": 5.0405449867248535, "learning_rate": 1.121559851742233e-05, "loss": 7.4804, "step": 1584300 }, { "epoch": 6.454497402455878, "grad_norm": 2.5416932106018066, "learning_rate": 1.1200163739977016e-05, "loss": 7.5072, "step": 1584400 }, { "epoch": 6.45490478047926, "grad_norm": 6.3112568855285645, "learning_rate": 1.1184827555776944e-05, "loss": 7.5124, "step": 1584500 }, { "epoch": 6.455312158502641, "grad_norm": 5.8980631828308105, "learning_rate": 1.116958996542848e-05, "loss": 7.4989, "step": 1584600 }, { "epoch": 6.455719536526023, "grad_norm": 14.551684379577637, "learning_rate": 1.1154450969533821e-05, "loss": 7.5118, "step": 1584700 }, { "epoch": 6.4561269145494045, "grad_norm": 4.331643104553223, "learning_rate": 1.1139410568692128e-05, "loss": 7.5193, "step": 1584800 }, { "epoch": 6.456534292572786, "grad_norm": 2.8261187076568604, "learning_rate": 1.1124468763497561e-05, "loss": 7.5006, "step": 1584900 }, { "epoch": 6.4569416705961675, "grad_norm": 8.621109962463379, "learning_rate": 1.1109625554540683e-05, "loss": 7.5418, "step": 1585000 }, { "epoch": 6.4569416705961675, "eval_MaskedAccuracy": 0.5142094645817232, "eval_loss": 1.581610918045044, "eval_runtime": 175.0879, "eval_samples_per_second": 362.538, "eval_steps_per_second": 1.416, "step": 1585000 }, { "epoch": 6.457349048619549, "grad_norm": 7.384837627410889, "learning_rate": 1.1094880942408735e-05, "loss": 7.5623, "step": 1585100 }, { "epoch": 6.45775642664293, "grad_norm": 7.93504524230957, "learning_rate": 1.1080234927684532e-05, "loss": 7.5121, "step": 1585200 }, { "epoch": 6.458163804666311, "grad_norm": 8.416412353515625, "learning_rate": 1.1065687510947014e-05, "loss": 7.5209, "step": 1585300 }, { "epoch": 6.458571182689693, "grad_norm": 8.54407024383545, "learning_rate": 1.1051238692771492e-05, "loss": 7.5198, "step": 1585400 }, { "epoch": 6.458978560713074, "grad_norm": 5.531566143035889, "learning_rate": 1.1036888473729149e-05, "loss": 7.5146, "step": 1585500 }, { "epoch": 6.459385938736456, "grad_norm": 4.485382080078125, "learning_rate": 1.102263685438701e-05, "loss": 7.5261, "step": 1585600 }, { "epoch": 6.459793316759837, "grad_norm": 3.3528404235839844, "learning_rate": 1.1008483835308761e-05, "loss": 7.5484, "step": 1585700 }, { "epoch": 6.460200694783219, "grad_norm": 2.1114022731781006, "learning_rate": 1.099442941705395e-05, "loss": 7.5096, "step": 1585800 }, { "epoch": 6.4606080728066, "grad_norm": 3.5055644512176514, "learning_rate": 1.0980473600178517e-05, "loss": 7.5133, "step": 1585900 }, { "epoch": 6.461015450829982, "grad_norm": 15.722739219665527, "learning_rate": 1.0966616385233708e-05, "loss": 7.5503, "step": 1586000 }, { "epoch": 6.461015450829982, "eval_MaskedAccuracy": 0.51397272595115, "eval_loss": 1.589124083518982, "eval_runtime": 185.0763, "eval_samples_per_second": 342.972, "eval_steps_per_second": 1.34, "step": 1586000 }, { "epoch": 6.4614228288533635, "grad_norm": 5.017574310302734, "learning_rate": 1.0952857772767406e-05, "loss": 7.5233, "step": 1586100 }, { "epoch": 6.461830206876745, "grad_norm": 2.8360648155212402, "learning_rate": 1.0939197763323929e-05, "loss": 7.5366, "step": 1586200 }, { "epoch": 6.462237584900127, "grad_norm": 2.794994831085205, "learning_rate": 1.0925636357443144e-05, "loss": 7.525, "step": 1586300 }, { "epoch": 6.462644962923507, "grad_norm": 4.378689289093018, "learning_rate": 1.0912173555661342e-05, "loss": 7.4783, "step": 1586400 }, { "epoch": 6.463052340946889, "grad_norm": 6.236534595489502, "learning_rate": 1.089880935851034e-05, "loss": 7.5189, "step": 1586500 }, { "epoch": 6.46345971897027, "grad_norm": 3.1669552326202393, "learning_rate": 1.088554376651895e-05, "loss": 7.4782, "step": 1586600 }, { "epoch": 6.463867096993652, "grad_norm": 2.9984211921691895, "learning_rate": 1.087237678021153e-05, "loss": 7.5291, "step": 1586700 }, { "epoch": 6.464274475017033, "grad_norm": 20.27840805053711, "learning_rate": 1.0859308400108288e-05, "loss": 7.5295, "step": 1586800 }, { "epoch": 6.464681853040415, "grad_norm": 2.6895344257354736, "learning_rate": 1.0846338626726658e-05, "loss": 7.5427, "step": 1586900 }, { "epoch": 6.465089231063796, "grad_norm": 3.2413620948791504, "learning_rate": 1.0833467460578534e-05, "loss": 7.505, "step": 1587000 }, { "epoch": 6.465089231063796, "eval_MaskedAccuracy": 0.5138568338712798, "eval_loss": 1.5885602235794067, "eval_runtime": 154.1563, "eval_samples_per_second": 411.764, "eval_steps_per_second": 1.609, "step": 1587000 }, { "epoch": 6.465496609087178, "grad_norm": 4.166514873504639, "learning_rate": 1.0820694902173316e-05, "loss": 7.5256, "step": 1587100 }, { "epoch": 6.4659039871105595, "grad_norm": 4.5649943351745605, "learning_rate": 1.0808020952015985e-05, "loss": 7.5255, "step": 1587200 }, { "epoch": 6.466311365133941, "grad_norm": 10.997081756591797, "learning_rate": 1.079544561060736e-05, "loss": 7.4938, "step": 1587300 }, { "epoch": 6.4667187431573225, "grad_norm": 4.487954139709473, "learning_rate": 1.0782968878444657e-05, "loss": 7.5526, "step": 1587400 }, { "epoch": 6.467126121180704, "grad_norm": 15.784062385559082, "learning_rate": 1.0770590756021208e-05, "loss": 7.5212, "step": 1587500 }, { "epoch": 6.467533499204086, "grad_norm": 2.7647440433502197, "learning_rate": 1.0758311243826476e-05, "loss": 7.5318, "step": 1587600 }, { "epoch": 6.467940877227466, "grad_norm": 5.251222610473633, "learning_rate": 1.0746130342345768e-05, "loss": 7.4894, "step": 1587700 }, { "epoch": 6.468348255250848, "grad_norm": 9.594130516052246, "learning_rate": 1.0734048052060783e-05, "loss": 7.5199, "step": 1587800 }, { "epoch": 6.468755633274229, "grad_norm": 3.82230544090271, "learning_rate": 1.0722064373449081e-05, "loss": 7.5112, "step": 1587900 }, { "epoch": 6.469163011297611, "grad_norm": 8.367039680480957, "learning_rate": 1.0710179306984607e-05, "loss": 7.5348, "step": 1588000 }, { "epoch": 6.469163011297611, "eval_MaskedAccuracy": 0.513640790322325, "eval_loss": 1.5872994661331177, "eval_runtime": 164.0861, "eval_samples_per_second": 386.846, "eval_steps_per_second": 1.511, "step": 1588000 }, { "epoch": 6.469570389320992, "grad_norm": 3.6034107208251953, "learning_rate": 1.0698392853136869e-05, "loss": 7.5058, "step": 1588100 }, { "epoch": 6.469977767344374, "grad_norm": 13.84794807434082, "learning_rate": 1.0686705012372343e-05, "loss": 7.5032, "step": 1588200 }, { "epoch": 6.470385145367755, "grad_norm": 4.37919282913208, "learning_rate": 1.0675115785153065e-05, "loss": 7.5362, "step": 1588300 }, { "epoch": 6.470792523391137, "grad_norm": 3.441962242126465, "learning_rate": 1.0663625171936911e-05, "loss": 7.525, "step": 1588400 }, { "epoch": 6.4711999014145185, "grad_norm": 8.499149322509766, "learning_rate": 1.0652233173178164e-05, "loss": 7.5046, "step": 1588500 }, { "epoch": 6.4716072794379, "grad_norm": 21.030723571777344, "learning_rate": 1.0640939789327512e-05, "loss": 7.476, "step": 1588600 }, { "epoch": 6.472014657461282, "grad_norm": 4.852090835571289, "learning_rate": 1.0629745020831204e-05, "loss": 7.5316, "step": 1588700 }, { "epoch": 6.472422035484663, "grad_norm": 3.8750147819519043, "learning_rate": 1.0618648868131876e-05, "loss": 7.5006, "step": 1588800 }, { "epoch": 6.472829413508045, "grad_norm": 7.608121871948242, "learning_rate": 1.0607651331668303e-05, "loss": 7.5461, "step": 1588900 }, { "epoch": 6.473236791531425, "grad_norm": 3.698364496231079, "learning_rate": 1.0596752411875099e-05, "loss": 7.5182, "step": 1589000 }, { "epoch": 6.473236791531425, "eval_MaskedAccuracy": 0.5138018319626427, "eval_loss": 1.5852344036102295, "eval_runtime": 157.879, "eval_samples_per_second": 402.055, "eval_steps_per_second": 1.571, "step": 1589000 }, { "epoch": 6.473644169554807, "grad_norm": 5.7360968589782715, "learning_rate": 1.0585952109183555e-05, "loss": 7.5142, "step": 1589100 }, { "epoch": 6.474051547578188, "grad_norm": 2.508575916290283, "learning_rate": 1.0575250424020241e-05, "loss": 7.4523, "step": 1589200 }, { "epoch": 6.47445892560157, "grad_norm": 2.6369049549102783, "learning_rate": 1.0564647356808142e-05, "loss": 7.5342, "step": 1589300 }, { "epoch": 6.474866303624951, "grad_norm": 2.3004262447357178, "learning_rate": 1.0554142907967208e-05, "loss": 7.5372, "step": 1589400 }, { "epoch": 6.475273681648333, "grad_norm": 4.236722946166992, "learning_rate": 1.054373707791181e-05, "loss": 7.5406, "step": 1589500 }, { "epoch": 6.4756810596717145, "grad_norm": 2.2029640674591064, "learning_rate": 1.0533429867053856e-05, "loss": 7.5114, "step": 1589600 }, { "epoch": 6.476088437695096, "grad_norm": 4.125149726867676, "learning_rate": 1.0523221275801079e-05, "loss": 7.5253, "step": 1589700 }, { "epoch": 6.4764958157184775, "grad_norm": 3.957239866256714, "learning_rate": 1.0513111304556538e-05, "loss": 7.513, "step": 1589800 }, { "epoch": 6.476903193741859, "grad_norm": 9.370373725891113, "learning_rate": 1.0503099953719934e-05, "loss": 7.4979, "step": 1589900 }, { "epoch": 6.477310571765241, "grad_norm": 3.218662977218628, "learning_rate": 1.0493187223687658e-05, "loss": 7.5451, "step": 1590000 }, { "epoch": 6.477310571765241, "eval_MaskedAccuracy": 0.5137643071789635, "eval_loss": 1.5946288108825684, "eval_runtime": 168.1511, "eval_samples_per_second": 377.494, "eval_steps_per_second": 1.475, "step": 1590000 }, { "epoch": 6.477717949788622, "grad_norm": 5.178323745727539, "learning_rate": 1.0483373114851103e-05, "loss": 7.4875, "step": 1590100 }, { "epoch": 6.478125327812003, "grad_norm": 3.749709129333496, "learning_rate": 1.047365762759836e-05, "loss": 7.5125, "step": 1590200 }, { "epoch": 6.478532705835384, "grad_norm": 3.386944532394409, "learning_rate": 1.0464040762313626e-05, "loss": 7.5194, "step": 1590300 }, { "epoch": 6.478940083858766, "grad_norm": 5.481697082519531, "learning_rate": 1.0454522519377218e-05, "loss": 7.5216, "step": 1590400 }, { "epoch": 6.479347461882147, "grad_norm": 3.3972976207733154, "learning_rate": 1.0445102899165296e-05, "loss": 7.5015, "step": 1590500 }, { "epoch": 6.479754839905529, "grad_norm": 4.845938682556152, "learning_rate": 1.043578190205017e-05, "loss": 7.5048, "step": 1590600 }, { "epoch": 6.4801622179289105, "grad_norm": 3.8213133811950684, "learning_rate": 1.0426559528400503e-05, "loss": 7.5027, "step": 1590700 }, { "epoch": 6.480569595952292, "grad_norm": 9.342162132263184, "learning_rate": 1.0417435778580849e-05, "loss": 7.4939, "step": 1590800 }, { "epoch": 6.4809769739756735, "grad_norm": 5.004269599914551, "learning_rate": 1.0408410652951845e-05, "loss": 7.5404, "step": 1590900 }, { "epoch": 6.481384351999055, "grad_norm": 10.357196807861328, "learning_rate": 1.0399484151870282e-05, "loss": 7.5, "step": 1591000 }, { "epoch": 6.481384351999055, "eval_MaskedAccuracy": 0.5139598372954575, "eval_loss": 1.5865259170532227, "eval_runtime": 170.4569, "eval_samples_per_second": 372.387, "eval_steps_per_second": 1.455, "step": 1591000 }, { "epoch": 6.481791730022437, "grad_norm": 3.1712253093719482, "learning_rate": 1.0390656275689327e-05, "loss": 7.4713, "step": 1591100 }, { "epoch": 6.482199108045818, "grad_norm": 4.492165565490723, "learning_rate": 1.0381927024758006e-05, "loss": 7.5415, "step": 1591200 }, { "epoch": 6.4826064860692, "grad_norm": 3.1798746585845947, "learning_rate": 1.0373296399420902e-05, "loss": 7.5001, "step": 1591300 }, { "epoch": 6.48301386409258, "grad_norm": 2.6678378582000732, "learning_rate": 1.0364764400019843e-05, "loss": 7.5141, "step": 1591400 }, { "epoch": 6.483421242115962, "grad_norm": 2.719944477081299, "learning_rate": 1.0356331026891933e-05, "loss": 7.5104, "step": 1591500 }, { "epoch": 6.483828620139343, "grad_norm": 5.455245018005371, "learning_rate": 1.0347996280370401e-05, "loss": 7.5199, "step": 1591600 }, { "epoch": 6.484235998162725, "grad_norm": 2.79030704498291, "learning_rate": 1.0339760160784878e-05, "loss": 7.514, "step": 1591700 }, { "epoch": 6.484643376186106, "grad_norm": 6.094854354858398, "learning_rate": 1.0331622668461119e-05, "loss": 7.5171, "step": 1591800 }, { "epoch": 6.485050754209488, "grad_norm": 9.372957229614258, "learning_rate": 1.0323583803720725e-05, "loss": 7.5203, "step": 1591900 }, { "epoch": 6.4854581322328695, "grad_norm": 2.224552869796753, "learning_rate": 1.0315643566881688e-05, "loss": 7.5276, "step": 1592000 }, { "epoch": 6.4854581322328695, "eval_MaskedAccuracy": 0.5127872637926909, "eval_loss": 1.585646152496338, "eval_runtime": 157.3491, "eval_samples_per_second": 403.409, "eval_steps_per_second": 1.576, "step": 1592000 }, { "epoch": 6.485865510256251, "grad_norm": 10.867362022399902, "learning_rate": 1.030780195825758e-05, "loss": 7.5037, "step": 1592100 }, { "epoch": 6.4862728882796326, "grad_norm": 3.9829726219177246, "learning_rate": 1.0300058978158635e-05, "loss": 7.5175, "step": 1592200 }, { "epoch": 6.486680266303014, "grad_norm": 13.730901718139648, "learning_rate": 1.0292414626890936e-05, "loss": 7.5117, "step": 1592300 }, { "epoch": 6.487087644326396, "grad_norm": 3.234405040740967, "learning_rate": 1.0284868904756694e-05, "loss": 7.4918, "step": 1592400 }, { "epoch": 6.487495022349777, "grad_norm": 6.621614933013916, "learning_rate": 1.0277421812054517e-05, "loss": 7.4883, "step": 1592500 }, { "epoch": 6.487902400373159, "grad_norm": 2.6514699459075928, "learning_rate": 1.0270073349078306e-05, "loss": 7.5101, "step": 1592600 }, { "epoch": 6.488309778396539, "grad_norm": 2.6967132091522217, "learning_rate": 1.0262823516118905e-05, "loss": 7.5188, "step": 1592700 }, { "epoch": 6.488717156419921, "grad_norm": 5.549939155578613, "learning_rate": 1.0255672313462732e-05, "loss": 7.5004, "step": 1592800 }, { "epoch": 6.489124534443302, "grad_norm": 2.9948530197143555, "learning_rate": 1.0248619741392903e-05, "loss": 7.4946, "step": 1592900 }, { "epoch": 6.489531912466684, "grad_norm": 7.538805961608887, "learning_rate": 1.0241665800188049e-05, "loss": 7.5029, "step": 1593000 }, { "epoch": 6.489531912466684, "eval_MaskedAccuracy": 0.5136503409021163, "eval_loss": 1.5819203853607178, "eval_runtime": 152.7341, "eval_samples_per_second": 415.598, "eval_steps_per_second": 1.624, "step": 1593000 }, { "epoch": 6.4899392904900655, "grad_norm": 3.414398193359375, "learning_rate": 1.0234810490122685e-05, "loss": 7.5272, "step": 1593100 }, { "epoch": 6.490346668513447, "grad_norm": 5.589507579803467, "learning_rate": 1.0228053811468562e-05, "loss": 7.5081, "step": 1593200 }, { "epoch": 6.4907540465368285, "grad_norm": 3.89789080619812, "learning_rate": 1.0221395764492432e-05, "loss": 7.515, "step": 1593300 }, { "epoch": 6.49116142456021, "grad_norm": 10.572711944580078, "learning_rate": 1.0214836349457435e-05, "loss": 7.5403, "step": 1593400 }, { "epoch": 6.491568802583592, "grad_norm": 16.077180862426758, "learning_rate": 1.0208375566622848e-05, "loss": 7.4979, "step": 1593500 }, { "epoch": 6.491976180606973, "grad_norm": 9.048277854919434, "learning_rate": 1.0202013416244615e-05, "loss": 7.5185, "step": 1593600 }, { "epoch": 6.492383558630355, "grad_norm": 4.375500202178955, "learning_rate": 1.0195749898573698e-05, "loss": 7.5342, "step": 1593700 }, { "epoch": 6.492790936653736, "grad_norm": 4.217859745025635, "learning_rate": 1.0189585013857742e-05, "loss": 7.5132, "step": 1593800 }, { "epoch": 6.493198314677118, "grad_norm": 5.354471206665039, "learning_rate": 1.0183518762341066e-05, "loss": 7.4876, "step": 1593900 }, { "epoch": 6.493605692700498, "grad_norm": 3.2286570072174072, "learning_rate": 1.0177551144262986e-05, "loss": 7.5053, "step": 1594000 }, { "epoch": 6.493605692700498, "eval_MaskedAccuracy": 0.5137151254428228, "eval_loss": 1.5947133302688599, "eval_runtime": 151.9626, "eval_samples_per_second": 417.708, "eval_steps_per_second": 1.632, "step": 1594000 }, { "epoch": 6.49401307072388, "grad_norm": 6.318943500518799, "learning_rate": 1.0171682159859514e-05, "loss": 7.529, "step": 1594100 }, { "epoch": 6.494420448747261, "grad_norm": 3.0594611167907715, "learning_rate": 1.0165911809362497e-05, "loss": 7.5077, "step": 1594200 }, { "epoch": 6.494827826770643, "grad_norm": 7.546759605407715, "learning_rate": 1.0160240093000742e-05, "loss": 7.5603, "step": 1594300 }, { "epoch": 6.4952352047940245, "grad_norm": 3.6959216594696045, "learning_rate": 1.0154667010997782e-05, "loss": 7.5156, "step": 1594400 }, { "epoch": 6.495642582817406, "grad_norm": 3.110666513442993, "learning_rate": 1.0149192563574114e-05, "loss": 7.5325, "step": 1594500 }, { "epoch": 6.496049960840788, "grad_norm": 2.758005380630493, "learning_rate": 1.014381675094635e-05, "loss": 7.5314, "step": 1594600 }, { "epoch": 6.496457338864169, "grad_norm": 4.565113544464111, "learning_rate": 1.0138539573326957e-05, "loss": 7.5346, "step": 1594700 }, { "epoch": 6.496864716887551, "grad_norm": 6.778021812438965, "learning_rate": 1.0133361030924785e-05, "loss": 7.5082, "step": 1594800 }, { "epoch": 6.497272094910932, "grad_norm": 6.886841773986816, "learning_rate": 1.0128281123943979e-05, "loss": 7.5191, "step": 1594900 }, { "epoch": 6.497679472934314, "grad_norm": 4.827107906341553, "learning_rate": 1.0123299852585925e-05, "loss": 7.5382, "step": 1595000 }, { "epoch": 6.497679472934314, "eval_MaskedAccuracy": 0.5138491514953356, "eval_loss": 1.5851107835769653, "eval_runtime": 172.8377, "eval_samples_per_second": 367.258, "eval_steps_per_second": 1.435, "step": 1595000 }, { "epoch": 6.498086850957695, "grad_norm": 3.5091304779052734, "learning_rate": 1.0118417217047295e-05, "loss": 7.5279, "step": 1595100 }, { "epoch": 6.498494228981076, "grad_norm": 3.0194764137268066, "learning_rate": 1.011363321752116e-05, "loss": 7.5539, "step": 1595200 }, { "epoch": 6.498901607004457, "grad_norm": 4.65777063369751, "learning_rate": 1.010894785419698e-05, "loss": 7.5182, "step": 1595300 }, { "epoch": 6.499308985027839, "grad_norm": 18.234819412231445, "learning_rate": 1.0104361127259519e-05, "loss": 7.5082, "step": 1595400 }, { "epoch": 6.4997163630512205, "grad_norm": 3.018636465072632, "learning_rate": 1.0099873036890212e-05, "loss": 7.5009, "step": 1595500 }, { "epoch": 6.500123741074602, "grad_norm": 3.6853644847869873, "learning_rate": 1.009548358326662e-05, "loss": 7.5047, "step": 1595600 }, { "epoch": 6.5005311190979835, "grad_norm": 3.3983805179595947, "learning_rate": 1.0091192766562143e-05, "loss": 7.496, "step": 1595700 }, { "epoch": 6.500938497121365, "grad_norm": 14.519161224365234, "learning_rate": 1.0087000586946863e-05, "loss": 7.5264, "step": 1595800 }, { "epoch": 6.501345875144747, "grad_norm": 10.01481819152832, "learning_rate": 1.008290704458587e-05, "loss": 7.5127, "step": 1595900 }, { "epoch": 6.501753253168128, "grad_norm": 5.591251850128174, "learning_rate": 1.0078912139641481e-05, "loss": 7.5212, "step": 1596000 }, { "epoch": 6.501753253168128, "eval_MaskedAccuracy": 0.5138266559577939, "eval_loss": 1.5807507038116455, "eval_runtime": 155.3243, "eval_samples_per_second": 408.667, "eval_steps_per_second": 1.597, "step": 1596000 }, { "epoch": 6.50216063119151, "grad_norm": 2.69199800491333, "learning_rate": 1.007501587227132e-05, "loss": 7.517, "step": 1596100 }, { "epoch": 6.502568009214891, "grad_norm": 3.9832923412323, "learning_rate": 1.0071218242629667e-05, "loss": 7.5131, "step": 1596200 }, { "epoch": 6.502975387238273, "grad_norm": 2.246173620223999, "learning_rate": 1.0067519250866656e-05, "loss": 7.5379, "step": 1596300 }, { "epoch": 6.503382765261653, "grad_norm": 16.615299224853516, "learning_rate": 1.0063918897128273e-05, "loss": 7.5131, "step": 1596400 }, { "epoch": 6.503790143285035, "grad_norm": 3.7326560020446777, "learning_rate": 1.0060417181557158e-05, "loss": 7.5267, "step": 1596500 }, { "epoch": 6.504197521308416, "grad_norm": 12.541718482971191, "learning_rate": 1.005701410429156e-05, "loss": 7.5, "step": 1596600 }, { "epoch": 6.504604899331798, "grad_norm": 3.1988630294799805, "learning_rate": 1.0053709665466081e-05, "loss": 7.5298, "step": 1596700 }, { "epoch": 6.5050122773551795, "grad_norm": 2.962094783782959, "learning_rate": 1.005050386521148e-05, "loss": 7.5127, "step": 1596800 }, { "epoch": 6.505419655378561, "grad_norm": 4.029211521148682, "learning_rate": 1.004739670365433e-05, "loss": 7.4984, "step": 1596900 }, { "epoch": 6.505827033401943, "grad_norm": 12.007691383361816, "learning_rate": 1.0044388180917625e-05, "loss": 7.5096, "step": 1597000 }, { "epoch": 6.505827033401943, "eval_MaskedAccuracy": 0.513536260494133, "eval_loss": 1.5880944728851318, "eval_runtime": 170.5549, "eval_samples_per_second": 372.173, "eval_steps_per_second": 1.454, "step": 1597000 }, { "epoch": 6.506234411425324, "grad_norm": 3.1685709953308105, "learning_rate": 1.004147829711993e-05, "loss": 7.5305, "step": 1597100 }, { "epoch": 6.506641789448706, "grad_norm": 3.2527971267700195, "learning_rate": 1.0038667052376744e-05, "loss": 7.4967, "step": 1597200 }, { "epoch": 6.507049167472087, "grad_norm": 4.4246368408203125, "learning_rate": 1.0035954446799152e-05, "loss": 7.5019, "step": 1597300 }, { "epoch": 6.507456545495469, "grad_norm": 6.173441410064697, "learning_rate": 1.0033340480494348e-05, "loss": 7.5042, "step": 1597400 }, { "epoch": 6.50786392351885, "grad_norm": 10.910491943359375, "learning_rate": 1.0030825153565378e-05, "loss": 7.5051, "step": 1597500 }, { "epoch": 6.508271301542232, "grad_norm": 6.449568748474121, "learning_rate": 1.0028408466111964e-05, "loss": 7.5127, "step": 1597600 }, { "epoch": 6.508678679565612, "grad_norm": 10.72154712677002, "learning_rate": 1.0026090418229666e-05, "loss": 7.5266, "step": 1597700 }, { "epoch": 6.509086057588994, "grad_norm": 3.6500210762023926, "learning_rate": 1.0023871010010177e-05, "loss": 7.5066, "step": 1597800 }, { "epoch": 6.5094934356123755, "grad_norm": 7.185964584350586, "learning_rate": 1.0021750241541024e-05, "loss": 7.5379, "step": 1597900 }, { "epoch": 6.509900813635757, "grad_norm": 3.6083433628082275, "learning_rate": 1.0019728112906143e-05, "loss": 7.5008, "step": 1598000 }, { "epoch": 6.509900813635757, "eval_MaskedAccuracy": 0.5139002103389801, "eval_loss": 1.58592689037323, "eval_runtime": 166.8944, "eval_samples_per_second": 380.336, "eval_steps_per_second": 1.486, "step": 1598000 }, { "epoch": 6.5103081916591385, "grad_norm": 13.888603210449219, "learning_rate": 1.0017804624185296e-05, "loss": 7.5217, "step": 1598100 }, { "epoch": 6.51071556968252, "grad_norm": 13.128150939941406, "learning_rate": 1.001597977545496e-05, "loss": 7.4866, "step": 1598200 }, { "epoch": 6.511122947705902, "grad_norm": 3.8094708919525146, "learning_rate": 1.0014253566787136e-05, "loss": 7.5137, "step": 1598300 }, { "epoch": 6.511530325729283, "grad_norm": 3.978328227996826, "learning_rate": 1.0012625998249971e-05, "loss": 7.5076, "step": 1598400 }, { "epoch": 6.511937703752665, "grad_norm": 7.387543678283691, "learning_rate": 1.0011097069907731e-05, "loss": 7.5153, "step": 1598500 }, { "epoch": 6.512345081776046, "grad_norm": 3.7918782234191895, "learning_rate": 1.0009666781821084e-05, "loss": 7.5407, "step": 1598600 }, { "epoch": 6.512752459799428, "grad_norm": 4.427021503448486, "learning_rate": 1.0008335134046257e-05, "loss": 7.5382, "step": 1598700 }, { "epoch": 6.513159837822809, "grad_norm": 12.303589820861816, "learning_rate": 1.0007102126636154e-05, "loss": 7.4989, "step": 1598800 }, { "epoch": 6.513567215846191, "grad_norm": 10.409649848937988, "learning_rate": 1.0005967759639529e-05, "loss": 7.5282, "step": 1598900 }, { "epoch": 6.5139745938695714, "grad_norm": 7.53159761428833, "learning_rate": 1.000493203310126e-05, "loss": 7.4865, "step": 1599000 }, { "epoch": 6.5139745938695714, "eval_MaskedAccuracy": 0.5138469709881768, "eval_loss": 1.5772393941879272, "eval_runtime": 162.0602, "eval_samples_per_second": 391.682, "eval_steps_per_second": 1.53, "step": 1599000 }, { "epoch": 6.514381971892953, "grad_norm": 5.471723556518555, "learning_rate": 1.0003994947062063e-05, "loss": 7.5011, "step": 1599100 }, { "epoch": 6.5147893499163345, "grad_norm": 2.725353956222534, "learning_rate": 1.0003156501559048e-05, "loss": 7.4923, "step": 1599200 }, { "epoch": 6.515196727939716, "grad_norm": 3.509063959121704, "learning_rate": 1.0002416696625745e-05, "loss": 7.5084, "step": 1599300 }, { "epoch": 6.515604105963098, "grad_norm": 5.341278076171875, "learning_rate": 1.0001775532290948e-05, "loss": 7.5007, "step": 1599400 }, { "epoch": 6.516011483986479, "grad_norm": 3.1526851654052734, "learning_rate": 1.000123300857988e-05, "loss": 7.5017, "step": 1599500 }, { "epoch": 6.516418862009861, "grad_norm": 9.90170669555664, "learning_rate": 1.0000789125514688e-05, "loss": 7.5098, "step": 1599600 }, { "epoch": 6.516826240033242, "grad_norm": 3.8604884147644043, "learning_rate": 1.0000443883112281e-05, "loss": 7.5306, "step": 1599700 }, { "epoch": 6.517233618056624, "grad_norm": 8.72430419921875, "learning_rate": 1.00001972813865e-05, "loss": 7.5348, "step": 1599800 }, { "epoch": 6.517640996080005, "grad_norm": 8.924925804138184, "learning_rate": 1.000004932034704e-05, "loss": 7.5241, "step": 1599900 }, { "epoch": 6.518048374103387, "grad_norm": 4.717263221740723, "learning_rate": 1e-05, "loss": 7.5391, "step": 1600000 }, { "epoch": 6.518048374103387, "eval_MaskedAccuracy": 0.5132993789243037, "eval_loss": 1.5939823389053345, "eval_runtime": 242.1229, "eval_samples_per_second": 262.164, "eval_steps_per_second": 1.024, "step": 1600000 } ], "logging_steps": 100, "max_steps": 1600000, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.272945468178432e+20, "train_batch_size": 400, "trial_name": null, "trial_params": null }